3737#include < pybind11/numpy.h>
3838
3939namespace py = pybind11;
40+ using namespace pybind11 ::literals;
4041
4142// Include the exception class
4243class PythonBackendException : public std ::exception {
@@ -224,6 +225,188 @@ bool compare_arrays(const py::array& cpp_result, const py::array& python_result,
224225 return true ;
225226}
226227
228+ // Focused benchmark for struct.unpack_from operation
229+ void benchmark_struct_unpack_operations () {
230+ std::cout << " \n\n === PERFORMANCE BENCHMARK: struct.unpack_from vs C++ ===" << std::endl;
231+ std::cout << " =========================================================" << std::endl;
232+
233+ // Test different string sizes
234+ std::vector<size_t > test_sizes = {10 , 100 , 1000 , 10000 , 100000 };
235+
236+ for (size_t string_size : test_sizes) {
237+ std::cout << " \n Testing single string of size: " << string_size << " bytes" << std::endl;
238+
239+ // Create test data - single string with length prefix
240+ std::vector<uint8_t > test_data;
241+
242+ // Add 4-byte length prefix (little-endian)
243+ uint32_t length = static_cast <uint32_t >(string_size);
244+ test_data.push_back (length & 0xFF );
245+ test_data.push_back ((length >> 8 ) & 0xFF );
246+ test_data.push_back ((length >> 16 ) & 0xFF );
247+ test_data.push_back ((length >> 24 ) & 0xFF );
248+
249+ // Add string data
250+ for (size_t i = 0 ; i < string_size; i++) {
251+ test_data.push_back (' A' + (i % 26 ));
252+ }
253+
254+ py::bytes py_data = py::bytes (reinterpret_cast <const char *>(test_data.data ()), test_data.size ());
255+
256+ // Measure multiple iterations for stability
257+ const int iterations = 10000 ;
258+
259+ // Benchmark Python struct.unpack_from for the exact line 117 operation
260+ {
261+ std::cout << " Testing: sb = struct.unpack_from(\" <{}s\" .format(l), val_buf, offset)[0]" << std::endl;
262+
263+ // Setup Python code that mimics the exact operation
264+ py::exec (R"(
265+ import struct
266+ def single_unpack_string(val_buf, l, offset):
267+ # This is the exact operation from line 117
268+ sb = struct.unpack_from("<{}s".format(l), val_buf, offset)[0]
269+ return sb
270+ )" );
271+
272+ py::object py_func = py::globals ()[" single_unpack_string" ];
273+
274+ // Warm up
275+ for (int i = 0 ; i < 100 ; i++) {
276+ py::bytes result = py_func (py_data, length, 4 );
277+ }
278+
279+ // Measure Python version
280+ auto py_start = std::chrono::high_resolution_clock::now ();
281+ for (int i = 0 ; i < iterations; i++) {
282+ py::bytes result = py_func (py_data, length, 4 );
283+ }
284+ auto py_end = std::chrono::high_resolution_clock::now ();
285+
286+ // Measure C++ equivalent
287+ auto cpp_start = std::chrono::high_resolution_clock::now ();
288+ for (int i = 0 ; i < iterations; i++) {
289+ // Create py::bytes object - more comparable to struct.unpack_from
290+ py::bytes bytes_obj (reinterpret_cast <const char *>(test_data.data () + 4 ), length);
291+ }
292+ auto cpp_end = std::chrono::high_resolution_clock::now ();
293+
294+ auto py_time = std::chrono::duration_cast<std::chrono::nanoseconds>(py_end - py_start);
295+ auto cpp_time = std::chrono::duration_cast<std::chrono::nanoseconds>(cpp_end - cpp_start);
296+
297+ double py_per_call = static_cast <double >(py_time.count ()) / iterations;
298+ double cpp_per_call = static_cast <double >(cpp_time.count ()) / iterations;
299+
300+ std::cout << " Python struct.unpack_from: " << py_per_call << " ns per call" << std::endl;
301+ std::cout << " C++ direct access: " << cpp_per_call << " ns per call" << std::endl;
302+ std::cout << " Overhead per call: " << (py_per_call - cpp_per_call) << " ns" << std::endl;
303+ std::cout << " Speedup: " << std::fixed << std::setprecision (2 )
304+ << py_per_call / cpp_per_call << " x" << std::endl;
305+ }
306+
307+ // Also test just the length unpacking (line 115)
308+ {
309+ std::cout << " \n Testing: l = struct.unpack_from(\" <I\" , val_buf, offset)[0]" << std::endl;
310+
311+ py::exec (R"(
312+ import struct
313+ def single_unpack_length(val_buf, offset):
314+ # This is from line 115
315+ l = struct.unpack_from("<I", val_buf, offset)[0]
316+ return l
317+ )" );
318+
319+ py::object py_func = py::globals ()[" single_unpack_length" ];
320+
321+ // Warm up
322+ for (int i = 0 ; i < 100 ; i++) {
323+ py::int_ result = py_func (py_data, 0 );
324+ }
325+
326+ // Measure Python version
327+ auto py_start = std::chrono::high_resolution_clock::now ();
328+ for (int i = 0 ; i < iterations; i++) {
329+ py::int_ result = py_func (py_data, 0 );
330+ }
331+ auto py_end = std::chrono::high_resolution_clock::now ();
332+
333+ // Measure C++ equivalent
334+ auto cpp_start = std::chrono::high_resolution_clock::now ();
335+ for (int i = 0 ; i < iterations; i++) {
336+ volatile uint32_t result = *reinterpret_cast <const uint32_t *>(test_data.data ());
337+ (void )result; // Prevent optimization
338+ }
339+ auto cpp_end = std::chrono::high_resolution_clock::now ();
340+
341+ auto py_time = std::chrono::duration_cast<std::chrono::nanoseconds>(py_end - py_start);
342+ auto cpp_time = std::chrono::duration_cast<std::chrono::nanoseconds>(cpp_end - cpp_start);
343+
344+ double py_per_call = static_cast <double >(py_time.count ()) / iterations;
345+ double cpp_per_call = static_cast <double >(cpp_time.count ()) / iterations;
346+
347+ std::cout << " Python struct.unpack_from: " << py_per_call << " ns per call" << std::endl;
348+ std::cout << " C++ direct access: " << cpp_per_call << " ns per call" << std::endl;
349+ std::cout << " Overhead per call: " << (py_per_call - cpp_per_call) << " ns" << std::endl;
350+ std::cout << " Speedup: " << std::fixed << std::setprecision (2 )
351+ << py_per_call / cpp_per_call << " x" << std::endl;
352+ }
353+ }
354+
355+ // Test with a realistic workload
356+ std::cout << " \n\n Realistic Workload Test (15000 strings)" << std::endl;
357+ std::cout << " ========================================" << std::endl;
358+
359+ size_t num_strings = 15000 ;
360+ std::vector<std::string> test_strings;
361+ test_strings.reserve (num_strings);
362+ for (size_t i = 0 ; i < num_strings; i++) {
363+ test_strings.push_back (" string_" + std::to_string (i));
364+ }
365+
366+ std::vector<uint8_t > serialized = serialize_strings (test_strings);
367+ py::bytes py_serialized = py::bytes (reinterpret_cast <const char *>(serialized.data ()), serialized.size ());
368+
369+ py::module triton_pb_utils = py::module::import (" triton_python_backend_utils" );
370+
371+ // Full function comparison
372+ const int iterations = 100 ;
373+
374+ // Warm up
375+ for (int i = 0 ; i < 5 ; i++) {
376+ py::array py_result = triton_pb_utils.attr (" deserialize_bytes_tensor" )(py_serialized);
377+ py::array cpp_result = deserialize_bytes_tensor_cpp (serialized.data (), serialized.size ());
378+ }
379+
380+ // Python version
381+ auto py_start = std::chrono::high_resolution_clock::now ();
382+ for (int i = 0 ; i < iterations; i++) {
383+ py::array py_result = triton_pb_utils.attr (" deserialize_bytes_tensor" )(py_serialized);
384+ }
385+ auto py_end = std::chrono::high_resolution_clock::now ();
386+
387+ // C++ version
388+ auto cpp_start = std::chrono::high_resolution_clock::now ();
389+ for (int i = 0 ; i < iterations; i++) {
390+ py::array cpp_result = deserialize_bytes_tensor_cpp (serialized.data (), serialized.size ());
391+ }
392+ auto cpp_end = std::chrono::high_resolution_clock::now ();
393+
394+ auto py_time = std::chrono::duration_cast<std::chrono::microseconds>(py_end - py_start);
395+ auto cpp_time = std::chrono::duration_cast<std::chrono::microseconds>(cpp_end - cpp_start);
396+
397+ std::cout << " Python deserialize_bytes_tensor: " << py_time.count () / iterations << " μs per call" << std::endl;
398+ std::cout << " C++ deserialize_bytes_tensor: " << cpp_time.count () / iterations << " μs per call" << std::endl;
399+ std::cout << " Speedup: " << std::fixed << std::setprecision (2 )
400+ << static_cast <double >(py_time.count ()) / cpp_time.count () << " x" << std::endl;
401+
402+ // Calculate estimated impact of struct.unpack_from
403+ double estimated_unpack_overhead = num_strings * 2 * 500 ; // ~500ns per unpack call (estimate from above)
404+ std::cout << " \n Estimated struct.unpack_from overhead: ~" << estimated_unpack_overhead / 1000 << " μs" << std::endl;
405+ std::cout << " Actual performance difference: " << (py_time.count () - cpp_time.count ()) / iterations << " μs" << std::endl;
406+
407+ std::cout << " \n === END OF PERFORMANCE BENCHMARK ===" << std::endl;
408+ }
409+
227410int main () {
228411 std::cout << " Simple Deserialize Function Equivalence Test" << std::endl;
229412 std::cout << " =============================================" << std::endl;
@@ -366,13 +549,16 @@ int main() {
366549 std::cout << " Passed: " << passed << std::endl;
367550 std::cout << " Failed: " << failed << std::endl;
368551 std::cout << " Total time: " << total_time.count () << " ms" << std::endl;
369- std::cout << " Success rate: " << std::fixed << std::setprecision (1 )
552+ std::cout << " Success rate: " << std::fixed << std::setprecision (1 )
370553 << (static_cast <double >(passed) / test_cases.size () * 100 ) << " %" << std::endl;
371-
554+
372555 if (passed > 0 ) {
373556 std::cout << " \n All functional equivalence tests passed!" << std::endl;
374557 std::cout << " The C++ and Python implementations produce identical results." << std::endl;
375558 }
376-
559+
560+ // Run the performance benchmark
561+ benchmark_struct_unpack_operations ();
562+
377563 return failed > 0 ? 1 : 0 ;
378564}
0 commit comments