Optimization in LSTM for batch > 1 cases on HiFi.

pramods-cad · pramods-cad · commit c12210579cb3 · 2025-10-27T08:22:34.000-07:00
diff --git a/tensorflow/lite/micro/kernels/xtensa/lstm_eval.cc b/tensorflow/lite/micro/kernels/xtensa/lstm_eval.cc
@@ -473,7 +473,7 @@ void LstmStepManager::UpdateBatch() {
 // Multi-batch for time_major input
 RuntimeShape LstmStepManager::InputShape() const {
   int batch_size = 1;
-  if (size_info_.time_major) {
+  if (size_info_.time_major || ((size_info_.batch_size > 1 && size_info_.time_steps == 1))) {
     batch_size = size_info_.batch_size;
   }
   const int dims[2] = {batch_size, size_info_.input_dimension};
@@ -485,7 +485,7 @@ RuntimeShape LstmStepManager::InputShape() const {
 // Multi-batch for time_major input
 RuntimeShape LstmStepManager::StateShape() const {
   int batch_size = 1;
-  if (size_info_.time_major) {
+  if (size_info_.time_major || (size_info_.batch_size > 1 && size_info_.time_steps == 1)) {
     batch_size = size_info_.batch_size;
   }
   const int dims[2] = {batch_size, size_info_.state_dimension};
diff --git a/tensorflow/lite/micro/kernels/xtensa/lstm_eval.h b/tensorflow/lite/micro/kernels/xtensa/lstm_eval.h
@@ -666,6 +666,11 @@ void LstmStep(const LstmStepManager& step_info, const OpDataLSTM& op_data,
   int input_dimension = step_info.input_dimension();
   int state_dimension = step_info.state_dimension();
 
+  const auto& size_info = op_data.size_info;
+  if(size_info.batch_size > 1 && size_info.time_steps == 1) {
+    num_batches = size_info.batch_size;
+  }
+
   // Check offset validity to avoid memory overflow
   TFLITE_DCHECK_LE(step_info.InputOffset() + num_batches * input_dimension,
                    tflite::micro::GetTensorShape(input).FlatSize());
@@ -805,16 +810,22 @@ TfLiteStatus EvalLstm(const OpDataLSTM& op_data,
     }
   } else {
     // batch first, unable to size the input data. single batch inference
-    for (int b = 0; b < size_info.batch_size; b++) {
-      for (int t = 0; t < size_info.time_steps; t++) {
-        lstm_internal::LstmStep<ActivationType, WeightType, CellType, BiasType>(
-            step_info, op_data, kernel_content, buffers);
-        // prepare for the next time step
-        step_info.UpdateTime();
+    if(size_info.batch_size > 1 && size_info.time_steps == 1) {
+      // Ramesh
+      lstm_internal::LstmStep<ActivationType, WeightType, CellType, BiasType>(
+              step_info, op_data, kernel_content, buffers);
+    } else {
+      for (int b = 0; b < size_info.batch_size; b++) {
+        for (int t = 0; t < size_info.time_steps; t++) {
+          lstm_internal::LstmStep<ActivationType, WeightType, CellType, BiasType>(
+              step_info, op_data, kernel_content, buffers);
+          // prepare for the next time step
+          step_info.UpdateTime();
+        }
+        // prepare for the next batch
+        step_info.UpdateBatch();
+        step_info.ResetTime();
       }
-      // prepare for the next batch
-      step_info.UpdateBatch();
-      step_info.ResetTime();
     }
   }
   return kTfLiteOk;