
Commit 6402240

tiandiao123 authored and CjhHa1 committed
Revert "[inference] Async dynamic batching (hpcaitech#4894)" (hpcaitech#4909)
This reverts commit fced140.
1 parent eacdc4f · commit 6402240

File tree

2 files changed (+38, -40 lines)


colossalai/inference/dynamic_batching/io_struct.py

Lines changed: 2 additions & 6 deletions

@@ -103,21 +103,17 @@ def mark_finished_req(self, eos_id):
                 has_new_finish = True
         return has_new_finish

-    def filter_finished(self)->List[Req]:
+    def filter_finished(self):
         """
         Filter finished requests from the batch, the finished ones will be removed from 'reqs'.
         """
         # TODO: the logic of return should be defined here.
         unfinished_req = []
-        finished_req = []
         for req in self.reqs:
             if not req.has_generate_finished:
-                unfinished_req.append(req)
-            else:
-                finished_req.append(req)
+                unfinished_req.append(req)
         self.reqs = unfinished_req
         self.id_to_reqs = {req.request_id: req for req in self.reqs}
-        return finished_req

     def is_clear(self):
         return len(self.reqs) == 0
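
The practical difference is easier to see outside the diff view. Before the revert, filter_finished split the batch and returned the finished requests so a caller could decode and stream them; after the revert it only drops them. Below is a minimal sketch of both behaviors, using simplified stand-ins for the real `Req` and `Batch` classes, not the library code itself:

```python
from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class Req:
    # simplified stand-in for the real Req
    request_id: int
    has_generate_finished: bool = False


@dataclass
class Batch:
    # simplified stand-in for the real Batch
    reqs: List[Req] = field(default_factory=list)
    id_to_reqs: Dict[int, Req] = field(default_factory=dict)

    def filter_finished_async(self) -> List[Req]:
        """Pre-revert behavior: drop finished reqs and hand them back to the caller."""
        unfinished, finished = [], []
        for req in self.reqs:
            (finished if req.has_generate_finished else unfinished).append(req)
        self.reqs = unfinished
        self.id_to_reqs = {req.request_id: req for req in self.reqs}
        return finished  # the caller can decode and stream these

    def filter_finished(self) -> None:
        """Reverted behavior: finished reqs are simply discarded."""
        self.reqs = [req for req in self.reqs if not req.has_generate_finished]
        self.id_to_reqs = {req.request_id: req for req in self.reqs}


batch = Batch(reqs=[Req(0, has_generate_finished=True), Req(1)])
finished = batch.filter_finished_async()
print([r.request_id for r in finished])    # [0]
print([r.request_id for r in batch.reqs])  # [1]
```
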

colossalai/inference/manager.py

Lines changed: 36 additions & 34 deletions

@@ -8,8 +8,6 @@
 from .dynamic_batching.stats import Stats
 from .tensor_parallel import TPInferEngine

-from transformers import AutoTokenizer
-_FAST_LLAMA_TOKENIZER = "hf-internal-testing/llama-tokenizer"

 class DynamicBatchManager:
     def __init__(
@@ -61,6 +59,7 @@ def add_req(self, prompt_ids: List[int], sampling_params: SamplingParams, reques
         print("len(self.req_queue): ", len(self.req_queue))
         return

+<<<<<<< HEAD
     def add_input(self, request_id, sampling_params, prompts):
         """
         Encode and Add new input to req queue. support one sequence input for now.
@@ -75,6 +74,8 @@ def add_input(self, request_id, sampling_params, prompts):
         self.add_req(prompt_ids, sampling_params, request_id, prompts)
         return

+=======
+>>>>>>> 78cd937f... Revert "[inference] Async dynamic batching (#4894)" (#4909)
     def abort(self, request_id):
         if self.running_batch is not None:
             for req in self.running_batch.reqs:
@@ -114,26 +115,6 @@ def loop_for_fwd(self):
             if self.running_batch is None:
                 time.sleep(0.1)  # 10ms

-    def _set_tokenizer(self, tokenizer=None, tokenizer_name: str = "", trust_remote_code: bool = False, use_fast:bool = True,):
-        if tokenizer is not None:
-            self.tokenizer = tokenizer
-        else:
-            if "llama" in tokenizer_name.lower() and use_fast == True:
-                print(
-                    "For some LLaMA-based models, initializing the fast tokenizer may "
-                    "take a long time. To eliminate the initialization time, consider "
-                    f"using '{_FAST_LLAMA_TOKENIZER}' instead of the original "
-                    "tokenizer. This is done automatically in Colossalai.")
-
-                tokenizer_name = _FAST_LLAMA_TOKENIZER
-
-            try:
-                self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=use_fast,trust_remote_code=trust_remote_code)
-            except TypeError as e:
-                use_fast = False
-                self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=use_fast,trust_remote_code=trust_remote_code)
-
-
     def _step(self):
         """
         Logic for handling requests
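
The _set_tokenizer helper removed above wrapped transformers.AutoTokenizer with two conveniences: substituting a known fast LLaMA tokenizer and retrying without the fast path when it fails. A rough standalone sketch of that pattern, written as a free function for illustration only (it is not part of the module after this commit):

```python
from transformers import AutoTokenizer

# Same fallback checkpoint the removed code referenced.
_FAST_LLAMA_TOKENIZER = "hf-internal-testing/llama-tokenizer"


def load_tokenizer(tokenizer=None, tokenizer_name: str = "",
                   trust_remote_code: bool = False, use_fast: bool = True):
    """Sketch of the deleted _set_tokenizer logic."""
    if tokenizer is not None:
        return tokenizer

    # Some LLaMA checkpoints take a long time to build a fast tokenizer,
    # so the removed helper swapped in a known-fast one.
    if "llama" in tokenizer_name.lower() and use_fast:
        tokenizer_name = _FAST_LLAMA_TOKENIZER

    try:
        return AutoTokenizer.from_pretrained(
            tokenizer_name, use_fast=use_fast, trust_remote_code=trust_remote_code
        )
    except TypeError:
        # Retry with the slow tokenizer if the fast one cannot be constructed.
        return AutoTokenizer.from_pretrained(
            tokenizer_name, use_fast=False, trust_remote_code=trust_remote_code
        )
```
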
@@ -144,33 +125,32 @@ def _step(self):
             if new_batch is not None:
                 self.stats_tool.count_prompt_tokens(new_batch)
                 self.running_batch = new_batch
-                yield from self._prefill_batch(self.running_batch)
+                self._prefill_batch(self.running_batch)
                 self._filter_runing_batch()
                 self.has_wait_tokens = 0
             return

         if self.has_wait_tokens < self.max_wait_tokens:
             self.stats_tool.count_output_tokens(self.running_batch)
-            yield from self._decode_batch(self.running_batch)
+            self._decode_batch(self.running_batch)
             self._filter_runing_batch()
             self.has_wait_tokens += 1
             return
         else:
             new_mini_batch = self.req_queue.generate_new_batch(self.running_batch)
             if new_mini_batch is not None:
                 self.stats_tool.count_prompt_tokens(new_mini_batch)
-                yield from self._prefill_batch(new_mini_batch)
+                self._prefill_batch(new_mini_batch)
                 if not new_mini_batch.is_clear():
                     self._merge_batch(self.running_batch, new_mini_batch)
                     self.running_batch.merge(new_mini_batch)
                 self.has_wait_tokens = 0
-
             else:
                 self.stats_tool.count_output_tokens(self.running_batch)
-                yield from self._decode_batch(self.running_batch)
+                self._decode_batch(self.running_batch)
                 self._filter_runing_batch()
                 self.has_wait_tokens += 1
-
+
         return

     def _init_batch(self, batch: Batch, dtype="fp16"):
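
Dropping the yield from calls changes what _step is: before the revert it was a generator, so it only advances while loop_for_fwd (or another caller) iterates it and decoded outputs can be surfaced to that caller; after the revert it is an ordinary method that runs for its side effects. A toy sketch of the two driving styles, with stand-in classes rather than the real DynamicBatchManager:

```python
from typing import Iterator


class AsyncStyleEngine:
    """Pre-revert shape: _step is a generator and streams results to whoever iterates it."""

    def _step(self) -> Iterator[str]:
        # prefill/decode would run here; finished outputs are yielded upward
        yield "decoded text for a finished request"

    def loop_for_fwd(self) -> Iterator[str]:
        for _ in range(3):           # stand-in for the real serving loop
            yield from self._step()  # nothing runs unless the caller iterates


class SyncStyleEngine:
    """Post-revert shape: _step is an ordinary call that runs for its side effects."""

    def _step(self) -> None:
        pass  # prefill/decode run here; nothing is surfaced to the caller

    def loop_for_fwd(self) -> None:
        for _ in range(3):
            self._step()  # executes immediately, returns nothing


for text in AsyncStyleEngine().loop_for_fwd():
    print(text)                      # the caller receives streamed outputs

SyncStyleEngine().loop_for_fwd()     # batches are processed; nothing is streamed
```
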
@@ -206,8 +186,7 @@ def _prefill_batch(self, batch):
         req_to_out_token_id = ans
         self._add_token_id_to_req(batch, req_to_out_token_id)
         has_new_finished_req = batch.mark_finished_req(self.eos_id)
-        yield from self._handle_finish_req(batch, has_new_finished_req)
-
+        self._handle_finish_req(batch, has_new_finished_req)
         # delete finished reqs

     def _decode_batch(self, batch: Batch):
@@ -218,7 +197,7 @@ def _decode_batch(self, batch: Batch):
         req_to_out_token_id = ans
         self._add_token_id_to_req(batch, req_to_out_token_id)
         has_new_finished_req = batch.mark_finished_req(self.eos_id)
-        yield from self._handle_finish_req(batch, has_new_finished_req)
+        self._handle_finish_req(batch, has_new_finished_req)

     def _filter_batch(self, batch: Batch):
         batch_id = batch.batch_id
@@ -250,13 +229,11 @@ def _remove_batch(self, batch):

     def _handle_finish_req(self, batch: Batch, has_new_finished_req):
         if has_new_finished_req:
-            finished_reqs=batch.filter_finished()
+            batch.filter_finished()
             if batch.is_clear():
                 self._remove_batch(batch)
             else:
                 self._filter_batch(batch)
-            yield from self._output_process(finished_reqs)
-

     def _filter_runing_batch(self):
         if self.running_batch is not None and self.running_batch.is_clear():
@@ -269,6 +246,7 @@ def _add_token_id_to_req(self, batch: Batch, req_ans):
             req.output_metadata_list.append(new_gen_metadata)
         return

+<<<<<<< HEAD
     def _output_process(self, finished_reqs: List[Req]):
         """
         Process the output of a batch.
@@ -277,10 +255,13 @@ def _output_process(self, finished_reqs: List[Req]):
             output = self.tokenizer.decode(req.output_ids)
             yield req.prompts + output

+=======
+>>>>>>> 78cd937f... Revert "[inference] Async dynamic batching (#4894)" (#4909)
     def clean_up(self):
         # this logic should be implemented in the future.
         pass

+<<<<<<< HEAD
     def generate(self,prompts,sampling_params,request_id):
         """
         Generate the output of a request.
@@ -306,3 +287,24 @@ def start_dynamic_batching(args, tp_engine, waiting_req_list):
         raise

     return batch_manager
+=======
+
+def start_dynamic_batching(args, tp_engine, waiting_req_list):
+    # try:
+    batch_manager = DynamicBatchManager(
+        tp_engine=tp_engine,
+        max_total_token_num=args.max_total_token_num,
+        batch_max_tokens=args.batch_max_tokens,
+        eos_id=args.eos_id,
+        log_stats=not args.disable_log_stats,
+        log_stats_interval=args.log_stats_interval,
+        waiting_req_list=waiting_req_list,
+    )
+
+    # except Exception:
+    #     batch_manager.clean_up()
+    #     raise
+
+    batch_manager.loop_for_fwd()
+    return
+>>>>>>> 78cd937f... Revert "[inference] Async dynamic batching (#4894)" (#4909)
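
The re-added start_dynamic_batching builds a DynamicBatchManager from an args object and then blocks inside loop_for_fwd. A hedged usage sketch follows; the field values, tp_engine, and waiting_req_list are placeholders, and it assumes the conflict markers shown above have been resolved (as committed, the <<<<<<< and >>>>>>> lines would make the module unimportable):

```python
from argparse import Namespace

from colossalai.inference.manager import start_dynamic_batching

# Hypothetical configuration: the field names mirror what the re-added
# start_dynamic_batching reads from `args`; the values are placeholders.
args = Namespace(
    max_total_token_num=4096,
    batch_max_tokens=2048,
    eos_id=2,
    disable_log_stats=False,
    log_stats_interval=10,
)

tp_engine = ...        # placeholder for a TPInferEngine built elsewhere
waiting_req_list = []  # requests queued before the loop starts

# Builds the DynamicBatchManager and then blocks in loop_for_fwd();
# in this revision the function returns None rather than the manager.
start_dynamic_batching(args, tp_engine, waiting_req_list)
```
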
