Commit 206103b

[Docs] Add doc for max and mean gen len, shift factor; and buildArgs (mlc-ai#1119)
* Add doc for max and mean gen len, shift factor
* Update python docs for BuildArgs
1 parent 488017d commit 206103b

File tree: 2 files changed (+36 -3 lines)

mlc_llm/core.py

Lines changed: 25 additions & 1 deletion

@@ -79,11 +79,35 @@ class BuildArgs:
         Build with separated embedding layer, only applicable to LlaMa. This
         feature is in testing stage, and will be formally replaced after massive
         overhaul of embedding feature for all models and use cases.
+    cc_path: str
+        ``/path/to/cross_compiler_path``; currently only used for cross-compile
+        for nvidia/jetson device.
+    use_safetensors: bool
+        Specifies whether to use ``.safetensors`` instead of the default ``.bin``
+        when loading in model weights.
     enable_batching: bool
         Build the model for batched inference.
         This is a temporary flag used to control the model execution flow in single-
         sequence and batching settings for now. We will eventually merge two flows
         in the future and remove this flag then.
+    no_cutlass_attn: bool
+        Disable offloading attention operations to CUTLASS.
+    no_cutlass_norm: bool
+        Disable offloading layer and RMS norm operations to CUTLASS.
+    no_cublas: bool
+        Disable the step that offloads matmul to cuBLAS. Without this flag,
+        matmul will be offloaded to cuBLAS if quantization mode is ``q0f16`` or
+        ``q0f32``, target is CUDA and TVM has been built with cuBLAS enabled.
+    use_cuda_graph: bool
+        Specifies whether to enable CUDA Graph for the decoder. MLP and QKV
+        projection between two attention layers are put into a graph.
+    num_shards: int
+        Number of shards to split the model into in tensor parallelism multi-gpu
+        inference. Only useful when ``build_model_only`` is set.
+    use_flash_attn_mqa: bool
+        Offload multi-query attention workload to Flash Attention.
+    pdb: bool
+        If set, drop into a pdb debugger on error.
     """
     model: str = field(
         default="auto",

@@ -217,7 +241,7 @@ class BuildArgs:
             "help": (
                 "Disable the step that offloads matmul to cuBLAS. Without this flag, "
                 "matmul will be offloaded to cuBLAS if quantization mode is q0f16 or q0f32, "
-                "target is CUDA and TVM has been built with cuBLAS enbaled."
+                "target is CUDA and TVM has been built with cuBLAS enabled."
             ),
             "action": "store_true",
         },
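
To show how the newly documented build options fit together, here is a minimal, hedged sketch of constructing a ``BuildArgs`` instance. It assumes ``BuildArgs`` is importable from ``mlc_llm.core`` as shown in this diff; the model name and quantization value are illustrative assumptions and are not part of this commit, and the build entry point (which varies between releases) is deliberately omitted.

```python
# Minimal sketch (not part of this commit): constructing BuildArgs with some of
# the options documented above. The model name and quantization string below are
# illustrative assumptions; only the field names come from the docstring.
from mlc_llm.core import BuildArgs

args = BuildArgs(
    model="Llama-2-7b-chat-hf",   # assumed model name; the field defaults to "auto"
    quantization="q0f16",         # with q0f16/q0f32 on CUDA and cuBLAS-enabled TVM, matmul is offloaded to cuBLAS
    use_safetensors=True,         # load ``.safetensors`` weights instead of the default ``.bin``
    use_cuda_graph=True,          # capture decoder MLP and QKV projections in a CUDA graph
    no_cublas=False,              # keep the cuBLAS matmul offloading step enabled
    num_shards=2,                 # tensor-parallel shards; only meaningful with build_model_only
)
print(args)
```

Passing such an object to the library's build function is version-dependent and not shown here.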

python/mlc_chat/chat_module.py

Lines changed: 11 additions & 2 deletions

@@ -91,7 +91,7 @@ class ChatConfig:
     :class:`mlc_chat.ChatModule` instance to override the default setting in
     ``mlc-chat-config.json`` under the model folder.

-    Since the configuraiton is partial, everything will be ``Optional``.
+    Since the configuration is partial, everything will be ``Optional``.

     Note that we will exploit this class to also represent ``mlc-chat-config.json``
     during intermediate processing.

@@ -131,14 +131,19 @@ class ChatConfig:
         For additional information on top-p sampling, please refer to this blog
         post: https://huggingface.co/blog/how-to-generate#top-p-nucleus-sampling.
     mean_gen_len : Optional[int]
+        The approximated average number of generated tokens in each round. Used
+        to determine whether the maximum window size would be exceeded.
     max_gen_len : Optional[int]
+        The maximum number of tokens to be generated in each round. Would simply
+        stop generating after this number is exceeded.
     shift_fill_factor : Optional[float]
+        The fraction of maximum window size to shift when it is exceeded.
     tokenizer_files : Optional[List[str]]
         List of tokenizer files of the model.
     conv_config : Optional[ConvConfig]
         The partial overriding configuration for conversation template. Will first
         load the predefined template with the name specified in ``conv_template``
-        and then override some of the configuraitons specified in ``conv_config``.
+        and then override some of the configurations specified in ``conv_config``.
     model_category : Optional[str]
         The category of the model's architecture (e.g. ``llama``, ``gpt_neox``, ``rwkv``).
     model_name : Optional[str]

@@ -216,7 +221,11 @@ class GenerationConfig:
         For additional information on top-p sampling, please refer to this blog
         post: https://huggingface.co/blog/how-to-generate#top-p-nucleus-sampling.
     mean_gen_len : Optional[int]
+        The approximated average number of generated tokens in each round. Used
+        to determine whether the maximum window size would be exceeded.
     max_gen_len : Optional[int]
+        The maximum number of tokens to be generated in each round. Would simply
+        stop generating after this number is exceeded.
     """

     temperature: Optional[float] = None
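
As a usage illustration of the parameters documented above, here is a minimal, hedged sketch of overriding them through ``ChatConfig`` and ``GenerationConfig``. The model string and the exact ``ChatModule``/``generate`` signatures are assumptions about this version of the ``mlc_chat`` package, not something this commit adds.

```python
# Minimal sketch (not part of this commit): overriding the generation-length
# settings documented above. The model string below is an illustrative assumption.
from mlc_chat.chat_module import ChatConfig, ChatModule, GenerationConfig

cfg = ChatConfig(
    mean_gen_len=128,       # expected average tokens per round, used for the window-overflow check
    max_gen_len=512,        # hard cap: generation stops once this many tokens are produced
    shift_fill_factor=0.3,  # fraction of the maximum window to shift when it is exceeded
)

cm = ChatModule(model="Llama-2-7b-chat-hf-q4f16_1", chat_config=cfg)

# Per-call override via GenerationConfig, assuming generate() accepts it in this version.
output = cm.generate(
    "What is the capital of Canada?",
    generation_config=GenerationConfig(max_gen_len=64),
)
print(output)
```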
