qlora #100

3 changes: 2 additions & 1 deletion src/config/cpm-bee-10b.json
@@ -10,5 +10,6 @@
   "position_bias_num_segment_buckets": 256,
   "position_bias_max_distance" : 2048,
   "eps" : 1e-6,
-  "half" : true
+  "half" : true,
+  "int4" : false
 }
3 changes: 2 additions & 1 deletion src/config/cpm-bee-3b.json
@@ -10,5 +10,6 @@
   "position_bias_num_segment_buckets": 256,
   "position_bias_max_distance" : 2048,
   "eps" : 1e-6,
-  "half" : true
+  "half" : true,
+  "int4" : false
 }
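
Both configs gain an `int4` entry alongside `half`. A minimal sketch of how the flag could be read from the JSON and mapped onto the dtype and quantization choices the layers below expect; the path assumes the repo root, and the variable names are only for illustration:

import json

import torch

# Read the updated config; "int4" is the key added by this PR and is
# expected to default to False when absent.
with open("src/config/cpm-bee-3b.json") as f:
    cfg = json.load(f)

dtype = torch.half if cfg.get("half", True) else torch.float
use_int4 = cfg.get("int4", False)

# The flag would then be threaded into the blocks changed below,
# e.g. Attention(..., dtype=dtype, int4=use_int4).
print(f"dtype={dtype}, int4={use_int4}")
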
2 changes: 1 addition & 1 deletion src/cpm_live/layers/__init__.py
@@ -1,6 +1,6 @@
 from .embedding import Embedding, EmbeddingExt
 from .position_embedding import SegmentPositionEmbedding, BucketPositionBias, RotaryEmbedding
-from .linear import Linear
+from .linear import Linear, Linear4bit, Params4bit
 from .layernorm import LayerNorm
 from .attention import Attention
 from .feedforward import FeedForward
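
Linear4bit and Params4bit are newly re-exported here, but their definitions are not part of this diff. For orientation, a minimal sketch of what such a layer could look like on top of bitsandbytes, the usual 4-bit backend for QLoRA; the class name and the (dim_in, dim_out) signature mirror the call sites added below, while the bitsandbytes-based body is purely an assumption, not the PR's actual implementation:

import torch
import bitsandbytes as bnb  # assumption: a bitsandbytes-backed implementation


class Linear4bit(torch.nn.Module):
    # Sketch only: matches the Linear4bit(dim_in, dim_out) call sites in this PR.
    def __init__(self, dim_in: int, dim_out: int):
        super().__init__()
        # bnb.nn.Linear4bit stores its weight as bnb.nn.Params4bit; the weight is
        # quantized to NF4 when the module is moved to a CUDA device and
        # dequantized on the fly inside forward().
        self.inner = bnb.nn.Linear4bit(
            dim_in,
            dim_out,
            bias=False,
            compute_dtype=torch.half,
            quant_type="nf4",
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.inner(x)
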
21 changes: 14 additions & 7 deletions src/cpm_live/layers/attention.py
@@ -17,7 +17,7 @@
 import torch
 import bmtrain as bmt
 import math
-from .linear import Linear
+from .linear import Linear, Linear4bit


 class Attention(bmt.DistributedModule):
@@ -28,6 +28,8 @@ def __init__(
         dim_head: int,
         dtype: torch.dtype = torch.half,
         dropout_p: Optional[float] = None,
+        int4: Optional[bool] = None,
+
     ) -> None:

         super().__init__()
@@ -36,12 +38,17 @@ def __init__(
         self.num_heads = num_heads
         self.dim_head = dim_head

-        self.project_q = Linear(self.dim_model, self.num_heads * self.dim_head, dtype=dtype)
-        self.project_k = Linear(self.dim_model, self.num_heads * self.dim_head, dtype=dtype)
-        self.project_v = Linear(self.dim_model, self.num_heads * self.dim_head, dtype=dtype)
-
-        self.attention_out = Linear(self.num_heads * self.dim_head, self.dim_model, dtype=dtype)
-
+        if int4 is None or int4 is False:
+            self.project_q = Linear(self.dim_model, self.num_heads * self.dim_head, dtype=dtype)
+            self.project_k = Linear(self.dim_model, self.num_heads * self.dim_head, dtype=dtype)
+            self.project_v = Linear(self.dim_model, self.num_heads * self.dim_head, dtype=dtype)
+            self.attention_out = Linear(self.num_heads * self.dim_head, self.dim_model, dtype=dtype)
+        else:
+            self.project_q = Linear4bit(self.dim_model, self.num_heads * self.dim_head)
+            self.project_k = Linear4bit(self.dim_model, self.num_heads * self.dim_head)
+            self.project_v = Linear4bit(self.dim_model, self.num_heads * self.dim_head)
+            self.attention_out = Linear4bit(self.num_heads * self.dim_head, self.dim_model)
+
         self.softmax = torch.nn.Softmax(dim=-1)

         if dropout_p is not None:
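
The attention change repeats a pattern that also appears in DenseGatedACT and FeedForward further down: an `if int4 is None or int4 is False` branch around every projection. A possible simplification, sketched here and not part of the PR, is a small factory that picks the implementation once; the helper name is hypothetical:

from typing import Optional

import torch

from cpm_live.layers.linear import Linear, Linear4bit  # Linear4bit newly exported by this PR


def make_linear(dim_in: int, dim_out: int,
                dtype: torch.dtype = torch.half,
                int4: Optional[bool] = None,
                **kwargs) -> torch.nn.Module:
    """Hypothetical helper (not in the PR): choose Linear or Linear4bit once.

    None and False both mean "no int4", mirroring the explicit
    `int4 is None or int4 is False` checks in the diff.
    """
    if int4:
        return Linear4bit(dim_in, dim_out)
    return Linear(dim_in, dim_out, dtype=dtype, **kwargs)


# Example call site, equivalent to the branch added in Attention.__init__:
# self.project_q = make_linear(self.dim_model, self.num_heads * self.dim_head,
#                              dtype=dtype, int4=int4)
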
11 changes: 11 additions & 0 deletions src/cpm_live/layers/blocks.py
@@ -31,6 +31,7 @@ class SelfAttentionBlock(bmt.DistributedModule):
         dtype (optional): Defaults to torch.half.
         eps (float, optional): eps used in :py:class:`model_center.layer.LayerNorm`. Defaults to 1e-5.
         dropout_p (float, optional): Defaults to 0.
+        int4 (bool, optional): whether to load the model with int4 quantization. Defaults to False.
     """  # noqa: E501

     def __init__(
@@ -41,6 +42,8 @@ def __init__(
         dtype=torch.half,
         eps: float = 1e-6,
         dropout_p: Optional[float] = None,
+        int4: Optional[bool] = None,
+
     ):

         super().__init__()
@@ -57,6 +60,7 @@ def __init__(
             dim_head=dim_head,
             dtype=dtype,
             dropout_p=dropout_p,
+            int4=int4,
         )

         if dropout_p:
@@ -108,6 +112,7 @@ class FFNBlock(torch.nn.Module):
         dtype (optional): Defaults to torch.half.
         eps (float, optional): eps used in :py:class:`model_center.layer.LayerNorm`. Defaults to 1e-5.
         dropout_p (float, optional): Defaults to 0.
+        int4 (bool, optional): whether to load the model with int4 quantization. Defaults to False.
     """  # noqa: E501

     def __init__(
@@ -117,6 +122,7 @@ def __init__(
         dtype=torch.half,
         eps: float = 1e-6,
         dropout_p: Optional[float] = 0,
+        int4: Optional[bool] = None,
     ):
         super().__init__()

@@ -131,6 +137,7 @@ def __init__(
             dim_ff,
             dtype=dtype,
             dropout_p=dropout_p,
+            int4=int4,
         )

         if dropout_p:
@@ -169,6 +176,7 @@ class TransformerBlock(torch.nn.Module):
         dtype (optional): Defaults to torch.half.
         eps (float, optional): eps used in :py:class:`model_center.layer.LayerNorm`. Defaults to 1e-5.
         dropout_p (float, optional): Defaults to 0.
+        int4 (bool, optional): whether to load the model with int4 quantization. Defaults to False.
     """  # noqa: E501

     def __init__(
@@ -182,6 +190,7 @@ def __init__(
         dropout_p: Optional[float] = None,
         mask_att: bool = False,
         mask_ffn: bool = False,
+        int4: Optional[bool] = None,
     ):
         super().__init__()
         self.mask_att = mask_att
@@ -195,6 +204,7 @@ def __init__(
                 dtype=dtype,
                 eps=eps,
                 dropout_p=dropout_p,
+                int4=int4,
             )

         if not self.mask_ffn:
@@ -204,6 +214,7 @@ def __init__(
                 dtype=dtype,
                 eps=eps,
                 dropout_p=dropout_p,
+                int4=int4,
             )

     def forward(
60 changes: 40 additions & 20 deletions src/cpm_live/layers/feedforward.py
@@ -16,7 +16,7 @@
 from typing import Optional
 import torch
 import bmtrain as bmt
-from .linear import Linear
+from .linear import Linear, Linear4bit


 class DenseGatedACT(bmt.DistributedModule):
@@ -25,22 +25,33 @@ def __init__(
         dim_in: int,
         dim_ff: int,
         dtype=torch.half,
+        int4: Optional[bool] = None,
     ):
         super().__init__()

-        self.w_0 = Linear(
-            dim_in=dim_in,
-            dim_out=dim_ff,
-            dtype=dtype,
-            scale_before=False,
-        )
-
-        self.w_1 = Linear(
-            dim_in=dim_in,
-            dim_out=dim_ff,
-            dtype=dtype,
-            scale_before=False,
-        )
+        if int4 is None or int4 is False:
+            self.w_0 = Linear(
+                dim_in=dim_in,
+                dim_out=dim_ff,
+                dtype=dtype,
+                scale_before=False,
+            )
+
+            self.w_1 = Linear(
+                dim_in=dim_in,
+                dim_out=dim_ff,
+                dtype=dtype,
+                scale_before=False,
+            )
+        else:
+            self.w_0 = Linear4bit(
+                dim_in=dim_in,
+                dim_out=dim_ff,
+            )
+
+            self.w_1 = Linear4bit(
+                dim_in=dim_in,
+                dim_out=dim_ff,
+            )
         self.act = torch.nn.GELU()

     def forward(self, x: torch.Tensor):
@@ -74,6 +85,7 @@ class FeedForward(bmt.DistributedModule):
         bias (bool, optional): whether to use bias term in fully-connected layers used in feed-forward module. Defaults to False.
         activate_fn (str, optional): Defaults to `gated_gelu`.
         dropout_p (int, optional): Defaults to 0.
+        int4 (bool, optional): whether to load the model with int4 quantization. Defaults to False.
     """  # noqa: E501

     def __init__(
@@ -82,6 +94,7 @@ def __init__(
         dim_ff: int,
         dtype=torch.half,
         dropout_p: Optional[float] = None,
+        int4: Optional[bool] = None,
     ):

         super().__init__()
@@ -90,18 +103,25 @@ def __init__(
             dim_in=dim_model,
             dim_ff=dim_ff,
             dtype=dtype,
+            int4=int4,
         )

         if dropout_p is not None:
             self.dropout = torch.nn.Dropout(dropout_p)
         else:
             self.dropout = None

-        self.w_out = Linear(
-            dim_in=dim_ff,
-            dim_out=dim_model,
-            dtype=dtype,
-            scale_before=False,
-        )
+        if int4 is None or int4 is False:
+            self.w_out = Linear(
+                dim_in=dim_ff,
+                dim_out=dim_model,
+                dtype=dtype,
+                scale_before=False,
+            )
+        else:
+            self.w_out = Linear4bit(
+                dim_in=dim_ff,
+                dim_out=dim_model,
+            )

     def forward(self, x: torch.Tensor):
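
Putting the pieces together, setting `"int4": true` in the config should make every projection in the blocks above a Linear4bit. A rough usage sketch; CPMBeeConfig, CPMBee and from_json_file are assumed from the existing repo, and the freeze-the-base step only illustrates how QLoRA-style fine-tuning (the PR title) would typically sit on top, none of which is shown in this diff:

import torch

# CPMBeeConfig / CPMBee are assumed from the existing cpm_live.models package;
# only the "int4" config entry and the Linear4bit plumbing are new in this PR.
from cpm_live.models import CPMBee, CPMBeeConfig

config = CPMBeeConfig.from_json_file("src/config/cpm-bee-3b.json")  # set "int4": true to enable
model = CPMBee(config)  # Linear4bit is used wherever the int4 flag reaches the blocks above

# QLoRA-style training would freeze the 4-bit base weights and train only
# LoRA adapters added on top (adapters are not part of this diff).
for p in model.parameters():
    p.requires_grad_(False)
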