
Adding Griffin implementation. #136


Merged · 1 commit · Apr 9, 2024
23 changes: 23 additions & 0 deletions README.md
@@ -241,6 +241,24 @@ Example invocation for the following configuration:
--model 2b-it
```

### RecurrentGemma

This repository includes a version of Gemma based on Griffin
([paper](https://arxiv.org/abs/2402.19427),
[code](https://github.com/google-deepmind/recurrentgemma)). Its architecture
includes both recurrent layers and local attention, which makes it more
efficient for longer sequences and gives it a smaller memory footprint than
standard Gemma. We provide a C++ implementation of this model here, based on
the paper.
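
As an illustrative aside (grounded in the `ConfigGriffin2B` added to
`configs.h` later in this PR, not in the README text itself): the model
interleaves two Griffin recurrent blocks with one local-attention block. A
minimal standalone sketch of that layout, re-declaring the one enum it needs
instead of including the real header:

```cpp
#include <array>
#include <cstddef>
#include <cstdio>

// Re-declared from the configs.h change in this PR so the sketch compiles on
// its own.
enum class LayerAttentionType { kGemma, kGriffinRecurrentBlock };

int main() {
  // ConfigGriffin2B repeats [recurrent, recurrent, attention] and ends with an
  // extra recurrent pair, for 26 blocks in total.
  std::array<LayerAttentionType, 26> layers{};
  for (size_t i = 0; i < layers.size(); ++i) {
    layers[i] = (i % 3 == 2) ? LayerAttentionType::kGemma
                             : LayerAttentionType::kGriffinRecurrentBlock;
  }
  size_t recurrent = 0, attention = 0;
  for (LayerAttentionType t : layers) {
    (t == LayerAttentionType::kGemma ? attention : recurrent) += 1;
  }
  // Prints "18 recurrent blocks, 8 local-attention blocks".
  std::printf("%zu recurrent blocks, %zu local-attention blocks\n", recurrent,
              attention);
  return 0;
}
```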

To use the recurrent version of Gemma included in this repository, build the
gemma binary as noted above in Step 3. Download the compressed weights and
tokenizer from
[Kaggle](https://www.kaggle.com/models/google/recurrentgemma/gemmaCpp) as in
Step 1, and run the binary as follows:

`./gemma --tokenizer tokenizer.spm --model gr2b-it --compressed_weights 2b-it-sfp.sbs`


### Troubleshooting and FAQs

**Running `./gemma` fails with "Failed to read cache gating_ein_0 (error 294) ..."**
@@ -478,4 +496,9 @@ gemma.cpp was started in fall 2023 by [Austin Huang](mailto:austinvhuang@google.
and [Jan Wassenberg](mailto:[email protected]), and subsequently released February 2024
thanks to contributions from Phil Culliton, Paul Chang, and Dan Zheng.

Griffin support was implemented in April 2024 thanks to contributions by Andrey
Mikhaylov, Eugene Kliuchnikov, Jan Wassenberg, Jyrki Alakuijala, Lode
Vandevenne, Luca Versari, Martin Bruse, Phil Culliton, Sami Boukortt, Thomas
Fischbacher and Zoltan Szabadka.

This is not an officially supported Google product.
17 changes: 12 additions & 5 deletions benchmark.cc
@@ -10,14 +10,14 @@
#include "nlohmann/json.hpp"
// copybara:import_next_line:gemma_cpp
#include "gemma.h"
// copybara:import_next_line:gemma_cpp
#include "util/app.h"
// copybara:import_next_line:gemma_cpp
#include "util/args.h"
#include "hwy/base.h"
#include "hwy/contrib/thread_pool/thread_pool.h"
#include "hwy/highway.h"
#include "hwy/timer.h"
// copybara:import_next_line:gemma_cpp
#include "util/app.h"
// copybara:import_next_line:gemma_cpp
#include "util/args.h"

using json = nlohmann::json;

@@ -259,6 +259,13 @@ int main(int argc, char** argv) {
gcpp::AppArgs app(argc, argv);
BenchmarkArgs benchmark_args(argc, argv);

if (const char* error = loader.Validate()) {
HWY_ABORT("\nInvalid loader args: %s", error);
}
if (const char* error = args.Validate()) {
HWY_ABORT("\nInvalid inference args: %s", error);
}

hwy::ThreadPool inner_pool(0);
hwy::ThreadPool pool(app.num_threads);
// For many-core, pinning threads to cores helps.
@@ -275,7 +282,7 @@ int main(int argc, char** argv) {

if (!benchmark_args.goldens.path.empty()) {
const std::string golden_path =
benchmark_args.goldens.path + "/" + loader.model_type + ".txt";
benchmark_args.goldens.path + "/" + loader.model_type_str + ".txt";
return BenchmarkGoldens(model, args, app, kv_cache, inner_pool, pool,
golden_path);
} else if (!benchmark_args.summarize_text.path.empty()) {
42 changes: 12 additions & 30 deletions compress_weights.cc
@@ -44,35 +44,14 @@ struct Args : public ArgsBase<Args> {
ChooseNumThreads();
}

static std::string ToLower(const std::string& text) {
std::string result = text;
std::transform(begin(result), end(result), begin(result),
[](unsigned char c) { return std::tolower(c); });
return result;
}

gcpp::Model ModelType() const {
const std::string model_type_lc = ToLower(model_type);
if (model_type_lc.substr(0, 2) == "2b") {
return gcpp::Model::GEMMA_2B;
} else if (model_type_lc.substr(0, 2) == "7b") {
return gcpp::Model::GEMMA_7B;
} else {
HWY_ABORT("Unknown model type %s", model_type_lc.c_str());
}
}
gcpp::Model ModelType() const { return model_type; }

// Returns error string or nullptr if OK.
const char* Validate() const {
const std::string model_type_lc = ToLower(model_type);
if (model_type.empty()) {
return "Missing --model flag, need to specify either 2b-pt, 7b-pt, "
"2b-it, 7b-it.";
}
if (model_type_lc != "2b-pt" && model_type_lc != "7b-pt" &&
model_type_lc != "2b-it" && model_type_lc != "7b-it") {
return "Model type must be 2b-pt, 7b-pt, 2b-it, 7b-it.";
}
const char* Validate() {
ModelTraining model_training;
const char* parse_result =
ParseModelTypeAndTraining(model_type_str, model_type, model_training);
if (parse_result) return parse_result;
if (weights.path.empty()) {
return "Missing --weights flag, a file for the uncompressed model.";
}
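
`ParseModelTypeAndTraining` itself is defined outside the files shown in this
diff. As a hedged sketch only, here is one way such a parser could look, using
the flag values listed in the help text below; the `GRIFFIN_2B` and
`ModelTraining` enumerator names are assumptions for illustration, not taken
from this PR:

```cpp
#include <algorithm>
#include <cctype>
#include <string>

// Stand-in enums: GEMMA_2B and GEMMA_7B appear in the code removed above;
// the remaining names are assumptions.
enum class Model { GEMMA_2B, GEMMA_7B, GRIFFIN_2B };
enum class ModelTraining { GEMMA_IT, GEMMA_PT };

// Same convention as the Validate() methods: returns nullptr on success,
// otherwise a static error string.
const char* ParseModelTypeAndTraining(const std::string& flag, Model& model,
                                      ModelTraining& training) {
  std::string lc = flag;
  std::transform(lc.begin(), lc.end(), lc.begin(),
                 [](unsigned char c) { return std::tolower(c); });
  if (lc == "2b-it" || lc == "2b-pt") {
    model = Model::GEMMA_2B;
  } else if (lc == "7b-it" || lc == "7b-pt") {
    model = Model::GEMMA_7B;
  } else if (lc == "gr2b-it" || lc == "gr2b-pt") {
    model = Model::GRIFFIN_2B;
  } else {
    return "Model type must be 2b-pt, 7b-pt, 2b-it, 7b-it, gr2b-pt or gr2b-it.";
  }
  training = (lc.compare(lc.size() - 3, 3, "-it") == 0)
                 ? ModelTraining::GEMMA_IT
                 : ModelTraining::GEMMA_PT;
  return nullptr;
}
```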
@@ -88,18 +67,21 @@ struct Args : public ArgsBase<Args> {

Path weights; // uncompressed weights file location
Path compressed_weights; // compressed weights file location
std::string model_type;
std::string model_type_str;
Model model_type;
size_t num_threads;

template <class Visitor>
void ForEach(const Visitor& visitor) {
visitor(weights, "weights", Path(),
"Path name of model weights (.sbs) file.\n"
" Required argument.");
visitor(model_type, "model", std::string(),
visitor(model_type_str, "model", std::string(),
"Model type\n 2b-it = 2B parameters, instruction-tuned\n "
"2b-pt = 2B parameters, pretrained\n 7b-it = 7B parameters "
"instruction-tuned\n 7b-pt = 7B parameters, pretrained\n "
"gr2b-it = griffin 2B parameters, instruction-tuned\n "
"gr2b-pt = griffin 2B parameters, pretrained\n "
" Required argument.");
visitor(compressed_weights, "compressed_weights", Path(),
"Path name where compressed weights file will be written.\n"
@@ -115,7 +97,7 @@ void ShowHelp(gcpp::Args& args) {
void ShowHelp(gcpp::Args& args) {
std::cerr
<< "Usage:\n./compress_weights --weights <path to uncompressed weights> "
" --model <model type> --compressed_weights <output path>\n";
" --model <model type> --compressed_weights <output path>\n";
std::cerr << "\n*Arguments*\n\n";
args.Help();
std::cerr << "\n";
93 changes: 91 additions & 2 deletions configs.h
@@ -30,6 +30,8 @@

#include <stddef.h>

#include <array>

// copybara:import_next_line:gemma_cpp
#include "compression/sfp.h"
#include "hwy/base.h" // hwy::bfloat16_t
@@ -45,34 +47,121 @@ namespace gcpp {
static constexpr size_t kSeqLen = GEMMA_MAX_SEQLEN;
static constexpr size_t kTopK = GEMMA_TOPK;

enum class LayerAttentionType {
kGemma,
kGriffinRecurrentBlock,
};

template <size_t kNum>
constexpr std::array<LayerAttentionType, kNum> FixedLayerConfig(
LayerAttentionType type) {
std::array<LayerAttentionType, kNum> config = {};
for (LayerAttentionType& l : config) {
l = type;
}
return config;
}

struct ConfigGemma7B {
static constexpr int kSeqLen = gcpp::kSeqLen;
static constexpr int kVocabSize = 256000;
static constexpr int kLayers = 28;
static constexpr std::array<LayerAttentionType, 28> kLayerConfig =
FixedLayerConfig<28>(LayerAttentionType::kGemma);
static constexpr int kLayers = kLayerConfig.size();
static constexpr int kModelDim = 3072;
static constexpr int kFFHiddenDim = 16 * 3072 / 2; // = 24576
static constexpr int kHeads = 16;
static constexpr int kKVHeads = 16; // standard MHA
static constexpr int kQKVDim = 256; // query size == key size == value size
static constexpr int kTopK = gcpp::kTopK;

// SSM config.
static constexpr int kConv1dWidth = 0;
static constexpr bool kFFBiases = false;
static constexpr bool kSoftmaxAttnOutputBiases = false;
static constexpr bool kUseHalfRope = false;
static constexpr bool kUseLocalAttention = false;
static constexpr bool kInterleaveQKV = true;
static constexpr int kNumTensorScales = 0;
using WeightT = GEMMA_WEIGHT_T;
};

struct ConfigGemma2B {
static constexpr int kSeqLen = gcpp::kSeqLen;
static constexpr int kVocabSize = 256000;
static constexpr int kLayers = 18;
static constexpr std::array<LayerAttentionType, 18> kLayerConfig =
FixedLayerConfig<18>(LayerAttentionType::kGemma);
static constexpr int kLayers = kLayerConfig.size();
static constexpr int kModelDim = 2048;
static constexpr int kFFHiddenDim = 16 * 2048 / 2; // = 16384
static constexpr int kHeads = 8;
static constexpr int kKVHeads = 1;
static constexpr int kQKVDim = 256; // query size == key size == value size
static constexpr int kTopK = gcpp::kTopK;

// SSM config.
static constexpr int kConv1dWidth = 0;
static constexpr bool kFFBiases = false;
static constexpr bool kSoftmaxAttnOutputBiases = false;
static constexpr bool kUseHalfRope = false;
static constexpr bool kUseLocalAttention = false;
static constexpr bool kInterleaveQKV = true;
static constexpr int kNumTensorScales = 0;
using WeightT = GEMMA_WEIGHT_T;
};

struct ConfigGriffin2B {
// Griffin uses local attention, so kSeqLen is actually the local attention
// window.
static constexpr int kSeqLen = 2048;
static constexpr int kVocabSize = 256000;
static constexpr std::array<LayerAttentionType, 26> kLayerConfig = {
LayerAttentionType::kGriffinRecurrentBlock,
LayerAttentionType::kGriffinRecurrentBlock,
LayerAttentionType::kGemma,
LayerAttentionType::kGriffinRecurrentBlock,
LayerAttentionType::kGriffinRecurrentBlock,
LayerAttentionType::kGemma,
LayerAttentionType::kGriffinRecurrentBlock,
LayerAttentionType::kGriffinRecurrentBlock,
LayerAttentionType::kGemma,
LayerAttentionType::kGriffinRecurrentBlock,
LayerAttentionType::kGriffinRecurrentBlock,
LayerAttentionType::kGemma,
LayerAttentionType::kGriffinRecurrentBlock,
LayerAttentionType::kGriffinRecurrentBlock,
LayerAttentionType::kGemma,
LayerAttentionType::kGriffinRecurrentBlock,
LayerAttentionType::kGriffinRecurrentBlock,
LayerAttentionType::kGemma,
LayerAttentionType::kGriffinRecurrentBlock,
LayerAttentionType::kGriffinRecurrentBlock,
LayerAttentionType::kGemma,
LayerAttentionType::kGriffinRecurrentBlock,
LayerAttentionType::kGriffinRecurrentBlock,
LayerAttentionType::kGemma,
LayerAttentionType::kGriffinRecurrentBlock,
LayerAttentionType::kGriffinRecurrentBlock,
};
static constexpr int kLayers = kLayerConfig.size();
static constexpr int kModelDim = 2560;
static constexpr int kFFHiddenDim = 7680;
static constexpr int kHeads = 10;
static constexpr int kKVHeads = 1;
static constexpr int kQKVDim = 256; // query size == key size == value size
static constexpr int kTopK = gcpp::kTopK;

// SSM config.
static constexpr int kConv1dWidth = 4;
static constexpr bool kFFBiases = true;
static constexpr bool kSoftmaxAttnOutputBiases = true;
static constexpr bool kUseHalfRope = true;
static constexpr bool kUseLocalAttention = true;
static constexpr bool kInterleaveQKV = false;
static constexpr int kNumTensorScales = 140;
using WeightT = GEMMA_WEIGHT_T;
};

} // namespace gcpp

#endif // THIRD_PARTY_GEMMA_CPP_CONFIGS_H_
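
Since the per-model configs are plain constexpr traits, the Griffin layer
layout can be checked at compile time. A small sketch, not part of the PR; it
assumes `configs.h` is on the include path and that the usual build-time
defines (GEMMA_MAX_SEQLEN, GEMMA_TOPK, GEMMA_WEIGHT_T) are provided:

```cpp
#include <cstddef>

// copybara:import_next_line:gemma_cpp
#include "configs.h"

namespace gcpp {

// Counts how many blocks of the given type ConfigGriffin2B declares.
constexpr size_t CountLayersOfType(LayerAttentionType type) {
  size_t n = 0;
  for (LayerAttentionType t : ConfigGriffin2B::kLayerConfig) {
    if (t == type) ++n;
  }
  return n;
}

static_assert(ConfigGriffin2B::kLayers == 26, "26 blocks in total");
static_assert(
    CountLayersOfType(LayerAttentionType::kGriffinRecurrentBlock) == 18,
    "two recurrent blocks per attention block, plus a trailing pair");
static_assert(CountLayersOfType(LayerAttentionType::kGemma) == 8,
              "eight local-attention blocks");

}  // namespace gcpp
```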