@@ -480,17 +480,47 @@ struct CombAddOpConversion : OpConversionPattern<AddOp> {
       return success();
     }
 
+    // Determine the adder architecture (it may be forced by an attribute).
+    auto arch = determineAdderArch(op, width);
+    if (arch == AdderArchitecture::RippleCarry)
+      return lowerRippleCarryAdder(op, inputs, rewriter);
+    return lowerParallelPrefixAdder(op, inputs, rewriter);
+  }
+
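+  // Supported adder architectures; they trade off depth, area, and fan-out.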
+  enum AdderArchitecture { RippleCarry, Sklansky, KoggeStone, BrentKung };
+  static AdderArchitecture determineAdderArch(Operation *op, int64_t width) {
+    auto strAttr = op->getAttrOfType<StringAttr>("synth.test.arch");
+    if (strAttr) {
+      return llvm::StringSwitch<AdderArchitecture>(strAttr.getValue())
+          .Case("SKLANSKY", Sklansky)
+          .Case("KOGGE-STONE", KoggeStone)
+          .Case("BRENT-KUNG", BrentKung)
+          .Case("RIPPLE-CARRY", RippleCarry);
+    }
+    // Otherwise, determine the architecture using width as a heuristic.
+    // TODO: Perform a more thorough analysis to motivate the choices, or
+    // implement an adder synthesis algorithm that constructs an optimal adder
+    // under the given timing constraints; see the work of Zimmermann.
+
+    // For very small adders, the overhead of a parallel prefix adder is
+    // likely not worth it.
     if (width < 8)
-      lowerRippleCarryAdder(op, inputs, rewriter);
-    else
-      lowerParallelPrefixAdder(op, inputs, rewriter);
+      return AdderArchitecture::RippleCarry;
 
-    return success();
+    // Sklansky is a good compromise for high performance, but its high
+    // fan-out may cause wiring congestion for very large adders.
+    if (width <= 32)
+      return AdderArchitecture::Sklansky;
+
+    // Kogge-Stone uses more area than Sklansky but has lower fan-out, so it
+    // may be preferable for larger adders.
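+    // (Brent-Kung, which trades extra depth for lower area and fan-out, is
+    // currently only selected explicitly via the attribute.)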
+    return AdderArchitecture::KoggeStone;
   }
 
   // Implement a basic ripple-carry adder for small bitwidths.
-  void lowerRippleCarryAdder(comb::AddOp op, ValueRange inputs,
-                             ConversionPatternRewriter &rewriter) const {
+  LogicalResult
+  lowerRippleCarryAdder(comb::AddOp op, ValueRange inputs,
+                        ConversionPatternRewriter &rewriter) const {
     auto width = op.getType().getIntOrFloatBitWidth();
     // Implement a naive ripple-carry full adder.
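+    // For each bit i: sum_i = a_i XOR b_i XOR carry_i, and the next carry is
+    // carry_{i+1} = (a_i AND b_i) OR (carry_i AND (a_i XOR b_i)).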
     Value carry;
@@ -528,13 +558,15 @@ struct CombAddOpConversion : OpConversionPattern<AddOp> {
                     << width << "\n");
 
     replaceOpWithNewOpAndCopyNamehint<comb::ConcatOp>(rewriter, op, results);
+    return success();
   }
 
-  // Implement a parallel prefix adder - with Kogge-Stone or Brent-Kung trees
-  // Will introduce unused signals for the carry bits but these will be removed
-  // by the AIG pass.
+  // Implement a parallel prefix adder using a Sklansky, Kogge-Stone, or
+  // Brent-Kung tree. This will introduce unused signals for the carry bits,
+  // but these will be removed by the AIG pass.
-  void lowerParallelPrefixAdder(comb::AddOp op, ValueRange inputs,
-                                ConversionPatternRewriter &rewriter) const {
+  LogicalResult
+  lowerParallelPrefixAdder(comb::AddOp op, ValueRange inputs,
+                           ConversionPatternRewriter &rewriter) const {
     auto width = op.getType().getIntOrFloatBitWidth();
 
     auto aBits = extractBits(rewriter, inputs[0]);
@@ -566,10 +598,33 @@ struct CombAddOpConversion : OpConversionPattern<AddOp> {
     // Create copies of p and g for the prefix computation
     SmallVector<Value> pPrefix = p;
     SmallVector<Value> gPrefix = g;
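+    // After the prefix network runs, gPrefix[i] is the carry generated out of
+    // bits [0..i] and pPrefix[i] is the group propagate over the same range.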
-    if (width < 32)
+
+    // Select the prefix-tree architecture (it may be forced by an attribute).
+    auto arch = determineAdderArch(op, width);
+
+    switch (arch) {
+    case AdderArchitecture::RippleCarry:
+      llvm_unreachable("Ripple-carry should be handled separately");
+    case AdderArchitecture::Sklansky:
+      lowerSklanskyPrefixTree(op, inputs, rewriter, pPrefix, gPrefix);
+      break;
+    case AdderArchitecture::KoggeStone:
       lowerKoggeStonePrefixTree(op, inputs, rewriter, pPrefix, gPrefix);
-    else
+      break;
+    case AdderArchitecture::BrentKung:
       lowerBrentKungPrefixTree(op, inputs, rewriter, pPrefix, gPrefix);
+      break;
+    }
 
     // Generate result sum bits
     // NOTE: The result is stored in reverse order.
@@ -592,6 +647,62 @@ struct CombAddOpConversion : OpConversionPattern<AddOp> {
       for (int64_t i = 1; i < width; ++i)
         llvm::dbgs() << "RES" << i << " = P" << i << " XOR G" << i - 1 << "\n";
     });
+
+    return success();
+  }
+
+  // Implement the Sklansky parallel prefix tree
+  // High fan-out, low depth, low area
+  void lowerSklanskyPrefixTree(comb::AddOp op, ValueRange inputs,
+                               ConversionPatternRewriter &rewriter,
+                               SmallVector<Value> &pPrefix,
+                               SmallVector<Value> &gPrefix) const {
+    auto width = op.getType().getIntOrFloatBitWidth();
+    SmallVector<Value> pPrefixNew = pPrefix;
+    SmallVector<Value> gPrefixNew = gPrefix;
+
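+    // Double-buffer the (P, G) updates so every node in a stage reads the
+    // previous stage's values. At each stage, the block ending at bit i - 1
+    // broadcasts its (G, P) pair to the next `stride` bits, which is what
+    // gives Sklansky its high fan-out. E.g. for width 8: stage 0 merges
+    // 1<-0, 3<-2, 5<-4, 7<-6; stage 1 merges {2,3}<-1, {6,7}<-5; stage 2
+    // merges {4,5,6,7}<-3.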
+    for (int64_t stride = 1; stride < width; stride *= 2) {
+      for (int64_t i = stride; i < width; i += 2 * stride) {
+        for (int64_t k = 0; k < stride && i + k < width; ++k) {
+          int64_t idx = i + k;
+          int64_t j = i - 1;
+          // Group generate: g_idx OR (p_idx AND g_j)
+          Value andPG = comb::AndOp::create(rewriter, op.getLoc(),
+                                            pPrefix[idx], gPrefix[j]);
+          gPrefixNew[idx] =
+              comb::OrOp::create(rewriter, op.getLoc(), gPrefix[idx], andPG);
+
+          // Group propagate: p_idx AND p_j
+          pPrefixNew[idx] = comb::AndOp::create(rewriter, op.getLoc(),
+                                                pPrefix[idx], pPrefix[j]);
+        }
+      }
+      pPrefix = pPrefixNew;
+      gPrefix = gPrefixNew;
+    }
+    LLVM_DEBUG({
+      int64_t stage = 0;
+      for (int64_t stride = 1; stride < width; stride *= 2) {
+        llvm::dbgs()
+            << "--------------------------------------- Sklansky Stage "
+            << stage << "\n";
+        for (int64_t i = stride; i < width; i += 2 * stride) {
+          for (int64_t k = 0; k < stride && i + k < width; ++k) {
+            int64_t idx = i + k;
+            int64_t j = i - 1;
+            // Group generate: g_idx OR (p_idx AND g_j)
+            llvm::dbgs() << "G" << idx << stage + 1 << " = G" << idx << stage
+                         << " OR (P" << idx << stage << " AND G" << j << stage
+                         << ")\n";
+
+            // Group propagate: p_idx AND p_j
+            llvm::dbgs() << "P" << idx << stage + 1 << " = P" << idx << stage
+                         << " AND P" << j << stage << "\n";
+          }
+        }
+        ++stage;
+      }
+    });
   }
 
   // Implement the Kogge-Stone parallel prefix tree