Commit f59eb2e

jan-wassenberg authored and copybara-github committed

Remove multi-package support from topology

Also no longer assume equal-sized clusters

PiperOrigin-RevId: 820164125

1 parent 9b6ed1a · commit f59eb2e

14 files changed: +293 −481 lines

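Across all of the files below, the change follows one pattern: APIs that previously took a (package, cluster) pair now take a single cluster index, and direct pool.Run calls become ParallelFor with an explicit ParallelismStrategy. A minimal sketch of the new call shapes, assembled only from identifiers visible in this diff; the function name and the include comment are illustrative, not part of the commit:

#include "util/threading_context.h"  // ThreadingContext; assumed to declare ParallelFor

namespace gcpp {

// Illustrative only: post-change call shapes collected from this commit.
void SketchClusterApi(ThreadingContext& ctx) {
  // Pools and topology are now addressed by a flat cluster index; no pkg_idx.
  const size_t num_clusters = ctx.pools.NumClusters();
  hwy::ThreadPool& cluster0 = ctx.pools.Cluster(0);
  const size_t node0 = ctx.topology.GetCluster(0).Node();
  (void)num_clusters;
  (void)cluster0;
  (void)node0;

  // Work is expressed via ParallelFor plus a strategy instead of calling
  // Run on a specific pool.
  ParallelFor(ParallelismStrategy::kWithinCluster, /*num_tasks=*/8, ctx,
              /*cluster_idx=*/0,
              [&](size_t task, size_t /*thread*/) { /* per-task work */ });
}

}  // namespace gcpp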

io/blob_compare.cc

Lines changed: 31 additions & 34 deletions
@@ -28,7 +28,6 @@
 #include "util/threading_context.h"
 #include "hwy/aligned_allocator.h"  // Span
 #include "hwy/base.h"
-#include "hwy/contrib/thread_pool/thread_pool.h"
 #include "hwy/timer.h"
 
 namespace gcpp {
@@ -104,27 +103,31 @@ BlobVec ReserveMemory(const RangeVec& ranges, BytePtr& all_blobs, size_t& pos) {
 // Reads one set of blobs in parallel (helpful if in disk cache).
 // Aborts on error.
 void ReadBlobs(BlobReader& reader, const RangeVec& ranges, BlobVec& blobs,
-               hwy::ThreadPool& pool) {
+               ThreadingContext& ctx, size_t cluster_idx) {
   HWY_ASSERT(reader.Keys().size() == blobs.size());
   HWY_ASSERT(ranges.size() == blobs.size());
-  pool.Run(0, blobs.size(), [&](size_t i, size_t /*thread*/) {
-    HWY_ASSERT(ranges[i].bytes == blobs[i].size());
-    reader.file().Read(ranges[i].offset, ranges[i].bytes, blobs[i].data());
-  });
+  ParallelFor(ParallelismStrategy::kWithinCluster, blobs.size(), ctx,
+              cluster_idx, [&](size_t i, size_t /*thread*/) {
+                HWY_ASSERT(ranges[i].bytes == blobs[i].size());
+                reader.file().Read(ranges[i].offset, ranges[i].bytes,
+                                   blobs[i].data());
+              });
 }
 
 // Parallelizes ReadBlobs across (two) packages, if available.
 void ReadBothBlobs(BlobReader& reader1, BlobReader& reader2,
                    const RangeVec& ranges1, const RangeVec& ranges2,
                    size_t total_bytes, BlobVec& blobs1, BlobVec& blobs2,
-                   NestedPools& pools) {
+                   ThreadingContext& ctx) {
   const double t0 = hwy::platform::Now();
-  HWY_WARN("Reading %zu GiB, %zux%zu cores: ", total_bytes >> 30,
-           pools.AllPackages().NumWorkers(), pools.Pool().NumWorkers());
-  pools.AllPackages().Run(0, 2, [&](size_t task, size_t pkg_idx) {
-    ReadBlobs(task ? reader2 : reader1, task ? ranges2 : ranges1,
-              task ? blobs2 : blobs1, pools.Pool(pkg_idx));
-  });
+  HWY_WARN("Reading %zu GiB, %zu clusters: ", total_bytes >> 30,
+           ctx.pools.NumClusters());
+  ParallelFor(ParallelismStrategy::kAcrossClusters, 2, ctx, 0,
+              [&](const size_t task, size_t cluster_idx) {
+                ReadBlobs(task ? reader1 : reader2, task ? ranges1 : ranges2,
+                          task ? blobs1 : blobs2, ctx, cluster_idx);
+              });
+
   const double t1 = hwy::platform::Now();
   HWY_WARN("%.1f GB/s\n", total_bytes / (t1 - t0) * 1E-9);
 }
@@ -181,29 +184,23 @@ size_t BlobDifferences(const ByteSpan data1, const ByteSpan data2,
 }
 
 void CompareBlobs(const KeyVec& keys, BlobVec& blobs1, BlobVec& blobs2,
-                  size_t total_bytes, NestedPools& pools) {
+                  size_t total_bytes, ThreadingContext& ctx) {
   HWY_WARN("Comparing %zu blobs in parallel: ", keys.size());
   const double t0 = hwy::platform::Now();
   std::atomic<size_t> blobs_equal{};
   std::atomic<size_t> blobs_diff{};
-  const IndexRangePartition ranges = StaticPartition(
-      IndexRange(0, keys.size()), pools.AllPackages().NumWorkers(), 1);
-  ParallelizeOneRange(
-      ranges, pools.AllPackages(),
-      [&](const IndexRange& range, size_t pkg_idx) {
-        pools.Pool(pkg_idx).Run(
-            range.begin(), range.end(), [&](size_t i, size_t /*thread*/) {
-              const size_t mismatches =
-                  BlobDifferences(blobs1[i], blobs2[i], keys[i]);
-              if (mismatches != 0) {
-                HWY_WARN("key %s has %zu mismatches in %zu bytes!\n",
-                         keys[i].c_str(), mismatches, blobs1[i].size());
-                blobs_diff.fetch_add(1);
-              } else {
-                blobs_equal.fetch_add(1);
-              }
-            });
-      });
+  ParallelFor(ParallelismStrategy::kHierarchical, keys.size(), ctx, 0,
+              [&](size_t i, size_t /*thread*/) {
+                const size_t mismatches =
+                    BlobDifferences(blobs1[i], blobs2[i], keys[i]);
+                if (mismatches != 0) {
+                  HWY_WARN("key %s has %zu mismatches in %zu bytes!\n",
+                           keys[i].c_str(), mismatches, blobs1[i].size());
+                  blobs_diff.fetch_add(1);
+                } else {
+                  blobs_equal.fetch_add(1);
+                }
+              });
   const double t1 = hwy::platform::Now();
   HWY_WARN("%.1f GB/s; total blob matches=%zu, mismatches=%zu\n",
            total_bytes / (t1 - t0) * 1E-9, blobs_equal.load(),
@@ -230,9 +227,9 @@ void ReadAndCompareBlobs(const Path& path1, const Path& path2) {
   ThreadingArgs args;
   ThreadingContext ctx(args);
   ReadBothBlobs(reader1, reader2, ranges1, ranges2, total_bytes, blobs1, blobs2,
-                ctx.pools);
+                ctx);
 
-  CompareBlobs(reader1.Keys(), blobs1, blobs2, total_bytes, ctx.pools);
+  CompareBlobs(reader1.Keys(), blobs1, blobs2, total_bytes, ctx);
 }
 
 }  // namespace gcpp
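The new ReadBothBlobs/ReadBlobs pair shows how the two strategies compose: the outer kAcrossClusters loop hands each task a cluster_idx, and the callee forwards it to kWithinCluster so the inner workers stay on that cluster. A condensed sketch of the same shape (not part of the commit; the function name, task counts, and lambda body are placeholders):

// Assumes a ThreadingContext& ctx as constructed in ReadAndCompareBlobs above.
void SketchNestedFanOut(ThreadingContext& ctx) {
  ParallelFor(ParallelismStrategy::kAcrossClusters, /*num_tasks=*/2, ctx,
              /*cluster_idx=*/0, [&](size_t task, size_t cluster_idx) {
                // Each outer task runs on one cluster; forward its index so
                // the inner loop stays on the same cluster.
                ParallelFor(ParallelismStrategy::kWithinCluster,
                            /*num_tasks=*/64, ctx, cluster_idx,
                            [&](size_t i, size_t /*thread*/) {
                              // Per-item work for (task, i), e.g. one read.
                            });
              });
}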

ops/dot_test.cc

Lines changed: 3 additions & 2 deletions
@@ -1124,8 +1124,9 @@ void TestAllDot() {
                          MatPadding::kOdd);
   std::array<DotStats, kMaxWorkers> all_stats;
 
-  ctx.pools.Cluster(0, 0).Run(
-      0, kReps, [&](const uint32_t rep, size_t thread) {
+  ParallelFor(
+      ParallelismStrategy::kWithinCluster, kReps, ctx, 0,
+      [&](size_t rep, size_t thread) {
         float* HWY_RESTRICT pa = a.Row(thread);
         float* HWY_RESTRICT pb = b.Row(thread);
         double* HWY_RESTRICT buf = bufs.Row(thread);

ops/matmul.cc

Lines changed: 3 additions & 3 deletions
@@ -351,7 +351,7 @@ std::vector<MMConfig> MMCandidates(const CacheInfo& cache, size_t M, size_t K,
 
 MatMulEnv::MatMulEnv(ThreadingContext& ctx)
     : ctx(ctx), A_BF(ctx.allocator), C_tiles(ctx) {
-  const size_t num_clusters = ctx.pools.AllClusters(/*pkg_idx=*/0).NumWorkers();
+  const size_t num_clusters = ctx.pools.NumClusters();
   per_cluster.resize(num_clusters);
   for (size_t cluster_idx = 0; cluster_idx < num_clusters; ++cluster_idx) {
     row_ptrs.push_back(hwy::AllocateAligned<uint8_t*>(kMaxBatchSize));  // C
@@ -368,7 +368,7 @@ void BindB(ThreadingContext& ctx, MatPtr& B, size_t sizeof_TC) {
 
   PROFILER_ZONE("Startup.BindB");
 
-  const size_t node = ctx.topology.GetCluster(/*pkg_idx=*/0, 0).Node();
+  const size_t node = ctx.topology.GetCluster(0).Node();
   uintptr_t begin = reinterpret_cast<uintptr_t>(B.RowBytes(0));
   uintptr_t end = begin + B.Rows() * B.Stride() * B.ElementBytes();
   // B row padding is less than the page size, so only bind the subset that
@@ -394,7 +394,7 @@ void BindC(ThreadingContext& ctx, MatPtr& C) {
   const size_t end = hwy::RoundDownTo(cols_c.end() * C.ElementBytes(),
                                       allocator.BasePageBytes());
 
-  const size_t node = ctx.topology.GetCluster(/*pkg_idx=*/0, 0).Node();
+  const size_t node = ctx.topology.GetCluster(0).Node();
   bool ok = true;
   for (size_t im = 0; im < C.Rows(); ++im) {
     ok &= allocator.BindMemory(C.RowBytes(im) + begin, end - begin, node);

ops/matmul.h

Lines changed: 9 additions & 14 deletions
@@ -105,8 +105,7 @@ struct MMParallelWithinCluster {
               size_t inner_tasks, size_t cluster_idx, const Func& func) const {
     HWY_DASSERT(1 <= inner_tasks && inner_tasks <= 4);
 
-    const size_t pkg_idx = 0;
-    hwy::ThreadPool& cluster = ctx.pools.Cluster(pkg_idx, cluster_idx);
+    hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
     const size_t base = ctx.Worker(cluster_idx);
 
     const IndexRangePartition ranges_n = StaticPartition(
@@ -122,8 +121,7 @@ struct MMParallelWithinCluster {
                   const IndexRangePartition& ranges_mc,
                   const IndexRangePartition& ranges_nc, size_t cluster_idx,
                   const Func& func) const {
-    const size_t pkg_idx = 0;
-    hwy::ThreadPool& cluster = ctx.pools.Cluster(pkg_idx, cluster_idx);
+    hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
     const size_t base = ctx.Worker(cluster_idx);
 
     // Low-batch: avoid Divide/Remainder.
@@ -143,8 +141,7 @@ struct MMParallelWithinCluster {
   template <class Func>
   void ForRangeMC(ThreadingContext& ctx, const IndexRange& range_mc,
                   size_t cluster_idx, const Func& func) const {
-    const size_t pkg_idx = 0;
-    hwy::ThreadPool& cluster = ctx.pools.Cluster(pkg_idx, cluster_idx);
+    hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
     const size_t base = ctx.Worker(cluster_idx);
 
     cluster.Run(
@@ -164,12 +161,11 @@ struct MMParallelHierarchical {
     HWY_DASSERT(caller_cluster_idx == 0);
 
     // Single cluster: parallel-for over static partition of `range_n`.
-    const size_t pkg_idx = 0;
-    hwy::ThreadPool& all_clusters = ctx.pools.AllClusters(pkg_idx);
+    hwy::ThreadPool& all_clusters = ctx.pools.AllClusters();
     const size_t num_clusters = all_clusters.NumWorkers();
     if (num_clusters == 1) {
       const size_t cluster_idx = 0;
-      hwy::ThreadPool& cluster = ctx.pools.Cluster(pkg_idx, cluster_idx);
+      hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
       const IndexRangePartition ranges_n = StaticPartition(
           range_n, cluster.NumWorkers() * inner_tasks, n_multiple);
       return ParallelizeOneRange(
@@ -185,7 +181,7 @@ struct MMParallelHierarchical {
     ParallelizeOneRange(
         ranges_n, all_clusters,
         [&](const IndexRange& n_range, const size_t cluster_idx) {
-          hwy::ThreadPool& cluster = ctx.pools.Cluster(pkg_idx, cluster_idx);
+          hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
           const size_t cluster_base = ctx.Worker(cluster_idx);
           // Parallel-for over sub-ranges of `cluster_range` within the cluster.
           const IndexRangePartition worker_ranges = StaticPartition(
@@ -206,17 +202,16 @@ struct MMParallelHierarchical {
                   const IndexRangePartition& ranges_nc,
                   HWY_MAYBE_UNUSED size_t caller_cluster_idx,
                   const Func& func) const {
-    const size_t pkg_idx = 0;
     HWY_DASSERT(caller_cluster_idx == 0);
 
-    hwy::ThreadPool& all_clusters = ctx.pools.AllClusters(pkg_idx);
+    hwy::ThreadPool& all_clusters = ctx.pools.AllClusters();
     // `all_clusters` is a pool with one worker per cluster in a package.
     const size_t num_clusters = all_clusters.NumWorkers();
     // Single (big) cluster: collapse two range indices into one parallel-for
     // to reduce the number of fork-joins.
     if (num_clusters == 1) {
       const size_t cluster_idx = 0;
-      hwy::ThreadPool& cluster = ctx.pools.Cluster(pkg_idx, cluster_idx);
+      hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
       // Low-batch: avoid Divide/Remainder.
       if (HWY_UNLIKELY(ranges_mc.NumTasks() == 1)) {
         return ParallelizeOneRange(
@@ -237,7 +232,7 @@ struct MMParallelHierarchical {
         ranges_nc, all_clusters,
         [&](const IndexRange range_nc, size_t cluster_idx) {
          const size_t cluster_base = ctx.Worker(cluster_idx);
-          hwy::ThreadPool& cluster = ctx.pools.Cluster(pkg_idx, cluster_idx);
+          hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
           ParallelizeOneRange(ranges_mc, cluster,
                               [&](const IndexRange& range_mc, size_t worker) {
                                 func(range_mc, range_nc, cluster_base + worker);
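The hierarchical strategy above keeps a two-level split: ParallelizeOneRange over AllClusters() (one worker per cluster) assigns each cluster a sub-range, and a StaticPartition of that sub-range over the cluster's own workers does the rest. A stripped-down sketch of that structure, using only identifiers from this hunk; it is illustrative and omits the single-cluster and low-batch special cases:

// Illustrative sketch of the two-level split used by MMParallelHierarchical.
template <class Func>
void SketchHierarchicalForN(ThreadingContext& ctx, const IndexRange& range_n,
                            size_t n_multiple, const Func& func) {
  hwy::ThreadPool& all_clusters = ctx.pools.AllClusters();
  // First level: one sub-range of `range_n` per cluster.
  const IndexRangePartition ranges_n =
      StaticPartition(range_n, all_clusters.NumWorkers(), n_multiple);
  ParallelizeOneRange(
      ranges_n, all_clusters,
      [&](const IndexRange& n_range, size_t cluster_idx) {
        hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
        const size_t cluster_base = ctx.Worker(cluster_idx);
        // Second level: split this cluster's range over its own workers.
        const IndexRangePartition worker_ranges =
            StaticPartition(n_range, cluster.NumWorkers(), n_multiple);
        ParallelizeOneRange(worker_ranges, cluster,
                            [&](const IndexRange& range, size_t worker) {
                              func(range, cluster_base + worker);
                            });
      });
}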

ops/matmul_test.cc

Lines changed: 15 additions & 22 deletions
@@ -191,29 +191,22 @@ HWY_INLINE void MatMulSlow(const MatPtrT<TA> A, const MatPtrT<TB> B,
   const IndexRange all_cols_c(0, C.Cols());
 
   NestedPools& pools = env.ctx.pools;
-  hwy::ThreadPool& all_packages = pools.AllPackages();
-  const IndexRangePartition get_row_c =
-      StaticPartition(all_rows_c, all_packages.NumWorkers(), 1);
+  hwy::ThreadPool& all_clusters = pools.AllClusters();
+  const size_t multiple = env.ctx.allocator.QuantumBytes() / sizeof(TB);
+  const IndexRangePartition get_col_c =
+      StaticPartition(all_cols_c, all_clusters.NumWorkers(), multiple);
   ParallelizeOneRange(
-      get_row_c, all_packages,
-      [&](const IndexRange& rows_c, size_t package_idx) HWY_ATTR {
-        hwy::ThreadPool& all_clusters = pools.AllClusters(package_idx);
-        const size_t multiple = env.ctx.allocator.QuantumBytes() / sizeof(TB);
-        const IndexRangePartition get_col_c =
-            StaticPartition(all_cols_c, all_clusters.NumWorkers(), multiple);
-        ParallelizeOneRange(
-            get_col_c, all_clusters,
-            [&](const IndexRange& cols_c, size_t cluster_idx) HWY_ATTR {
-              for (size_t r : rows_c) {
-                TC* HWY_RESTRICT C_row = C.Row(r);
-                for (size_t c : cols_c) {
-                  const float add = add_row ? add_row[c] : 0.0f;
-                  const float dot =
-                      Dot(df, b_span, c * B.Stride(), A.Row(r), A.Cols());
-                  C_row[c] = hwy::ConvertScalarTo<TC>(add + scale * dot);
-                }
-              }
-            });
+      get_col_c, all_clusters,
+      [&](const IndexRange& cols_c, size_t cluster_idx) HWY_ATTR {
+        for (size_t r : all_rows_c) {
+          TC* HWY_RESTRICT C_row = C.Row(r);
+          for (size_t c : cols_c) {
+            const float add = add_row ? add_row[c] : 0.0f;
+            const float dot =
+                Dot(df, b_span, c * B.Stride(), A.Row(r), A.Cols());
+            C_row[c] = hwy::ConvertScalarTo<TC>(add + scale * dot);
+          }
+        }
       });
 }
 
util/allocator.cc

Lines changed: 1 addition & 1 deletion
@@ -139,7 +139,7 @@ CacheInfo::CacheInfo(const BoundedTopology& topology) {
 
   step_bytes_ = HWY_MAX(line_bytes_, vector_bytes_);
 
-  const BoundedTopology::Cluster& cluster = topology.GetCluster(0, 0);
+  const BoundedTopology::Cluster& cluster = topology.GetCluster(0);
   if (const hwy::Cache* caches = hwy::DataCaches()) {
     l1_bytes_ = caches[1].size_kib << 10;
     l2_bytes_ = caches[2].size_kib << 10;

util/allocator.h

Lines changed: 1 addition & 1 deletion
@@ -169,7 +169,7 @@ class Allocator {
   bool ShouldBind() const { return should_bind_; }
 
   // Attempts to move(!) `[p, p + bytes)` to the given NUMA node, which is
-  // typically `BoundedTopology::GetCluster(package_idx, cluster_idx).node`.
+  // typically `BoundedTopology::GetCluster(cluster_idx).node`.
   // Writes zeros to SOME of the memory. Only call if `ShouldBind()`.
   // `p` and `bytes` must be multiples of `QuantumBytes()`.
   bool BindMemory(void* p, size_t bytes, size_t node) const;
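A small usage sketch of the contract described in that comment, assuming ctx.allocator is the Allocator declared above and using GetCluster(cluster_idx).Node() as the comment suggests; the function, its arguments, and the rounding choice are placeholders, not part of the commit:

// Illustrative only: honor the BindMemory preconditions from the comment.
void SketchBind(ThreadingContext& ctx, void* p, size_t bytes,
                size_t cluster_idx) {
  const Allocator& allocator = ctx.allocator;
  if (!allocator.ShouldBind()) return;
  // Caller ensures `p` is QuantumBytes()-aligned; round `bytes` down to a
  // multiple of QuantumBytes() as required.
  bytes = hwy::RoundDownTo(bytes, allocator.QuantumBytes());
  const size_t node = ctx.topology.GetCluster(cluster_idx).Node();
  allocator.BindMemory(p, bytes, node);  // moves(!) pages, zeros some memory
}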
