Commit 6dc4fc8

emlinmeta-codesync[bot] authored and committed
Back out "change from first element to a random element for cache missing items" (#5048)
Summary:
Pull Request resolved: #5048
X-link: https://github.com/facebookresearch/FBGEMM/pull/2058

Original commit changeset: 23e7f0d1e249
Original Phabricator Diff: D83612329

Since D83612329 was released, the model has started crashing intermittently. Revert the change and, in parallel, look for better optimization approaches.

Reviewed By: jma99fb

Differential Revision: D85404797

fbshipit-source-id: 5dad6196ead14cb1e5e8845bab12a30a058cbf75
1 parent dda8b12 · commit 6dc4fc8

File tree: 2 files changed, +4 −158 lines

fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_inference_embedding.h

Lines changed: 4 additions & 34 deletions
@@ -22,7 +22,6 @@
 #include <thrift/lib/cpp2/protocol/CompactProtocol.h>
 #include <thrift/lib/cpp2/protocol/Serializer.h>
 #include <torch/script.h>
-#include <random>
 #include "common/time/Time.h"
 
 #include "../ssd_split_embeddings_cache/initializer.h"
@@ -435,36 +434,9 @@ class DramKVInferenceEmbedding {
                   before_read_lock_ts;
 
               if (!wlmap->empty() && !disable_random_init_) {
-                // Simple block-based randomization using get_block with
-                // cursor
-                auto* pool = kv_store_.pool_by(shard_id);
-
-                // Random starting cursor based on map size for good
-                // entropy
-                size_t random_start =
-                    folly::Random::rand32(wlmap->size());
-
-                // Try to find a used block starting from random
-                // position
-                weight_type* block = nullptr;
-                for (int attempts = 0; attempts < 16; ++attempts) {
-                  block = pool->template get_block<weight_type>(
-                      random_start + attempts);
-                  if (block != nullptr) {
-                    // Block is used (not null)
-                    row_storage_data_ptr =
-                        FixedBlockPool::data_ptr<weight_type>(block);
-                    break;
-                  }
-                }
-
-                // Fallback: if no used block found, use first element
-                // from map
-                if (block == nullptr) {
-                  row_storage_data_ptr =
-                      FixedBlockPool::data_ptr<weight_type>(
-                          wlmap->begin()->second);
-                }
+                row_storage_data_ptr =
+                    FixedBlockPool::data_ptr<weight_type>(
+                        wlmap->begin()->second);
               } else {
                 const auto& init_storage =
                     initializers_[shard_id]->row_storage_;
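For context on what is being backed out: the deleted branch tried to diversify cache-miss initialization by probing the shard's block pool from a random cursor, falling back to the map's first element when no used block turned up within 16 probes. Below is a minimal standalone sketch of that probe-with-fallback pattern; Pool, get_block, pick_source_row, and the int-to-row map are simplified hypothetical stand-ins, not FBGEMM's actual FixedBlockPool or kv_store_ API.

#include <cstddef>
#include <map>
#include <random>
#include <vector>

// Hypothetical, simplified stand-in for a fixed-block pool: a slot is
// "used" when it holds a non-null row pointer.
struct Pool {
  std::vector<float*> slots;
  float* get_block(std::size_t cursor) const {
    return slots.empty() ? nullptr : slots[cursor % slots.size()];
  }
};

// Sketch of the backed-out policy: start at a random cursor, probe up
// to 16 slots for a used block, and fall back to the first element of
// the key->row map if every probe comes back null.
float* pick_source_row(const Pool& pool, const std::map<int, float*>& rows) {
  if (rows.empty()) {
    return nullptr;  // the real code only reaches this path when non-empty
  }
  static std::mt19937 rng{std::random_device{}()};
  std::uniform_int_distribution<std::size_t> dist(0, rows.size() - 1);
  const std::size_t random_start = dist(rng);
  for (int attempts = 0; attempts < 16; ++attempts) {
    if (float* block = pool.get_block(random_start + attempts)) {
      return block;  // found a used block near the random cursor
    }
  }
  return rows.begin()->second;  // fallback: deterministic first element
}

int main() {
  float a = 1.0f, b = 2.0f;
  Pool pool{{nullptr, &a, nullptr, &b}};
  std::map<int, float*> rows{{7, &a}, {9, &b}};
  float* row = pick_source_row(pool, rows);
  return row == nullptr;  // expect a valid row pointer here
}

The revert collapses all of this back to the deterministic wlmap->begin()->second, trading that initialization entropy for the stability called out in the summary.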
@@ -569,9 +541,7 @@ class DramKVInferenceEmbedding {
               read_lookup_cache_total_duration / num_shards_;
           read_acquire_lock_avg_duration_ +=
               read_acquire_lock_total_duration / num_shards_;
-          LOG_EVERY_MS(INFO, 5000)
-              << "get_kv_db_async total read_missing_load per batch: "
-              << read_missing_load;
+          read_missing_load_avg_ += read_missing_load / num_shards_;
           return std::vector<folly::Unit>(results.size());
         });
       };
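The last hunk also changes how read_missing_load is reported: rather than a rate-limited log line, each of the num_shards_ shard tasks now adds read_missing_load / num_shards_ into an accumulator, so the accumulated value ends up as the mean missing load across shards. A tiny sketch of that accumulation identity, with hypothetical per-shard values standing in for the real counters:

#include <cstdio>
#include <vector>

int main() {
  // Hypothetical per-shard cache-miss loads for one batch; in the real
  // code each shard task computes its own read_missing_load.
  const std::vector<double> per_shard_load{12, 7, 9, 4, 11, 6, 8, 10};
  const double num_shards = static_cast<double>(per_shard_load.size());

  // Same shape as the added line in the diff: every shard contributes
  // load / num_shards, so the running sum is the cross-shard mean.
  double read_missing_load_avg = 0.0;
  for (double load : per_shard_load) {
    read_missing_load_avg += load / num_shards;
  }
  std::printf("average read_missing_load per shard: %.3f\n",
              read_missing_load_avg);  // prints 8.375 for these values
  return 0;
}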

fbgemm_gpu/test/tbe/dram_kv/dram_kv_inference_test.py

Lines changed: 0 additions & 124 deletions
@@ -261,127 +261,3 @@ def reader_thread() -> None:  # pyre-ignore
         self.assertTrue(equal_one_of(embs[5, :4], possible_embs))
         reader_thread.join()
         self.assertFalse(reader_failed_event.is_set())
-
-    def test_randomized_cache_miss_initialization(self) -> None:
-        """Test that cache misses use randomized data from existing blocks."""
-        num_shards = 8
-        uniform_init_lower: float = -0.01
-        uniform_init_upper: float = 0.01
-
-        # Create DRAM KV inference cache
-        kv_embedding_cache = torch.classes.fbgemm.DramKVEmbeddingInferenceWrapper(
-            num_shards,
-            uniform_init_lower,
-            uniform_init_upper,
-            False,  # disable_random_init
-        )
-        kv_embedding_cache.init(
-            [(32, 4, SparseType.FP16.as_int())],
-            32,
-            4,
-            torch.tensor([0, 100], dtype=torch.int64),
-        )
-
-        # Setup: Populate the cache with many initial values for better randomization diversity
-        # Use 400 setup items to ensure each shard (8 shards) gets ~50 entries for good randomization
-        setup_indices = torch.arange(0, 400, dtype=torch.int64)  # 400 setup items
-        setup_weights = torch.randint(
-            1, 255, (400, 32), dtype=torch.uint8
-        )  # Non-zero values to ensure randomization source
-        print(f"setup_weights: {setup_weights}")
-
-        # Populate cache
-        kv_embedding_cache.set_embeddings(setup_indices, setup_weights)
-
-        # Execute: Request cache misses multiple times - these should get randomized initialization
-        # Use indices outside the range [0, 399] to ensure they are actual cache misses
-        miss_indices = torch.tensor([500, 501, 502, 503, 504], dtype=torch.int64)
-
-        # Get the cache miss results multiple times to check for randomization
-        results = []
-        for _ in range(5):
-            current_output = kv_embedding_cache.get_embeddings(miss_indices)
-            results.append(current_output.clone())
-
-        # Assert: Verify that randomization occurs
-        # The results should not all be identical if randomization is working
-        all_identical = True
-        for i in range(1, len(results)):
-            if not torch.equal(
-                results[0][:, :4], results[i][:, :4]
-            ):  # Only check first 4 columns (actual data)
-                all_identical = False
-                break
-
-        # Since we're using randomization, results should be different
-        # Note: There's a small chance they could be identical by random chance,
-        # but with 5 trials of 5 vectors of 4 bytes, this is extremely unlikely
-        self.assertFalse(
-            all_identical,
-            "Randomized cache miss initialization should produce different results",
-        )
-
-        # All results should be non-zero (since we populated the cache with non-zero random values)
-        for result in results:
-            # Check that at least some values are non-zero (indicating data came from existing blocks)
-            self.assertTrue(
-                torch.any(result[:, :4] != 0),
-                "Cache miss results should contain non-zero values when cache has data",
-            )
-
-    def test_zero_cache_miss_initialization_with_embedding_cache_mode(self) -> None:
-        """Test that cache misses return all zero values when embedding_cache_mode=True."""
-        num_shards = 8
-        uniform_init_lower: float = -0.01
-        uniform_init_upper: float = 0.01
-
-        # Setup: Create DRAM KV inference cache with embedding_cache_mode=True (zero initialization)
-        kv_embedding_cache = torch.classes.fbgemm.DramKVEmbeddingInferenceWrapper(
-            num_shards,
-            uniform_init_lower,
-            uniform_init_upper,
-            True,  # embedding_cache_mode=True for zero initialization
-        )
-        kv_embedding_cache.init(
-            [(32, 4, SparseType.FP16.as_int())],
-            32,
-            4,
-            torch.tensor([0, 100], dtype=torch.int64),
-        )
-
-        # Populate the cache with some initial non-zero values to ensure zero initialization
-        # is not just due to empty cache
-        setup_indices = torch.arange(0, 50, dtype=torch.int64)
-        setup_weights = torch.randint(
-            1, 255, (50, 32), dtype=torch.uint8
-        )  # Non-zero values
-        kv_embedding_cache.set_embeddings(setup_indices, setup_weights)
-
-        # Execute: Request cache misses - these should get zero initialization due to embedding_cache_mode=True
-        # Use indices outside the range [0, 49] to ensure they are actual cache misses
-        miss_indices = torch.tensor([100, 101, 102, 103, 104], dtype=torch.int64)
-        results = []
-
-        # Get cache miss results multiple times to ensure consistent behavior
-        for _ in range(3):
-            current_output = kv_embedding_cache.get_embeddings(miss_indices)
-            results.append(current_output.clone())
-
-        # Assert: Verify that all cache miss results are zeros when embedding_cache_mode=True
-        expected_zeros = torch.zeros((5, 32), dtype=torch.uint8)
-
-        for i, result in enumerate(results):
-            # Check that all cache miss results are zero
-            self.assertTrue(
-                torch.equal(result, expected_zeros),
-                f"Cache miss results should be all zeros when embedding_cache_mode=True, "
-                f"but got non-zero values in iteration {i}: {result[:, :4]}",
-            )
-
-        # Additional verification: all results should be identical since they're all zeros
-        for i in range(1, len(results)):
-            self.assertTrue(
-                torch.equal(results[0], results[i]),
-                f"All zero cache miss results should be identical across calls, "
-                f"but results[0] != results[{i}]",
-            )
