
Commit 7976dc3

Use two-level dedup in named data store (#18351)
Replace the single SHA-256 hash with a two-level approach:

1. Fast fingerprint (length + first 32 bytes) for cheap rejection.
2. SHA-256 only when the fingerprint matches, to confirm without full byte comparison.

For a 35B MoE model with ~29 GB of named data where most buffers are unique, the fingerprint rejects non-matches instantly. SHA-256 is only computed on the rare fingerprint match, avoiding the ~98 s cost of hashing everything upfront. Fingerprint collisions are handled by storing a list of candidate buffer indices per fingerprint, so no dedup opportunities are lost.

Test plan:
- All 12 tests pass in test_named_data_store.py.
- Added test_fingerprint_collision: same fingerprint, different content produces separate buffers.
- Added test_fingerprint_collision_with_dedup: after a collision, a true duplicate of an earlier blob still dedupes correctly.
1 parent: 4f80b77 · commit: 7976dc3

2 files changed: 73 additions & 28 deletions
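
The scheme is easiest to see in isolation. Below is a minimal Python sketch of the two-level dedup described above, distilled from the diff that follows; the names (dedup_add, fingerprint_to_idx, sha_cache) are illustrative only, not the module's API. As a sanity check on the quoted numbers: hashing ~29 GB at a typical few hundred MB/s of single-threaded SHA-256 throughput is indeed on the order of the ~98 s mentioned.

import hashlib
from typing import Dict, List, Tuple

# Illustrative names; the real implementation lives in
# exir/_serialize/_named_data_store.py (see the diff below).
buffers: List[bytes] = []
fingerprint_to_idx: Dict[Tuple[int, bytes], List[int]] = {}
sha_cache: Dict[int, bytes] = {}  # SHA-256 per buffer index, filled lazily


def dedup_add(data: bytes) -> int:
    """Return the index of an identical existing buffer, else append data."""
    # Level 1: (length, first 32 bytes) is O(1) and rejects most non-matches.
    fp = (len(data), data[:32])
    candidates = fingerprint_to_idx.get(fp)
    if candidates is not None:
        # Level 2: hash only on a fingerprint match, to confirm equality
        # without a full byte-by-byte comparison.
        sha = hashlib.sha256(data).digest()
        for idx in candidates:
            if idx not in sha_cache:
                sha_cache[idx] = hashlib.sha256(buffers[idx]).digest()
            if sha == sha_cache[idx]:
                return idx
    # New content: record it under its fingerprint so later duplicates
    # (and later colliding blobs) can find it.
    idx = len(buffers)
    buffers.append(data)
    fingerprint_to_idx.setdefault(fp, []).append(idx)
    return idx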


exir/_serialize/_named_data_store.py

Lines changed: 36 additions & 28 deletions
@@ -8,7 +8,7 @@
 
 import hashlib
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional, Tuple, Union
 
 import torch
 from executorch.exir._serialize.data_serializer import DataEntry
@@ -75,13 +75,11 @@ class NamedDataStore:
     # Map of {filename: {key: DataEntry}}.
     external_data: Dict[str, Dict[str, DataEntry]]
 
-    # Cache of the data hash for deduplication.
-    # Use a hash instead of the data as a key because a sha256 collision is
-    # unlikely, and the data may be large.
-    data_hash_to_buffer_idx: Dict[bytes, int]
-    # Cache of the key to buffer idx to ensure uniqueness.
-    # If a key is added multiple times, check the buffer idx to ensure that the
-    # data is identical too.
+    # Fast fingerprint for dedup: (length, first 32 bytes) -> buffer indices.
+    fingerprint_to_buffer_idx: Dict[Tuple[int, bytes], List[int]]
+    # SHA-256 digest per buffer index, computed lazily on first dedup check.
+    buffer_sha256: Dict[int, bytes]
+    # Cache of key to buffer idx to detect duplicate key registration.
     key_to_buffer_idx: Dict[str, int]
 
     def __init__(self) -> None:
@@ -91,10 +89,17 @@ def __init__(self) -> None:
         self.buffers = []
         self.pte_data = {}
         self.external_data = {}
-
-        self.data_hash_to_buffer_idx = {}
+        self.fingerprint_to_buffer_idx = {}
+        self.buffer_sha256 = {}
         self.key_to_buffer_idx = {}
 
+    def _get_buffer_sha256(self, buffer_idx: int) -> bytes:
+        sha = self.buffer_sha256.get(buffer_idx)
+        if sha is None:
+            sha = hashlib.sha256(self.buffers[buffer_idx]).digest()
+            self.buffer_sha256[buffer_idx] = sha
+        return sha
+
     def _add_named_data_to_map(
         self,
         key: str,
@@ -119,31 +124,34 @@ def _add_named_data_to_map(
             ValueError: when the key exists in the store, and corresponding data
                 is different.
         """
-        # Get data hash.
-        hashed = hashlib.sha256(data).digest()
-
         # Check if the key exists.
         buffer_idx = self.key_to_buffer_idx.get(key, -1)
-        # If the key exists, the corresponding data must be identical.
-        if (
-            buffer_idx != -1
-            and self.data_hash_to_buffer_idx.get(hashed, -1) != buffer_idx
-        ):
-            raise ValueError(
-                f"Duplicate key {key} with different data. "
-                f"Existing data size: {len(self.buffers[buffer_idx])} bytes. "
-                f"New data size: {len(data)} bytes."
-            )
+        if buffer_idx != -1:
+            if data != self.buffers[buffer_idx]:
+                raise ValueError(
+                    f"Duplicate key {key} with different data. "
+                    f"Existing data size: {len(self.buffers[buffer_idx])} bytes. "
+                    f"New data size: {len(data)} bytes."
+                )
         else:
-            # Key doesn't exist; check if the data exists.
-            buffer_idx = self.data_hash_to_buffer_idx.get(hashed, -1)
+            # Two-level dedup: cheap fingerprint rejects non-matches fast,
+            # SHA-256 confirms matches without full byte comparison.
+            fingerprint = (len(data), data[:32])
+            candidates = self.fingerprint_to_buffer_idx.get(fingerprint)
+            if candidates is not None:
+                new_sha = hashlib.sha256(data).digest()
+                for candidate in candidates:
+                    if new_sha == self._get_buffer_sha256(candidate):
+                        buffer_idx = candidate
+                        break
+
         if buffer_idx == -1:
-            # The data doesn't exist; add it to the data store.
            buffer_idx = len(self.buffers)
            self.buffers.append(data)
-            self.data_hash_to_buffer_idx[hashed] = buffer_idx
+            self.fingerprint_to_buffer_idx.setdefault(fingerprint, []).append(
+                buffer_idx
+            )
 
-        # Add key to the map and the key cache.
         local_key_to_buffer_idx[key] = DataEntry(
             buffer_index=buffer_idx,
             alignment=alignment,
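
For reference, a hedged usage sketch of the store after this change. The import path is inferred from the file header above, and the two trailing None arguments mirror the alignment/tag arguments used by the tests below:

from executorch.exir._serialize._named_data_store import NamedDataStore

store = NamedDataStore()
blob = b"\x00" * 1024

# Same bytes under two keys: the fingerprints match, SHA-256 confirms,
# and both keys end up pointing at a single shared buffer.
store.add_named_data("weight_a", blob, None, None)
store.add_named_data("weight_b", blob, None, None)

output = store.get_named_data_store_output()
assert len(output.buffers) == 1
assert output.pte_data["weight_a"].buffer_index == 0
assert output.pte_data["weight_b"].buffer_index == 0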

exir/_serialize/test/test_named_data_store.py

Lines changed: 37 additions & 0 deletions
@@ -210,3 +210,40 @@ def test_merge_duplicate_error(self) -> None:
         # Merge store2 into store1 raises error as key1 is already in store1
         # with different data.
         self.assertRaises(ValueError, store1.merge_named_data_store, output2)
+
+    def test_fingerprint_collision(self) -> None:
+        """Two blobs with same length and first 32 bytes but different content
+        must not be deduped."""
+        store = NamedDataStore()
+        prefix = b"A" * 32
+        data1 = prefix + b"X" * 100
+        data2 = prefix + b"Y" * 100
+        self.assertEqual(len(data1), len(data2))
+
+        store.add_named_data("key1", data1, None, None)
+        store.add_named_data("key2", data2, None, None)
+
+        output = store.get_named_data_store_output()
+        self.assertEqual(len(output.buffers), 2)
+        self.assertEqual(output.buffers[0], data1)
+        self.assertEqual(output.buffers[1], data2)
+        self.assertEqual(output.pte_data["key1"].buffer_index, 0)
+        self.assertEqual(output.pte_data["key2"].buffer_index, 1)
+
+    def test_fingerprint_collision_with_dedup(self) -> None:
+        """After a fingerprint collision, a true duplicate of the first blob
+        must still be deduped correctly."""
+        store = NamedDataStore()
+        prefix = b"A" * 32
+        data1 = prefix + b"X" * 100
+        data2 = prefix + b"Y" * 100
+
+        store.add_named_data("key1", data1, None, None)
+        store.add_named_data("key2", data2, None, None)
+        store.add_named_data("key3", data1, None, None)  # duplicate of key1
+
+        output = store.get_named_data_store_output()
+        self.assertEqual(len(output.buffers), 2)
+        self.assertEqual(output.pte_data["key1"].buffer_index, 0)
+        self.assertEqual(output.pte_data["key2"].buffer_index, 1)
+        self.assertEqual(output.pte_data["key3"].buffer_index, 0)
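
Since the test class appears to be a standard unittest.TestCase, the new tests can presumably be run directly with the unittest module from the repository root; the project's own runner (e.g. pytest or buck) may differ:

python -m unittest exir._serialize.test.test_named_data_store -v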
