
import ray

+from graphgen.common import init_storage
from graphgen.models import (
    CSVReader,
    JSONReader,
@@ -51,6 +52,7 @@ def read(
    input_path: Union[str, List[str]],
    allowed_suffix: Optional[List[str]] = None,
    working_dir: Optional[str] = "cache",
+    kv_backend: str = "rocksdb",
    parallelism: int = 4,
    recursive: bool = True,
    read_nums: Optional[int] = None,
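
A quick usage sketch for the new kv_backend parameter; only the signature above is confirmed by this diff, and the import path is a hypothetical placeholder:

    # Hypothetical import path for the read entrypoint shown in this diff.
    from graphgen.read import read

    ds = read(
        input_path="./docs",
        allowed_suffix=["pdf", "txt"],
        working_dir="cache",
        kv_backend="rocksdb",  # backend for the key-value read cache
        parallelism=4,
    )
    print(ds.count())
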
@@ -62,71 +64,79 @@ def read(
    :param input_path: File or directory path(s) to read from
    :param allowed_suffix: List of allowed file suffixes (e.g., ['pdf', 'txt'])
    :param working_dir: Directory to cache intermediate files (PDF processing)
+    :param kv_backend: Backend for the key-value read cache (e.g., "rocksdb")
    :param parallelism: Number of parallel workers
    :param recursive: Whether to scan directories recursively
    :param read_nums: Limit the number of documents to read
    :param reader_kwargs: Additional kwargs passed to readers
    :return: Ray Dataset containing all documents
    """
+
+    read_cache = init_storage(
+        backend=kv_backend, working_dir=working_dir, namespace="read"
+    )
    try:
        # 1. Scan all paths to discover files
        logger.info("[READ] Scanning paths: %s", input_path)
-        scanner = ParallelFileScanner(
-            cache_dir=working_dir,
+        with ParallelFileScanner(
+            read_cache=read_cache,
            allowed_suffix=allowed_suffix,
            rescan=False,
            max_workers=parallelism if parallelism > 0 else 1,
-        )
-
-        all_files = []
-        scan_results = scanner.scan(input_path, recursive=recursive)
-
-        for result in scan_results.values():
-            all_files.extend(result.get("files", []))
-
-        logger.info("[READ] Found %d files to process", len(all_files))
-
-        if not all_files:
-            raise ValueError("No files found to read.")
-
-        # 2. Group files by suffix to use appropriate reader
-        files_by_suffix = {}
-        for file_info in all_files:
-            suffix = Path(file_info["path"]).suffix.lower().lstrip(".")
-            if allowed_suffix and suffix not in [
-                s.lower().lstrip(".") for s in allowed_suffix
-            ]:
-                continue
-            files_by_suffix.setdefault(suffix, []).append(file_info["path"])
-
-        # 3. Create read tasks
-        read_tasks = []
-        for suffix, file_paths in files_by_suffix.items():
-            reader = _build_reader(suffix, working_dir, **reader_kwargs)
-            ds = reader.read(file_paths)
-            read_tasks.append(ds)
-
-        # 4. Combine all datasets
-        if not read_tasks:
-            raise ValueError("No datasets created from the provided files.")
-
-        if len(read_tasks) == 1:
-            combined_ds = read_tasks[0]
-        else:
-            combined_ds = read_tasks[0].union(*read_tasks[1:])
-
-        combined_ds = combined_ds.map(
-            lambda record: {
-                **record,
-                "_doc_id": compute_mm_hash(record, prefix="doc-"),
-            }
-        )
-
-        if read_nums is not None:
-            combined_ds = combined_ds.limit(read_nums)
-
-        logger.info("[READ] Successfully read files from %s", input_path)
-        return combined_ds
+        ) as scanner:
+            all_files = []
+            scan_results = scanner.scan(input_path, recursive=recursive)
+
+            for result in scan_results.values():
+                all_files.extend(result.get("files", []))
+
+            logger.info("[READ] Found %d files to process", len(all_files))
+
+            if not all_files:
+                raise ValueError("No files found to read.")
+
+            # 2. Group files by suffix so each group gets the appropriate reader
+            files_by_suffix = {}
+            for file_info in all_files:
+                suffix = Path(file_info["path"]).suffix.lower().lstrip(".")
+                if allowed_suffix and suffix not in [
+                    s.lower().lstrip(".") for s in allowed_suffix
+                ]:
+                    continue
+                files_by_suffix.setdefault(suffix, []).append(file_info["path"])
+
+            # 3. Create one read task per suffix group
+            read_tasks = []
+            for suffix, file_paths in files_by_suffix.items():
+                reader = _build_reader(suffix, working_dir, **reader_kwargs)
+                ds = reader.read(file_paths)
+                read_tasks.append(ds)
+
+            # 4. Combine all datasets
+            if not read_tasks:
+                raise ValueError("No datasets created from the provided files.")
+
+            if len(read_tasks) == 1:
+                combined_ds = read_tasks[0]
+            else:
+                combined_ds = read_tasks[0].union(*read_tasks[1:])
+
+            combined_ds = combined_ds.map(
+                lambda record: {
+                    **record,
+                    "_trace_id": compute_mm_hash(record, prefix="doc-"),
+                }
+            )
+
+            if read_nums is not None:
+                combined_ds = combined_ds.limit(read_nums)
+
+            # Log one sample record for debugging
+            for i, item in enumerate(combined_ds.take(1)):
+                logger.debug("[READ] Sample record %d: %s", i, item)
+
+            logger.info("[READ] Successfully read files from %s", input_path)
+            return combined_ds

    except Exception as e:
        logger.error("[READ] Failed to read files from %s: %s", input_path, e)
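
Using the scanner as a context manager ties the lifetime of the read cache to the scan. A minimal sketch of that pattern, assuming a hypothetical cache object with a close() method; this is not the actual ParallelFileScanner implementation:

    class ScannerSketch:
        """Illustrative stand-in: owns a cache handle, releases it on exit."""

        def __init__(self, read_cache):
            self.read_cache = read_cache

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc, tb):
            # Release the cache even if scanning raised.
            close = getattr(self.read_cache, "close", None)
            if callable(close):
                close()
            return False  # propagate any exception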
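
The combine-and-tag step relies only on standard Ray Data calls (union, map, limit, take). A self-contained sketch of that pattern, with hashlib standing in for graphgen's compute_mm_hash:

    import hashlib

    import ray

    ray.init(ignore_reinit_error=True)

    # Stand-ins for the per-suffix reader outputs.
    ds_a = ray.data.from_items([{"text": "alpha"}, {"text": "beta"}])
    ds_b = ray.data.from_items([{"text": "gamma"}])

    # Union the datasets, then tag each record with a content hash,
    # mirroring the _trace_id field added in the diff above.
    combined = ds_a.union(ds_b).map(
        lambda record: {
            **record,
            "_trace_id": "doc-"
            + hashlib.sha256(record["text"].encode()).hexdigest()[:16],
        }
    )

    print(combined.take(1))  # peek at one record, like the sample-record debug log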