Commit 492c89f

feat: add read_storage
1 parent 2166fc2 commit 492c89f

9 files changed

Lines changed: 30 additions & 38 deletions


graphgen/models/reader/csv_reader.py

Lines changed: 1 addition & 1 deletion

@@ -22,7 +22,7 @@ def read(self, input_path: Union[str, List[str]]) -> Dataset:
         :return: Ray Dataset containing validated and filtered data.
         """

-        ds = ray.data.read_csv(input_path)
+        ds = ray.data.read_csv(input_path, include_paths=True)
         ds = ds.map_batches(self._validate_batch, batch_format="pandas")
         ds = ds.filter(self._should_keep_item)
         return ds
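
The include_paths=True flag is what threads file provenance through every reader touched by this commit. A minimal sketch of its effect, with a hypothetical file name: Ray Data adds a "path" column carrying each row's source file, next to the CSV's own columns.

    import ray

    ds = ray.data.read_csv("docs/sample.csv", include_paths=True)
    # Each row now carries a "path" column in addition to the CSV columns,
    # e.g. {'text': '...', 'path': 'docs/sample.csv'}
    print(ds.take(1))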

graphgen/models/reader/json_reader.py

Lines changed: 4 additions & 1 deletion

@@ -34,10 +34,13 @@ def read(self, input_path: Union[str, List[str]]) -> ray.data.Dataset:
                 with open(file, "r", encoding="utf-8") as f:
                     data = json.load(f)
                 data = self._unify_schema(data)
+                # add path
+                for item in data:
+                    item["path"] = file
                 file_ds: ray.data.Dataset = ray.data.from_items(data)
                 ds = ds.union(file_ds)  # type: ignore
         else:
-            ds = ray.data.read_json(input_path)
+            ds = ray.data.read_json(input_path, include_paths=True)
         ds = ds.map_batches(self._validate_batch, batch_format="pandas")
         ds = ds.filter(self._should_keep_item)
         return ds
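
ray.data.from_items builds a dataset from records already in memory, so there is no include_paths equivalent on that branch; the added loop attaches the path by hand. A small sketch of the same pattern (records and file name hypothetical):

    import ray

    data = [{"text": "hello"}, {"text": "world"}]
    for item in data:
        item["path"] = "docs/sample.json"  # attach provenance manually
    ds = ray.data.from_items(data)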

graphgen/models/reader/parquet_reader.py

Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@ def read(self, input_path: Union[str, List[str]]) -> Dataset:
         if not ray.is_initialized():
             ray.init()

-        ds = ray.data.read_parquet(input_path)
+        ds = ray.data.read_parquet(input_path, include_paths=True)
         ds = ds.map_batches(self._validate_batch, batch_format="pandas")
         ds = ds.filter(self._should_keep_item)
         return ds

graphgen/models/reader/rdf_reader.py

Lines changed: 1 addition & 1 deletion

@@ -118,7 +118,7 @@ def _parse_rdf_file(self, file_path: Path) -> List[Dict[str, Any]]:
                 "id": str(subj),
                 self.text_column: text,
                 "properties": props,
-                "source_file": str(file_path),
+                "path": str(file_path),
             }
             docs.append(doc)

graphgen/models/reader/txt_reader.py

Lines changed: 2 additions & 1 deletion

@@ -18,13 +18,14 @@ def read(
         """
         docs_ds = ray.data.read_binary_files(
             input_path,
-            include_paths=False,
+            include_paths=True,
         )

         docs_ds = docs_ds.map(
             lambda row: {
                 "type": "text",
                 self.text_column: row["bytes"].decode("utf-8"),
+                "path": row["path"],
             }
         )

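
read_binary_files yields rows with a "bytes" column, plus a "path" column once include_paths=True is set; the map above decodes the bytes and forwards the path. A quick check (file name hypothetical):

    import ray

    ds = ray.data.read_binary_files("docs/sample.txt", include_paths=True)
    row = ds.take(1)[0]
    print(row["path"], row["bytes"][:20])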

graphgen/operators/read/parallel_file_scanner.py

Lines changed: 3 additions & 3 deletions

@@ -11,12 +11,12 @@
 class ParallelFileScanner:
     def __init__(
         self,
-        read_cache: BaseKVStorage,
+        input_path_cache: BaseKVStorage,
         allowed_suffix: Optional[List[str]] = None,
         rescan: bool = False,
         max_workers: int = 4,
     ):
-        self.cache = read_cache
+        self.cache = input_path_cache
         self.allowed_suffix = set(allowed_suffix) if allowed_suffix else set()
         self.rescan = rescan
         self.max_workers = max_workers

@@ -61,7 +61,7 @@ def _scan_files(

         # cache check
         cache_key = compute_content_hash(
-            f"scan::{path_str}::recursive::{recursive}", prefix="read-"
+            f"scan::{path_str}::recursive::{recursive}", prefix="path-"
         )
         cached = self.cache.get_by_id(cache_key)
         if cached and not self.rescan:
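
The prefix rename only moves scan results into their own key namespace; the key itself is still an md5 of the scan descriptor. A sketch using compute_content_hash as defined in graphgen/utils/hash.py (the path is hypothetical):

    from hashlib import md5

    def compute_content_hash(content, prefix: str = ""):
        return prefix + md5(content.encode()).hexdigest()

    key = compute_content_hash("scan::/data/docs::recursive::True", prefix="path-")
    print(key)  # 'path-' followed by the 32-hex-digit md5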

graphgen/operators/read/read.py

Lines changed: 17 additions & 10 deletions

@@ -13,7 +13,7 @@
     RDFReader,
     TXTReader,
 )
-from graphgen.utils import compute_mm_hash, logger
+from graphgen.utils import compute_dict_hash, logger

 from .parallel_file_scanner import ParallelFileScanner

@@ -71,15 +71,17 @@ def read(
     :param reader_kwargs: Additional kwargs passed to readers
     :return: Ray Dataset containing all documents
     """
-
-    read_cache = init_storage(
+    input_path_cache = init_storage(
+        backend=kv_backend, working_dir=working_dir, namespace="input_path"
+    )
+    read_storage = init_storage(
         backend=kv_backend, working_dir=working_dir, namespace="read"
     )
     try:
         # 1. Scan all paths to discover files
         logger.info("[READ] Scanning paths: %s", input_path)
         with ParallelFileScanner(
-            read_cache=read_cache,
+            input_path_cache=input_path_cache,
             allowed_suffix=allowed_suffix,
             rescan=False,
             max_workers=parallelism if parallelism > 0 else 1,

@@ -124,12 +126,17 @@ def read(
     if read_nums is not None:
         combined_ds = combined_ds.limit(read_nums)

-    combined_ds = combined_ds.map(
-        lambda record: {
-            **record,
-            "_trace_id": compute_mm_hash(record, prefix="doc-"),
-        }
-    )
+    def add_trace_id(batch):
+        batch["_trace_id"] = batch.apply(
+            lambda row: compute_dict_hash(row, prefix="read-"), axis=1
+        )
+        records = batch.to_dict(orient="records")
+        data_to_upsert = {record["_trace_id"]: record for record in records}
+        read_storage.upsert(data_to_upsert)
+        read_storage.index_done_callback()
+        return batch
+
+    combined_ds = combined_ds.map_batches(add_trace_id, batch_format="pandas")

     # sample record
     for i, item in enumerate(combined_ds.take(1)):
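
This hunk is the heart of the commit: instead of only stamping each record with a trace id, the batch-level hook also upserts every record into the new read storage, keyed by that id. A runnable sketch of the flow, with a plain dict standing in for the BaseKVStorage backend (the dict and sample record are stand-ins, not the project's API; in the real code read_storage is a shared KV backend, so writes made inside Ray workers actually persist):

    import pandas as pd
    import ray

    from graphgen.utils import compute_dict_hash  # import added by this commit

    read_storage = {}  # stand-in for init_storage(..., namespace="read")

    def add_trace_id(batch: pd.DataFrame) -> pd.DataFrame:
        batch["_trace_id"] = batch.apply(
            lambda row: compute_dict_hash(row.to_dict(), prefix="read-"), axis=1
        )
        # persist each record under its trace id so later stages can
        # look documents up by hash
        read_storage.update(
            {rec["_trace_id"]: rec for rec in batch.to_dict(orient="records")}
        )
        return batch

    ds = ray.data.from_items([{"type": "text", "text": "hi", "path": "a.txt"}])
    ds = ds.map_batches(add_trace_id, batch_format="pandas")
    print(ds.take(1)[0]["_trace_id"])  # 'read-...'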

graphgen/utils/__init__.py

Lines changed: 1 addition & 6 deletions

@@ -9,12 +9,7 @@
     split_string_by_multi_markers,
     write_json,
 )
-from .hash import (
-    compute_args_hash,
-    compute_content_hash,
-    compute_dict_hash,
-    compute_mm_hash,
-)
+from .hash import compute_args_hash, compute_content_hash, compute_dict_hash
 from .help_nltk import NLTKHelper
 from .log import CURRENT_LOGGER_VAR, logger, set_logger
 from .loop import create_event_loop

graphgen/utils/hash.py

Lines changed: 0 additions & 14 deletions

@@ -9,20 +9,6 @@ def compute_content_hash(content, prefix: str = ""):
     return prefix + md5(content.encode()).hexdigest()


-def compute_mm_hash(item, prefix: str = ""):
-    if item.get("type") == "text" and item.get("text"):
-        content = item["text"].strip()
-    elif item.get("type") == "image" and item.get("img_path"):
-        content = f"image:{item['img_path']}"
-    elif item.get("type") == "table" and item.get("table_body"):
-        content = f"table:{item['table_body']}"
-    elif item.get("type") == "equation" and item.get("text"):
-        content = f"equation:{item['text']}"
-    else:
-        content = str(item)
-    return prefix + md5(content.encode()).hexdigest()
-
-
 def compute_dict_hash(d: dict, prefix: str = ""):
     items = tuple(sorted(d.items()))
     return prefix + md5(str(items).encode()).hexdigest()
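
compute_dict_hash takes over from the removed compute_mm_hash as the trace-id function (see the read.py hunk above). It is key-order independent, which is what makes the id stable for a given record:

    from hashlib import md5

    def compute_dict_hash(d: dict, prefix: str = ""):
        items = tuple(sorted(d.items()))
        return prefix + md5(str(items).encode()).hexdigest()

    a = compute_dict_hash({"text": "hi", "path": "a.txt"}, prefix="read-")
    b = compute_dict_hash({"path": "a.txt", "text": "hi"}, prefix="read-")
    assert a == b  # same record, same id, regardless of key order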
