|
13 | 13 | RDFReader, |
14 | 14 | TXTReader, |
15 | 15 | ) |
16 | | -from graphgen.utils import compute_mm_hash, logger |
| 16 | +from graphgen.utils import compute_dict_hash, logger |
17 | 17 |
|
18 | 18 | from .parallel_file_scanner import ParallelFileScanner |
19 | 19 |
|
@@ -71,15 +71,17 @@ def read( |
71 | 71 | :param reader_kwargs: Additional kwargs passed to readers |
72 | 72 | :return: Ray Dataset containing all documents |
73 | 73 | """ |
74 | | - |
75 | | - read_cache = init_storage( |
| 74 | + input_path_cache = init_storage( |
| 75 | + backend=kv_backend, working_dir=working_dir, namespace="input_path" |
| 76 | + ) |
| 77 | + read_storage = init_storage( |
76 | 78 | backend=kv_backend, working_dir=working_dir, namespace="read" |
77 | 79 | ) |
78 | 80 | try: |
79 | 81 | # 1. Scan all paths to discover files |
80 | 82 | logger.info("[READ] Scanning paths: %s", input_path) |
81 | 83 | with ParallelFileScanner( |
82 | | - read_cache=read_cache, |
| 84 | + input_path_cache=input_path_cache, |
83 | 85 | allowed_suffix=allowed_suffix, |
84 | 86 | rescan=False, |
85 | 87 | max_workers=parallelism if parallelism > 0 else 1, |
@@ -124,12 +126,17 @@ def read( |
124 | 126 | if read_nums is not None: |
125 | 127 | combined_ds = combined_ds.limit(read_nums) |
126 | 128 |
|
127 | | - combined_ds = combined_ds.map( |
128 | | - lambda record: { |
129 | | - **record, |
130 | | - "_trace_id": compute_mm_hash(record, prefix="doc-"), |
131 | | - } |
132 | | - ) |
| 129 | + def add_trace_id(batch): |
| 130 | + batch["_trace_id"] = batch.apply( |
| 131 | + lambda row: compute_dict_hash(row, prefix="read-"), axis=1 |
| 132 | + ) |
| 133 | + records = batch.to_dict(orient="records") |
| 134 | + data_to_upsert = {record["_trace_id"]: record for record in records} |
| 135 | + read_storage.upsert(data_to_upsert) |
| 136 | + read_storage.index_done_callback() |
| 137 | + return batch |
| 138 | + |
| 139 | + combined_ds = combined_ds.map_batches(add_trace_id, batch_format="pandas") |
133 | 140 |
|
134 | 141 | # sample record |
135 | 142 | for i, item in enumerate(combined_ds.take(1)): |
|
0 commit comments