
import ray

+from graphgen.common import init_storage
from graphgen.models import (
    CSVReader,
    JSONReader,
@@ -51,6 +52,7 @@ def read(
    input_path: Union[str, List[str]],
    allowed_suffix: Optional[List[str]] = None,
    working_dir: Optional[str] = "cache",
+    kv_backend: str = "rocksdb",
    parallelism: int = 4,
    recursive: bool = True,
    read_nums: Optional[int] = None,
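
A quick usage sketch for the new kv_backend parameter; only the signature above is confirmed by this diff, and the import path is a hypothetical placeholder:

    # Hypothetical import path for the read entrypoint shown in this diff.
    from graphgen.read import read

    ds = read(
        input_path="./docs",
        allowed_suffix=["pdf", "txt"],
        working_dir="cache",
        kv_backend="rocksdb",  # backend for the key-value read cache
        parallelism=4,
    )
    print(ds.count())
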
@@ -62,71 +64,79 @@ def read(
    :param input_path: File or directory path(s) to read from
    :param allowed_suffix: List of allowed file suffixes (e.g., ['pdf', 'txt'])
    :param working_dir: Directory to cache intermediate files (PDF processing)
+    :param kv_backend: Backend for the key-value read cache (e.g., "rocksdb")
    :param parallelism: Number of parallel workers
    :param recursive: Whether to scan directories recursively
    :param read_nums: Limit the number of documents to read
    :param reader_kwargs: Additional kwargs passed to readers
    :return: Ray Dataset containing all documents
    """
+
+    read_cache = init_storage(
+        backend=kv_backend, working_dir=working_dir, namespace="read"
+    )
    try:
        # 1. Scan all paths to discover files
        logger.info("[READ] Scanning paths: %s", input_path)
-        scanner = ParallelFileScanner(
-            cache_dir=working_dir,
+        with ParallelFileScanner(
+            read_cache=read_cache,
            allowed_suffix=allowed_suffix,
            rescan=False,
            max_workers=parallelism if parallelism > 0 else 1,
-        )
-
-        all_files = []
-        scan_results = scanner.scan(input_path, recursive=recursive)
-
-        for result in scan_results.values():
-            all_files.extend(result.get("files", []))
-
-        logger.info("[READ] Found %d files to process", len(all_files))
-
-        if not all_files:
-            raise ValueError("No files found to read.")
-
-        # 2. Group files by suffix to use appropriate reader
-        files_by_suffix = {}
-        for file_info in all_files:
-            suffix = Path(file_info["path"]).suffix.lower().lstrip(".")
-            if allowed_suffix and suffix not in [
-                s.lower().lstrip(".") for s in allowed_suffix
-            ]:
-                continue
-            files_by_suffix.setdefault(suffix, []).append(file_info["path"])
-
-        # 3. Create read tasks
-        read_tasks = []
-        for suffix, file_paths in files_by_suffix.items():
-            reader = _build_reader(suffix, working_dir, **reader_kwargs)
-            ds = reader.read(file_paths)
-            read_tasks.append(ds)
-
-        # 4. Combine all datasets
-        if not read_tasks:
-            raise ValueError("No datasets created from the provided files.")
-
-        if len(read_tasks) == 1:
-            combined_ds = read_tasks[0]
-        else:
-            combined_ds = read_tasks[0].union(*read_tasks[1:])
-
-        combined_ds = combined_ds.map(
-            lambda record: {
-                **record,
-                "_doc_id": compute_mm_hash(record, prefix="doc-"),
-            }
-        )
-
-        if read_nums is not None:
-            combined_ds = combined_ds.limit(read_nums)
-
-        logger.info("[READ] Successfully read files from %s", input_path)
-        return combined_ds
+        ) as scanner:
+            all_files = []
+            scan_results = scanner.scan(input_path, recursive=recursive)
+
+            for result in scan_results.values():
+                all_files.extend(result.get("files", []))
+
+            logger.info("[READ] Found %d files to process", len(all_files))
+
+            if not all_files:
+                raise ValueError("No files found to read.")
+
+            # 2. Group files by suffix so each group gets the appropriate reader
+            files_by_suffix = {}
+            for file_info in all_files:
+                suffix = Path(file_info["path"]).suffix.lower().lstrip(".")
+                if allowed_suffix and suffix not in [
+                    s.lower().lstrip(".") for s in allowed_suffix
+                ]:
+                    continue
+                files_by_suffix.setdefault(suffix, []).append(file_info["path"])
+
+            # 3. Create one read task per suffix group
+            read_tasks = []
+            for suffix, file_paths in files_by_suffix.items():
+                reader = _build_reader(suffix, working_dir, **reader_kwargs)
+                ds = reader.read(file_paths)
+                read_tasks.append(ds)
+
+            # 4. Combine all datasets
+            if not read_tasks:
+                raise ValueError("No datasets created from the provided files.")
+
+            if len(read_tasks) == 1:
+                combined_ds = read_tasks[0]
+            else:
+                combined_ds = read_tasks[0].union(*read_tasks[1:])
+
+            combined_ds = combined_ds.map(
+                lambda record: {
+                    **record,
+                    "_trace_id": compute_mm_hash(record, prefix="doc-"),
+                }
+            )
+
+            if read_nums is not None:
+                combined_ds = combined_ds.limit(read_nums)
+
+            # Log one sample record for debugging
+            for i, item in enumerate(combined_ds.take(1)):
+                logger.debug("[READ] Sample record %d: %s", i, item)
+
+            logger.info("[READ] Successfully read files from %s", input_path)
+            return combined_ds

    except Exception as e:
        logger.error("[READ] Failed to read files from %s: %s", input_path, e)
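
Using the scanner as a context manager ties the lifetime of the read cache to the scan. A minimal sketch of that pattern, assuming a hypothetical cache object with a close() method; this is not the actual ParallelFileScanner implementation:

    class ScannerSketch:
        """Illustrative stand-in: owns a cache handle, releases it on exit."""

        def __init__(self, read_cache):
            self.read_cache = read_cache

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc, tb):
            # Release the cache even if scanning raised.
            close = getattr(self.read_cache, "close", None)
            if callable(close):
                close()
            return False  # propagate any exception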
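
The combine-and-tag step relies only on standard Ray Data calls (union, map, limit, take). A self-contained sketch of that pattern, with hashlib standing in for graphgen's compute_mm_hash:

    import hashlib

    import ray

    ray.init(ignore_reinit_error=True)

    # Stand-ins for the per-suffix reader outputs.
    ds_a = ray.data.from_items([{"text": "alpha"}, {"text": "beta"}])
    ds_b = ray.data.from_items([{"text": "gamma"}])

    # Union the datasets, then tag each record with a content hash,
    # mirroring the _trace_id field added in the diff above.
    combined = ds_a.union(ds_b).map(
        lambda record: {
            **record,
            "_trace_id": "doc-"
            + hashlib.sha256(record["text"].encode()).hexdigest()[:16],
        }
    )

    print(combined.take(1))  # peek at one record, like the sample-record debug log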