# Path to the CSV of already-processed papers; filter_tex_files reads its
# "path" column to skip papers whose directory is listed there.
database = "data/processed_paper_database.csv"
1717
1818
def filter_tex_files(tex_files: List[str], main_path: str) -> List[str]:
    """Select the main (compilable) .tex files that still need processing.

    A file is kept only if all of the following hold:

    * its path does not contain ``paper_block_`` and its basename is not one
      of the redundant names listed below;
    * it sits exactly one directory below ``main_path`` (not recursive), e.g.
      ``<main_path>/<paper>/main.tex``;
    * it can be decoded as text and contains ``\\begin{document}``;
    * its directory is not listed in the ``path`` column of the processed
      paper database (when that file exists).

    Args:
        tex_files (List[str]): list of candidate tex file paths.
        main_path (str): path to the main directory. A falsy value disables
            the directory-depth check.

    Returns:
        List[str]: list of tex files that are compilable and not yet
            processed, in their original order.
    """
    # TODO: move this to config
    redundant_tex_files = {
        "paper_colored.tex",
        "paper_white.tex",
        "paper_original.tex",
    }

    # Normalise once so a trailing separator or "./" prefix on main_path does
    # not make the comparison below reject every file.
    expected_parent = os.path.normpath(main_path) if main_path else None

    result = []
    for tex_file in tex_files:
        if "paper_block_" in tex_file:
            continue

        if os.path.basename(tex_file) in redundant_tex_files:
            continue

        # ensure tex files inside a nested subfolder are not included
        # ex: cs.AI/1234.4567/figs/draw.tex will be excluded
        if expected_parent is not None:
            grandparent = os.path.dirname(os.path.dirname(tex_file))
            # An empty grandparent never matched a non-empty main_path
            # before, so keep rejecting it rather than normalising it to ".".
            if not grandparent or os.path.normpath(grandparent) != expected_parent:
                continue

        # make sure the tex file is compilable (main document)
        # NOTE(review): the file is opened with the locale's default
        # encoding, so which files decode may differ between platforms.
        try:
            with open(tex_file) as f:
                content = f.read()
        except UnicodeDecodeError:
            log.debug(f"failed to read tex file: {tex_file} due to UnicodeDecodeError")
            continue

        if "\\begin{document}" in content:
            result.append(tex_file)

    # skip processed papers
    log.info(f"[VRDU] Before filtering, found {len(result)} tex files")
    if os.path.exists(database):
        df = pd.read_csv(database)
        processed_papers = set(df["path"])
        result = [x for x in result if os.path.dirname(x) not in processed_papers]

    log.info(f"[VRDU] After filtering, found {len(result)} tex files")
    return result
5371
5472
5573def process_one_discpline (path : str , cpu_count : int , discpline : str ) -> None :
74+ """Process the data in a specific discpline.
75+
76+ Args:
77+ path (str): The path to the raw data.
78+ cpu_count (int): The number of CPUs to use for multiprocessing.
79+ discpline (str): The discpline to process.
80+
81+ Raises:
82+ Exception: If the processing fails.
83+
84+ Returns:
85+ None
86+ """
5687 discpline_path = os .path .join (path , discpline )
57- log .info (f"path to raw data: { discpline_path } " )
58- log .info (f"Using cpu counts: { cpu_count } " )
88+ log .info (f"[VRDU] Path to raw data: { discpline_path } " )
89+ log .info (f"[VRDU] Using cpu counts: { cpu_count } " )
5990 tex_files = utils .extract_all_tex_files (discpline_path )
6091 tex_files = filter_tex_files (tex_files , discpline_path )
61- log .info (f"Found { len (tex_files )} tex files" )
6292
6393 try :
6494 with multiprocessing .Pool (cpu_count ) as pool :
6595 pool .map (process_one_file , tex_files )
66- # save log file
6796 except Exception :
6897 log .exception (f"[VRDU] discpline: { discpline } , failed to process." )
6998 finally :
@@ -73,6 +102,23 @@ def process_one_discpline(path: str, cpu_count: int, discpline: str) -> None:
73102
74103
75104def main ():
105+ """This function is the entry point of the application.
106+
107+ Args:
108+ path (str): The path to the raw data.
109+ cpu_count (int): The number of CPUs to use for multiprocessing.
110+ discpline (str): The discpline to process.
111+
112+ Raises:
113+ Exception: If the processing fails.
114+
115+ Returns:
116+ None
117+
118+ References:
119+ https://arxiv.org/category_taxonomy
120+ """
121+ # parse arguments
76122 parser = argparse .ArgumentParser ()
77123 parser .add_argument (
78124 "-p" , "--path" , type = str , required = True , help = "path to raw data"
0 commit comments