11import os
2- import shutil
32import argparse
43import multiprocessing
5- from typing import List
4+ import shutil
5+ from typing import List , Optional
66from uuid import uuid4
77import pandas as pd
88
1616database = "data/processed_paper_database.csv"
1717
1818
19- def filter_tex_files (tex_files : List [str ], main_path : str = None ) -> List [str ]:
19+ def filter_tex_files (
20+ tex_files : List [str ], main_path : Optional [str ] = None
21+ ) -> List [str ]:
2022 """extract all MAIN.tex files for processing, if main_path is not None, then
2123 only extract MAIN.tex files in the main_path (not recursive)
2224
@@ -25,7 +27,7 @@ def filter_tex_files(tex_files: List[str], main_path: str = None) -> List[str]:
2527 main_path (str, optional): path to main directory. Defaults to None.
2628
2729 Returns:
28- List[str]: list of tex files that are compiable .
30+ List[str]: list of tex files that are compilable .
2931 """
3032
3133 # TODO: move this to config
@@ -48,7 +50,7 @@ def filter_tex_files(tex_files: List[str], main_path: str = None) -> List[str]:
4850 if main_path and os .path .dirname (os .path .dirname (tex_file )) != main_path :
4951 continue
5052
51- # make sure the tex file is compiable (main document)
53+ # make sure the tex file is compilable (main document)
5254 try :
5355 with open (tex_file ) as f :
5456 content = f .read ()
@@ -70,35 +72,35 @@ def filter_tex_files(tex_files: List[str], main_path: str = None) -> List[str]:
7072 return result
7173
7274
73- def process_one_discpline (path : str , cpu_count : int , discpline : str ) -> None :
74- """Process the data in a specific discpline .
75+ def process_one_discipline (path : str , cpu_count : int , discipline : str ) -> None :
76+ """Process the data in a specific discipline .
7577
7678 Args:
7779 path (str): The path to the raw data.
7880 cpu_count (int): The number of CPUs to use for multiprocessing.
79- discpline (str): The discpline to process.
81+ discipline (str): The discipline to process.
8082
8183 Raises:
8284 Exception: If the processing fails.
8385
8486 Returns:
8587 None
8688 """
87- discpline_path = os .path .join (path , discpline )
88- log .info (f"[VRDU] Path to raw data: { discpline_path } " )
89+ discipline_path = os .path .join (path , discipline )
90+ log .info (f"[VRDU] Path to raw data: { discipline_path } " )
8991 log .info (f"[VRDU] Using cpu counts: { cpu_count } " )
90- tex_files = utils .extract_all_tex_files (discpline_path )
91- tex_files = filter_tex_files (tex_files , discpline_path )
92+ tex_files = utils .extract_all_tex_files (discipline_path )
93+ tex_files = filter_tex_files (tex_files , discipline_path )
9294
9395 try :
9496 with multiprocessing .Pool (cpu_count ) as pool :
9597 pool .map (process_one_file , tex_files )
9698 except Exception :
97- log .exception (f"[VRDU] discpline : { discpline } , failed to process." )
99+ log .exception (f"[VRDU] discipline : { discipline } , failed to process." )
98100 finally :
99101 # save the process log
100- log .info (f"[VRDU] discpline : { discpline } , finished processing." )
101- shutil .move (log_file , f"data/batch_process_{ discpline } .log" )
102+ log .info (f"[VRDU] discipline : { discipline } , finished processing." )
103+ shutil .move (log_file , f"data/batch_process_{ discipline } .log" )
102104
103105
104106def main ():
@@ -107,7 +109,7 @@ def main():
107109 Args:
108110 path (str): The path to the raw data.
109111 cpu_count (int): The number of CPUs to use for multiprocessing.
110- discpline (str): The discpline to process.
112+ discipline (str): The discipline to process.
111113
112114 Raises:
113115 Exception: If the processing fails.
@@ -131,13 +133,13 @@ def main():
131133 help = "cpu count for multiprocessing" ,
132134 )
133135 parser .add_argument (
134- "-t" , "--discpline " , type = str , required = True , help = "discpline to process"
136+ "-t" , "--discipline " , type = str , required = True , help = "discipline to process"
135137 )
136138 args = parser .parse_args ()
137- path , cpu_count , discpline = args .path , args .cpu_count , args .discpline
139+ path , cpu_count , discipline = args .path , args .cpu_count , args .discipline
138140
139- log .info (f"[VRDU] discpline : { discpline } , start to process." )
140- process_one_discpline (path , cpu_count , discpline )
141+ log .info (f"[VRDU] discipline : { discipline } , start to process." )
142+ process_one_discipline (path , cpu_count , discipline )
141143
142144
143145if __name__ == "__main__" :
0 commit comments