Skip to content

Commit d69795a

Browse files
authored
Merge pull request #1 from UniModal4Reasoning/dev
Training dataset version
2 parents f4f4e62 + 35b500e commit d69795a

23 files changed

Lines changed: 1100 additions & 1369 deletions

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,3 +181,6 @@ TexSoup/tests/
181181

182182
# jupyter notebook
183183
*.ipynb
184+
185+
data/
186+
data/discpline_info.csv

batch_process.py

Lines changed: 59 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,54 +16,83 @@
1616
database = "data/processed_paper_database.csv"
1717

1818

19-
def filter_tex_files(tex_files: List[str], main_path: str = None) -> List[str]:
20-
"""extract all MAIN.tex files for processing, if main_path is not None, then
21-
only extract MAIN.tex files in the main_path (not recursive)
19+
def filter_tex_files(tex_files: List[str], main_path: str) -> List[str]:
20+
"""extract all MAIN.tex files for processing,
21+
only MAIN.tex files in the main_path (not recursive) are extracted
2222
2323
Args:
2424
tex_files (List[str]): list of tex files
25-
main_path (str, optional): path to main directory. Defaults to None.
25+
main_path (str): path to main directory.
2626
2727
Returns:
2828
List[str]: list of tex files that are compilable.
2929
"""
30+
31+
# TODO: move this to config
32+
redundant_tex_files = [
33+
"paper_colored.tex",
34+
"paper_white.tex",
35+
"paper_original.tex",
36+
]
37+
3038
result = []
3139
for tex_file in tex_files:
40+
if "paper_block_" in tex_file:
41+
continue
42+
43+
if os.path.basename(tex_file) in redundant_tex_files:
44+
continue
45+
46+
# ensure that tex files inside a subfolder are not included
47+
# ex: cs.AI/1234.4567/figs/draw.tex will be excluded
3248
if main_path and os.path.dirname(os.path.dirname(tex_file)) != main_path:
3349
continue
34-
# prevent processing previous generated files
50+
51+
# make sure the tex file is compilable (main document)
3552
try:
3653
with open(tex_file) as f:
3754
content = f.read()
3855
if "\\begin{document}" not in content:
3956
continue
4057
result.append(tex_file)
4158
except UnicodeDecodeError:
42-
log.debug(f"failed to read tex file: {tex_file}")
59+
log.debug(f"failed to read tex file: {tex_file} due to UnicodeDecodeError")
4360
continue
4461

45-
log.info(f"Before filtering, Found {len(result)} tex files")
62+
# skip processed papers
63+
log.info(f"[VRDU] Before filtering, found {len(result)} tex files")
4664
if os.path.exists(database):
4765
df = pd.read_csv(database)
48-
processed_papers = set(df[df["status"] != "processing"]["path"])
66+
processed_papers = set(df["path"])
4967
result = [x for x in result if os.path.dirname(x) not in processed_papers]
5068

51-
log.info(f"After filtering, Found {len(result)} tex files")
69+
log.info(f"[VRDU] After filtering, found {len(result)} tex files")
5270
return result
5371

5472

5573
def process_one_discpline(path: str, cpu_count: int, discpline: str) -> None:
74+
"""Process the data in a specific discipline.
75+
76+
Args:
77+
path (str): The path to the raw data.
78+
cpu_count (int): The number of CPUs to use for multiprocessing.
79+
discpline (str): The discipline to process.
80+
81+
Raises:
82+
Exception: If the processing fails.
83+
84+
Returns:
85+
None
86+
"""
5687
discpline_path = os.path.join(path, discpline)
57-
log.info(f"path to raw data: {discpline_path}")
58-
log.info(f"Using cpu counts: {cpu_count}")
88+
log.info(f"[VRDU] Path to raw data: {discpline_path}")
89+
log.info(f"[VRDU] Using cpu counts: {cpu_count}")
5990
tex_files = utils.extract_all_tex_files(discpline_path)
6091
tex_files = filter_tex_files(tex_files, discpline_path)
61-
log.info(f"Found {len(tex_files)} tex files")
6292

6393
try:
6494
with multiprocessing.Pool(cpu_count) as pool:
6595
pool.map(process_one_file, tex_files)
66-
# save log file
6796
except Exception:
6897
log.exception(f"[VRDU] discpline: {discpline}, failed to process.")
6998
finally:
@@ -73,6 +102,23 @@ def process_one_discpline(path: str, cpu_count: int, discpline: str) -> None:
73102

74103

75104
def main():
105+
"""This function is the entry point of the application.
106+
107+
Command-line arguments (parsed with argparse; main() itself takes no parameters):
108+
path (str): The path to the raw data.
109+
cpu_count (int): The number of CPUs to use for multiprocessing.
110+
discpline (str): The discipline to process.
111+
112+
Raises:
113+
Exception: If the processing fails.
114+
115+
Returns:
116+
None
117+
118+
References:
119+
https://arxiv.org/category_taxonomy
120+
"""
121+
# parse arguments
76122
parser = argparse.ArgumentParser()
77123
parser.add_argument(
78124
"-p", "--path", type=str, required=True, help="path to raw data"

0 commit comments

Comments
 (0)