
Commit 14e1696

Merge pull request #2 from UniModal4Reasoning/dev
Test dataset version
2 parents: d69795a + 5790c5c · commit: 14e1696

15 files changed: 768 additions & 234 deletions

batch_process.py

Lines changed: 24 additions & 22 deletions
@@ -1,8 +1,8 @@
 import os
-import shutil
 import argparse
 import multiprocessing
-from typing import List
+import shutil
+from typing import List, Optional
 from uuid import uuid4
 import pandas as pd

@@ -16,16 +16,18 @@
 database = "data/processed_paper_database.csv"


-def filter_tex_files(tex_files: List[str], main_path: str) -> List[str]:
-    """extract all MAIN.tex files for processing,
-    only MAIN.tex files in the main_path (not recursive) are extracted
+def filter_tex_files(
+    tex_files: List[str], main_path: Optional[str] = None
+) -> List[str]:
+    """extract all MAIN.tex files for processing, if main_path is not None, then
+    only extract MAIN.tex files in the main_path (not recursive)

     Args:
         tex_files (List[str]): list of tex files
         main_path (str): path to main directory.

     Returns:
-        List[str]: list of tex files that are compiable.
+        List[str]: list of tex files that are compilable.
     """

     # TODO: move this to config
@@ -48,7 +50,7 @@ def filter_tex_files(tex_files: List[str], main_path: str) -> List[str]:
         if main_path and os.path.dirname(os.path.dirname(tex_file)) != main_path:
             continue

-        # make sure the tex file is compiable (main document)
+        # make sure the tex file is compilable (main document)
         try:
             with open(tex_file) as f:
                 content = f.read()
@@ -70,35 +72,35 @@ def filter_tex_files(tex_files: List[str], main_path: str) -> List[str]:
     return result


-def process_one_discpline(path: str, cpu_count: int, discpline: str) -> None:
-    """Process the data in a specific discpline.
+def process_one_discipline(path: str, cpu_count: int, discipline: str) -> None:
+    """Process the data in a specific discipline.

     Args:
         path (str): The path to the raw data.
         cpu_count (int): The number of CPUs to use for multiprocessing.
-        discpline (str): The discpline to process.
+        discipline (str): The discipline to process.

     Raises:
         Exception: If the processing fails.

     Returns:
         None
     """
-    discpline_path = os.path.join(path, discpline)
-    log.info(f"[VRDU] Path to raw data: {discpline_path}")
+    discipline_path = os.path.join(path, discipline)
+    log.info(f"[VRDU] Path to raw data: {discipline_path}")
     log.info(f"[VRDU] Using cpu counts: {cpu_count}")
-    tex_files = utils.extract_all_tex_files(discpline_path)
-    tex_files = filter_tex_files(tex_files, discpline_path)
+    tex_files = utils.extract_all_tex_files(discipline_path)
+    tex_files = filter_tex_files(tex_files, discipline_path)

     try:
         with multiprocessing.Pool(cpu_count) as pool:
             pool.map(process_one_file, tex_files)
     except Exception:
-        log.exception(f"[VRDU] discpline: {discpline}, failed to process.")
+        log.exception(f"[VRDU] discipline: {discipline}, failed to process.")
     finally:
         # save the process log
-        log.info(f"[VRDU] discpline: {discpline}, finished processing.")
-        shutil.move(log_file, f"data/batch_process_{discpline}.log")
+        log.info(f"[VRDU] discipline: {discipline}, finished processing.")
+        shutil.move(log_file, f"data/batch_process_{discipline}.log")


 def main():
@@ -107,7 +109,7 @@ def main():
     Args:
         path (str): The path to the raw data.
         cpu_count (int): The number of CPUs to use for multiprocessing.
-        discpline (str): The discpline to process.
+        discipline (str): The discipline to process.

     Raises:
         Exception: If the processing fails.
@@ -131,13 +133,13 @@ def main():
         help="cpu count for multiprocessing",
     )
     parser.add_argument(
-        "-t", "--discpline", type=str, required=True, help="discpline to process"
+        "-t", "--discipline", type=str, required=True, help="discipline to process"
     )
     args = parser.parse_args()
-    path, cpu_count, discpline = args.path, args.cpu_count, args.discpline
+    path, cpu_count, discipline = args.path, args.cpu_count, args.discipline

-    log.info(f"[VRDU] discpline: {discpline}, start to process.")
-    process_one_discpline(path, cpu_count, discpline)
+    log.info(f"[VRDU] discipline: {discipline}, start to process.")
+    process_one_discipline(path, cpu_count, discipline)


 if __name__ == "__main__":
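
Note on the `filter_tex_files` change above: `main_path` is now optional, so the same helper can filter globally or be scoped to one discipline directory, as `process_one_discipline` does. A minimal usage sketch (the paths are hypothetical, and the compilability check inside the function is not exercised here):

    from batch_process import filter_tex_files

    # Assumed layout: data/{discipline}/{paper_id}/*.tex
    tex_files = [
        "data/cs.CV/2301.00001/main.tex",
        "data/math.AG/2301.00002/main.tex",
    ]

    all_candidates = filter_tex_files(tex_files)  # main_path defaults to None
    # main_path is compared against the grandparent directory of each tex file
    cv_only = filter_tex_files(tex_files, main_path="data/cs.CV")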

main.py

Lines changed: 9 additions & 5 deletions
@@ -94,14 +94,18 @@ def process_one_file(file_name: str) -> None:
         log.info(f"[VRDU] file: {file_name}, paper has been processed")
         return

-    # remove redundant files
+    # make a copy of the original tex file
+    original_tex = os.path.join(main_directory, "paper_original.tex")
+    shutil.copyfile(file_name, original_tex)
+
+    # remove the output folder if it exists
     output_directory = os.path.join(main_directory, "output")
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory)

-    # make a copy of the original tex file to avoid polluting the original tex file
-    original_tex = os.path.join(main_directory, "paper_original.tex")
-    shutil.copyfile(file_name, original_tex)
+    # output_directory stores the intermediate results
+    # result_directory stores the final results
+    os.makedirs(os.path.join(main_directory, "output/result"))

     cwd = os.getcwd()

@@ -128,7 +132,7 @@ def process_one_file(file_name: str) -> None:
     log.info(
         f"[VRDU] file: {original_tex}, start generating annotations, this may take a while..."
     )
-    vrdu_layout_annotation = layout.LayoutAnnotation(main_directory)
+    vrdu_layout_annotation = layout.LayoutAnnotation(original_tex)
     vrdu_layout_annotation.annotate()

     vrdu_order_annotation = order.OrderAnnotation(original_tex)
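
Two behavioral notes on these hunks: `paper_original.tex` is now snapshotted before the stale `output/` tree is deleted, and `LayoutAnnotation` now receives the tex file path rather than the directory. The `os.makedirs` call relies on makedirs creating missing parents, so the intermediate and result directories appear together; a small illustration with a hypothetical path:

    import os

    main_directory = "data/cs.CV/2301.00001"  # hypothetical paper directory
    # Creates main_directory/output and main_directory/output/result in one
    # call; output/ was removed just above, so neither level exists yet.
    os.makedirs(os.path.join(main_directory, "output/result"))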

scripts/retrive_metadata.py

Lines changed: 70 additions & 125 deletions
@@ -1,142 +1,87 @@
+import glob
 import os
-import time
+from typing import Any, Dict, List
 import arxiv
-import shutil
+import argparse

+import pandas as pd
+
+
+from vrdu import utils
 from vrdu import logger

-log = logger.setup_app_level_logger(file_name="retrive_metadata.log")
+log = logger.setup_app_level_logger(file_name="retrieve_metadata.log")


-def retrive_metadata_for_files(path: str) -> None:
-    """Retrieves metadata for files in a specified path.
-    It first query the primary category by the file's name, then move the
-    file to the path/{category}
+def retrieve_metadata(data: Dict) -> List[Dict[str, Any]]:
+    paper_ids = list(data.keys())

-    Args:
-        path (str): The path to the directory containing the files.
+    client = arxiv.Client()

-    Returns:
-        None
+    slice_length = 100
+    paper_metadata = []
+
+    for i in range(0, len(paper_ids), slice_length):
+        slices = paper_ids[i : i + slice_length]
+        search_results = client.results(arxiv.Search(id_list=slices))
+
+        for index, result in enumerate(search_results):
+            paper_metadata.append(
+                {
+                    "entry_id": result.entry_id,
+                    "updated": str(result.updated),
+                    "published": str(result.published),
+                    "title": result.title,
+                    "doi": result.doi,
+                    "authors": [str(author) for author in result.authors],
+                    "summary": result.summary,
+                    "journal_ref": result.journal_ref,
+                    "primary_category": result.primary_category,
+                    "categories": result.categories,
+                    "links": [str(link) for link in result.links],
+                    "pdf_url": result.pdf_url,
+                    "paper_id": slices[index],
+                    "paper_path": data[slices[index]],
+                    "quality": "low",
+                }
+            )
+
+    return paper_metadata
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-i", "--input_path", type=str, default="data/discipline_info.csv"
+    )
+    args = parser.parse_args()
+    path = args.input_path

-    Raises:
-        None
+    discipline_info = pd.read_csv("data/discipline_info.csv")
+    disciplines = set(discipline_info["discipline"])

-    This function retrieves metadata for the files in the specified path.
-    It filters the files based on a specific format, moves them to categorized directories, and logs the actions performed.
+    for discipline in disciplines:
+        target_discipline_path = os.path.join(path, discipline)
+        paper_paths = glob.glob(os.path.join(target_discipline_path, "*/"))

-    Note:
-        The function assumes that the files in the specified path are in the format 'xxxx.yyyy.ext',
-        where 'xxxx' and 'yyyy' are digits, and `ext` is the extension format.
+        data = {
+            os.path.basename(paper_path[:-1]): paper_path for paper_path in paper_paths
+        }

-    Example:
-        retrive_metadata_for_files('/path/to/directory')
-    """
-    files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
-    # filter the files have format xxxx.yyyy.ext
-    filtered_files = [
-        f for f in files if f[:4].isdigit() and f[5:9].isdigit() and f[4] == "."
-    ]
-    num_papers = len(filtered_files)
-    log.info("There are {} files".format(num_papers))
-    client = arxiv.Client()
-    slice_length = 100
+        paper_metadata = retrieve_metadata(data)

-    for i in range(0, num_papers, slice_length):
-        filename_without_extensions = [
-            os.path.splitext(f)[0] for f in filtered_files[i : i + slice_length]
-        ]
-
-        for pdf_file, result in zip(
-            filtered_files[i : i + slice_length],
-            client.results(arxiv.Search(id_list=filename_without_extensions)),
-        ):
-            old_path = os.path.join(path, pdf_file)
-            category = result.primary_category
-            new_path = os.path.join(path, category)
-            if not os.path.exists(new_path):
-                os.makedirs(new_path)
-                log.info("Created directory: {}".format(new_path))
-            try:
-                shutil.move(old_path, new_path)
-                log.info(
-                    "Move file: {} to {}".format(
-                        old_path,
-                        new_path,
-                    )
-                )
-            except Exception:
-                log.exception(f"Error moving {old_path}")
-
-
-def retrive_metadata_for_folders(path: str) -> None:
-    """Retrieves metadata for subfolders in a specified path.
-    It first query the primary category by the file's name, then move the
-    file to the path/{category}
-
-    Args:
-        path (str): The path to the directory containing the subfolders.
-
-    Returns:
-        None
-
-    Raises:
-        None
-
-    This function retrieves metadata for the subfolders in the specified path.
-    It filters the subfolders based on a specific format, moves them to categorized directories, and logs the actions performed.
-
-    Note:
-        The function assumes that the subfolders in the specified path are in the format 'xxxx.yyyy',
-        where 'xxxx' and 'yyyy' are numeric values and the length of the subfolder name is 9 characters.
-
-    Example:
-        retrive_metadata_for_folders('/path/to/directory')
-    """
-    subfolders = [f for f in os.listdir(path) if os.path.isdir(os.path.join(path, f))]
-    filtered_subfolders = [
-        f for f in subfolders if f[:4].isdigit() and f[5:].isdigit() and f[4] == "."
-    ]
-    num_papers = len(filtered_subfolders)
-    log.info("There are {} subfolders".format(num_papers))
-    client = arxiv.Client()
-    slice_length = 100
-    for i in range(0, num_papers, slice_length):
-        slice_list = filtered_subfolders[i : i + slice_length]
-        for dir_name, result in zip(
-            slice_list,
-            client.results(arxiv.Search(id_list=slice_list)),
-        ):
-            old_path = os.path.join(path, dir_name)
-            category = result.primary_category
-            log.info(f"dir_name: {dir_name}, category: {category}")
-            new_path = os.path.join(path, category)
-            if not os.path.exists(new_path):
-                os.makedirs(new_path)
-                log.info("Created directory: {}".format(new_path))
-            try:
-                shutil.move(old_path, new_path)
-                log.info(
-                    "Move directory: {} to {}".format(
-                        old_path,
-                        new_path,
-                    )
-                )
-            except Exception:
-                log.exception(f"Error moving {old_path}")
-
-
-def retrive_metadata(path: str) -> None:
-    retrive_metadata_for_folders(path)
-    retrive_metadata_for_files(path)
+        existed_json_file = os.path.join(target_discipline_path, "paper_metadata.json")
+        existed_json_data = []
+        if os.path.exists(existed_json_file):
+            existed_json_data = utils.load_json(existed_json_file)

+        existed_paper_ids = [x["paper_id"] for x in existed_json_data]
+        existed_json_data.extend(
+            [x for x in paper_metadata if x["paper_id"] not in existed_paper_ids]
+        )

-if __name__ == "__main__":
-    import argparse
+        utils.export_to_json(existed_json_data, existed_json_file)

-    parser = argparse.ArgumentParser()
-    parser.add_argument("-p", "--path", help="path to directory containing subfolders")
-    args = parser.parse_args()
-    # run(args.path, args.cpu_count)
-    # run_v2(args.path)
-    retrive_metadata(args.path)
+
+if __name__ == "__main__":
+    main()
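
For orientation, a sketch of the new `retrieve_metadata` contract (the IDs and paths below are placeholders; in the script itself, `main()` builds `data` from the discipline folders on disk):

    # Maps arXiv paper IDs to their local directories.
    data = {
        "2301.00001": "data/cs.CV/2301.00001/",
        "2301.00002": "data/cs.CV/2301.00002/",
    }

    # Queries the arXiv API in slices of 100 IDs to keep each request small,
    # and returns one record per paper (title, authors, categories, pdf_url,
    # ...), tagged with its paper_id, paper_path, and a default quality "low".
    records = retrieve_metadata(data)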

scripts/visualize_dataset_distribution.py

Lines changed: 19 additions & 2 deletions
@@ -4,7 +4,24 @@
 import numpy as np
 import csv

-from vrdu import utils
+
+def get_all_categories():
+    """
+    Retrieves all categories from the "category_count.csv" file.
+
+    Returns:
+        categories (list): A list of all categories.
+
+    Reference:
+        https://arxiv.org/category_taxonomy
+    """
+    categories = []
+    with open("scripts/category_count.csv", "r") as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            categories.append(row["categories"])
+
+    return categories


 def visualize_distribution(dict1, dict2):
@@ -50,7 +67,7 @@ def visualize_distribution(dict1, dict2):


 def analyze_raw_data(path):
-    all_categories = utils.get_all_categories()
+    all_categories = get_all_categories()

     data = defaultdict(int)
     for category in all_categories:
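
Since `get_all_categories` was inlined here from `vrdu.utils`, the script now reads `scripts/category_count.csv` directly. Assuming that file has a `categories` column, usage looks like:

    # scripts/category_count.csv (assumed shape):
    #   categories,count
    #   cs.CV,1234
    #   math.AG,567
    all_categories = get_all_categories()  # -> ["cs.CV", "math.AG", ...]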
