Skip to content

Commit ce67aa3

Browse files
committed
Merge branch 'main' of https://github.com/MaoSong2022/vrdu_data_process into main
2 parents 15f3b7b + 5f1a98d commit ce67aa3

3 files changed

Lines changed: 73 additions & 1 deletion

File tree

batch_process.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,9 @@ def process_one_file(file_name, success_save_path):
139139
return process_result
140140

141141

142+
143+
144+
142145
def main(path, cpu_count=None):
143146
success_path = path + '_success'
144147
os.makedirs(success_path, exist_ok=True)

scripts/classify_path.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import os
2+
import arxiv
3+
import shutil
4+
5+
6+
def retrieve_arxiv_metadata(path: str):
    """
    Look up an arXiv entry and return its default file name and primary category.

    Although the parameter is called ``path``, its value is handed to
    ``arxiv.Search(id_list=[path])`` unchanged, so it has to be the bare
    folder name that doubles as the arXiv identifier (pattern "****.****",
    where * is a digit), not a full filesystem path.

    Args:
        path (str): Name of the arXiv document folder, used as the arXiv id.

    Returns:
        Tuple[str, str]: The default file name of the first search result
        (with its last character removed) and the result's primary category.

    Raises:
        FileNotFoundError: If the search yields no result, or the result has
            no file name or no primary category.
    """
    query = arxiv.Search(id_list=[path])

    # Only the first hit matters; an empty result set is represented by None.
    first_hit = next(iter(query.results()), None)
    if first_hit is not None:
        file_name = first_hit._get_default_filename(extension="")
        category = first_hit.primary_category
    else:
        file_name = category = None

    if file_name is None or category is None:
        raise FileNotFoundError(f"metadata of {path} cannot be found in arXiv.")

    # NOTE(review): the slice presumably strips the trailing "." that the
    # empty extension leaves behind -- confirm against the arxiv package.
    return file_name[:-1], category
30+
31+
32+
def run(path):
    """
    Moves subfolders in the given path to a new location based on arxiv metadata.

    Only direct subfolders whose name has the pattern "****.****" (four
    digits, a dot, four digits) are processed. For each of them the metadata
    is fetched with ``retrieve_arxiv_metadata`` and the folder is moved to
    ``<path>/<category>/<new_dir_name>``; the category directory is created
    on demand. Every other entry in ``path`` is left untouched.

    Args:
        path (str): The path to the directory containing the subfolders.

    Returns:
        None

    Raises:
        FileNotFoundError: Propagated from ``retrieve_arxiv_metadata`` when
            the metadata of a subfolder cannot be found; folders after the
            failing one are not processed.

    Example:
        With ``path`` containing a folder ``1234.5678`` whose metadata
        resolves to category ``C`` and file name ``N``, the folder ends up at
        ``<path>/C/N`` and "Moved 1234.5678 to C/N" is printed.
    """
    subfolders = [f for f in os.listdir(path) if os.path.isdir(os.path.join(path, f))]
    # NOTE(review): this accepts only 9-character names (4 digits, ".", 4 digits);
    # arXiv ids with a 5-digit sequence number are skipped -- confirm this is intended.
    filtered_subfolders = [
        f
        for f in subfolders
        if len(f) == 9 and f[:4].isdigit() and f[5:].isdigit() and f[4] == "."
    ]
    for dir_name in filtered_subfolders:
        new_dir_name, category = retrieve_arxiv_metadata(dir_name)
        category_dir = os.path.join(path, category)
        # exist_ok avoids the check-then-create race and matches batch_process.py.
        os.makedirs(category_dir, exist_ok=True)
        shutil.move(
            os.path.join(path, dir_name),
            # Join the components instead of hard-coding "/" as the separator.
            os.path.join(category_dir, new_dir_name),
        )
        print(f"Moved {dir_name} to {category}/{new_dir_name}")
61+
62+
63+
# Command-line entry point: classify the arXiv subfolders of the given directory.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    # --path is mandatory: without it args.path would be None and run() would
    # not receive a usable directory, so let argparse report the usage error.
    parser.add_argument(
        "-p",
        "--path",
        required=True,
        help="path to directory containing subfolders",
    )
    args = parser.parse_args()

    run(args.path)

vrdu/utils.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import datetime
21
import os
32
import re
43
import subprocess

0 commit comments

Comments
 (0)