Skip to content

Commit 5f1a98d

Browse files
committed
refactor(batch_process.py, classify_path.py, utils.py): move classify_path to scripts
1 parent 863eafd commit 5f1a98d

3 files changed

Lines changed: 70 additions & 28 deletions

File tree

batch_process.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -139,22 +139,10 @@ def process_one_file(file_name, success_save_path):
139139
return process_result
140140

141141

142-
def classify_path(path, required=False):
143-
# used to process arxiv_uncompressed
144-
# if process data in vrdu_arxiv, no need for this process
145-
if not required:
146-
return
147142

148-
for dir_name in glob.glob(f"{path}/*/"):
149-
new_dir_name, category = utils.retrive_arxiv_metadata(dir_name)
150-
if not os.path.exists(os.path.join(path, category)):
151-
os.makedirs(os.path.join(path, category))
152-
shutil.move(dir_name, os.path.join(path, category))
153143

154144

155145
def main(path, cpu_count=None):
156-
157-
classify_path(path, required=True)
158146
success_path = path + '_success'
159147
os.makedirs(success_path, exist_ok=True)
160148
tex_files = sorted(extract_tex_files(path))

scripts/classify_path.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import os
2+
import arxiv
3+
import shutil
4+
5+
6+
def retrieve_arxiv_metadata(path: str):
    """
    Look up an arXiv document and return its default file name and category.

    Despite its name, ``path`` is passed to ``arxiv.Search`` as an entry of
    ``id_list``, so it is expected to be an arXiv identifier such as the
    name of a document folder ("****.****", where * is a digit).

    Args:
        path (str): The arXiv identifier of the document.

    Returns:
        Tuple[str, str]: The default file name of the first search result
            with its last character removed, and the primary category of
            that result.

    Raises:
        FileNotFoundError: If the search yields no result, or the result
            carries no file name or category.
    """
    # Only the first hit is of interest; None signals an empty result set.
    first_hit = next(iter(arxiv.Search(id_list=[path]).results()), None)

    if first_hit is None:
        default_name, primary = None, None
    else:
        default_name = first_hit._get_default_filename(extension="")
        primary = first_hit.primary_category

    if default_name is None or primary is None:
        raise FileNotFoundError(f"metadata of {path} cannot be found in arXiv.")

    # The file name is built with an empty extension; drop its final
    # character (presumably the dangling "." separator -- confirm against
    # the installed arxiv package).
    return default_name[:-1], primary
30+
31+
32+
def _is_arxiv_id(name):
    """Return True if ``name`` has the form YYMM.NNNN or YYMM.NNNNN (digits only)."""
    prefix, dot, number = name.partition(".")
    return (
        dot == "."
        and len(prefix) == 4
        and prefix.isdigit()
        and len(number) in (4, 5)
        and number.isdigit()
    )


def run(path):
    """
    Sort arXiv document folders into per-category subdirectories.

    Every direct subfolder of ``path`` whose name looks like an arXiv
    identifier ("****.****" or "****.*****", where * is a digit) is looked
    up with ``retrieve_arxiv_metadata`` and moved to
    ``<path>/<category>/<new name>``. Other entries are left untouched.

    Args:
        path (str): The path to the directory containing the subfolders.

    Returns:
        None

    Raises:
        FileNotFoundError: Propagated from ``retrieve_arxiv_metadata`` when
            the metadata of a folder cannot be found; folders processed
            before the failure stay moved.

    Example:
        Given ``data/1501.00001/``, ``run("data")`` moves that folder to
        ``data/<category>/<file name reported by arXiv>`` and prints one
        "Moved ... to ..." line for it.
    """
    # Accept both the four- and five-digit sequence numbers used by arXiv
    # identifiers; sorting makes the processing order (and the log output)
    # deterministic, since os.listdir returns entries in arbitrary order.
    candidates = sorted(
        entry
        for entry in os.listdir(path)
        if _is_arxiv_id(entry) and os.path.isdir(os.path.join(path, entry))
    )

    for dir_name in candidates:
        new_dir_name, category = retrieve_arxiv_metadata(dir_name)

        category_dir = os.path.join(path, category)
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(category_dir, exist_ok=True)

        shutil.move(
            os.path.join(path, dir_name),
            os.path.join(category_dir, new_dir_name),
        )
        print(f"Moved {dir_name} to {category}/{new_dir_name}")
61+
62+
63+
if __name__ == "__main__":
    # Command-line entry point: classify the arXiv folders found under --path.
    import argparse

    parser = argparse.ArgumentParser(
        description="Move arXiv document folders into per-category subfolders."
    )
    # --path is mandatory: without it run() would receive None and fail with
    # an unhelpful TypeError instead of a usage message.
    parser.add_argument(
        "-p",
        "--path",
        required=True,
        help="path to directory containing subfolders",
    )
    args = parser.parse_args()

    run(args.path)

vrdu/utils.py

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
1-
import datetime
21
import os
32
import re
43
import subprocess
54
import json
65
from typing import Any, Dict
7-
import arxiv
86

97
from TexSoup.TexSoup import TexSoup
108
import TexSoup.app.conversion as conversion
@@ -260,20 +258,6 @@ def get_graphicspath(latex):
260258
return ""
261259

262260

263-
def retrive_arxiv_metadata(path: str):
264-
search = arxiv.Search(id_list=[path])
265-
266-
file_name, category = None, None
267-
for result in search.results():
268-
file_name = result._get_default_filename(extension="")
269-
category = result.primary_category
270-
break
271-
272-
if file_name is None or category is None:
273-
raise FileNotFoundError(f"metadata of {path} cannot be found in arXiv.")
274-
return file_name[:-1], category
275-
276-
277261
def colorize(text: str, category_name: str) -> str:
278262
color = config.name2color[category_name]
279263
if category_name == "Caption":

0 commit comments

Comments
 (0)