Skip to content

Commit d689ce2

Browse files
committed
feat(retrieve_metadata.py): optimize the retrieve logic
1 parent a776d9d commit d689ce2

1 file changed

Lines changed: 70 additions & 123 deletions

File tree

scripts/retrive_metadata.py

Lines changed: 70 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -1,142 +1,89 @@
1+
import glob
12
import os
23
import time
4+
from typing import Any, Dict, List
35
import arxiv
46
import shutil
7+
import argparse
58

9+
import pandas as pd
10+
11+
12+
from vrdu import utils
613
from vrdu import logger
714

8-
log = logger.setup_app_level_logger(file_name="retrive_metadata.log")
15+
log = logger.setup_app_level_logger(file_name="retrieve_metadata.log")
916

1017

11-
def retrive_metadata_for_files(path: str) -> None:
12-
"""Retrieves metadata for files in a specified path.
13-
It first query the primary category by the file's name, then move the
14-
file to the path/{category}
18+
def retrieve_metadata(data: Dict[str, str]) -> List[Dict[str, Any]]:
    """Fetch arXiv metadata for a set of papers.

    Args:
        data: Mapping of arXiv paper id -> local path of that paper's folder.

    Returns:
        One metadata record (dict) per result the arXiv API returned.
    """
    import re

    paper_ids = list(data.keys())

    client = arxiv.Client()

    # The arXiv API limits id_list queries, so fetch in batches of 100.
    slice_length = 100
    paper_metadata = []

    for i in range(0, len(paper_ids), slice_length):
        slices = paper_ids[i : i + slice_length]
        search_results = client.results(arxiv.Search(id_list=slices))

        for index, result in enumerate(search_results):
            # Pair each result with its requested id via entry_id rather than
            # by position alone: if the API skips an id it cannot resolve,
            # purely positional pairing mis-attributes every later result.
            # entry_id looks like "http://arxiv.org/abs/2101.00001v2".
            paper_id = re.sub(r"v\d+$", "", result.entry_id.rsplit("/", 1)[-1])
            if paper_id not in data:
                # Fall back to positional pairing (original behaviour), e.g.
                # for old-style ids ("math.GT/0309136") the basename differs.
                paper_id = slices[index]
            paper_metadata.append(
                {
                    "entry_id": result.entry_id,
                    "updated": str(result.updated),
                    "published": str(result.published),
                    "title": result.title,
                    "doi": result.doi,
                    "authors": [str(author) for author in result.authors],
                    "summary": result.summary,
                    "journal_ref": result.journal_ref,
                    "primary_category": result.primary_category,
                    "categories": result.categories,
                    "links": [str(link) for link in result.links],
                    "pdf_url": result.pdf_url,
                    "paper_id": paper_id,
                    "paper_path": data[paper_id],
                    "quality": "low",
                }
            )

    return paper_metadata
52+
53+
54+
def main():
    """Read the discipline list, fetch arXiv metadata for each discipline's
    paper folders, and merge the results into that discipline's
    ``paper_metadata.json`` (keeping previously exported records)."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-i", "--input_path", type=str, default="data/discipline_info.csv"
    )
    args = parser.parse_args()
    path = args.input_path

    # Bug fix: use the CLI argument instead of a hard-coded literal —
    # previously -i/--input_path was parsed but silently ignored here.
    discipline_info = pd.read_csv(path)
    disciplines = set(discipline_info["discipline"])

    for discipline in disciplines:
        # NOTE(review): `path` defaults to a CSV *file*, yet it is also used
        # here as the root *directory* for discipline folders — confirm
        # whether a separate data-root argument is intended.
        target_discipline_path = os.path.join(path, discipline)
        paper_paths = glob.glob(os.path.join(target_discipline_path, "*/"))

        # Map paper id (folder basename, trailing "/" stripped) -> folder path.
        data = {
            os.path.basename(paper_path[:-1]): paper_path for paper_path in paper_paths
        }

        paper_metadata = retrieve_metadata(data)

        existed_json_file = os.path.join(target_discipline_path, "paper_metadata.json")
        existed_json_data = []
        if os.path.exists(existed_json_file):
            existed_json_data = utils.load_json(existed_json_file)

        # Set membership is O(1); the original list scan made the merge O(n^2).
        existed_paper_ids = {x["paper_id"] for x in existed_json_data}
        existed_json_data.extend(
            [x for x in paper_metadata if x["paper_id"] not in existed_paper_ids]
        )

        utils.export_to_json(existed_json_data, existed_json_file)
13686

137-
parser = argparse.ArgumentParser()
138-
parser.add_argument("-p", "--path", help="path to directory containing subfolders")
139-
args = parser.parse_args()
140-
# run(args.path, args.cpu_count)
141-
# run_v2(args.path)
142-
retrive_metadata(args.path)
87+
88+
# Script entry point: only run when executed directly, not on import.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)