|
| 1 | +import glob |
1 | 2 | import os |
2 | 3 | import time |
| 4 | +from typing import Any, Dict, List |
3 | 5 | import arxiv |
4 | 6 | import shutil |
| 7 | +import argparse |
5 | 8 |
|
| 9 | +import pandas as pd |
| 10 | + |
| 11 | + |
| 12 | +from vrdu import utils |
6 | 13 | from vrdu import logger |
7 | 14 |
|
# Module-wide logger; everything in this script writes to retrieve_metadata.log.
log = logger.setup_app_level_logger(file_name="retrieve_metadata.log")
9 | 16 |
|
10 | 17 |
|
def retrieve_metadata(data: Dict) -> List[Dict[str, Any]]:
    """Fetch arXiv metadata for every paper id in *data*.

    Args:
        data: Mapping of arXiv paper id -> local path of that paper.

    Returns:
        A list with one metadata dict per paper, in query order. Each dict
        also carries the original ``paper_id``, its ``paper_path`` from
        *data*, and a fixed ``quality`` of ``"low"``.
    """
    batch_size = 100  # arXiv API requests are issued in batches of ids
    all_ids = list(data)
    client = arxiv.Client()
    records: List[Dict[str, Any]] = []

    for start in range(0, len(all_ids), batch_size):
        batch = all_ids[start : start + batch_size]
        results = client.results(arxiv.Search(id_list=batch))

        # NOTE(review): assumes the API yields one result per requested id,
        # in the same order as `batch` — confirm; a skipped/unknown id would
        # shift the pairing.
        for paper_id, result in zip(batch, results):
            records.append(
                {
                    "entry_id": result.entry_id,
                    "updated": str(result.updated),
                    "published": str(result.published),
                    "title": result.title,
                    "doi": result.doi,
                    "authors": [str(author) for author in result.authors],
                    "summary": result.summary,
                    "journal_ref": result.journal_ref,
                    "primary_category": result.primary_category,
                    "categories": result.categories,
                    "links": [str(link) for link in result.links],
                    "pdf_url": result.pdf_url,
                    "paper_id": paper_id,
                    "paper_path": data[paper_id],
                    "quality": "low",
                }
            )

    return records
| 52 | + |
| 53 | + |
def main() -> None:
    """Collect arXiv metadata for every paper under each discipline directory.

    For each discipline listed in the discipline-info CSV, gathers the paper
    directories under ``<input_path>/<discipline>/``, queries arXiv for their
    metadata, and merges the results into that discipline's
    ``paper_metadata.json`` — skipping papers already present in the file.
    """
    parser = argparse.ArgumentParser()
    # NOTE(review): the default value looks like a CSV *file*, but the value
    # is used below as the root *directory* containing one sub-directory per
    # discipline — confirm the intended default.
    parser.add_argument(
        "-i",
        "--input_path",
        type=str,
        default="data/discipline_info.csv",
        help="Root directory containing one sub-directory per discipline.",
    )
    # Was hard-coded inline; exposed as an option with the identical default
    # so existing invocations behave the same.
    parser.add_argument(
        "-d",
        "--discipline_csv",
        type=str,
        default="data/discipline_info.csv",
        help="CSV file with a 'discipline' column listing the disciplines.",
    )
    args = parser.parse_args()
    path = args.input_path

    discipline_info = pd.read_csv(args.discipline_csv)
    disciplines = set(discipline_info["discipline"])

    for discipline in disciplines:
        target_discipline_path = os.path.join(path, discipline)
        # Each paper lives in its own sub-directory named after its arXiv id;
        # the trailing "/" pattern matches directories only.
        paper_paths = glob.glob(os.path.join(target_discipline_path, "*/"))

        # Drop the trailing separator so basename yields the directory name.
        data = {
            os.path.basename(paper_path[:-1]): paper_path
            for paper_path in paper_paths
        }

        paper_metadata = retrieve_metadata(data)

        existed_json_file = os.path.join(target_discipline_path, "paper_metadata.json")
        existed_json_data = []
        if os.path.exists(existed_json_file):
            existed_json_data = utils.load_json(existed_json_file)

        # Set gives O(1) membership tests instead of scanning a list per paper.
        existed_paper_ids = {x["paper_id"] for x in existed_json_data}
        existed_json_data.extend(
            x for x in paper_metadata if x["paper_id"] not in existed_paper_ids
        )

        utils.export_to_json(existed_json_data, existed_json_file)


if __name__ == "__main__":
    main()
0 commit comments