|
1 | 1 | import glob |
2 | 2 | import shutil |
3 | | -from typing import List |
4 | 3 | import re |
5 | 4 | import os |
6 | | -from tqdm import tqdm |
7 | 5 | import argparse |
8 | 6 | import pandas as pd |
| 7 | +import multiprocessing |
9 | 8 |
|
10 | 9 | from vrdu import logger |
11 | 10 |
|
|
19 | 18 | ] |
20 | 19 |
|
21 | 20 |
|
def export_one_paper(main_path: str, target_path: str) -> None:
    """
    Export a single processed paper from its working directory to the dataset.

    Args:
        main_path (str): Path to the paper's working directory; expected to
            contain ``output/result`` with the annotation artifacts and
            ``output/paper_colored`` with the original page images.
        target_path (str): Root of the dataset directory. The paper is
            exported to ``<target_path>/<discipline>/<paper_id>/``, where
            discipline and paper_id are the last two components of main_path.

    Returns:
        None

    Raises:
        FileNotFoundError: If an expected artifact (quality report,
            annotation file) is missing from the result directory.

    Notes:
        - If the destination paper directory already exists, the paper is
          assumed to be exported already and the function returns early.
        - Original images are renamed to ``original-page-NNNN.jpg`` with a
          zero-based, zero-padded page index derived from the source
          filename's ``page-<k>`` component.
    """
    log.info(f"processing paper: {main_path}")
    output_path = os.path.join(main_path, "output")
    result_path = os.path.join(output_path, "result")

    paper_id = os.path.basename(main_path)
    discipline = os.path.basename(os.path.dirname(main_path))

    # exist_ok=True: safe when several worker processes create the same
    # discipline directory concurrently.
    target_discipline_path = os.path.join(target_path, discipline)
    os.makedirs(target_discipline_path, exist_ok=True)

    new_paper_path = os.path.join(target_discipline_path, paper_id)
    if os.path.exists(new_paper_path):
        # Already exported (possibly by a previous run) — skip.
        return

    os.makedirs(new_paper_path)

    # copy quality report file
    quality_report_file = os.path.join(result_path, "quality_report.json")
    shutil.copy(quality_report_file, new_paper_path)

    # copy annotation files
    for json_file in json_files:
        shutil.copy(os.path.join(result_path, json_file), new_paper_path)

    # copy original images, renamed with a zero-based zero-padded page index
    original_image_path = os.path.join(output_path, "paper_colored")
    original_images = glob.glob(os.path.join(original_image_path, "*.jpg"))
    for image in original_images:
        filename = os.path.basename(image)
        # assumes every jpg in paper_colored has a "page-<k>" component —
        # a non-matching name would raise AttributeError here.
        match = re.search(r"page-(\d+)", filename)
        page_index = int(match.group(1)) - 1
        new_image_name = "original-page-{}.jpg".format(str(page_index).zfill(4))
        shutil.copy(image, os.path.join(new_paper_path, new_image_name))

    # copy annotated images (kept under their original names)
    annotated_images = glob.glob(os.path.join(result_path, "*.jpg"))
    for image in annotated_images:
        shutil.copy(image, new_paper_path)
| 116 | + |
def export_to_dataset(
    database_file: str, output_path: str, *, num_processes: int = 28
) -> None:
    """
    Export all successfully processed papers listed in the database file.

    Args:
        database_file (str): Path to a CSV file with at least the columns
            ``status`` and ``path``; only rows whose status equals
            ``"success"`` are exported.
        output_path (str): Root directory of the dataset into which each
            paper is exported (see :func:`export_one_paper`).
        num_processes (int): Size of the multiprocessing pool used to
            export papers in parallel. Defaults to 28, the value the
            original implementation hard-coded.

    Returns:
        None

    Steps:
        1. Read the database CSV and keep rows with status "success".
        2. Log how many papers will be exported.
        3. Fan the per-paper export out over a multiprocessing pool via
           ``starmap``, calling ``export_one_paper(paper, output_path)``
           for each paper path.
    """
    df = pd.read_csv(database_file)
    processed_papers = df[df["status"] == "success"]["path"].tolist()
    log.info(f"There are {len(processed_papers)} papers")

    arguments = [(paper, output_path) for paper in processed_papers]
    with multiprocessing.Pool(processes=num_processes) as pool:
        pool.starmap(export_one_paper, arguments)
def main() -> None:
    """
    Entry point: parse command-line arguments and run the dataset export.

    Command-line arguments:
        -d / --database_file: processed database file (CSV of paper paths
            and statuses).
        -o / --output_path: output path of dataset.

    Returns:
        None
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-d", "--database_file", type=str, help="processed database file"
    )
    parser.add_argument("-o", "--output_path", type=str, help="output path of dataset")

    cli_args = parser.parse_args()
    export_to_dataset(cli_args.database_file, cli_args.output_path)
87 | 186 |
|
88 | 187 |
|
89 | 188 | if __name__ == "__main__": |
|
0 commit comments