Skip to content

Commit 931098c

Browse files
committed
refactor(export_to_dataset.py): simplify code logic
1 parent 0da6604 commit 931098c

1 file changed

Lines changed: 144 additions & 45 deletions

File tree

scripts/export_to_dataset.py

Lines changed: 144 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,10 @@
11
import glob
22
import shutil
3-
from typing import List
43
import re
54
import os
6-
from tqdm import tqdm
75
import argparse
86
import pandas as pd
7+
import multiprocessing
98

109
from vrdu import logger
1110

@@ -19,71 +18,171 @@
1918
]
2019

2120

22-
def extract_processed_papers(database_file: str) -> List[str]:
23-
df = pd.read_csv(database_file)
24-
processed_papers = df[df["status"] == "success"]["path"].tolist()
25-
log.info(f"There are {len(processed_papers)} papers")
26-
return processed_papers
21+
def export_one_paper(main_path: str, target_path: str) -> None:
22+
"""
23+
Processes a single paper from the input directory and exports it to the target directory.
24+
25+
Args:
26+
main_path (str): The path to the input directory containing the paper to be processed.
27+
target_path (str): The path to the target directory where the processed paper will be exported.
28+
29+
Returns:
30+
None
31+
32+
Raises:
33+
FileNotFoundError: If the target directory does not exist and cannot be created, or if a source file to be copied is missing.
34+
35+
Purpose:
36+
This function processes a single paper by copying its quality report, annotation files,
37+
and images to a new directory within the target directory.
38+
39+
Steps:
40+
1. Logs the processing of the paper.
41+
2. Extracts the output directory and result directory from the input directory.
42+
3. Checks whether the target directory for the discipline exists and creates it if it does not.
43+
4. Creates a new directory for the paper within the target directory.
44+
5. Copies the quality report file to the new directory.
45+
6. Copies the annotation files to the new directory.
46+
7. Copies the original images to the new directory,
47+
renaming them to "original-page-{page_index}.jpg", where page_index is the page number from the file name minus one, zero-padded to four digits.
48+
8. Copies the annotated images to the new directory.
49+
"""
50+
log.info(f"processing paper: {main_path}")
51+
output_path = os.path.join(main_path, "output")
52+
result_path = os.path.join(output_path, "result")
53+
54+
paper_id = os.path.basename(main_path)
55+
discipline = os.path.basename(os.path.dirname(main_path))
56+
57+
target_discipline_path = os.path.join(target_path, discipline)
58+
if not os.path.exists(target_discipline_path):
59+
os.makedirs(target_discipline_path)
60+
61+
new_paper_path = os.path.join(target_discipline_path, paper_id)
62+
if os.path.exists(new_paper_path):
63+
return
2764

65+
os.makedirs(new_paper_path)
2866

29-
def export_to_dataset(processed_papers: List[str], target_path: str) -> None:
30-
for main_path in tqdm(processed_papers):
31-
log.info(f"processing paper: {main_path}")
32-
output_path = os.path.join(main_path, "output")
33-
result_path = os.path.join(output_path, "result")
67+
# copy quality report file
68+
quality_report_file = os.path.join(result_path, "quality_report.json")
69+
shutil.copy(quality_report_file, new_paper_path)
3470

35-
paper_id = os.path.basename(main_path)
36-
discipline = os.path.basename(os.path.dirname(main_path))
71+
# copy annotation files
72+
for json_file in json_files:
73+
shutil.copy(os.path.join(result_path, json_file), new_paper_path)
3774

38-
target_discipline_path = os.path.join(target_path, discipline)
39-
if not os.path.exists(target_discipline_path):
40-
os.makedirs(target_discipline_path)
75+
# copy images
76+
original_image_path = os.path.join(output_path, "paper_colored")
4177

42-
new_paper_path = os.path.join(target_discipline_path, paper_id)
43-
if os.path.exists(new_paper_path):
44-
continue
78+
original_images = glob.glob(os.path.join(original_image_path, "*.jpg"))
79+
for image in original_images:
80+
filename = os.path.basename(image)
81+
match = re.search(r"page-(\d+)", filename)
82+
page_index = int(match.group(1)) - 1
83+
new_image_name = "original-page-{}.jpg".format(str(page_index).zfill(4))
4584

46-
os.makedirs(new_paper_path)
85+
shutil.copy(image, os.path.join(new_paper_path, new_image_name))
4786

48-
# copy quality report file
49-
quality_report_file = os.path.join(result_path, "quality_report.json")
50-
shutil.copy(quality_report_file, new_paper_path)
87+
annotated_images = glob.glob(os.path.join(result_path, "*.jpg"))
88+
for image in annotated_images:
89+
shutil.copy(image, new_paper_path)
90+
os.makedirs(new_paper_path)
5191

52-
# copy annotation files
53-
for json_file in json_files:
54-
shutil.copy(os.path.join(result_path, json_file), new_paper_path)
92+
# copy quality report file
93+
quality_report_file = os.path.join(result_path, "quality_report.json")
94+
shutil.copy(quality_report_file, new_paper_path)
95+
96+
# copy annotation files
97+
for json_file in json_files:
98+
shutil.copy(os.path.join(result_path, json_file), new_paper_path)
99+
100+
# copy images
101+
original_image_path = os.path.join(output_path, "paper_colored")
102+
103+
original_images = glob.glob(os.path.join(original_image_path, "*.jpg"))
104+
for image in original_images:
105+
filename = os.path.basename(image)
106+
match = re.search(r"page-(\d+)", filename)
107+
page_index = int(match.group(1)) - 1
108+
new_image_name = "original-page-{}.jpg".format(str(page_index).zfill(4))
109+
110+
shutil.copy(image, os.path.join(new_paper_path, new_image_name))
111+
112+
annotated_images = glob.glob(os.path.join(result_path, "*.jpg"))
113+
for image in annotated_images:
114+
shutil.copy(image, new_paper_path)
115+
116+
117+
def export_to_dataset(database_file: str, output_path: str) -> None:
118+
"""
119+
Exports processed papers from the provided database file to the specified output path.
120+
121+
Args:
122+
database_file (str): The path to the processed database file containing the list of processed papers.
123+
output_path (str): The path to the target directory where the processed papers will be exported.
124+
125+
Returns:
126+
None
127+
128+
Raises:
129+
FileNotFoundError: If the target directory does not exist and cannot be created, or if the database file cannot be found.
130+
131+
Purpose:
132+
This function exports the processed papers by processing each paper individually
133+
and exporting it to the target directory.
134+
135+
Steps:
136+
1. Reads the processed database file and keeps only the papers whose status is "success".
137+
2. Logs the number of processed papers.
138+
3. Builds the argument list: one (paper path, output path) tuple per processed paper.
139+
4. Creates a multiprocessing pool with 28 processes.
140+
5. Uses the pool to starmap the export_one_paper function on the list of arguments.
141+
142+
Note:
143+
The export_one_paper function is responsible for processing a single paper
144+
and exporting it to the target directory.
145+
"""
146+
df = pd.read_csv(database_file)
147+
processed_papers = df[df["status"] == "success"]["path"].tolist()
148+
log.info(f"There are {len(processed_papers)} papers")
55149

56-
# copy images
57-
original_image_path = os.path.join(output_path, "paper_colored")
150+
arguments = [(paper, output_path) for paper in processed_papers]
151+
with multiprocessing.Pool(processes=28) as pool:
152+
pool.starmap(export_one_paper, arguments)
58153

59-
original_images = glob.glob(os.path.join(original_image_path, "*.jpg"))
60-
for image in original_images:
61-
filename = os.path.basename(image)
62-
match = re.search(r"page-(\d+)", filename)
63-
page_index = int(match.group(1)) - 1
64-
new_image_name = "original-page-{}.jpg".format(str(page_index).zfill(4))
65154

66-
shutil.copy(image, os.path.join(new_paper_path, new_image_name))
155+
def main() -> None:
156+
"""
157+
The main function that parses command-line arguments and calls the export_to_dataset function.
67158
68-
annotated_images = glob.glob(os.path.join(result_path, "*.jpg"))
69-
for image in annotated_images:
70-
shutil.copy(image, new_paper_path)
159+
Args:
160+
None
71161
162+
Returns:
163+
None
72164
73-
def extract_dataset(database_file: str, output_path: str):
74-
processed_papers = extract_processed_papers(database_file)
75-
export_to_dataset(processed_papers, output_path)
165+
Raises:
166+
None
76167
168+
Purpose:
169+
This function is the entry point of the script. It parses the command-line arguments
170+
using the argparse module and then calls the export_to_dataset function with the provided arguments.
77171
78-
def main():
172+
Steps:
173+
1. Create an ArgumentParser object to parse the command-line arguments.
174+
2. Add two command-line arguments: "-d" for the processed database file and "-o" for the output path of the dataset.
175+
3. Parse the command-line arguments using the parse_args() method of the ArgumentParser object.
176+
4. Call the export_to_dataset function with the parsed arguments.
177+
"""
79178
parser = argparse.ArgumentParser()
80179
parser.add_argument(
81180
"-d", "--database_file", type=str, help="processed database file"
82181
)
83-
parser.add_argument("-o", "--output_path", type=str, help="output dir")
182+
parser.add_argument("-o", "--output_path", type=str, help="output path of dataset")
84183
args = parser.parse_args()
85184

86-
extract_dataset(args.database_file, args.output_path)
185+
export_to_dataset(args.database_file, args.output_path)
87186

88187

89188
if __name__ == "__main__":

0 commit comments

Comments
 (0)