Skip to content

Commit 60eba67

Browse files
committed
Merge branch 'main' of https://github.com/MaoSong2022/vrdu_data_process into main
2 parents e9876b5 + dc9de68 commit 60eba67

1 file changed

Lines changed: 50 additions & 31 deletions

File tree

scripts/extract_category_block.py

Lines changed: 50 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,62 +1,81 @@
11
import os
22
import shutil
33
import uuid
4+
import datetime
45

56
from vrdu import utils
67
from vrdu.config import config
78

89

9-
def extract_category(path, category, output_path):
    """Collect every block of ``category`` from one annotated document.

    Reads ``reading_annotation.json`` inside ``path``, keeps the blocks whose
    ``"category"`` equals ``category``, copies each block's image into
    ``output_path`` under a fresh UUID-based ``.png`` name, and rewrites the
    block's metadata to point at the copy.

    Args:
        path: Directory that contains ``reading_annotation.json`` and the
            images referenced by its blocks (``image_path`` is joined onto it).
        category: Category value compared against each block's ``"category"``.
        output_path: Existing directory that receives the copied images.

    Returns:
        A list of the matching block dicts, each updated with ``image_path``
        (new file name), ``paper_source`` (``path``) and ``added_date``
        (today's date as a string). The list is empty when nothing matches or
        when the document contains a block without a ``"category"`` key.
    """
    source_json_file = os.path.join(path, "reading_annotation.json")
    data = utils.load_json(source_json_file)

    result = []
    for key, blocks in data.items():
        # Only numeric keys are treated as page entries; any other key is skipped.
        if not key.isnumeric():
            continue
        for block in blocks:
            if "category" not in block:
                # A block without a category makes the whole document
                # unusable. Return an empty list (not None) so the caller
                # always receives a list, and copy no images.
                return []
            if block["category"] == category:
                result.append(block)

    # Computed once so every block from this document carries the same date.
    added_date = str(datetime.date.today())
    for block in result:
        # A random name avoids collisions between images of different papers.
        output_image_name = f"{uuid.uuid4()}.png"
        shutil.copyfile(
            os.path.join(path, block["image_path"]),
            os.path.join(output_path, output_image_name),
        )
        block["image_path"] = output_image_name
        block["paper_source"] = path
        block["added_date"] = added_date

    return result
4238

39+
def main(category_name, input_directory, output_path):
    """Extract all blocks of the given category into an output directory.

    Walks ``input_directory``, and for every folder that contains a
    ``reading_annotation.json`` calls ``extract_category`` on it. The blocks
    it returns are merged with the annotations already stored in
    ``<output_path>/reading_annotation.json`` and written back, so folders
    extracted by an earlier run are recognised and skipped.

    Args:
        category_name: Key of ``config.name2category`` selecting the category.
        input_directory: Root directory that is searched recursively.
        output_path: Directory that receives the copied images and the merged
            ``reading_annotation.json``; created if it does not exist.

    Raises:
        KeyError: If ``category_name`` is not a key of ``config.name2category``.
    """
    if category_name not in config.name2category:
        raise KeyError(
            f"Unknown category name, avalaible category names: {list(config.name2category.keys())}"
        )

    category = config.name2category[category_name]
    result_json = os.path.join(output_path, "reading_annotation.json")

    # extract_category copies images into output_path, so it must exist first.
    os.makedirs(output_path, exist_ok=True)

    # Start from the annotations of previous runs so they are preserved.
    result = []
    if os.path.exists(result_json):
        result = utils.load_json(result_json)
    existed_source = {item["paper_source"] for item in result}

    count = 0
    for root, _dirs, files in os.walk(input_directory):
        if "reading_annotation.json" not in files:
            continue

        # if data of this folder has been extracted
        if root in existed_source:
            continue

        count += 1

        print(f"extract data from {root} to {output_path}")
        blocks = extract_category(root, category, output_path)
        # Guard against an empty or missing result before merging.
        if blocks:
            result.extend(blocks)

    # Persist the merged annotations; without this the copied images have no
    # metadata and the next run cannot tell which folders were already done.
    utils.export_to_json(result, result_json)

    # exclude the reading_annotation.json file
    num_of_samples = len(os.listdir(output_path)) - 1
    print(f"extracted {count} files, {num_of_samples} samples obtained.")
74+
if __name__ == "__main__":
    # Script entry point: extract one category from the arXiv dataset into a
    # dataset directory named after that category (lower-cased).
    category_name = "Equation"
    target_directory = f"/cpfs01/shared/ADLab/datasets/vrdu_{category_name.lower()}"

    input_directory = os.path.expanduser("/cpfs01/shared/ADLab/datasets/vrdu_arxiv")
    output_path = os.path.expanduser(target_directory)

    main(
        category_name=category_name,
        input_directory=input_directory,
        output_path=output_path,
    )

0 commit comments

Comments
 (0)