Skip to content

Commit f4f4e62

Browse files
committed
Merge branch 'dev' into main
2 parents 646a2bb + 0df6f72 commit f4f4e62

23 files changed

Lines changed: 405 additions & 5321 deletions

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -178,3 +178,6 @@ TexSoup/tests/
178178
*.tex
179179
*.aux
180180
*.pdf
181+
182+
# jupyter notebook
183+
*.ipynb

batch_process.py

Lines changed: 39 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -4,13 +4,16 @@
44
import multiprocessing
55
from typing import List
66
from uuid import uuid4
7+
import pandas as pd
78

89
from vrdu import logger
910
from vrdu import utils
1011
from main import process_one_file
1112

1213
log_file = str(uuid4()) + ".log"
13-
log = logger.setup_app_level_logger(file_name=log_file, level="INFO", mode="a")
14+
log = logger.setup_app_level_logger(file_name=log_file, level="INFO")
15+
16+
database = "data/processed_paper_database.csv"
1417

1518

1619
def filter_tex_files(tex_files: List[str], main_path: str = None) -> List[str]:
@@ -39,38 +42,57 @@ def filter_tex_files(tex_files: List[str], main_path: str = None) -> List[str]:
3942
log.debug(f"failed to read tex file: {tex_file}")
4043
continue
4144

45+
log.info(f"Before filtering, Found {len(result)} tex files")
46+
if os.path.exists(database):
47+
df = pd.read_csv(database)
48+
processed_papers = set(df[df["status"] != "processing"]["path"])
49+
result = [x for x in result if os.path.dirname(x) not in processed_papers]
50+
51+
log.info(f"After filtering, Found {len(result)} tex files")
4252
return result
4353

4454

45-
def process_one_category(path, cpu_count, category):
46-
category_path = os.path.join(path, category)
47-
log.info(f"path to raw data: {category_path}")
55+
def process_one_discpline(path: str, cpu_count: int, discpline: str) -> None:
56+
discpline_path = os.path.join(path, discpline)
57+
log.info(f"path to raw data: {discpline_path}")
4858
log.info(f"Using cpu counts: {cpu_count}")
49-
tex_files = utils.extract_all_tex_files(category_path)
50-
tex_files = filter_tex_files(tex_files, category_path)
59+
tex_files = utils.extract_all_tex_files(discpline_path)
60+
tex_files = filter_tex_files(tex_files, discpline_path)
5161
log.info(f"Found {len(tex_files)} tex files")
5262

5363
try:
5464
with multiprocessing.Pool(cpu_count) as pool:
5565
pool.map(process_one_file, tex_files)
5666
# save log file
5767
except Exception:
58-
log.exception(f"[VRDU] category: {category}, failed to process.")
68+
log.exception(f"[VRDU] discpline: {discpline}, failed to process.")
5969
finally:
6070
# save the process log
61-
shutil.move(log_file, f"batch_process_{category}.log")
71+
log.info(f"[VRDU] discpline: {discpline}, finished processing.")
72+
shutil.move(log_file, f"data/batch_process_{discpline}.log")
6273

6374

64-
if __name__ == "__main__":
75+
def main():
6576
parser = argparse.ArgumentParser()
66-
parser.add_argument("-p", "--path", type=str, required=True)
67-
parser.add_argument("-c", "--cpu_count", type=int, required=True)
68-
parser.add_argument("-t", "--category", type=str, required=False)
77+
parser.add_argument(
78+
"-p", "--path", type=str, required=True, help="path to raw data"
79+
)
80+
parser.add_argument(
81+
"-c",
82+
"--cpu_count",
83+
type=int,
84+
required=True,
85+
help="cpu count for multiprocessing",
86+
)
87+
parser.add_argument(
88+
"-t", "--discpline", type=str, required=True, help="discpline to process"
89+
)
6990
args = parser.parse_args()
70-
path, cpu_count, category = args.path, args.cpu_count, args.category
91+
path, cpu_count, discpline = args.path, args.cpu_count, args.discpline
7192

72-
categories = [category] if category is not None else utils.get_all_categories()
93+
log.info(f"[VRDU] discpline: {discpline}, start to process.")
94+
process_one_discpline(path, cpu_count, discpline)
7395

74-
for category in categories:
75-
log.info(f"Processing single category: {category}")
76-
process_one_category(path, cpu_count, category)
96+
97+
if __name__ == "__main__":
98+
main()
Lines changed: 31 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
categories,count
1+
discpline,count
22
cs.LG,68
33
cs.CV,56
44
quant-ph,40
@@ -122,3 +122,33 @@ cs.DM,1
122122
cs.SC,1
123123
physics.med-ph,1
124124
q-fin.GN,1
125+
q-bio.OT,1
126+
nlin.CG,1
127+
cs.AR,1
128+
cs.OH,1
129+
nlin.PS,1
130+
cs.GL,1
131+
q-bio.TO,1
132+
physics.pop-ph,1
133+
q-fin.CP,1
134+
q-bio.MN,1
135+
q-bio.SC,1
136+
math.AC,1
137+
physics.ao-ph,1
138+
cs.MM,1
139+
physics.ed-ph,1
140+
q-fin.EC,1
141+
cs.SY,1
142+
nlin.CD,1
143+
cs.PL,1
144+
cs.PF,1
145+
cs.OS,1
146+
q-bio.GN,1
147+
q-fin.MF,1
148+
cs.MS,1
149+
math.KT,1
150+
cs.DL,1
151+
econ.GN,1
152+
q-fin.PM,1
153+
stat.OT,1
154+
astro-ph,1

examples/example1/0.png

-711 KB
Binary file not shown.

examples/example1/1.png

-803 KB
Binary file not shown.

examples/example1/2.png

-605 KB
Binary file not shown.

examples/example1/3.png

-623 KB
Binary file not shown.

examples/example1/4.png

-724 KB
Binary file not shown.

examples/example1/5.png

-742 KB
Binary file not shown.

examples/example1/6.png

-739 KB
Binary file not shown.

0 commit comments

Comments (0)