|
4 | 4 | import multiprocessing |
5 | 5 | from typing import List |
6 | 6 | from uuid import uuid4 |
| 7 | +import pandas as pd |
7 | 8 |
|
8 | 9 | from vrdu import logger |
9 | 10 | from vrdu import utils |
10 | 11 | from main import process_one_file |
11 | 12 |
|
12 | 13 | log_file = str(uuid4()) + ".log" |
13 | | -log = logger.setup_app_level_logger(file_name=log_file, level="INFO", mode="a") |
| 14 | +log = logger.setup_app_level_logger(file_name=log_file, level="INFO") |
| 15 | + |
| 16 | +database = "data/processed_paper_database.csv" |
14 | 17 |
|
15 | 18 |
|
16 | 19 | def filter_tex_files(tex_files: List[str], main_path: str = None) -> List[str]: |
@@ -39,38 +42,57 @@ def filter_tex_files(tex_files: List[str], main_path: str = None) -> List[str]: |
39 | 42 | log.debug(f"failed to read tex file: {tex_file}") |
40 | 43 | continue |
41 | 44 |
|
| 45 | + log.info(f"Before filtering, Found {len(result)} tex files") |
| 46 | + if os.path.exists(database): |
| 47 | + df = pd.read_csv(database) |
| 48 | + processed_papers = set(df[df["status"] != "processing"]["path"]) |
| 49 | + result = [x for x in result if os.path.dirname(x) not in processed_papers] |
| 50 | + |
| 51 | + log.info(f"After filtering, Found {len(result)} tex files") |
42 | 52 | return result |
43 | 53 |
|
44 | 54 |
|
45 | | -def process_one_category(path, cpu_count, category): |
46 | | - category_path = os.path.join(path, category) |
47 | | - log.info(f"path to raw data: {category_path}") |
| 55 | +def process_one_discpline(path: str, cpu_count: int, discpline: str) -> None: |
| 56 | + discpline_path = os.path.join(path, discpline) |
| 57 | + log.info(f"path to raw data: {discpline_path}") |
48 | 58 | log.info(f"Using cpu counts: {cpu_count}") |
49 | | - tex_files = utils.extract_all_tex_files(category_path) |
50 | | - tex_files = filter_tex_files(tex_files, category_path) |
| 59 | + tex_files = utils.extract_all_tex_files(discpline_path) |
| 60 | + tex_files = filter_tex_files(tex_files, discpline_path) |
51 | 61 | log.info(f"Found {len(tex_files)} tex files") |
52 | 62 |
|
53 | 63 | try: |
54 | 64 | with multiprocessing.Pool(cpu_count) as pool: |
55 | 65 | pool.map(process_one_file, tex_files) |
56 | 66 | # save log file |
57 | 67 | except Exception: |
58 | | - log.exception(f"[VRDU] category: {category}, failed to process.") |
| 68 | + log.exception(f"[VRDU] discpline: {discpline}, failed to process.") |
59 | 69 | finally: |
60 | 70 | # save the process log |
61 | | - shutil.move(log_file, f"batch_process_{category}.log") |
| 71 | + log.info(f"[VRDU] discpline: {discpline}, finished processing.") |
| 72 | + shutil.move(log_file, f"data/batch_process_{discpline}.log") |
62 | 73 |
|
63 | 74 |
|
64 | | -if __name__ == "__main__": |
| 75 | +def main(): |
65 | 76 | parser = argparse.ArgumentParser() |
66 | | - parser.add_argument("-p", "--path", type=str, required=True) |
67 | | - parser.add_argument("-c", "--cpu_count", type=int, required=True) |
68 | | - parser.add_argument("-t", "--category", type=str, required=False) |
| 77 | + parser.add_argument( |
| 78 | + "-p", "--path", type=str, required=True, help="path to raw data" |
| 79 | + ) |
| 80 | + parser.add_argument( |
| 81 | + "-c", |
| 82 | + "--cpu_count", |
| 83 | + type=int, |
| 84 | + required=True, |
| 85 | + help="cpu count for multiprocessing", |
| 86 | + ) |
| 87 | + parser.add_argument( |
| 88 | + "-t", "--discpline", type=str, required=True, help="discpline to process" |
| 89 | + ) |
69 | 90 | args = parser.parse_args() |
70 | | - path, cpu_count, category = args.path, args.cpu_count, args.category |
| 91 | + path, cpu_count, discpline = args.path, args.cpu_count, args.discpline |
71 | 92 |
|
72 | | - categories = [category] if category is not None else utils.get_all_categories() |
| 93 | + log.info(f"[VRDU] discpline: {discpline}, start to process.") |
| 94 | + process_one_discpline(path, cpu_count, discpline) |
73 | 95 |
|
74 | | - for category in categories: |
75 | | - log.info(f"Processing single category: {category}") |
76 | | - process_one_category(path, cpu_count, category) |
| 96 | + |
| 97 | +if __name__ == "__main__": |
| 98 | + main() |
0 commit comments