"""Module setup: typing aliases, progress bar, archive handling, and app logger."""
from typing import Dict, List

import tarfile

from tqdm import tqdm

from vrdu import logger, utils

# Module-level application logger; writes to arxiv_download.log.
log = logger.setup_app_level_logger(file_name="arxiv_download.log")
def arxiv_download(data: List[Dict], path: str) -> None:
    """Download and extract arXiv source tarballs for each paper in *data*.

    For each row, the paper source is downloaded into
    ``<path>/<discipline>/`` and extracted into
    ``<path>/<discipline>/<paper_id>/``. Rows already annotated, or whose
    extracted directory or downloaded tarball already exists, are skipped.

    Args:
        data: Paper records; each row is assumed to provide the keys
            ``paper_id``, ``discipline`` and ``auto_annotated_paper_path``
            (assumption from visible usage — TODO confirm full schema).
        path: Root output directory.

    Returns:
        None
    """
    client = arxiv.Client()
    for row in tqdm(data):
        # Already processed downstream — nothing to download.
        if row["auto_annotated_paper_path"]:
            continue

        discipline_path = os.path.join(path, row["discipline"])
        os.makedirs(discipline_path, exist_ok=True)

        paper_path = os.path.join(discipline_path, row["paper_id"])
        if os.path.exists(paper_path):
            log.debug(f"{paper_path} exists")
            continue

        # BUG FIX: the original used os.path.join(discipline_path,
        # paper_id, ".tar.gz"), which builds "<paper_id>/.tar.gz" (a file
        # named ".tar.gz" inside a directory), so this skip check never
        # matched and tarballs were re-downloaded. Concatenate instead.
        tar_path = paper_path + ".tar.gz"
        if os.path.exists(tar_path):
            log.debug(f"{tar_path} exists")
            continue

        for result in client.results(arxiv.Search(id_list=[row["paper_id"]])):
            tar_file_path = result.download_source(dirpath=discipline_path)
            log.debug(f"Downloading tar file {tar_file_path}")
            try:
                with tarfile.open(tar_file_path, "r:gz") as tar:
                    # NOTE(review): extractall on a downloaded archive is
                    # vulnerable to path traversal; on Python >= 3.12 pass
                    # filter="data" — confirm target interpreter version.
                    tar.extractall(paper_path)
            except tarfile.ReadError:
                # Some arXiv sources are bare PDFs, not gzipped tars;
                # log and move on rather than aborting the whole run.
                log.error(f"{tar_file_path} is not a tar.gz file")
                continue
4563
46- result .download_source (dirpath = sub_directory )
4764
def main():
    """CLI entry point: parse arguments and download the listed papers."""
    import argparse

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "-p", "--path", type=str, required=True, help="Path to save result"
    )
    arg_parser.add_argument(
        "-f", "--file", type=str, required=True, help="json file for saving result"
    )
    cli_args = arg_parser.parse_args()

    # Load the paper manifest and download everything it lists.
    records = utils.load_json(cli_args.file)
    arxiv_download(records, cli_args.path)
6582
6683
6784if __name__ == "__main__" :
68- path = os .path .expanduser ("/cpfs01/shared/ADLab/datasets/vrdu_arxiv" )
69- data = []
70- with open ("scripts/category_count.csv" , "r" ) as f :
71- reader = csv .DictReader (f )
72- for row in reader :
73- data .append (row )
74-
75- random .shuffle (data )
76- arxiv_download (data = data , path = path )
77- for root , dirs , files in os .walk (path ):
78- for dir_ in dirs :
79- extract_all_tar_gz (os .path .join (root , dir_ ))
85+ main ()
0 commit comments