
Commit 14e1696

Merge pull request #2 from UniModal4Reasoning/dev
Test dataset version
2 parents: d69795a + 5790c5c · commit: 14e1696

15 files changed: 768 additions & 234 deletions

batch_process.py

Lines changed: 24 additions & 22 deletions
@@ -1,8 +1,8 @@
 import os
-import shutil
 import argparse
 import multiprocessing
-from typing import List
+import shutil
+from typing import List, Optional
 from uuid import uuid4
 import pandas as pd

@@ -16,16 +16,18 @@
 database = "data/processed_paper_database.csv"


-def filter_tex_files(tex_files: List[str], main_path: str) -> List[str]:
-    """extract all MAIN.tex files for processing,
-    only MAIN.tex files in the main_path (not recursive) are extracted
+def filter_tex_files(
+    tex_files: List[str], main_path: Optional[str] = None
+) -> List[str]:
+    """extract all MAIN.tex files for processing, if main_path is not None, then
+    only extract MAIN.tex files in the main_path (not recursive)

     Args:
         tex_files (List[str]): list of tex files
         main_path (str): path to main directory.

     Returns:
-        List[str]: list of tex files that are compiable.
+        List[str]: list of tex files that are compilable.
     """

     # TODO: move this to config
@@ -48,7 +50,7 @@ def filter_tex_files(tex_files: List[str], main_path: str) -> List[str]:
         if main_path and os.path.dirname(os.path.dirname(tex_file)) != main_path:
             continue

-        # make sure the tex file is compiable (main document)
+        # make sure the tex file is compilable (main document)
         try:
             with open(tex_file) as f:
                 content = f.read()
@@ -70,35 +72,35 @@ def filter_tex_files(tex_files: List[str], main_path: str) -> List[str]:
     return result


-def process_one_discpline(path: str, cpu_count: int, discpline: str) -> None:
-    """Process the data in a specific discpline.
+def process_one_discipline(path: str, cpu_count: int, discipline: str) -> None:
+    """Process the data in a specific discipline.

     Args:
         path (str): The path to the raw data.
         cpu_count (int): The number of CPUs to use for multiprocessing.
-        discpline (str): The discpline to process.
+        discipline (str): The discipline to process.

     Raises:
         Exception: If the processing fails.

     Returns:
         None
     """
-    discpline_path = os.path.join(path, discpline)
-    log.info(f"[VRDU] Path to raw data: {discpline_path}")
+    discipline_path = os.path.join(path, discipline)
+    log.info(f"[VRDU] Path to raw data: {discipline_path}")
     log.info(f"[VRDU] Using cpu counts: {cpu_count}")
-    tex_files = utils.extract_all_tex_files(discpline_path)
-    tex_files = filter_tex_files(tex_files, discpline_path)
+    tex_files = utils.extract_all_tex_files(discipline_path)
+    tex_files = filter_tex_files(tex_files, discipline_path)

     try:
         with multiprocessing.Pool(cpu_count) as pool:
             pool.map(process_one_file, tex_files)
     except Exception:
-        log.exception(f"[VRDU] discpline: {discpline}, failed to process.")
+        log.exception(f"[VRDU] discipline: {discipline}, failed to process.")
     finally:
         # save the process log
-        log.info(f"[VRDU] discpline: {discpline}, finished processing.")
-        shutil.move(log_file, f"data/batch_process_{discpline}.log")
+        log.info(f"[VRDU] discipline: {discipline}, finished processing.")
+        shutil.move(log_file, f"data/batch_process_{discipline}.log")


 def main():
@@ -107,7 +109,7 @@ def main():
     Args:
         path (str): The path to the raw data.
         cpu_count (int): The number of CPUs to use for multiprocessing.
-        discpline (str): The discpline to process.
+        discipline (str): The discipline to process.

     Raises:
         Exception: If the processing fails.
@@ -131,13 +133,13 @@ def main():
         help="cpu count for multiprocessing",
     )
     parser.add_argument(
-        "-t", "--discpline", type=str, required=True, help="discpline to process"
+        "-t", "--discipline", type=str, required=True, help="discipline to process"
     )
     args = parser.parse_args()
-    path, cpu_count, discpline = args.path, args.cpu_count, args.discpline
+    path, cpu_count, discipline = args.path, args.cpu_count, args.discipline

-    log.info(f"[VRDU] discpline: {discpline}, start to process.")
-    process_one_discpline(path, cpu_count, discpline)
+    log.info(f"[VRDU] discipline: {discipline}, start to process.")
+    process_one_discipline(path, cpu_count, discipline)


 if __name__ == "__main__":
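
Note on the `filter_tex_files` change above: `main_path` is now optional, so the same helper can filter globally or be scoped to one discipline directory, as `process_one_discipline` does. A minimal usage sketch (the paths are hypothetical, and the compilability check inside the function is not exercised here):

    from batch_process import filter_tex_files

    # Assumed layout: data/{discipline}/{paper_id}/*.tex
    tex_files = [
        "data/cs.CV/2301.00001/main.tex",
        "data/math.AG/2301.00002/main.tex",
    ]

    all_candidates = filter_tex_files(tex_files)  # main_path defaults to None
    # main_path is compared against the grandparent directory of each tex file
    cv_only = filter_tex_files(tex_files, main_path="data/cs.CV")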

main.py

Lines changed: 9 additions & 5 deletions
@@ -94,14 +94,18 @@ def process_one_file(file_name: str) -> None:
         log.info(f"[VRDU] file: {file_name}, paper has been processed")
         return

-    # remove redundant files
+    # make a copy of the original tex file
+    original_tex = os.path.join(main_directory, "paper_original.tex")
+    shutil.copyfile(file_name, original_tex)
+
+    # remove the output folder if it exists
     output_directory = os.path.join(main_directory, "output")
     if os.path.exists(output_directory):
         shutil.rmtree(output_directory)

-    # make a copy of the original tex file to avoid polluting the original tex file
-    original_tex = os.path.join(main_directory, "paper_original.tex")
-    shutil.copyfile(file_name, original_tex)
+    # output_directory stores the intermediate results
+    # result_directory stores the final results
+    os.makedirs(os.path.join(main_directory, "output/result"))

     cwd = os.getcwd()

@@ -128,7 +132,7 @@ def process_one_file(file_name: str) -> None:
     log.info(
         f"[VRDU] file: {original_tex}, start generating annotations, this may take a while..."
     )
-    vrdu_layout_annotation = layout.LayoutAnnotation(main_directory)
+    vrdu_layout_annotation = layout.LayoutAnnotation(original_tex)
     vrdu_layout_annotation.annotate()

     vrdu_order_annotation = order.OrderAnnotation(original_tex)
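
Two behavioral notes on these hunks: `paper_original.tex` is now snapshotted before the stale `output/` tree is deleted, and `LayoutAnnotation` now receives the tex file path rather than the directory. The `os.makedirs` call relies on makedirs creating missing parents, so the intermediate and result directories appear together; a small illustration with a hypothetical path:

    import os

    main_directory = "data/cs.CV/2301.00001"  # hypothetical paper directory
    # Creates main_directory/output and main_directory/output/result in one
    # call; output/ was removed just above, so neither level exists yet.
    os.makedirs(os.path.join(main_directory, "output/result"))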

scripts/retrive_metadata.py

Lines changed: 70 additions & 125 deletions
@@ -1,142 +1,87 @@
+import glob
 import os
-import time
+from typing import Any, Dict, List
 import arxiv
-import shutil
+import argparse

+import pandas as pd
+
+
+from vrdu import utils
 from vrdu import logger

-log = logger.setup_app_level_logger(file_name="retrive_metadata.log")
+log = logger.setup_app_level_logger(file_name="retrieve_metadata.log")


-def retrive_metadata_for_files(path: str) -> None:
-    """Retrieves metadata for files in a specified path.
-    It first query the primary category by the file's name, then move the
-    file to the path/{category}
+def retrieve_metadata(data: Dict) -> List[Dict[str, Any]]:
+    paper_ids = list(data.keys())

-    Args:
-        path (str): The path to the directory containing the files.
+    client = arxiv.Client()

-    Returns:
-        None
+    slice_length = 100
+    paper_metadata = []
+
+    for i in range(0, len(paper_ids), slice_length):
+        slices = paper_ids[i : i + slice_length]
+        search_results = client.results(arxiv.Search(id_list=slices))
+
+        for index, result in enumerate(search_results):
+            paper_metadata.append(
+                {
+                    "entry_id": result.entry_id,
+                    "updated": str(result.updated),
+                    "published": str(result.published),
+                    "title": result.title,
+                    "doi": result.doi,
+                    "authors": [str(author) for author in result.authors],
+                    "summary": result.summary,
+                    "journal_ref": result.journal_ref,
+                    "primary_category": result.primary_category,
+                    "categories": result.categories,
+                    "links": [str(link) for link in result.links],
+                    "pdf_url": result.pdf_url,
+                    "paper_id": slices[index],
+                    "paper_path": data[slices[index]],
+                    "quality": "low",
+                }
+            )
+
+    return paper_metadata
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-i", "--input_path", type=str, default="data/discipline_info.csv"
+    )
+    args = parser.parse_args()
+    path = args.input_path

-    Raises:
-        None
+    discipline_info = pd.read_csv("data/discipline_info.csv")
+    disciplines = set(discipline_info["discipline"])

-    This function retrieves metadata for the files in the specified path.
-    It filters the files based on a specific format, moves them to categorized directories, and logs the actions performed.
+    for discipline in disciplines:
+        target_discipline_path = os.path.join(path, discipline)
+        paper_paths = glob.glob(os.path.join(target_discipline_path, "*/"))

-    Note:
-        The function assumes that the files in the specified path are in the format 'xxxx.yyyy.ext',
-        where 'xxxx' and 'yyyy' are digits, and `ext` is the extension format.
+        data = {
+            os.path.basename(paper_path[:-1]): paper_path for paper_path in paper_paths
+        }

-    Example:
-        retrive_metadata_for_files('/path/to/directory')
-    """
-    files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
-    # filter the files have format xxxx.yyyy.ext
-    filtered_files = [
-        f for f in files if f[:4].isdigit() and f[5:9].isdigit() and f[4] == "."
-    ]
-    num_papers = len(filtered_files)
-    log.info("There are {} files".format(num_papers))
-    client = arxiv.Client()
-    slice_length = 100
+        paper_metadata = retrieve_metadata(data)

-    for i in range(0, num_papers, slice_length):
-        filename_without_extensions = [
-            os.path.splitext(f)[0] for f in filtered_files[i : i + slice_length]
-        ]
-
-        for pdf_file, result in zip(
-            filtered_files[i : i + slice_length],
-            client.results(arxiv.Search(id_list=filename_without_extensions)),
-        ):
-            old_path = os.path.join(path, pdf_file)
-            category = result.primary_category
-            new_path = os.path.join(path, category)
-            if not os.path.exists(new_path):
-                os.makedirs(new_path)
-                log.info("Created directory: {}".format(new_path))
-            try:
-                shutil.move(old_path, new_path)
-                log.info(
-                    "Move file: {} to {}".format(
-                        old_path,
-                        new_path,
-                    )
-                )
-            except Exception:
-                log.exception(f"Error moving {old_path}")
-
-
-def retrive_metadata_for_folders(path: str) -> None:
-    """Retrieves metadata for subfolders in a specified path.
-    It first query the primary category by the file's name, then move the
-    file to the path/{category}
-
-    Args:
-        path (str): The path to the directory containing the subfolders.
-
-    Returns:
-        None
-
-    Raises:
-        None
-
-    This function retrieves metadata for the subfolders in the specified path.
-    It filters the subfolders based on a specific format, moves them to categorized directories, and logs the actions performed.
-
-    Note:
-        The function assumes that the subfolders in the specified path are in the format 'xxxx.yyyy',
-        where 'xxxx' and 'yyyy' are numeric values and the length of the subfolder name is 9 characters.
-
-    Example:
-        retrive_metadata_for_folders('/path/to/directory')
-    """
-    subfolders = [f for f in os.listdir(path) if os.path.isdir(os.path.join(path, f))]
-    filtered_subfolders = [
-        f for f in subfolders if f[:4].isdigit() and f[5:].isdigit() and f[4] == "."
-    ]
-    num_papers = len(filtered_subfolders)
-    log.info("There are {} subfolders".format(num_papers))
-    client = arxiv.Client()
-    slice_length = 100
-    for i in range(0, num_papers, slice_length):
-        slice_list = filtered_subfolders[i : i + slice_length]
-        for dir_name, result in zip(
-            slice_list,
-            client.results(arxiv.Search(id_list=slice_list)),
-        ):
-            old_path = os.path.join(path, dir_name)
-            category = result.primary_category
-            log.info(f"dir_name: {dir_name}, category: {category}")
-            new_path = os.path.join(path, category)
-            if not os.path.exists(new_path):
-                os.makedirs(new_path)
-                log.info("Created directory: {}".format(new_path))
-            try:
-                shutil.move(old_path, new_path)
-                log.info(
-                    "Move directory: {} to {}".format(
-                        old_path,
-                        new_path,
-                    )
-                )
-            except Exception:
-                log.exception(f"Error moving {old_path}")
-
-
-def retrive_metadata(path: str) -> None:
-    retrive_metadata_for_folders(path)
-    retrive_metadata_for_files(path)
+        existed_json_file = os.path.join(target_discipline_path, "paper_metadata.json")
+        existed_json_data = []
+        if os.path.exists(existed_json_file):
+            existed_json_data = utils.load_json(existed_json_file)

+        existed_paper_ids = [x["paper_id"] for x in existed_json_data]
+        existed_json_data.extend(
+            [x for x in paper_metadata if x["paper_id"] not in existed_paper_ids]
+        )

-if __name__ == "__main__":
-    import argparse
+        utils.export_to_json(existed_json_data, existed_json_file)

-    parser = argparse.ArgumentParser()
-    parser.add_argument("-p", "--path", help="path to directory containing subfolders")
-    args = parser.parse_args()
-    # run(args.path, args.cpu_count)
-    # run_v2(args.path)
-    retrive_metadata(args.path)
+
+if __name__ == "__main__":
+    main()
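
For orientation, a sketch of the new `retrieve_metadata` contract (the IDs and paths below are placeholders; in the script itself, `main()` builds `data` from the discipline folders on disk):

    # Maps arXiv paper IDs to their local directories.
    data = {
        "2301.00001": "data/cs.CV/2301.00001/",
        "2301.00002": "data/cs.CV/2301.00002/",
    }

    # Queries the arXiv API in slices of 100 IDs to keep each request small,
    # and returns one record per paper (title, authors, categories, pdf_url,
    # ...), tagged with its paper_id, paper_path, and a default quality "low".
    records = retrieve_metadata(data)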

scripts/visualize_dataset_distribution.py

Lines changed: 19 additions & 2 deletions
@@ -4,7 +4,24 @@
 import numpy as np
 import csv

-from vrdu import utils
+
+def get_all_categories():
+    """
+    Retrieves all categories from the "category_count.csv" file.
+
+    Returns:
+        categories (list): A list of all categories.
+
+    Reference:
+        https://arxiv.org/category_taxonomy
+    """
+    categories = []
+    with open("scripts/category_count.csv", "r") as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            categories.append(row["categories"])
+
+    return categories


 def visualize_distribution(dict1, dict2):
@@ -50,7 +67,7 @@ def visualize_distribution(dict1, dict2):


 def analyze_raw_data(path):
-    all_categories = utils.get_all_categories()
+    all_categories = get_all_categories()

     data = defaultdict(int)
     for category in all_categories:
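
Since `get_all_categories` was inlined here from `vrdu.utils`, the script now reads `scripts/category_count.csv` directly. Assuming that file has a `categories` column, usage looks like:

    # scripts/category_count.csv (assumed shape):
    #   categories,count
    #   cs.CV,1234
    #   math.AG,567
    all_categories = get_all_categories()  # -> ["cs.CV", "math.AG", ...]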
