Skip to content

Commit 384644b

Browse files
authored
Merge pull request graphnet-team#794 from Aske-Rosted/writer_changes
Writer changes
2 parents 4bee06e + 50415bd commit 384644b

3 files changed

Lines changed: 135 additions & 46 deletions

File tree

src/graphnet/data/pre_configured/dataconverters.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""Pre-configured combinations of writers and readers."""
22

3-
from typing import List, Union
3+
from typing import List, Union, Optional
44

55
from graphnet.data import DataConverter
66
from graphnet.data.readers import I3Reader, ParquetReader
@@ -68,6 +68,7 @@ def __init__(
6868
index_column: str = "event_no",
6969
num_workers: int = 1,
7070
i3_filters: Union[I3Filter, List[I3Filter]] = None, # type: ignore
71+
max_table_size: Optional[int] = None,
7172
):
7273
"""Convert I3 files to SQLite.
7374
@@ -92,10 +93,11 @@ def __init__(
9293
Defaults to 1 (no multiprocessing).
9394
i3_filters: Instances of `I3Filter` to filter PFrames. Defaults to
9495
`NullSplitI3Filter`.
96+
max_table_size: Maximum size of the SQLite tables. Default None.
9597
"""
9698
super().__init__(
9799
file_reader=I3Reader(gcd_rescue=gcd_rescue, i3_filters=i3_filters),
98-
save_method=SQLiteWriter(),
100+
save_method=SQLiteWriter(max_table_size=max_table_size),
99101
extractors=extractors,
100102
num_workers=num_workers,
101103
index_column=index_column,

src/graphnet/data/writers/sqlite_writer.py

Lines changed: 73 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
import os
88
from tqdm import tqdm
9-
from typing import List, Dict, Optional
9+
from typing import List, Dict, Optional, Union, Tuple
1010

1111
from graphnet.data.utilities import (
1212
create_table_and_save_to_sql,
@@ -100,6 +100,8 @@ def merge_files(
100100
files: List[str],
101101
output_dir: str,
102102
primary_key_rescue: str = "event_no",
103+
remove_originals: bool = False,
104+
reset_integer_primary_key: bool = False,
103105
) -> None:
104106
"""SQLite-specific method for merging output files/databases.
105107
@@ -117,6 +119,12 @@ def merge_files(
117119
primary_key_rescue: The name of the columns on which the primary
118120
key is constructed. This will only be used if it is not
119121
possible to infer the primary key name.
122+
remove_originals: If True, the original files will be removed
123+
after merging.
124+
reset_integer_primary_key: If True, the primary key will be reset
125+
when merging the databases. This is useful if the files
126+
were not generated by the same process and therefore
127+
have overlapping index values.
120128
"""
121129
# Warnings
122130
if self._max_table_size:
@@ -136,14 +144,21 @@ def merge_files(
136144
if len(files) > 0:
137145
os.makedirs(output_dir, exist_ok=True)
138146
self.info(f"Merging {len(files)} database files")
139-
self._merge_databases(files=files, database_path=database_path)
147+
self._merge_databases(
148+
files=files,
149+
database_path=database_path,
150+
remove_originals=remove_originals,
151+
reset_integer_primary_key=reset_integer_primary_key,
152+
)
140153
else:
141154
self.warning("No database files given! Exiting.")
142155

143156
def _merge_databases(
144157
self,
145158
files: List[str],
146159
database_path: str,
160+
remove_originals: bool = False,
161+
reset_integer_primary_key: bool = False,
147162
) -> None:
148163
"""Merge the temporary databases.
149164
@@ -152,6 +167,10 @@ def _merge_databases(
152167
database_path: Path to a database, can be an empty path, where the
153168
databases listed in `files` will be merged into. If no database
154169
exists at the given path, one will be created.
170+
remove_originals: If True, the original files will be removed
171+
after merging.
172+
reset_integer_primary_key: If True, the primary key will be reset
173+
when merging the databases.
155174
"""
156175
if os.path.exists(database_path):
157176
self.warning(
@@ -165,6 +184,7 @@ def _merge_databases(
165184
self._largest_table = 0
166185

167186
# Merge temporary databases into newly created one
187+
start_count = 0
168188
for file_count, input_file in tqdm(enumerate(files), colour="green"):
169189
# Extract table names and index column name in database
170190
try:
@@ -177,7 +197,18 @@ def _merge_databases(
177197
continue
178198
else:
179199
raise e
180-
200+
integer_primary_dict = None
201+
if reset_integer_primary_key:
202+
if list(tables.values())[0] is None:
203+
# Ensure that the first table is indexed
204+
temp_tables: Dict[str, Union[str, None]] = {
205+
k: v for k, v in tables.items() if v is not None
206+
}
207+
# re-add the non-indexed tables to the dictionary
208+
temp_tables.update(
209+
{k: v for k, v in tables.items() if v is None}
210+
)
211+
tables = temp_tables
181212
for table_name in tables.keys():
182213
# Extract all data in the table from the given database
183214
df = query_database(
@@ -190,6 +221,22 @@ def _merge_databases(
190221
True if tables[table_name] is not None else False
191222
)
192223

224+
if reset_integer_primary_key & (integer_primary_dict is None):
225+
integer_primary_dict, start_count = (
226+
self._generate_integer_primary_dict(
227+
start_count, df[primary_key].values
228+
)
229+
)
230+
df = self._map_integer_primary_keys(
231+
df, primary_key, integer_primary_dict
232+
)
233+
elif reset_integer_primary_key & (
234+
integer_primary_dict is not None
235+
):
236+
df = self._map_integer_primary_keys(
237+
df, primary_key, integer_primary_dict
238+
)
239+
193240
# Submit to new database
194241
create_table_and_save_to_sql(
195242
df=df,
@@ -216,8 +263,31 @@ def _merge_databases(
216263
"Maximum row count reached."
217264
f" Creating new partition at {database_path}"
218265
)
266+
if remove_originals:
267+
# Remove original files after all files have been merged
268+
for input_file in files:
269+
os.remove(input_file)
219270

220271
# Internal methods
272+
def _generate_integer_primary_dict(
273+
self, start_count: int, integer_primary_keys_array: List[int]
274+
) -> Tuple[Dict[int, int], int]:
275+
integer_primary_dict = {}
276+
for i, key in enumerate(integer_primary_keys_array):
277+
integer_primary_dict[key] = start_count + i
278+
return integer_primary_dict, start_count + len(
279+
integer_primary_keys_array
280+
)
281+
282+
def _map_integer_primary_keys(
283+
self,
284+
df: pd.DataFrame,
285+
key: str,
286+
integer_primary_dict: Union[Dict[int, int], None],
287+
) -> pd.DataFrame:
288+
assert integer_primary_dict is not None # mypy...
289+
df[key] = df[key].map(integer_primary_dict)
290+
return df
221291

222292
def _adjust_output_path(self, output_file: str) -> str:
223293
"""Adjust the file path to reflect that it is a partition."""

src/graphnet/utilities/filesys.py

Lines changed: 58 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from pathlib import Path
44
import re
5+
import os
56
from typing import List, Optional, Tuple, Union
67

78

@@ -31,7 +32,7 @@ def has_extension(filename: str, extensions: List[str]) -> bool:
3132

3233

3334
def find_i3_files(
34-
directories: Union[str, List[str]],
35+
inputs: Union[str, List[str]],
3536
gcd_rescue: Optional[str] = None,
3637
recursive: Optional[bool] = True,
3738
) -> Tuple[List[str], List[str]]:
@@ -42,7 +43,8 @@ def find_i3_files(
4243
in the directory.
4344
4445
Args:
45-
directories: Directories to search recursively for I3 files.
46+
inputs: Directories to search recursively for I3 files.
47+
Or list of I3 files.
4648
gcd_rescue: Path to the GCD that will be default if no GCD is present
4749
in the directory.
4850
recursive: Whether or not to search the directories recursively.
@@ -51,46 +53,61 @@ def find_i3_files(
5153
i3_list: Paths to I3 files in `directories`
5254
gcd_list: Paths to GCD files for each I3 file.
5355
"""
54-
if isinstance(directories, str):
55-
directories = [directories]
56+
if isinstance(inputs, str):
57+
inputs = [inputs]
5658

5759
# Output containers
5860
i3_files = []
5961
gcd_files = []
60-
61-
for directory in directories:
62-
63-
# Find all I3-like files in `directory`, may or may not be recursively.
64-
paths = []
65-
i3_patterns = ["*.bz2", "*.zst", "*.gz"]
66-
for i3_pattern in i3_patterns:
67-
if recursive:
68-
paths.extend(list(Path(directory).rglob(i3_pattern)))
69-
else:
70-
paths.extend(list(Path(directory).glob(i3_pattern)))
71-
72-
# Loop over all folders containing such I3-like files.
73-
folders = sorted(set([path.parent for path in paths]))
74-
for folder in folders:
75-
76-
# List all I3 and GCD files, respectively, in the current folder.
77-
folder_files = [
78-
str(path) for path in paths if path.parent == folder
79-
]
80-
folder_i3_files = list(filter(is_i3_file, folder_files))
81-
folder_gcd_files = list(filter(is_gcd_file, folder_files))
82-
83-
# Make sure that no more than one GCD file is found;
84-
# and use rescue file if none is found.
85-
assert len(folder_gcd_files) <= 1
86-
if len(folder_gcd_files) == 0:
87-
assert gcd_rescue is not None
88-
folder_gcd_files = [gcd_rescue]
89-
90-
# Store list of I3 files and corresponding GCD files.
91-
folder_gcd_files = folder_gcd_files * len(folder_i3_files)
92-
93-
gcd_files.extend(folder_gcd_files)
94-
i3_files.extend(folder_i3_files)
95-
96-
return i3_files, gcd_files
62+
if all([is_i3_file(input) for input in inputs]):
63+
print("Assuming list of files.")
64+
assert gcd_rescue is not None
65+
gcd_files = [gcd_rescue] * len(inputs)
66+
return inputs, gcd_files
67+
68+
elif all(os.path.isdir(input) for input in inputs):
69+
print("Assuming list of directories.")
70+
71+
for directory in inputs:
72+
73+
# Find all I3-like files in `directory`.
74+
paths = []
75+
i3_patterns = ["*.bz2", "*.zst", "*.gz"]
76+
for i3_pattern in i3_patterns:
77+
if recursive:
78+
paths.extend(list(Path(directory).rglob(i3_pattern)))
79+
else:
80+
paths.extend(list(Path(directory).glob(i3_pattern)))
81+
82+
# Loop over all folders containing such I3-like files.
83+
folders = sorted(set([path.parent for path in paths]))
84+
for folder in folders:
85+
86+
# List all I3 and GCD files, in the current folder.
87+
folder_files = [
88+
str(path) for path in paths if path.parent == folder
89+
]
90+
folder_i3_files = list(filter(is_i3_file, folder_files))
91+
folder_gcd_files = list(filter(is_gcd_file, folder_files))
92+
93+
# Make sure that no more than one GCD file is found;
94+
# and use rescue file if none is found.
95+
assert len(folder_gcd_files) <= 1
96+
if len(folder_gcd_files) == 0:
97+
assert gcd_rescue is not None
98+
folder_gcd_files = [gcd_rescue]
99+
100+
# Store list of I3 files and corresponding GCD files.
101+
folder_gcd_files = folder_gcd_files * len(folder_i3_files)
102+
103+
gcd_files.extend(folder_gcd_files)
104+
i3_files.extend(folder_i3_files)
105+
return i3_files, gcd_files
106+
else:
107+
if any([os.path.isdir(input) for input in inputs]):
108+
raise ValueError(
109+
"Inputs contains a mix of files and directories \
110+
which is not supported."
111+
)
112+
else:
113+
raise ValueError("Some inputs are not valid directories or files.")

0 commit comments

Comments
 (0)