Skip to content

Commit 62f2a01

Browse files
committed
Big update
1 parent 9b1d49c commit 62f2a01

28 files changed

Lines changed: 3089 additions & 156 deletions

src/graphnet/data/dataloader.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import torch.utils.data
66
from torch_geometric.data import Batch, Data
77

8-
from graphnet.data.dataset import Dataset
8+
from graphnet.data.dataset import Dataset, EnsembleDataset
99
from graphnet.utilities.config import DatasetConfig
1010

1111

@@ -81,5 +81,5 @@ def from_dataset_config(
8181
"need to specify `shuffle` as an argument."
8282
)
8383
dataset = Dataset.from_config(config)
84-
assert isinstance(dataset, Dataset)
84+
assert isinstance(dataset, Union[Dataset, EnsembleDataset])
8585
return cls(dataset, **kwargs)

src/graphnet/data/dataset/dataset.py

Lines changed: 48 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -169,9 +169,15 @@ def from_config( # type: ignore[override]
169169

170170
if isinstance(cfg["path"], list):
171171
sources = []
172+
msg = f"Constructing {len(cfg['path'])} datasets, with selection: {source.selection}"
173+
msg_bool = True
172174
for path in cfg["path"]:
173175
cfg["path"] = path
174-
sources.append(source._dataset_class(**cfg))
176+
sources.append(source._dataset_class(**cfg, verbose=False))
177+
if msg_bool:
178+
sources[-1].info(msg)
179+
msg_bool = False
180+
175181
source = EnsembleDataset(sources)
176182
return source
177183
else:
@@ -259,6 +265,8 @@ def __init__(
259265
loss_weight_default_value: Optional[float] = None,
260266
seed: Optional[int] = None,
261267
labels: Optional[Dict[str, Any]] = None,
268+
use_super_selection: bool = False,
269+
verbose: bool = True,
262270
):
263271
"""Construct Dataset.
264272
@@ -311,6 +319,11 @@ def __init__(
311319
NOTE: DEPRECATED Use `data_representation` instead.
312320
# DEPRECATION: REMOVE AT 2.0 LAUNCH
313321
# See https://github.com/graphnet-team/graphnet/issues/647
322+
use_super_selection: If True, the string selection is handled by
323+
the query function of the dataset class, rather than
324+
pd.DataFrame.query. Defaults to False and should
325+
only be used with sqlite.
326+
verbose: Whether to print the selection info
314327
"""
315328
# Base class constructor
316329
super().__init__(name=__name__, class_name=self.__class__.__name__)
@@ -354,6 +367,7 @@ def __init__(
354367
self._data_representation = deepcopy(data_representation)
355368
self._labels = labels
356369
self._string_column = data_representation._detector.string_index_name
370+
self._use_super_selection = use_super_selection
357371

358372
if node_truth is not None:
359373
assert isinstance(node_truth_table, str)
@@ -404,6 +418,7 @@ def __init__(
404418
self,
405419
index_column=index_column,
406420
seed=seed,
421+
use_super_selection=self._use_super_selection,
407422
)
408423

409424
if self._labels is not None:
@@ -419,7 +434,8 @@ def __init__(
419434
self._indices = self._get_all_indices()
420435
elif isinstance(selection, str):
421436
self._indices = self._resolve_string_selection_to_indices(
422-
selection
437+
selection,
438+
verbose=verbose,
423439
)
424440
else:
425441
self._indices = selection
@@ -528,7 +544,7 @@ def __getitem__(self, sequential_index: int) -> Data:
528544

529545
# Internal method(s)
530546
def _resolve_string_selection_to_indices(
531-
self, selection: str
547+
self, selection: str, verbose: bool = True
532548
) -> List[int]:
533549
"""Resolve selection as string to list of indices.
534550
@@ -537,7 +553,9 @@ def _resolve_string_selection_to_indices(
537553
fixed number of events to randomly sample, e.g., ``` "10000 random
538554
events ~ event_no % 5 > 0" "20% random events ~ event_no % 5 > 0" ```
539555
"""
540-
return self._string_selection_resolver.resolve(selection)
556+
return self._string_selection_resolver.resolve(
557+
selection, verbose=verbose
558+
)
541559

542560
def _remove_missing_columns(self) -> None:
543561
"""Remove columns that are not present in the input file.
@@ -585,7 +603,7 @@ def _remove_missing_columns(self) -> None:
585603
def _check_missing_columns(
586604
self,
587605
columns: List[str],
588-
table: str,
606+
table: Union[str, List[str]],
589607
) -> List[str]:
590608
"""Return a list missing columns in `table`."""
591609
for column in columns:
@@ -594,13 +612,26 @@ def _check_missing_columns(
594612
table=table, columns=[column], sequential_index=0
595613
)
596614
except ColumnMissingException:
597-
if table not in self._missing_variables:
598-
self._missing_variables[table] = []
599-
self._missing_variables[table].append(column)
615+
if isinstance(table, str):
616+
if table not in self._missing_variables:
617+
self._missing_variables[table] = []
618+
self._missing_variables[table].append(column)
619+
elif isinstance(table, list):
620+
for t in table:
621+
if t not in self._missing_variables:
622+
self._missing_variables[t] = []
623+
self._missing_variables[t].append(column)
600624
except IndexError:
601625
self.warning(f"Dataset contains no entries for {column}")
602-
603-
return self._missing_variables.get(table, [])
626+
if isinstance(table, str):
627+
missing_variables = self._missing_variables.get(table, [])
628+
elif isinstance(table, list):
629+
missing_variables = [
630+
value
631+
for key, value in self._missing_variables.items()
632+
if key in table
633+
]
634+
return missing_variables
604635

605636
def _query(
606637
self, sequential_index: int
@@ -677,10 +708,13 @@ def _create_graph(
677708
"""
678709
# Convert truth to dict
679710
if len(truth.shape) == 1:
680-
truth = truth.reshape(1, -1)
681-
truth_dict = {
682-
key: truth[:, index] for index, key in enumerate(self._truth)
683-
}
711+
truth_dict = {
712+
key: truth[0][index] for index, key in enumerate(self._truth)
713+
}
714+
else:
715+
truth_dict = {
716+
key: truth[:, index] for index, key in enumerate(self._truth)
717+
}
684718

685719
# Define custom labels
686720
labels_dict = self._get_labels(truth_dict)

src/graphnet/data/dataset/sqlite/sqlite_dataset.py

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -72,10 +72,19 @@ def query_table(
7272
f"{self._index_column} = {index} and {selection}"
7373
)
7474

75-
result = self._conn.execute(
76-
f"SELECT {columns} FROM {table} WHERE "
77-
f"{combined_selections}"
78-
).fetchall()
75+
if isinstance(table, list):
76+
SELECT_QUERY = f"SELECT {columns} FROM"
77+
JOIN_TABLES = " JOIN ".join(table)
78+
USING_CLAUSE = f"USING({self._index_column})"
79+
WHERE_CLAUSE = f"WHERE {combined_selections}"
80+
FULL_QUERY = f"{SELECT_QUERY} {JOIN_TABLES} {USING_CLAUSE} {WHERE_CLAUSE}"
81+
result = self._conn.execute(FULL_QUERY).fetchall()
82+
else:
83+
result = self._conn.execute(
84+
f"SELECT {columns} FROM {table} WHERE "
85+
f"{combined_selections}"
86+
).fetchall()
87+
7988
except sqlite3.OperationalError as e:
8089
if "no such column" in str(e):
8190
raise ColumnMissingException(str(e))
@@ -151,3 +160,18 @@ def _close_connection(self) -> "SQLiteDataset":
151160
self._all_connections_established = False
152161
self._conn = None
153162
return self
163+
164+
def _join_tables(self, tables, columns):
165+
"""Join tables in the SQLite database."""
166+
# Check(s)
167+
if not isinstance(tables, list):
168+
raise TypeError("Input must be a list of table names.")
169+
if len(tables) == 0:
170+
raise ValueError(
171+
"Input list must contain at least one table name."
172+
)
173+
tables = ", ".join(tables)
174+
self._conn.execute("DROP VIEW IF EXISTS combined_table")
175+
self._conn.execute(
176+
f"CREATE VIEW combined_table AS SELECT {columns} FROM {tables} OUTER JOIN {tables} ON({self._index_column})"
177+
)

src/graphnet/data/utilities/sqlite_utilities.py

Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,3 +206,179 @@ def create_table_and_save_to_sql(
206206
integer_primary_key=integer_primary_key,
207207
)
208208
save_to_sql(df, table_name=table_name, database_path=database_path)
209+
210+
211+
def get_first_pulse_times(
    database_path: str,
    pulses_table_name: str = "SRTInIcePulses",
    time_column: str = "dom_time",
    index_column: str = "event_no",
) -> pd.DataFrame:
    """Return the earliest pulse time for every event.

    Args:
        database_path: Path to the database.
        pulses_table_name: Name of the pulses table.
        time_column: Name of the time column in the pulses table.
        index_column: Name of the index column in the pulses table.

    Returns:
        DataFrame with two columns: `event_no` and `first_pulse_time`.
    """
    # Aggregate with MIN() grouped on the index column: one row per event.
    sql = (
        f"SELECT {index_column}, MIN({time_column}) AS first_pulse_time"
        f" FROM {pulses_table_name}"
        f" GROUP BY {index_column};"
    )
    return query_database(database_path, sql)
234+
235+
236+
def add_first_pulse_time_to_truth(
    database_path: str,
    truth_table_name: str = "truth",
    pulses_table_name: str = "SRTInIcePulses",
    time_column: str = "dom_time",
    index_column: str = "event_no",
) -> None:
    """Add a `first_pulse_time` column to the truth table.

    The earliest pulse time per event is computed from the pulses table,
    staged in a temporary table, copied into the truth table, and the
    temporary table is dropped again. Safe to re-run: an existing
    `first_pulse_time` column is tolerated and overwritten.

    Args:
        database_path: Path to the database.
        truth_table_name: Name of the truth table.
        pulses_table_name: Name of the pulses table.
        time_column: Name of the time column in the pulses table.
        index_column: Name of the index column in both tables.
    """
    import sqlite3  # local import: only used to classify ALTER failures

    # Get first pulse times
    df = get_first_pulse_times(
        database_path=database_path,
        pulses_table_name=pulses_table_name,
        time_column=time_column,
        index_column=index_column,
    )
    print(f"Finished getting first pulse times for {len(df)} events.")
    # Create temporary table for first pulse times
    temp_table_name = "temp_first_pulse_times"

    query = f"DROP TABLE IF EXISTS {temp_table_name};"
    run_sql_code(database_path, query)

    create_table(
        columns=["event_no", "first_pulse_time"],
        table_name=temp_table_name,
        database_path=database_path,
        index_column=index_column,
        default_type="FLOAT",
        integer_primary_key=True,
    )
    print(f"Created temporary table {temp_table_name} for first pulse times.")
    # Save first pulse times to temporary table
    save_to_sql(
        df=df,
        table_name=temp_table_name,
        database_path=database_path,
    )

    # Create the column in the truth table; tolerate it already existing
    # from a previous run (the original unconditional ALTER crashed on
    # re-runs despite the comment claiming it handled that case).
    query = (
        f"ALTER TABLE {truth_table_name} "
        f"ADD COLUMN first_pulse_time FLOAT;"
    )
    print(f"Adding column 'first_pulse_time' to {truth_table_name}.")
    try:
        run_sql_code(database_path, query)
    except sqlite3.OperationalError as e:
        if "duplicate column" not in str(e).lower():
            raise

    query = (
        f"UPDATE {truth_table_name} "
        f"SET first_pulse_time = (SELECT first_pulse_time "
        f"FROM {temp_table_name} "
        f"WHERE {temp_table_name}.{index_column} = {truth_table_name}.{index_column});"
    )

    run_sql_code(database_path, query)
    print(
        f"Updated {truth_table_name} with first pulse times from {temp_table_name}."
    )
    # Drop the temporary table
    query = f"DROP TABLE IF EXISTS {temp_table_name};"
    print(f"Dropping temporary table {temp_table_name}.")
    run_sql_code(database_path, query)
306+
307+
308+
def add_starting(
    database_path: str,
    truth_table_name: str = "truth",
    containment_column: str = "containment_type",
    index_column: str = "event_no",
) -> None:
    """Add a binary `starting` column to the truth table.

    Maps the containment-type enum to 1 (starting event) or 0 (not
    starting), stages the result in a temporary table, copies it into the
    truth table, and drops the temporary table again. Safe to re-run: an
    existing `starting` column is tolerated and overwritten.

    Args:
        database_path: Path to the database.
        truth_table_name: Name of the truth table.
        containment_column: Name of the containment-type column in the
            truth table.
        index_column: Name of the index column in both tables.
    """
    import sqlite3  # local import: only used to classify ALTER failures

    # mapping from containment enum to starting
    map_dict = {
        1: 0,  # no intersect: not starting
        2: 0,  # through-going: not starting
        3: 1,  # contained: starting
        4: 1,  # tau-to-mu: starting
        5: 1,  # uncontained-starting: starting
        6: 0,  # stopping: not starting
        7: 0,  # decayed: not starting
        8: 0,  # through-going bundle: not starting
        9: 0,  # stopping bundle: not starting
        10: 1,  # partial-contained: starting
    }

    containment_type_query = (
        f"SELECT {index_column}, {containment_column} "
        f"FROM {truth_table_name};"
    )

    containment_df = query_database(database_path, containment_type_query)

    # convert containment type to starting using map_dict
    containment_df["starting"] = (
        containment_df[containment_column].astype(int).map(map_dict)
    )

    temp_table_name = "temp_starting"
    query = f"DROP TABLE IF EXISTS {temp_table_name};"
    run_sql_code(database_path, query)

    create_table(
        columns=[index_column, "starting"],
        table_name=temp_table_name,
        database_path=database_path,
        index_column=index_column,
        default_type="INTEGER",
        integer_primary_key=True,
    )

    print(f"Created temporary table {temp_table_name} for starting.")
    # Save starting to temporary table
    save_to_sql(
        df=containment_df[[index_column, "starting"]],
        table_name=temp_table_name,
        database_path=database_path,
    )
    # Create the column in the truth table; tolerate it already existing
    # from a previous run (the original unconditional ALTER crashed on
    # re-runs despite the comment claiming it handled that case).
    query = f"ALTER TABLE {truth_table_name} " f"ADD COLUMN starting INTEGER;"
    print(f"Adding column 'starting' to {truth_table_name}.")
    try:
        run_sql_code(database_path, query)
    except sqlite3.OperationalError as e:
        if "duplicate column" not in str(e).lower():
            raise
    query = (
        f"UPDATE {truth_table_name} "
        f"SET starting = (SELECT starting "
        f"FROM {temp_table_name} "
        f"WHERE {temp_table_name}.{index_column} = {truth_table_name}.{index_column});"
    )

    run_sql_code(database_path, query)
    print(f"Updated {truth_table_name} with starting from {temp_table_name}.")
    # Drop the temporary table
    query = f"DROP TABLE IF EXISTS {temp_table_name};"
    print(f"Dropping temporary table {temp_table_name}.")
    run_sql_code(database_path, query)

0 commit comments

Comments
 (0)