|
1 | 1 | import logging |
2 | | -import sys |
3 | | -from typing import Any, Dict, List |
| 2 | +from typing import Dict, List, Set |
4 | 3 |
|
5 | 4 | import numpy as np |
6 | 5 | import pandas as pd |
7 | 6 | from tqdm import tqdm |
8 | 7 |
|
| 8 | +logger = logging.getLogger("ml_grid") |
9 | 9 |
|
def handle_correlation_matrix(
    local_param_dict: Dict,
    drop_list: List[str],
    df: pd.DataFrame,
    chunk_size: int = 1000,
) -> List[str]:
    """Identify highly correlated numeric columns to drop.

    Hybrid Correlation Optimizer:
      1. Respects the existing ``drop_list`` (adds to it, doesn't replace it).
      2. Optimizes by skipping columns already in ``drop_list``.
      3. Hybrid GPU/CPU execution with robust error handling.

    Args:
        local_param_dict: Parameter dictionary; the ``'corr'`` key holds the
            absolute-correlation threshold. Defaults to 0.25 when the key is
            absent *or* explicitly set to ``None``.
        drop_list: Column names already marked for dropping; preserved in the
            returned list.
        df: Input DataFrame. Only numeric columns are examined.
        chunk_size: Column-chunk size for the CPU path.

    Returns:
        Sorted list of unique column names to drop (existing + newly found),
        or the untouched ``drop_list`` when ``df`` has no numeric columns.
    """
    # Treat an explicit None the same as a missing key, so callers passing
    # {'corr': None} don't crash on the threshold comparison downstream.
    threshold = local_param_dict.get("corr")
    if threshold is None:
        threshold = 0.25

    # Filter to numeric columns only
    numeric_columns = df.select_dtypes(include=["number"]).columns

    if len(numeric_columns) == 0:
        return drop_list  # Return existing list if no new work to do

    logger.info("Preparing data (converting to float32)...")
    df_numeric = df[numeric_columns]
    col_names = df_numeric.columns.tolist()

    # Create a mapping for fast index lookups
    col_to_idx = {name: i for i, name in enumerate(col_names)}

    # Convert data to float32
    data = df_numeric.values.astype(np.float32)

    # --- GPU DETECTION & SAFETY ---
    use_gpu = False
    try:
        import cupy as cp

        if cp.cuda.is_available():
            free_mem = cp.cuda.Device().mem_info[0]
            # Rough VRAM estimate: the input data itself plus the n x n
            # correlation matrix and one intermediate copy (all float32).
            req_mem = data.nbytes + 2 * (data.shape[1] ** 2) * 4

            if free_mem > req_mem * 1.2:
                use_gpu = True
                # NOTE: cp.cuda.Device has no `.name` attribute; the original
                # accessed it and always raised, silently disabling the GPU
                # path. Report the device id instead.
                logger.info(
                    f"GPU detected (device {cp.cuda.Device().id}). "
                    f"Free VRAM: {free_mem/1e9:.2f} GB."
                )
            else:
                logger.warning(
                    "GPU detected but insufficient VRAM. Falling back to CPU."
                )
    except Exception as e:
        logger.warning(f"GPU acceleration unavailable (falling back to CPU): {e}")
        use_gpu = False
    # -----------------------------

    # Convert input drop_list to a Set for O(1) lookups
    existing_drops = set(drop_list)

    if use_gpu:
        try:
            return _process_on_gpu(data, col_names, threshold, existing_drops)
        except Exception as e:
            logger.error(f"GPU processing failed: {e}. Retrying on CPU.")
            # Fallthrough to CPU

    # CPU Fallback
    return _process_on_cpu(
        data, col_names, col_to_idx, threshold, chunk_size, existing_drops
    )
def _process_on_gpu(
    data: np.ndarray, col_names: List[str], threshold: float, existing_drops: Set[str]
) -> List[str]:
    """GPU (CuPy) worker: mark one column of every pair whose |corr| > threshold.

    Args:
        data: 2-D float32 array, samples x columns.
        col_names: Column names aligned with ``data``'s columns.
        threshold: Absolute-correlation cut-off.
        existing_drops: Columns already marked; kept and extended.

    Returns:
        Sorted list of all column names to drop.
    """
    import cupy as cp

    n_samples = data.shape[0]

    # Initialize the final set with what we already had
    to_drop = existing_drops.copy()

    # Move data to GPU
    gpu_data = cp.asarray(data)

    # Standardize to z-scores (population std, ddof=0)
    means = gpu_data.mean(axis=0, keepdims=True)
    stds = gpu_data.std(axis=0, keepdims=True)
    stds[stds == 0] = 1.0  # constant columns: avoid division by zero
    gpu_data = (gpu_data - means) / stds

    # With population-std z-scores, corr = Z.T @ Z / n.  The original divided
    # by (n - 1), overstating every coefficient by n / (n - 1).
    scale_factor = 1.0 / n_samples

    # Matrix Multiplication
    corr_matrix = cp.matmul(gpu_data.T, gpu_data)
    corr_matrix *= scale_factor
    corr_matrix = cp.abs(corr_matrix)

    # Upper Triangle only (k=1)
    upper_tri = cp.triu(corr_matrix, k=1)

    # Get indices of high correlations
    rows, cols = cp.where(upper_tri > threshold)

    cpu_rows = cp.asnumpy(rows)
    cpu_cols = cp.asnumpy(cols)

    # Process pairs
    for i, j in zip(cpu_rows, cpu_cols):
        col_i = col_names[i]
        col_j = col_names[j]

        # KEY LOGIC: If col_i is already marked for drop (either from input
        # list or from this loop), we skip. Otherwise, we drop col_j.
        if col_i not in to_drop:
            to_drop.add(col_j)

    logger.info(f"GPU complete. Total columns to drop: {len(to_drop)}")
    return sorted(list(to_drop))
def _process_on_cpu(
    data: np.ndarray,
    col_names: List[str],
    col_to_idx: Dict[str, int],
    threshold: float,
    chunk_size: int,
    existing_drops: Set[str],
) -> List[str]:
    """CPU worker: chunked correlation scan marking one column of each hot pair.

    Args:
        data: 2-D float32 array, samples x columns.
        col_names: Column names aligned with ``data``'s columns.
        col_to_idx: Name -> column-index map (for seeding the dropped mask).
        threshold: Absolute-correlation cut-off.
        chunk_size: Requested chunk width; a floor of 500 is applied.
        existing_drops: Columns already marked; kept, skipped, and extended.

    Returns:
        Sorted list of all column names to drop (including names from
        ``existing_drops`` that are not present in this DataFrame).
    """
    logger.info("Using optimized CPU processing...")
    n_samples, n_cols = data.shape

    # Standardize to z-scores (population std, ddof=0)
    means = data.mean(axis=0, keepdims=True)
    stds = data.std(axis=0, keepdims=True)
    stds[stds == 0] = 1.0  # constant columns: avoid division by zero
    data = (data - means) / stds

    # With population-std z-scores, corr = Z.T @ Z / n.  The original divided
    # by (n - 1), inflating every coefficient by n / (n - 1) and thus
    # over-dropping near the threshold.
    scale_factor = 1.0 / n_samples

    # Initialize mask with PRE-EXISTING drops.
    # This optimizes the loop: we won't evaluate rows for columns
    # that came in already dropped.
    dropped_mask = np.zeros(n_cols, dtype=bool)

    for col in existing_drops:
        if col in col_to_idx:
            dropped_mask[col_to_idx[col]] = True

    effective_chunk_size = max(chunk_size, 500)

    with tqdm(total=n_cols, desc="CPU Correlation") as pbar:
        for i in range(0, n_cols, effective_chunk_size):
            i_end = min(i + effective_chunk_size, n_cols)

            chunk_data = data[:, i:i_end]

            # Correlation block: chunk columns vs all columns
            corr_chunk = np.abs(np.matmul(chunk_data.T, data) * scale_factor)

            for local_row in range(corr_chunk.shape[0]):
                global_current_idx = i + local_row

                # OPTIMIZATION: if this column was in the input drop_list OR
                # we just dropped it, skip its whole row.
                if dropped_mask[global_current_idx]:
                    continue

                # Check neighbors to the right (upper triangle only)
                candidates = corr_chunk[local_row, global_current_idx + 1 :]
                hits = np.where(candidates > threshold)[0]

                if hits.size > 0:
                    dropped_mask[global_current_idx + 1 + hits] = True

            pbar.update(i_end - i)

    # Convert mask back to names
    dropped_indices = np.where(dropped_mask)[0]
    newly_identified_drops = {col_names[i] for i in dropped_indices}

    # Merge with original list (in case original list had cols not in this dataframe)
    final_drop_set = existing_drops.union(newly_identified_drops)

    logger.info(f"CPU complete. Total columns to drop: {len(final_drop_set)}")
    return sorted(list(final_drop_set))
0 commit comments