Skip to content

Commit 30732c4

Browse files
committed
Rename gini_pairwise to somersd_pairwise with ties parameter
1 parent be914d1 commit 30732c4

3 files changed

Lines changed: 693 additions & 13 deletions

File tree

fastwoe/fast_somersd.py

Lines changed: 89 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -65,22 +65,17 @@ def _somers_yx_weighted(
6565
for j in range(i + 1, n):
6666
w_ij = weights[i] * weights[j]
6767

68-
if y[i] != y[j]: # Not tied in Y
69-
if (y[i] < y[j] and x[i] < x[j]) or (y[i] > y[j] and x[i] > x[j]):
70-
concordant += w_ij
71-
elif (y[i] < y[j] and x[i] > x[j]) or (y[i] > y[j] and x[i] < x[j]):
72-
discordant += w_ij
73-
else: # Tied in Y
68+
if y[i] == y[j]: # Tied in Y
7469
ties_y += w_ij
7570

71+
elif (y[i] < y[j] and x[i] < x[j]) or (y[i] > y[j] and x[i] > x[j]):
72+
concordant += w_ij
73+
elif (y[i] < y[j] and x[i] > x[j]) or (y[i] > y[j] and x[i] < x[j]):
74+
discordant += w_ij
7675
total_pairs = concordant + discordant + ties_y
7776
denom = concordant + discordant # Exclude ties in Y from denominator
7877

79-
if denom > 0:
80-
stat = (concordant - discordant) / denom
81-
else:
82-
stat = np.nan
83-
78+
stat = (concordant - discordant) / denom if denom > 0 else np.nan
8479
return stat, concordant, discordant, ties_y, total_pairs, denom
8580

8681

@@ -282,3 +277,86 @@ def somersd_xy(y_true: np.ndarray, y_pred: np.ndarray) -> SomersDResult:
282277
x = x[mask]
283278
stat, S, D, Tx, P, denom = _somers_xy_core(y, x) # type: ignore[misc]
284279
return SomersDResult(stat, S, D, Tx, P, denom)
280+
281+
282+
def somersd_pairwise(
    pos_scores: np.ndarray, neg_scores: np.ndarray, ties: str = "y"
) -> float | None:
    """Return Somers' D for how well scores separate positives from negatives.

    The two score groups are stacked into one score vector with a matching
    binary label vector (1.0 for every positive score, 0.0 for every negative
    score). That pair of vectors is then handed to ``somersd_yx`` or
    ``somersd_xy`` depending on ``ties``, and the ``statistic`` field of the
    returned result is reported.

    Args:
        pos_scores: Scores belonging to the positive class (label=1).
        neg_scores: Scores belonging to the negative class (label=0).
        ties: Which estimator to use. ``"y"`` (default) calls ``somersd_yx``
            and ``"x"`` calls ``somersd_xy``; both receive
            ``(labels, scores)`` in that order.
            NOTE(review): the original documentation describes these as
            D_Y|X (ties in Y excluded) and D_X|Y (ties in X excluded)
            respectively -- confirm against the estimator implementations.

    Returns:
        The statistic as a plain ``float``, or ``None`` when either group is
        empty after NaN removal or when the estimator reports NaN.

    Raises:
        ValueError: If ``ties`` is neither ``"x"`` nor ``"y"``.

    Note:
        NaN scores are discarded independently from each group before the
        groups are combined. The original documentation states that, for
        binary labels, Somers' D equals the Gini coefficient (2 * AUC - 1)
        and that the underlying estimators run in O(n log n); neither claim
        is verifiable from this function alone.

    Examples:
        Perfectly separated groups (expected result 1.0 for either setting):

        >>> pos = np.array([0.8, 0.9, 0.7])
        >>> neg = np.array([0.3, 0.4, 0.2])
        >>> somersd_pairwise(pos, neg)
        1.0
        >>> somersd_pairwise(pos, neg, ties="x")
        1.0
    """
    if ties not in ("x", "y"):
        raise ValueError(f"ties must be 'x' or 'y', got {ties}")

    positives = np.asarray(pos_scores, dtype=np.float64)
    negatives = np.asarray(neg_scores, dtype=np.float64)

    # Missing scores carry no ordering information; drop them per group.
    positives = positives[~np.isnan(positives)]
    negatives = negatives[~np.isnan(negatives)]

    n_pos, n_neg = len(positives), len(negatives)
    if n_pos == 0 or n_neg == 0:
        return None

    # One combined sample: scores plus a 1.0/0.0 label marking the source group.
    combined_scores = np.concatenate([positives, negatives])
    combined_labels = np.repeat(np.array([1.0, 0.0], dtype=np.float64), [n_pos, n_neg])

    estimator = somersd_yx if ties == "y" else somersd_xy
    statistic = estimator(combined_labels, combined_scores).statistic

    if np.isnan(statistic):
        return None
    return float(statistic)
354+
355+
356+
# Backward compatibility alias
357+
def gini_pairwise(pos_scores: np.ndarray, neg_scores: np.ndarray) -> float | None:
    """Deprecated alias for ``somersd_pairwise`` with ``ties="y"``.

    Kept so that code written against the old name keeps working. Emits a
    ``DeprecationWarning`` on every call so callers are told to migrate.

    Args:
        pos_scores: Scores belonging to the positive class (label=1).
        neg_scores: Scores belonging to the negative class (label=0).

    Returns:
        Whatever ``somersd_pairwise(pos_scores, neg_scores, ties="y")``
        returns: a ``float`` statistic, or ``None``.
    """
    # Local import keeps this shim self-contained; warnings is stdlib.
    import warnings

    # stacklevel=2 attributes the warning to the caller, not to this shim.
    warnings.warn(
        "gini_pairwise is deprecated; use somersd_pairwise instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    return somersd_pairwise(pos_scores, neg_scores, ties="y")

fastwoe/metrics.py

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from matplotlib import pyplot as plt
1212
from scipy.special import expit, logit
1313

14-
from .fast_somersd import somersd_yx
14+
from .fast_somersd import somersd_pairwise, somersd_yx
1515

1616

1717
def plot_performance(
@@ -297,3 +297,66 @@ def value_formatter(x, p):
297297
return frame[["category", "woe", "proba", "proba_delta"]]
298298
else:
299299
return frame[["category", "woe", "log_odds", "log_odds_delta"]]
300+
301+
302+
def gini_clustered_matrix(
    df: pd.DataFrame,
    score_col: str,
    label_col: str,
    cluster_col: str,
) -> tuple[pd.DataFrame, float | None]:
    """Compute the intra/inter-cluster Gini (Somers' D) matrix.

    Element (i, j) of the matrix is ``somersd_pairwise`` applied to the
    positive scores of cluster i and the negative scores of cluster j.

    Args:
        df: DataFrame containing scores, labels, and cluster assignments.
        score_col: Name of the column containing model scores.
        label_col: Name of the column containing binary labels. Rows with
            label ``1`` are treated as positives, rows with label ``0`` as
            negatives; any other label value is ignored.
        cluster_col: Name of the column containing cluster assignments.

    Returns:
        Tuple of ``(gini_matrix, global_gini)``:

        - ``gini_matrix``: float DataFrame indexed (rows and columns) by the
          sorted unique cluster values.
            - Diagonal (i, i): positives vs negatives of the same cluster.
            - Off-diagonal (i, j): positives of cluster i vs negatives of
              cluster j.
          Cells for which ``somersd_pairwise`` returns ``None`` hold NaN.
        - ``global_gini``: ``somersd_pairwise`` over all positives vs all
          negatives, ignoring clusters (may be ``None``).

    Examples:
        >>> df = pd.DataFrame({
        ...     'score': [0.8, 0.9, 0.3, 0.4, 0.7, 0.6],
        ...     'label': [1, 1, 0, 0, 1, 0],
        ...     'cluster': ['C1', 'C1', 'C1', 'C2', 'C2', 'C2']
        ... })
        >>> matrix, global_gini = gini_clustered_matrix(
        ...     df, 'score', 'label', 'cluster'
        ... )
        >>> print(matrix)
        >>> print(f"Global Gini: {global_gini}")
    """
    clusters = sorted(df[cluster_col].unique())

    # Row selectors that do not depend on the cluster pair: compute them once.
    scores = df[score_col]
    is_positive = df[label_col] == 1
    is_negative = df[label_col] == 0

    # Slice each cluster's positive and negative scores a single time instead
    # of re-filtering the DataFrame for every (ci, cj) pair. Kept as lists
    # parallel to `clusters` so cluster values never need to be hashed.
    pos_by_cluster = []
    neg_by_cluster = []
    for cluster in clusters:
        in_cluster = df[cluster_col] == cluster
        pos_by_cluster.append(scores[in_cluster & is_positive].values)
        neg_by_cluster.append(scores[in_cluster & is_negative].values)

    gini_matrix = pd.DataFrame(index=clusters, columns=clusters, dtype=float)
    for ci, pos_scores in zip(clusters, pos_by_cluster):
        for cj, neg_scores in zip(clusters, neg_by_cluster):
            value = somersd_pairwise(pos_scores, neg_scores)
            # The matrix is float-typed, so store "undefined" as an explicit NaN.
            gini_matrix.loc[ci, cj] = float("nan") if value is None else value

    global_gini = somersd_pairwise(
        scores[is_positive].values, scores[is_negative].values
    )

    return gini_matrix, global_gini

0 commit comments

Comments
 (0)