
Commit feffd0a

Merge pull request #48 from izzet/feature/metrics-analysisutils-fragmentation-fix
Reduce pandas fragmentation in metrics and unique-count paths
2 parents 3cc85b5 + c8c75fd · commit feffd0a

2 files changed: 43 additions & 21 deletions
python/dftracer/analyzer/analysis_utils.py (13 additions & 3 deletions)
```diff
@@ -165,7 +165,10 @@ def set_size_bins(df: pd.DataFrame):
 
 
 def set_unique_counts(df: pd.DataFrame, layer: str):
+    # Defragment once before deriving many unique-count columns.
+    df = df.copy()
     unique_cols = [col for col in df.columns if col.endswith('_unique')]
+    nunique_cols = {}
     for unique_col in unique_cols:
         if COL_FILE_NAME in unique_col and 'posix' not in layer:
             continue
@@ -177,10 +180,17 @@ def set_unique_counts(df: pd.DataFrame, layer: str):
                 unique_col,
                 df[unique_col].dtype,
             )
-            df[nunique_col] = 0
+            nunique_cols[nunique_col] = pd.Series(0, index=df.index, dtype='Int32')
         else:
-            df[nunique_col] = df[unique_col].map(len)
-        df[nunique_col] = df[nunique_col].astype('Int32')
+            nunique_cols[nunique_col] = df[unique_col].map(len).astype('Int32')
+
+    if nunique_cols:
+        nunique_df = pd.DataFrame(nunique_cols, index=df.index).astype('Int32')
+        overlapping_cols = [col for col in nunique_df.columns if col in df.columns]
+        if overlapping_cols:
+            df = df.drop(columns=overlapping_cols)
+        df = pd.concat([df, nunique_df], axis=1)
+
     return df.drop(columns=unique_cols)
 
 
```
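Note on the pattern: adding columns one at a time with `df[col] = ...` leaves a separate block in pandas' internal BlockManager per assignment, which is what triggers `PerformanceWarning: DataFrame is highly fragmented`. Both files now collect the derived columns first and materialize them with a single `pd.concat`. A minimal sketch of the idea, using hypothetical column names rather than the analyzer's real schema:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'base': np.arange(1_000)})

# Fragmenting pattern (what this PR avoids): each assignment adds
# another single-column block to the frame.
#   for i in range(200):
#       df[f'derived_{i}'] = df['base'] * i

# Batched pattern: build every new column up front, then concat once.
new_cols = {f'derived_{i}': df['base'] * i for i in range(200)}
new_df = pd.DataFrame(new_cols, index=df.index)

# Drop columns that would collide, then attach the batch in one step.
overlapping = [c for c in new_df.columns if c in df.columns]
df = pd.concat([df.drop(columns=overlapping), new_df], axis=1)
```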
python/dftracer/analyzer/metrics.py (30 additions & 18 deletions)
```diff
@@ -90,40 +90,50 @@ def set_view_metrics(
     time_metric = 'time_sum' if is_view_process_based else 'time_max'
 
     view_metrics = list(set(df.columns.tolist()))
-    new_metrics: List[str] = []
+    new_metrics: Dict[str, pd.Series] = {}
 
     for metric in view_metrics:
         if metric.endswith(count_metric):
             count_col = metric
             count_frac_total_col = metric.replace(count_metric, 'count_frac_total')
             count_sum = df[count_col].sum()
-            df[count_frac_total_col] = df[count_col] / count_sum if count_sum > 0 else pd.NA
-            new_metrics.append(count_frac_total_col)
+            if count_sum > 0:
+                new_metrics[count_frac_total_col] = df[count_col] / count_sum
+            else:
+                new_metrics[count_frac_total_col] = pd.Series(pd.NA, index=df.index, dtype='Float64')
         elif metric.endswith(size_metric):
             size_col = metric
             size_frac_total_col = metric.replace(size_metric, 'size_frac_total')
             size_sum = df[size_col].sum()
-            df[size_frac_total_col] = df[size_col] / size_sum if size_sum > 0 else pd.NA
-            new_metrics.append(size_frac_total_col)
+            if size_sum > 0:
+                new_metrics[size_frac_total_col] = df[size_col] / size_sum
+            else:
+                new_metrics[size_frac_total_col] = pd.Series(pd.NA, index=df.index, dtype='Float64')
         elif metric.endswith(time_metric):
             time_col = metric
             time_frac_total_col = metric.replace(time_metric, 'time_frac_total')
             time_sum = df[time_col].sum()
-            df[time_frac_total_col] = df[time_col] / time_sum if time_sum > 0 else pd.NA
-            new_metrics.append(time_frac_total_col)
+            if time_sum > 0:
+                new_metrics[time_frac_total_col] = df[time_col] / time_sum
+            else:
+                new_metrics[time_frac_total_col] = pd.Series(pd.NA, index=df.index, dtype='Float64')
 
-    count_time_frac_metric_pairs = _find_metric_pairs(new_metrics, 'count_frac_total', 'time_frac_total')
+    count_time_frac_metric_pairs = _find_metric_pairs(list(new_metrics.keys()), 'count_frac_total', 'time_frac_total')
     for count_frac_total_col, time_frac_total_col in count_time_frac_metric_pairs:
         ops_percentile_col = count_frac_total_col.replace('count_frac_total', 'ops_percentile')
         ops_slope_col = count_frac_total_col.replace('count_frac_total', 'ops_slope')
-        ops_slope = df[time_frac_total_col] / df[count_frac_total_col]
+        ops_slope = new_metrics[time_frac_total_col] / new_metrics[count_frac_total_col]
         ops_slope = ops_slope.replace([np.inf, -np.inf], pd.NA)
-        df[ops_percentile_col] = ops_slope.rank(pct=True)
-        df[ops_slope_col] = ops_slope
-        new_metrics.append(ops_percentile_col)
-        new_metrics.append(ops_slope_col)
+        new_metrics[ops_percentile_col] = ops_slope.rank(pct=True)
+        new_metrics[ops_slope_col] = ops_slope
 
-    df[new_metrics] = df[new_metrics].replace([np.inf, -np.inf], pd.NA).astype('Float64')
+    if new_metrics:
+        new_metrics_df = pd.DataFrame(new_metrics, index=df.index)
+        new_metrics_df = new_metrics_df.replace([np.inf, -np.inf], pd.NA).astype('Float64')
+        overlapping_cols = [col for col in new_metrics_df.columns if col in df.columns]
+        if overlapping_cols:
+            df = df.drop(columns=overlapping_cols)
+        df = pd.concat([df, new_metrics_df], axis=1)
 
     return df.sort_index(axis=1)
 
```
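For context, this is what the refactored loop computes: each `*_frac_total` column is a row's share of the column total, and `ops_slope` divides time share by operation share, so values above 1 flag rows whose operations are disproportionately expensive. A toy reproduction under assumed metric names (`posix_count_sum`, `posix_time_sum`; the real names come from the analyzer's views):

```python
import pandas as pd

view = pd.DataFrame({
    'posix_count_sum': [10, 30, 60],
    'posix_time_sum': [5.0, 5.0, 90.0],
})

new_metrics = {}
new_metrics['posix_count_frac_total'] = view['posix_count_sum'] / view['posix_count_sum'].sum()
new_metrics['posix_time_frac_total'] = view['posix_time_sum'] / view['posix_time_sum'].sum()

# Time share over operation share; rank(pct=True) then turns the
# slopes into percentiles across rows.
ops_slope = new_metrics['posix_time_frac_total'] / new_metrics['posix_count_frac_total']
new_metrics['posix_ops_slope'] = ops_slope
new_metrics['posix_ops_percentile'] = ops_slope.rank(pct=True)

# As in the patch: materialize all derived columns with one concat.
print(pd.concat([view, pd.DataFrame(new_metrics, index=view.index).astype('Float64')], axis=1))
```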
```diff
@@ -232,10 +242,12 @@ def set_cross_layer_metrics(
         x_layer_metrics[u_time_frac_parent_col] = u_time_series / df[f"{parent_layer}_{time_metric}"]
 
     if x_layer_metrics:
-        df = df.copy()
-        df = df.assign(**x_layer_metrics)
-        x_layer_cols = list(x_layer_metrics.keys())
-        df[x_layer_cols] = df[x_layer_cols].replace([np.inf, -np.inf], pd.NA).astype('Float64')
+        x_layer_metrics_df = pd.DataFrame(x_layer_metrics, index=df.index)
+        x_layer_metrics_df = x_layer_metrics_df.replace([np.inf, -np.inf], pd.NA).astype('Float64')
+        overlapping_cols = [col for col in x_layer_metrics_df.columns if col in df.columns]
+        if overlapping_cols:
+            df = df.drop(columns=overlapping_cols)
+        df = pd.concat([df.copy(), x_layer_metrics_df], axis=1)
 
     return df.sort_index(axis=1)
 
```
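One subtlety shared by all three sites: unlike `df[col] = ...` or `df.assign(...)`, `pd.concat` does not overwrite same-named columns; it produces duplicate labels. The drop-before-concat step is what keeps these functions safe to re-run on a frame that already carries the derived columns. A small illustration with toy names:

```python
import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
new_df = pd.DataFrame({'b': [30, 40]}, index=df.index)

# Naive concat keeps both copies of 'b'.
print(pd.concat([df, new_df], axis=1).columns.tolist())  # ['a', 'b', 'b']

# Drop the overlap first, as the patch does, to replace it cleanly.
overlapping = [c for c in new_df.columns if c in df.columns]
print(pd.concat([df.drop(columns=overlapping), new_df], axis=1).columns.tolist())  # ['a', 'b']
```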