Skip to content

Commit 35f4451

Browse files
committed
Fix error in transform() when predicted_CCS_uncalibrated is already a DataFrame column instead of being stored in metadata
1 parent 8902973 commit 35f4451

3 files changed

Lines changed: 207 additions & 157 deletions

File tree

im2deep/calibration.py

Lines changed: 11 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -174,11 +174,14 @@ def transform(
174174
if "peptidoform" not in psm_df.columns:
175175
raise CalibrationError("Input DataFrame must contain 'peptidoform' column.")
176176

177-
psm_df["predicted_CCS_uncalibrated"] = psm_df["metadata"].apply(
178-
lambda x: (
179-
x["predicted_CCS_uncalibrated"] if "predicted_CCS_uncalibrated" in x else np.nan
177+
if not "predicted_CCS_uncalibrated" in psm_df.columns and "metadata" in psm_df.columns:
178+
psm_df["predicted_CCS_uncalibrated"] = psm_df["metadata"].apply(
179+
lambda x: (
180+
x["predicted_CCS_uncalibrated"]
181+
if "predicted_CCS_uncalibrated" in x
182+
else np.nan
183+
)
180184
)
181-
)
182185

183186
# Extract charge from peptidoform column efficiently
184187
psm_df["charge"] = psm_df["peptidoform"].apply(
@@ -191,7 +194,7 @@ def transform(
191194
else:
192195
# Global calibration - use same shift for all
193196
psm_df["shift"] = self.general_shift
194-
197+
195198
# Apply shift, handling both scalar and array CCS values (for multiconformer predictions)
196199
def apply_shift(ccs_value, shift_value):
197200
if isinstance(ccs_value, (list, np.ndarray)):
@@ -200,10 +203,9 @@ def apply_shift(ccs_value, shift_value):
200203
else:
201204
# Single value
202205
return float(ccs_value + shift_value)
203-
206+
204207
psm_df["calibrated_CCS"] = psm_df.apply(
205-
lambda row: apply_shift(row["predicted_CCS_uncalibrated"], row["shift"]),
206-
axis=1
208+
lambda row: apply_shift(row["predicted_CCS_uncalibrated"], row["shift"]), axis=1
207209
)
208210

209211
# Return as numpy object array to preserve multiconformer arrays
@@ -298,7 +300,7 @@ def get_charge(pf):
298300

299301
target_work["peptide_key"] = target_work["peptidoform"].apply(get_peptide_key)
300302
target_work["charge"] = target_work["peptidoform"].apply(get_charge)
301-
303+
302304
# Extract CCS from metadata if it's not a direct column
303305
if "CCS" not in target_work.columns and "metadata" in target_work.columns:
304306
target_work["CCS"] = target_work["metadata"].apply(

tests/test_calibration.py

Lines changed: 119 additions & 116 deletions
Original file line number | Diff line number | Diff line change
@@ -33,15 +33,14 @@ def test_fit_per_charge(self, sample_peptidoforms, sample_ccs_values, sample_pre
3333
calibration = LinearCCSCalibration(per_charge=True)
3434

3535
# Create DataFrames for target and source
36-
target_df = pd.DataFrame({
37-
'peptidoform': sample_peptidoforms,
38-
'metadata': [{'CCS': ccs} for ccs in sample_ccs_values]
39-
})
40-
41-
source_df = pd.DataFrame({
42-
'peptidoform': sample_peptidoforms,
43-
'CCS': sample_predicted_ccs
44-
})
36+
target_df = pd.DataFrame(
37+
{
38+
"peptidoform": sample_peptidoforms,
39+
"metadata": [{"CCS": ccs} for ccs in sample_ccs_values],
40+
}
41+
)
42+
43+
source_df = pd.DataFrame({"peptidoform": sample_peptidoforms, "CCS": sample_predicted_ccs})
4544

4645
calibration.fit(
4746
psm_df_target=target_df,
@@ -56,15 +55,14 @@ def test_fit_global(self, sample_peptidoforms, sample_ccs_values, sample_predict
5655
"""Test fitting with global calibration."""
5756
calibration = LinearCCSCalibration(per_charge=False, use_charge_state=2)
5857

59-
target_df = pd.DataFrame({
60-
'peptidoform': sample_peptidoforms,
61-
'metadata': [{'CCS': ccs} for ccs in sample_ccs_values]
62-
})
63-
64-
source_df = pd.DataFrame({
65-
'peptidoform': sample_peptidoforms,
66-
'CCS': sample_predicted_ccs
67-
})
58+
target_df = pd.DataFrame(
59+
{
60+
"peptidoform": sample_peptidoforms,
61+
"metadata": [{"CCS": ccs} for ccs in sample_ccs_values],
62+
}
63+
)
64+
65+
source_df = pd.DataFrame({"peptidoform": sample_peptidoforms, "CCS": sample_predicted_ccs})
6866

6967
calibration.fit(
7068
psm_df_target=target_df,
@@ -81,27 +79,30 @@ def test_transform_single_output(
8179
"""Test transforming single-output predictions."""
8280
calibration = LinearCCSCalibration(per_charge=True)
8381

84-
target_df = pd.DataFrame({
85-
'peptidoform': sample_peptidoforms,
86-
'metadata': [{'CCS': ccs} for ccs in sample_ccs_values]
87-
})
88-
89-
source_df = pd.DataFrame({
90-
'peptidoform': sample_peptidoforms,
91-
'CCS': sample_predicted_ccs
92-
})
82+
target_df = pd.DataFrame(
83+
{
84+
"peptidoform": sample_peptidoforms,
85+
"metadata": [{"CCS": ccs} for ccs in sample_ccs_values],
86+
}
87+
)
88+
89+
source_df = pd.DataFrame({"peptidoform": sample_peptidoforms, "CCS": sample_predicted_ccs})
9390

9491
calibration.fit(
9592
psm_df_target=target_df,
9693
psm_df_source=source_df,
9794
)
9895

9996
# Transform with predictions in metadata
100-
transform_df = pd.DataFrame({
101-
'peptidoform': sample_peptidoforms,
102-
'metadata': [{'predicted_CCS_uncalibrated': pred} for pred in sample_predicted_ccs]
103-
})
104-
97+
transform_df = pd.DataFrame(
98+
{
99+
"peptidoform": sample_peptidoforms,
100+
"metadata": [
101+
{"predicted_CCS_uncalibrated": pred} for pred in sample_predicted_ccs
102+
],
103+
}
104+
)
105+
105106
calibrated = calibration.transform(transform_df)
106107

107108
assert len(calibrated) == len(sample_predicted_ccs)
@@ -113,27 +114,32 @@ def test_transform_multi_output(
113114
"""Test transforming multi-output predictions."""
114115
calibration = LinearCCSCalibration(per_charge=True)
115116

116-
target_df = pd.DataFrame({
117-
'peptidoform': sample_peptidoforms,
118-
'metadata': [{'CCS': ccs} for ccs in sample_ccs_values]
119-
})
120-
121-
source_df = pd.DataFrame({
122-
'peptidoform': sample_peptidoforms,
123-
'CCS': sample_ccs_values - 2.0 # Simulate shift
124-
})
117+
target_df = pd.DataFrame(
118+
{
119+
"peptidoform": sample_peptidoforms,
120+
"metadata": [{"CCS": ccs} for ccs in sample_ccs_values],
121+
}
122+
)
123+
124+
source_df = pd.DataFrame(
125+
{"peptidoform": sample_peptidoforms, "CCS": sample_ccs_values - 2.0} # Simulate shift
126+
)
125127

126128
calibration.fit(
127129
psm_df_target=target_df,
128130
psm_df_source=source_df,
129131
)
130132

131133
# Transform multi-output with arrays in metadata
132-
transform_df = pd.DataFrame({
133-
'peptidoform': sample_peptidoforms,
134-
'metadata': [{'predicted_CCS_uncalibrated': pred} for pred in sample_predicted_ccs_multi]
135-
})
136-
134+
transform_df = pd.DataFrame(
135+
{
136+
"peptidoform": sample_peptidoforms,
137+
"metadata": [
138+
{"predicted_CCS_uncalibrated": pred} for pred in sample_predicted_ccs_multi
139+
],
140+
}
141+
)
142+
137143
calibrated = calibration.transform(transform_df)
138144

139145
assert len(calibrated) == len(sample_predicted_ccs_multi)
@@ -146,10 +152,14 @@ def test_transform_not_fitted(self, sample_peptidoforms, sample_predicted_ccs):
146152
"""Test transform raises error when not fitted."""
147153
calibration = LinearCCSCalibration()
148154

149-
transform_df = pd.DataFrame({
150-
'peptidoform': sample_peptidoforms,
151-
'metadata': [{'predicted_CCS_uncalibrated': pred} for pred in sample_predicted_ccs]
152-
})
155+
transform_df = pd.DataFrame(
156+
{
157+
"peptidoform": sample_peptidoforms,
158+
"metadata": [
159+
{"predicted_CCS_uncalibrated": pred} for pred in sample_predicted_ccs
160+
],
161+
}
162+
)
153163

154164
with pytest.raises(CalibrationError, match="not been fitted"):
155165
calibration.transform(transform_df)
@@ -158,15 +168,11 @@ def test_calculate_ccs_shift_no_overlap(self):
158168
"""Test shift calculation with no overlapping peptides."""
159169
calibration = LinearCCSCalibration(per_charge=False, use_charge_state=2)
160170

161-
target_df = pd.DataFrame({
162-
'peptidoform': [Peptidoform("PEPTIDE/2")],
163-
'metadata': [{'CCS': 450.0}]
164-
})
165-
166-
source_df = pd.DataFrame({
167-
'peptidoform': [Peptidoform("DIFFERENT/2")],
168-
'CCS': [460.0]
169-
})
171+
target_df = pd.DataFrame(
172+
{"peptidoform": [Peptidoform("PEPTIDE/2")], "metadata": [{"CCS": 450.0}]}
173+
)
174+
175+
source_df = pd.DataFrame({"peptidoform": [Peptidoform("DIFFERENT/2")], "CCS": [460.0]})
170176

171177
shift = calibration.calculate_ccs_shift(target_df, source_df)
172178

@@ -177,16 +183,12 @@ def test_calculate_ccs_shift_with_overlap(self):
177183
calibration = LinearCCSCalibration(per_charge=False, use_charge_state=2)
178184

179185
peptidoforms = [Peptidoform("PEPTIDE/2"), Peptidoform("SEQUENCE/2")]
180-
181-
target_df = pd.DataFrame({
182-
'peptidoform': peptidoforms,
183-
'metadata': [{'CCS': 450.0}, {'CCS': 520.0}]
184-
})
185-
186-
source_df = pd.DataFrame({
187-
'peptidoform': peptidoforms,
188-
'CCS': [445.0, 515.0]
189-
})
186+
187+
target_df = pd.DataFrame(
188+
{"peptidoform": peptidoforms, "metadata": [{"CCS": 450.0}, {"CCS": 520.0}]}
189+
)
190+
191+
source_df = pd.DataFrame({"peptidoform": peptidoforms, "CCS": [445.0, 515.0]})
190192

191193
shift = calibration.calculate_ccs_shift(target_df, source_df)
192194

@@ -200,16 +202,15 @@ def test_compute_ccs_shift_per_charge(self):
200202
Peptidoform("SEQUENCE/3"),
201203
Peptidoform("TEST/2"),
202204
]
203-
204-
target_df = pd.DataFrame({
205-
'peptidoform': peptidoforms,
206-
'metadata': [{'CCS': 450.0}, {'CCS': 520.0}, {'CCS': 480.0}]
207-
})
208-
209-
source_df = pd.DataFrame({
210-
'peptidoform': peptidoforms,
211-
'CCS': [445.0, 515.0, 475.0]
212-
})
205+
206+
target_df = pd.DataFrame(
207+
{
208+
"peptidoform": peptidoforms,
209+
"metadata": [{"CCS": 450.0}, {"CCS": 520.0}, {"CCS": 480.0}],
210+
}
211+
)
212+
213+
source_df = pd.DataFrame({"peptidoform": peptidoforms, "CCS": [445.0, 515.0, 475.0]})
213214

214215
shifts = LinearCCSCalibration._compute_ccs_shift_per_charge(target_df, source_df)
215216

@@ -223,15 +224,16 @@ def test_fit_with_missing_charges(self, sample_peptidoforms, sample_ccs_values):
223224
"""Test that missing charges are filled with general shift."""
224225
calibration = LinearCCSCalibration(per_charge=True)
225226

226-
target_df = pd.DataFrame({
227-
'peptidoform': sample_peptidoforms,
228-
'metadata': [{'CCS': ccs} for ccs in sample_ccs_values]
229-
})
230-
231-
source_df = pd.DataFrame({
232-
'peptidoform': sample_peptidoforms,
233-
'CCS': sample_ccs_values - 5.0
234-
})
227+
target_df = pd.DataFrame(
228+
{
229+
"peptidoform": sample_peptidoforms,
230+
"metadata": [{"CCS": ccs} for ccs in sample_ccs_values],
231+
}
232+
)
233+
234+
source_df = pd.DataFrame(
235+
{"peptidoform": sample_peptidoforms, "CCS": sample_ccs_values - 5.0}
236+
)
235237

236238
calibration.fit(
237239
psm_df_target=target_df,
@@ -247,15 +249,14 @@ def test_fit_invalid_charge_state(self, sample_peptidoforms, sample_ccs_values):
247249
"""Test that invalid charge state raises error."""
248250
calibration = LinearCCSCalibration(per_charge=False, use_charge_state=10)
249251

250-
target_df = pd.DataFrame({
251-
'peptidoform': sample_peptidoforms,
252-
'metadata': [{'CCS': ccs} for ccs in sample_ccs_values]
253-
})
254-
255-
source_df = pd.DataFrame({
256-
'peptidoform': sample_peptidoforms,
257-
'CCS': sample_ccs_values
258-
})
252+
target_df = pd.DataFrame(
253+
{
254+
"peptidoform": sample_peptidoforms,
255+
"metadata": [{"CCS": ccs} for ccs in sample_ccs_values],
256+
}
257+
)
258+
259+
source_df = pd.DataFrame({"peptidoform": sample_peptidoforms, "CCS": sample_ccs_values})
259260

260261
with pytest.raises(CalibrationError, match="Invalid charge state"):
261262
calibration.calculate_ccs_shift(target_df, source_df)
@@ -271,19 +272,23 @@ def test_shift_broadcasting(self, sample_peptidoforms):
271272

272273
# Test single output
273274
single_pred = np.array([450.0, 520.0, 480.0], dtype=np.float32)
274-
single_df = pd.DataFrame({
275-
'peptidoform': sample_peptidoforms,
276-
'metadata': [{'predicted_CCS_uncalibrated': pred} for pred in single_pred]
277-
})
275+
single_df = pd.DataFrame(
276+
{
277+
"peptidoform": sample_peptidoforms,
278+
"metadata": [{"predicted_CCS_uncalibrated": pred} for pred in single_pred],
279+
}
280+
)
278281
single_cal = calibration.transform(single_df)
279282
assert len(single_cal) == 3
280283

281284
# Test multi output
282285
multi_pred = np.array([[450.0, 452.0], [520.0, 524.0], [480.0, 482.0]], dtype=np.float32)
283-
multi_df = pd.DataFrame({
284-
'peptidoform': sample_peptidoforms,
285-
'metadata': [{'predicted_CCS_uncalibrated': pred} for pred in multi_pred]
286-
})
286+
multi_df = pd.DataFrame(
287+
{
288+
"peptidoform": sample_peptidoforms,
289+
"metadata": [{"predicted_CCS_uncalibrated": pred} for pred in multi_pred],
290+
}
291+
)
287292
multi_cal = calibration.transform(multi_df)
288293
assert len(multi_cal) == 3
289294
# Check arrays are preserved
@@ -295,23 +300,21 @@ def test_get_default_reference(self):
295300
try:
296301
reference_df = get_default_reference(multi=False)
297302
assert isinstance(reference_df, pd.DataFrame)
298-
assert 'peptidoform' in reference_df.columns
299-
assert 'CCS' in reference_df.columns
303+
assert "peptidoform" in reference_df.columns
304+
assert "CCS" in reference_df.columns
300305
assert len(reference_df) > 0
301306
except FileNotFoundError:
302307
pytest.skip("Default reference dataset not found")
303308

304309
def test_large_shift_warning(self, caplog):
305310
"""Test that large shifts trigger a warning."""
306-
target_df = pd.DataFrame({
307-
'peptidoform': [Peptidoform("PEPTIDE/2")],
308-
'metadata': [{'CCS': 450.0}]
309-
})
310-
311-
source_df = pd.DataFrame({
312-
'peptidoform': [Peptidoform("PEPTIDE/2")],
313-
'CCS': [300.0] # Large difference
314-
})
311+
target_df = pd.DataFrame(
312+
{"peptidoform": [Peptidoform("PEPTIDE/2")], "metadata": [{"CCS": 450.0}]}
313+
)
314+
315+
source_df = pd.DataFrame(
316+
{"peptidoform": [Peptidoform("PEPTIDE/2")], "CCS": [300.0]} # Large difference
317+
)
315318

316319
shift = LinearCCSCalibration._compute_ccs_shift(target_df, source_df, 2)
317320

0 commit comments

Comments
 (0)