Skip to content

Commit 7c24b6d

Browse files
authored
Merge pull request #48 from bigbio/dev
Fix Bruker issues.
2 parents 66de992 + 4ef17ad commit 7c24b6d

2 files changed

Lines changed: 25 additions & 10 deletions

File tree

quantmsutils/mzml/mzml_statistics.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -100,21 +100,21 @@ def batch_write_bruker_d(file_name: str, output_path: str, batch_size: int = 100
100100

101101
# Get allowed columns from the schema
102102
allowed_columns = {
103-
"Id": "Id",
104-
"MsMsType": "CASE WHEN MsMsType IN (8, 9) THEN 2 WHEN MsMsType = 0 THEN 1 ELSE NULL END",
105-
"NumPeaks": "NumPeaks",
106-
"MaxIntensity": "MaxIntensity",
107-
"SummedIntensities": "SummedIntensities",
108-
"Time": "Time",
109-
"Charge": "Charge",
110-
"MonoisotopicMz": "MonoisotopicMz",
103+
"Id": ("Id", SCAN),
104+
"MsMsType": ("CASE WHEN MsMsType IN (8, 9) THEN 2 WHEN MsMsType = 0 THEN 1 ELSE NULL END",MS_LEVEL),
105+
"NumPeaks": ("NumPeaks",NUM_PEAKS),
106+
"MaxIntensity": ("MaxIntensity",BASE_PEAK_INTENSITY),
107+
"SummedIntensities": ("SummedIntensities",SUMMED_PEAK_INTENSITY),
108+
"Time": ("Time", RETENTION_TIME),
109+
"Charge": ("Charge", CHARGE),
110+
"MonoisotopicMz": ("MonoisotopicMz", EXPERIMENTAL_MASS_TO_CHARGE),
111111
}
112112

113113
# Construct safe column list
114114
safe_columns = []
115115
for schema_col_name, sql_expr in allowed_columns.items():
116116
if schema_col_name in columns or schema_col_name == "Id":
117-
safe_columns.append(sql_expr)
117+
safe_columns.append(sql_expr[0])
118118

119119
# Construct the query using safe columns
120120
query = f"""SELECT {', '.join(safe_columns)} FROM frames"""
@@ -125,7 +125,10 @@ def batch_write_bruker_d(file_name: str, output_path: str, batch_size: int = 100
125125
) as parquet_writer:
126126
# Stream data in batches
127127
for chunk in pd.read_sql_query(query, conn, chunksize=batch_size):
128-
chunk["AcquisitionDateTime"] = acquisition_date_time
128+
chunk[ACQUISITION_DATETIME] = acquisition_date_time
129+
# Change column names to match the schema using allowed columns mapping
130+
chunk.rename(columns={v[0]: v[1] for v in allowed_columns.values()}, inplace=True)
131+
chunk[SCAN] = chunk[SCAN].astype(str)
129132
for col in schema.names:
130133
if col not in chunk.columns:
131134
chunk[col] = None

tests/test_commands.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,4 +145,16 @@ def test_mzml_statistics_local(self):
145145
assert os.path.exists(TEST_DATA_DIR / "RD139_Narrow_UPS1_0_1fmol_inj1_ms_info.parquet")
146146

147147
output_table = pd.read_parquet(TEST_DATA_DIR / "RD139_Narrow_UPS1_0_1fmol_inj1_ms_info.parquet")
148+
assert len(output_table) > 0, "Output table is empty"
149+
150+
@pytest.mark.skip("Test to be run locally, with bruker file")
151+
def test_mzml_statistics_bruker(self):
152+
"""Test mzML statistics on Bruker sample"""
153+
args = ["--ms2_file", "--ms_path", str(TEST_DATA_DIR / "hMICAL1_coiPAnP-N2-200_3Murea-1Mthiourea-200mMtcep_14733.d")]
154+
result = run_cli_command("mzmlstats", args)
155+
156+
assert result.exit_code == 0
157+
assert os.path.exists(TEST_DATA_DIR / "hMICAL1_coiPAnP-N2-200_3Murea-1Mthiourea-200mMtcep_14733_ms_info.parquet")
158+
159+
output_table = pd.read_parquet(TEST_DATA_DIR / "hMICAL1_coiPAnP-N2-200_3Murea-1Mthiourea-200mMtcep_14733_ms_info.parquet")
148160
assert len(output_table) > 0, "Output table is empty"

0 commit comments

Comments (0)