Skip to content

Commit 77d0960

Browse files
teonbrookslarsonerautofix-ci[bot]drammock
authored
Update values to int64 (#13548)
Co-authored-by: Eric Larson <larson.eric.d@gmail.com> Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: Daniel McCloy <dan@mccloy.info>
1 parent d8ce70a commit 77d0960

6 files changed

Lines changed: 290 additions & 153 deletions

File tree

doc/changes/v1.12.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ Bugfixes
5555
preserve all remaining info fields.
5656
- Avoid some unnecessary computations when ``n_jobs=None`` is equivalent to ``n_jobs=1``, by `Simon Kern`_. (`#13777 <https://github.com/mne-tools/mne-python/pull/13777>`__)
5757
- Fix clipped annotations and x-axis label in :meth:`mne.io.Raw.plot`, by `Clemens Brunner`_. (`#13787 <https://github.com/mne-tools/mne-python/pull/13787>`__)
58+
- Fix bug with reading large CNT files by adding ``recompute_n_samples`` to :func:`mne.io.read_raw_cnt` with stricter ``data_format`` handling, by `Teon Brooks`_ and `Eric Larson`_. (`#13548 <https://github.com/mne-tools/mne-python/pull/13548>`__)
5859

5960

6061
New features
@@ -133,6 +134,7 @@ Authors
133134
- Stefan Appelhoff
134135
- Tamas Fehervari+
135136
- Teemu Taivainen+
137+
- Teon Brooks
136138
- Thomas A Caswell+
137139
- Thomas S. Binns
138140
- Varun Kasyap Pentamaraju+

mne/annotations.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1796,7 +1796,12 @@ def _write_annotations_txt(fname, annot):
17961796

17971797
@fill_doc
17981798
def read_annotations(
1799-
fname, sfreq="auto", uint16_codec=None, encoding="utf8", ignore_marker_types=False
1799+
fname,
1800+
sfreq="auto",
1801+
uint16_codec=None,
1802+
encoding="utf8",
1803+
ignore_marker_types=False,
1804+
data_format="auto",
18001805
) -> Annotations:
18011806
r"""Read annotations from a file.
18021807
@@ -1830,6 +1835,8 @@ def read_annotations(
18301835
ignore_marker_types : bool
18311836
If ``True``, ignore marker types in BrainVision files (and only use their
18321837
descriptions). Defaults to ``False``.
1838+
data_format : str
1839+
Only used by CNT files, see :func:`mne.io.read_raw_cnt` for details.
18331840
18341841
Returns
18351842
-------
@@ -1874,6 +1881,7 @@ def read_annotations(
18741881
kwargs = {
18751882
".vmrk": {"sfreq": sfreq, "ignore_marker_types": ignore_marker_types},
18761883
".amrk": {"sfreq": sfreq, "ignore_marker_types": ignore_marker_types},
1884+
".cnt": {"data_format": data_format},
18771885
".dat": {"sfreq": sfreq},
18781886
".cdt": {"sfreq": sfreq},
18791887
".cef": {"sfreq": sfreq},

mne/io/cnt/_utils.py

Lines changed: 113 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,15 @@
1010

1111
import numpy as np
1212

13-
from ...utils import warn
13+
from ...utils import _check_option, _validate_type, logger, warn
14+
15+
# Offsets from SETUP structure in http://paulbourke.net/dataformats/eeg/
16+
_NCHANNELS_OFFSET = 370
17+
_NSAMPLES_OFFSET = 864
18+
_RATE_OFFSET = 376
19+
_EVENTTABLEPOS_OFFSET = 886
20+
_DATA_OFFSET = 900 # Size of the 'SETUP' header.
21+
_CH_SIZE = 75 # Size of each channel in bytes
1422

1523

1624
def _read_teeg(f, teeg_offset):
@@ -105,8 +113,8 @@ def _session_date_2_meas_date(session_date, date_format):
105113
return (int_part, frac_part)
106114

107115

108-
def _compute_robust_event_table_position(fid, data_format="int32"):
109-
"""Compute `event_table_position`.
116+
def _compute_robust_sizes(*, fid, data_format, recompute_n_samples):
117+
"""Compute n_channels, n_samples, n_bytes, and event_table_position.
110118
111119
When recording, event_table_position is computed (as an accumulation). If the
112120
file recording is large then this value overflows and ends up pointing
@@ -115,36 +123,113 @@ def _compute_robust_event_table_position(fid, data_format="int32"):
115123
If the file is smaller than 2G the value in the SETUP is returned.
116124
Otherwise, the address of the table position is computed from:
117125
n_samples, n_channels, and the bytes size.
118-
"""
119-
SETUP_NCHANNELS_OFFSET = 370
120-
SETUP_NSAMPLES_OFFSET = 864
121-
SETUP_EVENTTABLEPOS_OFFSET = 886
122-
123-
fid_origin = fid.tell() # save the state
124-
125-
if fid.seek(0, SEEK_END) < 2e9:
126-
fid.seek(SETUP_EVENTTABLEPOS_OFFSET)
127-
(event_table_pos,) = np.frombuffer(fid.read(4), dtype="<i4")
128126
127+
Reference: https://paulbourke.net/dataformats/eeg/
128+
Header has a field for number of samples, but it does not seem to be
129+
too reliable.
130+
"""
131+
_check_option("data_format", data_format, ["auto", "int16", "int32"])
132+
# Read the number of channels and samples from the header
133+
fid.seek(_NCHANNELS_OFFSET)
134+
n_channels = int(np.fromfile(fid, dtype="<u2", count=1).item())
135+
logger.debug("Number of channels: %d", n_channels)
136+
fid.seek(_NSAMPLES_OFFSET)
137+
n_samples = int(np.frombuffer(fid.read(4), dtype="<i4").item()) # may be unreliable
138+
logger.debug("Header number of samples: %d", n_samples)
139+
file_size = fid.seek(0, SEEK_END)
140+
workaround = "pass data_format='int16' or 'int32' explicitly"
141+
samples_offset = _DATA_OFFSET + _CH_SIZE * n_channels
142+
_validate_type(recompute_n_samples, (bool, None), "recompute_n_samples")
143+
if file_size < 2e9:
144+
logger.debug("File size < 2GB, using header values")
145+
fid.seek(_EVENTTABLEPOS_OFFSET)
146+
event_offset = int(np.frombuffer(fid.read(4), dtype="<i4").item())
147+
logger.debug("Event table offset from header: %d", event_offset)
148+
if event_offset > file_size:
149+
problem = (
150+
f"Event table offset from header ({event_offset}) is larger than file "
151+
f"size ({file_size})"
152+
)
153+
if data_format == "auto":
154+
raise RuntimeError(
155+
f"{problem}, cannot automatically compute data format, {workaround}"
156+
)
157+
warn(
158+
f"Event table offset from header ({event_offset}) is larger than file "
159+
f"size ({file_size}), recomputing event table offset."
160+
)
161+
n_bytes = 2 if data_format == "int16" else 4
162+
event_offset = samples_offset + n_samples * n_channels * n_bytes
163+
n_data_bytes = event_offset - samples_offset
164+
if data_format == "auto":
165+
n_bytes_per_chan, rem = divmod(n_data_bytes, n_channels)
166+
why = ""
167+
# starting assumption is 16-bit ints
168+
n_bytes = 2
169+
if rem != 0:
170+
why = (
171+
f"number of data bytes {n_data_bytes} is not evenly divisible by "
172+
f"{n_channels=}"
173+
)
174+
elif n_samples == 0:
175+
why = "number of samples (according to header) is 0"
176+
else:
177+
# we know `n_channels` divides evenly into `n_data_bytes`, and header
178+
# said `n_samples` was non-zero, so try to infer `n_bytes`:
179+
n_bytes, rem = divmod(n_bytes_per_chan, n_samples)
180+
if rem != 0 or n_bytes not in [2, 4]:
181+
why = (
182+
f"number of bytes per channel {n_bytes_per_chan} is not evenly "
183+
f"divisible by {n_samples=} or does not result in 2 or 4 bytes "
184+
f"per sample ({n_bytes=})"
185+
)
186+
logger.debug("Inferred data format with %d bytes per sample", n_bytes)
187+
if why:
188+
raise RuntimeError(
189+
"Could not automatically compute number of bytes per sample as the "
190+
f"{why}. set data_format manually."
191+
)
192+
else:
193+
n_bytes = 2 if data_format == "int16" else 4
194+
logger.debug(
195+
"Using %d bytes per sample from data_format=%s", n_bytes, data_format
196+
)
197+
# Our most reliable way to get the number of samples is to compute it
198+
recomputed_n_samples, rem = divmod(n_data_bytes, (n_channels * n_bytes))
199+
logger.debug("Computed number of samples: %d", recomputed_n_samples)
200+
if recompute_n_samples is None:
201+
recompute_n_samples = n_samples <= 0
202+
if recompute_n_samples:
203+
logger.info(
204+
"Number of samples in header (%d) is not positive, setting "
205+
"recompute_n_samples=True",
206+
n_samples,
207+
)
208+
if recompute_n_samples:
209+
n_samples = recomputed_n_samples
210+
if rem != 0:
211+
warn(
212+
"Inconsistent file information detected, number of data bytes "
213+
f"({n_data_bytes}) not evenly divisible by number of channels "
214+
f"({n_channels}) times number of bytes ({n_bytes})"
215+
)
129216
else:
217+
logger.debug("File size >= 2GB, computing event table offset")
218+
if recompute_n_samples:
219+
raise ValueError(
220+
"Cannot recompute number of samples for files larger than 2GB, set "
221+
"recompute_n_samples=False"
222+
)
130223
if data_format == "auto":
131-
warn(
224+
raise RuntimeError(
132225
"Using `data_format='auto' for a CNT file larger"
133-
" than 2Gb is not granted to work. Please pass"
134-
" 'int16' or 'int32'.` (assuming int32)"
226+
" than 2Gb is not supported, explicitly pass data_format as "
227+
"'int16' or 'int32'"
135228
)
136-
137229
n_bytes = 2 if data_format == "int16" else 4
138-
139-
fid.seek(SETUP_NSAMPLES_OFFSET)
140-
(n_samples,) = np.frombuffer(fid.read(4), dtype="<i4")
141-
142-
fid.seek(SETUP_NCHANNELS_OFFSET)
143-
(n_channels,) = np.frombuffer(fid.read(2), dtype="<u2")
144-
145-
event_table_pos = (
146-
900 + 75 * int(n_channels) + n_bytes * int(n_channels) * int(n_samples)
230+
event_offset = (
231+
_DATA_OFFSET + _CH_SIZE * n_channels + n_bytes * n_channels * n_samples
147232
)
233+
logger.debug("Computed event table offset: %d", event_offset)
148234

149-
fid.seek(fid_origin) # restore the state
150-
return event_table_pos
235+
return n_channels, n_samples, n_bytes, event_offset

0 commit comments

Comments
 (0)