mne-tools
diff --git a/doc/changes/v1.12.rst b/doc/changes/v1.12.rst
Lines changed: 2 additions & 0 deletions
diff --git a/mne/annotations.py b/mne/annotations.py
Lines changed: 9 additions & 1 deletion
diff --git a/mne/io/cnt/_utils.py b/mne/io/cnt/_utils.py
Lines changed: 113 additions & 28 deletions
@@ -55,6 +55,7 @@ Bugfixes
   preserve all remaining info fields.
 - Avoid some unnecessary computations when ``n_jobs=None`` is equivalent to ``n_jobs=1``, by `Simon Kern`_. (`#13777 <https://github.com/mne-tools/mne-python/pull/13777>`__)
 - Fix clipped annotations and x-axis label in :meth:`mne.io.Raw.plot`, by `Clemens Brunner`_. (`#13787 <https://github.com/mne-tools/mne-python/pull/13787>`__)
+- Fix bug with reading large CNT files by adding ``recompute_n_samples`` to :func:`mne.io.read_raw_cnt` and making ``data_format`` handling stricter, by `Teon Brooks`_ and `Eric Larson`_. (`#13548 <https://github.com/mne-tools/mne-python/pull/13548>`__)
 
 
 New features
@@ -133,6 +134,7 @@ Authors
 - Stefan Appelhoff
 - Tamas Fehervari+
 - Teemu Taivainen+
+- Teon Brooks
 - Thomas A Caswell+
 - Thomas S. Binns
 - Varun Kasyap Pentamaraju+
 
@@ -1796,7 +1796,12 @@ def _write_annotations_txt(fname, annot):
 
 @fill_doc
 def read_annotations(
-    fname, sfreq="auto", uint16_codec=None, encoding="utf8", ignore_marker_types=False
+    fname,
+    sfreq="auto",
+    uint16_codec=None,
+    encoding="utf8",
+    ignore_marker_types=False,
+    data_format="auto",
 ) -> Annotations:
     r"""Read annotations from a file.
 
@@ -1830,6 +1835,8 @@ def read_annotations(
     ignore_marker_types : bool
         If ``True``, ignore marker types in BrainVision files (and only use their
         descriptions). Defaults to ``False``.
+    data_format : 'auto' | 'int16' | 'int32'
+        Only used for CNT files (default ``'auto'``), see :func:`mne.io.read_raw_cnt`.
 
     Returns
     -------
@@ -1874,6 +1881,7 @@ def read_annotations(
     kwargs = {
         ".vmrk": {"sfreq": sfreq, "ignore_marker_types": ignore_marker_types},
         ".amrk": {"sfreq": sfreq, "ignore_marker_types": ignore_marker_types},
+        ".cnt": {"data_format": data_format},
         ".dat": {"sfreq": sfreq},
         ".cdt": {"sfreq": sfreq},
         ".cef": {"sfreq": sfreq},
 
@@ -10,7 +10,15 @@
 
 import numpy as np
 
-from ...utils import warn
+from ...utils import _check_option, _validate_type, logger, warn
+
+# Offsets from SETUP structure in https://paulbourke.net/dataformats/eeg/
+_NCHANNELS_OFFSET = 370
+_NSAMPLES_OFFSET = 864
+_RATE_OFFSET = 376
+_EVENTTABLEPOS_OFFSET = 886
+_DATA_OFFSET = 900  # Size of the 'SETUP' header.
+_CH_SIZE = 75  # Size of each channel in bytes
 
 
 def _read_teeg(f, teeg_offset):
@@ -105,8 +113,8 @@ def _session_date_2_meas_date(session_date, date_format):
         return (int_part, frac_part)
 
 
-def _compute_robust_event_table_position(fid, data_format="int32"):
-    """Compute `event_table_position`.
+def _compute_robust_sizes(*, fid, data_format, recompute_n_samples):
+    """Compute n_channels, n_samples, n_bytes, and event_table_position.
 
     When recording event_table_position is computed (as accomulation). If the
     file recording is large then this value overflows and ends up pointing
@@ -115,36 +123,113 @@ def _compute_robust_event_table_position(fid, data_format="int32"):
     If the file is smaller than 2G the value in the SETUP is returned.
     Otherwise, the address of the table position is computed from:
     n_samples, n_channels, and the bytes size.
-    """
-    SETUP_NCHANNELS_OFFSET = 370
-    SETUP_NSAMPLES_OFFSET = 864
-    SETUP_EVENTTABLEPOS_OFFSET = 886
-
-    fid_origin = fid.tell()  # save the state
-
-    if fid.seek(0, SEEK_END) < 2e9:
-        fid.seek(SETUP_EVENTTABLEPOS_OFFSET)
-        (event_table_pos,) = np.frombuffer(fid.read(4), dtype="<i4")
 
+    Reference: https://paulbourke.net/dataformats/eeg/
+    The header has a field for the number of samples, but it is not always
+    reliable, so it can be recomputed from the size of the data section.
+    """
+    _check_option("data_format", data_format, ["auto", "int16", "int32"])
+    # Read the number of channels and samples from the header
+    fid.seek(_NCHANNELS_OFFSET)
+    n_channels = int(np.fromfile(fid, dtype="<u2", count=1).item())
+    logger.debug("Number of channels: %d", n_channels)
+    fid.seek(_NSAMPLES_OFFSET)
+    n_samples = int(np.frombuffer(fid.read(4), dtype="<i4").item())  # may be unreliable
+    logger.debug("Header number of samples: %d", n_samples)
+    file_size = fid.seek(0, SEEK_END)
+    workaround = "pass data_format='int16' or 'int32' explicitly"
+    samples_offset = _DATA_OFFSET + _CH_SIZE * n_channels
+    _validate_type(recompute_n_samples, (bool, None), "recompute_n_samples")
+    if file_size < 2e9:
+        logger.debug("File size < 2GB, using header values")
+        fid.seek(_EVENTTABLEPOS_OFFSET)
+        event_offset = int(np.frombuffer(fid.read(4), dtype="<i4").item())
+        logger.debug("Event table offset from header: %d", event_offset)
+        if event_offset > file_size:
+            problem = (
+                f"Event table offset from header ({event_offset}) is larger than file "
+                f"size ({file_size})"
+            )
+            if data_format == "auto":
+                raise RuntimeError(
+                    f"{problem}, cannot automatically compute data format, {workaround}"
+                )
+            warn(
+                f"Event table offset from header ({event_offset}) is larger than file "
+                f"size ({file_size}), recomputing event table offset."
+            )
+            n_bytes = 2 if data_format == "int16" else 4
+            event_offset = samples_offset + n_samples * n_channels * n_bytes
+        n_data_bytes = event_offset - samples_offset
+        if data_format == "auto":
+            n_bytes_per_chan, rem = divmod(n_data_bytes, n_channels)
+            why = ""
+            # starting assumption is 16-bit ints
+            n_bytes = 2
+            if rem != 0:
+                why = (
+                    f"number of data bytes {n_data_bytes} is not evenly divisible by "
+                    f"{n_channels=}"
+                )
+            elif n_samples == 0:
+                why = "number of samples (according to header) is 0"
+            else:
+                # we know `n_channels` divides evenly into `n_data_bytes`, and header
+                # said `n_samples` was non-zero, so try to infer `n_bytes`:
+                n_bytes, rem = divmod(n_bytes_per_chan, n_samples)
+                if rem != 0 or n_bytes not in [2, 4]:
+                    why = (
+                        f"number of bytes per channel {n_bytes_per_chan} is not evenly "
+                        f"divisible by {n_samples=} or does not result in 2 or 4 bytes "
+                        f"per sample ({n_bytes=})"
+                    )
+                logger.debug("Inferred data format with %d bytes per sample", n_bytes)
+            if why:
+                raise RuntimeError(
+                    "Could not automatically compute number of bytes per sample as the "
+                    f"{why}.  set data_format manually."
+                )
+        else:
+            n_bytes = 2 if data_format == "int16" else 4
+        logger.debug(
+            "Using %d bytes per sample from data_format=%s", n_bytes, data_format
+        )
+        # Our most reliable way to get the number of samples is to compute it
+        recomputed_n_samples, rem = divmod(n_data_bytes, (n_channels * n_bytes))
+        logger.debug("Computed number of samples: %d", recomputed_n_samples)
+        if recompute_n_samples is None:
+            recompute_n_samples = n_samples <= 0
+            if recompute_n_samples:
+                logger.info(
+                    "Number of samples in header (%d) is not positive, setting "
+                    "recompute_n_samples=True",
+                    n_samples,
+                )
+        if recompute_n_samples:
+            n_samples = recomputed_n_samples
+            if rem != 0:
+                warn(
+                    "Inconsistent file information detected, number of data bytes "
+                    f"({n_data_bytes}) not evenly divisible by number of channels "
+                    f"({n_channels}) times number of bytes ({n_bytes})"
+                )
     else:
+        logger.debug("File size >= 2GB, computing event table offset")
+        if recompute_n_samples:
+            raise ValueError(
+                "Cannot recompute number of samples for files larger than 2GB, set "
+                "recompute_n_samples=False"
+            )
         if data_format == "auto":
-            warn(
+            raise RuntimeError(
                 "Using `data_format='auto' for a CNT file larger"
-                " than 2Gb is not granted to work. Please pass"
-                " 'int16' or 'int32'.` (assuming int32)"
+                " than 2Gb is not supported, explicitly pass data_format as "
+                "'int16' or 'int32'"
             )
-
         n_bytes = 2 if data_format == "int16" else 4
-
-        fid.seek(SETUP_NSAMPLES_OFFSET)
-        (n_samples,) = np.frombuffer(fid.read(4), dtype="<i4")
-
-        fid.seek(SETUP_NCHANNELS_OFFSET)
-        (n_channels,) = np.frombuffer(fid.read(2), dtype="<u2")
-
-        event_table_pos = (
-            900 + 75 * int(n_channels) + n_bytes * int(n_channels) * int(n_samples)
+        event_offset = (
+            _DATA_OFFSET + _CH_SIZE * n_channels + n_bytes * n_channels * n_samples
         )
+        logger.debug("Computed event table offset: %d", event_offset)
 
-    fid.seek(fid_origin)  # restore the state
-    return event_table_pos
+    return n_channels, n_samples, n_bytes, event_offset