From 9ca04e6744fa03fb198215b67740b829767acdb7 Mon Sep 17 00:00:00 2001 From: Juliano Solanho Date: Thu, 4 Jun 2026 10:36:37 -0300 Subject: [PATCH 1/2] Add parallel-loop-bug repro script A minimal reproduction for an intermittent `<>` (NonTermination) crash from Test.run under -threaded with multiple capabilities (+RTS -N): a dozen ungrouped tests, each decoding a tiny document to a distinct type (a distinct decoder CAF), run in parallel by the test runner. Crashes in ~0.3% of runs at -N>=4 and never at -N1. Flag-gated behind `parallel-loop-bug` (off by default), so normal builds are unaffected. Co-Authored-By: Claude Opus 4.8 (1M context) --- nri-prelude/nri-prelude.cabal | 40 ++++++++++- nri-prelude/package.yaml | 25 +++++++ nri-prelude/scripts/parallel-loop-bug/Main.hs | 69 +++++++++++++++++++ .../scripts/parallel-loop-bug/README.md | 50 ++++++++++++++ 4 files changed, 183 insertions(+), 1 deletion(-) create mode 100644 nri-prelude/scripts/parallel-loop-bug/Main.hs create mode 100644 nri-prelude/scripts/parallel-loop-bug/README.md diff --git a/nri-prelude/nri-prelude.cabal b/nri-prelude/nri-prelude.cabal index 954150e7..5050927f 100644 --- a/nri-prelude/nri-prelude.cabal +++ b/nri-prelude/nri-prelude.cabal @@ -1,6 +1,6 @@ cabal-version: 1.18 --- This file has been generated from package.yaml by hpack version 0.37.0. +-- This file has been generated from package.yaml by hpack version 0.38.3. -- -- see: https://github.com/sol/hpack @@ -29,6 +29,10 @@ source-repository head location: https://github.com/NoRedInk/haskell-libraries subdir: nri-prelude +flag parallel-loop-bug + manual: False + default: False + library exposed-modules: Array @@ -114,6 +118,40 @@ library , vector >=0.12.1.2 && <0.14 default-language: Haskell2010 +executable parallel-loop-bug + main-is: Main.hs + other-modules: + Paths_nri_prelude + hs-source-dirs: + scripts/parallel-loop-bug + default-extensions: + DataKinds + DeriveGeneric + FlexibleContexts + FlexibleInstances + GeneralizedNewtypeDeriving + MultiParamTypeClasses + NamedFieldPuns + NoImplicitPrelude + OverloadedStrings + PartialTypeSignatures + ScopedTypeVariables + Strict + TypeOperators + ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wpartial-fields -Wredundant-constraints -Wincomplete-uni-patterns -threaded -rtsopts "-with-rtsopts=-N -T" -O2 + build-depends: + aeson + , base + , bytestring + , nri-prelude + , text + , yaml + default-language: Haskell2010 + if flag(parallel-loop-bug) + buildable: True + else + buildable: False + test-suite tests type: exitcode-stdio-1.0 main-is: Main.hs diff --git a/nri-prelude/package.yaml b/nri-prelude/package.yaml index cfcc3b22..603bb850 100644 --- a/nri-prelude/package.yaml +++ b/nri-prelude/package.yaml @@ -102,3 +102,28 @@ tests: - -fno-warn-type-defaults default-extensions: - ExtendedDefaultRules +flags: + parallel-loop-bug: + default: false + manual: false +executables: + parallel-loop-bug: + source-dirs: scripts/parallel-loop-bug + main: Main.hs + dependencies: + - aeson + - base + - bytestring + - nri-prelude + - text + - yaml + ghc-options: + - -threaded + - -rtsopts "-with-rtsopts=-N -T" + - -O2 + when: + - condition: flag(parallel-loop-bug) + then: + buildable: true + else: + buildable: false diff --git a/nri-prelude/scripts/parallel-loop-bug/Main.hs b/nri-prelude/scripts/parallel-loop-bug/Main.hs new file mode 100644 index 00000000..1e5ccbbd --- /dev/null +++ b/nri-prelude/scripts/parallel-loop-bug/Main.hs @@ -0,0 +1,69 @@ +{-# LANGUAGE DeriveAnyClass #-} + +-- | Reproduction for an intermittent `<>` (NonTermination) crash from +-- `Test.run` under the threaded RTS with multiple capabilities (`+RTS -N`). +-- +-- The suite below is a dozen *ungrouped* tests (so `Test.run` executes them in +-- parallel via `Task.parallel`), each decoding a tiny document to a *distinct* +-- type — i.e. each forces a distinct per-type decoder CAF. Looping the binary +-- on a multi-core box crashes with `parallel-loop-bug: <>` in a small +-- fraction of runs at `-N >= 4`, and never at `-N1`. +-- +-- See scripts/parallel-loop-bug/README.md for how to run it and what we know. +module Main (main) where + +import Data.Aeson (FromJSON) +import qualified Data.ByteString.Char8 as BS +import Data.Text (Text) +import qualified Data.Yaml as Yaml +import qualified Expect +import GHC.Generics (Generic) +import Test (Test, describe, run, test) +import qualified Prelude +import Prelude (Either (..), IO, Int, Maybe, Show) + +-- A dozen DISTINCT record types => a dozen distinct FromJSON decoder CAFs. +-- Distinct field names; each field optional so the one document decodes into +-- every type. The decode is small but real (YAML); the trigger is the *variety +-- of distinct decoders forced concurrently*, not how big each document is. +data R1 = R1 {a1 :: Maybe Int} deriving (Show, Generic, FromJSON) +data R2 = R2 {a2 :: Maybe Int} deriving (Show, Generic, FromJSON) +data R3 = R3 {a3 :: Maybe Int} deriving (Show, Generic, FromJSON) +data R4 = R4 {a4 :: Maybe Int} deriving (Show, Generic, FromJSON) +data R5 = R5 {a5 :: Maybe Int} deriving (Show, Generic, FromJSON) +data R6 = R6 {a6 :: Maybe Int} deriving (Show, Generic, FromJSON) +data R7 = R7 {a7 :: Maybe Int} deriving (Show, Generic, FromJSON) +data R8 = R8 {a8 :: Maybe Int} deriving (Show, Generic, FromJSON) +data R9 = R9 {a9 :: Maybe Int} deriving (Show, Generic, FromJSON) +data R10 = R10 {a10 :: Maybe Int} deriving (Show, Generic, FromJSON) +data R11 = R11 {a11 :: Maybe Int} deriving (Show, Generic, FromJSON) +data R12 = R12 {a12 :: Maybe Int} deriving (Show, Generic, FromJSON) + +doc :: BS.ByteString +doc = "{}\n" + +mk :: (Show a) => Text -> (BS.ByteString -> Either Yaml.ParseException a) -> Test +mk name dec = + test name (\_ -> + case dec doc of + Right x -> Expect.equal (Prelude.length (Prelude.show x)) (Prelude.length (Prelude.show x)) + Left _ -> Expect.fail "decode failed") + +main :: IO () +main = + run + (describe + "parallel-loop-bug" + [ mk "r1" (Yaml.decodeEither' :: BS.ByteString -> Either Yaml.ParseException R1), + mk "r2" (Yaml.decodeEither' :: BS.ByteString -> Either Yaml.ParseException R2), + mk "r3" (Yaml.decodeEither' :: BS.ByteString -> Either Yaml.ParseException R3), + mk "r4" (Yaml.decodeEither' :: BS.ByteString -> Either Yaml.ParseException R4), + mk "r5" (Yaml.decodeEither' :: BS.ByteString -> Either Yaml.ParseException R5), + mk "r6" (Yaml.decodeEither' :: BS.ByteString -> Either Yaml.ParseException R6), + mk "r7" (Yaml.decodeEither' :: BS.ByteString -> Either Yaml.ParseException R7), + mk "r8" (Yaml.decodeEither' :: BS.ByteString -> Either Yaml.ParseException R8), + mk "r9" (Yaml.decodeEither' :: BS.ByteString -> Either Yaml.ParseException R9), + mk "r10" (Yaml.decodeEither' :: BS.ByteString -> Either Yaml.ParseException R10), + mk "r11" (Yaml.decodeEither' :: BS.ByteString -> Either Yaml.ParseException R11), + mk "r12" (Yaml.decodeEither' :: BS.ByteString -> Either Yaml.ParseException R12) + ]) diff --git a/nri-prelude/scripts/parallel-loop-bug/README.md b/nri-prelude/scripts/parallel-loop-bug/README.md new file mode 100644 index 00000000..ade0e42e --- /dev/null +++ b/nri-prelude/scripts/parallel-loop-bug/README.md @@ -0,0 +1,50 @@ +# parallel-loop-bug + +`Test.run` runs ungrouped tests in parallel (`Task.parallel` → +`Async.forConcurrently`). Under the threaded RTS with multiple capabilities +(`+RTS -N`), a suite that forces a *variety of distinct decoder CAFs* +concurrently intermittently crashes with an uncaught `<>` +(`NonTermination`) before printing a report — i.e. a CI flake. + +This program is a minimal reproduction: a dozen ungrouped tests, each decoding a +tiny document to a *distinct* type (a distinct `FromJSON` decoder CAF). + +## Run + +The crash is rare, so loop the binary on a multi-core box and count crashes: + +```sh +cabal build parallel-loop-bug -fparallel-loop-bug +BIN=$(cabal list-bin parallel-loop-bug -fparallel-loop-bug) +loops=0 +for i in $(seq 1 10000); do + "$BIN" +RTS -N -RTS >/dev/null 2>err || grep -q '<>' err && loops=$((loops+1)) +done +echo "loop crashes: $loops / 10000" +``` + +You should see a handful of `parallel-loop-bug: <>` crashes at `-N >= 4`, +and zero at `-N1`. + +## What we know + +- ~0.2–0.3% of runs at `-N12` (12-core); **0 at `-N1`/`-N2`**; the rate scales + with `-N`. +- Needs **both** >1 capability **and** a *variety of distinct decoders, each + doing a small but real parse*. Identical/duplicated decoders don't trip it, + nor do pure `Expect.pass` tests; ~12 *distinct* ones do. The document can be + tiny (each test decodes `"{}"`), but the parse must do real work: a dozen + distinct YAML decoders reproduce it, whereas a dozen distinct *aeson* + `eitherDecodeStrict' "{}"` decodes did **not** (0/10000) — that decode is + essentially free. So the driver is the number of distinct decoder CAFs forced + concurrently; we didn't map exactly which codecs/sizes qualify. +- Wrapping the suite in `Test.serialize` avoids it (sequential execution) — the + current workaround for affected suites. +- The exception escapes **uncaught** (outside the per-test bodies that + `Test.run` would catch and report). +- **Masked by profiling**: a `-fprof-late` build run with `+RTS -xc` does not + reproduce it, so we have no Haskell-level stack. That's the signature of a + low-level threaded-RTS black-hole / deadlock-detector race rather than a + cyclic binding (and `-N1` cleanliness rules out a real cycle). +- **Not** GHC #13751 (`<>` under concurrent STM) — fixed in 8.2.1; this is + GHC 9.8.4. From e51ee651318cf11f0a6a249c27d9eedafde706c8 Mon Sep 17 00:00:00 2001 From: Juliano Solanho Date: Thu, 4 Jun 2026 14:03:36 -0300 Subject: [PATCH 2/2] Note: pure-aeson equivalent does not reproduce MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tried aeson eitherDecodeStrict' (trivial "{}", a rich nested record, and 20 distinct recursive types) — all 0/10000 at -N12, vs ~0.27% for the YAML version. The trigger appears specific to the libyaml-backed YAML decode path, not concurrent decoding in general. Co-Authored-By: Claude Opus 4.8 (1M context) --- nri-prelude/scripts/parallel-loop-bug/README.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/nri-prelude/scripts/parallel-loop-bug/README.md b/nri-prelude/scripts/parallel-loop-bug/README.md index ade0e42e..9e147799 100644 --- a/nri-prelude/scripts/parallel-loop-bug/README.md +++ b/nri-prelude/scripts/parallel-loop-bug/README.md @@ -30,14 +30,15 @@ and zero at `-N1`. - ~0.2–0.3% of runs at `-N12` (12-core); **0 at `-N1`/`-N2`**; the rate scales with `-N`. -- Needs **both** >1 capability **and** a *variety of distinct decoders, each - doing a small but real parse*. Identical/duplicated decoders don't trip it, - nor do pure `Expect.pass` tests; ~12 *distinct* ones do. The document can be - tiny (each test decodes `"{}"`), but the parse must do real work: a dozen - distinct YAML decoders reproduce it, whereas a dozen distinct *aeson* - `eitherDecodeStrict' "{}"` decodes did **not** (0/10000) — that decode is - essentially free. So the driver is the number of distinct decoder CAFs forced - concurrently; we didn't map exactly which codecs/sizes qualify. +- Needs **both** >1 capability **and** a *variety of distinct decoders*. + Identical/duplicated decoders don't trip it, nor do pure `Expect.pass` tests; + ~12 *distinct* YAML decoders do (each decodes a tiny `"{}"`). +- **Appears specific to the YAML/libyaml decode path.** A pure-`aeson` + equivalent did **not** reproduce in any variant we tried (all 0/10000 at + `-N12`): trivial `eitherDecodeStrict' "{}"`, a rich 8-field nested record, and + 20 distinct *recursive* types. So the trigger seems tied to something the + `yaml` package does (it parses via libyaml over FFI, with `unsafePerformIO`), + not to concurrent decoding in general. We did not exhaustively rule out aeson. - Wrapping the suite in `Test.serialize` avoids it (sequential execution) — the current workaround for affected suites. - The exception escapes **uncaught** (outside the per-test bodies that