diff --git a/nri-prelude/nri-prelude.cabal b/nri-prelude/nri-prelude.cabal index 954150e7..5050927f 100644 --- a/nri-prelude/nri-prelude.cabal +++ b/nri-prelude/nri-prelude.cabal @@ -1,6 +1,6 @@ cabal-version: 1.18 --- This file has been generated from package.yaml by hpack version 0.37.0. +-- This file has been generated from package.yaml by hpack version 0.38.3. -- -- see: https://github.com/sol/hpack @@ -29,6 +29,10 @@ source-repository head location: https://github.com/NoRedInk/haskell-libraries subdir: nri-prelude +flag parallel-loop-bug + manual: False + default: False + library exposed-modules: Array @@ -114,6 +118,40 @@ library , vector >=0.12.1.2 && <0.14 default-language: Haskell2010 +executable parallel-loop-bug + main-is: Main.hs + other-modules: + Paths_nri_prelude + hs-source-dirs: + scripts/parallel-loop-bug + default-extensions: + DataKinds + DeriveGeneric + FlexibleContexts + FlexibleInstances + GeneralizedNewtypeDeriving + MultiParamTypeClasses + NamedFieldPuns + NoImplicitPrelude + OverloadedStrings + PartialTypeSignatures + ScopedTypeVariables + Strict + TypeOperators + ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wpartial-fields -Wredundant-constraints -Wincomplete-uni-patterns -threaded -rtsopts "-with-rtsopts=-N -T" -O2 + build-depends: + aeson + , base + , bytestring + , nri-prelude + , text + , yaml + default-language: Haskell2010 + if flag(parallel-loop-bug) + buildable: True + else + buildable: False + test-suite tests type: exitcode-stdio-1.0 main-is: Main.hs diff --git a/nri-prelude/package.yaml b/nri-prelude/package.yaml index cfcc3b22..603bb850 100644 --- a/nri-prelude/package.yaml +++ b/nri-prelude/package.yaml @@ -102,3 +102,28 @@ tests: - -fno-warn-type-defaults default-extensions: - ExtendedDefaultRules +flags: + parallel-loop-bug: + default: false + manual: false +executables: + parallel-loop-bug: + source-dirs: scripts/parallel-loop-bug + main: Main.hs + dependencies: + - aeson + - base + - bytestring + - nri-prelude + - text + - yaml + ghc-options: + - -threaded + - -rtsopts "-with-rtsopts=-N -T" + - -O2 + when: + - condition: flag(parallel-loop-bug) + then: + buildable: true + else: + buildable: false diff --git a/nri-prelude/scripts/parallel-loop-bug/Main.hs b/nri-prelude/scripts/parallel-loop-bug/Main.hs new file mode 100644 index 00000000..1e5ccbbd --- /dev/null +++ b/nri-prelude/scripts/parallel-loop-bug/Main.hs @@ -0,0 +1,69 @@ +{-# LANGUAGE DeriveAnyClass #-} + +-- | Reproduction for an intermittent `<>` (NonTermination) crash from +-- `Test.run` under the threaded RTS with multiple capabilities (`+RTS -N`). +-- +-- The suite below is a dozen *ungrouped* tests (so `Test.run` executes them in +-- parallel via `Task.parallel`), each decoding a tiny document to a *distinct* +-- type — i.e. each forces a distinct per-type decoder CAF. Looping the binary +-- on a multi-core box crashes with `parallel-loop-bug: <>` in a small +-- fraction of runs at `-N >= 4`, and never at `-N1`. +-- +-- See scripts/parallel-loop-bug/README.md for how to run it and what we know. +module Main (main) where + +import Data.Aeson (FromJSON) +import qualified Data.ByteString.Char8 as BS +import Data.Text (Text) +import qualified Data.Yaml as Yaml +import qualified Expect +import GHC.Generics (Generic) +import Test (Test, describe, run, test) +import qualified Prelude +import Prelude (Either (..), IO, Int, Maybe, Show) + +-- A dozen DISTINCT record types => a dozen distinct FromJSON decoder CAFs. +-- Distinct field names; each field optional so the one document decodes into +-- every type. The decode is small but real (YAML); the trigger is the *variety +-- of distinct decoders forced concurrently*, not how big each document is. +data R1 = R1 {a1 :: Maybe Int} deriving (Show, Generic, FromJSON) +data R2 = R2 {a2 :: Maybe Int} deriving (Show, Generic, FromJSON) +data R3 = R3 {a3 :: Maybe Int} deriving (Show, Generic, FromJSON) +data R4 = R4 {a4 :: Maybe Int} deriving (Show, Generic, FromJSON) +data R5 = R5 {a5 :: Maybe Int} deriving (Show, Generic, FromJSON) +data R6 = R6 {a6 :: Maybe Int} deriving (Show, Generic, FromJSON) +data R7 = R7 {a7 :: Maybe Int} deriving (Show, Generic, FromJSON) +data R8 = R8 {a8 :: Maybe Int} deriving (Show, Generic, FromJSON) +data R9 = R9 {a9 :: Maybe Int} deriving (Show, Generic, FromJSON) +data R10 = R10 {a10 :: Maybe Int} deriving (Show, Generic, FromJSON) +data R11 = R11 {a11 :: Maybe Int} deriving (Show, Generic, FromJSON) +data R12 = R12 {a12 :: Maybe Int} deriving (Show, Generic, FromJSON) + +doc :: BS.ByteString +doc = "{}\n" + +mk :: (Show a) => Text -> (BS.ByteString -> Either Yaml.ParseException a) -> Test +mk name dec = + test name (\_ -> + case dec doc of + Right x -> Expect.equal (Prelude.length (Prelude.show x)) (Prelude.length (Prelude.show x)) + Left _ -> Expect.fail "decode failed") + +main :: IO () +main = + run + (describe + "parallel-loop-bug" + [ mk "r1" (Yaml.decodeEither' :: BS.ByteString -> Either Yaml.ParseException R1), + mk "r2" (Yaml.decodeEither' :: BS.ByteString -> Either Yaml.ParseException R2), + mk "r3" (Yaml.decodeEither' :: BS.ByteString -> Either Yaml.ParseException R3), + mk "r4" (Yaml.decodeEither' :: BS.ByteString -> Either Yaml.ParseException R4), + mk "r5" (Yaml.decodeEither' :: BS.ByteString -> Either Yaml.ParseException R5), + mk "r6" (Yaml.decodeEither' :: BS.ByteString -> Either Yaml.ParseException R6), + mk "r7" (Yaml.decodeEither' :: BS.ByteString -> Either Yaml.ParseException R7), + mk "r8" (Yaml.decodeEither' :: BS.ByteString -> Either Yaml.ParseException R8), + mk "r9" (Yaml.decodeEither' :: BS.ByteString -> Either Yaml.ParseException R9), + mk "r10" (Yaml.decodeEither' :: BS.ByteString -> Either Yaml.ParseException R10), + mk "r11" (Yaml.decodeEither' :: BS.ByteString -> Either Yaml.ParseException R11), + mk "r12" (Yaml.decodeEither' :: BS.ByteString -> Either Yaml.ParseException R12) + ]) diff --git a/nri-prelude/scripts/parallel-loop-bug/README.md b/nri-prelude/scripts/parallel-loop-bug/README.md new file mode 100644 index 00000000..9e147799 --- /dev/null +++ b/nri-prelude/scripts/parallel-loop-bug/README.md @@ -0,0 +1,51 @@ +# parallel-loop-bug + +`Test.run` runs ungrouped tests in parallel (`Task.parallel` → +`Async.forConcurrently`). Under the threaded RTS with multiple capabilities +(`+RTS -N`), a suite that forces a *variety of distinct decoder CAFs* +concurrently intermittently crashes with an uncaught `<>` +(`NonTermination`) before printing a report — i.e. a CI flake. + +This program is a minimal reproduction: a dozen ungrouped tests, each decoding a +tiny document to a *distinct* type (a distinct `FromJSON` decoder CAF). + +## Run + +The crash is rare, so loop the binary on a multi-core box and count crashes: + +```sh +cabal build parallel-loop-bug -fparallel-loop-bug +BIN=$(cabal list-bin parallel-loop-bug -fparallel-loop-bug) +loops=0 +for i in $(seq 1 10000); do + "$BIN" +RTS -N -RTS >/dev/null 2>err || grep -q '<>' err && loops=$((loops+1)) +done +echo "loop crashes: $loops / 10000" +``` + +You should see a handful of `parallel-loop-bug: <>` crashes at `-N >= 4`, +and zero at `-N1`. + +## What we know + +- ~0.2–0.3% of runs at `-N12` (12-core); **0 at `-N1`/`-N2`**; the rate scales + with `-N`. +- Needs **both** >1 capability **and** a *variety of distinct decoders*. + Identical/duplicated decoders don't trip it, nor do pure `Expect.pass` tests; + ~12 *distinct* YAML decoders do (each decodes a tiny `"{}"`). +- **Appears specific to the YAML/libyaml decode path.** A pure-`aeson` + equivalent did **not** reproduce in any variant we tried (all 0/10000 at + `-N12`): trivial `eitherDecodeStrict' "{}"`, a rich 8-field nested record, and + 20 distinct *recursive* types. So the trigger seems tied to something the + `yaml` package does (it parses via libyaml over FFI, with `unsafePerformIO`), + not to concurrent decoding in general. We did not exhaustively rule out aeson. +- Wrapping the suite in `Test.serialize` avoids it (sequential execution) — the + current workaround for affected suites. +- The exception escapes **uncaught** (outside the per-test bodies that + `Test.run` would catch and report). +- **Masked by profiling**: a `-fprof-late` build run with `+RTS -xc` does not + reproduce it, so we have no Haskell-level stack. That's the signature of a + low-level threaded-RTS black-hole / deadlock-detector race rather than a + cyclic binding (and `-N1` cleanliness rules out a real cycle). +- **Not** GHC #13751 (`<>` under concurrent STM) — fixed in 8.2.1; this is + GHC 9.8.4.