From 08f7cc551b67aa72ec1016f1f7e3bbf5811bb660 Mon Sep 17 00:00:00 2001 From: Steve Hansen Date: Sat, 16 May 2026 13:27:07 +0200 Subject: [PATCH 1/2] perf: prime the row split cache from the engine's multiline loop (#120) The engine's multiline continuation loop computes options.Splitter.Split per iteration to check for unterminated quotes, but the final split was discarded -- the yielded row would lazily Split again on first field access. One redundant Split per row whenever AllowNewLineInEnclosedFieldValues is enabled. Fix: extend IRowFactory.Create with an IList<MemoryText>? rawSplit parameter. Each factory assigns it to the row's rawSplitLine field directly. Engine declares rawSplit at the iteration scope, populates it in the header-init multiline pre-pass and the per-row multiline branch, and passes it through to factory.Create. Non-multiline paths pass null and the row's lazy split path is unchanged. The rawSplitLine field on ReadLine/ReadLineSpan/ReadLineSpanOptimized/ReadLineFromMemory moves from private to internal (the factory structs are siblings, not nested-in, so sibling private access doesn't apply). The classes themselves are already internal sealed; InternalsVisibleTo("Csv.Tests") is in place. No public surface widened. Co-Authored-By: Claude Opus 4.7 (1M context) --- Csv/CsvReader.Engine.cs | 56 ++++++++++++++++++++++++------------- Csv/CsvReader.FromMemory.cs | 2 +- Csv/CsvReader.cs | 6 ++-- 3 files changed, 40 insertions(+), 24 deletions(-) diff --git a/Csv/CsvReader.Engine.cs b/Csv/CsvReader.Engine.cs index e85287a..cafbea2 100644 --- a/Csv/CsvReader.Engine.cs +++ b/Csv/CsvReader.Engine.cs @@ -35,7 +35,7 @@ internal interface IAsyncLineSource internal interface IRowFactory where TRow : class { - TRow Create(MemoryText[] headers, Dictionary headerLookup, int index, MemoryText raw, string? rawString, CsvOptions options); + TRow Create(MemoryText[] headers, Dictionary headerLookup, int index, MemoryText raw, string? rawString, IList? 
rawSplit, CsvOptions options); } internal readonly struct TextReaderLineSource : ILineSource @@ -188,22 +188,28 @@ public MemoryText Concat(MemoryText head, string newLine, MemoryText tail, out s internal readonly struct StringRowFactory : IRowFactory { - public ReadLine Create(MemoryText[] headers, Dictionary headerLookup, int index, MemoryText raw, string? rawString, CsvOptions options) + public ReadLine Create(MemoryText[] headers, Dictionary headerLookup, int index, MemoryText raw, string? rawString, IList? rawSplit, CsvOptions options) { #if NET8_0_OR_GREATER - return new ReadLine(headers, headerLookup, index, rawString ?? raw.ToString(), options); + var row = new ReadLine(headers, headerLookup, index, rawString ?? raw.ToString(), options); #else - return new ReadLine(headers, headerLookup, index, rawString ?? raw, options); + var row = new ReadLine(headers, headerLookup, index, rawString ?? raw, options); #endif + if (rawSplit != null) + row.rawSplitLine = rawSplit; + return row; } } #if NET8_0_OR_GREATER internal readonly struct SpanRowFactory : IRowFactory { - public ReadLineSpan Create(MemoryText[] headers, Dictionary headerLookup, int index, MemoryText raw, string? rawString, CsvOptions options) + public ReadLineSpan Create(MemoryText[] headers, Dictionary headerLookup, int index, MemoryText raw, string? rawString, IList? rawSplit, CsvOptions options) { - return new ReadLineSpan(headers, headerLookup, index, rawString ?? raw.ToString(), options); + var row = new ReadLineSpan(headers, headerLookup, index, rawString ?? raw.ToString(), options); + if (rawSplit != null) + row.rawSplitLine = rawSplit; + return row; } } @@ -216,17 +222,23 @@ public OptimizedRowFactory(CsvMemoryOptions memoryOptions) this.memoryOptions = memoryOptions; } - public ReadLineSpanOptimized Create(MemoryText[] headers, Dictionary headerLookup, int index, MemoryText raw, string? 
rawString, CsvOptions options) + public ReadLineSpanOptimized Create(MemoryText[] headers, Dictionary headerLookup, int index, MemoryText raw, string? rawString, IList? rawSplit, CsvOptions options) { - return new ReadLineSpanOptimized(headers, headerLookup, index, raw, options, memoryOptions); + var row = new ReadLineSpanOptimized(headers, headerLookup, index, raw, options, memoryOptions); + if (rawSplit != null) + row.rawSplitLine = rawSplit; + return row; } } internal readonly struct MemoryRowFactory : IRowFactory { - public ReadLineFromMemory Create(MemoryText[] headers, Dictionary headerLookup, int index, MemoryText raw, string? rawString, CsvOptions options) + public ReadLineFromMemory Create(MemoryText[] headers, Dictionary headerLookup, int index, MemoryText raw, string? rawString, IList? rawSplit, CsvOptions options) { - return new ReadLineFromMemory(headers, headerLookup, index, raw, options); + var row = new ReadLineFromMemory(headers, headerLookup, index, raw, options); + if (rawSplit != null) + row.rawSplitLine = rawSplit; + return row; } } #endif @@ -249,6 +261,8 @@ private static IEnumerable Enumerate(TSource sour if (index <= options.RowsToSkip || options.SkipRow?.Invoke(line, index) == true) continue; + IList? rawSplit = null; + if (headers == null || headerLookup == null) { InitializeOptions(line.AsSpan(), options); @@ -259,15 +273,15 @@ private static IEnumerable Enumerate(TSource sour // case via index == RowsToSkip + 1 and skips its own multiline pass to avoid double-reading. 
if (!skipInitialLine && options.AllowNewLineInEnclosedFieldValues) { - var splitLine = options.Splitter.Split(line, options); + rawSplit = options.Splitter.Split(line, options); - while (splitLine.Count > 0 && CsvLineSplitter.IsUnterminatedQuotedValue(splitLine[splitLine.Count - 1].AsSpan(), options)) + while (rawSplit.Count > 0 && CsvLineSplitter.IsUnterminatedQuotedValue(rawSplit[rawSplit.Count - 1].AsSpan(), options)) { if (!source.TryReadLine(out var nextLine, out _)) break; line = source.Concat(line, options.NewLine, nextLine, out lineString); - splitLine = options.Splitter.Split(line, options); + rawSplit = options.Splitter.Split(line, options); } } @@ -314,7 +328,7 @@ private static IEnumerable Enumerate(TSource sour var isFirstDataLineInHeaderAbsentMode = options.HeaderMode == HeaderMode.HeaderAbsent && index == (options.RowsToSkip + 1); if (options.AllowNewLineInEnclosedFieldValues && !isFirstDataLineInHeaderAbsentMode) { - var rawSplit = options.Splitter.Split(line, options); + rawSplit = options.Splitter.Split(line, options); while (rawSplit.Count > 0 && CsvLineSplitter.IsUnterminatedQuotedValue(rawSplit[rawSplit.Count - 1].AsSpan(), options)) { if (!source.TryReadLine(out var nextLine, out _)) @@ -325,7 +339,7 @@ private static IEnumerable Enumerate(TSource sour } } - yield return factory.Create(headers, headerLookup, index, line, lineString, options); + yield return factory.Create(headers, headerLookup, index, line, lineString, rawSplit, options); } } @@ -352,6 +366,8 @@ private static async IAsyncEnumerable EnumerateAsync? 
rawSplit = null; + if (headers == null || headerLookup == null) { InitializeOptions(line.AsSpan(), options); @@ -362,16 +378,16 @@ private static async IAsyncEnumerable EnumerateAsync 0 && CsvLineSplitter.IsUnterminatedQuotedValue(splitLine[splitLine.Count - 1].AsSpan(), options)) + while (rawSplit.Count > 0 && CsvLineSplitter.IsUnterminatedQuotedValue(rawSplit[rawSplit.Count - 1].AsSpan(), options)) { var (nextOk, nextLine, _) = await source.TryReadLineAsync(ct).ConfigureAwait(false); if (!nextOk) break; line = source.Concat(line, options.NewLine, nextLine, out lineString); - splitLine = options.Splitter.Split(line, options); + rawSplit = options.Splitter.Split(line, options); } } @@ -418,7 +434,7 @@ private static async IAsyncEnumerable EnumerateAsync 0 && CsvLineSplitter.IsUnterminatedQuotedValue(rawSplit[rawSplit.Count - 1].AsSpan(), options)) { var (nextOk, nextLine, _) = await source.TryReadLineAsync(ct).ConfigureAwait(false); @@ -430,7 +446,7 @@ private static async IAsyncEnumerable EnumerateAsync headerLookup; private readonly CsvOptions options; - private IList? rawSplitLine; + internal IList? rawSplitLine; private MemoryText[]? parsedLine; public ReadLineFromMemory(MemoryText[] headers, Dictionary headerLookup, int index, MemoryText raw, CsvOptions options) diff --git a/Csv/CsvReader.cs b/Csv/CsvReader.cs index 5a14f71..b3cb739 100644 --- a/Csv/CsvReader.cs +++ b/Csv/CsvReader.cs @@ -450,7 +450,7 @@ internal sealed class ReadLine : ICsvLine private readonly Dictionary headerLookup; private readonly CsvOptions options; private readonly MemoryText[] headers; - private IList? rawSplitLine; + internal IList? rawSplitLine; internal MemoryText[]? 
parsedLine; public ReadLine(MemoryText[] headers, Dictionary headerLookup, int index, string raw, CsvOptions options) @@ -551,7 +551,7 @@ internal sealed class ReadLineSpan : ICsvLineSpan private readonly Dictionary headerLookup; private readonly CsvOptions options; private readonly MemoryText[] headers; - private IList? rawSplitLine; + internal IList? rawSplitLine; internal MemoryText[]? parsedLine; public ReadLineSpan(MemoryText[] headers, Dictionary headerLookup, int index, string raw, CsvOptions options) @@ -694,7 +694,7 @@ internal sealed class ReadLineSpanOptimized : ICsvLineSpan private readonly CsvMemoryOptions memoryOptions; private readonly ReadOnlyMemory[] headers; private readonly ReadOnlyMemory rawMemory; - private IList>? rawSplitLine; + internal IList>? rawSplitLine; private ReadOnlyMemory[]? parsedLine; public ReadLineSpanOptimized(ReadOnlyMemory[] headers, Dictionary headerLookup, int index, ReadOnlyMemory raw, CsvOptions options, CsvMemoryOptions memoryOptions) From 76f983c33977abdb9507010702feef2c43ca9040 Mon Sep 17 00:00:00 2001 From: Steve Hansen Date: Sat, 16 May 2026 13:41:43 +0200 Subject: [PATCH 2/2] perf: pre-size split list using known header count MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the multiline yield loop, both Enumerate and EnumerateAsync know the column count from `headers` before re-splitting. Thread it through to Splitter.Split as the initial list capacity so we avoid the List grow-and-copy on the hot path. Per-row pre-sizing only — the header-init split (where headers don't exist yet) still uses the default capacity. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Csv/CsvReader.Engine.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Csv/CsvReader.Engine.cs b/Csv/CsvReader.Engine.cs index cafbea2..9a57e0b 100644 --- a/Csv/CsvReader.Engine.cs +++ b/Csv/CsvReader.Engine.cs @@ -328,14 +328,14 @@ private static IEnumerable Enumerate(TSource sour var isFirstDataLineInHeaderAbsentMode = options.HeaderMode == HeaderMode.HeaderAbsent && index == (options.RowsToSkip + 1); if (options.AllowNewLineInEnclosedFieldValues && !isFirstDataLineInHeaderAbsentMode) { - rawSplit = options.Splitter.Split(line, options); + rawSplit = options.Splitter.Split(line, options, headers!.Length); while (rawSplit.Count > 0 && CsvLineSplitter.IsUnterminatedQuotedValue(rawSplit[rawSplit.Count - 1].AsSpan(), options)) { if (!source.TryReadLine(out var nextLine, out _)) break; line = source.Concat(line, options.NewLine, nextLine, out lineString); - rawSplit = options.Splitter.Split(line, options); + rawSplit = options.Splitter.Split(line, options, headers!.Length); } } @@ -434,7 +434,7 @@ private static async IAsyncEnumerable EnumerateAsync 0 && CsvLineSplitter.IsUnterminatedQuotedValue(rawSplit[rawSplit.Count - 1].AsSpan(), options)) { var (nextOk, nextLine, _) = await source.TryReadLineAsync(ct).ConfigureAwait(false); @@ -442,7 +442,7 @@ private static async IAsyncEnumerable EnumerateAsync