From c0c45632d77c07e0f818602fbca8b41c6c7744ab Mon Sep 17 00:00:00 2001
From: Steve Hansen
Date: Sat, 30 May 2026 13:27:46 +0200
Subject: [PATCH] refactor: align internal terminology with ubiquitous language
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Internal/private identifiers and XML-doc comments now follow a single
documented vocabulary (record / physical line / field / value / quoting).
No public API changes — every rename is internal or doc-only.

- Reader record classes: rawSplitLine->rawFields, parsedLine->parsedValues,
  and the private `Line` property (which returned the parsed field array)
  ->ParsedValues.
- Writer: fix the escape-vs-quote naming (FixedEscapeChars->QuoteTriggerChars,
  needsGeneralEscape->needsQuoting, the wrap-the-field `escape` flag->mustQuote).
  Kept cell/WriteCell/WriteRow for consistency with the public writer API.
- Reword misleading public XML docs (ColumnCount counts fields, Read* yields
  records, int indexers take a field index, WriteCell does quoting and
  escaping).
- Add UBIQUITOUS_LANGUAGE.md: the glossary, blast-radius analysis, and a
  vNext rename-target list for the frozen public names.

Builds on netstandard2.0/net8.0/net9.0; all 179 tests pass.

Co-Authored-By: Claude Opus 4.8 (1M context) --- Csv/CsvBufferWriter.cs | 10 +-- Csv/CsvLineSplitter.cs | 4 +- Csv/CsvOptions.cs | 4 +- Csv/CsvReader.Engine.cs | 8 +- Csv/CsvReader.FromMemory.cs | 30 +++---- Csv/CsvReader.cs | 132 +++++++++++++-------------- Csv/CsvWriter.cs | 32 +++---- Csv/ICsvLine.cs | 12 +-- Csv/ICsvLineFromMemory.cs | 28 +++--- Csv/ICsvLineSpan.cs | 30 +++---- UBIQUITOUS_LANGUAGE.md | 174 ++++++++++++++++++++++++++++++++++++ 11 files changed, 319 insertions(+), 145 deletions(-) create mode 100644 UBIQUITOUS_LANGUAGE.md diff --git a/Csv/CsvBufferWriter.cs b/Csv/CsvBufferWriter.cs index 5338e5c..fdd14e1 100644 --- a/Csv/CsvBufferWriter.cs +++ b/Csv/CsvBufferWriter.cs @@ -14,10 +14,10 @@ namespace Csv public sealed class CsvBufferWriter : IBufferWriter, IDisposable { // The separator is per-call so it can't be baked into a single cached SearchValues. - // Keep the fixed escape chars cached and check the separator with a separate Contains. + // Keep the fixed quote-trigger chars cached and check the separator with a separate Contains. // Without this caching, MemoryExtensions.IndexOfAny(ReadOnlySpan, ReadOnlySpan) builds // a fresh SearchValues on the heap every call (~72 bytes per WriteCell). - private static readonly SearchValues FixedEscapeChars = SearchValues.Create("'\n\r"); + private static readonly SearchValues QuoteTriggerChars = SearchValues.Create("'\n\r"); private readonly CsvMemoryOptions _options; private readonly List<(char[] buffer, int written, bool isPooled)> _buffers; @@ -96,7 +96,7 @@ public void WriteRow(ReadOnlySpan> cells, char separator = } /// - /// Writes a single cell with proper CSV escaping. + /// Writes a single cell with proper CSV quoting and escaping. /// /// The cell content. /// The column separator character. 
@@ -104,9 +104,9 @@ public void WriteRow(ReadOnlySpan> cells, char separator = public void WriteCell(ReadOnlySpan cell, char separator = ',') { var needsQuoteEscape = cell.Contains('"'); - var needsGeneralEscape = cell.Contains(separator) || cell.IndexOfAny(FixedEscapeChars) >= 0; + var needsQuoting = cell.Contains(separator) || cell.IndexOfAny(QuoteTriggerChars) >= 0; - if (needsQuoteEscape || needsGeneralEscape) + if (needsQuoteEscape || needsQuoting) { Write('"'); diff --git a/Csv/CsvLineSplitter.cs b/Csv/CsvLineSplitter.cs index 9a39c86..5efd07c 100644 --- a/Csv/CsvLineSplitter.cs +++ b/Csv/CsvLineSplitter.cs @@ -11,7 +11,7 @@ namespace Csv { /// - /// Splits a single line (multiline handling is done independently) into multiple parts + /// Splits a single record's text (multiline handling is done independently) into its fields /// internal sealed class CsvLineSplitter { @@ -69,7 +69,7 @@ public static bool IsUnterminatedQuotedValue(SpanText value, CsvOptions options) // A quote may open a quoted field either at the literal field start or, when // TrimData is set, after any run of leading whitespace. This matches the // user-visible promise of TrimData: surrounding whitespace doesn't break the - // structure of a quoted cell (issue #71). + // structure of a quoted field (issue #71). private static bool IsAtFieldOpen(SpanText span, int start, int i, bool trimData) { if (i == start) diff --git a/Csv/CsvOptions.cs b/Csv/CsvOptions.cs index c2963ac..76f8748 100644 --- a/Csv/CsvOptions.cs +++ b/Csv/CsvOptions.cs @@ -46,7 +46,7 @@ public sealed class CsvOptions public HeaderMode HeaderMode { get; set; } = HeaderMode.HeaderPresent; /// - /// Gets or sets whether a row should be validated immediately that the column count matches the header count, defaults to false. + /// Gets or sets whether each row is validated immediately so that its field count matches the header count, defaults to false. 
/// public bool ValidateColumnCount { get; set; } @@ -56,7 +56,7 @@ public sealed class CsvOptions public bool ReturnEmptyForMissingColumn { get; set; } /// - /// Can be used to use multiple names for a single column. (e.g. to allow "CategoryName", "Category Name", "Category-Name") + /// Can be used to map multiple names to a single header/column. (e.g. to allow "CategoryName", "Category Name", "Category-Name") /// /// /// A group with no matches is ignored. diff --git a/Csv/CsvReader.Engine.cs b/Csv/CsvReader.Engine.cs index 9a57e0b..502ddc1 100644 --- a/Csv/CsvReader.Engine.cs +++ b/Csv/CsvReader.Engine.cs @@ -196,7 +196,7 @@ public ReadLine Create(MemoryText[] headers, Dictionary headerLooku var row = new ReadLine(headers, headerLookup, index, rawString ?? raw, options); #endif if (rawSplit != null) - row.rawSplitLine = rawSplit; + row.rawFields = rawSplit; return row; } } @@ -208,7 +208,7 @@ public ReadLineSpan Create(MemoryText[] headers, Dictionary headerL { var row = new ReadLineSpan(headers, headerLookup, index, rawString ?? raw.ToString(), options); if (rawSplit != null) - row.rawSplitLine = rawSplit; + row.rawFields = rawSplit; return row; } } @@ -226,7 +226,7 @@ public ReadLineSpanOptimized Create(MemoryText[] headers, Dictionary h { var row = new ReadLineFromMemory(headers, headerLookup, index, raw, options); if (rawSplit != null) - row.rawSplitLine = rawSplit; + row.rawFields = rawSplit; return row; } } diff --git a/Csv/CsvReader.FromMemory.cs b/Csv/CsvReader.FromMemory.cs index 400f6eb..b39f4d0 100644 --- a/Csv/CsvReader.FromMemory.cs +++ b/Csv/CsvReader.FromMemory.cs @@ -12,7 +12,7 @@ namespace Csv partial class CsvReader { /// - /// Reads the lines from the csv string. + /// Reads the records from the csv string. /// /// The csv string to read the data from. /// The optional options to use when reading. 
@@ -23,8 +23,8 @@ internal sealed class ReadLineFromMemory : ICsvLineFromMemory { private readonly Dictionary headerLookup; private readonly CsvOptions options; - internal IList? rawSplitLine; - private MemoryText[]? parsedLine; + internal IList? rawFields; + private MemoryText[]? parsedValues; public ReadLineFromMemory(MemoryText[] headers, Dictionary headerLookup, int index, MemoryText raw, CsvOptions options) { @@ -41,7 +41,7 @@ public ReadLineFromMemory(MemoryText[] headers, Dictionary headerLo public int Index { get; } - public int ColumnCount => Line.Length; + public int ColumnCount => ParsedValues.Length; public bool HasColumn(string name) => headerLookup.ContainsKey(name); @@ -50,28 +50,28 @@ public bool LineHasColumn(string name) if (!headerLookup.TryGetValue(name, out var index)) return false; - return RawSplitLine.Count > index; + return RawFields.Count > index; } - internal IList RawSplitLine => rawSplitLine ??= SplitLine(Raw, options); + internal IList RawFields => rawFields ??= SplitLine(Raw, options); - public MemoryText[] Values => Line; + public MemoryText[] Values => ParsedValues; - private MemoryText[] Line + private MemoryText[] ParsedValues { get { - if (parsedLine == null) + if (parsedValues == null) { - var raw = RawSplitLine; + var raw = RawFields; if (options.ValidateColumnCount && raw.Count != Headers.Length) throw new InvalidOperationException($"Expected {Headers.Length}, got {raw.Count} columns."); - parsedLine = Trim(raw, options); + parsedValues = Trim(raw, options); } - return parsedLine; + return parsedValues; } } @@ -89,16 +89,16 @@ MemoryText ICsvLineFromMemory.this[string name] try { - return Line[index]; + return ParsedValues[index]; } catch (IndexOutOfRangeException) { - throw new InvalidOperationException($"Invalid row, missing {name} header, expected {Headers.Length} columns, got {Line.Length} columns."); + throw new InvalidOperationException($"Invalid row, missing {name} header, expected {Headers.Length} columns, got 
{ParsedValues.Length} columns."); } } } - MemoryText ICsvLineFromMemory.this[int index] => Line[index]; + MemoryText ICsvLineFromMemory.this[int index] => ParsedValues[index]; public override string ToString() { diff --git a/Csv/CsvReader.cs b/Csv/CsvReader.cs index b3cb739..bdc84c4 100644 --- a/Csv/CsvReader.cs +++ b/Csv/CsvReader.cs @@ -19,7 +19,7 @@ namespace Csv public static partial class CsvReader { /// - /// Reads the lines from the reader. + /// Reads the records from the reader. /// /// The text reader to read the data from. /// The optional options to use when reading. @@ -32,7 +32,7 @@ public static IEnumerable Read(TextReader reader, CsvOptions? options } /// - /// Reads the lines from the stream. + /// Reads the records from the stream. /// /// The stream to read the data from. /// The optional options to use when reading. @@ -45,7 +45,7 @@ public static IEnumerable ReadFromStream(Stream stream, CsvOptions? op } /// - /// Reads the lines from the csv string. + /// Reads the records from the csv string. /// /// The csv string to read the data from. /// The optional options to use when reading. @@ -60,7 +60,7 @@ public static IEnumerable ReadFromText(string csv, CsvOptions? options #if NET8_0_OR_GREATER /// - /// Reads the lines from the reader with enhanced Span/Memory support. + /// Reads the records from the reader with enhanced Span/Memory support. /// /// The text reader to read the data from. /// The optional options to use when reading. @@ -73,7 +73,7 @@ public static IEnumerable ReadAsSpan(TextReader reader, CsvOptions } /// - /// Reads the lines from the stream with enhanced Span/Memory support. + /// Reads the records from the stream with enhanced Span/Memory support. /// /// The stream to read the data from. /// The optional options to use when reading. @@ -86,7 +86,7 @@ public static IEnumerable ReadFromStreamAsSpan(Stream stream, CsvO } /// - /// Reads the lines from the csv string with enhanced Span/Memory support. 
+ /// Reads the records from the csv string with enhanced Span/Memory support. /// /// The csv string to read the data from. /// The optional options to use when reading. @@ -125,7 +125,7 @@ private static IEnumerable ReadSpanImpl(TextReader reader, CsvOpti /// The CSV data as ReadOnlyMemory. /// The CSV parsing options. /// The memory management options. - /// An enumerable of CSV lines with memory optimization. + /// An enumerable of CSV records with memory optimization. public static IEnumerable ReadFromMemoryOptimized(ReadOnlyMemory csv, CsvOptions? options = null, CsvMemoryOptions? memoryOptions = null) { options ??= new CsvOptions(); @@ -185,7 +185,7 @@ private static IEnumerable ReadImpl(TextReader reader, CsvOptions? opt #if NET8_0_OR_GREATER /// - /// Reads the lines from the reader. + /// Reads the records from the reader. /// /// The text reader to read the data from. /// The optional options to use when reading. @@ -198,7 +198,7 @@ public static IAsyncEnumerable ReadAsync(TextReader reader, CsvOptions } /// - /// Reads the lines from the stream. + /// Reads the records from the stream. /// /// The stream to read the data from. /// The optional options to use when reading. @@ -218,7 +218,7 @@ static async IAsyncEnumerable Impl(Stream stream, CsvOptions? options) } /// - /// Reads the lines from the csv string. + /// Reads the records from the csv string. /// /// The csv string to read the data from. /// The optional options to use when reading. 
@@ -442,7 +442,7 @@ ICsvLine SubLine(ICsvLine line, int start, int length) headers = line.Headers.Skip(start).Take(length).Select(x=>x.AsMemory()).ToArray(); MemoryText[] values = headers.Select(x => line[x.ToString()].AsMemory()).ToArray(); Dictionary map = Enumerable.Range(0, headers.Length).ToDictionary(x => headers[x].ToString()); - return new ReadLine(headers, map, line.Index, line.Raw, new CsvOptions()) { parsedLine = values }; + return new ReadLine(headers, map, line.Index, line.Raw, new CsvOptions()) { parsedValues = values }; } } internal sealed class ReadLine : ICsvLine @@ -450,8 +450,8 @@ internal sealed class ReadLine : ICsvLine private readonly Dictionary headerLookup; private readonly CsvOptions options; private readonly MemoryText[] headers; - internal IList? rawSplitLine; - internal MemoryText[]? parsedLine; + internal IList? rawFields; + internal MemoryText[]? parsedValues; public ReadLine(MemoryText[] headers, Dictionary headerLookup, int index, string raw, CsvOptions options) { @@ -468,7 +468,7 @@ public ReadLine(MemoryText[] headers, Dictionary headerLookup, int public int Index { get; } - public int ColumnCount => Line.Length; + public int ColumnCount => ParsedValues.Length; public bool HasColumn(string name) => headerLookup.ContainsKey(name); @@ -477,39 +477,39 @@ public bool LineHasColumn(string name) if (!headerLookup.TryGetValue(name, out var index)) return false; - return RawSplitLine.Count > index; + return RawFields.Count > index; } - internal IList RawSplitLine + internal IList RawFields { get { #if NET8_0_OR_GREATER - rawSplitLine ??= SplitLine(Raw.AsMemory(), options, headers.Length); + rawFields ??= SplitLine(Raw.AsMemory(), options, headers.Length); #else - rawSplitLine ??= SplitLine(Raw, options, headers.Length); + rawFields ??= SplitLine(Raw, options, headers.Length); #endif - return rawSplitLine; + return rawFields; } } - public string[] Values => Line.Select(it => it.AsString()).ToArray(); + public string[] Values => 
ParsedValues.Select(it => it.AsString()).ToArray(); - private MemoryText[] Line + private MemoryText[] ParsedValues { get { - if (parsedLine == null) + if (parsedValues == null) { - var raw = RawSplitLine; + var raw = RawFields; if (options.ValidateColumnCount && raw.Count != Headers.Length) throw new InvalidOperationException($"Expected {Headers.Length}, got {raw.Count} columns."); - parsedLine = Trim(raw, options); + parsedValues = Trim(raw, options); } - return parsedLine; + return parsedValues; } } @@ -527,16 +527,16 @@ string ICsvLine.this[string name] try { - return Line[index].AsString(); + return ParsedValues[index].AsString(); } catch (IndexOutOfRangeException) { - throw new InvalidOperationException($"Invalid row, missing {name} header, expected {Headers.Length} columns, got {Line.Length} columns."); + throw new InvalidOperationException($"Invalid row, missing {name} header, expected {Headers.Length} columns, got {ParsedValues.Length} columns."); } } } - string ICsvLine.this[int index] => Line[index].AsString(); + string ICsvLine.this[int index] => ParsedValues[index].AsString(); public override string ToString() { @@ -551,8 +551,8 @@ internal sealed class ReadLineSpan : ICsvLineSpan private readonly Dictionary headerLookup; private readonly CsvOptions options; private readonly MemoryText[] headers; - internal IList? rawSplitLine; - internal MemoryText[]? parsedLine; + internal IList? rawFields; + internal MemoryText[]? 
parsedValues; public ReadLineSpan(MemoryText[] headers, Dictionary headerLookup, int index, string raw, CsvOptions options) { @@ -572,7 +572,7 @@ public ReadLineSpan(MemoryText[] headers, Dictionary headerLookup, public ReadOnlySpan RawSpan => Raw.AsSpan(); public int Index { get; } - public int ColumnCount => Line.Length; + public int ColumnCount => ParsedValues.Length; public bool HasColumn(string name) => headerLookup.ContainsKey(name); @@ -581,30 +581,30 @@ public bool LineHasColumn(string name) if (!headerLookup.TryGetValue(name, out var index)) return false; - return RawSplitLine.Count > index; + return RawFields.Count > index; } - internal IList RawSplitLine => rawSplitLine ??= SplitLine(Raw.AsMemory(), options, headers.Length); + internal IList RawFields => rawFields ??= SplitLine(Raw.AsMemory(), options, headers.Length); - public string[] Values => Line.Select(it => it.AsString()).ToArray(); - public ReadOnlyMemory[] ValuesMemory => Line; + public string[] Values => ParsedValues.Select(it => it.AsString()).ToArray(); + public ReadOnlyMemory[] ValuesMemory => ParsedValues; public ReadOnlySpan ValuesSpan => throw new NotSupportedException("ValuesSpan not supported for array access. 
Use GetSpan(int) or GetMemory(int) for individual values."); - private MemoryText[] Line + private MemoryText[] ParsedValues { get { - if (parsedLine == null) + if (parsedValues == null) { - var raw = RawSplitLine; + var raw = RawFields; if (options.ValidateColumnCount && raw.Count != Headers.Length) throw new InvalidOperationException($"Expected {Headers.Length}, got {raw.Count} columns."); - parsedLine = Trim(raw, options); + parsedValues = Trim(raw, options); } - return parsedLine; + return parsedValues; } } @@ -623,24 +623,24 @@ public ReadOnlyMemory GetMemory(string name) try { - return Line[index]; + return ParsedValues[index]; } catch (IndexOutOfRangeException) { - throw new InvalidOperationException($"Invalid row, missing {name} header, expected {Headers.Length} columns, got {Line.Length} columns."); + throw new InvalidOperationException($"Invalid row, missing {name} header, expected {Headers.Length} columns, got {ParsedValues.Length} columns."); } } - public ReadOnlyMemory GetMemory(int index) => Line[index]; + public ReadOnlyMemory GetMemory(int index) => ParsedValues[index]; public ReadOnlySpan GetSpan(string name) => GetMemory(name).Span; public ReadOnlySpan GetSpan(int index) => GetMemory(index).Span; public bool TryGetMemory(string name, out ReadOnlyMemory value) { - if (headerLookup.TryGetValue(name, out var index) && index < Line.Length) + if (headerLookup.TryGetValue(name, out var index) && index < ParsedValues.Length) { - value = Line[index]; + value = ParsedValues[index]; return true; } @@ -650,9 +650,9 @@ public bool TryGetMemory(string name, out ReadOnlyMemory value) public bool TryGetMemory(int index, out ReadOnlyMemory value) { - if (index >= 0 && index < Line.Length) + if (index >= 0 && index < ParsedValues.Length) { - value = Line[index]; + value = ParsedValues[index]; return true; } @@ -694,8 +694,8 @@ internal sealed class ReadLineSpanOptimized : ICsvLineSpan private readonly CsvMemoryOptions memoryOptions; private readonly 
ReadOnlyMemory[] headers; private readonly ReadOnlyMemory rawMemory; - internal IList>? rawSplitLine; - private ReadOnlyMemory[]? parsedLine; + internal IList>? rawFields; + private ReadOnlyMemory[]? parsedValues; public ReadLineSpanOptimized(ReadOnlyMemory[] headers, Dictionary headerLookup, int index, ReadOnlyMemory raw, CsvOptions options, CsvMemoryOptions memoryOptions) { @@ -716,7 +716,7 @@ public ReadLineSpanOptimized(ReadOnlyMemory[] headers, Dictionary RawSpan => rawMemory.Span; public int Index { get; } - public int ColumnCount => Line.Length; + public int ColumnCount => ParsedValues.Length; public bool HasColumn(string name) => headerLookup.ContainsKey(name); @@ -725,30 +725,30 @@ public bool LineHasColumn(string name) if (!headerLookup.TryGetValue(name, out var index)) return false; - return RawSplitLine.Count > index; + return RawFields.Count > index; } - internal IList> RawSplitLine => rawSplitLine ??= SplitLineOptimized(rawMemory, options, memoryOptions, headers.Length); + internal IList> RawFields => rawFields ??= SplitLineOptimized(rawMemory, options, memoryOptions, headers.Length); - public string[] Values => Line.Select(v => v.ToString()).ToArray(); - public ReadOnlyMemory[] ValuesMemory => Line; + public string[] Values => ParsedValues.Select(v => v.ToString()).ToArray(); + public ReadOnlyMemory[] ValuesMemory => ParsedValues; public ReadOnlySpan ValuesSpan => throw new NotSupportedException("ValuesSpan not supported for array access. 
Use GetSpan(int) or GetMemory(int) for individual values."); - private ReadOnlyMemory[] Line + private ReadOnlyMemory[] ParsedValues { get { - if (parsedLine == null) + if (parsedValues == null) { - var raw = RawSplitLine; + var raw = RawFields; if (options.ValidateColumnCount && raw.Count != Headers.Length) throw new InvalidOperationException($"Expected {Headers.Length}, got {raw.Count} columns."); - parsedLine = TrimOptimized(raw, options, memoryOptions); + parsedValues = TrimOptimized(raw, options, memoryOptions); } - return parsedLine; + return parsedValues; } } @@ -767,24 +767,24 @@ public ReadOnlyMemory GetMemory(string name) try { - return Line[index]; + return ParsedValues[index]; } catch (IndexOutOfRangeException) { - throw new InvalidOperationException($"Invalid row, missing {name} header, expected {Headers.Length} columns, got {Line.Length} columns."); + throw new InvalidOperationException($"Invalid row, missing {name} header, expected {Headers.Length} columns, got {ParsedValues.Length} columns."); } } - public ReadOnlyMemory GetMemory(int index) => Line[index]; + public ReadOnlyMemory GetMemory(int index) => ParsedValues[index]; public ReadOnlySpan GetSpan(string name) => GetMemory(name).Span; public ReadOnlySpan GetSpan(int index) => GetMemory(index).Span; public bool TryGetMemory(string name, out ReadOnlyMemory value) { - if (headerLookup.TryGetValue(name, out var index) && index < Line.Length) + if (headerLookup.TryGetValue(name, out var index) && index < ParsedValues.Length) { - value = Line[index]; + value = ParsedValues[index]; return true; } @@ -794,9 +794,9 @@ public bool TryGetMemory(string name, out ReadOnlyMemory value) public bool TryGetMemory(int index, out ReadOnlyMemory value) { - if (index >= 0 && index < Line.Length) + if (index >= 0 && index < ParsedValues.Length) { - value = Line[index]; + value = ParsedValues[index]; return true; } diff --git a/Csv/CsvWriter.cs b/Csv/CsvWriter.cs index a749510..12a780e 100644 --- a/Csv/CsvWriter.cs 
+++ b/Csv/CsvWriter.cs @@ -23,10 +23,10 @@ public static class CsvWriter { #if NET8_0_OR_GREATER // The separator is per-call so it can't be baked into a single cached SearchValues. - // Keep the fixed escape chars cached and check the separator with a separate Contains. + // Keep the fixed quote-trigger chars cached and check the separator with a separate Contains. // Without this caching, MemoryExtensions.IndexOfAny(ReadOnlySpan, ReadOnlySpan)/char[] // builds a fresh SearchValues on the heap every call. - private static readonly SearchValues FixedEscapeChars = SearchValues.Create("'\n"); + private static readonly SearchValues QuoteTriggerChars = SearchValues.Create("'\n"); #endif /// @@ -468,7 +468,7 @@ public static async Task WriteToTextAsync(ReadOnlyMemory[]? header private static void WriteLine(TextWriter writer, string[] data, int columnCount, char separator) { - var escapeChars = new[] { separator, '\'', '\n' }; + var quoteTriggerChars = new[] { separator, '\'', '\n' }; for (var i = 0; i < columnCount; i++) { if (i > 0) @@ -476,7 +476,7 @@ private static void WriteLine(TextWriter writer, string[] data, int columnCount, if (i < data.Length) { - var escape = false; + var mustQuote = false; var cell = data[i] ?? 
string.Empty; #if NET8_0_OR_GREATER if (cell.Contains('"')) @@ -484,16 +484,16 @@ private static void WriteLine(TextWriter writer, string[] data, int columnCount, if (cell.Contains("\"")) #endif { - escape = true; + mustQuote = true; cell = cell.Replace("\"", "\"\""); } - else if (cell.IndexOfAny(escapeChars) >= 0) - escape = true; + else if (cell.IndexOfAny(quoteTriggerChars) >= 0) + mustQuote = true; - if (escape) + if (mustQuote) writer.Write('"'); writer.Write(cell); - if (escape) + if (mustQuote) writer.Write('"'); } } @@ -503,7 +503,7 @@ private static void WriteLine(TextWriter writer, string[] data, int columnCount, private static async Task WriteLineAsync(TextWriter writer, string[] data, int columnCount, char separator) { - var escapeChars = new[] { separator, '\'', '\n' }; + var quoteTriggerChars = new[] { separator, '\'', '\n' }; for (var i = 0; i < columnCount; i++) { if (i > 0) @@ -555,7 +555,7 @@ private static async Task WriteLineAsync(TextWriter writer, string[] data, int c // Write closing quote await writer.WriteAsync('"').ConfigureAwait(false); } - else if (cell.IndexOfAny(escapeChars) >= 0) + else if (cell.IndexOfAny(quoteTriggerChars) >= 0) { await writer.WriteAsync('"').ConfigureAwait(false); await writer.WriteAsync(cell).ConfigureAwait(false); @@ -625,7 +625,7 @@ private static void WriteCellMemory(TextWriter writer, ReadOnlySpan cell, writer.Write('"'); } - else if (cell.Contains(separator) || cell.IndexOfAny(FixedEscapeChars) >= 0) + else if (cell.Contains(separator) || cell.IndexOfAny(QuoteTriggerChars) >= 0) { writer.Write('"'); writer.Write(cell); @@ -673,10 +673,10 @@ private static bool WriteCellToBuffer(Span buffer, ReadOnlySpan cell var pos = 0; var needsQuoteEscape = cell.Contains('"'); - var needsGeneralEscape = cell.Contains(separator) || cell.IndexOfAny(FixedEscapeChars) >= 0; - var escape = needsQuoteEscape || needsGeneralEscape; + var needsQuoting = cell.Contains(separator) || cell.IndexOfAny(QuoteTriggerChars) >= 0; + var 
mustQuote = needsQuoteEscape || needsQuoting; - if (escape) + if (mustQuote) { if (pos >= buffer.Length) return false; buffer[pos++] = '"'; @@ -708,7 +708,7 @@ private static bool WriteCellToBuffer(Span buffer, ReadOnlySpan cell pos += cell.Length; } - if (escape) + if (mustQuote) { if (pos >= buffer.Length) return false; buffer[pos++] = '"'; diff --git a/Csv/ICsvLine.cs b/Csv/ICsvLine.cs index a47320a..cfd48c6 100644 --- a/Csv/ICsvLine.cs +++ b/Csv/ICsvLine.cs @@ -1,7 +1,7 @@ namespace Csv { /// - /// Represents a single data line inside a csv file. + /// Represents a single record (data row) parsed from a csv file. /// public interface ICsvLine { @@ -21,12 +21,12 @@ public interface ICsvLine string Raw { get; } /// - /// Gets the 1-based index for the line inside the file. + /// Gets the 1-based index of this record within the file. /// int Index { get; } /// - /// Gets the number of columns of the line. + /// Gets the number of fields in this record. /// int ColumnCount { get; } @@ -37,7 +37,7 @@ public interface ICsvLine /// /// Indicates whether the specified exists and - /// the current line contains a value for it. + /// this record contains a value for it. /// bool LineHasColumn(string name); @@ -48,9 +48,9 @@ public interface ICsvLine string this[string name] { get; } /// - /// Gets the data for the specified indexed header. + /// Gets the value of the field at the specified index. /// - /// The index of the header. + /// The zero-based field index. string this[int index] { get; } } } \ No newline at end of file diff --git a/Csv/ICsvLineFromMemory.cs b/Csv/ICsvLineFromMemory.cs index c7c27a5..024bf62 100644 --- a/Csv/ICsvLineFromMemory.cs +++ b/Csv/ICsvLineFromMemory.cs @@ -5,7 +5,7 @@ namespace Csv { /// - /// Represents a single data line inside a csv file. + /// Represents a single record (data row) parsed from a csv file. 
/// public interface ICsvLineFromMemory { @@ -25,25 +25,25 @@ public interface ICsvLineFromMemory MemoryText Raw { get; } /// - /// Gets the 1-based index for the line inside the file. + /// Gets the 1-based index of this record within the file. /// int Index { get; } /// - /// Gets the number of columns of the line. + /// Gets the number of fields in this record. /// int ColumnCount { get; } /// - /// Indicates whether the specified exists. - /// - bool HasColumn(string name); - - /// - /// Indicates whether the specified exists and - /// the current line contains a value for it. - /// - bool LineHasColumn(string name); + /// Indicates whether the specified exists. + /// + bool HasColumn(string name); + + /// + /// Indicates whether the specified exists and + /// this record contains a value for it. + /// + bool LineHasColumn(string name); /// /// Gets the data for the specified named header. @@ -52,9 +52,9 @@ public interface ICsvLineFromMemory MemoryText this[string name] { get; } /// - /// Gets the data for the specified indexed header. + /// Gets the value of the field at the specified index. /// - /// The index of the header. + /// The zero-based field index. MemoryText this[int index] { get; } } } diff --git a/Csv/ICsvLineSpan.cs b/Csv/ICsvLineSpan.cs index 7b22d44..8cfc8fd 100644 --- a/Csv/ICsvLineSpan.cs +++ b/Csv/ICsvLineSpan.cs @@ -7,7 +7,7 @@ namespace Csv { /// - /// Enhanced CSV line interface with Span/Memory support for zero-allocation data access. + /// Enhanced CSV record interface with Span/Memory support for zero-allocation data access. /// public interface ICsvLineSpan : ICsvLine { @@ -48,9 +48,9 @@ public interface ICsvLineSpan : ICsvLine ReadOnlyMemory GetMemory(string name); /// - /// Gets the data for the specified column index as ReadOnlyMemory<char>. + /// Gets the field at the specified index as ReadOnlyMemory<char>. /// - /// The zero-based index of the column. + /// The zero-based field index. 
ReadOnlyMemory GetMemory(int index); /// @@ -60,41 +60,41 @@ public interface ICsvLineSpan : ICsvLine ReadOnlySpan GetSpan(string name); /// - /// Gets the data for the specified column index as ReadOnlySpan<char>. + /// Gets the field at the specified index as ReadOnlySpan<char>. /// - /// The zero-based index of the column. + /// The zero-based field index. ReadOnlySpan GetSpan(int index); /// /// Tries to get the data for the specified named header as ReadOnlyMemory<char>. /// /// The name of the header. - /// When this method returns, contains the memory for the column if found; otherwise, empty. - /// true if the column was found; otherwise, false. + /// When this method returns, contains the memory for the field if found; otherwise, empty. + /// true if the field was found; otherwise, false. bool TryGetMemory(string name, out ReadOnlyMemory value); /// /// Tries to get the data for the specified column index as ReadOnlyMemory<char>. /// - /// The zero-based index of the column. - /// When this method returns, contains the memory for the column if found; otherwise, empty. - /// true if the column was found; otherwise, false. + /// The zero-based field index. + /// When this method returns, contains the memory for the field if found; otherwise, empty. + /// true if the field was found; otherwise, false. bool TryGetMemory(int index, out ReadOnlyMemory value); /// /// Tries to get the data for the specified named header as ReadOnlySpan<char>. /// /// The name of the header. - /// When this method returns, contains the span for the column if found; otherwise, empty. - /// true if the column was found; otherwise, false. + /// When this method returns, contains the span for the field if found; otherwise, empty. + /// true if the field was found; otherwise, false. bool TryGetSpan(string name, out ReadOnlySpan value); /// /// Tries to get the data for the specified column index as ReadOnlySpan<char>. /// - /// The zero-based index of the column. 
- /// When this method returns, contains the span for the column if found; otherwise, empty. - /// true if the column was found; otherwise, false. + /// The zero-based field index. + /// When this method returns, contains the span for the field if found; otherwise, empty. + /// true if the field was found; otherwise, false. bool TryGetSpan(int index, out ReadOnlySpan value); } } diff --git a/UBIQUITOUS_LANGUAGE.md b/UBIQUITOUS_LANGUAGE.md new file mode 100644 index 0000000..5b85b7a --- /dev/null +++ b/UBIQUITOUS_LANGUAGE.md @@ -0,0 +1,174 @@ +# Ubiquitous Language + +The shared vocabulary of the `Csv` library — what a CSV *is*, and how we read, write, and +configure it. Canonical terms are **bold**. The "Aliases to avoid" column lists words the +codebase currently uses for the same concept that we should stop using. + +> The single most important distinction in this codebase: a **physical line** (newline-bounded +> source text) is *not* the same as a **record** (one logical CSV entry). A record with a +> multiline field spans several physical lines. The public reading interface is named +> `ICsvLine` but actually represents a **record** — see *Flagged ambiguities*. + +## Document structure + +| Term | Definition | Aliases to avoid | +| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------- | ----------------------------- | +| **Record** | One complete CSV entry: a sequence of fields forming a single logical row; may span several physical lines if it has a multiline field. | row, line, data line, data row | +| **Physical line** | A run of source characters terminated by `\n` or `\r\n`. Usually one record = one physical line, but a multiline field breaks that 1:1. | line (when a record is meant) | +| **Field** | A single datum within a record, delimited by separators and optionally enclosed in quotes. 
| cell, part, data | +| **Value** | The parsed string content of a field — after trimming and unescaping. (A field is the slot; the value is what's in it.) | — | +| **Separator** | The single character that delimits fields within a record (comma by default; auto-detected when unset). | delimiter | +| **Column** | The vertical slot occupying the same position across all records; named by a header, located by a column index. | (see Header / Column index / Field count) | + +## Headers & columns + +| Term | Definition | Aliases to avoid | +| ---------------- | ------------------------------------------------------------------------------------------------------------------- | ------------------------- | +| **Header** | The *name* of a column, taken from the header row (`HeaderPresent`) or auto-generated as `Column1`, `Column2`, … (`HeaderAbsent`). | column name (when ambiguous) | +| **Header row** | The first non-skipped record, interpreted as the set of headers. | first line | +| **Header mode** | Whether the input carries a header row: `HeaderPresent` (default) or `HeaderAbsent`. | — | +| **Column index** | The zero-based position of a field within a record. | column (when a position is meant) | +| **Field count** | The number of fields in a record. This is what `ICsvLine.ColumnCount` and `ValidateColumnCount` actually measure. | column count | +| **Alias** | An alternative header that resolves to the same column (e.g. `CategoryName` / `Category Name` / `Category-Name`). | — | +| **Auto-rename** | Making duplicate or empty headers unique by appending a number (`A`, `A2`, …; `Empty`, `Empty2`, …). | — | +| **Comparer** | The string-equality rule used when matching a header name (case-sensitive or case-insensitive). 
| — | + +## Quoting, escaping & multiline + +| Term | Definition | Aliases to avoid | +| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------- | --------------------------------- | +| **Quote character** | The character that opens and closes an enclosed field — `"` by default, `'` when `AllowSingleQuoteToEncloseFieldValues` is set. | enclosure | +| **Enclose** | To wrap a field in quote characters so it may safely contain separators, quotes, or newlines (`AllowEnclosedFieldValues`). | quote (as a verb, for wrapping) | +| **Quote escaping** | Representing a literal quote inside an enclosed field by doubling it (`""` → one `"`). | escape (bare) | +| **Backslash escape** | Optional alternative to doubling: `\"` represents a literal quote (`AllowBackSlashToEscapeQuote`). | — | +| **Multiline field** | An enclosed field whose value contains literal newlines, making its record span multiple physical lines (`AllowNewLineInEnclosedFieldValues`). | — | +| **Unterminated quoted value** | An enclosed field with no closing quote on the current physical line — the signal that the record continues onto the next. | open quote | +| **Continuation** | Appending the next physical line(s) to an unterminated quoted value until the closing quote is found, joined with `NewLine`. | — | +| **Quoting** (writing) | The writer's decision to *enclose* a field because it contains a separator, quote, or newline. Distinct from escaping. | escape, escaping (the writer's internal name) | + +## Reading & writing pipeline + +These are the codebase's nouns for the read/write architecture — part of the team's working +vocabulary even though a CSV end-user wouldn't say them. 
+ +| Term | Definition | Aliases to avoid | +| ------------------- | ---------------------------------------------------------------------------------------------------------------- | ---------------- | +| **Source** | The input a read consumes: a `TextReader`, `Stream`, `string`, or `ReadOnlyMemory<char>`. | — | +| **Line source** | The internal abstraction that yields one physical line at a time, hiding which kind of source it came from. | — | +| **Engine** | The single generic read loop driving every read path (sync, async, span, memory). | — | +| **Row factory** | The internal component that builds a record object from a record's parsed values. | — | +| **Raw fields** | A record's fields immediately after splitting on separators — before any trimming or unescaping. | raw split line | +| **Parsed values** | The raw fields after trimming and unescaping have produced the final values. | parsed line | +| **Skip row** | A predicate that drops a physical line *before* parsing (defaults to dropping blank lines and `#` comment lines). | — | +| **Rows to skip** | A fixed count of leading physical lines discarded before the header row is read. | — | +| **Trim data** | Option to strip leading/trailing whitespace from values (and to permit a quote to open after leading whitespace). | — | +| **Validate column count** | Option requiring every record's field count to equal the header count. | — | +| **Auto-detect separator** | Inferring the separator from the header row when none is configured. | — | + +## Relationships + +- A **CSV document** is an optional **header row** followed by zero or more **records**. +- A **record** contains one or more **fields**; each field holds one **value**. +- A **field** is delimited by **separators** and may be **enclosed** in **quote characters**. +- A **column** is the set of same-position fields across records; its name is a **header**, its position a **column index**. +- One or more **aliases** may resolve to the same **column**.
+- An **enclosed field** containing newlines is a **multiline field**; its **record** spans multiple **physical lines**, joined during **continuation** once an **unterminated quoted value** is detected. +- Inside an enclosed field, a literal quote is produced by **quote escaping** (`""`) or, if enabled, a **backslash escape** (`\"`). +- **Field count** equals the **header** count when **validate column count** is on. +- On write, a field is **quoted** (enclosed) when it contains a separator, quote, or newline; the literal quotes inside it are then **escaped** by doubling. + +## Example dialogue + +> **Dev:** "When I read a `HeaderAbsent` file, what does `ICsvLine.ColumnCount` give me?" + +> **Domain expert:** "The **field count** of that **record** — how many **fields** it has. It's named 'column count', but there is no **header row** in `HeaderAbsent` mode (the **headers** are auto-generated), so it's really just the number of fields in that record." + +> **Dev:** "And if one of those fields is a **multiline field**, do I get several **records**?" + +> **Domain expert:** "No — one **record**. The value spans several **physical lines**, but because the **enclosed field** is an **unterminated quoted value** on the first line, the reader does a **continuation**: it keeps appending physical lines until it sees the closing **quote character**, then hands you a single record." + +> **Dev:** "Inside that value there's a `""`. Is that two fields?" + +> **Domain expert:** "No — that's **quote escaping**: a doubled quote is one literal `"`. A **separator** only splits fields when you're *outside* an enclosed field." + +> **Dev:** "On the writing side, when does a **field** get wrapped in quotes?" + +> **Domain expert:** "That's **quoting**, not escaping — the writer **encloses** a field when it contains a separator, a quote, or a newline. The writer's internals used to call that 'escape', confusingly, but escaping proper is only the doubled-quote step that happens *after* it decides to enclose."
+ +## Flagged ambiguities + +1. **`line` / `row` / `record` are used interchangeably**, but the code genuinely needs two concepts: a **physical line** (newline-bounded source text) and a **record** (one logical entry, possibly multi-line). The public interface `ICsvLine` is named after "line" yet represents a **record**. *Recommendation:* say **record** for the logical entry and **physical line** for source text; retire "row" and "data line" as additional synonyms for record. + +2. **`column` means four different things** — a header *name*, a zero-based *index*, the *vertical slot* across records, and a *count*. The sharpest offenders are `ICsvLine.ColumnCount` and `CsvOptions.ValidateColumnCount`, which actually measure **fields per record**, not columns. *Recommendation:* **Header** (name), **Column index** (position), **Column** (the vertical slot), **Field count** (per-record count). + +3. **One datum has five names** — `field`, `value`, `cell`, `data`, `part`. `CsvLineSplitter` comments say "cell" and "part"; the writer says "cell"; `TrimData` and the `ICsvLine` doc-comments say "data"; the array property is `Values`. *Recommendation:* **Field** for the unit, **Value** for its parsed content; retire cell / part / data. + +4. **`escape` is overloaded across reading and writing.** It means (a) the per-character **quote escaping** that produces a literal quote (`""`), (b) the optional **backslash escape** (`\"`), and (c) — in the writer — the decision to *wrap a whole field in quotes* (the identifiers formerly named `escapeChars`, `needsGeneralEscape`, `FixedEscapeChars`). The third isn't escaping at all; it's **quoting**. *Recommendation:* reserve **escape** for the doubled-quote/backslash step, and call the wrap-the-field decision **quoting**. + +5. **`separator` vs `delimiter`** — the property and implementation use **Separator**; "delimiter" appears only in prose. *Recommendation:* **Separator** everywhere. + +6.
**`enclose` vs `quote`** — option names say *enclose* (`AllowEnclosedFieldValues`), the splitter says *quote* (`inQuotes`, `quoteChar`). These are *not* synonyms: **enclose** is the act of wrapping; the **quote character** is the tool that does it. *Recommendation:* keep both, used precisely. + +## Applying this glossary (blast radius) + +`Csv` is a public NuGet package with millions of downloads, so **renaming a public member is a +SemVer-major breaking change**. The recommendations above therefore apply differently depending +on where a name lives: + +| Bucket | Rule | Status | +| ------ | ---- | ------ | +| 🟢 **Internal / private** (`internal`, `private`, locals) | Rename freely — internals are exposed only to `Csv.Tests` via `InternalsVisibleTo`, so there is zero ecosystem impact. | **Done** (see below) | +| 🟡 **Public XML-doc text** | Reword to canonical terms — comments are not part of the binary/source contract, so this is non-breaking. | **Done** (see below) | +| 🔴 **Public member names** (types, properties, methods, enum members, method parameters) | Frozen. Cannot change without a major version + migration guide. Method *parameter* names count — named-argument callers depend on them. | **Deferred to vNext** | + +### Internal renames applied (non-breaking) + +- The reader record classes (`ReadLine`, `ReadLineSpan`, `ReadLineSpanOptimized`, `ReadLineFromMemory`): + `rawSplitLine` → **`rawFields`**, `RawSplitLine` → **`RawFields`**, `parsedLine` → **`parsedValues`**, + and the private property literally named `Line` (which returned the parsed field array) → **`ParsedValues`**. +- The writer's escape-vs-quote confusion (`CsvWriter`, `CsvBufferWriter`): `FixedEscapeChars` → + **`QuoteTriggerChars`**, `escapeChars` → **`quoteTriggerChars`**, `needsGeneralEscape` → + **`needsQuoting`**, `escape` (the wrap-the-field decision) → **`mustQuote`**. `needsQuoteEscape` + is kept — it genuinely means quote-doubling. 
+ +> The writer deliberately keeps `cell` / `WriteCell` / `WriteRow` / `WriteLine`. Those names are +> baked into the **public** writer API, so the private helpers stay consistent with them rather +> than with the reader's `field`/`record` vocabulary. `cell` is thus an accepted writer-side +> synonym for **field**. + +### Doc rewording applied (non-breaking) + +`ICsvLine.ColumnCount` / `ICsvLineFromMemory.ColumnCount` now document "number of **fields** in +this record"; `CsvOptions.ValidateColumnCount` documents matching the "**field count** per row"; +the `Read*` summaries say "Reads the **records**"; the `int` indexers and `ICsvLineSpan`'s +`GetSpan`/`GetMemory`/`TryGet*` document a "**field index**" instead of a "column index"; +`CsvOptions.Aliases` maps names to a "**header/column**"; `CsvBufferWriter.WriteCell` documents +"**quoting and escaping**"; and the splitter is documented as splitting a record's text into +**fields**. + +### vNext rename targets (require a major version) + +When a `v3` is on the table, these public names are the ones worth correcting, each behind an +`[Obsolete]` forwarder so the old name keeps working through one major version: + +| Today (frozen) | Canonical target | Why | +| -------------- | ---------------- | --- | +| `ICsvLine.ColumnCount` | `FieldCount` | It counts fields in a record, not columns. | +| `CsvOptions.ValidateColumnCount` | `ValidateFieldCount` | Same — it validates field count per record. | +| `ICsvLine.LineHasColumn(name)` | `RecordHasValue(name)` | "Line" means record; it tests for a present value. | +| `ICsvLine` / `ICsvLineSpan` / `ICsvLineFromMemory` | `ICsvRecord` / `ICsvRecordSpan` / … | The type models a record, not a physical line. | + +Smaller doc-vs-name frictions (e.g. `ReturnEmptyForMissingColumn`, the writer's `WriteCell`) are +**not** worth a breaking change — leave them and lean on the docs. 
+ +> **Not recommended:** `[Obsolete]` on the *current* names now (it would emit build warnings for +> every consumer over a naming preference), or additive aliases like a second `FieldCount` next to +> `ColumnCount` (two ways to do one thing is its own consistency tax). Reworded docs already carry +> the canonical meaning at zero cost. + +--- + +*Out of scope:* `CsvMemoryOptions` knobs (`ReuseBuffers`, `InitialBufferSize`, `MaxBufferSize`, +`DirectAllocationThreshold`, `EnableZeroCopy`, `UseVectorization`, `ClearBuffers`, +`StreamingThreshold`) and buffer/span/pool plumbing are .NET performance-tuning vocabulary, not +CSV-domain language, and are deliberately excluded from this glossary.