Skip to content

Commit 94de146

Browse files
Add UTF8Mode with Strict, Loose, and Replace options
Introduced the `UTF8Mode` enum to control how invalid UTF-8 sequences are handled:

- `UTF8Replace` (default): replaces invalid bytes with `\uFFFD`.
- `UTF8Strict`: returns an error.
- `UTF8Ignore`: preserves invalid bytes as-is.

Added `OptionStrict` (sets `UTF8Strict`) and `OptionLoose` (sets `UTF8Ignore`). Refactored `upperCaseFirstLower` to respect these modes. Added comprehensive tests for all modes.

Co-authored-by: arran4 <111667+arran4@users.noreply.github.com>
1 parent 47c0289 commit 94de146

3 files changed

Lines changed: 130 additions & 33 deletions

File tree

types.go

Lines changed: 54 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ type SeparatorWord string
3737
// String implementations
3838
func (w SingleCaseWord) String() string { return strings.ToLower(string(w)) }
3939
func (w FirstUpperCaseWord) String() string {
40-
res, _ := upperCaseFirstLower(string(w), false)
40+
res, _ := upperCaseFirstLower(string(w), UTF8Replace)
4141
return res
4242
}
4343
func (w AcronymWord) String() string { return string(w) }
@@ -122,25 +122,30 @@ func MustLowerCaseFirst(s string) string {
122122
}
123123

124124
// upperCaseFirstLower capitalizes the first character and lowercases the rest.
125-
func upperCaseFirstLower(s string, strict bool) (string, error) {
125+
func upperCaseFirstLower(s string, mode UTF8Mode) (string, error) {
126126
if s == "" {
127127
return "", nil
128128
}
129129
r, size := utf8.DecodeRuneInString(s)
130-
if strict && r == utf8.RuneError {
131-
return "", fmt.Errorf("%w: invalid rune", ErrRune)
130+
if r == utf8.RuneError && size == 1 {
131+
if mode == UTF8Strict {
132+
return "", fmt.Errorf("%w: invalid rune", ErrRune)
133+
}
132134
}
135+
133136
u := unicode.ToUpper(r)
134137

135138
// Check if changes are needed.
136139
// If r == utf8.RuneError && size == 1, it is an invalid UTF-8 start byte.
137140
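// In UTF8Replace mode we want to replace it with RuneError (like strings.ToLower/ToUpper do),
138141
// so we force needChange in that mode only; UTF8Strict has already returned an error above and UTF8Ignore keeps the byte.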
139-
needChange := (r != u) || (r == utf8.RuneError && size == 1)
142+
needChange := (r != u) || (r == utf8.RuneError && size == 1 && mode == UTF8Replace)
140143
if !needChange {
141144
for _, rc := range s[size:] {
142-
if strict && rc == utf8.RuneError {
143-
return "", fmt.Errorf("%w: invalid rune", ErrRune)
145+
if rc == utf8.RuneError {
146+
if mode == UTF8Strict {
147+
return "", fmt.Errorf("%w: invalid rune", ErrRune)
148+
}
144149
}
145150
if unicode.ToLower(rc) != rc {
146151
needChange = true
@@ -155,10 +160,25 @@ func upperCaseFirstLower(s string, strict bool) (string, error) {
155160

156161
var b strings.Builder
157162
b.Grow(len(s))
158-
b.WriteRune(u)
159-
for _, rc := range s[size:] {
160-
if strict && rc == utf8.RuneError {
161-
return "", fmt.Errorf("%w: invalid rune", ErrRune)
163+
if r == utf8.RuneError && size == 1 && mode == UTF8Ignore {
164+
b.WriteByte(s[0])
165+
} else {
166+
b.WriteRune(u)
167+
}
168+
169+
for i, rc := range s[size:] {
170+
if rc == utf8.RuneError {
171+
if mode == UTF8Strict {
172+
return "", fmt.Errorf("%w: invalid rune", ErrRune)
173+
}
174+
if mode == UTF8Ignore {
175+
// s[size:] is the substring starting after first rune.
176+
// i is the index within that substring.
177+
// We need to write the original byte.
178+
// s[size+i] is the byte.
179+
b.WriteByte(s[size+i])
180+
continue
181+
}
162182
}
163183
b.WriteRune(unicode.ToLower(rc))
164184
}
@@ -188,6 +208,18 @@ const (
188208
CMScreaming
189209
)
190210

211+
// UTF8Mode selects how invalid UTF-8 sequences in the input are handled.
// Its zero value is UTF8Replace (the first iota constant below).
// NOTE(review): after the first rune, upperCaseFirstLower compares each rune
// against utf8.RuneError without checking the decoded width, so a correctly
// encoded U+FFFD in the input looks like it would be rejected under UTF8Strict
// and cut down to its first byte under UTF8Ignore — confirm and add a test.
212+
type UTF8Mode int
213+
214+
const (
215+
// UTF8Replace replaces invalid UTF-8 bytes with utf8.RuneError (U+FFFD).
216+
UTF8Replace UTF8Mode = iota
217+
// UTF8Strict makes the conversion return an error (wrapping ErrRune in upperCaseFirstLower) on invalid UTF-8 sequences.
218+
UTF8Strict
219+
// UTF8Ignore leaves invalid UTF-8 bytes in the output unchanged instead of replacing or rejecting them (best effort).
220+
UTF8Ignore
221+
)
222+
191223
type caseConfig struct {
192224
caseMode CaseMode
193225
delimiter string
@@ -199,7 +231,7 @@ type caseConfig struct {
199231
mixCaseSupport bool
200232
firstUpper bool
201233
firstLower bool
202-
strict bool
234+
utf8Mode UTF8Mode
203235
}
204236

205237
// OptionDelimiter sets the delimiter between words.
@@ -234,7 +266,12 @@ func OptionUpperIndicator(d string) Option {
234266

235267
// OptionStrict sets strict mode, which returns an error if invalid UTF-8 sequences are encountered.
236268
func OptionStrict() Option {
237-
return func(cfg *caseConfig) { cfg.strict = true }
269+
return func(cfg *caseConfig) { cfg.utf8Mode = UTF8Strict }
270+
}
271+
272+
// OptionLoose sets loose mode by selecting UTF8Ignore: invalid UTF-8 bytes are
// preserved as-is instead of being replaced with U+FFFD (UTF8Replace) or
// reported as an error (UTF8Strict).
273+
func OptionLoose() Option {
274+
return func(cfg *caseConfig) { cfg.utf8Mode = UTF8Ignore }
238275
}
239276

240277
// ToFormattedCase generates formatted case strings with the given options
@@ -285,7 +322,7 @@ func WordsToFormattedCase(words []Word, opts ...any) (string, error) {
285322
w = strings.ToLower(w)
286323
} else if cfg.caseMode == CMAllTitle {
287324
var err error
288-
w, err = upperCaseFirstLower(w, cfg.strict)
325+
w, err = upperCaseFirstLower(w, cfg.utf8Mode)
289326
if err != nil {
290327
return "", err
291328
}
@@ -304,7 +341,7 @@ func WordsToFormattedCase(words []Word, opts ...any) (string, error) {
304341
}
305342
case FirstUpperCaseWord:
306343
var err error
307-
w, err = upperCaseFirstLower(string(word), cfg.strict)
344+
w, err = upperCaseFirstLower(string(word), cfg.utf8Mode)
308345
if err != nil {
309346
return "", err
310347
}
@@ -324,7 +361,7 @@ func WordsToFormattedCase(words []Word, opts ...any) (string, error) {
324361
w = strings.ToLower(w)
325362
} else if cfg.caseMode == CMAllTitle {
326363
var err error
327-
w, err = upperCaseFirstLower(w, cfg.strict)
364+
w, err = upperCaseFirstLower(w, cfg.utf8Mode)
328365
if err != nil {
329366
return "", err
330367
}
@@ -337,7 +374,7 @@ func WordsToFormattedCase(words []Word, opts ...any) (string, error) {
337374
w = strings.ToLower(w)
338375
} else if cfg.caseMode == CMAllTitle {
339376
var err error
340-
w, err = upperCaseFirstLower(w, cfg.strict)
377+
w, err = upperCaseFirstLower(w, cfg.utf8Mode)
341378
if err != nil {
342379
return "", err
343380
}

types_internal_test.go

Lines changed: 53 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -75,9 +75,9 @@ func TestUpperCaseFirstLower_Correctness(t *testing.T) {
7575

7676
for _, tt := range tests {
7777
t.Run(tt.name, func(t *testing.T) {
78-
got, err := upperCaseFirstLower(tt.input, false)
78+
got, err := upperCaseFirstLower(tt.input, UTF8Replace)
7979
if err != nil {
80-
t.Errorf("upperCaseFirstLower(%q, false) returned unexpected error: %v", tt.input, err)
80+
t.Errorf("upperCaseFirstLower(%q, UTF8Replace) returned unexpected error: %v", tt.input, err)
8181
}
8282
if got != tt.expected {
8383
t.Errorf("upperCaseFirstLower(%q) = %q, want %q", tt.input, got, tt.expected)
@@ -116,36 +116,81 @@ func TestUpperCaseFirstLower_Strict(t *testing.T) {
116116

117117
for _, tt := range tests {
118118
t.Run(tt.name, func(t *testing.T) {
119-
_, err := upperCaseFirstLower(tt.input, true)
119+
_, err := upperCaseFirstLower(tt.input, UTF8Strict)
120120
if tt.expectErr {
121121
if err == nil {
122-
t.Errorf("upperCaseFirstLower(%q, true) expected error, got nil", tt.input)
122+
t.Errorf("upperCaseFirstLower(%q, UTF8Strict) expected error, got nil", tt.input)
123123
}
124124
if !errors.Is(err, ErrRune) {
125-
t.Errorf("upperCaseFirstLower(%q, true) expected ErrRune, got %v", tt.input, err)
125+
t.Errorf("upperCaseFirstLower(%q, UTF8Strict) expected ErrRune, got %v", tt.input, err)
126126
}
127127
} else {
128128
if err != nil {
129-
t.Errorf("upperCaseFirstLower(%q, true) unexpected error: %v", tt.input, err)
129+
t.Errorf("upperCaseFirstLower(%q, UTF8Strict) unexpected error: %v", tt.input, err)
130130
}
131131
}
132132
})
133133
}
134134
}
135135

136+
// TestUpperCaseFirstLower_Loose checks that upperCaseFirstLower in UTF8Ignore
// mode returns no error and keeps invalid UTF-8 bytes unchanged, while still
// upper-casing a valid first rune and lower-casing the valid runes after it.
// NOTE(review): none of the cases feeds a correctly encoded U+FFFD; consider adding one.
func TestUpperCaseFirstLower_Loose(t *testing.T) {
137+
tests := []struct {
138+
name string
139+
input string
140+
expected string
141+
}{
142+
{
143+
name: "Invalid UTF-8 Start",
144+
input: "\xfftest",
145+
expected: "\xfftest", // Leading invalid byte is preserved; the rest is already lower case.
146+
},
147+
{
148+
name: "Invalid UTF-8 Middle",
149+
input: "te\xffst",
150+
expected: "Te\xffst", // Preserves the invalid byte and title-cases the valid parts.
151+
},
152+
{
153+
name: "Mixed Invalid",
154+
input: "\xffT\xff",
155+
expected: "\xfft\xff", // Leading invalid byte kept, 'T' lowercased, trailing invalid byte kept.
156+
// How upperCaseFirstLower arrives at this in UTF8Ignore mode:
157+
// 1. Decode the first rune; if it is invalid, write the original byte.
158+
// 2. Walk the rest; write invalid bytes unchanged and lower-case everything else.
159+
// Input: \xff T \xff
160+
// 1. First: \xff is invalid, so \xff is written.
161+
// 2. Rest: "T\xff".
162+
// - 'T': ToLower -> 't'.
163+
// - \xff: invalid, so \xff is written.
164+
// Result: "\xfft\xff".
165+
},
166+
}
167+
168+
for _, tt := range tests {
169+
t.Run(tt.name, func(t *testing.T) {
170+
got, err := upperCaseFirstLower(tt.input, UTF8Ignore)
171+
if err != nil {
172+
t.Errorf("upperCaseFirstLower(%q, UTF8Ignore) returned unexpected error: %v", tt.input, err)
173+
}
174+
if got != tt.expected {
175+
// Print the raw bytes too, since %q alone escapes invalid bytes and hides what was actually produced.
t.Errorf("upperCaseFirstLower(%q, UTF8Ignore) = %q (bytes: %x), want %q (bytes: %x)", tt.input, got, []byte(got), tt.expected, []byte(tt.expected))
176+
}
177+
})
178+
}
179+
}
180+
136181
func TestUpperCaseFirstLower_Allocations(t *testing.T) {
137182
// Tests that no allocation occurs if the string is already correct
138183
input := "Test"
139184
if testing.AllocsPerRun(10, func() {
140-
_, _ = upperCaseFirstLower(input, false)
185+
_, _ = upperCaseFirstLower(input, UTF8Replace)
141186
}) > 0 {
142187
t.Errorf("upperCaseFirstLower(%q) allocated memory when no change was needed", input)
143188
}
144189

145190
// Test that allocation occurs when change IS needed
146191
input2 := "test"
147192
if testing.AllocsPerRun(10, func() {
148-
_, _ = upperCaseFirstLower(input2, false)
193+
_, _ = upperCaseFirstLower(input2, UTF8Replace)
149194
}) == 0 {
150195
t.Errorf("upperCaseFirstLower(%q) did not allocate memory when change was needed", input2)
151196
}

types_test.go

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -455,50 +455,62 @@ func TestToFormattedCase_MultibyteFirstLower(t *testing.T) {
455455
}
456456
}
457457

458-
func TestOptionStrict(t *testing.T) {
458+
func TestOptionUTF8Modes(t *testing.T) {
459459
tests := []struct {
460460
name string
461461
words []Word
462462
options []Option
463463
expectErr bool
464+
expected string
464465
}{
465466
{
466-
name: "FirstUpperCaseWord Invalid UTF-8 Strict",
467+
name: "Strict Mode Error",
467468
words: []Word{
468469
FirstUpperCaseWord("\xfftest"),
469470
},
470471
options: []Option{OptionStrict()},
471472
expectErr: true,
472473
},
473474
{
474-
name: "FirstUpperCaseWord Invalid UTF-8 Non-Strict",
475+
name: "Loose Mode Preserves Invalid",
475476
words: []Word{
476477
FirstUpperCaseWord("\xfftest"),
477478
},
478-
options: []Option{},
479+
options: []Option{OptionLoose()},
479480
expectErr: false,
481+
expected: "\xfftest",
480482
},
481483
{
482-
name: "SingleCaseWord CMAllTitle Invalid UTF-8 Strict",
484+
name: "Default Mode Replaces Invalid",
485+
words: []Word{
486+
FirstUpperCaseWord("\xfftest"),
487+
},
488+
options: []Option{}, // Default is UTF8Replace
489+
expectErr: false,
490+
expected: "\uFFFDtest",
491+
},
492+
{
493+
name: "SingleCaseWord CMAllTitle Strict",
483494
words: []Word{
484495
SingleCaseWord("\xfftest"),
485496
},
486497
options: []Option{OptionCaseMode(CMAllTitle), OptionStrict()},
487498
expectErr: true,
488499
},
489500
{
490-
name: "SingleCaseWord CMAllTitle Invalid UTF-8 Non-Strict",
501+
name: "SingleCaseWord CMAllTitle Loose",
491502
words: []Word{
492503
SingleCaseWord("\xfftest"),
493504
},
494-
options: []Option{OptionCaseMode(CMAllTitle)},
505+
options: []Option{OptionCaseMode(CMAllTitle), OptionLoose()},
495506
expectErr: false,
507+
expected: "\xfftest",
496508
},
497509
}
498510

499511
for _, tt := range tests {
500512
t.Run(tt.name, func(t *testing.T) {
501-
_, err := WordsToFormattedCase(tt.words, convertOptions(tt.options)...)
513+
got, err := WordsToFormattedCase(tt.words, convertOptions(tt.options)...)
502514
if tt.expectErr {
503515
if err == nil {
504516
t.Error("expected error, got nil")
@@ -510,6 +522,9 @@ func TestOptionStrict(t *testing.T) {
510522
if err != nil {
511523
t.Errorf("unexpected error: %v", err)
512524
}
525+
if got != tt.expected {
526+
t.Errorf("got %q (bytes: %x), want %q (bytes: %x)", got, []byte(got), tt.expected, []byte(tt.expected))
527+
}
513528
}
514529
})
515530
}

0 commit comments

Comments
 (0)