-
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdoc.go
More file actions
90 lines (90 loc) · 2.89 KB
/
doc.go
File metadata and controls
90 lines (90 loc) · 2.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
// Package fileprep provides preprocessing and validation for file formats
// supported by filesql: CSV, TSV, LTSV, JSON, JSONL, Parquet, and Excel,
// each optionally compressed with gzip, bzip2, xz, zstd, zlib, snappy, s2, or lz4.
//
// fileprep complements filesql by providing data preprocessing before loading
// into SQLite. It uses struct tags for validation ("validate" tag) and
// preprocessing ("prep" tag).
//
// # Basic Usage
//
//	type Record struct {
//		Name  string `prep:"trim" validate:"required"`
//		Email string `prep:"trim,lowercase" validate:"email"`
//		Age   int    `validate:"gte=0,lte=150"`
//	}
//
//	file, err := os.Open("data.csv")
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer file.Close()
//
//	var records []Record
//	processor := fileprep.NewProcessor(fileprep.FileTypeCSV)
//	reader, result, err := processor.Process(file, &records)
//	if err != nil {
//		log.Fatal(err)
//	}
//
//	// reader can be passed directly to filesql
//	// result.Errors contains validation errors with row/column information
//	// result.RowCount and result.ValidRowCount provide processing statistics
//
// # Streaming Output with ProcessToWriter
//
// For large datasets, ProcessToWriter writes preprocessed output directly
// to an io.Writer, avoiding the output buffer allocation:
//
//	var buf bytes.Buffer
//	result, err := processor.ProcessToWriter(file, &records, &buf)
//
// # Memory Usage
//
// fileprep loads the entire file into memory for processing. This enables
// multi-pass operations (preprocessing then validation) but means memory
// usage scales with file size. For large files, use ProcessToWriter to
// reduce peak memory by avoiding the output buffer.
//
// Format-specific limitations:
//   - XLSX: Only the first sheet is processed
//   - LTSV: Maximum line size is 10MB
//   - JSON/JSONL: Data has a single "data" column containing raw JSON strings
//
// # Supported File Formats
//
// fileprep supports the same formats as filesql:
//   - CSV (.csv)
//   - TSV (.tsv)
//   - LTSV (.ltsv)
//   - JSON (.json)
//   - JSONL (.jsonl)
//   - Parquet (.parquet)
//   - Excel (.xlsx)
//
// All formats support compression:
//   - gzip (.gz)
//   - bzip2 (.bz2)
//   - xz (.xz)
//   - zstd (.zst)
//   - zlib (.z)
//   - snappy (.snappy)
//   - s2 (.s2)
//   - lz4 (.lz4)
//
// # Prep Tags
//
// The "prep" tag specifies preprocessing operations applied before validation:
//   - trim: Remove leading and trailing whitespace
//   - ltrim: Remove leading whitespace
//   - rtrim: Remove trailing whitespace
//   - lowercase: Convert to lowercase
//   - uppercase: Convert to uppercase
//   - default=value: Set default value if empty
//
// # Validate Tags
//
// The "validate" tag specifies validation rules (compatible with go-playground/validator):
//   - required: Field must not be empty
//   - email: Must be a valid email address
//   - url: Must be a valid URL
//   - And many more...
//
// See https://pkg.go.dev/github.com/nao1215/fileprep for the complete list of supported validators.
package fileprep