-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcodeaware.go
More file actions
206 lines (182 loc) · 5.98 KB
/
Copy pathcodeaware.go
File metadata and controls
206 lines (182 loc) · 5.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
package tok
import (
"regexp"
"strings"
)
// Span identifies a protected region of source code that must survive
// compression. Spans are line-oriented (1-based, inclusive) and carry the
// extracted symbol name when one is available.
//
// Spans are the unit of "code intelligence" exchanged between a SymbolProvider
// and the compression pipeline. The default provider derives them from the
// chunker's symbol regexes, but an embedder can supply spans from any source
// (e.g. an LSP server) via a custom SymbolProvider.
type Span struct {
StartLine int `json:"start_line"` // 1-based, inclusive
EndLine int `json:"end_line"` // 1-based, inclusive
// Symbol is the extracted symbol name; it is empty (and omitted from JSON)
// when no name could be extracted.
Symbol string `json:"symbol,omitempty"`
// Lines holds the text of the protected lines. RegexSymbolProvider fills it
// with a copy of source lines StartLine..EndLine, and codeProtector
// re-injects the non-blank ones when compression drops them.
//
// NOTE(review): unlike the other fields this one has no json tag, so
// encoding/json emits it under the key "Lines" — confirm that is intended.
Lines []string
}
// SymbolProvider identifies the semantically significant spans of a piece of
// source code. It is the seam that lets hawk later inject a real LSP-backed
// provider in place of tok's dependency-light regex default.
//
// Implementations must be safe for concurrent use; they receive the full code
// text and a language name (e.g. "go", "python"), and return the spans that
// compression must preserve. An empty lang signals best-effort detection.
type SymbolProvider interface {
// Symbols returns the spans of code that must be preserved. lang is a
// language name; an empty lang asks the provider to detect the language
// itself. An empty (or nil) result means there is nothing to protect:
// newCodeProtector returns nil in that case.
Symbols(code, lang string) []Span
}
// RegexSymbolProvider is the default, dependency-light SymbolProvider. It reuses
// the chunker's language boundary and symbol patterns to find declaration and
// signature lines without any LSP integration.
//
// It is an empty struct and carries no state of its own.
type RegexSymbolProvider struct{}
// DefaultSymbolProvider is the SymbolProvider used by WithCodeAware when no
// other provider is supplied via WithSymbolProvider. newCodeProtector also
// falls back to it when it is handed a nil provider.
//
// It is initialized to a RegexSymbolProvider value.
var DefaultSymbolProvider SymbolProvider = RegexSymbolProvider{}
// Symbols implements SymbolProvider on top of the chunker's regexes.
//
// lang is lower-cased and trimmed; when it is empty the language is guessed
// with detectLanguage. If code is empty, or no boundary pattern exists for the
// resolved language, Symbols returns nil.
//
// The code is split on "\n" and every non-blank line whose trimmed text matches
// the boundary pattern opens a Span. When that line leaves parentheses open
// (more "(" than ")"), the span keeps absorbing following lines until the
// running paren balance is no longer positive or the input ends, so a
// multi-line parameter list stays together with its signature. Lines absorbed
// this way are not examined again as potential span starts.
//
// Span.Symbol is the first non-empty capture group of the language's symbol
// pattern applied to the trimmed signature line, or "" when there is no
// pattern or no match. Span.Lines is a fresh copy of the covered source lines.
func (RegexSymbolProvider) Symbols(code, lang string) []Span {
	if code == "" {
		return nil
	}

	language := strings.ToLower(strings.TrimSpace(lang))
	if language == "" {
		language = detectLanguage(code)
	}

	boundaryRe := boundaryForLang(language)
	if boundaryRe == nil {
		return nil
	}
	nameRe := symbolReForLang(language)

	src := strings.Split(code, "\n")
	var found []Span
	for first := 0; first < len(src); {
		sig := strings.TrimSpace(src[first])
		if sig == "" || !boundaryRe.MatchString(sig) {
			first++
			continue
		}

		// Walk forward while the signature still has unclosed parentheses.
		last := first
		for open := parenDepth(src[first]); open > 0 && last+1 < len(src); {
			last++
			open += parenDepth(src[last])
		}

		// The symbol name is whichever capture group matched first.
		var name string
		if nameRe != nil {
			if groups := nameRe.FindStringSubmatch(sig); groups != nil {
				for _, g := range groups[1:] {
					if g != "" {
						name = g
						break
					}
				}
			}
		}

		// Copy the lines so the span does not alias the split buffer.
		body := make([]string, last-first+1)
		copy(body, src[first:last+1])

		found = append(found, Span{
			StartLine: first + 1,
			EndLine:   last + 1,
			Symbol:    name,
			Lines:     body,
		})

		// Resume scanning after the lines this span consumed.
		first = last + 1
	}
	return found
}
// parenDepth reports how much a single line changes the parenthesis nesting
// level: the number of "(" it contains minus the number of ")". A positive
// result means the line leaves parentheses open, which Symbols uses to spot
// signatures that continue on the following lines. Every paren character is
// counted, wherever it appears in the line.
func parenDepth(line string) int {
	opened := strings.Count(line, "(")
	closed := strings.Count(line, ")")
	return opened - closed
}
// codeProtector captures the protected signature lines for an input so they can
// be re-injected after compression.
//
// A nil *codeProtector is valid: protectedLines returns nil and guard returns
// the compressed text unchanged.
type codeProtector struct {
// spans are the protected regions reported by the SymbolProvider.
// newCodeProtector only builds a protector when this list is non-empty.
spans []Span
}
// newCodeProtector asks a SymbolProvider for the spans of code that must be
// preserved and wraps them in a codeProtector.
//
// A nil provider is replaced by DefaultSymbolProvider. The result is nil when
// code is empty or when the provider reports no spans; a nil protector is safe
// to use and simply protects nothing.
func newCodeProtector(code, lang string, provider SymbolProvider) *codeProtector {
	if code == "" {
		return nil
	}

	source := provider
	if source == nil {
		source = DefaultSymbolProvider
	}

	if found := source.Symbols(code, lang); len(found) > 0 {
		return &codeProtector{spans: found}
	}
	return nil
}
// protectedLines returns, in span order, every line of every span that is not
// blank. Blankness is decided on the whitespace-trimmed line, but the line
// itself is returned exactly as stored in the span (untrimmed), so callers can
// re-emit it as written. A nil receiver, or spans with no non-blank lines,
// yield nil.
func (p *codeProtector) protectedLines() []string {
	if p == nil {
		return nil
	}

	var kept []string
	for i := range p.spans {
		for _, line := range p.spans[i].Lines {
			if strings.TrimSpace(line) == "" {
				continue // whitespace-only lines carry no signature text
			}
			kept = append(kept, line)
		}
	}
	return kept
}
// guard makes sure every protected line is still represented in the compressed
// output, appending the ones that were dropped.
//
// Presence is tested on whitespace-normalized text: a protected line counts as
// present when its normalized form occurs as a substring of the normalized
// compressed output, so changes in indentation or line wrapping do not cause
// false alarms. Lines whose normalized form is empty are ignored, and a
// missing line is reported only once even if it is protected several times.
//
// When nothing is missing (or the receiver is nil) compressed is returned
// unchanged. Otherwise the result is compressed, a newline if compressed is
// non-empty and does not already end in one, codeAwareMarker on its own line,
// and then the missing lines joined by newlines with trailing spaces and tabs
// removed.
//
// The original parameter is not currently consulted.
func (p *codeProtector) guard(original, compressed string) string {
	if p == nil {
		return compressed
	}

	normalized := normalizeWhitespace(compressed)
	queued := make(map[string]struct{})
	var dropped []string
	for _, line := range p.protectedLines() {
		key := normalizeWhitespace(line)
		if key == "" {
			continue
		}
		_, already := queued[key]
		if already || strings.Contains(normalized, key) {
			continue
		}
		queued[key] = struct{}{}
		// Keep leading indentation; only trailing blanks are stripped.
		dropped = append(dropped, strings.TrimRight(line, " \t"))
	}

	if len(dropped) == 0 {
		return compressed
	}

	out := compressed
	if out != "" && !strings.HasSuffix(out, "\n") {
		out += "\n"
	}
	return out + codeAwareMarker + "\n" + strings.Join(dropped, "\n")
}
// codeAwareMarker labels re-injected signature lines in the output. guard
// writes it on its own line after the compressed text and immediately before
// the lines it re-injects.
const codeAwareMarker = "// [tok:code-aware preserved signatures]"
// wsRe matches a run of one or more whitespace characters.
var wsRe = regexp.MustCompile(`\s+`)

// normalizeWhitespace rewrites s into a canonical single-spaced form: each run
// of whitespace matched by wsRe becomes one space, and leading and trailing
// whitespace is then stripped. guard compares text in this form so that
// indentation and line-wrapping differences do not affect containment checks.
func normalizeWhitespace(s string) string {
	squeezed := wsRe.ReplaceAllLiteralString(s, " ")
	return strings.TrimSpace(squeezed)
}