Skip to content

Commit 5d19045

Browse files
seapy and claude committed
feat: 기업 검색에 바이그램 퍼지 검색 폴백 추가 (closes #1)
'마켓컬리'처럼 브랜드명이 법인명과 다를 때 substring 검색에서 결과가 없으면 문자 바이그램 유사도(≥0.3) 기반 퍼지 검색으로 폴백하여 '컬리' 등 관련 기업을 찾아준다. 검색 우선순위: 종목코드 완전일치 → 법인코드 완전일치 → 이름 완전일치 → 부분문자열 → 바이그램 퍼지 (상위 10건) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent e5260cd commit 5d19045

2 files changed

Lines changed: 232 additions & 2 deletions

File tree

internal/cache/corpcode.go

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"net/http"
1111
"net/url"
1212
"os"
13+
"sort"
1314
"strings"
1415
"time"
1516

@@ -102,7 +103,7 @@ func Load(apiKey string) (*Store, bool, error) {
102103
}
103104

104105
// Search finds corporations matching the query.
105-
// Returns exact match first; falls back to substring search.
106+
// Priority: exact stock/corp code → exact name → substring → bigram fuzzy.
106107
func (s *Store) Search(query string) []*CorpInfo {
107108
// Try stock code (exact)
108109
if c, ok := s.byStock[query]; ok {
@@ -124,7 +125,63 @@ func (s *Store) Search(query string) []*CorpInfo {
124125
matches = append(matches, info)
125126
}
126127
}
127-
return matches
128+
if len(matches) > 0 {
129+
return matches
130+
}
131+
// Fuzzy fallback: bigram similarity
132+
return s.fuzzySearch(lower, 0.3, 10)
133+
}
134+
135+
// fuzzySearch returns up to max corps whose name has bigram similarity ≥ threshold
136+
// with query, sorted by score descending.
137+
func (s *Store) fuzzySearch(query string, threshold float64, max int) []*CorpInfo {
138+
type scored struct {
139+
info *CorpInfo
140+
score float64
141+
}
142+
var results []scored
143+
for _, info := range s.All {
144+
score := bigramSim(query, strings.ToLower(info.CorpName))
145+
if score >= threshold {
146+
results = append(results, scored{info, score})
147+
}
148+
}
149+
sort.Slice(results, func(i, j int) bool {
150+
return results[i].score > results[j].score
151+
})
152+
if len(results) > max {
153+
results = results[:max]
154+
}
155+
out := make([]*CorpInfo, len(results))
156+
for i, r := range results {
157+
out[i] = r.info
158+
}
159+
return out
160+
}
161+
162+
// bigramSim reports what fraction of the adjacent rune pairs (bigrams) in
// query can also be found in target.
//
// Both strings are split into runes rather than bytes, so multi-byte text such
// as Korean is paired character by character. Matching is multiset-based: each
// bigram occurrence in target can satisfy at most one bigram occurrence in
// query. A query shorter than two runes has no bigrams and scores 0.
func bigramSim(query, target string) float64 {
	queryRunes := []rune(query)
	targetRunes := []rune(target)

	// Number of bigrams in the query; also the denominator of the score.
	total := len(queryRunes) - 1
	if total < 1 {
		return 0
	}

	// Tally how many times each bigram occurs in the target.
	available := make(map[[2]rune]int, len(targetRunes))
	for i := 1; i < len(targetRunes); i++ {
		available[[2]rune{targetRunes[i-1], targetRunes[i]}]++
	}

	// Consume one target occurrence for every query bigram that is present.
	hits := 0
	for i := 1; i < len(queryRunes); i++ {
		pair := [2]rune{queryRunes[i-1], queryRunes[i]}
		if left := available[pair]; left > 0 {
			available[pair] = left - 1
			hits++
		}
	}
	return float64(hits) / float64(total)
}
129186

130187
// Status returns cache file info.

internal/cache/corpcode_test.go

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
package cache
2+
3+
import (
4+
"testing"
5+
)
6+
7+
// testStore returns a Store built from representative sample data.
8+
func testStore() *Store {
9+
corps := []*CorpInfo{
10+
{CorpCode: "00126380", CorpName: "삼성전자", StockCode: "005930"},
11+
{CorpCode: "01153956", CorpName: "컬리"},
12+
{CorpCode: "01494172", CorpName: "컬리넥스트마일"},
13+
{CorpCode: "01547845", CorpName: "당근마켓"},
14+
{CorpCode: "01717824", CorpName: "당근페이"},
15+
{CorpCode: "01138364", CorpName: "더핑크퐁컴퍼니", StockCode: "403850"},
16+
{CorpCode: "00293886", CorpName: "카카오"},
17+
{CorpCode: "00000002", CorpName: "카카오뱅크"},
18+
{CorpCode: "00000003", CorpName: "카카오페이"},
19+
{CorpCode: "01154811", CorpName: "주식회사 오늘의집"},
20+
}
21+
return buildStore(corps)
22+
}
23+
24+
func corpNames(corps []*CorpInfo) []string {
25+
out := make([]string, len(corps))
26+
for i, c := range corps {
27+
out[i] = c.CorpName
28+
}
29+
return out
30+
}
31+
32+
// --- exact match tests ---
33+
34+
func TestSearch_ExactStockCode(t *testing.T) {
35+
s := testStore()
36+
results := s.Search("005930")
37+
if len(results) != 1 || results[0].CorpName != "삼성전자" {
38+
t.Fatalf("종목코드 완전일치 실패: got %v", corpNames(results))
39+
}
40+
}
41+
42+
func TestSearch_ExactCorpCode(t *testing.T) {
43+
s := testStore()
44+
results := s.Search("00126380")
45+
if len(results) != 1 || results[0].CorpName != "삼성전자" {
46+
t.Fatalf("법인코드 완전일치 실패: got %v", corpNames(results))
47+
}
48+
}
49+
50+
func TestSearch_ExactName(t *testing.T) {
51+
s := testStore()
52+
results := s.Search("삼성전자")
53+
if len(results) != 1 || results[0].CorpName != "삼성전자" {
54+
t.Fatalf("이름 완전일치 실패: got %v", corpNames(results))
55+
}
56+
}
57+
58+
// --- substring tests ---
59+
60+
func TestSearch_Substring_당근(t *testing.T) {
61+
s := testStore()
62+
results := s.Search("당근")
63+
if len(results) != 2 {
64+
t.Fatalf("'당근' substring 검색: 2건 기대, got %d %v", len(results), corpNames(results))
65+
}
66+
}
67+
68+
func TestSearch_Substring_카카오(t *testing.T) {
69+
// "카카오" 이름 완전일치가 존재하므로 정확히 1건만 반환 (substring 전에 리턴)
70+
s := testStore()
71+
results := s.Search("카카오")
72+
if len(results) != 1 || results[0].CorpName != "카카오" {
73+
t.Fatalf("'카카오' 이름 완전일치: 1건 기대, got %d %v", len(results), corpNames(results))
74+
}
75+
}
76+
77+
func TestSearch_Substring_카카오계열(t *testing.T) {
78+
// "카카오뱅" 처럼 완전일치 없는 쿼리는 substring 으로 카카오뱅크를 찾음
79+
s := testStore()
80+
results := s.Search("카카오뱅")
81+
if len(results) != 1 || results[0].CorpName != "카카오뱅크" {
82+
t.Fatalf("'카카오뱅' substring 검색: 카카오뱅크 기대, got %v", corpNames(results))
83+
}
84+
}
85+
86+
func TestSearch_Substring_핑크퐁(t *testing.T) {
87+
s := testStore()
88+
results := s.Search("핑크퐁")
89+
if len(results) != 1 || results[0].CorpName != "더핑크퐁컴퍼니" {
90+
t.Fatalf("'핑크퐁' substring 검색 실패: got %v", corpNames(results))
91+
}
92+
}
93+
94+
// --- fuzzy tests ---
95+
96+
func TestSearch_Fuzzy_마켓컬리(t *testing.T) {
97+
s := testStore()
98+
results := s.Search("마켓컬리")
99+
if len(results) == 0 {
100+
t.Fatal("'마켓컬리' fuzzy 검색: 결과 없음 (기대: '컬리' 포함)")
101+
}
102+
found := false
103+
for _, r := range results {
104+
if r.CorpName == "컬리" {
105+
found = true
106+
}
107+
}
108+
if !found {
109+
t.Fatalf("'마켓컬리' fuzzy 결과에 '컬리' 없음: got %v", corpNames(results))
110+
}
111+
t.Logf("'마켓컬리' fuzzy 결과: %v", corpNames(results))
112+
}
113+
114+
func TestSearch_Fuzzy_오늘의집(t *testing.T) {
115+
// "오늘의집" → substring 으로도 찾힘, fuzzy 로도 찾혀야 함
116+
s := testStore()
117+
results := s.Search("오늘의집")
118+
if len(results) == 0 {
119+
t.Fatal("'오늘의집' 검색: 결과 없음")
120+
}
121+
found := false
122+
for _, r := range results {
123+
if r.CorpName == "주식회사 오늘의집" {
124+
found = true
125+
}
126+
}
127+
if !found {
128+
t.Fatalf("'오늘의집' 결과에 '주식회사 오늘의집' 없음: got %v", corpNames(results))
129+
}
130+
}
131+
132+
func TestSearch_NoResult_완전엉뚱한검색어(t *testing.T) {
133+
s := testStore()
134+
results := s.Search("xyzxyz없는기업명zyx")
135+
if len(results) != 0 {
136+
t.Logf("'xyzxyz없는기업명zyx' 검색 결과 (fuzzy 허용): %v", corpNames(results))
137+
}
138+
}
139+
140+
// --- bigramSim unit tests ---
141+
142+
func TestBigramSim_완전동일(t *testing.T) {
143+
score := bigramSim("삼성전자", "삼성전자")
144+
if score != 1.0 {
145+
t.Fatalf("동일 문자열 bigramSim: 1.0 기대, got %f", score)
146+
}
147+
}
148+
149+
func TestBigramSim_부분겹침(t *testing.T) {
150+
// "마켓컬리" bigrams: [마켓, 켓컬, 컬리]
151+
// "컬리" bigrams: [컬리] → 1 match / 3 query bigrams = 0.333
152+
score := bigramSim("마켓컬리", "컬리")
153+
if score < 0.3 {
154+
t.Fatalf("'마켓컬리' vs '컬리': 0.3 이상 기대, got %f", score)
155+
}
156+
}
157+
158+
func TestBigramSim_핑크퐁(t *testing.T) {
159+
// "핑크퐁" bigrams: [핑크, 크퐁]
160+
// "더핑크퐁컴퍼니" bigrams: [더핑, 핑크, 크퐁, 퐁컴, 컴퍼, 퍼니] → 2 match / 2 = 1.0
161+
score := bigramSim("핑크퐁", "더핑크퐁컴퍼니")
162+
if score != 1.0 {
163+
t.Fatalf("'핑크퐁' vs '더핑크퐁컴퍼니': 1.0 기대, got %f", score)
164+
}
165+
}
166+
167+
func TestBigramSim_단일문자(t *testing.T) {
168+
// 단일 rune은 bigram 없음 → 0
169+
score := bigramSim("가", "가나다라")
170+
if score != 0 {
171+
t.Fatalf("단일 문자 bigramSim: 0 기대, got %f", score)
172+
}
173+
}

0 commit comments

Comments
 (0)