Skip to content

Commit 8f42dfe

Browse files
committed
[feat] 제품 스크래핑 및 클러스터 생성 management command 추가
1 parent 378f953 commit 8f42dfe

4 files changed

Lines changed: 357 additions & 0 deletions

File tree

moodico/products/management/__init__.py

Whitespace-only changes.

moodico/products/management/commands/__init__.py

Whitespace-only changes.
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
# moodico/products/management/commands/generate_clusters.py
2+
3+
# How to run:
4+
'''
5+
python manage.py generate_clusters --input data/all_products.json
6+
'''
7+
import os
8+
import json
9+
import numpy as np
10+
from django.core.management.base import BaseCommand, CommandError
11+
from django.conf import settings
12+
from django.contrib.staticfiles import finders
13+
from sklearn.cluster import KMeans
14+
from sklearn.metrics import silhouette_score
15+
from sklearn.preprocessing import StandardScaler
16+
17+
# --- helpers from your script ---
18+
def hex_to_rgb(hex_code):
    """Convert a hex color string to an ``(r, g, b)`` tuple of 0-255 ints.

    Accepts both 6-digit (``#aabbcc``) and 3-digit shorthand (``#abc``)
    forms, with or without the leading ``#``.

    Raises:
        ValueError: if the string is not valid hexadecimal.
    """
    # NOTE(review): renamed the parameter from `hex`, which shadowed the
    # builtin `hex()` function; positional call sites are unaffected.
    hex_code = hex_code.lstrip('#')
    if len(hex_code) == 3:
        # Expand shorthand: each digit doubles ('abc' -> 'aabbcc').
        hex_code = ''.join(c * 2 for c in hex_code)
    r = int(hex_code[:2], 16)
    g = int(hex_code[2:4], 16)
    b = int(hex_code[4:], 16)
    return r, g, b
26+
27+
def rgb_to_hsl(r, g, b):
    """Convert 8-bit RGB to HSL: hue in degrees, saturation/lightness in 0..1."""
    rn, gn, bn = r / 255, g / 255, b / 255
    hi = max(rn, gn, bn)
    lo = min(rn, gn, bn)
    light = (hi + lo) / 2
    if hi == lo:
        # Achromatic pixel: hue and saturation collapse to zero.
        hue = sat = 0
    else:
        delta = hi - lo
        sat = delta / (2 - hi - lo) if light > 0.5 else delta / (hi + lo)
        # Hue sector depends on which channel dominates.
        if hi == rn:
            hue = (gn - bn) / delta + (6 if gn < bn else 0)
        elif hi == gn:
            hue = (bn - rn) / delta + 2
        else:
            hue = (rn - gn) / delta + 4
        hue /= 6
    return hue * 360, sat, light
44+
45+
def calculate_coordinates(h, s, l):
46+
if h >= 330 or h < 60:
47+
if h >= 330:
48+
h -= 360
49+
warm_cool_score = (h + 30) / 90
50+
elif 60 <= h < 180:
51+
warm_cool_score = 1 - ((h - 60) / 120)
52+
elif 180 <= h < 300:
53+
warm_cool_score = -((h - 180) / 120)
54+
else:
55+
warm_cool_score = -1 + ((h - 300) / 30)
56+
57+
if s < 0.05:
58+
warm_cool_score = 0
59+
else:
60+
warm_cool_score *= s**0.8
61+
62+
if l < 0.1 or l > 0.9:
63+
warm_cool_score *= (1 - ((abs(0.5 - l) * 2)**2))
64+
65+
final_warm = (warm_cool_score + 1) * 50
66+
final_warm = max(0, min(100, final_warm))
67+
final_deep = (1 - l) * 100
68+
69+
return round(final_warm, 2), round(final_deep, 2)
70+
71+
class Command(BaseCommand):
    """Cluster product colors with KMeans and write JSON results under MEDIA_ROOT/data."""

    help = "Generate color clusters from product JSON and save to MEDIA_ROOT/data."

    def add_arguments(self, parser):
        """Register CLI options: --input, --clusters, --no-silhouette."""
        parser.add_argument(
            "--input",
            default="data/test_products.json",
            help="Input JSON path relative to a static dir (e.g., data/test_products.json).",
        )
        parser.add_argument(
            "--clusters",
            type=int,
            default=4,
            help="Number of KMeans clusters (default: 4).",
        )
        parser.add_argument(
            "--no-silhouette",
            action="store_true",
            help="Skip silhouette score sweep.",
        )

    def handle(self, *args, **opts):
        """Load product JSON, derive color coordinates, run KMeans, save outputs.

        Raises:
            CommandError: if the input file cannot be located, or if no
                products carry usable color data.
        """
        input_rel = opts["input"]
        n_clusters = opts["clusters"]
        skip_sil = opts["no_silhouette"]

        # 1) Find input JSON via staticfiles finder; fallback to BASE_DIR/static
        src = finders.find(input_rel)
        if not src:
            fallback = os.path.join(settings.BASE_DIR, "static", input_rel)
            if os.path.exists(fallback):
                src = fallback
        if not src:
            raise CommandError(f"Input not found: {input_rel}")

        with open(src, "r", encoding="utf-8") as f:
            products = json.load(f)

        # Build the feature matrix: [warmCool, lightDeep, L, a, b] per product.
        coordinates = []
        valid_products = []
        for p in products:
            hex_color = p.get("hex")
            if not hex_color:
                continue
            try:
                r, g, b = hex_to_rgb(hex_color)
                h, s, l = rgb_to_hsl(r, g, b)
                warm, deep = calculate_coordinates(h, s, l)
                # LAB components default to 0 when absent so the row stays usable.
                lab_l = p.get("lab_l", 0)
                lab_a = p.get("lab_a", 0)
                lab_b = p.get("lab_b", 0)

                p["warmCool"] = warm
                p["lightDeep"] = deep
                coordinates.append([warm, deep, lab_l, lab_a, lab_b])
                valid_products.append(p)
            except Exception:
                # Malformed color data: skip the product rather than abort the run.
                continue

        if not coordinates:
            raise CommandError("No valid products with color data found.")

        # 2) Cluster — standardize first, then fit
        coords_np = np.array(coordinates, dtype=float)
        coords_std = StandardScaler().fit_transform(coords_np)

        # Fixed random_state keeps cluster assignments reproducible across runs.
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
        labels = kmeans.fit_predict(coords_std)

        for i, label in enumerate(labels):
            # int() so the label is JSON-serializable (numpy ints are not).
            valid_products[i]["cluster"] = int(label)

        # 3) Save outputs under MEDIA_ROOT/data (writable in prod)
        out_dir = os.path.join(settings.MEDIA_ROOT, "data")
        os.makedirs(out_dir, exist_ok=True)

        products_out = os.path.join(out_dir, "products_clustered.json")
        with open(products_out, "w", encoding="utf-8") as f:
            json.dump(valid_products, f, ensure_ascii=False, indent=2)

        centers_out = os.path.join(out_dir, "cluster_centers.json")
        with open(centers_out, "w", encoding="utf-8") as f:
            json.dump(kmeans.cluster_centers_.tolist(), f, ensure_ascii=False, indent=2)

        self.stdout.write(self.style.SUCCESS(f"Wrote:\n {products_out}\n {centers_out}"))

        # 4) Optional: silhouette sweep (k=2..10 or up to len-1)
        # silhouette_score needs 2 <= k <= n_samples - 1, hence the cap below.
        if not skip_sil:
            max_k = min(len(coords_std) - 1, 10)
            for k in range(2, max_k + 1):
                mdl = KMeans(n_clusters=k, random_state=42, n_init='auto')
                lab = mdl.fit_predict(coords_std)
                sc = silhouette_score(coords_std, lab)
                self.stdout.write(f"k={k} Silhouette={sc:.4f}")
Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
# moodico/products/management/commands/scrape_products.py
2+
3+
## How to run:
4+
'''
5+
python manage.py scrape_products --brands romand,3ce --scroll 4 --limit 5
6+
'''
7+
import os
8+
import time
9+
import json
10+
import uuid
11+
import requests
12+
import numpy as np
13+
from io import BytesIO
14+
from PIL import Image
15+
from skimage import color
16+
17+
from django.core.management.base import BaseCommand
18+
from django.conf import settings
19+
20+
# Selenium
21+
from selenium import webdriver
22+
from selenium.webdriver.chrome.service import Service
23+
from selenium.webdriver.common.by import By
24+
from webdriver_manager.chrome import ChromeDriverManager
25+
26+
27+
# Default scrape targets: brand identifier, listing-page URL, and the
# category label stored on each scraped product.
TARGETS_DEFAULT = [
    {"brand": "romand", "url": "https://romand.co.kr/product/maincatedetail.html?cate_code=289", "category": "Lips"},
    {"brand": "3ce", "url": "https://www.3cecosmetics.com/all-products/lips", "category": "Lips"},
    {"brand": "3ce", "url": "https://www.3cecosmetics.com/all-products/cheeks/blush", "category": "blush"},
    {"brand": "3ce", "url": "https://www.3cecosmetics.com/all-products/eyes/eyeshadow", "category": "eyeshadow"},
]
33+
34+
def extract_romand_items(driver, category):
    """Scrape product cards from a Romand listing page already loaded in *driver*.

    Returns a list of dicts with brand/category/name/color_name/image/price/url.
    Cards missing any expected element are logged and skipped.
    """
    results = []
    for card in driver.find_elements(By.CSS_SELECTOR, 'li.list_prd_item'):
        try:
            title = card.find_element(By.CSS_SELECTOR, '.prd_title').text
            img_src = card.find_element(By.CSS_SELECTOR, 'img').get_attribute('src')
            price_text = card.find_element(By.CSS_SELECTOR, '.current_price').text.strip()
            link = card.find_element(By.CSS_SELECTOR, 'a').get_attribute('href')
        except Exception as e:
            print("Romand Error:", e)
            continue
        results.append({
            "brand": "romand",
            "category": category,
            "name": title,
            # Romand encodes the shade after the last '/' in the title.
            "color_name": title.split('/')[-1].strip(),
            "image": img_src,
            "price": price_text,
            "url": link,
        })
    return results
56+
57+
def extract_3ce_items(driver, category):
    """Scrape product cards from a 3CE listing page already loaded in *driver*.

    Relative hrefs/image srcs are absolutized against the 3CE domain.
    Cards missing any expected element are logged and skipped.
    """
    collected = []
    cards = driver.find_elements(By.CSS_SELECTOR, 'li.tce-grid__item')
    for card in cards:
        try:
            title = card.find_element(By.CSS_SELECTOR, 'h2.tce-product-card__name').text.strip()
            href = card.find_element(By.CSS_SELECTOR, 'a.tce-product-card__link').get_attribute("href")
            price_text = card.find_element(By.CSS_SELECTOR, '.tce-product-card__price').text.strip()
            img_src = card.find_element(By.CSS_SELECTOR, 'img.tce-product-card__image').get_attribute("src")

            entry = {
                "brand": "3CE",
                "category": category,
                "name": title,
                # Shade name follows the last '/', when present.
                "color_name": title.split('/')[-1].strip() if '/' in title else title,
                "url": f"https://www.3cecosmetics.com{href}" if href.startswith('/') else href,
                "image": f"https://www.3cecosmetics.com{img_src}" if img_src.startswith('/') else img_src,
                "price": price_text,
            }
            collected.append(entry)
        except Exception as e:
            print("3CE Error:", e)
            continue
    return collected
80+
81+
def extract_average_color(img_url):
    """Remove near-white background, compute average color; return (hex, L, a, b).

    Downloads the image, downsamples to 50x50, averages the non-background
    pixels, and converts the mean to hex plus CIELAB. On any failure the
    sentinel ("#000000", 0.0, 0.0, 0.0) is returned and the error is logged.
    """
    try:
        resp = requests.get(img_url, timeout=8)
        resp.raise_for_status()
        thumb = Image.open(BytesIO(resp.content)).convert('RGB').resize((50, 50))
        pixels = np.array(thumb).reshape(-1, 3)

        # Treat pixels with every channel above 240 as background; if the
        # whole image is that bright, fall back to averaging everything.
        kept = [px for px in pixels if not all(c > 240 for c in px)]
        if not kept:
            kept = pixels

        mean_rgb = np.array(kept).mean(axis=0)
        r, g, b = map(int, mean_rgb)
        hex_code = '#{:02x}{:02x}{:02x}'.format(r, g, b)

        # rgb2lab expects an (H, W, 3) float image in [0, 1].
        lab = color.rgb2lab(np.array([[mean_rgb]]) / 255.0)[0][0]
        lab_l, lab_a, lab_b = lab.round(2)
        return hex_code, float(lab_l), float(lab_a), float(lab_b)
    except Exception as e:
        print(f"[Color Error] {img_url} - {e}")
        return "#000000", 0.0, 0.0, 0.0
107+
108+
109+
class Command(BaseCommand):
    """Scrape cosmetic products (Romand / 3CE) with Selenium and dump JSON."""

    help = "Scrape cosmetic products and dump a JSON file under MEDIA_ROOT/data/test_products.json"

    def add_arguments(self, parser):
        """Register CLI options controlling scrolling, output, sampling, brands."""
        parser.add_argument("--scroll", type=int, default=4, help="Scroll count per page (default: 4)")
        parser.add_argument("--headful", action="store_true", help="Run Chrome with UI (not headless)")
        parser.add_argument("--output", default="data/test_products.json",
                            help="Output path under MEDIA_ROOT (default: data/test_products.json)")
        parser.add_argument("--limit", type=int, default=10,
                            help="Number of examples to keep in test JSON (first N + last N, default: 10)")
        parser.add_argument("--brands", default="romand,3ce",
                            help="Comma-separated brands to scrape (romand,3ce). Default: both")

    def handle(self, *args, **opts):
        """Drive the scrape: load each target page, extract items, enrich with color, dump JSON."""
        scroll_count = opts["scroll"]
        headless = not opts["headful"]
        brands = {b.strip().lower() for b in opts["brands"].split(",") if b.strip()}
        output_rel = opts["output"]
        test_limit = max(1, int(opts["limit"]))

        # Selenium options
        options = webdriver.ChromeOptions()
        if headless:
            options.add_argument("--headless=new")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")

        # Driver (downloads matching ChromeDriver if needed)
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        # Targets restricted to the requested brands.
        targets = [t for t in TARGETS_DEFAULT if t["brand"].lower() in brands]

        all_products = []
        try:
            for target in targets:
                brand = target["brand"]
                url = target["url"]
                category = target["category"]
                self.stdout.write(f"Scraping {brand} ({category}) ...")

                driver.get(url)
                time.sleep(2)

                # Scroll to trigger lazy-loading of additional products.
                for _ in range(scroll_count):
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(2)

                if brand.lower() == "romand":
                    raw_items = extract_romand_items(driver, category)
                elif brand.lower() == "3ce":
                    raw_items = extract_3ce_items(driver, category)
                else:
                    self.stdout.write(self.style.WARNING(f"No extractor for: {brand}"))
                    continue

                self.stdout.write(f" → {len(raw_items)} items found")

                # enrich with color
                for item in raw_items:
                    hex_color, lab_l, lab_a, lab_b = extract_average_color(item["image"])
                    product = {
                        "id": str(uuid.uuid4()),
                        **item,
                        "hex": hex_color,
                        "lab_l": lab_l,
                        "lab_a": lab_a,
                        "lab_b": lab_b,
                    }
                    all_products.append(product)
        finally:
            # Always release the browser, even if a page blows up mid-scrape.
            driver.quit()

        # Save under MEDIA_ROOT/data/...
        out_path = os.path.join(settings.MEDIA_ROOT, output_rel)
        os.makedirs(os.path.dirname(out_path), exist_ok=True)

        # Keep a small test set: first N + last N. FIX: the previous
        # unconditional slicing duplicated every product whenever fewer than
        # 2*limit items were scraped (e.g. 5 items with the default limit of
        # 10 were each written twice).
        if len(all_products) <= 2 * test_limit:
            test_products = all_products
        else:
            test_products = all_products[:test_limit] + all_products[-test_limit:]

        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(test_products, f, ensure_ascii=False, indent=2)

        self.stdout.write(self.style.SUCCESS(
            f"Saved {len(test_products)} items to {out_path} (from {len(all_products)} scraped)"))

0 commit comments

Comments
 (0)