|
| 1 | +# moodico/products/management/commands/scrape_products.py |
| 2 | + |
## How to run
| 4 | +''' |
| 5 | +python manage.py scrape_products --brands romand,3ce --scroll 4 --limit 5 |
| 6 | +''' |
| 7 | +import os |
| 8 | +import time |
| 9 | +import json |
| 10 | +import uuid |
| 11 | +import requests |
| 12 | +import numpy as np |
| 13 | +from io import BytesIO |
| 14 | +from PIL import Image |
| 15 | +from skimage import color |
| 16 | + |
| 17 | +from django.core.management.base import BaseCommand |
| 18 | +from django.conf import settings |
| 19 | + |
| 20 | +# Selenium |
| 21 | +from selenium import webdriver |
| 22 | +from selenium.webdriver.chrome.service import Service |
| 23 | +from selenium.webdriver.common.by import By |
| 24 | +from webdriver_manager.chrome import ChromeDriverManager |
| 25 | + |
| 26 | + |
# Default scrape targets: one entry per (brand, listing-page URL, category label).
# The category value is copied verbatim into every scraped product dict.
# NOTE(review): category casing is inconsistent ("Lips" vs "blush"/"eyeshadow") —
# confirm downstream consumers of the JSON treat categories case-insensitively.
TARGETS_DEFAULT = [
    {"brand": "romand", "url": "https://romand.co.kr/product/maincatedetail.html?cate_code=289", "category": "Lips"},
    {"brand": "3ce", "url": "https://www.3cecosmetics.com/all-products/lips", "category": "Lips"},
    {"brand": "3ce", "url": "https://www.3cecosmetics.com/all-products/cheeks/blush", "category": "blush"},
    {"brand": "3ce", "url": "https://www.3cecosmetics.com/all-products/eyes/eyeshadow", "category": "eyeshadow"},
]
| 33 | + |
def extract_romand_items(driver, category):
    """Collect product data from a Romand listing page.

    Args:
        driver: Selenium WebDriver already positioned on the listing page.
        category: Category label copied into every result dict.

    Returns:
        List of dicts with keys brand/category/name/color_name/image/price/url.
        Cards missing any expected element are skipped (best-effort scraping).
    """
    scraped = []
    for card in driver.find_elements(By.CSS_SELECTOR, 'li.list_prd_item'):
        try:
            title = card.find_element(By.CSS_SELECTOR, '.prd_title').text
            entry = {
                "brand": "romand",
                "category": category,
                "name": title,
                # shade name follows the last '/' in the product title;
                # split() yields the whole title when no '/' is present
                "color_name": title.split('/')[-1].strip(),
                "image": card.find_element(By.CSS_SELECTOR, 'img').get_attribute('src'),
                "price": card.find_element(By.CSS_SELECTOR, '.current_price').text.strip(),
                "url": card.find_element(By.CSS_SELECTOR, 'a').get_attribute('href'),
            }
        except Exception as exc:  # best-effort: log and skip malformed cards
            print("Romand Error:", exc)
            continue
        scraped.append(entry)
    return scraped
| 56 | + |
def extract_3ce_items(driver, category):
    """Collect product data from a 3CE listing page.

    Args:
        driver: Selenium WebDriver already positioned on the listing page.
        category: Category label copied into every result dict.

    Returns:
        List of dicts with keys brand/category/name/color_name/url/image/price.
        Cards missing any expected element are skipped (best-effort scraping).
    """
    scraped = []
    for card in driver.find_elements(By.CSS_SELECTOR, 'li.tce-grid__item'):
        try:
            title = card.find_element(By.CSS_SELECTOR, 'h2.tce-product-card__name').text.strip()
            link = card.find_element(By.CSS_SELECTOR, 'a.tce-product-card__link').get_attribute("href")
            cost = card.find_element(By.CSS_SELECTOR, '.tce-product-card__price').text.strip()
            img = card.find_element(By.CSS_SELECTOR, 'img.tce-product-card__image').get_attribute("src")

            # the site sometimes serves relative paths; make them absolute
            if link.startswith('/'):
                link = f"https://www.3cecosmetics.com{link}"
            if img.startswith('/'):
                img = f"https://www.3cecosmetics.com{img}"

            scraped.append({
                "brand": "3CE",
                "category": category,
                "name": title,
                # shade name follows the last '/' when present, else the full title
                "color_name": title.split('/')[-1].strip() if '/' in title else title,
                "url": link,
                "image": img,
                "price": cost,
            })
        except Exception as exc:  # best-effort: log and skip malformed cards
            print("3CE Error:", exc)
            continue
    return scraped
| 80 | + |
def extract_average_color(img_url):
    """Download an image and estimate its average foreground color.

    The image is shrunk to 50x50, near-white pixels (all channels > 240,
    i.e. likely background) are discarded, and the remaining pixels are
    averaged. If every pixel is near-white, the full pixel set is used.

    Args:
        img_url: URL of the product image.

    Returns:
        Tuple ``(hex_code, lab_l, lab_a, lab_b)`` — the average color as a
        hex string plus its CIELAB components rounded to 2 decimals.
        On any failure (network, decode, conversion) a black fallback
        ``("#000000", 0.0, 0.0, 0.0)`` is returned so scraping can continue.
    """
    try:
        response = requests.get(img_url, timeout=8)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert('RGB')
        img = img.resize((50, 50))
        pixels = np.asarray(img).reshape(-1, 3)

        # Vectorized background filter (replaces a Python-level per-pixel
        # loop): keep rows where NOT all three channels exceed 240.
        foreground = ~(pixels > 240).all(axis=1)
        filtered = pixels[foreground] if foreground.any() else pixels

        avg_rgb = filtered.mean(axis=0)
        r, g, b = map(int, avg_rgb)
        hex_code = '#{:02x}{:02x}{:02x}'.format(r, g, b)

        # skimage expects float RGB in [0, 1]; shape as a 1x1 "image".
        rgb_norm = np.array([[avg_rgb]]) / 255.0
        lab_l, lab_a, lab_b = color.rgb2lab(rgb_norm)[0][0].round(2)

        return hex_code, float(lab_l), float(lab_a), float(lab_b)
    except Exception as e:
        print(f"[Color Error] {img_url} - {e}")
        return "#000000", 0.0, 0.0, 0.0
| 107 | + |
| 108 | + |
class Command(BaseCommand):
    """Scrape cosmetic products (Romand / 3CE) and dump a sample JSON file.

    Usage:
        python manage.py scrape_products --brands romand,3ce --scroll 4 --limit 5
    """

    help = "Scrape cosmetic products and dump a JSON file under MEDIA_ROOT/data/test_products.json"

    def add_arguments(self, parser):
        parser.add_argument("--scroll", type=int, default=4, help="Scroll count per page (default: 4)")
        parser.add_argument("--headful", action="store_true", help="Run Chrome with UI (not headless)")
        parser.add_argument("--output", default="data/test_products.json",
                            help="Output path under MEDIA_ROOT (default: data/test_products.json)")
        parser.add_argument("--limit", type=int, default=10,
                            help="Number of examples to keep in test JSON (first N + last N, default: 10)")
        parser.add_argument("--brands", default="romand,3ce",
                            help="Comma-separated brands to scrape (romand,3ce). Default: both")

    def handle(self, *args, **opts):
        """Entry point: scrape each selected target, enrich with color, save JSON."""
        scroll_count = opts["scroll"]
        headless = not opts["headful"]
        brands = {b.strip().lower() for b in opts["brands"].split(",") if b.strip()}
        output_rel = opts["output"]
        test_limit = max(1, int(opts["limit"]))

        driver = self._build_driver(headless)

        # Only scrape targets whose brand was requested
        targets = [t for t in TARGETS_DEFAULT if t["brand"].lower() in brands]

        all_products = []
        try:
            for target in targets:
                brand = target["brand"]
                category = target["category"]
                self.stdout.write(f"Scraping {brand} ({category}) ...")

                driver.get(target["url"])
                time.sleep(2)

                # Scroll to trigger lazy-loaded product cards
                for _ in range(scroll_count):
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(2)

                if brand.lower() == "romand":
                    raw_items = extract_romand_items(driver, category)
                elif brand.lower() == "3ce":
                    raw_items = extract_3ce_items(driver, category)
                else:
                    self.stdout.write(self.style.WARNING(f"No extractor for: {brand}"))
                    continue

                self.stdout.write(f" → {len(raw_items)} items found")

                # Enrich each item with a unique id and average-color fields
                for item in raw_items:
                    hex_color, lab_l, lab_a, lab_b = extract_average_color(item["image"])
                    all_products.append({
                        "id": str(uuid.uuid4()),
                        **item,
                        "hex": hex_color,
                        "lab_l": lab_l,
                        "lab_a": lab_a,
                        "lab_b": lab_b,
                    })
        finally:
            driver.quit()

        # Save under MEDIA_ROOT/data/...
        out_path = os.path.join(settings.MEDIA_ROOT, output_rel)
        os.makedirs(os.path.dirname(out_path), exist_ok=True)

        test_products = self._sample(all_products, test_limit)

        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(test_products, f, ensure_ascii=False, indent=2)

        self.stdout.write(self.style.SUCCESS(
            f"Saved {len(test_products)} items to {out_path} (from {len(all_products)} scraped)"))

    @staticmethod
    def _build_driver(headless):
        """Create a Chrome WebDriver (headless unless --headful) with CI-safe flags."""
        options = webdriver.ChromeOptions()
        if headless:
            options.add_argument("--headless=new")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        # webdriver-manager downloads a matching ChromeDriver if needed
        return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    @staticmethod
    def _sample(products, limit):
        """Return the first `limit` + last `limit` products without duplicates.

        Bug fix: the original `products[:limit] + products[-limit:]` repeated
        items whenever len(products) <= 2 * limit (e.g. 5 scraped items with
        the default limit of 10 were each written twice to the JSON).
        """
        if len(products) <= 2 * limit:
            return list(products)
        return products[:limit] + products[-limit:]
0 commit comments