6 changes: 6 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,6 @@
{
"python.defaultInterpreterPath": "d:\\gsoc\\Perspective\\backend\\.venv\\Scripts\\python.exe",
"python.analysis.extraPaths": [
"d:\\gsoc\\Perspective\\backend"
]
}
1 change: 1 addition & 0 deletions backend/.pyre_configuration
@@ -0,0 +1 @@
{ "source_directories": ["."], "search_path": ["d:/gsoc/Perspective/backend/.venv/Lib/site-packages"] }
Copilot AI Mar 8, 2026
This file appears to include a developer-local absolute site-packages path (and is currently a single-line JSON with a Windows drive). Project-level Pyre configs should be portable; remove the machine-specific search_path or make it relative so it works across environments/CI.

Suggested change
{ "source_directories": ["."], "search_path": ["d:/gsoc/Perspective/backend/.venv/Lib/site-packages"] }
{ "source_directories": ["."] }

Binary file added backend/app/db/perspective_cache.db
Binary file not shown.
Binary file added backend/app/db/perspective_cache.db-shm
Binary file not shown.
Binary file added backend/app/db/perspective_cache.db-wal
Binary file not shown.
191 changes: 191 additions & 0 deletions backend/app/db/sqlite_cache.py
@@ -0,0 +1,191 @@
"""
sqlite_cache.py
---------------
SQLite cache for article metadata and generated lens perspectives.

Improvements over v1:
- WAL (Write-Ahead Logging) mode: significantly faster concurrent reads/writes
- Connection pool via threading.local() — one connection per thread, not per call
- TTL expiry: articles expire after 7 days, perspectives after 30 days
- Cache size limit: auto-evicts the oldest articles beyond MAX_ARTICLES
- Indexes on url_hash + lens for fast lookups
- PRAGMA optimisations: cache_size, synchronous=NORMAL, temp_store=MEMORY
"""

import sqlite3
import hashlib
import json
import os
import threading
from app.logging.logging_config import setup_logger

logger = setup_logger(__name__)

DB_PATH = os.path.join(os.path.dirname(__file__), "perspective_cache.db")

# TTL in seconds
ARTICLE_TTL_SECS = 7 * 24 * 3600 # 7 days
PERSPECTIVE_TTL_SECS = 30 * 24 * 3600 # 30 days
MAX_ARTICLES = 500 # evict oldest beyond this

# Thread-local connection pool
_local = threading.local()


def get_connection() -> sqlite3.Connection:
"""
Return a per-thread SQLite connection.
Creates the connection with WAL mode and performance PRAGMAs on first use.
"""
if not hasattr(_local, "conn") or _local.conn is None:
conn = sqlite3.connect(DB_PATH, check_same_thread=False)
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA journal_mode=WAL")
conn.execute("PRAGMA synchronous=NORMAL")
conn.execute("PRAGMA cache_size=-8000") # 8 MB page cache
conn.execute("PRAGMA temp_store=MEMORY")
conn.execute("PRAGMA mmap_size=134217728") # 128 MB memory-mapped I/O
_local.conn = conn
return _local.conn


def init_db():
"""Create tables and indexes if they don't already exist."""
with get_connection() as conn:
conn.executescript("""
CREATE TABLE IF NOT EXISTS article_cache (
url_hash TEXT PRIMARY KEY,
url TEXT NOT NULL,
cleaned_text TEXT,
summary TEXT,
main_claim TEXT,
entities TEXT,
tone TEXT,
key_points TEXT,
created_at REAL DEFAULT (strftime('%s','now'))
);

CREATE TABLE IF NOT EXISTS perspective_cache (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url_hash TEXT NOT NULL,
lens TEXT NOT NULL,
content TEXT NOT NULL,
created_at REAL DEFAULT (strftime('%s','now')),
UNIQUE(url_hash, lens)
);

-- Indexes for fast lookups (idempotent)
CREATE INDEX IF NOT EXISTS idx_article_hash ON article_cache(url_hash);
CREATE INDEX IF NOT EXISTS idx_persp_hash_lens ON perspective_cache(url_hash, lens);
CREATE INDEX IF NOT EXISTS idx_article_age ON article_cache(created_at);
""")
logger.info("SQLite cache initialised (WAL mode).")


def url_to_hash(url: str) -> str:
return hashlib.sha256(url.strip().encode()).hexdigest()


# ── Article cache ─────────────────────────────────────────────────────────────


def get_cached_article(url: str) -> dict | None:
url_hash = url_to_hash(url)
conn = get_connection()
row = conn.execute(
"""SELECT * FROM article_cache
WHERE url_hash = ?
AND (strftime('%s','now') - created_at) < ?""",
(url_hash, ARTICLE_TTL_SECS),
).fetchone()
if row:
data = dict(row)
data["entities"] = json.loads(data.get("entities") or "[]")
data["key_points"] = json.loads(data.get("key_points") or "[]")
return data
return None


def save_article_cache(url: str, article_data: dict):
url_hash = url_to_hash(url)
conn = get_connection()
conn.execute(
"""
INSERT OR REPLACE INTO article_cache
(url_hash, url, cleaned_text, summary, main_claim, entities, tone, key_points)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
""",
(
url_hash,
url,
article_data.get("cleaned_text", ""),
article_data.get("summary", ""),
article_data.get("main_claim", ""),
json.dumps(article_data.get("entities", [])),
article_data.get("tone", ""),
json.dumps(article_data.get("key_points", [])),
),
)
conn.commit()
_evict_old_articles(conn)
logger.debug(f"Article cached: {url_hash[:12]}…")


def _evict_old_articles(conn: sqlite3.Connection):
"""Delete oldest articles beyond MAX_ARTICLES to keep the DB lean."""
count = conn.execute("SELECT COUNT(*) FROM article_cache").fetchone()[0]
if count > MAX_ARTICLES:
excess = count - MAX_ARTICLES
conn.execute(
"""
DELETE FROM article_cache WHERE url_hash IN (
SELECT url_hash FROM article_cache
ORDER BY created_at ASC LIMIT ?
)
""",
(excess,),
)
conn.commit()
logger.info(f"Cache eviction: removed {excess} oldest articles")


# ── Perspective cache ─────────────────────────────────────────────────────────


def get_cached_perspective(url: str, lens: str) -> str | None:
url_hash = url_to_hash(url)
conn = get_connection()
row = conn.execute(
"""SELECT content FROM perspective_cache
WHERE url_hash = ? AND lens = ?
AND (strftime('%s','now') - created_at) < ?""",
(url_hash, lens, PERSPECTIVE_TTL_SECS),
).fetchone()
return row["content"] if row else None


def save_perspective_cache(url: str, lens: str, content: str):
url_hash = url_to_hash(url)
conn = get_connection()
conn.execute(
"""
INSERT OR REPLACE INTO perspective_cache (url_hash, lens, content)
VALUES (?, ?, ?)
""",
(url_hash, lens, content),
)
conn.commit()
logger.debug(f"Perspective cached: lens={lens}, hash={url_hash[:12]}…")


def get_all_cached_perspectives(url: str) -> dict:
"""Return {lens: content} for all non-expired cached lenses for this URL."""
url_hash = url_to_hash(url)
conn = get_connection()
rows = conn.execute(
"""SELECT lens, content FROM perspective_cache
WHERE url_hash = ?
AND (strftime('%s','now') - created_at) < ?""",
(url_hash, PERSPECTIVE_TTL_SECS),
).fetchall()
return {row["lens"]: row["content"] for row in rows}
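
For reviewers, a minimal usage sketch of the new cache module (a sketch only: it assumes backend/ is on PYTHONPATH so that app.db.sqlite_cache and its logger import cleanly, and the URL and article fields below are hypothetical illustration values):

# Sketch: exercise the article and perspective caches end to end.
from app.db.sqlite_cache import (
    init_db,
    save_article_cache,
    get_cached_article,
    save_perspective_cache,
    get_all_cached_perspectives,
)

init_db()  # idempotent: creates tables/indexes and enables WAL on first use

url = "https://example.com/some-article"  # hypothetical URL
save_article_cache(url, {
    "cleaned_text": "Full cleaned article body...",
    "summary": "Two to three sentence objective summary.",
    "main_claim": "The article's central thesis in one sentence.",
    "entities": ["Example Org", "Jane Doe"],
    "tone": "neutral",
    "key_points": ["First insight", "Second insight", "Third insight"],
})

article = get_cached_article(url)  # dict with entities/key_points decoded, or None once the 7-day TTL lapses
save_perspective_cache(url, "skeptical", "A skeptical reading of the article...")
print(get_all_cached_perspectives(url))  # {"skeptical": "A skeptical reading of the article..."}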
3 changes: 2 additions & 1 deletion backend/app/logging/logging_config.py
@@ -1,6 +1,7 @@
import logging
import sys


def setup_logger(name: str) -> logging.Logger:
"""
Creates and configures a logger with console + file output.
@@ -21,7 +22,7 @@ def setup_logger(name: str) -> logging.Logger:
# Formatter with timestamp, log level, module name
formatter = logging.Formatter(
"[%(asctime)s] [%(levelname)s] [%(name)s]: %(message)s",
datefmt="%Y-%m-%d %H:%M:%S"
datefmt="%Y-%m-%d %H:%M:%S",
)

# Console Handler
Empty file.
129 changes: 129 additions & 0 deletions backend/app/modules/article_extractor/extract_metadata.py
@@ -0,0 +1,129 @@
"""
extract_metadata.py
-------------------
Extracts structured article metadata using Groq LLM (Llama 3.3 70B).

Improvements:
- Retry logic: up to 2 attempts on JSON parse failure
- More specific prompt with examples for each field
- Robust JSON cleaning (strips fences, BOM, leading junk)
- Type normalisation for entities and key_points

Returns a dict with:
summary - 2-3 sentence objective summary
main_claim - The central argument/thesis in one sentence
entities - Key named entities (people, orgs, places)
tone - One-word tone descriptor
key_points - 3-5 bullet-point insights
"""

import os
import json
import re
import time
from groq import Groq
from dotenv import load_dotenv
from app.logging.logging_config import setup_logger

load_dotenv()
logger = setup_logger(__name__)

client = Groq(api_key=os.getenv("GROQ_API_KEY"))

_FALLBACK = {
"summary": "",
"main_claim": "",
"entities": [],
"tone": "neutral",
"key_points": [],
}

METADATA_PROMPT = """Analyze the following article excerpt and return a JSON object. Be precise and factual.

Fields required:
- "summary": String. A 2-3 sentence objective summary of what the article is about.
- "main_claim": String. The single central argument or thesis in ONE sentence.
- "entities": Array of strings. Key named people, organisations, countries, or technologies mentioned.
- "tone": String. ONE word describing the overall tone. Choose from: alarmist, optimistic, critical, neutral, celebratory, authoritative, speculative, alarming, hopeful.
- "key_points": Array of strings. Exactly 3-5 concise insights or findings from the article.

Rules:
- Return ONLY the JSON object. No markdown code fences, no explanations, no extra text.
- If you cannot determine a field, use an empty string or empty array.

Article:
{text}
"""


def _clean_json_string(raw: str) -> str:
"""Remove common wrapping/junk around LLM JSON output."""
raw = raw.strip().lstrip("\ufeff") # strip BOM
# Remove markdown fences
raw = re.sub(r"^```(?:json)?\s*", "", raw, flags=re.IGNORECASE)
raw = re.sub(r"\s*```$", "", raw)
return raw.strip()


def extract_article_metadata(cleaned_text: str) -> dict:
"""
Call Groq LLM to extract structured article metadata.
Retries once on JSON parse failure.
"""
if not cleaned_text or not cleaned_text.strip():
return _FALLBACK.copy()

# Use first 5000 chars — sufficient for metadata capture
excerpt = cleaned_text[:5000]
prompt = METADATA_PROMPT.format(text=excerpt)

for attempt in range(1, 3):
try:
response = client.chat.completions.create(
model="llama-3.3-70b-versatile",
messages=[
{
"role": "system",
"content": "You are a precise article analyst. Return only valid JSON.",
},
{"role": "user", "content": prompt},
],
temperature=0.1, # near-zero for consistent structured output
max_tokens=500,
)

raw = response.choices[0].message.content
cleaned = _clean_json_string(raw)
metadata = json.loads(cleaned)

# Normalise types
if isinstance(metadata.get("entities"), str):
metadata["entities"] = [
e.strip()
for e in re.split(r"[,;]", metadata["entities"])
if e.strip()
]
if isinstance(metadata.get("key_points"), str):
metadata["key_points"] = [metadata["key_points"]]
if not isinstance(metadata.get("entities"), list):
metadata["entities"] = []
if not isinstance(metadata.get("key_points"), list):
metadata["key_points"] = []

logger.info("Article metadata extracted successfully.")
return metadata

except json.JSONDecodeError as e:
logger.warning(
f"Metadata JSON parse failed (attempt {attempt}): {e}. Raw: {raw[:200]}"
)
if attempt < 2:
time.sleep(1)
continue
return _FALLBACK.copy()

except Exception as e:
logger.exception(f"Metadata extraction error: {e}")
return _FALLBACK.copy()

return _FALLBACK.copy()
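
Similarly, a minimal call sketch for the extractor (assumptions: GROQ_API_KEY is available via the environment or a .env file, backend/ is on PYTHONPATH, and cleaned_text here is a hypothetical stand-in for the output of the upstream article-cleaning step):

# Sketch: request structured metadata for a cleaned article body.
from app.modules.article_extractor.extract_metadata import extract_article_metadata

cleaned_text = (
    "Hypothetical cleaned article text produced by an upstream extraction step. "
    "Only the first 5000 characters are sent to the model."
)

metadata = extract_article_metadata(cleaned_text)
print(metadata.get("summary"))     # 2-3 sentence summary ("" in the fallback)
print(metadata.get("tone"))        # one-word tone ("neutral" in the fallback)
print(metadata.get("key_points"))  # list of 3-5 strings ([] in the fallback)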