Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"python.defaultInterpreterPath": "d:\\gsoc\\Perspective\\backend\\.venv\\Scripts\\python.exe",
"python.analysis.extraPaths": [
"d:\\gsoc\\Perspective\\backend"
]
Comment thread
Prateekiiitg56 marked this conversation as resolved.
}
1 change: 1 addition & 0 deletions backend/.pyre_configuration
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{ "source_directories": ["."], "search_path": ["d:/gsoc/Perspective/backend/.venv/Lib/site-packages"] }
Copy link

Copilot AI Mar 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file appears to include a developer-local absolute site-packages path (and is currently a single-line JSON with a Windows drive). Project-level Pyre configs should be portable; remove the machine-specific search_path or make it relative so it works across environments/CI.

Suggested change
{ "source_directories": ["."], "search_path": ["d:/gsoc/Perspective/backend/.venv/Lib/site-packages"] }
{ "source_directories": ["."] }

Copilot uses AI. Check for mistakes.
Binary file added backend/app/db/perspective_cache.db
Binary file not shown.
Binary file added backend/app/db/perspective_cache.db-shm
Binary file not shown.
Binary file added backend/app/db/perspective_cache.db-wal
Binary file not shown.
191 changes: 191 additions & 0 deletions backend/app/db/sqlite_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
"""
sqlite_cache.py
---------------
SQLite cache for article metadata and generated lens perspectives.

Improvements over v1:
- WAL (Write-Ahead Logging) mode: significantly faster concurrent reads/writes
- Connection pool via threading.local() — one connection per thread, not per call
- TTL expiry: articles expire after 7 days, perspectives after 30 days
- Cache size limit: auto-evicts the oldest articles beyond MAX_ARTICLES
- Indexes on url_hash + lens for fast lookups
- PRAGMA optimisations: cache_size, synchronous=NORMAL, temp_store=MEMORY
"""

import sqlite3
import hashlib
import json
import os
import threading
from app.logging.logging_config import setup_logger

logger = setup_logger(__name__)

DB_PATH = os.path.join(os.path.dirname(__file__), "perspective_cache.db")

# TTL in seconds
ARTICLE_TTL_SECS = 7 * 24 * 3600 # 7 days
PERSPECTIVE_TTL_SECS = 30 * 24 * 3600 # 30 days
MAX_ARTICLES = 500 # evict oldest beyond this

# Thread-local connection pool
_local = threading.local()


def get_connection() -> sqlite3.Connection:
    """
    Return the calling thread's cached SQLite connection.

    The first call on each thread opens the database and applies WAL mode
    plus the performance PRAGMAs; later calls on the same thread reuse
    that connection (one connection per thread via ``threading.local``).
    """
    conn = getattr(_local, "conn", None)
    if conn is None:
        conn = sqlite3.connect(DB_PATH, check_same_thread=False)
        conn.row_factory = sqlite3.Row  # name-based column access on rows
        for pragma in (
            "PRAGMA journal_mode=WAL",
            "PRAGMA synchronous=NORMAL",
            "PRAGMA cache_size=-8000",       # 8 MB page cache
            "PRAGMA temp_store=MEMORY",
            "PRAGMA mmap_size=134217728",    # 128 MB memory-mapped I/O
        ):
            conn.execute(pragma)
        _local.conn = conn
    return conn


def init_db():
    """
    Create the cache tables and the eviction index if they don't exist.

    Schema notes:
    - ``article_cache.url_hash`` is a TEXT PRIMARY KEY, for which SQLite
      already creates an implicit unique index, so a separate index on it
      is redundant and was removed (it only added write overhead).
    - ``perspective_cache`` declares UNIQUE(url_hash, lens), which likewise
      creates an implicit index covering the (url_hash, lens) lookups.
    - The only explicit index kept is on ``article_cache.created_at``,
      which makes the age-ordered eviction scan cheap.
    """
    with get_connection() as conn:
        conn.executescript("""
            CREATE TABLE IF NOT EXISTS article_cache (
                url_hash TEXT PRIMARY KEY,
                url TEXT NOT NULL,
                cleaned_text TEXT,
                summary TEXT,
                main_claim TEXT,
                entities TEXT,
                tone TEXT,
                key_points TEXT,
                created_at REAL DEFAULT (strftime('%s','now'))
            );

            CREATE TABLE IF NOT EXISTS perspective_cache (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url_hash TEXT NOT NULL,
                lens TEXT NOT NULL,
                content TEXT NOT NULL,
                created_at REAL DEFAULT (strftime('%s','now')),
                UNIQUE(url_hash, lens)
            );

            -- Index for age-ordered eviction scans (idempotent)
            CREATE INDEX IF NOT EXISTS idx_article_age ON article_cache(created_at);
        """)
    logger.info("SQLite cache initialised (WAL mode).")


def url_to_hash(url: str) -> str:
    """Return the SHA-256 hex digest of the whitespace-trimmed URL."""
    normalised = url.strip()
    return hashlib.sha256(normalised.encode()).hexdigest()


# ── Article cache ─────────────────────────────────────────────────────────────


def get_cached_article(url: str) -> dict | None:
    """
    Look up a non-expired cached article by URL.

    Returns the row as a dict with ``entities`` and ``key_points`` decoded
    from their JSON text columns, or None on a miss / expired entry.
    """
    conn = get_connection()
    row = conn.execute(
        """SELECT * FROM article_cache
        WHERE url_hash = ?
        AND (strftime('%s','now') - created_at) < ?""",
        (url_to_hash(url), ARTICLE_TTL_SECS),
    ).fetchone()
    if row is None:
        return None
    record = dict(row)
    # JSON-encoded columns come back as text; decode them for callers.
    record["entities"] = json.loads(record.get("entities") or "[]")
    record["key_points"] = json.loads(record.get("key_points") or "[]")
    return record


def save_article_cache(url: str, article_data: dict):
    """
    Insert or refresh the cached metadata for a URL, then trim the cache.

    ``entities`` and ``key_points`` are serialised to JSON text. REPLACE
    deletes and re-inserts the row, so ``created_at`` (and hence the TTL)
    is renewed on every save.
    """
    url_hash = url_to_hash(url)
    params = (
        url_hash,
        url,
        article_data.get("cleaned_text", ""),
        article_data.get("summary", ""),
        article_data.get("main_claim", ""),
        json.dumps(article_data.get("entities", [])),
        article_data.get("tone", ""),
        json.dumps(article_data.get("key_points", [])),
    )
    conn = get_connection()
    conn.execute(
        """
        INSERT OR REPLACE INTO article_cache
        (url_hash, url, cleaned_text, summary, main_claim, entities, tone, key_points)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """,
        params,
    )
    conn.commit()
    _evict_old_articles(conn)  # keep the table bounded at MAX_ARTICLES
    logger.debug(f"Article cached: {url_hash[:12]}…")


def _evict_old_articles(conn: sqlite3.Connection):
    """Trim article_cache back to MAX_ARTICLES by dropping the oldest rows."""
    total = conn.execute("SELECT COUNT(*) FROM article_cache").fetchone()[0]
    if total <= MAX_ARTICLES:
        return
    excess = total - MAX_ARTICLES
    conn.execute(
        """
        DELETE FROM article_cache WHERE url_hash IN (
            SELECT url_hash FROM article_cache
            ORDER BY created_at ASC LIMIT ?
        )
        """,
        (excess,),
    )
    conn.commit()
    logger.info(f"Cache eviction: removed {excess} oldest articles")


# ── Perspective cache ─────────────────────────────────────────────────────────


def get_cached_perspective(url: str, lens: str) -> str | None:
    """Return the cached perspective text for (url, lens), or None if absent or expired."""
    conn = get_connection()
    hit = conn.execute(
        """SELECT content FROM perspective_cache
        WHERE url_hash = ? AND lens = ?
        AND (strftime('%s','now') - created_at) < ?""",
        (url_to_hash(url), lens, PERSPECTIVE_TTL_SECS),
    ).fetchone()
    if hit is None:
        return None
    return hit["content"]


def save_perspective_cache(url: str, lens: str, content: str):
    """
    Upsert the generated perspective for (url, lens).

    REPLACE re-inserts the row, so ``created_at`` — and therefore the
    30-day TTL — is renewed on every save.
    """
    url_hash = url_to_hash(url)
    conn = get_connection()
    conn.execute(
        """
        INSERT OR REPLACE INTO perspective_cache (url_hash, lens, content)
        VALUES (?, ?, ?)
        """,
        (url_hash, lens, content),
    )
    conn.commit()
    logger.debug(f"Perspective cached: lens={lens}, hash={url_hash[:12]}…")


def get_all_cached_perspectives(url: str) -> dict:
    """Return {lens: content} for all non-expired cached lenses for this URL."""
    conn = get_connection()
    cursor = conn.execute(
        """SELECT lens, content FROM perspective_cache
        WHERE url_hash = ?
        AND (strftime('%s','now') - created_at) < ?""",
        (url_to_hash(url), PERSPECTIVE_TTL_SECS),
    )
    results: dict = {}
    for entry in cursor.fetchall():
        results[entry["lens"]] = entry["content"]
    return results
3 changes: 2 additions & 1 deletion backend/app/logging/logging_config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
import sys


def setup_logger(name: str) -> logging.Logger:
"""
Creates and configures a logger with console + file output.
Expand All @@ -21,7 +22,7 @@ def setup_logger(name: str) -> logging.Logger:
# Formatter with timestamp, log level, module name
formatter = logging.Formatter(
"[%(asctime)s] [%(levelname)s] [%(name)s]: %(message)s",
datefmt="%Y-%m-%d %H:%M:%S"
datefmt="%Y-%m-%d %H:%M:%S",
)

# Console Handler
Expand Down
Empty file.
129 changes: 129 additions & 0 deletions backend/app/modules/article_extractor/extract_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
"""
extract_metadata.py
-------------------
Extracts structured article metadata using Groq LLM (Llama 3.3 70B).

Improvements:
- Retry logic: up to 2 attempts on JSON parse failure
- More specific prompt with examples for each field
- Robust JSON cleaning (strips fences, BOM, leading junk)
- Type normalisation for entities and key_points

Returns a dict with:
summary - 2-3 sentence objective summary
main_claim - The central argument/thesis in one sentence
entities - Key named entities (people, orgs, places)
tone - One-word tone descriptor
key_points - 3-5 bullet-point insights
"""

import os
import json
import re
import time
from groq import Groq
from dotenv import load_dotenv
from app.logging.logging_config import setup_logger

load_dotenv()
logger = setup_logger(__name__)

client = Groq(api_key=os.getenv("GROQ_API_KEY"))

_FALLBACK = {
"summary": "",
"main_claim": "",
"entities": [],
"tone": "neutral",
"key_points": [],
}

METADATA_PROMPT = """Analyze the following article excerpt and return a JSON object. Be precise and factual.

Fields required:
- "summary": String. A 2-3 sentence objective summary of what the article is about.
- "main_claim": String. The single central argument or thesis in ONE sentence.
- "entities": Array of strings. Key named people, organisations, countries, or technologies mentioned.
- "tone": String. ONE word describing the overall tone. Choose from: alarmist, optimistic, critical, neutral, celebratory, authoritative, speculative, alarming, hopeful.
- "key_points": Array of strings. Exactly 3-5 concise insights or findings from the article.

Rules:
- Return ONLY the JSON object. No markdown code fences, no explanations, no extra text.
- If you cannot determine a field, use an empty string or empty array.

Article:
{text}
"""


def _clean_json_string(raw: str) -> str:
"""Remove common wrapping/junk around LLM JSON output."""
raw = raw.strip().lstrip("\ufeff") # strip BOM
# Remove markdown fences
raw = re.sub(r"^```(?:json)?\s*", "", raw, flags=re.IGNORECASE)
raw = re.sub(r"\s*```$", "", raw)
return raw.strip()


def extract_article_metadata(cleaned_text: str) -> dict:
    """
    Call the Groq LLM to extract structured article metadata.

    Returns a dict with all five expected keys (summary, main_claim,
    entities, tone, key_points); keys the model omitted are back-filled
    from the fallback defaults so callers never see a missing field.
    Retries once (2 attempts total) on JSON parse failure; on any other
    error, or if both attempts fail, returns a copy of the neutral
    fallback dict.
    """
    if not cleaned_text or not cleaned_text.strip():
        return _FALLBACK.copy()

    # Use first 5000 chars — sufficient for metadata capture
    excerpt = cleaned_text[:5000]
    prompt = METADATA_PROMPT.format(text=excerpt)

    raw = ""  # keep defined for the warning log even if the API call fails early
    for attempt in range(1, 3):
        try:
            response = client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a precise article analyst. Return only valid JSON.",
                    },
                    {"role": "user", "content": prompt},
                ],
                temperature=0.1,  # near-zero for consistent structured output
                max_tokens=500,
            )

            raw = response.choices[0].message.content or ""
            metadata = json.loads(_clean_json_string(raw))
            # A non-dict top level (e.g. a bare array) is a malformed
            # response too — route it through the same retry path.
            if not isinstance(metadata, dict):
                raise json.JSONDecodeError("top-level JSON value is not an object", raw, 0)

            result = _normalise_metadata(metadata)
            logger.info("Article metadata extracted successfully.")
            return result

        except json.JSONDecodeError as e:
            logger.warning(
                f"Metadata JSON parse failed (attempt {attempt}): {e}. Raw: {raw[:200]}"
            )
            if attempt < 2:
                time.sleep(1)
                continue
            return _FALLBACK.copy()

        except Exception as e:
            logger.exception(f"Metadata extraction error: {e}")
            return _FALLBACK.copy()

    return _FALLBACK.copy()


def _normalise_metadata(metadata: dict) -> dict:
    """Coerce LLM output to the expected field types and fill in missing keys."""
    result = dict(metadata)
    # Back-fill any fields the model omitted (fresh literals, so the
    # shared fallback dict is never aliased or mutated).
    result.setdefault("summary", "")
    result.setdefault("main_claim", "")
    result.setdefault("entities", [])
    result.setdefault("tone", "neutral")
    result.setdefault("key_points", [])

    # Entities sometimes come back as one delimited string.
    if isinstance(result["entities"], str):
        result["entities"] = [
            e.strip() for e in re.split(r"[,;]", result["entities"]) if e.strip()
        ]
    if not isinstance(result["entities"], list):
        result["entities"] = []

    if isinstance(result["key_points"], str):
        result["key_points"] = [result["key_points"]]
    if not isinstance(result["key_points"], list):
        result["key_points"] = []

    return result
Loading