6 changes: 6 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,6 @@
{
"python.defaultInterpreterPath": "d:\\gsoc\\Perspective\\backend\\.venv\\Scripts\\python.exe",
"python.analysis.extraPaths": [
"d:\\gsoc\\Perspective\\backend"
]
}
1 change: 1 addition & 0 deletions backend/.pyre_configuration
@@ -0,0 +1 @@
{ "source_directories": ["."], "search_path": ["d:/gsoc/Perspective/backend/.venv/Lib/site-packages"] }
Copilot AI Mar 8, 2026
This file appears to include a developer-local absolute site-packages path (and is currently a single-line JSON with a Windows drive). Project-level Pyre configs should be portable; remove the machine-specific search_path or make it relative so it works across environments/CI.

Suggested change
{ "source_directories": ["."], "search_path": ["d:/gsoc/Perspective/backend/.venv/Lib/site-packages"] }
{ "source_directories": ["."] }

Binary file added backend/app/db/perspective_cache.db
Binary file not shown.
Binary file added backend/app/db/perspective_cache.db-shm
Binary file not shown.
Binary file added backend/app/db/perspective_cache.db-wal
Binary file not shown.
191 changes: 191 additions & 0 deletions backend/app/db/sqlite_cache.py
@@ -0,0 +1,191 @@
"""
sqlite_cache.py
---------------
SQLite cache for article metadata and generated lens perspectives.

Improvements over v1:
- WAL (Write-Ahead Logging) mode: significantly faster concurrent reads/writes
- Connection pool via threading.local() — one connection per thread, not per call
- TTL expiry: articles expire after 7 days, perspectives after 30 days
- Cache size limit: auto-evicts the oldest articles beyond MAX_ARTICLES
- Indexes on url_hash + lens for fast lookups
- PRAGMA optimisations: cache_size, synchronous=NORMAL, temp_store=MEMORY
"""

import sqlite3
import hashlib
import json
import os
import threading
from app.logging.logging_config import setup_logger

logger = setup_logger(__name__)

DB_PATH = os.path.join(os.path.dirname(__file__), "perspective_cache.db")

# TTL in seconds
ARTICLE_TTL_SECS = 7 * 24 * 3600 # 7 days
PERSPECTIVE_TTL_SECS = 30 * 24 * 3600 # 30 days
MAX_ARTICLES = 500 # evict oldest beyond this

# Thread-local connection pool
_local = threading.local()


def get_connection() -> sqlite3.Connection:
"""
Return a per-thread SQLite connection.
Creates the connection with WAL mode and performance PRAGMAs on first use.
"""
if not hasattr(_local, "conn") or _local.conn is None:
conn = sqlite3.connect(DB_PATH, check_same_thread=False)
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA journal_mode=WAL")
conn.execute("PRAGMA synchronous=NORMAL")
conn.execute("PRAGMA cache_size=-8000") # 8 MB page cache
conn.execute("PRAGMA temp_store=MEMORY")
conn.execute("PRAGMA mmap_size=134217728") # 128 MB memory-mapped I/O
_local.conn = conn
return _local.conn


def init_db():
"""Create tables and indexes if they don't already exist."""
with get_connection() as conn:
conn.executescript("""
CREATE TABLE IF NOT EXISTS article_cache (
url_hash TEXT PRIMARY KEY,
url TEXT NOT NULL,
cleaned_text TEXT,
summary TEXT,
main_claim TEXT,
entities TEXT,
tone TEXT,
key_points TEXT,
created_at REAL DEFAULT (strftime('%s','now'))
);

CREATE TABLE IF NOT EXISTS perspective_cache (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url_hash TEXT NOT NULL,
lens TEXT NOT NULL,
content TEXT NOT NULL,
created_at REAL DEFAULT (strftime('%s','now')),
UNIQUE(url_hash, lens)
);

-- Indexes for fast lookups (idempotent)
CREATE INDEX IF NOT EXISTS idx_article_hash ON article_cache(url_hash);
CREATE INDEX IF NOT EXISTS idx_persp_hash_lens ON perspective_cache(url_hash, lens);
CREATE INDEX IF NOT EXISTS idx_article_age ON article_cache(created_at);
""")
logger.info("SQLite cache initialised (WAL mode).")


def url_to_hash(url: str) -> str:
return hashlib.sha256(url.strip().encode()).hexdigest()


# ── Article cache ─────────────────────────────────────────────────────────────


def get_cached_article(url: str) -> dict | None:
url_hash = url_to_hash(url)
conn = get_connection()
row = conn.execute(
"""SELECT * FROM article_cache
WHERE url_hash = ?
AND (strftime('%s','now') - created_at) < ?""",
(url_hash, ARTICLE_TTL_SECS),
).fetchone()
if row:
data = dict(row)
data["entities"] = json.loads(data.get("entities") or "[]")
data["key_points"] = json.loads(data.get("key_points") or "[]")
return data
return None


def save_article_cache(url: str, article_data: dict):
url_hash = url_to_hash(url)
conn = get_connection()
conn.execute(
"""
INSERT OR REPLACE INTO article_cache
(url_hash, url, cleaned_text, summary, main_claim, entities, tone, key_points)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
""",
(
url_hash,
url,
article_data.get("cleaned_text", ""),
article_data.get("summary", ""),
article_data.get("main_claim", ""),
json.dumps(article_data.get("entities", [])),
article_data.get("tone", ""),
json.dumps(article_data.get("key_points", [])),
),
)
conn.commit()
_evict_old_articles(conn)
logger.debug(f"Article cached: {url_hash[:12]}…")


def _evict_old_articles(conn: sqlite3.Connection):
"""Delete oldest articles beyond MAX_ARTICLES to keep the DB lean."""
count = conn.execute("SELECT COUNT(*) FROM article_cache").fetchone()[0]
if count > MAX_ARTICLES:
excess = count - MAX_ARTICLES
conn.execute(
"""
DELETE FROM article_cache WHERE url_hash IN (
SELECT url_hash FROM article_cache
ORDER BY created_at ASC LIMIT ?
)
""",
(excess,),
)
conn.commit()
logger.info(f"Cache eviction: removed {excess} oldest articles")


# ── Perspective cache ─────────────────────────────────────────────────────────


def get_cached_perspective(url: str, lens: str) -> str | None:
url_hash = url_to_hash(url)
conn = get_connection()
row = conn.execute(
"""SELECT content FROM perspective_cache
WHERE url_hash = ? AND lens = ?
AND (strftime('%s','now') - created_at) < ?""",
(url_hash, lens, PERSPECTIVE_TTL_SECS),
).fetchone()
return row["content"] if row else None


def save_perspective_cache(url: str, lens: str, content: str):
url_hash = url_to_hash(url)
conn = get_connection()
conn.execute(
"""
INSERT OR REPLACE INTO perspective_cache (url_hash, lens, content)
VALUES (?, ?, ?)
""",
(url_hash, lens, content),
)
conn.commit()
logger.debug(f"Perspective cached: lens={lens}, hash={url_hash[:12]}…")


def get_all_cached_perspectives(url: str) -> dict:
"""Return {lens: content} for all non-expired cached lenses for this URL."""
url_hash = url_to_hash(url)
conn = get_connection()
rows = conn.execute(
"""SELECT lens, content FROM perspective_cache
WHERE url_hash = ?
AND (strftime('%s','now') - created_at) < ?""",
(url_hash, PERSPECTIVE_TTL_SECS),
).fetchall()
return {row["lens"]: row["content"] for row in rows}
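
For reviewers, a minimal usage sketch of the new cache module (a sketch only: it assumes backend/ is on PYTHONPATH so that app.db.sqlite_cache and its logger import cleanly, and the URL and article fields below are hypothetical illustration values):

# Sketch: exercise the article and perspective caches end to end.
from app.db.sqlite_cache import (
    init_db,
    save_article_cache,
    get_cached_article,
    save_perspective_cache,
    get_all_cached_perspectives,
)

init_db()  # idempotent: creates tables/indexes and enables WAL on first use

url = "https://example.com/some-article"  # hypothetical URL
save_article_cache(url, {
    "cleaned_text": "Full cleaned article body...",
    "summary": "Two to three sentence objective summary.",
    "main_claim": "The article's central thesis in one sentence.",
    "entities": ["Example Org", "Jane Doe"],
    "tone": "neutral",
    "key_points": ["First insight", "Second insight", "Third insight"],
})

article = get_cached_article(url)  # dict with entities/key_points decoded, or None once the 7-day TTL lapses
save_perspective_cache(url, "skeptical", "A skeptical reading of the article...")
print(get_all_cached_perspectives(url))  # {"skeptical": "A skeptical reading of the article..."}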
3 changes: 2 additions & 1 deletion backend/app/logging/logging_config.py
@@ -1,6 +1,7 @@
import logging
import sys


def setup_logger(name: str) -> logging.Logger:
"""
Creates and configures a logger with console + file output.
@@ -21,7 +22,7 @@ def setup_logger(name: str) -> logging.Logger:
# Formatter with timestamp, log level, module name
formatter = logging.Formatter(
"[%(asctime)s] [%(levelname)s] [%(name)s]: %(message)s",
datefmt="%Y-%m-%d %H:%M:%S"
datefmt="%Y-%m-%d %H:%M:%S",
)

# Console Handler
Empty file.
129 changes: 129 additions & 0 deletions backend/app/modules/article_extractor/extract_metadata.py
@@ -0,0 +1,129 @@
"""
extract_metadata.py
-------------------
Extracts structured article metadata using Groq LLM (Llama 3.3 70B).

Improvements:
- Retry logic: up to 2 attempts on JSON parse failure
- More specific prompt with examples for each field
- Robust JSON cleaning (strips fences, BOM, leading junk)
- Type normalisation for entities and key_points

Returns a dict with:
summary - 2-3 sentence objective summary
main_claim - The central argument/thesis in one sentence
entities - Key named entities (people, orgs, places)
tone - One-word tone descriptor
key_points - 3-5 bullet-point insights
"""

import os
import json
import re
import time
from groq import Groq
from dotenv import load_dotenv
from app.logging.logging_config import setup_logger

load_dotenv()
logger = setup_logger(__name__)

client = Groq(api_key=os.getenv("GROQ_API_KEY"))

_FALLBACK = {
"summary": "",
"main_claim": "",
"entities": [],
"tone": "neutral",
"key_points": [],
}

METADATA_PROMPT = """Analyze the following article excerpt and return a JSON object. Be precise and factual.

Fields required:
- "summary": String. A 2-3 sentence objective summary of what the article is about.
- "main_claim": String. The single central argument or thesis in ONE sentence.
- "entities": Array of strings. Key named people, organisations, countries, or technologies mentioned.
- "tone": String. ONE word describing the overall tone. Choose from: alarmist, optimistic, critical, neutral, celebratory, authoritative, speculative, alarming, hopeful.
- "key_points": Array of strings. Exactly 3-5 concise insights or findings from the article.

Rules:
- Return ONLY the JSON object. No markdown code fences, no explanations, no extra text.
- If you cannot determine a field, use an empty string or empty array.

Article:
{text}
"""


def _clean_json_string(raw: str) -> str:
"""Remove common wrapping/junk around LLM JSON output."""
raw = raw.strip().lstrip("\ufeff") # strip BOM
# Remove markdown fences
raw = re.sub(r"^```(?:json)?\s*", "", raw, flags=re.IGNORECASE)
raw = re.sub(r"\s*```$", "", raw)
return raw.strip()


def extract_article_metadata(cleaned_text: str) -> dict:
"""
Call Groq LLM to extract structured article metadata.
Retries once on JSON parse failure.
"""
if not cleaned_text or not cleaned_text.strip():
return _FALLBACK.copy()

# Use first 5000 chars — sufficient for metadata capture
excerpt = cleaned_text[:5000]
prompt = METADATA_PROMPT.format(text=excerpt)

for attempt in range(1, 3):
try:
response = client.chat.completions.create(
model="llama-3.3-70b-versatile",
messages=[
{
"role": "system",
"content": "You are a precise article analyst. Return only valid JSON.",
},
{"role": "user", "content": prompt},
],
temperature=0.1, # near-zero for consistent structured output
max_tokens=500,
)

raw = response.choices[0].message.content
cleaned = _clean_json_string(raw)
metadata = json.loads(cleaned)

# Normalise types
if isinstance(metadata.get("entities"), str):
metadata["entities"] = [
e.strip()
for e in re.split(r"[,;]", metadata["entities"])
if e.strip()
]
if isinstance(metadata.get("key_points"), str):
metadata["key_points"] = [metadata["key_points"]]
if not isinstance(metadata.get("entities"), list):
metadata["entities"] = []
if not isinstance(metadata.get("key_points"), list):
metadata["key_points"] = []

logger.info("Article metadata extracted successfully.")
return metadata

except json.JSONDecodeError as e:
logger.warning(
f"Metadata JSON parse failed (attempt {attempt}): {e}. Raw: {raw[:200]}"
)
if attempt < 2:
time.sleep(1)
continue
return _FALLBACK.copy()

except Exception as e:
logger.exception(f"Metadata extraction error: {e}")
return _FALLBACK.copy()

return _FALLBACK.copy()
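
Similarly, a minimal call sketch for the extractor (assumptions: GROQ_API_KEY is available via the environment or a .env file, backend/ is on PYTHONPATH, and cleaned_text here is a hypothetical stand-in for the output of the upstream article-cleaning step):

# Sketch: request structured metadata for a cleaned article body.
from app.modules.article_extractor.extract_metadata import extract_article_metadata

cleaned_text = (
    "Hypothetical cleaned article text produced by an upstream extraction step. "
    "Only the first 5000 characters are sent to the model."
)

metadata = extract_article_metadata(cleaned_text)
print(metadata.get("summary"))     # 2-3 sentence summary ("" in the fallback)
print(metadata.get("tone"))        # one-word tone ("neutral" in the fallback)
print(metadata.get("key_points"))  # list of 3-5 strings ([] in the fallback)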