-
-
Notifications
You must be signed in to change notification settings - Fork 83
Added multi perspective feature #150
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
Prateekiiitg56
wants to merge
4
commits into
AOSSIE-Org:main
Choose a base branch
from
Prateekiiitg56:Added-multi-perspective
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| { | ||
| "python.defaultInterpreterPath": "d:\\gsoc\\Perspective\\backend\\.venv\\Scripts\\python.exe", | ||
| "python.analysis.extraPaths": [ | ||
| "d:\\gsoc\\Perspective\\backend" | ||
| ] | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1 @@ | ||||||
| { "source_directories": ["."], "search_path": ["d:/gsoc/Perspective/backend/.venv/Lib/site-packages"] } | ||||||
|
||||||
| { "source_directories": ["."], "search_path": ["d:/gsoc/Perspective/backend/.venv/Lib/site-packages"] } | |
| { "source_directories": ["."] } |
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,191 @@ | ||
| """ | ||
| sqlite_cache.py | ||
| --------------- | ||
| SQLite cache for article metadata and generated lens perspectives. | ||
|
|
||
| Improvements over v1: | ||
| - WAL (Write-Ahead Logging) mode: significantly faster concurrent reads/writes | ||
| - Connection pool via threading.local() — one connection per thread, not per call | ||
| - TTL expiry: articles expire after 7 days, perspectives after 30 days | ||
| - Cache size limit: auto-evicts the oldest articles beyond MAX_ARTICLES | ||
| - Indexes on url_hash + lens for fast lookups | ||
| - PRAGMA optimisations: cache_size, synchronous=NORMAL, temp_store=MEMORY | ||
| """ | ||
|
|
||
| import sqlite3 | ||
| import hashlib | ||
| import json | ||
| import os | ||
| import threading | ||
| from app.logging.logging_config import setup_logger | ||
|
|
||
| logger = setup_logger(__name__) | ||
|
|
||
| DB_PATH = os.path.join(os.path.dirname(__file__), "perspective_cache.db") | ||
|
|
||
| # TTL in seconds | ||
| ARTICLE_TTL_SECS = 7 * 24 * 3600 # 7 days | ||
| PERSPECTIVE_TTL_SECS = 30 * 24 * 3600 # 30 days | ||
| MAX_ARTICLES = 500 # evict oldest beyond this | ||
|
|
||
| # Thread-local connection pool | ||
| _local = threading.local() | ||
|
|
||
|
|
||
def get_connection() -> sqlite3.Connection:
    """
    Return the calling thread's SQLite connection, creating it on demand.

    The first call on a given thread opens the database, enables WAL
    journaling and applies the performance PRAGMAs; subsequent calls on
    that thread reuse the handle cached on the thread-local `_local`.
    """
    conn = getattr(_local, "conn", None)
    if conn is None:
        conn = sqlite3.connect(DB_PATH, check_same_thread=False)
        conn.row_factory = sqlite3.Row
        # One-time tuning applied per connection handle.
        for pragma in (
            "PRAGMA journal_mode=WAL",
            "PRAGMA synchronous=NORMAL",
            "PRAGMA cache_size=-8000",       # 8 MB page cache
            "PRAGMA temp_store=MEMORY",
            "PRAGMA mmap_size=134217728",    # 128 MB memory-mapped I/O
        ):
            conn.execute(pragma)
        _local.conn = conn
    return conn
|
|
||
|
|
||
def init_db():
    """Create the cache tables and indexes if they don't already exist.

    Idempotent: every statement uses IF NOT EXISTS, so this is safe to
    call on every startup.  The connection context manager commits the
    DDL transaction on success and rolls it back on error.
    """
    with get_connection() as conn:
        conn.executescript("""
            CREATE TABLE IF NOT EXISTS article_cache (
                url_hash TEXT PRIMARY KEY,
                url TEXT NOT NULL,
                cleaned_text TEXT,
                summary TEXT,
                main_claim TEXT,
                entities TEXT,      -- JSON-encoded list
                tone TEXT,
                key_points TEXT,    -- JSON-encoded list
                created_at REAL DEFAULT (strftime('%s','now'))
            );

            CREATE TABLE IF NOT EXISTS perspective_cache (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url_hash TEXT NOT NULL,
                lens TEXT NOT NULL,
                content TEXT NOT NULL,
                created_at REAL DEFAULT (strftime('%s','now')),
                UNIQUE(url_hash, lens)
            );

            -- Index for TTL expiry / oldest-first eviction scans.
            -- NOTE: no explicit index on article_cache(url_hash) or on
            -- perspective_cache(url_hash, lens): the PRIMARY KEY and the
            -- UNIQUE constraint already create those indexes automatically,
            -- so duplicating them would only slow down writes.
            CREATE INDEX IF NOT EXISTS idx_article_age ON article_cache(created_at);
        """)
    logger.info("SQLite cache initialised (WAL mode).")
|
|
||
|
|
||
def url_to_hash(url: str) -> str:
    """Return the SHA-256 hex digest of the whitespace-trimmed URL."""
    normalized = url.strip()
    digest = hashlib.sha256(normalized.encode())
    return digest.hexdigest()
|
|
||
|
|
||
| # ── Article cache ───────────────────────────────────────────────────────────── | ||
|
|
||
|
|
||
def get_cached_article(url: str) -> dict | None:
    """
    Return the cached metadata row for *url* as a dict, or None when
    there is no entry or the entry is older than ARTICLE_TTL_SECS.
    JSON-encoded columns (entities, key_points) are decoded to lists.
    """
    conn = get_connection()
    row = conn.execute(
        """SELECT * FROM article_cache
           WHERE url_hash = ?
           AND (strftime('%s','now') - created_at) < ?""",
        (url_to_hash(url), ARTICLE_TTL_SECS),
    ).fetchone()
    if row is None:
        return None
    article = dict(row)
    # Stored as JSON text; fall back to an empty list for NULL/empty values.
    for field in ("entities", "key_points"):
        article[field] = json.loads(article.get(field) or "[]")
    return article
|
|
||
|
|
||
def save_article_cache(url: str, article_data: dict):
    """
    Upsert the extracted metadata for *url* into article_cache.

    INSERT OR REPLACE rewrites the row, so created_at is refreshed and
    the TTL clock restarts.  List-valued fields are stored as JSON text.
    Triggers eviction of the oldest rows after each insert.
    """
    url_hash = url_to_hash(url)
    record = (
        url_hash,
        url,
        article_data.get("cleaned_text", ""),
        article_data.get("summary", ""),
        article_data.get("main_claim", ""),
        json.dumps(article_data.get("entities", [])),
        article_data.get("tone", ""),
        json.dumps(article_data.get("key_points", [])),
    )
    conn = get_connection()
    conn.execute(
        """
        INSERT OR REPLACE INTO article_cache
        (url_hash, url, cleaned_text, summary, main_claim, entities, tone, key_points)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """,
        record,
    )
    conn.commit()
    _evict_old_articles(conn)
    logger.debug(f"Article cached: {url_hash[:12]}…")
|
|
||
|
|
||
def _evict_old_articles(conn: sqlite3.Connection):
    """Trim article_cache down to MAX_ARTICLES by deleting the oldest rows."""
    (count,) = conn.execute("SELECT COUNT(*) FROM article_cache").fetchone()
    excess = count - MAX_ARTICLES
    if excess <= 0:
        return
    conn.execute(
        """
        DELETE FROM article_cache WHERE url_hash IN (
            SELECT url_hash FROM article_cache
            ORDER BY created_at ASC LIMIT ?
        )
        """,
        (excess,),
    )
    conn.commit()
    logger.info(f"Cache eviction: removed {excess} oldest articles")
|
|
||
|
|
||
| # ── Perspective cache ───────────────────────────────────────────────────────── | ||
|
|
||
|
|
||
def get_cached_perspective(url: str, lens: str) -> str | None:
    """
    Return the cached perspective text for (url, lens), or None when no
    entry exists or the entry is older than PERSPECTIVE_TTL_SECS.
    """
    hit = get_connection().execute(
        """SELECT content FROM perspective_cache
           WHERE url_hash = ? AND lens = ?
           AND (strftime('%s','now') - created_at) < ?""",
        (url_to_hash(url), lens, PERSPECTIVE_TTL_SECS),
    ).fetchone()
    if hit is None:
        return None
    return hit["content"]
|
|
||
|
|
||
def save_perspective_cache(url: str, lens: str, content: str):
    """
    Upsert one generated perspective for (url, lens).

    The UNIQUE(url_hash, lens) constraint plus INSERT OR REPLACE means a
    regenerated perspective overwrites the old row and restarts its TTL.
    """
    url_hash = url_to_hash(url)
    connection = get_connection()
    connection.execute(
        """
        INSERT OR REPLACE INTO perspective_cache (url_hash, lens, content)
        VALUES (?, ?, ?)
        """,
        (url_hash, lens, content),
    )
    connection.commit()
    logger.debug(f"Perspective cached: lens={lens}, hash={url_hash[:12]}…")
|
|
||
|
|
||
def get_all_cached_perspectives(url: str) -> dict:
    """Return {lens: content} for all non-expired cached lenses for this URL."""
    connection = get_connection()
    rows = connection.execute(
        """SELECT lens, content FROM perspective_cache
           WHERE url_hash = ?
           AND (strftime('%s','now') - created_at) < ?""",
        (url_to_hash(url), PERSPECTIVE_TTL_SECS),
    ).fetchall()
    perspectives = {}
    for entry in rows:
        perspectives[entry["lens"]] = entry["content"]
    return perspectives
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
129 changes: 129 additions & 0 deletions
129
backend/app/modules/article_extractor/extract_metadata.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,129 @@ | ||
| """ | ||
| extract_metadata.py | ||
| ------------------- | ||
| Extracts structured article metadata using Groq LLM (Llama 3.3 70B). | ||
|
|
||
| Improvements: | ||
| - Retry logic: up to 2 attempts on JSON parse failure | ||
| - More specific prompt with examples for each field | ||
| - Robust JSON cleaning (strips fences, BOM, leading junk) | ||
| - Type normalisation for entities and key_points | ||
|
|
||
| Returns a dict with: | ||
| summary - 2-3 sentence objective summary | ||
| main_claim - The central argument/thesis in one sentence | ||
| entities - Key named entities (people, orgs, places) | ||
| tone - One-word tone descriptor | ||
| key_points - 3-5 bullet-point insights | ||
| """ | ||
|
|
||
| import os | ||
| import json | ||
| import re | ||
| import time | ||
| from groq import Groq | ||
| from dotenv import load_dotenv | ||
| from app.logging.logging_config import setup_logger | ||
|
|
||
| load_dotenv() | ||
| logger = setup_logger(__name__) | ||
|
|
||
| client = Groq(api_key=os.getenv("GROQ_API_KEY")) | ||
|
|
||
| _FALLBACK = { | ||
| "summary": "", | ||
| "main_claim": "", | ||
| "entities": [], | ||
| "tone": "neutral", | ||
| "key_points": [], | ||
| } | ||
|
|
||
| METADATA_PROMPT = """Analyze the following article excerpt and return a JSON object. Be precise and factual. | ||
|
|
||
| Fields required: | ||
| - "summary": String. A 2-3 sentence objective summary of what the article is about. | ||
| - "main_claim": String. The single central argument or thesis in ONE sentence. | ||
| - "entities": Array of strings. Key named people, organisations, countries, or technologies mentioned. | ||
| - "tone": String. ONE word describing the overall tone. Choose from: alarmist, optimistic, critical, neutral, celebratory, authoritative, speculative, alarming, hopeful. | ||
| - "key_points": Array of strings. Exactly 3-5 concise insights or findings from the article. | ||
|
|
||
| Rules: | ||
| - Return ONLY the JSON object. No markdown code fences, no explanations, no extra text. | ||
| - If you cannot determine a field, use an empty string or empty array. | ||
|
|
||
| Article: | ||
| {text} | ||
| """ | ||
|
|
||
|
|
||
def _clean_json_string(raw: str) -> str:
    """Remove common wrapping/junk around LLM JSON output.

    Strips surrounding whitespace, a leading UTF-8 BOM, and markdown
    code fences (``` or ```json) wrapping the payload.
    """
    # Whitespace first, then a possible byte-order mark.
    text = raw.strip().lstrip("\ufeff")
    # Peel off an opening fence, then a closing one, if present.
    without_open_fence = re.sub(r"^```(?:json)?\s*", "", text, flags=re.IGNORECASE)
    without_fences = re.sub(r"\s*```$", "", without_open_fence)
    return without_fences.strip()
|
|
||
|
|
||
def extract_article_metadata(cleaned_text: str) -> dict:
    """
    Call Groq LLM to extract structured article metadata.

    Args:
        cleaned_text: Plain article text; may be empty or whitespace-only.

    Returns:
        dict guaranteed to contain the keys summary, main_claim, entities,
        tone and key_points with the documented types.  Falls back to a
        copy of _FALLBACK on any failure.

    Retries once (after a 1 s pause) when the model output is not valid
    JSON or is not a JSON object — previously a bare JSON array/string
    slipped past json.loads and crashed in the generic handler without
    using the retry.
    """
    if not cleaned_text or not cleaned_text.strip():
        return _FALLBACK.copy()

    # Use first 5000 chars — sufficient for metadata capture
    excerpt = cleaned_text[:5000]
    prompt = METADATA_PROMPT.format(text=excerpt)

    for attempt in range(1, 3):
        raw = ""  # pre-bind so the except clause can always log it
        try:
            response = client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a precise article analyst. Return only valid JSON.",
                    },
                    {"role": "user", "content": prompt},
                ],
                temperature=0.1,  # near-zero for consistent structured output
                max_tokens=500,
            )

            raw = response.choices[0].message.content or ""
            metadata = json.loads(_clean_json_string(raw))
            if not isinstance(metadata, dict):
                # e.g. the model returned a bare array/string; treat it as a
                # parse failure so it goes through the retry path below.
                raise json.JSONDecodeError("top-level JSON is not an object", raw, 0)

            result = _normalise_metadata(metadata)
            logger.info("Article metadata extracted successfully.")
            return result

        except json.JSONDecodeError as e:
            logger.warning(
                f"Metadata JSON parse failed (attempt {attempt}): {e}. Raw: {raw[:200]}"
            )
            if attempt < 2:
                time.sleep(1)
                continue
            return _FALLBACK.copy()

        except Exception as e:
            logger.exception(f"Metadata extraction error: {e}")
            return _FALLBACK.copy()

    return _FALLBACK.copy()


def _normalise_metadata(metadata: dict) -> dict:
    """Coerce LLM output to the documented schema.

    Guarantees every _FALLBACK key is present, entities/key_points are
    lists (comma/semicolon-separated strings are split), and the scalar
    fields are strings.
    """
    result = {**_FALLBACK, **metadata}

    # Lists sometimes come back as a single delimited string.
    if isinstance(result["entities"], str):
        result["entities"] = [
            e.strip() for e in re.split(r"[,;]", result["entities"]) if e.strip()
        ]
    if isinstance(result["key_points"], str):
        result["key_points"] = [result["key_points"]]
    if not isinstance(result["entities"], list):
        result["entities"] = []
    if not isinstance(result["key_points"], list):
        result["key_points"] = []

    # Scalar fields must be strings; anything else reverts to the default.
    for field in ("summary", "main_claim", "tone"):
        if not isinstance(result[field], str):
            result[field] = _FALLBACK[field]
    return result
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.