Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ faiss_index/
# ===============================
.cache/
.huggingface/
models/


# ===============================
# IDE
Expand Down
48 changes: 48 additions & 0 deletions backend/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Capstone RAG Backend
This is the backend for a Retrieval-Augmented Generation (RAG) system for intelligent document Q & A.
I will try to keep this README updated as the files change, for better usability.

## Setup

### Virtual Environment

```bash
python3 -m venv venv
source venv/bin/activate # Linux/macOS
venv\Scripts\activate # Windows
```
### Install Dependencies
pip install -r requirements.txt

### Environment Variables
Create a .env file in backend/:

DATABASE_URL=postgresql://postgres:<password>@localhost:5432/ragdb
# JWT secret key (must NOT be left as default)
# Generate a secure random key:
# In an Ubuntu terminal (preferred), type: openssl rand -hex 32
JWT_SECRET=<paste-generated-key>

# Token settings
JWT_ALGORITHM=HS256
ACCESS_TOKEN_EXPIRE_MINUTES=30

## Database

### Create the database in PostgreSQL
```bash
sudo -u postgres psql
```
Inside psql:
```sql
CREATE DATABASE ragdb;
```

## Run App
```bash
uvicorn app.main:app --reload
```

## Test
http://localhost:8000/ -> {"message":"Backend Running"}
http://localhost:8000/db_test -> shows PostgreSQL version
136 changes: 136 additions & 0 deletions backend/app/api/auth.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
"""
routers/auth.py: Authentication Routes
Endpoints:
POST /auth/register: create a new user account
POST /auth/login: verify credentials, return JWT token
The frontend stores access_token and sends it in every request header:
Authorization: Bearer eyJhbGci...
"""

from fastapi import APIRouter, Depends, HTTPException, status
from sqlalchemy.orm import Session

from app.config.database import get_db
from app.models.db_models import User, Log
from app.config.dependencies import hash_password, verify_password, create_access_token
from app.models.schemas import UserRegisterRequest, UserResponse, UserLoginRequest, TokenResponse

import uuid
from datetime import datetime

router = APIRouter(prefix="/auth", tags=["Authentication"])


# POST /auth/register
# POST /auth/register
@router.post(
    "/register",
    response_model=UserResponse,
    status_code=status.HTTP_201_CREATED,
    summary="Register a new user"
)
def register(payload: UserRegisterRequest, db: Session = Depends(get_db)):
    """
    Create a new user account.

    Rejects the request with 409 when the email or the username is
    already in use. The password is hashed with bcrypt before it is
    stored — plain text never touches the database.
    """
    # Reject duplicates up front, one clear 409 per field.
    duplicate_email = db.query(User).filter(User.email == payload.email).first()
    if duplicate_email is not None:
        raise HTTPException(
            status_code=status.HTTP_409_CONFLICT,
            detail="An account with this email already exists."
        )

    duplicate_name = db.query(User).filter(User.username == payload.username).first()
    if duplicate_name is not None:
        raise HTTPException(
            status_code=status.HTTP_409_CONFLICT,
            detail="This username is already taken."
        )

    # Persist the new account together with an audit-log entry.
    new_user = User(
        id=uuid.uuid4(),
        username=payload.username,
        email=payload.email,
        password_hash=hash_password(payload.password),
        created_at=datetime.utcnow()
    )
    audit_entry = Log(
        user_id=new_user.id,
        action="user_registered",
        detail=f"New user registered: {new_user.username}",
        timestamp=datetime.utcnow()
    )
    db.add(new_user)
    db.add(audit_entry)
    db.commit()
    db.refresh(new_user)

    return new_user


# POST /auth/login
# POST /auth/login
@router.post(
    "/login",
    response_model=TokenResponse,
    summary="Login and receive a JWT token"
)
def login(payload: UserLoginRequest, db: Session = Depends(get_db)):
    """
    Verify credentials and return a JWT access token.

    The frontend stores this token and sends it in the
    Authorization header on every subsequent request:
        Authorization: Bearer <token>

    The token payload contains:
        { "sub": "<user_id>" }

    This is decoded by get_current_user() in dependencies.py
    to identify WHO is making each request.

    Raises:
        HTTPException 401: if the email is unknown or the password is wrong.
    """
    # Find user by email
    user = db.query(User).filter(User.email == payload.email).first()

    # One shared error message for "no such user" and "bad password",
    # so an attacker cannot probe which emails are registered.
    if not user or not verify_password(payload.password, user.password_hash):
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Incorrect email or password."
        )

    # Generate JWT: sub = user's UUID (this is how every request is tied to a user)
    token = create_access_token(data={"sub": str(user.id)})

    # Log the action
    db.add(Log(
        user_id=user.id,
        action="user_login",
        detail=f"User logged in: {user.username}",
        timestamp=datetime.utcnow()
    ))
    db.commit()

    # TokenResponse + UserResponse (instead of a plain dict) lets FastAPI
    # validate the response against the schema and include all required
    # fields (id, username, email, created_at).
    return TokenResponse(
        access_token=token,
        token_type="bearer",
        user=UserResponse.model_validate(user)
    )
198 changes: 198 additions & 0 deletions backend/app/api/upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
import uuid
from datetime import datetime
from fastapi import APIRouter, Depends, UploadFile, File, HTTPException, status, BackgroundTasks
from sqlalchemy.orm import Session

from app.config.database import get_db
from app.config.dependencies import get_current_user
from app.models.db_models import User, Document, DocumentChunk, Log
from app.ingestion.parser import extract_text

router = APIRouter(tags=["Documents"])


@router.post("/upload", status_code=status.HTTP_202_ACCEPTED)
async def upload_document(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """
    Accept an uploaded document, record its metadata, and queue
    background processing; responds 202 without waiting for it.
    """
    # One shared id ties together the saved file, its DB row, and its chunks.
    document_id = f"doc_{uuid.uuid4().hex[:12]}"

    # Persist the raw file to disk; a rejected file becomes an HTTP 400.
    try:
        file_info = await save_upload_file(file, user_id=str(current_user.id))
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))

    # Record metadata in PostgreSQL with status="processing",
    # plus an audit-log entry, in a single commit.
    db.add(Document(
        document_id=document_id,
        filename=file_info["filename"],
        filepath=file_info["filepath"],
        file_type=file_info["file_type"],
        uploaded_by=current_user.id,
        upload_date=datetime.utcnow(),
        status="processing"
    ))
    db.add(Log(
        user_id=current_user.id,
        action="document_upload_started",
        detail=f"file={file_info['filename']} doc_id={document_id}",
        timestamp=datetime.utcnow()
    ))
    db.commit()

    # Heavy work (parse, chunk, index) runs off the request path.
    background_tasks.add_task(
        process_document_background,
        document_id=document_id,
        filepath=file_info["filepath"],
        file_type=file_info["file_type"],
        filename=file_info["filename"],
        user_id=current_user.id,
        username=current_user.username
    )

    # Return immediately — don't wait for processing.
    return {
        "message": "Document uploaded. Processing in background.",
        "document_id": document_id,
        "filename": file_info["filename"],
        "status": "processing"
    }


async def process_document_background(
    document_id: str,
    filepath: str,
    file_type: str,
    filename: str,
    user_id: uuid.UUID,  # User.id is a uuid.uuid4(), not an int
    username: str
):
    """
    Parse, chunk, and index an uploaded document, then mark it
    completed (or failed) in PostgreSQL.

    Runs outside the request cycle, so it opens its own DB session
    instead of using the request-scoped get_db dependency.
    """
    from app.config.database import SessionLocal
    db = SessionLocal()

    try:
        # Extract text
        # NOTE(review): chunk_text / index_chunks are not imported in this
        # file's visible import block — confirm they are in scope.
        text = extract_text(filepath, file_type)
        print(f"✅ Text extracted from {filename}")

        # Chunk the text
        chunks = chunk_text(
            text=text,
            document_id=document_id,
            source_name=filename
        )
        print(f"✅ {len(chunks)} chunks created")

        # Index chunks in ChromaDB
        index_chunks(
            chunks=chunks,
            document_id=document_id,
            uploaded_by=username,
            file_type=file_type
        )
        print(f"✅ Chunks indexed in ChromaDB")

        # Save chunk metadata to PostgreSQL
        for chunk in chunks:
            db.add(DocumentChunk(
                chunk_id=chunk["chunk_id"],
                document_id=document_id,
                source_name=filename,
                text=chunk["text"],
                page=chunk.get("page", 1),
                start_char=chunk.get("start_char", 0),
                end_char=chunk.get("end_char", 0),
            ))

        # Mark document as completed
        doc = db.query(Document).filter(
            Document.document_id == document_id
        ).first()
        if doc:
            doc.status = "completed"

        db.add(Log(
            user_id=user_id,
            action="document_uploaded",
            detail=f"file={filename} chunks={len(chunks)} doc_id={document_id}",
            timestamp=datetime.utcnow()
        ))
        db.commit()
        print(f" ✅ Document {document_id} processing complete")

    except Exception as e:
        # Any processing failure: record it, but never let the
        # error-handling itself crash the background task.
        print(f"❌ Failed: {e}")
        try:
            doc = db.query(Document).filter(
                Document.document_id == document_id
            ).first()
            if doc:
                doc.status = "failed"
            db.add(Log(
                user_id=user_id,
                action="document_upload_failed",
                detail=f"file={filename} error={str(e)}",
                timestamp=datetime.utcnow()
            ))
            db.commit()
        except Exception:
            # Best-effort bookkeeping only — narrow from a bare `except:`
            # so KeyboardInterrupt/SystemExit still propagate.
            pass

    finally:
        db.close()


@router.get("/documents", status_code=status.HTTP_200_OK)
def list_documents(
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """Return every document the current user uploaded, newest first."""
    rows = (
        db.query(Document)
        .filter(Document.uploaded_by == current_user.id)
        .order_by(Document.upload_date.desc())
        .all()
    )

    summaries = []
    for row in rows:
        summaries.append({
            "document_id": row.document_id,
            "filename": row.filename,
            "file_type": row.file_type,
            "status": row.status,
            "upload_date": row.upload_date,
        })

    return {"documents": summaries, "total": len(summaries)}


@router.get("/document/{document_id}/status")
def get_document_status(
    document_id: str,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """Return the processing status of one of the current user's documents."""
    record = (
        db.query(Document)
        .filter(
            Document.document_id == document_id,
            Document.uploaded_by == current_user.id,
        )
        .first()
    )

    # 404 covers both "no such document" and "not owned by this user".
    if record is None:
        raise HTTPException(status_code=404, detail="Document not found")

    return {
        "document_id": record.document_id,
        "filename": record.filename,
        "status": record.status,
        "upload_date": record.upload_date
    }
Loading