diff --git a/backend/prisma/migrations/20260626000000_curriculum_search_index/migration.sql b/backend/prisma/migrations/20260626000000_curriculum_search_index/migration.sql new file mode 100644 index 00000000..dce34b37 --- /dev/null +++ b/backend/prisma/migrations/20260626000000_curriculum_search_index/migration.sql @@ -0,0 +1,44 @@ +-- CreateTable +CREATE TABLE "curriculum_search_entries" ( + "id" TEXT NOT NULL, + "workspaceId" TEXT NOT NULL DEFAULT 'default', + "entityType" TEXT NOT NULL, + "entityId" TEXT NOT NULL, + "courseId" TEXT, + "title" TEXT NOT NULL, + "content" TEXT NOT NULL, + "difficulty" TEXT, + "searchVector" tsvector, + "createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP, + "updatedAt" TIMESTAMP(3) NOT NULL, + + CONSTRAINT "curriculum_search_entries_pkey" PRIMARY KEY ("id") +); + +-- CreateIndex +CREATE UNIQUE INDEX "curriculum_search_entries_workspaceId_entityType_entityId_key" ON "curriculum_search_entries"("workspaceId", "entityType", "entityId"); + +-- CreateIndex +CREATE INDEX "curriculum_search_entries_workspaceId_entityType_idx" ON "curriculum_search_entries"("workspaceId", "entityType"); + +-- CreateIndex +CREATE INDEX "curriculum_search_entries_workspaceId_courseId_idx" ON "curriculum_search_entries"("workspaceId", "courseId"); + +-- Full-text search GIN index over the maintained tsvector column. +CREATE INDEX "curriculum_search_entries_search_idx" ON "curriculum_search_entries" USING GIN ("searchVector"); + +-- Trigger keeps "searchVector" in sync with the indexed text. The title is +-- weighted higher than the body so title matches rank above description matches. 
+CREATE OR REPLACE FUNCTION curriculum_search_entries_vector_update() RETURNS trigger AS $$ +BEGIN + NEW."searchVector" := + setweight(to_tsvector('english', COALESCE(NEW."title", '')), 'A') || + setweight(to_tsvector('english', COALESCE(NEW."content", '')), 'B'); + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +CREATE TRIGGER curriculum_search_entries_vector_trigger + BEFORE INSERT OR UPDATE OF "title", "content" + ON "curriculum_search_entries" + FOR EACH ROW EXECUTE FUNCTION curriculum_search_entries_vector_update(); diff --git a/backend/prisma/schema.prisma b/backend/prisma/schema.prisma index 653a8815..85dde65a 100644 --- a/backend/prisma/schema.prisma +++ b/backend/prisma/schema.prisma @@ -271,3 +271,26 @@ model StudentActivity { @@map("student_activities") } +/// Denormalised, full-text searchable index of curriculum content +/// (courses, modules and lessons). Populated by the reindex routine and +/// queried via PostgreSQL full-text search (see CurriculumSearchService). +model CurriculumSearchEntry { + id String @id @default(cuid()) + workspaceId String @default("default") + entityType String // 'course' | 'module' | 'lesson' + entityId String // source id, e.g. 'course-1-lesson-1' + courseId String? + title String + content String // text block indexed for search (title + description) + difficulty String? + /// Maintained automatically by the curriculum_search_entries_vector_update trigger. + searchVector Unsupported("tsvector")? 
+ createdAt DateTime @default(now()) + updatedAt DateTime @updatedAt + + @@unique([workspaceId, entityType, entityId]) + @@index([workspaceId, entityType]) + @@index([workspaceId, courseId]) + @@map("curriculum_search_entries") +} + diff --git a/backend/src/routes/index.ts b/backend/src/routes/index.ts index 0594e6fa..06749223 100644 --- a/backend/src/routes/index.ts +++ b/backend/src/routes/index.ts @@ -13,6 +13,7 @@ import exportRouter from './export.routes.js'; import generatorRouter from './generator/generator.routes.js'; import healthRouter from './health.routes.js'; import learningRoutes from './learning/learning.routes.js'; +import curriculumSearchRouter from './search/curriculum-search.routes.js'; import securityRouter from './security.routes.js'; import studentsRouter from './students.js'; @@ -33,6 +34,7 @@ router.use('/dashboard', dashboardRoutes); router.use('/feedback', feedbackRouter); router.use('/auth', authRoutes); router.use('/learning', learningRoutes); +router.use('/search', curriculumSearchRouter); router.use('/contracts', contractRouter); router.use('/notifications', notificationRouter); router.use('/security', securityRouter); diff --git a/backend/src/routes/search/curriculum-search.routes.ts b/backend/src/routes/search/curriculum-search.routes.ts new file mode 100644 index 00000000..282c3e80 --- /dev/null +++ b/backend/src/routes/search/curriculum-search.routes.ts @@ -0,0 +1,59 @@ +import { Router, Request, Response } from 'express'; +import { getWorkspaceId } from '../../middleware/WorkspaceContext.js'; +import { validateQuery } from '../../utils/validation.js'; +import { curriculumSearchQuerySchema } from './curriculum-search.schemas.js'; +import { + reindexCurriculum, + searchCurriculum, +} from '../../search/curriculum/CurriculumSearchService.js'; + +const router = Router(); + +/** + * GET /api/v1/search + * + * Full-text search across indexed curriculum content (courses, modules, + * lessons). 
/**
 * GET /api/v1/search
 *
 * Full-text search across indexed curriculum content (courses, modules,
 * lessons). Supports filtering by `type`, `difficulty` and `courseId`, and
 * pagination via `limit`/`offset`. Results are ranked by relevance.
 *
 * Responds with `{ query, count, limit, offset, results }`, where `count` is
 * the number of rows in the returned page (not the total number of matches).
 * Any failure while searching is logged and reported as a generic 500.
 */
router.get('/', validateQuery(curriculumSearchQuerySchema), async (req: Request, res: Response) => {
  try {
    // Re-parse to obtain the coerced/typed values (validateQuery only validates).
    const { q, type, difficulty, courseId, limit, offset } = curriculumSearchQuerySchema.parse(
      req.query
    );
    const workspaceId = getWorkspaceId() ?? 'default';

    const results = await searchCurriculum({
      query: q,
      workspaceId,
      entityType: type,
      difficulty,
      courseId,
      limit,
      offset,
    });

    res.json({ query: q, count: results.length, limit, offset, results });
  } catch (error: unknown) {
    // Keep the client-facing message generic, but never drop the cause:
    // without this log a failing query is impossible to diagnose.
    console.error('Curriculum search request failed', error);
    res.status(500).json({ error: 'Search request failed' });
  }
});

/**
 * POST /api/v1/search/reindex
 *
 * Rebuilds the curriculum search index for the current workspace and responds
 * with `{ indexed }`, the number of rows written. Intended for
 * admin/maintenance use (e.g. after curriculum updates or a fresh deploy).
 *
 * NOTE(review): no authorisation middleware is attached to this route here —
 * confirm that access is restricted where the router is mounted.
 */
router.post('/reindex', async (_req: Request, res: Response) => {
  try {
    const workspaceId = getWorkspaceId() ?? 'default';
    const indexed = await reindexCurriculum(workspaceId);
    res.json({ indexed });
  } catch (error: unknown) {
    // Log the underlying failure; the response body stays generic.
    console.error('Curriculum reindex request failed', error);
    res.status(500).json({ error: 'Reindex request failed' });
  }
});

export default router;
+ */ +export const curriculumSearchQuerySchema = z.object({ + q: z.string().trim().min(1, 'q (search keywords) is required'), + type: z.enum(['course', 'module', 'lesson']).optional(), + difficulty: z.enum(['beginner', 'intermediate', 'advanced']).optional(), + courseId: z.string().trim().min(1).optional(), + limit: z.coerce.number().int().min(1).max(50).default(20), + offset: z.coerce.number().int().min(0).default(0), +}); + +export type CurriculumSearchQuery = z.infer; diff --git a/backend/src/search/curriculum/CurriculumSearchService.ts b/backend/src/search/curriculum/CurriculumSearchService.ts new file mode 100644 index 00000000..5897eb5c --- /dev/null +++ b/backend/src/search/curriculum/CurriculumSearchService.ts @@ -0,0 +1,110 @@ +import prisma from '../../db/index.js'; +import { COURSES, curriculumByCourseId } from '../../routes/learning/curriculum.data.js'; +import { + buildCurriculumSearchQuery, + buildIndexEntries, + type CurriculumIndexEntry, + type CurriculumSearchParams, + type SearchableCourse, +} from './curriculumSearchQuery.js'; + +/** + * CurriculumSearchService — the database-facing layer of the search indexer. + * + * Responsibilities: + * - `reindexCurriculum`: (re)build the `curriculum_search_entries` table from + * the curriculum sources (static modules/lessons + DB course records). + * - `searchCurriculum`: run the PostgreSQL full-text query and return ranked + * results. + * + * The query/row-shaping logic lives in `curriculumSearchQuery.ts` (pure and unit + * tested); this module only wires it to Prisma. + */ + +/** A single ranked search hit returned to the API. */ +export interface CurriculumSearchResult { + id: string; + entityType: string; + entityId: string; + courseId: string | null; + title: string; + content: string; + difficulty: string | null; + rank: number; +} + +// The default Prisma export is workspace-extended; raw queries and the new model +// are accessed dynamically to avoid coupling to the extension's narrowed types. 
// The default Prisma export is workspace-extended; raw queries and the new model
// are accessed dynamically to avoid coupling to the extension's narrowed types.
// Only the members this module actually calls are described here.
const db = prisma as unknown as {
  $queryRawUnsafe: (text: string, ...values: unknown[]) => Promise<unknown>;
  $transaction: (ops: unknown[]) => Promise<unknown[]>;
  course: { findMany: (args: unknown) => Promise<SearchableCourse[]> };
  curriculumSearchEntry: {
    deleteMany: (args: unknown) => unknown;
    createMany: (args: unknown) => unknown;
  };
};

/**
 * Merge the curriculum sources into a deduplicated set of index rows.
 *
 * The static curriculum (`curriculum.data.ts`) is the source of modules and
 * lessons; DB `Course` rows are also indexed so live courses are searchable.
 * Static entries are listed first, so on an `entityType:entityId` collision
 * the static row wins and the DB row is dropped.
 *
 * Pure and deterministic for easy testing — pass the DB courses in.
 *
 * @param dbCourses Course records loaded from the database.
 * @returns Index rows with at most one row per `entityType:entityId` pair.
 */
export function collectIndexEntries(dbCourses: SearchableCourse[]): CurriculumIndexEntry[] {
  const staticEntries = buildIndexEntries(
    COURSES.map((c) => ({ id: c.id, title: c.title, description: c.description })),
    curriculumByCourseId
  );
  // DB courses contribute only 'course' rows (no static modules attached).
  const dbEntries = buildIndexEntries(dbCourses, {});

  const seen = new Set<string>();
  const merged: CurriculumIndexEntry[] = [];
  for (const entry of [...staticEntries, ...dbEntries]) {
    const key = `${entry.entityType}:${entry.entityId}`;
    if (seen.has(key)) continue;
    seen.add(key);
    merged.push(entry);
  }
  return merged;
}

/**
 * Rebuild the search index for a workspace.
 *
 * Existing rows for the workspace are deleted and the freshly collected rows
 * inserted in a single transaction. The stored `searchVector` is populated by
 * the database trigger on insert.
 *
 * @param workspaceId Workspace whose index rows are replaced.
 * @returns The number of rows indexed.
 */
export async function reindexCurriculum(workspaceId: string): Promise<number> {
  let dbCourses: SearchableCourse[] = [];
  try {
    dbCourses = await db.course.findMany({
      where: { workspaceId },
      select: { id: true, title: true, description: true },
    });
  } catch (error: unknown) {
    // Best effort: if the courses table is unavailable, still index the static
    // curriculum. Log it, because the rebuilt index will be missing DB courses.
    console.warn(
      `Curriculum reindex: could not load courses for workspace "${workspaceId}"; ` +
        'indexing static curriculum only',
      error
    );
    dbCourses = [];
  }

  const entries = collectIndexEntries(dbCourses);

  await db.$transaction([
    db.curriculumSearchEntry.deleteMany({ where: { workspaceId } }),
    db.curriculumSearchEntry.createMany({
      data: entries.map((entry) => ({ ...entry, workspaceId })),
      skipDuplicates: true,
    }),
  ]);

  return entries.length;
}

/**
 * Execute the full-text search and return ranked results.
 *
 * @param params Validated search parameters (query text, workspace, filters, paging).
 * @returns The rows produced by the query built by `buildCurriculumSearchQuery`.
 */
export async function searchCurriculum(
  params: CurriculumSearchParams
): Promise<CurriculumSearchResult[]> {
  const { text, values } = buildCurriculumSearchQuery(params);
  const rows = await db.$queryRawUnsafe(text, ...values);
  return rows as CurriculumSearchResult[];
}
| + +Response: +```json +{ "query": "soroban auth", "count": 2, "limit": 20, "offset": 0, + "results": [ { "entityType": "lesson", "entityId": "course-1-lesson-3", + "courseId": "course-1", "title": "...", "content": "...", + "difficulty": "intermediate", "rank": 0.12 } ] } +``` + +### `POST /api/v1/search/reindex` +Rebuilds the index for the current workspace. Run after curriculum changes or a +fresh deploy. + +## How it works + +``` +curriculum.data.ts (modules/lessons) ┐ +DB Course records ┘──▶ collectIndexEntries() + ──▶ curriculum_search_entries (tsvector + GIN index) + ──▶ GET /api/v1/search → websearch_to_tsquery + ts_rank +``` + +- **Index table** `curriculum_search_entries` stores one denormalised, searchable + row per course/module/lesson. A `tsvector` column (`searchVector`) is kept up + to date by a database trigger, with the **title weighted above the body** so + title matches rank higher. +- **GIN index** on `searchVector` makes `@@` lookups fast. +- **Workspace isolation**: every row carries `workspaceId`; queries and reindex + are scoped to the request's workspace. + +## Files + +| File | Responsibility | +|------|----------------| +| `curriculumSearchQuery.ts` | Pure builders: `buildIndexEntries`, `buildCurriculumSearchQuery` (parameterised, injection-safe). Unit tested. | +| `CurriculumSearchService.ts` | Prisma wiring: `reindexCurriculum`, `searchCurriculum`, `collectIndexEntries`. | +| `../../routes/search/curriculum-search.routes.ts` | Express routes (mounted at `/api/v1/search`). | +| `../../routes/search/curriculum-search.schemas.ts` | Zod query validation. | +| `prisma/migrations/20260626000000_curriculum_search_index/` | Table + tsvector + GIN index + trigger. 
| + +## Setup + +```bash +cd backend +npx prisma migrate deploy # apply the migration +# then, once per workspace (or via the endpoint): +curl -X POST localhost:8080/api/v1/search/reindex -H 'x-workspace-id: default' +``` + +## Tests + +```bash +cd backend +npm test -- curriculum-search +``` + +Pure builder tests run without a database. The integration tests auto-skip when +no database / migrated index table is available. diff --git a/backend/src/search/curriculum/curriculumSearchQuery.ts b/backend/src/search/curriculum/curriculumSearchQuery.ts new file mode 100644 index 00000000..e3279a7f --- /dev/null +++ b/backend/src/search/curriculum/curriculumSearchQuery.ts @@ -0,0 +1,154 @@ +import type { Module } from '../../routes/learning/types.js'; + +/** + * Pure, side-effect-free helpers for the curriculum search indexer. + * + * These functions contain the logic that benefits most from unit testing — the + * shaping of index rows and the construction of the parameterised full-text + * search SQL — without requiring a live database. The service layer + * (`CurriculumSearchService`) wires them to Prisma. + */ + +export type CurriculumEntityType = 'course' | 'module' | 'lesson'; + +/** Minimal course shape needed to build index entries. */ +export interface SearchableCourse { + id: string; + title: string; + description?: string | null; +} + +/** A denormalised, searchable row destined for `curriculum_search_entries`. */ +export interface CurriculumIndexEntry { + entityType: CurriculumEntityType; + entityId: string; + courseId: string | null; + title: string; + content: string; + difficulty: string | null; +} + +/** Validated parameters for a curriculum search request. */ +export interface CurriculumSearchParams { + query: string; + workspaceId: string; + entityType?: CurriculumEntityType; + difficulty?: string; + courseId?: string; + limit: number; + offset: number; +} + +/** A parameterised SQL statement ready for `$queryRawUnsafe(text, ...values)`. 
/** A parameterised SQL statement ready for `$queryRawUnsafe(text, ...values)`. */
export interface ParameterisedQuery {
  text: string;
  values: unknown[];
}

/**
 * Join non-empty text fragments into a single indexable block.
 *
 * `null`/`undefined` fragments are treated as empty; every fragment is trimmed
 * and the remaining ones are joined with a single space.
 */
function toContent(...parts: Array<string | null | undefined>): string {
  return parts
    .map((part) => (part ?? '').trim())
    .filter(Boolean)
    .join(' ');
}

/**
 * Flatten courses and their modules/lessons into index rows.
 *
 * Each course yields one `course` row, each module one `module` row, and each
 * lesson one `lesson` row, emitted in that nesting order. Lesson rows carry
 * their difficulty so callers can filter by it; course and module rows always
 * have `difficulty: null`. The text block combines the title and description so
 * both are searchable.
 *
 * @param courses Courses to index.
 * @param curriculumByCourseId Modules keyed by course id; a course without an
 *   entry contributes only its `course` row.
 * @returns One row per course, module and lesson.
 */
export function buildIndexEntries(
  courses: SearchableCourse[],
  curriculumByCourseId: Record<string, Module[]>
): CurriculumIndexEntry[] {
  const entries: CurriculumIndexEntry[] = [];

  for (const course of courses) {
    entries.push({
      entityType: 'course',
      entityId: course.id,
      courseId: course.id,
      title: course.title,
      content: toContent(course.title, course.description),
      difficulty: null,
    });

    // `mod`, not `module`: avoids shadowing the CommonJS `module` identifier.
    for (const mod of curriculumByCourseId[course.id] ?? []) {
      entries.push({
        entityType: 'module',
        entityId: mod.id,
        courseId: course.id,
        title: mod.title,
        content: toContent(mod.title, mod.description),
        difficulty: null,
      });

      // Tolerate a module that has no lessons array instead of throwing.
      for (const lesson of mod.lessons ?? []) {
        entries.push({
          entityType: 'lesson',
          entityId: lesson.id,
          courseId: course.id,
          title: lesson.title,
          content: toContent(lesson.title, lesson.description),
          // Keep the row's `string | null` contract when difficulty is absent.
          difficulty: lesson.difficulty ?? null,
        });
      }
    }
  }

  return entries;
}
/**
 * Build the parameterised full-text search query.
 *
 * Uses `websearch_to_tsquery`, which safely parses arbitrary user input (quoted
 * phrases, `or`, `-negation`) and never throws on malformed queries. Results are
 * ranked with `ts_rank` (title weighted above body via the stored vector) and
 * paginated. All user-supplied values are passed as bind parameters — never
 * interpolated — so the query is injection-safe.
 *
 * Ordering is `rank DESC, "title" ASC, "id" ASC`. The trailing `"id"` makes the
 * order total, so LIMIT/OFFSET pages cannot repeat or skip rows when several
 * hits share the same rank and title.
 *
 * @param params Validated search parameters. `entityType`, `difficulty` and
 *   `courseId` are added to the WHERE clause only when truthy.
 * @returns SQL text with `$n` placeholders and the matching values, in bind
 *   order: query, workspace, optional filters (type, difficulty, courseId),
 *   limit, offset.
 */
export function buildCurriculumSearchQuery(params: CurriculumSearchParams): ParameterisedQuery {
  const values: unknown[] = [];
  // Registers a value and returns its positional placeholder ($1, $2, ...).
  const bind = (value: unknown): string => {
    values.push(value);
    return `$${values.length}`;
  };

  const queryParam = bind(params.query);
  const workspaceParam = bind(params.workspaceId);

  const where: string[] = [
    `"workspaceId" = ${workspaceParam}`,
    `"searchVector" @@ websearch_to_tsquery('english', ${queryParam})`,
  ];

  if (params.entityType) {
    where.push(`"entityType" = ${bind(params.entityType)}`);
  }
  if (params.difficulty) {
    where.push(`"difficulty" = ${bind(params.difficulty)}`);
  }
  if (params.courseId) {
    where.push(`"courseId" = ${bind(params.courseId)}`);
  }

  // Bound last so their placeholders always follow the optional filters.
  const limitParam = bind(params.limit);
  const offsetParam = bind(params.offset);

  const text = [
    'SELECT "id", "entityType", "entityId", "courseId", "title", "content", "difficulty",',
    `       ts_rank("searchVector", websearch_to_tsquery('english', ${queryParam})) AS rank`,
    'FROM "curriculum_search_entries"',
    `WHERE ${where.join('\n  AND ')}`,
    // "id" is unique, so it breaks any remaining tie deterministically.
    'ORDER BY rank DESC, "title" ASC, "id" ASC',
    `LIMIT ${limitParam} OFFSET ${offsetParam}`,
  ].join('\n');

  return { text, values };
}
// Minimal one-course / one-module / one-lesson curriculum used by the pure
// builder tests below.
const sampleCurriculum: Record<string, Module[]> = {
  'course-1': [
    {
      id: 'm1',
      title: 'Soroban Foundations',
      description: 'Mental model for contracts.',
      order: 1,
      lessons: [
        {
          id: 'l1',
          title: 'What Soroban Adds',
          description: 'Primitives overview.',
          difficulty: 'beginner',
          order: 1,
        },
      ],
    },
  ],
};

describe('curriculum search — pure builders', () => {
  describe('buildIndexEntries', () => {
    it('flattens courses, modules and lessons into rows', () => {
      const entries = buildIndexEntries(
        [{ id: 'course-1', title: 'Soroban Smart Contracts', description: 'Master contracts.' }],
        sampleCurriculum
      );

      // 1 course + 1 module + 1 lesson
      expect(entries).toHaveLength(3);
      expect(entries.map((e) => e.entityType)).toEqual(['course', 'module', 'lesson']);

      const lesson = entries.find((e) => e.entityType === 'lesson');
      expect(lesson?.entityId).toBe('l1');
      expect(lesson?.courseId).toBe('course-1');
      expect(lesson?.difficulty).toBe('beginner');
      expect(lesson?.content).toBe('What Soroban Adds Primitives overview.');
    });

    it('handles courses without curriculum', () => {
      const entries = buildIndexEntries([{ id: 'c9', title: 'Lonely', description: null }], {});
      expect(entries).toHaveLength(1);
      expect(entries[0]).toMatchObject({ entityType: 'course', entityId: 'c9', content: 'Lonely' });
    });
  });

  describe('buildCurriculumSearchQuery', () => {
    it('binds query + workspace and omits absent filters', () => {
      const { text, values } = buildCurriculumSearchQuery({
        query: 'soroban',
        workspaceId: 'default',
        limit: 20,
        offset: 0,
      });
      expect(values).toEqual(['soroban', 'default', 20, 0]);
      expect(text).toContain("websearch_to_tsquery('english', $1)");
      expect(text).toContain('"workspaceId" = $2');
      // The SELECT list always names "entityType", so assert on the filter
      // clause (`"entityType" = $n`) rather than on the bare column name.
      expect(text).not.toContain('"entityType" =');
      expect(text).not.toContain('"difficulty" =');
      expect(text).not.toContain('"courseId" =');
      expect(text).toContain('LIMIT $3 OFFSET $4');
      expect(text).toContain('ORDER BY rank DESC');
    });

    it('appends type, difficulty and courseId filters in order', () => {
      const { text, values } = buildCurriculumSearchQuery({
        query: 'auth',
        workspaceId: 'w1',
        entityType: 'lesson',
        difficulty: 'beginner',
        courseId: 'course-1',
        limit: 10,
        offset: 5,
      });
      expect(values).toEqual(['auth', 'w1', 'lesson', 'beginner', 'course-1', 10, 5]);
      expect(text).toContain('"entityType" = $3');
      expect(text).toContain('"difficulty" = $4');
      expect(text).toContain('"courseId" = $5');
      expect(text).toContain('LIMIT $6 OFFSET $7');
    });
  });

  describe('collectIndexEntries', () => {
    it('indexes static modules/lessons and deduplicates DB courses', () => {
      // DB course collides with a static course id — static entry should win.
      const entries = collectIndexEntries([
        { id: 'course-1', title: 'DB Title', description: 'db' },
        { id: 'db-only', title: 'Fresh Course', description: 'new' },
      ]);

      const courseOnes = entries.filter(
        (e) => e.entityType === 'course' && e.entityId === 'course-1'
      );
      expect(courseOnes).toHaveLength(1);
      expect(courseOnes[0].title).not.toBe('DB Title'); // static wins on dedupe

      expect(entries.some((e) => e.entityType === 'lesson')).toBe(true);
      expect(entries.some((e) => e.entityId === 'db-only')).toBe(true);
    });
  });
});

describe('GET /api/v1/search (integration)', () => {
  // Set in beforeAll once the database, the migrated index table and a
  // successful reindex are all confirmed; each test returns early otherwise.
  let canRun = false;

  beforeAll(async () => {
    try {
      await prisma.$connect();
      // Verify the search index table exists (migration applied).
      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- raw query access on the extended client
      await (prisma as any).$queryRawUnsafe('SELECT 1 FROM "curriculum_search_entries" LIMIT 1');
      await request(app).post('/api/v1/search/reindex').expect(200);
      canRun = true;
    } catch {
      console.warn('Search index/database unavailable — skipping integration tests');
    }
  });

  afterAll(async () => {
    try {
      await prisma.$disconnect();
    } catch {
      /* ignore */
    }
  });

  it('returns 400 when q is missing', async () => {
    if (!canRun) return;
    await request(app).get('/api/v1/search').expect(400);
  });

  it('returns ranked lessons/modules for a keyword', async () => {
    if (!canRun) return;
    const res = await request(app).get('/api/v1/search').query({ q: 'soroban' }).expect(200);
    expect(Array.isArray(res.body.results)).toBe(true);
    expect(res.body.count).toBeGreaterThan(0);
  });

  it('filters by type', async () => {
    if (!canRun) return;
    const res = await request(app)
      .get('/api/v1/search')
      .query({ q: 'contract', type: 'lesson' })
      .expect(200);
    expect(res.body.results.every((r: { entityType: string }) => r.entityType === 'lesson')).toBe(
      true
    );
  });
});