ScienciaLAB · lfoppiano · Jun 7, 2026 · Jun 7, 2026 · Jun 7, 2026 · Jun 7, 2026
diff --git a/.env.example b/.env.example
@@ -1,18 +1,17 @@
-PHI_URL=....
-QWEN_URL=...
+# ── LLM endpoints (OpenAI-compatible vLLM servers on Modal) ──
+PHI_URL=https://<account>--phi-4-mini-instruct-qa-vllm-serve.modal.run/v1
+QWEN_URL=https://<account>--qwen-0-6b-qa-vllm-serve.modal.run/v1
+API_KEY=your-llm-api-key
 
-EMBEDS_URL=...
+# ── Embedding endpoint ───────────────────────────────────────
+EMBEDS_URL=https://<account>--embeddings-multilang.modal.run
+EMBEDS_API_KEY=your-embedding-api-key
+
+# ── Defaults pre-selected in the UI ──────────────────────────
 DEFAULT_MODEL=microsoft/Phi-4-mini-instruct
 DEFAULT_EMBEDDING=intfloat/multilingual-e5-large-instruct-modal
 
-API_KEY=...
-EMBEDS_API_KEY=...
-
-GROBID_URL=...
-GROBID_QUANTITIES_URL=...
-
-
-QWEN_URL=...
-GROBID_MATERIALS_URL=... 
-API_KEY=... 
-EMBEDS_API_KEY=...
+# ── GROBID services ──────────────────────────────────────────
+GROBID_URL=https://your-grobid-url
+GROBID_QUANTITIES_URL=https://your-grobid-quantities-url/   # optional (measurements NER)
+GROBID_MATERIALS_URL=https://your-grobid-superconductors-url/  # optional (materials NER)
diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml
@@ -31,14 +31,14 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install --upgrade flake8 pytest pycodestyle pytest-cov huggingface_hub
+          pip install --upgrade ruff pytest pytest-cov huggingface_hub
           if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-      - name: Lint with flake8
+      - name: Lint with ruff
         run: |
           # stop the build if there are Python syntax errors or undefined names
-          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+          ruff check --select=E9,F63,F7,F82 --output-format=full .
+          # non-blocking: report all remaining style issues without failing the build
+          ruff check --exit-zero --statistics .
       - name: Test with pytest
         run: |
           pytest

diff --git a/.github/workflows/ci-release.yml b/.github/workflows/ci-release.yml
@@ -25,14 +25,14 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install --upgrade flake8 pytest pycodestyle
+          pip install --upgrade ruff pytest
           if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-      - name: Lint with flake8
+      - name: Lint with ruff
         run: |
           # stop the build if there are Python syntax errors or undefined names
-          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+          ruff check --select=E9,F63,F7,F82 --output-format=full .
+          # non-blocking: report all remaining style issues without failing the build
+          ruff check --exit-zero --statistics .
     #    - name: Test with pytest
     #      run: |
     #        pytest

diff --git a/README.md b/README.md
@@ -46,6 +46,8 @@ Additionally, this frontend provides the visualisation of named entities on LLM
 
  **For full technical documentation** of the `document-qa-engine` library **[`docs/README.md`](docs/README.md)**.
 
+ **To deploy the LLM and embedding endpoints** on Modal.com, see **[`document_qa/deployment/README.md`](document_qa/deployment/README.md)**.
+
 ### Embedding selection
 In the latest version, there is the possibility to select both embedding functions and LLMs. There are some limitations, OpenAI embeddings cannot be used with open source models, and vice-versa. 
 
@@ -83,7 +85,7 @@ For more information, see the [details](https://docs.trychroma.com/troubleshooti
 Please read carefully:
 
 - Avoid uploading sensitive data. We temporarily store text from the uploaded PDF documents only for processing your request, and we disclaim any responsibility for subsequent use or handling of the submitted data by third-party LLMs.
-- Mistral and Zephyr are FREE to use and do not require any API, but as we leverage the free API entrypoint, there is no guarantee that all requests will go through. Use at your own risk.
+- The public demo serves open models (Phi-4-mini-instruct, Qwen3) self-hosted on [Modal.com](https://www.modal.com) under a limited monthly compute budget, so there is no guarantee that all requests will go through. Use at your own risk.
 - We do not assume responsibility for how the data is utilized by the LLM end-points API.
 
 ## Development notes

diff --git a/docs/README.md b/docs/README.md
@@ -97,6 +97,13 @@ GROBID_MATERIALS_URL=https://your-grobid-superconductors-url/
 | `GROBID_QUANTITIES_URL` | URL to a grobid-quantities server (for measurement NER) |
 | `GROBID_MATERIALS_URL` | URL to a grobid-superconductors server (for materials NER) |
 
+### Deploying the model endpoints
+
+The `PHI_URL`, `QWEN_URL`, and `EMBEDS_URL` endpoints above are served by the Modal apps
+in [`../document_qa/deployment/`](../document_qa/deployment/README.md). That README covers
+the required secrets, deploy commands, and how each printed `*.modal.run` URL maps back to
+these variables.
+
 ---
 
 ## Quick Start — Streamlit App

diff --git a/document_qa/custom_embeddings.py b/document_qa/custom_embeddings.py
@@ -47,18 +47,13 @@ def embed(self, text: List[str]) -> List[List[float]]:
         # Newlines degrade embedding quality for most models
         cleaned_text = [t.replace("\n", " ") for t in text]
 
-        payload = {'text': "\n".join(cleaned_text)}
+        payload = {"text": "\n".join(cleaned_text)}
 
         headers = {}
         if self.api_key:
-            headers = {'x-api-key': self.api_key}
-
-        response = requests.post(
-            self.url,
-            data=payload,
-            files=[],
-            headers=headers
-        )
+            headers = {"x-api-key": self.api_key}
+
+        response = requests.post(self.url, data=payload, files=[], headers=headers)
         response.raise_for_status()
 
         # print(response.text)
@@ -92,12 +87,15 @@ def get_model_name(self) -> str:
 
 
 if __name__ == "__main__":
+    # Smoke test against a deployed Modal embedding endpoint. The endpoint requires
+    # the x-api-key header, so set EMBEDS_URL and EMBEDS_API_KEY in the environment
+    # (see document_qa/deployment/README.md).
+    import os
+
     embeds = ModalEmbeddings(
-        url="https://lfoppiano--intfloat-multilingual-e5-large-instruct-embed-5da184.modal.run/",
-        model_name="intfloat/multilingual-e5-large-instruct"
+        url=os.environ["EMBEDS_URL"],
+        model_name="intfloat/multilingual-e5-large-instruct",
+        api_key=os.environ.get("EMBEDS_API_KEY"),
     )
 
-    print(embeds.embed(
-        ["We are surrounded by stupid kids",
-         "We are interested in the future of AI"]
-    ))
+    print(embeds.embed(["We are surrounded by stupid kids", "We are interested in the future of AI"]))
diff --git a/document_qa/deployment/README.md b/document_qa/deployment/README.md
@@ -0,0 +1,105 @@
+# Modal deployment scripts
+
+This folder contains the [Modal](https://modal.com) apps that serve the LLM and
+embedding endpoints used by document-qa. Each script is an independent Modal app:
+deploy the ones you need, then point the matching `.env` variables at the URLs
+Modal prints.
+
+| Script | Modal app | Serves | Maps to `.env` |
+|--------|-----------|--------|----------------|
+| `modal_inference_phi.py` | `phi-4-mini-instruct-qa-vllm` | `microsoft/Phi-4-mini-instruct` (vLLM, OpenAI-compatible) | `PHI_URL` |
+| `modal_inference_qwen.py` | `qwen-0.6b-qa-vllm` | `Qwen/Qwen3-0.6B` (vLLM, reasoning) | `QWEN_URL` |
+| `modal_embeddings_multilang.py` | `intfloat-multilingual-e5-large-instruct-embeddings` | `intfloat/multilingual-e5-large-instruct` | `EMBEDS_URL` |
+| `modal_embeddings_en.py` | `intfloat-e5-large-v2-embeddings` | `intfloat/e5-large-v2` (English-only) | `EMBEDS_URL` |
+
+> Both embedding scripts define a tiny global `EmbeddingModel` class that delegates
+> to the shared helpers in `_embeddings_app.py` (`cls_kwargs`, `load_embedding_model`,
+> `run_embed`). The shared module holds the container image and the embedding logic;
+> the model is loaded **once per container** via `@modal.enter()`. To add another
+> embedding model, copy one wrapper and change `MODEL_NAME` / `MODEL_REVISION` / the
+> app name.
+
+## Prerequisites
+
+```bash
+pip install modal
+modal token new      # one-time browser auth
+```
+
+## Secrets
+
+The scripts read an `API_KEY` from a Modal [Secret](https://modal.com/docs/guide/secrets).
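+Create the two secrets once. The value is the token clients must send: as an
+`Authorization: Bearer` token for the inference endpoints, and in the `x-api-key`
+header for the embedding endpoints: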
+
+```bash
+# Used by the inference scripts (phi, qwen)
+modal secret create document-qa-api-key API_KEY=<your-llm-token>
+
+# Used by the embedding scripts
+modal secret create document-qa-embedding-key API_KEY=<your-embedding-token>
+```
+
+| Secret | Used by | Provides |
+|--------|---------|----------|
+| `document-qa-api-key` | `modal_inference_phi.py`, `modal_inference_qwen.py` | `API_KEY` for the vLLM `--api-key` flag |
+| `document-qa-embedding-key` | `modal_embeddings_*.py` | `API_KEY` checked against the `x-api-key` header |
+
+## Deploy
+
+```bash
+modal deploy document_qa/deployment/modal_inference_phi.py
+modal deploy document_qa/deployment/modal_inference_qwen.py
+modal deploy document_qa/deployment/modal_embeddings_multilang.py
+# modal deploy document_qa/deployment/modal_embeddings_en.py   # optional English-only
+```
+
+Each deploy prints a public `https://<...>.modal.run` URL. Copy it into `.env`:
+
+```env
+PHI_URL=https://<account>--phi-4-mini-instruct-qa-vllm-serve.modal.run/v1
+QWEN_URL=https://<account>--qwen-0-6b-qa-vllm-serve.modal.run/v1
+EMBEDS_URL=https://<account>--embeddings-multilang.modal.run   # English-only: --embeddings-en
+API_KEY=<your-llm-token>            # matches document-qa-api-key
+EMBEDS_API_KEY=<your-embedding-token>  # matches document-qa-embedding-key
+```
+
+> **Inference endpoints** are OpenAI-compatible vLLM servers, so their URLs end in
+> `/v1`. **Embedding endpoints** are a custom form endpoint (see below), so their
+> URL has no `/v1` suffix.
+
+## Endpoint contracts
+
+### Inference (vLLM)
+
+Standard OpenAI Chat Completions API at `<PHI_URL|QWEN_URL>`, authenticated with the
+`Authorization: Bearer <API_KEY>` header. Used by `langchain_openai.ChatOpenAI` in
+`streamlit_app.py`.
+
+### Embeddings
+
+A custom `POST` endpoint consumed by
+[`ModalEmbeddings`](../custom_embeddings.py):
+
+- **Auth**: `x-api-key: <EMBEDS_API_KEY>` header.
+- **Body**: form field `text` with newline-separated strings.
+- **Response**: JSON list of L2-normalised vectors, one per non-blank input line (blank lines are skipped).
+
+Smoke test:
+
+```bash
+curl -X POST "$EMBEDS_URL" \
+  -H "x-api-key: $EMBEDS_API_KEY" \
+  -F $'text=first sentence\nsecond sentence'
+```
+
+## Tuning
+
+These knobs live near the top of each script (or in `_embeddings_app.py`):
+
+| Setting | Where | Notes |
+|---------|-------|-------|
+| `gpu` | `@app.function` / `@app.cls` | `A10G` is cheaper; `L40S` is faster. Embeddings default to `L40S`, inference to `A10G`. |
+| `scaledown_window` | decorator | Idle time before a replica is stopped (cost vs. cold starts). |
+| `max_inputs` | `@modal.concurrent` | Concurrent requests per replica — tune to GPU memory. |
+| `LABEL` | `modal_embeddings_*.py` | Pins the public URL (`--<label>.modal.run`). Without it Modal truncates the long auto-name and appends a random hash. |
+| `FAST_BOOT` | `modal_inference_phi.py` | `--enforce-eager` for faster cold starts vs. peak throughput. |
diff --git a/document_qa/deployment/_embeddings_app.py b/document_qa/deployment/_embeddings_app.py
@@ -0,0 +1,116 @@
+"""Shared building blocks for the Modal embedding endpoints.
+
+``modal_embeddings_en.py`` and ``modal_embeddings_multilang.py`` each define a tiny
+``EmbeddingModel`` class at module scope (Modal requires globally-defined classes
+with stacked ``@app.cls`` / ``@modal.concurrent`` decorators) that delegates to the
+helpers here. All the heavy lifting — the container image, model loading, pooling,
+and the embedding request handler — lives in this module so it is written once.
+
+The endpoint contract (consumed by ``document_qa.custom_embeddings.ModalEmbeddings``):
+
+- **Method**: ``POST``
+- **Auth**: ``x-api-key`` header, compared against the ``API_KEY`` secret.
+- **Body**: form field ``text`` containing newline-separated strings.
+- **Response**: JSON list of L2-normalised embedding vectors, one per non-blank input line.
+"""
+
+import hmac
+import os
+
+import modal
+import torch
+import torch.nn.functional as F
+from fastapi import HTTPException, Request
+from torch import Tensor
+
+MINUTES = 60  # seconds
+# Number of GPUs requested per container; interpolated into the ``gpu`` spec
+# built by ``cls_kwargs``.
+N_GPU = 1
+
+# Shared container image for every embedding model.
+# NOTE(review): this module imports ``torch`` at import time, yet ``torch`` is not
+# listed explicitly in ``pip_install`` below — presumably it is pulled in
+# transitively by one of the listed packages; confirm, or add it explicitly.
+image = (
+    modal.Image.debian_slim(python_version="3.11")
+    .pip_install(
+        "transformers",
+        "huggingface_hub[hf_transfer]==0.26.2",
+        "flashinfer-python==0.2.0.post2",  # pinning, very unstable
+        "fastapi[standard]",
+        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
+    )
+    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
+    # Modal 1.0 no longer auto-mounts imported local modules; the wrapper scripts
+    # import this module by name, so it must be added explicitly. Kept last so it
+    # doesn't invalidate the (expensive) pip layer above on every code edit.
+    .add_local_python_source("_embeddings_app")
+)
+
+# Named Modal volumes, created on first use. ``cls_kwargs`` mounts them at the
+# Hugging Face and vLLM cache directories under ``/root/.cache``.
+hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
+vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)
+
+
+def cls_kwargs() -> dict:
+    """Build the keyword arguments every embedding endpoint passes to ``@app.cls``.
+
+    Returns:
+        dict: A fresh mapping holding the shared image, the GPU spec, the idle
+        scale-down window, the cache-volume mounts, and the embedding API-key
+        secret.
+    """
+    cache_mounts = {
+        "/root/.cache/huggingface": hf_cache_vol,
+        "/root/.cache/vllm": vllm_cache_vol,
+    }
+    api_key_secret = modal.Secret.from_name("document-qa-embedding-key")
+    return {
+        "image": image,
+        "gpu": f"L40S:{N_GPU}",
+        # Seconds to stay up with no requests before scaling down.
+        "scaledown_window": 3 * MINUTES,
+        "volumes": cache_mounts,
+        "secrets": [api_key_secret],
+    }
+
+
+def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
+    """Mean-pool token embeddings, ignoring padding positions.
+
+    Args:
+        last_hidden_states: Per-token hidden states; the reductions below treat
+            dimension 1 as the token axis (assumed ``(batch, tokens, hidden)``).
+        attention_mask: Mask with one entry per token (assumed
+            ``(batch, tokens)``); zero marks a padding position.
+
+    Returns:
+        Tensor: One pooled vector per row. A row whose mask is entirely zero
+        yields a zero vector instead of NaN.
+    """
+    keep = attention_mask[..., None].bool()
+    last_hidden = last_hidden_states.masked_fill(~keep, 0.0)
+    # Clamp the token count to at least 1 so a fully masked row computes 0 / 1
+    # rather than 0 / 0 (NaN).
+    token_counts = attention_mask.sum(dim=1).clamp(min=1)
+    return last_hidden.sum(dim=1) / token_counts[..., None]
+
+
+def load_embedding_model(model_name: str, model_revision: str):
+    """Fetch the tokenizer and model for ``model_name`` and place the model on a device.
+
+    Args:
+        model_name: Model identifier passed to ``from_pretrained``.
+        model_revision: Revision passed to ``from_pretrained`` for both the
+            tokenizer and the model.
+
+    Returns:
+        tuple: ``(tokenizer, model, device)``; ``model`` has been moved to
+        ``device`` (CUDA when available, otherwise CPU) and put in eval mode.
+    """
+    # Deferred import: transformers is only available inside the Modal image.
+    from transformers import AutoModel, AutoTokenizer
+
+    device_name = "cuda" if torch.cuda.is_available() else "cpu"
+    device = torch.device(device_name)
+    print(f"Loading {model_name} on {device}...")
+
+    pretrained_kwargs = {"revision": model_revision}
+    tokenizer = AutoTokenizer.from_pretrained(model_name, **pretrained_kwargs)
+    model = AutoModel.from_pretrained(model_name, **pretrained_kwargs)
+    model = model.to(device)
+    model.eval()
+
+    print("Model loaded successfully.")
+    return tokenizer, model, device
+
+
+def run_embed(tokenizer, model, device, request: Request, text: str):
+    """Authenticate the request, embed ``text``, and return normalised vectors.
+
+    Args:
+        tokenizer: Tokenizer callable, e.g. as returned by ``load_embedding_model``.
+        model: Model callable, e.g. as returned by ``load_embedding_model``.
+        device: Device the tokenized batch is moved to before the forward pass.
+        request: Incoming request; only its ``x-api-key`` header is read.
+        text: Newline-separated strings to embed. Blank lines are skipped.
+
+    Returns:
+        list: One L2-normalised vector (list of floats) per non-blank line, in
+        input order; an empty list when ``text`` has no non-blank line.
+
+    Raises:
+        HTTPException: 401 when the ``x-api-key`` header is missing or does not
+            match the ``API_KEY`` environment variable.
+        RuntimeError: Re-raised unchanged, after logging, when embedding fails
+            (for example on CUDA out-of-memory).
+    """
+    provided_key = request.headers.get("x-api-key")
+    expected_key = os.environ["API_KEY"]
+    # Constant-time comparison so response timing does not reveal how much of
+    # the key matched. Both sides are encoded to bytes because compare_digest
+    # rejects non-ASCII ``str`` arguments. A missing header is always rejected,
+    # even if API_KEY were empty.
+    if provided_key is None or not hmac.compare_digest(
+        provided_key.encode("utf-8"), expected_key.encode("utf-8")
+    ):
+        raise HTTPException(status_code=401, detail="Unauthorized")
+
+    # Blank lines are dropped, so the response can be shorter than the input.
+    texts = [t for t in text.split("\n") if t.strip()]
+    if not texts:
+        return []
+
+    print(f"Start embedding {len(texts)} texts")
+    try:
+        with torch.no_grad():
+            batch_dict = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
+            batch_dict = {k: v.to(device) for k, v in batch_dict.items()}
+
+            outputs = model(**batch_dict)
+            embeddings = average_pool(outputs.last_hidden_state, batch_dict["attention_mask"])
+            embeddings = F.normalize(embeddings, p=2, dim=1)
+            # Move to CPU and convert to plain lists so the result is JSON-serialisable.
+            embeddings = embeddings.cpu().numpy().tolist()
+
+        print("Finished embedding texts.")
+        return embeddings
+
+    except RuntimeError as e:
+        print(f"Error during embedding: {str(e)}")
+        if "CUDA out of memory" in str(e):
+            print("CUDA OOM. Try reducing batch size or using a smaller model.")
+        raise