Skip to content

Commit 98554f8

Browse files
authored
Merge pull request #41 from CogStack/llm-gen2
Add micro batching and endpoints for v1 list_models and get_model
2 parents 18c4efa + 184832d commit 98554f8

51 files changed

Lines changed: 1089 additions & 2210 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/api-docs.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ jobs:
1616
runs-on: ubuntu-latest
1717
strategy:
1818
matrix:
19-
python-version: [ '3.10' ]
19+
python-version: [ '3.11' ]
2020
max-parallel: 1
2121

2222
steps:

.github/workflows/docker.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ jobs:
1919
- uses: actions/checkout@v4
2020

2121
- name: Lint
22-
run: hadolint --ignore DL3008 --ignore DL3013 --ignore DL3003 --ignore DL4006 docker/Dockerfile* docker/**/Dockerfile*
22+
run: hadolint --ignore DL3008 --ignore DL4006 --ignore DL3006 --ignore SC2046 docker/Dockerfile
2323

2424
build-and-push:
2525
needs: lint
@@ -74,6 +74,9 @@ jobs:
7474
platforms: linux/amd64,linux/arm64
7575
context: .
7676
file: docker/Dockerfile
77+
build-args: |
78+
IMAGE_TYPE=gpu
79+
PIP_EXTRAS=llm
7780
push: true
7881
tags: ${{ steps.cms_meta.outputs.tags }}
7982
labels: ${{ steps.cms_meta.outputs.labels }}

.github/workflows/main.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ jobs:
2424
- name: Install uv and set Python to ${{ matrix.python-version }}
2525
uses: astral-sh/setup-uv@v6
2626
with:
27-
version: "0.8.10"
27+
version: "0.9.30"
2828
python-version: ${{ matrix.python-version }}
2929
- name: Install dependencies
3030
run: |

.github/workflows/release-gpu.yaml

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
name: release
2+
3+
on:
4+
release:
5+
types: [published]
6+
7+
env:
8+
REGISTRY: docker.io
9+
CMS_GPU_IMAGE_NAME: cogstacksystems/cogstack-modelserve-gpu
10+
11+
jobs:
12+
ensure-branch:
13+
runs-on: ubuntu-latest
14+
outputs:
15+
is-valid: ${{ steps.ensure-branch.outputs.is-valid }}
16+
steps:
17+
- name: Ensures release is from the production branch only
18+
id: ensure-branch
19+
run: |
20+
TARGET_BRANCH="${{ github.event.release.target_commitish }}"
21+
if [ "$TARGET_BRANCH" != "production" ]; then
22+
echo "Only releases from the 'production' branch are allowed but found: $TARGET_BRANCH"
23+
echo "is-valid=false" >> "$GITHUB_OUTPUT"
24+
exit 1
25+
else
26+
echo "Target release branch is: $TARGET_BRANCH"
27+
echo "is-valid=true" >> "$GITHUB_OUTPUT"
28+
fi
29+
30+
qc:
31+
runs-on: ubuntu-latest
32+
needs: ensure-branch
33+
if: needs.ensure-branch.outputs.is-valid == 'true'
34+
steps:
35+
- uses: actions/checkout@v4
36+
- name: Install uv
37+
uses: astral-sh/setup-uv@v5
38+
with:
39+
version: "0.9.30"
40+
python-version: "3.11"
41+
- name: Install dependencies
42+
run: |
43+
uv sync --lock --extra dev --extra docs --extra llm
44+
uv run python -m ensurepip
45+
- name: Run unit tests
46+
run: |
47+
uv run pytest -v tests/app --cov --cov-report=html:coverage_reports #--random-order
48+
- name: Run integration tests
49+
run: |
50+
uv run pytest -s -v tests/integration
51+
52+
release-gpu:
53+
runs-on: ubuntu-latest
54+
needs: [ensure-branch, qc]
55+
if: needs.ensure-branch.outputs.is-valid == 'true'
56+
permissions:
57+
contents: read
58+
packages: write
59+
id-token: write
60+
attestations: write
61+
steps:
62+
- uses: actions/checkout@v4
63+
64+
- name: Set up QEMU
65+
uses: docker/setup-qemu-action@v3
66+
67+
- name: Set up Docker Buildx
68+
uses: docker/setup-buildx-action@v3
69+
70+
- name: Extract the tag
71+
run: |
72+
echo "RELEASE_VERSION=${GITHUB_REF/refs\/tags\/v/}" >> $GITHUB_ENV
73+
74+
- name: Login to Docker Hub
75+
uses: docker/login-action@v3
76+
with:
77+
registry: ${{ env.REGISTRY }}
78+
username: ${{ secrets.DOCKERHUB_USERNAME }}
79+
password: ${{ secrets.DOCKERHUB_TOKEN }}
80+
81+
- name: Extract CMS meta
82+
id: cms_meta
83+
uses: docker/metadata-action@v5
84+
with:
85+
images: ${{ env.REGISTRY }}/${{ env.CMS_GPU_IMAGE_NAME }}
86+
87+
- name: Build and push CMS image
88+
uses: docker/build-push-action@v6
89+
id: build_and_push_cms
90+
with:
91+
platforms: linux/amd64,linux/arm64
92+
context: .
93+
file: docker/Dockerfile
94+
build-args: |
95+
IMAGE_TYPE=gpu
96+
PIP_EXTRAS=llm
97+
push: true
98+
github-token: ${{ github.token }}
99+
tags: |
100+
${{ env.REGISTRY }}/${{ env.CMS_GPU_IMAGE_NAME }}:${{ env.RELEASE_VERSION }}
101+
labels: ${{ steps.cms_meta.outputs.labels }}
102+
103+
- name: Attest CMS image artifacts
104+
uses: actions/attest-build-provenance@v2
105+
with:
106+
subject-name: ${{ env.REGISTRY }}/${{ env.CMS_GPU_IMAGE_NAME }}
107+
subject-digest: ${{ steps.build_and_push_cms.outputs.digest }}
108+
push-to-registry: true
109+
110+
- name: Inspect the released image
111+
run: |
112+
docker pull ${{ env.REGISTRY }}/${{ env.CMS_GPU_IMAGE_NAME }}:${{ env.RELEASE_VERSION }}
113+
docker image inspect ${{ env.REGISTRY }}/${{ env.CMS_GPU_IMAGE_NAME }}:${{ env.RELEASE_VERSION }}

.github/workflows/release.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,11 @@ jobs:
3737
- name: Install uv
3838
uses: astral-sh/setup-uv@v5
3939
with:
40-
version: "0.8.10"
41-
python-version: "3.10"
40+
version: "0.9.30"
41+
python-version: "3.11"
4242
- name: Install dependencies
4343
run: |
44-
uv sync --extra dev --extra docs --extra llm
44+
uv sync --lock --extra dev --extra docs --extra llm
4545
uv run python -m ensurepip
4646
- name: Run unit tests
4747
run: |

.gitignore

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,6 @@ venv/
9191
ENV/
9292
env.bak/
9393
venv.bak/
94-
.env
9594

9695
# Spyder project settings
9796
.spyderproject

app/api/routers/generative.py

Lines changed: 92 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,12 @@
1010
from fastapi import APIRouter, Depends, Request, Body, Query
1111
from fastapi.encoders import jsonable_encoder
1212
from fastapi.responses import PlainTextResponse, StreamingResponse, JSONResponse
13-
from starlette.status import HTTP_200_OK, HTTP_400_BAD_REQUEST, HTTP_500_INTERNAL_SERVER_ERROR
13+
from starlette.status import (
14+
HTTP_200_OK,
15+
HTTP_400_BAD_REQUEST,
16+
HTTP_500_INTERNAL_SERVER_ERROR,
17+
HTTP_404_NOT_FOUND,
18+
)
1419
from app.domain import (
1520
Tags,
1621
TagsGenerative,
@@ -35,6 +40,7 @@
3540
PATH_CHAT_COMPLETIONS = "/v1/chat/completions"
3641
PATH_COMPLETIONS = "/v1/completions"
3742
PATH_EMBEDDINGS = "/v1/embeddings"
43+
PATH_MODELS = "/v1/models"
3844

3945
router = APIRouter()
4046
config = get_settings()
@@ -200,7 +206,12 @@ def generate_chat_completions(
200206
max_tokens = request_data.max_tokens
201207
temperature = request_data.temperature
202208
top_p = request_data.top_p
203-
stop_sequences = request_data.stop_sequences
209+
if isinstance(request_data.stop, str):
210+
stop_sequences = [request_data.stop]
211+
elif isinstance(request_data.stop, list):
212+
stop_sequences = request_data.stop
213+
else:
214+
stop_sequences = []
204215
tracking_id = tracking_id or str(uuid.uuid4())
205216

206217
if not messages:
@@ -337,12 +348,11 @@ def generate_text_completions(
337348
max_tokens = request_data.max_tokens
338349
temperature = request_data.temperature
339350
top_p = request_data.top_p
340-
stop = request_data.stop
341351

342-
if isinstance(stop, str):
343-
stop_sequences = [stop]
344-
elif isinstance(stop, list):
345-
stop_sequences = stop
352+
if isinstance(request_data.stop, str):
353+
stop_sequences = [request_data.stop]
354+
elif isinstance(request_data.stop, list):
355+
stop_sequences = request_data.stop
346356
else:
347357
stop_sequences = []
348358

@@ -534,6 +544,81 @@ def embed_texts(
534544
)
535545

536546

547+
@router.get(
548+
PATH_MODELS,
549+
tags=[Tags.OpenAICompatible],
550+
dependencies=[Depends(cms_globals.props.current_active_user)],
551+
description="List available models, similar to OpenAI's /v1/models endpoint",
552+
)
553+
def list_models(
554+
model_service: AbstractModelService = Depends(cms_globals.model_service_dep)
555+
) -> JSONResponse:
556+
"""
557+
Lists all available models, mimicking OpenAI's /v1/models endpoint.
558+
559+
Args:
560+
model_service (AbstractModelService): The model service dependency.
561+
562+
Returns:
563+
JSONResponse: A response containing the list of models.
564+
"""
565+
response = {
566+
"object": "list",
567+
"data": [
568+
{
569+
"id": model_service.model_name.replace(" ", "_"),
570+
"object": "model",
571+
"created": 0,
572+
"owned_by": "cms",
573+
}
574+
],
575+
}
576+
return JSONResponse(content=response)
577+
578+
579+
@router.get(
580+
PATH_MODELS + "/{model_name}",
581+
tags=[Tags.OpenAICompatible],
582+
dependencies=[Depends(cms_globals.props.current_active_user)],
583+
description="Get a specific model, similar to OpenAI's /v1/models/{model_id} endpoint",
584+
)
585+
def get_model(
586+
model_name: str,
587+
model_service: AbstractModelService = Depends(cms_globals.model_service_dep)
588+
) -> JSONResponse:
589+
"""
590+
Gets a specific model by ID, mimicking OpenAI's /v1/models/{model_id} endpoint.
591+
592+
Args:
593+
model_name (str): The model name to retrieve.
594+
model_service (AbstractModelService): The model service dependency.
595+
596+
Returns:
597+
JSONResponse: A response containing the model details.
598+
"""
599+
if model_name != model_service.model_name.replace(" ", "_"):
600+
error_response = {
601+
"error": {
602+
"message": f"The model `{model_name}` does not exist",
603+
"type": "invalid_request_error",
604+
"param": None,
605+
"code": "model_not_found",
606+
}
607+
}
608+
return JSONResponse(content=error_response, status_code=HTTP_404_NOT_FOUND
609+
)
610+
response = {
611+
"id": model_name,
612+
"object": "model",
613+
"created": 0,
614+
"owned_by": "cms",
615+
"permission": [],
616+
"root": model_name,
617+
"parent": None,
618+
}
619+
return JSONResponse(content=response)
620+
621+
537622
def _empty_prompt_error() -> Iterable[str]:
538623
yield "ERROR: No prompt text provided\n"
539624

app/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ class Settings(BaseSettings): # type: ignore
3838
HF_PIPELINE_AGGREGATION_STRATEGY: str = "simple" # the strategy used for aggregating the predictions of the Hugging Face NER model
3939
LOG_PER_CONCEPT_ACCURACIES: str = "false" # if "true", per-concept accuracies will be exposed to the metrics scrapper. Switch this on with caution due to the potentially high number of concepts
4040
MEDCAT2_MAPPED_ONTOLOGIES: str = "" # the comma-separated names of ontologies for MedCAT2 to map to
41+
ENABLE_SPDA_ATTN: str = "true" # if "true", attempt to use SPDA attention for HuggingFace LLM loading
4142
DEBUG: str = "false" # if "true", the debug mode is switched on
4243

4344
class Config:

app/domain.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,10 @@ class OpenAIChatCompletionsRequest(BaseModel):
218218
model: str = Field(..., description="The name of the model used for generating the completion")
219219
temperature: float = Field(0.7, description="The temperature of the generated text", ge=0.0, le=1.0)
220220
top_p: float = Field(0.9, description="The top-p value for nucleus sampling", ge=0.0, le=1.0)
221-
stop_sequences: Optional[List[str]] = Field(default=None, description="The list of sequences used to stop the generation")
221+
stop: Optional[Union[str, List[str]]] = Field(
222+
default=None,
223+
description="The single sequence or the list of sequences used to stop the generation",
224+
)
222225

223226

224227
class OpenAIChatCompletionsResponse(BaseModel):
@@ -242,7 +245,7 @@ class OpenAICompletionsRequest(BaseModel):
242245
top_p: float = Field(0.9, description="The top-p value for nucleus sampling", ge=0.0, le=1.0)
243246
stop: Optional[Union[str, List[str]]] = Field(
244247
default=None,
245-
description="The list of sequences used to stop the generation",
248+
description="The single sequence or the list of sequences used to stop the generation",
246249
)
247250

248251

app/envs/.env

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,5 +79,8 @@ TRAINING_HF_TAGGING_SCHEME=flat
7979
# The comma-separated names of ontologies for MedCAT2 to map to
8080
MEDCAT2_MAPPED_ONTOLOGIES=opcs4,icd10
8181

82+
# If "true", attempt to use SPDA attention for Hugging Face LLM loading
83+
ENABLE_SPDA_ATTN=true
84+
8285
# If "true", the debug mode is switched on
8386
DEBUG=false

0 commit comments

Comments
 (0)