Skip to content

Commit 8916dc2

Browse files
Merge pull request #7 from DataLabTechTV/dev
feat: support for end-to-end ML workflows
2 parents a9999d3 + f14d51e commit 8916dc2

39 files changed

Lines changed: 3260 additions & 100 deletions

.env.example

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ S3_REGION=eu-west-1
1717
S3_BUCKET=lakehouse
1818
S3_INGEST_PREFIX=raw
1919
S3_STAGE_PREFIX=stage
20+
S3_SECURE_STAGE_PREFIX=secure-stage
2021
S3_GRAPHS_MART_PREFIX=marts/graphs
2122
S3_ANALYTICS_MART_PREFIX=marts/analytics
2223
S3_EXPORTS_PREFIX=exports
@@ -30,6 +31,7 @@ S3_BACKUPS_PREFIX=backups
3031

3132
ENGINE_DB=engine.duckdb
3233
STAGE_DB=stage.sqlite
34+
SECURE_STAGE_DB=secure_stage.sqlite
3335
GRAPHS_MART_DB=marts/graphs.sqlite
3436
ANALYTICS_MART_DB=marts/analytics.sqlite
3537

@@ -43,3 +45,17 @@ ECON_COMP_GRAPH_DB=graphs/econ_comp.kuzu
4345
# =====================
4446

4547
OLLAMA_MODELS=gemma3:latest,phi4:latest
48+
49+
# MLflow configurations
50+
# =====================
51+
52+
MLFLOW_TRACKING_URI=http://localhost:5000
53+
MLFLOW_TRACKING_USERNAME=datalabtech
54+
S3_MLFLOW_BUCKET=mlflow
55+
S3_MLFLOW_ARTIFACTS_PREFIX=artifacts
56+
57+
# Kafka configurations
58+
# ====================
59+
60+
KAFKA_BROKER_ENDPOINT=localhost:9092
61+
KAFKA_GROUP_TOPIC_LIST=ml_inference_results:lakehouse-inference-result-consumer,ml_inference_feedback:lakehouse-inference-feedback-consumer

dlctl/cli.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from export.cli import export
1414
from graph.cli import graph
1515
from ingest.cli import ingest
16+
from ml.cli import ml
1617
from shared.cache import cache_usage, expunge_cache
1718
from shared.settings import LOCAL_DIR, MART_DB_VARS, env
1819
from shared.storage import Storage, StoragePrefix
@@ -70,6 +71,7 @@ def dlctl(ctx: click.Context, debug: bool, logfile_enabled: bool, show_version:
7071
dlctl.add_command(ingest)
7172
dlctl.add_command(export)
7273
dlctl.add_command(graph)
74+
dlctl.add_command(ml)
7375

7476
# Backups
7577
# =======

docker-compose.yml

Lines changed: 119 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,19 @@ services:
99
- MINIO_ROOT_PASSWORD=${S3_SECRET_ACCESS_KEY}
1010
volumes:
1111
- minio:/data
12+
networks:
13+
- minio
1214
healthcheck:
1315
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
1416
interval: 10s
1517
retries: 5
1618
restart: unless-stopped
1719
command: server /data --console-address ":9001"
1820

19-
minio-mc:
21+
minio-init:
2022
image: minio/mc:RELEASE.2025-04-16T18-13-26Z
23+
networks:
24+
- minio
2125
depends_on:
2226
minio:
2327
condition: service_healthy
@@ -34,20 +38,25 @@ services:
3438
- "11434:11434"
3539
volumes:
3640
- ollama:/root/.ollama
41+
networks:
42+
- ollama
3743
deploy:
3844
resources:
3945
reservations:
4046
devices:
41-
- capabilities: [gpu]
42-
runtime: nvidia
47+
- driver: nvidia
48+
count: all
49+
capabilities: [gpu]
4350
healthcheck:
4451
test: ["CMD", "ollama", "ls"]
4552
interval: 10s
4653
retries: 3
4754
restart: unless-stopped
4855

49-
ollama-models:
56+
ollama-init:
5057
image: alpine/curl:latest
58+
networks:
59+
- ollama
5160
depends_on:
5261
ollama:
5362
condition: service_healthy
@@ -64,6 +73,112 @@ services:
6473
done
6574
'
6675
restart: no
76+
77+
mlflow:
78+
build:
79+
context: ./docker/mlflow
80+
dockerfile: Dockerfile
81+
ports:
82+
- "5000:5000"
83+
volumes:
84+
- mlflow:/mlflow
85+
networks:
86+
- mlflow
87+
environment:
88+
MLFLOW_S3_ENDPOINT_URL: http://${S3_ENDPOINT}
89+
AWS_ACCESS_KEY_ID: ${S3_ACCESS_KEY_ID}
90+
AWS_SECRET_ACCESS_KEY: ${S3_SECRET_ACCESS_KEY}
91+
AWS_DEFAULT_REGION: ${S3_REGION}
92+
AWS_S3_ADDRESSING_STYLE: ${S3_URL_STYLE}
93+
command: >
94+
mlflow server
95+
--backend-store-uri sqlite:///mlflow/mlflow.db
96+
--serve-artifacts
97+
--artifacts-destination s3://${S3_MLFLOW_BUCKET}/${S3_MLFLOW_ARTIFACTS_PREFIX}
98+
--host 0.0.0.0
99+
--port 5000
100+
healthcheck:
101+
test: >
102+
python -c "import urllib.request;
103+
urllib.request.urlopen('http://localhost:5000')"
104+
interval: 10s
105+
retries: 5
106+
restart: unless-stopped
107+
108+
kafka:
109+
image: apache/kafka:4.0.0
110+
ports:
111+
- "9092:9092"
112+
environment:
113+
KAFKA_NODE_ID: 1
114+
KAFKA_PROCESS_ROLES: broker,controller
115+
116+
KAFKA_LISTENERS: EXTERNAL://:9092,INTERNAL://:29092,CONTROLLER://:9093
117+
KAFKA_ADVERTISED_LISTENERS: EXTERNAL://localhost:9092,INTERNAL://kafka:29092
118+
119+
KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: CONTROLLER:PLAINTEXT,EXTERNAL:PLAINTEXT,INTERNAL:PLAINTEXT
120+
KAFKA_INTER_BROKER_LISTENER_NAME: INTERNAL
121+
122+
KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER
123+
KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093
124+
125+
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
126+
KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1
127+
KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1
128+
KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0
129+
130+
KAFKA_LOG_DIRS: /var/lib/kafka/data
131+
volumes:
132+
- kafka:/var/lib/kafka/data
133+
networks:
134+
- kafka
135+
healthcheck:
136+
test: [
137+
"CMD", "bash", "-c",
138+
"/opt/kafka/bin/kafka-topics.sh --bootstrap-server kafka:29092 --list"
139+
]
140+
interval: 10s
141+
retries: 5
142+
restart: unless-stopped
143+
144+
kafka-init:
145+
image: apache/kafka:4.0.0
146+
environment:
147+
KAFKA_GROUP_TOPIC_LIST: ${KAFKA_GROUP_TOPIC_LIST}
148+
networks:
149+
- kafka
150+
depends_on:
151+
kafka:
152+
condition: service_healthy
153+
command: |
154+
/bin/bash -c '
155+
for topic_group in $${KAFKA_GROUP_TOPIC_LIST//,/ }; do
156+
IFS=':' read -r topic group <<< "$$topic_group"
157+
158+
echo "Creating topic: $$topic"
159+
/opt/kafka/bin/kafka-topics.sh \
160+
--bootstrap-server kafka:29092 \
161+
--create --if-not-exists --topic $$topic \
162+
--partitions 1 --replication-factor 1
163+
164+
echo "Initializing consumer for topic $$topic and group $$group"
165+
/opt/kafka/bin/kafka-console-consumer.sh \
166+
--bootstrap-server kafka:29092 \
167+
--topic $$topic \
168+
--group $$group \
169+
--timeout-ms 5000
170+
done
171+
'
172+
restart: no
173+
67174
volumes:
68175
minio:
69176
ollama:
177+
mlflow:
178+
kafka:
179+
180+
networks:
181+
ollama:
182+
minio:
183+
mlflow:
184+
kafka:

docker/mlflow/Dockerfile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
FROM ghcr.io/mlflow/mlflow:v3.2.0
2+
3+
RUN pip install boto3

graph/visualization.py

Lines changed: 1 addition & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -3,21 +3,12 @@
33

44
import geopandas as gpd
55
import kagglehub
6-
import matplotlib as mpl
76
import matplotlib.pyplot as plt
87
import networkx as nx
98
import numpy as np
109
import pandas as pd
11-
from matplotlib.colors import LinearSegmentedColormap, to_hex
1210

13-
COLOR_PALETTE = [
14-
"#42b0f9",
15-
"#ff5c92",
16-
"#ffcc00",
17-
"#9900ff",
18-
"#92ff5c",
19-
"#f98242",
20-
]
11+
from shared.color import COLOR_PALETTE, get_palette
2112

2213

2314
def set_labels(G: nx.Graph, label_props: dict[str, str]):
@@ -31,29 +22,6 @@ def set_labels(G: nx.Graph, label_props: dict[str, str]):
3122
data["label"] = data[prop]
3223

3324

34-
def darken_color(color, amount=0.5) -> tuple[float, float, float]:
35-
c = mpl.colors.to_rgb(color)
36-
return tuple(max(0, min(1, channel * (1 - amount))) for channel in c)
37-
38-
39-
def get_palette(n_colors: int = 3, darken: bool = False, reverse: bool = False):
40-
color_palette = list(COLOR_PALETTE)
41-
42-
if reverse:
43-
color_palette = reversed(color_palette)
44-
45-
if darken:
46-
color_palette = [darken_color(c) for c in color_palette]
47-
else:
48-
color_palette = list(color_palette)
49-
50-
if n_colors <= len(color_palette):
51-
return color_palette
52-
53-
cmap = LinearSegmentedColormap.from_list("custom", color_palette)
54-
return [to_hex(cmap(i)) for i in np.linspace(0, 1, n_colors)]
55-
56-
5725
def plot(
5826
G: nx.Graph,
5927
name_prop: str = "label",

ingest/fetcher.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
from loguru import logger as log
88
from tqdm import tqdm
99

10-
from shared.cache import get_requests_cache_session
1110
from shared.storage import Storage, StoragePrefix
1211

1312
DATACITE_API_URL = "https://api.datacite.org/"
@@ -17,7 +16,7 @@ class DataCiteFetcher:
1716
def __init__(self, s3_dir_path: str):
1817
self.s3_dir_path = s3_dir_path
1918
self.storage = Storage(StoragePrefix.INGEST)
20-
self.session = get_requests_cache_session("datacite")
19+
self.session = requests.Session()
2120

2221
def to_canonical_doi(self, doi: str) -> str:
2322
rel_path = urlsplit(doi).path.removeprefix("/")

ingest/handler.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,17 @@
33
import git
44
import kagglehub as kh
55
from loguru import logger as log
6-
from slugify import slugify
76

87
from ingest.fetcher import DataCiteFetcher
98
from ingest.parser import DatasetURL
109
from ingest.template.base import DataCiteTemplate, DatasetTemplate, DatasetTemplateID
1110
from shared.cache import get_cache_dir
1211
from shared.storage import Storage, StoragePrefix
12+
from shared.utils import fn_sanitize
1313

1414

1515
def handle_standalone(dataset: str):
16-
ds_name = slugify(dataset, separator="_")
16+
ds_name = fn_sanitize(dataset)
1717
log.info("Standalone detected, creating dataset: {}", ds_name)
1818

1919
try:
@@ -25,7 +25,7 @@ def handle_standalone(dataset: str):
2525

2626

2727
def handle_template(dataset: str, template_id: DatasetTemplateID):
28-
ds_name = slugify(dataset, separator="_")
28+
ds_name = fn_sanitize(dataset)
2929
template = DatasetTemplate.from_id(template_id)
3030

3131
log.info(

ingest/parser.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from typing import Self
33
from urllib.parse import urlparse
44

5-
from slugify import slugify
5+
from shared.utils import fn_sanitize
66

77

88
@dataclass
@@ -20,7 +20,7 @@ def parse(cls, dataset_url: str) -> Self:
2020
author = path[-2]
2121
slug = path[-1]
2222
handle = f"{author}/{slug}"
23-
name = slugify(slug, separator="_")
23+
name = fn_sanitize(slug)
2424

2525
ds_url = cls(
2626
author=author,

0 commit comments

Comments (0)