Example Domain
This domain is for use in documentation examples without needing permission. Avoid use in operations.
Learn more
+
+App /health endpoint:
+Defaulted container "devops-python-app" out of: devops-python-app, init-wait-dns (init), init-download (init) {"status":"healthy","timestamp":"2026-05-14T19:30:06.938108+00:00","uptime_seconds":201}
+
+Per-pod visit counts (storage isolation):
+Defaulted container "devops-python-app" out of: devops-python-app, init-wait-dns (init), init-download (init) {"visits":3,"visits_file":"/data/visits"}
+Defaulted container "devops-python-app" out of: devops-python-app, init-wait-dns (init), init-download (init) {"visits":5,"visits_file":"/data/visits"}
+Defaulted container "devops-python-app" out of: devops-python-app, init-wait-dns (init), init-download (init) {"visits":2,"visits_file":"/data/visits"}
diff --git a/k8s/screenshots/lab16-init-download.txt b/k8s/screenshots/lab16-init-download.txt
new file mode 100644
index 0000000000..59cd2ff91f
Binary files /dev/null and b/k8s/screenshots/lab16-init-download.txt differ
diff --git a/k8s/screenshots/lab16-init-logs.txt b/k8s/screenshots/lab16-init-logs.txt
new file mode 100644
index 0000000000..80f40bfe14
Binary files /dev/null and b/k8s/screenshots/lab16-init-logs.txt differ
diff --git a/k8s/screenshots/lab16-monitoring-pods.txt b/k8s/screenshots/lab16-monitoring-pods.txt
new file mode 100644
index 0000000000..cf06e97551
Binary files /dev/null and b/k8s/screenshots/lab16-monitoring-pods.txt differ
diff --git a/k8s/screenshots/lab16-resources.txt b/k8s/screenshots/lab16-resources.txt
new file mode 100644
index 0000000000..0a471af057
Binary files /dev/null and b/k8s/screenshots/lab16-resources.txt differ
diff --git a/k8s/service-app2.yml b/k8s/service-app2.yml
new file mode 100644
index 0000000000..902c4b3d76
--- /dev/null
+++ b/k8s/service-app2.yml
@@ -0,0 +1,16 @@
+apiVersion: v1
+kind: Service
+metadata:
+ name: devops-python-app-v2-service
+ labels:
+ app: devops-python-app-v2
+ component: service
+spec:
+ type: ClusterIP
+ selector:
+ app: devops-python-app-v2
+ ports:
+ - name: http
+ protocol: TCP
+ port: 80
+ targetPort: 5000
diff --git a/k8s/service.yml b/k8s/service.yml
new file mode 100644
index 0000000000..4c50fef201
--- /dev/null
+++ b/k8s/service.yml
@@ -0,0 +1,17 @@
+apiVersion: v1
+kind: Service
+metadata:
+ name: devops-python-app-service
+ labels:
+ app: devops-python-app
+ component: service
+spec:
+ type: NodePort
+ selector:
+ app: devops-python-app
+ ports:
+ - name: http
+ protocol: TCP
+ port: 80
+ targetPort: 5000
+ nodePort: 30080
diff --git a/monitoring/.env.example b/monitoring/.env.example
new file mode 100644
index 0000000000..06670a6c8c
--- /dev/null
+++ b/monitoring/.env.example
@@ -0,0 +1,11 @@
+# Environment variables for Grafana (optional)
+# ⚠️ IMPORTANT: Copy this to .env and update values
+# Do NOT commit .env file with real credentials!
+
+# Grafana Admin Credentials
+GRAFANA_ADMIN_USER=admin
+GRAFANA_ADMIN_PASSWORD=changeme_secure_password
+
+# For development/testing only:
+# Set GF_AUTH_ANONYMOUS_ENABLED=true in docker-compose.yml
+# Remove for production deployment!
diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml
new file mode 100644
index 0000000000..f19d240d47
--- /dev/null
+++ b/monitoring/docker-compose.yml
@@ -0,0 +1,189 @@
+version: '3.8'
+
+services:
+ # Prometheus - Metrics collection and TSDB storage
+ prometheus:
+ image: prom/prometheus:v3.9.0
+ container_name: prometheus
+ ports:
+ - "9090:9090"
+ command:
+ - '--config.file=/etc/prometheus/prometheus.yml'
+ - '--storage.tsdb.retention.time=15d'
+ - '--storage.tsdb.retention.size=10GB'
+ volumes:
+ - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+ - prometheus-data:/prometheus
+ networks:
+ - logging
+ deploy:
+ resources:
+ limits:
+ cpus: '1.0'
+ memory: 1G
+ reservations:
+ cpus: '0.5'
+ memory: 512M
+ healthcheck:
+ test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:9090/-/healthy || exit 1"]
+ interval: 10s
+ timeout: 5s
+ retries: 5
+ start_period: 10s
+ restart: unless-stopped
+
+ # Loki - Log aggregation system
+ loki:
+ image: grafana/loki:3.0.0
+ container_name: loki
+ ports:
+ - "3100:3100"
+ command: -config.file=/etc/loki/config.yml
+ volumes:
+ - ./loki/config.yml:/etc/loki/config.yml:ro
+ - loki-data:/tmp/loki
+ networks:
+ - logging
+ deploy:
+ resources:
+ limits:
+ cpus: '1.0'
+ memory: 1G
+ reservations:
+ cpus: '0.5'
+ memory: 512M
+ healthcheck:
+ test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1"]
+ interval: 10s
+ timeout: 5s
+ retries: 5
+ start_period: 10s
+ restart: unless-stopped
+
+ # Promtail - Log collector
+ promtail:
+ image: grafana/promtail:3.0.0
+ container_name: promtail
+ command: -config.file=/etc/promtail/config.yml
+ volumes:
+ - ./promtail/config.yml:/etc/promtail/config.yml:ro
+ - /var/run/docker.sock:/var/run/docker.sock:ro
+ - /var/lib/docker/containers:/var/lib/docker/containers:ro
+ - promtail-data:/tmp
+ networks:
+ - logging
+ depends_on:
+ loki:
+ condition: service_healthy
+ deploy:
+ resources:
+ limits:
+ cpus: '0.5'
+ memory: 512M
+ reservations:
+ cpus: '0.25'
+ memory: 256M
+ healthcheck:
+ test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:9080/ready || exit 1"]
+ interval: 10s
+ timeout: 5s
+ retries: 5
+ start_period: 10s
+ restart: unless-stopped
+
+ # Grafana - Visualization and dashboards
+ grafana:
+ image: grafana/grafana:12.3.0
+ container_name: grafana
+ ports:
+ - "3000:3000"
+ environment:
+ # ⚠️ DEVELOPMENT ONLY - Remove for production
+ - GF_AUTH_ANONYMOUS_ENABLED=true
+ - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
+ - GF_SECURITY_ALLOW_EMBEDDING=true
+ # Security settings
+ - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
+ - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
+ # Server settings
+ - GF_SERVER_ROOT_URL=http://localhost:3000
+ - GF_LOG_LEVEL=info
+ volumes:
+ - grafana-data:/var/lib/grafana
+ - ./grafana/provisioning:/etc/grafana/provisioning:ro
+ - ./grafana/dashboards:/var/lib/grafana/dashboards:ro
+ networks:
+ - logging
+ depends_on:
+ loki:
+ condition: service_healthy
+ prometheus:
+ condition: service_healthy
+ deploy:
+ resources:
+ limits:
+ cpus: '0.5'
+ memory: 512M
+ reservations:
+ cpus: '0.25'
+ memory: 256M
+ healthcheck:
+ test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"]
+ interval: 10s
+ timeout: 5s
+ retries: 5
+ start_period: 20s
+ restart: unless-stopped
+
+ # Python DevOps Info Service
+ app-python:
+ build:
+ context: ../app_python
+ dockerfile: Dockerfile
+ container_name: devops-python-app
+ ports:
+ - "8000:5000"
+ environment:
+ - PORT=5000
+ - DEBUG=false
+ - LOG_LEVEL=INFO
+ networks:
+ - logging
+ labels:
+ logging: "promtail"
+ app: "devops-python"
+ deploy:
+ resources:
+ limits:
+ cpus: '0.5'
+ memory: 256M
+ reservations:
+ cpus: '0.25'
+ memory: 128M
+ healthcheck:
+ test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:5000/health || exit 1"]
+ interval: 10s
+ timeout: 5s
+ retries: 5
+ start_period: 10s
+ restart: unless-stopped
+ depends_on:
+ promtail:
+ condition: service_healthy
+ prometheus:
+ condition: service_healthy
+
+networks:
+ logging:
+ driver: bridge
+ name: logging-network
+
+volumes:
+ prometheus-data:
+ name: prometheus-data
+ loki-data:
+ name: loki-data
+ promtail-data:
+ name: promtail-data
+ grafana-data:
+ name: grafana-data
diff --git a/monitoring/docs/LAB07.md b/monitoring/docs/LAB07.md
new file mode 100644
index 0000000000..888fc9aae7
--- /dev/null
+++ b/monitoring/docs/LAB07.md
@@ -0,0 +1,1819 @@
+# Lab 7: Observability & Logging with Loki Stack
+
+**Student**: Selivanov George
+**Date**: March 12, 2026
+
+## 1. Overview
+
+This lab implements a complete centralized logging solution using the Grafana Loki stack. The setup includes Loki 3.0 for log aggregation with TSDB storage, Promtail 3.0 for log collection from Docker containers, and Grafana 11.3.1 for visualization and dashboards.
+
+### 1.1 Technology Stack
+
+| Component | Version | Purpose |
+|-----------|---------|---------|
+| **Loki** | 3.0.0 | Log aggregation and storage with TSDB |
+| **Promtail** | 3.0.0 | Log collector for Docker containers |
+| **Grafana** | 11.3.1 | Visualization and dashboards |
+| **Python App** | 1.0.0 | DevOps Info Service with JSON logging |
+
+### 1.2 Architecture Overview
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│ Logging Architecture │
+├─────────────────────────────────────────────────────────────┤
+│ │
+│ ┌────────────────┐ ┌────────────────┐ │
+│ │ Python App │ │ Other Apps │ │
+│ │ (JSON Logs) │ │ (JSON Logs) │ │
+│ └────────┬───────┘ └────────┬───────┘ │
+│ │ │ │
+│ └─────────┬───────────────┘ │
+│ │ │
+│ ↓ Docker logs via │
+│ ┌──────────────┐ /var/lib/docker/containers │
+│ │ Promtail │ │
+│ │ (Collector) │ ← Docker Socket (discovery) │
+│ └──────┬───────┘ │
+│ │ HTTP Push │
+│ ↓ │
+│ ┌──────────────┐ │
+│ │ Loki │ │
+│ │ (Storage) │ ← TSDB + 7-day retention │
+│ └──────┬───────┘ │
+│ │ LogQL Queries │
+│ ↓ │
+│ ┌──────────────┐ │
+│ │ Grafana │ │
+│ │ (Dashboards) │ ← Web UI (localhost:3000) │
+│ └──────────────┘ │
+└─────────────────────────────────────────────────────────────┘
+```
+
+**Data Flow**:
+1. Applications write logs to stdout (JSON format)
+2. Docker captures logs in `/var/lib/docker/containers`
+3. Promtail discovers containers via Docker socket
+4. Promtail reads logs and pushes to Loki
+5. Loki stores logs with TSDB indexing
+6. Grafana queries logs via LogQL
+7. Users visualize logs in dashboards
+
+### 1.3 Why Loki Over Elasticsearch?
+
+**Key Differences**:
+
+| Feature | Loki | Elasticsearch |
+|---------|------|---------------|
+| **Indexing Strategy** | Only metadata (labels) | Full-text indexing |
+| **Storage Cost** | Very low (5-10x cheaper) | High |
+| **Query Performance** | Fast for label-based queries | Fast for full-text search |
+| **Resource Usage** | Low (100-500 MB RAM) | High (2-8 GB RAM minimum) |
+| **Complexity** | Simple deployment | Complex cluster management |
+| **Best For** | Container logs, metrics | Complex search, analytics |
+
+**Why Loki for This Lab**:
+- **Lightweight**: Perfect for development and small-scale deployments
+- **Label-Based**: Container metadata (app name, environment) as labels
+- **Cost-Effective**: Minimal storage and resource requirements
+- **Native Grafana**: Seamless integration with Grafana ecosystem
+- **Container-First**: Designed specifically for cloud-native logs
+
+---
+
+## 2. Task 1 — Deploy Loki Stack (4 pts)
+
+### 2.1 Understanding Log Labels
+
+**Labels in Loki** are key-value pairs attached to log streams:
+- Used for indexing and querying
+- Should be low-cardinality (few unique values)
+- Examples: `app`, `environment`, `container`, `job`
+
+**Good Labels**:
+```
+{app="devops-python", environment="dev", level="ERROR"}
+```
+
+**Bad Labels** (high cardinality):
+```
+{request_id="uuid-123456", user_id="user-789", timestamp="2026-03-12..."}
+```
+
+**Why It Matters**:
+- Too many label combinations = poor performance
+- Labels create separate log streams
+- Store high-cardinality data in log lines, not labels
+
+### 2.2 Promtail Container Discovery
+
+**Docker Service Discovery** (`docker_sd_configs`):
+- Connects to Docker socket: `/var/run/docker.sock`
+- Automatically discovers running containers
+- Filters containers by label: `logging=promtail`
+- Extracts metadata: container name, ID, labels, image
+
+**Relabeling Process**:
+1. `__meta_docker_container_name` -> `container` label
+2. `__meta_docker_container_label_app` -> `app` label
+3. Remove leading `/` from container names with regex
+4. Add static labels like `job="docker"`
+
+**Security Consideration**:
+- Docker socket access = root privileges
+- Use read-only mount: `/var/run/docker.sock:ro` (protects the socket file only; API calls made over the socket are not restricted)
+- In production, consider rootless Docker or API-based discovery
+
+### 2.3 Docker Compose Configuration
+
+**File**: `monitoring/docker-compose.yml`
+
+**Key Design Decisions**:
+
+#### Loki Service
+```yaml
+loki:
+ image: grafana/loki:3.0.0
+ command: -config.file=/etc/loki/config.yml
+ volumes:
+ - ./loki/config.yml:/etc/loki/config.yml:ro
+ - loki-data:/tmp/loki
+ ports:
+ - "3100:3100"
+```
+
+**Why These Choices**:
+- **Version 3.0.0**: Latest stable with TSDB support
+- **Config Mount**: Read-only for security
+- **Data Volume**: Persistent storage for logs
+- **Port 3100**: Standard Loki HTTP port
+
+#### Promtail Service
+```yaml
+promtail:
+ image: grafana/promtail:3.0.0
+ volumes:
+ - /var/run/docker.sock:/var/run/docker.sock:ro
+ - /var/lib/docker/containers:/var/lib/docker/containers:ro
+ depends_on:
+ loki:
+ condition: service_healthy
+```
+
+**Why These Choices**:
+- **Docker Socket**: For container discovery
+- **Container Logs**: Direct access to Docker log files
+- **Read-Only**: Security best practice
+- **Health Dependency**: Wait for Loki before starting
+
+#### Grafana Service
+```yaml
+grafana:
+ image: grafana/grafana:11.3.1
+ environment:
+ - GF_AUTH_ANONYMOUS_ENABLED=true # DEV ONLY
+ - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
+ volumes:
+ - grafana-data:/var/lib/grafana
+ - ./grafana/provisioning:/etc/grafana/provisioning:ro
+```
+
+**Why These Choices**:
+- **Anonymous Auth**: For testing convenience (remove in production!)
+- **Provisioning**: Auto-configure Loki datasource
+- **Persistent Data**: Dashboards and settings survive restarts
+
+### 2.4 Loki Configuration Deep Dive
+
+**File**: `monitoring/loki/config.yml`
+
+#### TSDB Storage Configuration
+
+```yaml
+schema_config:
+ configs:
+ - from: 2020-10-24
+ store: tsdb
+ object_store: filesystem
+ schema: v13
+```
+
+**TSDB Benefits (Loki 3.0+)**:
+- **10x Query Performance**: Optimized index structure
+- **Lower Memory**: More efficient than boltdb-shipper
+- **Better Compression**: Smaller storage footprint
+- **Faster Compaction**: Quicker cleanup operations
+
+**Schema v13**:
+- Required for TSDB
+- Incompatible with older schemas (migration needed)
+- Standard for Loki 3.0+
+
+#### Retention Configuration
+
+```yaml
+limits_config:
+ retention_period: 168h # 7 days
+
+compactor:
+ retention_enabled: true
+ retention_delete_delay: 2h
+ compaction_interval: 10m
+```
+
+**How Retention Works**:
+1. **Mark**: Compactor marks logs older than 168h
+2. **Wait**: Delay 2h before deletion (safety buffer)
+3. **Delete**: Remove marked logs from storage
+4. **Compact**: Clean up index and chunks
+
+**Why 7 Days**:
+- Balances storage cost vs. debugging needs
+- Sufficient for most incident investigations
+- Can be extended to 30+ days for compliance
+
+### 2.5 Promtail Configuration Deep Dive
+
+**File**: `monitoring/promtail/config.yml`
+
+#### Pipeline Stages
+
+```yaml
+pipeline_stages:
+ - json:
+ expressions:
+ level: level
+ timestamp: timestamp
+ message: message
+ method: method
+ path: path
+ status_code: status_code
+```
+
+**Pipeline Processing**:
+1. **JSON Parser**: Extract fields from JSON logs
+2. **Labels Extraction**: Convert fields to Loki labels
+3. **Timestamp Parsing**: Use log timestamp, not ingestion time
+4. **Output Stage**: Optional debugging output
+
+**Why JSON Parsing**:
+- Structured data is easier to query
+- Extract specific fields: `| json | level="ERROR"`
+- Performance: No regex parsing needed
+- Consistency: Same format across all apps
+
+#### Label Extraction
+
+```yaml
+- labels:
+ level:
+ method:
+```
+
+**Careful Label Selection**:
+- **level**: Low cardinality (INFO, ERROR, DEBUG)
+- **method**: Low cardinality (GET, POST, PUT, DELETE)
+- **status_code**: Medium cardinality (200, 404, 500...)
+- **path**: High cardinality (unique URLs)
+
+**Trade-off**: More labels = easier queries but worse performance
+
+### 2.6 Deployment and Verification
+
+#### Deploy the Stack
+
+```bash
+cd monitoring
+
+# Create .env file (see section 2.7)
+cp .env.example .env
+# Edit .env and set GRAFANA_ADMIN_PASSWORD
+
+# Start all services
+docker compose up -d
+
+# Check service status
+docker compose ps
+
+# View logs
+docker compose logs -f loki
+docker compose logs -f promtail
+```
+
+**Expected Output**:
+```
+NAME STATUS PORTS
+loki healthy 0.0.0.0:3100->3100/tcp
+promtail healthy 0.0.0.0:9080->9080/tcp
+grafana healthy 0.0.0.0:3000->3000/tcp
+devops-python-app healthy 0.0.0.0:8000->5000/tcp
+```
+
+#### Verify Loki
+
+```bash
+# Check readiness
+curl http://localhost:3100/ready
+# Expected: Ready
+
+# Check metrics
+curl http://localhost:3100/metrics | grep loki
+
+# Check config
+curl http://localhost:3100/config | jq .
+```
+
+#### Verify Promtail
+
+```bash
+# Check targets (port 9080 is not published in docker-compose.yml; add a "9080:9080" mapping to promtail first)
+curl http://localhost:9080/targets | jq .
+
+# Expected output:
+# {
+# "activeTargets": [
+# {
+# "labels": {
+# "app": "devops-python",
+# "container": "devops-python-app",
+# "job": "docker"
+# },
+# "discoveredLabels": { ... }
+# }
+# ]
+# }
+
+# Check metrics
+curl http://localhost:9080/metrics | grep promtail
+```
+
+#### Verify Grafana
+
+1. **Access Grafana**: http://localhost:3000
+ - Default login: `admin` / `admin` (or your .env password)
+
+2. **Check Datasource**:
+ - Go to **Connections** -> **Data sources**
+ - Should see "Loki" with green checkmark
+ - If not: Add manually with URL `http://loki:3100`
+
+3. **Test in Explore**:
+ - Click **Explore** (compass icon)
+ - Select **Loki** datasource
+ - Query: `{job="docker"}`
+ - Should see logs from all containers
+
+### 2.7 Environment Configuration
+
+**File**: `monitoring/.env`
+
+**Step-by-Step**:
+```bash
+cd monitoring
+cp .env.example .env
+```
+
+**Edit `.env` and change**:
+```bash
+GRAFANA_ADMIN_USER=admin
+GRAFANA_ADMIN_PASSWORD=your_secure_password_here
+```
+
+## 3. Task 2 — Integrate Applications (3 pts)
+
+### 3.1 JSON Logging Implementation
+
+**Library Choice**: `python-json-logger` (version 3.2.1)
+
+**Why python-json-logger**:
+- **Maintained**: Active development and updates
+- **Simple**: Extends standard `logging.Formatter`
+- **Flexible**: Customizable JSON fields
+- **Compatible**: Works with any logging handler
+
+**Alternative Considered**: `structlog`
+- More powerful but heavier
+- Overkill for this use case
+- Steeper learning curve
+
+#### Custom JSON Formatter
+
+**File**: `app_python/app.py` (lines 10-18)
+
+```python
+class CustomJsonFormatter(jsonlogger.JsonFormatter):
+ """Custom JSON formatter for structured logging"""
+ def add_fields(self, log_record, record, message_dict):
+ super(CustomJsonFormatter, self).add_fields(log_record, record, message_dict)
+ log_record['timestamp'] = datetime.now(timezone.utc).isoformat()
+ log_record['level'] = record.levelname
+ log_record['logger'] = record.name
+ log_record['module'] = record.module
+ log_record['function'] = record.funcName
+```
+
+**Custom Fields Added**:
+- `timestamp`: ISO 8601 format with timezone
+- `level`: INFO, ERROR, DEBUG, WARNING
+- `logger`: Logger name (devops-info-service)
+- `module`: Source module (app, controller, etc.)
+- `function`: Function that logged the message
+
+**Why These Fields**:
+- **Timestamp**: Critical for time-series analysis
+- **Level**: Easy filtering in Grafana
+- **Context**: Debug where log originated
+
+#### Logging Setup
+
+```python
+logger = logging.getLogger("devops-info-service")
+logger.setLevel(os.getenv('LOG_LEVEL', 'INFO'))
+
+json_handler = logging.StreamHandler(sys.stdout)
+formatter = CustomJsonFormatter('%(timestamp)s %(level)s %(name)s %(message)s')
+json_handler.setFormatter(formatter)
+logger.addHandler(json_handler)
+```
+
+**Configuration**:
+- **Stream**: `sys.stdout` (Docker captures this)
+- **Log Level**: Configurable via `LOG_LEVEL` env var
+- **Format**: JSON with custom fields
+
+### 3.2 Request/Response Logging
+
+#### Middleware Implementation
+
+**File**: `app_python/app.py` (lines 51-71)
+
+```python
+@app.middleware("http")
+async def log_requests(request: Request, call_next):
+ """Log all HTTP requests and responses"""
+ # Log incoming request
+ logger.info("HTTP Request", extra={
+ "method": request.method,
+ "path": request.url.path,
+ "client_ip": request.client.host if request.client else "unknown",
+ "user_agent": request.headers.get('user-agent', 'unknown')
+ })
+
+ # Process request
+ response = await call_next(request)
+
+ # Log response
+ logger.info("HTTP Response", extra={
+ "method": request.method,
+ "path": request.url.path,
+ "status_code": response.status_code
+ })
+
+ return response
+```
+
+**What's Logged**:
+- **Request**: Method, path, client IP, user agent
+- **Response**: Method, path, status code
+- **Extra Fields**: Merged into JSON output
+
+**Example Log Output**:
+```json
+{
+ "timestamp": "2026-03-12T10:30:45.123456+00:00",
+ "level": "INFO",
+ "logger": "devops-info-service",
+ "module": "app",
+ "function": "log_requests",
+ "message": "HTTP Request",
+ "method": "GET",
+ "path": "/",
+ "client_ip": "172.18.0.1",
+ "user_agent": "curl/7.88.1"
+}
+```
+
+### 3.3 Application Startup Logging
+
+```python
+logger.info("Application starting", extra={
+ "host": HOST,
+ "port": PORT,
+ "debug": DEBUG,
+ "python_version": platform.python_version()
+})
+```
+
+**Why Log Startup**:
+- Confirms app is running
+- Shows configuration values
+- Useful for debugging deployment issues
+
+### 3.4 Docker Compose Integration
+
+**Application Service in `monitoring/docker-compose.yml`**:
+
+```yaml
+app-python:
+ build:
+ context: ../app_python
+ dockerfile: Dockerfile
+ container_name: devops-python-app
+ ports:
+ - "8000:5000"
+ environment:
+ - PORT=5000
+ - DEBUG=false
+ - LOG_LEVEL=INFO
+ networks:
+ - logging
+ labels:
+ logging: "promtail"
+ app: "devops-python"
+ healthcheck:
+ test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:5000/health || exit 1"]
+ interval: 10s
+ timeout: 5s
+ retries: 5
+ restart: unless-stopped
+ depends_on:
+ promtail:
+ condition: service_healthy
+```
+
+**Key Configuration**:
+- **Labels**: `logging=promtail` and `app=devops-python`
+ - Promtail filters by `logging=promtail`
+ - `app` label appears in Loki queries
+- **Environment**: `LOG_LEVEL=INFO` for production-like logging
+- **Network**: Joins `logging` network
+- **Health Check**: Verifies app is responding
+- **Dependencies**: Waits for Promtail to be healthy
+
+### 3.5 Generate Test Logs
+
+**Script**: Create `monitoring/test-logs.sh` (if needed)
+
+```bash
+#!/bin/bash
+echo "Generating test traffic..."
+
+# Generate successful requests
+for i in {1..20}; do
+ curl -s http://localhost:8000/ > /dev/null
+ echo "Request $i to /"
+done
+
+# Generate health checks
+for i in {1..20}; do
+ curl -s http://localhost:8000/health > /dev/null
+ echo "Request $i to /health"
+done
+
+# Generate errors (404)
+for i in {1..10}; do
+ curl -s http://localhost:8000/nonexistent > /dev/null
+ echo "Request $i to /nonexistent (404)"
+done
+
+echo "Test traffic generated"
+```
+
+**Run**:
+```bash
+cd monitoring
+bash test-logs.sh
+```
+
+### 3.6 Verify Logs in Grafana
+
+**Evidence Required - Manual Steps**:
+
+1. **Open Grafana**: http://localhost:3000
+
+2. **Navigate to Explore**:
+ - Click **Explore** icon (compass) in left sidebar
+ - Select **Loki** datasource from dropdown
+
+3. **Query All App Logs**:
+ ```logql
+ {app="devops-python"}
+ ```
+
+4. **Query by Log Level**:
+ ```logql
+ {app="devops-python"} | json | level="INFO"
+ ```
+
+5. **Query HTTP Requests**:
+ ```logql
+ {app="devops-python"} | json | method="GET"
+ ```
+
+6. **Query Errors** (if any):
+ ```logql
+ {app="devops-python"} |= "ERROR"
+ ```
+
+---
+
+## 4. Task 3 — Build Log Dashboard (2 pts)
+
+### 4.1 LogQL Query Examples
+
+#### Basic Queries
+
+**1. All logs from app**:
+```logql
+{app="devops-python"}
+```
+
+**2. Filter by container**:
+```logql
+{container="devops-python-app"}
+```
+
+**3. Multiple apps**:
+```logql
+{app=~"devops-.*"}
+```
+
+**4. Specific job**:
+```logql
+{job="docker"}
+```
+
+#### Text Filtering
+
+**5. Contains "error" (case-sensitive; use `|~ "(?i)error"` for a case-insensitive match)**:
+```logql
+{app="devops-python"} |= "error"
+```
+
+**6. Doesn't contain "health"**:
+```logql
+{app="devops-python"} != "health"
+```
+
+**7. Regex match**:
+```logql
+{app="devops-python"} |~ "status_code\":\\s*[45]\\d\\d"
+```
+
+#### JSON Parsing
+
+**8. Parse JSON and filter**:
+```logql
+{app="devops-python"} | json | level="ERROR"
+```
+
+**9. Multiple field filters**:
+```logql
+{app="devops-python"} | json | method="GET" | status_code="200"
+```
+
+**10. Numeric comparison** (Loki 3.0+):
+```logql
+{app="devops-python"} | json | status_code >= 400
+```
+
+#### Metrics from Logs
+
+**11. Logs per second**:
+```logql
+rate({app="devops-python"}[1m])
+```
+
+**12. Count by level**:
+```logql
+sum by (level) (count_over_time({app="devops-python"} | json [5m]))
+```
+
+**13. Request rate by method**:
+```logql
+sum by (method) (rate({app="devops-python"} | json | message="HTTP Request" [1m]))
+```
+
+**14. Error rate**:
+```logql
+sum(rate({app="devops-python"} | json | level="ERROR" [5m]))
+```
+
+**15. 95th percentile response time** (if logged):
+```logql
+quantile_over_time(0.95, {app="devops-python"} | json | unwrap response_time [5m])
+```
+
+### 4.2 Dashboard Creation Guide
+
+**Manual Steps Required - Follow This Guide**:
+
+#### Panel 1: Logs Table
+
+1. **Grafana** -> **Dashboards** -> **New** -> **New Dashboard**
+2. **Add visualization**
+3. **Panel settings**:
+ - **Title**: "Application Logs"
+ - **Data source**: Loki
+ - **Query**:
+ ```logql
+ {app=~"devops-.*"} | json
+ ```
+ - **Visualization**: Logs
+ - **Options**:
+ - Show time: enabled
+ - Wrap lines: enabled
+ - Pretty print: enabled
+ - Deduplication: None
+4. **Apply** and **Save**
+
+#### Panel 2: Request Rate (Time Series)
+
+1. **Add panel** -> **Add visualization**
+2. **Panel settings**:
+ - **Title**: "Logs per Second by Application"
+ - **Data source**: Loki
+ - **Query**:
+ ```logql
+ sum by (app) (rate({app=~"devops-.*"} [1m]))
+ ```
+ - **Visualization**: Time series
+ - **Options**:
+ - Legend: {{app}}
+ - Unit: logs/s
+ - Draw style: Lines
+3. **Apply**
+
+#### Panel 3: Error Logs
+
+1. **Add panel** -> **Add visualization**
+2. **Panel settings**:
+ - **Title**: "Error Logs Only"
+ - **Data source**: Loki
+ - **Query**:
+ ```logql
+ {app=~"devops-.*"} | json | level="ERROR"
+ ```
+ - **Visualization**: Logs
+ - **Options**:
+ - Highlight errors: enabled
+3. **Apply**
+
+#### Panel 4: Log Level Distribution
+
+1. **Add panel** -> **Add visualization**
+2. **Panel settings**:
+ - **Title**: "Log Levels Distribution"
+ - **Data source**: Loki
+ - **Query**:
+ ```logql
+ sum by (level) (count_over_time({app=~"devops-.*"} | json [5m]))
+ ```
+ - **Visualization**: Pie chart (or Stat)
+ - **Options**:
+ - Legend: {{level}}
+ - Show values: Percent
+3. **Apply**
+
+#### Panel 5: HTTP Methods (Bonus)
+
+1. **Add panel** -> **Add visualization**
+2. **Panel settings**:
+ - **Title**: "HTTP Methods"
+ - **Data source**: Loki
+ - **Query**:
+ ```logql
+ sum by (method) (count_over_time({app="devops-python"} | json | method!="" [5m]))
+ ```
+ - **Visualization**: Bar chart
+3. **Apply**
+
+#### Save Dashboard
+
+1. **Click Save dashboard** (disk icon)
+2. **Name**: "Application Logs Dashboard"
+3. **Folder**: General
+4. **Save**
+
+### 4.3 Dashboard Best Practices
+
+**Layout**:
+- Put most important panel at top-left (users scan F-pattern)
+- Group related panels together
+- Use consistent time ranges
+
+**Performance**:
+- Avoid queries with high-cardinality labels
+- Use time range limits (`[5m]` instead of `[24h]`)
+- Add panel caching where appropriate
+
+**Usability**:
+- Add panel descriptions
+- Use meaningful titles
+- Include units on axes
+- Add thresholds and alerts
+
+## 5. Task 4 — Production Readiness (1 pt)
+
+### 5.1 Resource Limits
+
+**Already Implemented** in `docker-compose.yml`:
+
+```yaml
+deploy:
+ resources:
+ limits:
+ cpus: '1.0'
+ memory: 1G
+ reservations:
+ cpus: '0.5'
+ memory: 512M
+```
+
+**Limits by Service**:
+
+| Service | CPU Limit | Memory Limit | CPU Reserved | Memory Reserved |
+|---------|-----------|--------------|--------------|-----------------|
+| Loki | 1.0 | 1 GB | 0.5 | 512 MB |
+| Promtail | 0.5 | 512 MB | 0.25 | 256 MB |
+| Grafana | 0.5 | 512 MB | 0.25 | 256 MB |
+| Python App | 0.5 | 256 MB | 0.25 | 128 MB |
+
+**Why These Values**:
+- **Loki**: Needs memory for index caching
+- **Promtail**: Lightweight, minimal resources
+- **Grafana**: Serves the UI and dashboards; modest needs at lab scale
+- **Python App**: Small FastAPI app, minimal needs
+
+**Reservations**:
+- Guarantees minimum resources
+- Prevents starvation under load
+- Allows bursting up to limits
+
+### 5.2 Security Configuration
+
+#### Grafana Authentication
+
+**Development Configuration** (current):
+```yaml
+environment:
+ - GF_AUTH_ANONYMOUS_ENABLED=true
+ - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
+```
+
+**For Production** (change to):
+```yaml
+environment:
+ - GF_AUTH_ANONYMOUS_ENABLED=false
+ - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER}
+ - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD}
+```
+
+**Steps to Secure**:
+1. Edit `docker-compose.yml`
+2. Change `GF_AUTH_ANONYMOUS_ENABLED=false`
+3. Set strong password in `.env`
+4. Restart Grafana: `docker compose restart grafana`
+
+#### Docker Socket Security
+
+**Current** (read-only mount):
+```yaml
+volumes:
+ - /var/run/docker.sock:/var/run/docker.sock:ro
+```
+
+**Security Risk**:
+- Docker socket = root access to host
+- Compromised Promtail = full system access
+
+**Mitigation Options**:
+1. **Docker Socket Proxy**: Use `tecnativa/docker-socket-proxy`
+2. **Rootless Docker**: Run Docker as non-root user
+3. **Alternative**: Use Docker API with TLS authentication
+4. **Container Isolation**: Run Promtail with limited capabilities
+
+**For This Lab**: Read-only mount is acceptable for learning
+**For Production**: Implement proper socket isolation
+
+### 5.3 Health Checks
+
+**Already Implemented** for all services:
+
+```yaml
+healthcheck:
+ test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1"]
+ interval: 10s
+ timeout: 5s
+ retries: 5
+ start_period: 10s
+```
+
+**Parameters Explained**:
+- **test**: Command to check health
+- **interval**: Check every 10 seconds
+- **timeout**: Fail if no response in 5 seconds
+- **retries**: Mark unhealthy after 5 failures
+- **start_period**: Grace period during startup
+
+**Health Endpoints**:
+- **Loki**: `http://localhost:3100/ready`
+- **Promtail**: `http://localhost:9080/ready`
+- **Grafana**: `http://localhost:3000/api/health`
+- **Python App**: `http://localhost:5000/health`
+
+**Dependency Order**:
+```
+Loki (healthy) -> Promtail (healthy) -> Python App
+ ↓
+ Grafana
+```
+
+**Verify Health**:
+```bash
+docker compose ps
+
+# Expected output:
+# NAME STATUS
+# loki Up 2 minutes (healthy)
+# promtail Up 2 minutes (healthy)
+# grafana Up 2 minutes (healthy)
+# devops-python-app Up 2 minutes (healthy)
+```
+
+### 5.4 Additional Production Considerations
+
+#### Backup and Recovery
+
+**What to Backup**:
+- Loki data: `loki-data` volume
+- Grafana data: `grafana-data` volume (dashboards, users)
+- Configuration files: `loki/config.yml`, `promtail/config.yml`
+
+**Backup Strategy**:
+```bash
+# Backup volumes
+docker run --rm \
+ -v loki-data:/data \
+ -v $(pwd)/backups:/backup \
+ alpine tar czf /backup/loki-data.tar.gz /data
+
+# Restore
+docker run --rm \
+ -v loki-data:/data \
+ -v $(pwd)/backups:/backup \
+ alpine tar xzf /backup/loki-data.tar.gz -C /
+```
+
+#### Monitoring the Monitoring Stack
+
+**Monitor**:
+- Disk usage: Loki data volume
+- Memory usage: All services
+- Log ingestion rate: Promtail metrics
+- Query performance: Loki metrics
+
+**Export Metrics**:
+- Loki exposes Prometheus metrics on `:3100/metrics`
+- Promtail exposes metrics on `:9080/metrics`
+- Grafana exposes metrics on `:3000/metrics`
+
+**Set Alerts**:
+- Disk > 80% full
+- Loki ingestion errors
+- Promtail targets down
+
+#### Network Security
+
+**Current**: Bridge network (internal communication)
+```yaml
+networks:
+ logging:
+ driver: bridge
+```
+
+**For Production**:
+- Use overlay network for multi-host
+- Implement network policies
+- Enable TLS between services
+- Use secrets for credentials
+
+
+## 6. Task 5 — Documentation (2 pts)
+
+### 6.1 Architecture Diagram
+
+See section 1.2 for complete architecture diagram.
+
+**Components**:
+- Docker containers writing JSON logs
+- Promtail collecting via Docker socket
+- Loki storing with TSDB
+- Grafana visualizing logs
+
+### 6.2 Setup Guide
+
+**Prerequisites**:
+- Docker Engine 20.10+
+- Docker Compose v2 (with `docker compose` command)
+- 4 GB RAM minimum
+- 10 GB disk space
+
+**Step-by-Step Deployment**:
+
+```bash
+# 1. Enter the cloned repository (clone it first if you have not already)
+cd DevOps-Core-Course
+
+# 2. Navigate to monitoring directory
+cd monitoring
+
+# 3. Create .env file
+cp .env.example .env
+# Edit .env and set GRAFANA_ADMIN_PASSWORD
+
+# 4. Start stack
+docker compose up -d
+
+# 5. Verify services
+docker compose ps
+# All services should show "healthy"
+
+# 6. Check logs
+docker compose logs -f
+
+# 7. Access Grafana
+# Open http://localhost:3000
+# Login with admin / your_password
+
+# 8. Verify Loki datasource
+# Go to Connections -> Data sources -> Loki
+# Should show "Data source is working"
+
+# 9. Explore logs
+# Click Explore -> Select Loki
+# Query: {job="docker"}
+
+# 10. Generate test traffic
+curl http://localhost:8000/
+curl http://localhost:8000/health
+
+# 11. Create dashboard (follow Task 3 guide)
+```
+
+**Teardown**:
+```bash
+# Stop services
+docker compose down
+
+# Remove volumes (deletes all data)
+docker compose down -v
+
+# Remove images
+docker compose down --rmi all
+```
+
+### 6.3 Configuration Explanation
+
+**Loki Config Highlights**:
+- **TSDB**: Faster than boltdb-shipper
+- **Retention**: 168h (7 days)
+- **Compactor**: Cleans up old logs automatically
+- **Schema v13**: Required for Loki 3.0+
+
+**Promtail Config Highlights**:
+- **Docker SD**: Auto-discovers containers
+- **Label Filter**: Only `logging=promtail`
+- **JSON Parser**: Extracts structured fields
+- **Relabeling**: Creates meaningful labels
+
+**Grafana Config Highlights**:
+- **Provisioning**: Auto-configures Loki datasource
+- **Anonymous Auth**: Enabled for development (disable for prod)
+- **Persistent Storage**: Dashboards saved to volume
+
+### 6.4 Application Logging Design
+
+**JSON Logging**:
+- Library: `python-json-logger`
+- Custom formatter with timestamp, level, context
+- HTTP middleware logs every request/response
+- Startup logging with configuration details
+
+**Log Levels**:
+- **INFO**: Normal operations (requests, startup)
+- **ERROR**: Exceptions and errors
+- **DEBUG**: Detailed debugging (disabled by default)
+- **WARNING**: Non-critical issues
+
+**Logged Events**:
+- Application startup with config
+- Every HTTP request (method, path, IP, user agent)
+- Every HTTP response (status code, method, path)
+- Application errors and exceptions
+
+### 6.5 Dashboard Explanation
+
+**Panel 1: Logs Table**
+- **Purpose**: View raw logs from all apps
+- **Query**: `{app=~"devops-.*"} | json`
+- **Use**: Quick log inspection, debugging
+
+**Panel 2: Request Rate**
+- **Purpose**: Monitor traffic volume
+- **Query**: `sum by (app) (rate({app=~"devops-.*"} [1m]))`
+- **Use**: Detect traffic spikes, unusual patterns
+
+**Panel 3: Error Logs**
+- **Purpose**: Focus on failures
+- **Query**: `{app=~"devops-.*"} | json | level="ERROR"`
+- **Use**: Incident response, error tracking
+
+**Panel 4: Log Level Distribution**
+- **Purpose**: Understand log composition
+- **Query**: `sum by (level) (count_over_time({app=~"devops-.*"} | json [5m]))`
+- **Use**: Detect unusual error rates
+
+### 6.6 Testing Commands
+
+**Test Loki**:
+```bash
+# Check ready status
+curl http://localhost:3100/ready
+
+# Query API
+curl http://localhost:3100/loki/api/v1/labels
+
+# Get label values
+curl http://localhost:3100/loki/api/v1/label/app/values
+
+# Run query
+curl -G -s "http://localhost:3100/loki/api/v1/query" \
+ --data-urlencode 'query={app="devops-python"}' \
+ | jq .
+```
+
+**Test Promtail**:
+```bash
+# Check targets
+curl http://localhost:9080/targets | jq .
+
+# Check metrics
+curl http://localhost:9080/metrics | grep promtail_targets_active_total
+```
+
+**Test Application Logs**:
+```bash
+# Generate traffic
+for i in {1..50}; do curl -s http://localhost:8000/ > /dev/null; done
+
+# Check container logs
+docker logs devops-python-app | tail -20
+
+# Should see JSON output
+```
+
+**Test Grafana**:
+```bash
+# Check health
+curl http://localhost:3000/api/health
+
+# Check datasources (requires auth)
+curl -u admin:your_password http://localhost:3000/api/datasources
+```
+
+## 6. Bonus — Ansible Automation (2.5 pts)
+
+### 6.1 Ansible Role Structure
+
+**Role Path**: `ansible/roles/monitoring`
+
+```
+roles/monitoring/
+├── defaults/
+│   └── main.yml                  # Default variables
+├── tasks/
+│   ├── main.yml                  # Main orchestration
+│   ├── setup.yml                 # Directory and config setup
+│   └── deploy.yml                # Docker Compose deployment
+├── templates/
+│   ├── docker-compose.yml.j2     # Templated compose file
+│   ├── loki-config.yml.j2        # Templated Loki config
+│   ├── promtail-config.yml.j2    # Templated Promtail config
+│   └── env.j2                    # Templated .env file
+├── handlers/
+│   └── main.yml                  # Service restart handlers
+└── meta/
+    └── main.yml                  # Role dependencies
+```
+
+### 6.2 Role Variables
+
+**File**: `ansible/roles/monitoring/defaults/main.yml`
+
+```yaml
+---
+# Default variables for the monitoring role; override them in group_vars or play vars.
+
+# Image tags
+loki_version: '3.0.0'
+promtail_version: '3.0.0'
+grafana_version: '11.3.1'
+
+# Ports
+loki_port: 3100
+grafana_port: 3000
+promtail_port: 9080
+
+# Loki tuning
+loki_retention_period: '168h'  # 7 days
+loki_schema_version: 'v13'
+loki_compaction_interval: '10m'
+
+# Per-service resource caps
+loki_memory_limit: '1G'
+loki_cpu_limit: '1.0'
+grafana_memory_limit: '1G'
+grafana_cpu_limit: '1.0'
+promtail_memory_limit: '512M'
+promtail_cpu_limit: '0.5'
+
+# Grafana access
+grafana_admin_user: 'admin'
+grafana_admin_password: "{{ vault_grafana_password | default('changeme') }}"
+grafana_anonymous_enabled: false  # anonymous access stays off unless overridden
+
+# Locations on the target host
+monitoring_dir: '/opt/monitoring'
+monitoring_config_dir: '{{ monitoring_dir }}/config'
+
+# Bundled Python application
+python_app_enabled: true
+python_app_port: 8000
+python_app_log_level: 'INFO'
+```
+
+### 6.3 Role Tasks
+
+**File**: `ansible/roles/monitoring/tasks/main.yml`
+
+```yaml
+---
+# Main orchestration for monitoring stack
+#
+# Static imports are used on purpose: tags set on import_tasks are inherited by
+# every task in the imported file, so `--tags setup` / `--tags deploy` select the
+# real work. Tags on a dynamic include_tasks apply only to the include itself.
+
+- name: Import setup tasks
+  ansible.builtin.import_tasks: setup.yml
+  tags:
+    - setup
+    - monitoring
+
+- name: Import deployment tasks
+  ansible.builtin.import_tasks: deploy.yml
+  tags:
+    - deploy
+    - monitoring
+```
+
+**File**: `ansible/roles/monitoring/tasks/setup.yml`
+
+```yaml
+---
+# Setup tasks: directories and configuration files
+# Creates the directory layout under monitoring_dir and renders every config
+# file the compose project mounts or reads.
+
+- name: Create monitoring directories
+  file:
+    path: "{{ item }}"
+    state: directory
+    mode: '0755'
+  loop:
+    - "{{ monitoring_dir }}"
+    - "{{ monitoring_dir }}/loki"
+    - "{{ monitoring_dir }}/promtail"
+    - "{{ monitoring_dir }}/grafana"
+    - "{{ monitoring_dir }}/grafana/provisioning"
+    - "{{ monitoring_dir }}/grafana/provisioning/datasources"
+    - "{{ monitoring_dir }}/docs"
+
+- name: Template Loki configuration
+  template:
+    src: loki-config.yml.j2
+    dest: "{{ monitoring_dir }}/loki/config.yml"
+    mode: '0644'
+  notify: Restart monitoring stack
+
+- name: Template Promtail configuration
+  template:
+    src: promtail-config.yml.j2
+    dest: "{{ monitoring_dir }}/promtail/config.yml"
+    mode: '0644'
+  notify: Restart monitoring stack
+
+# Grafana talks to Loki over the compose network, where Loki listens on its
+# container port 3100 (http_listen_port in loki-config.yml.j2). loki_port is
+# only the port published on the host, so it must not be used here.
+# Grafana reads provisioning files at startup, hence the restart notification.
+- name: Template Grafana Loki datasource
+  copy:
+    content: |
+      apiVersion: 1
+      datasources:
+        - name: Loki
+          type: loki
+          access: proxy
+          url: http://loki:3100
+          isDefault: true
+          editable: true
+    dest: "{{ monitoring_dir }}/grafana/provisioning/datasources/loki.yml"
+    mode: '0644'
+  notify: Restart monitoring stack
+
+- name: Template Docker Compose file
+  template:
+    src: docker-compose.yml.j2
+    dest: "{{ monitoring_dir }}/docker-compose.yml"
+    mode: '0644'
+  notify: Restart monitoring stack
+
+- name: Template environment file
+  template:
+    src: env.j2
+    dest: "{{ monitoring_dir }}/.env"
+    mode: '0600'  # Secure: only owner can read
+  no_log: true  # Don't log passwords
+```
+
+**File**: `ansible/roles/monitoring/tasks/deploy.yml`
+
+```yaml
+---
+# Deployment tasks: Docker Compose
+# Verifies Docker is present, brings the compose project up, then polls each
+# service's HTTP health endpoint on localhost before printing access details.
+
+# failed_when: false keeps the play alive so the next task can report a clear message.
+- name: Check if Docker is installed
+ command: docker --version
+ register: docker_check
+ changed_when: false
+ failed_when: false
+
+- name: Fail if Docker is not installed
+ fail:
+ msg: "Docker is not installed. Please run the docker role first."
+ when: docker_check.rc != 0
+
+# Runs against the compose project rendered into monitoring_dir by setup.yml.
+# NOTE(review): `pull: policy` presumably defers to each service's pull policy - confirm in the module docs.
+- name: Deploy monitoring stack with Docker Compose
+ community.docker.docker_compose_v2:
+ project_src: "{{ monitoring_dir }}"
+ state: present
+ pull: policy
+ register: compose_result
+
+# Polls up to 30 times, 2 seconds apart, until Loki answers 200 on /ready.
+- name: Wait for Loki to be ready
+ uri:
+ url: "http://localhost:{{ loki_port }}/ready"
+ method: GET
+ status_code: 200
+ retries: 30
+ delay: 2
+ register: loki_ready
+ until: loki_ready.status == 200
+
+# Probes the host side (localhost), so the compose file must publish
+# promtail_port to Promtail's container port for this check to succeed.
+- name: Wait for Promtail to be ready
+ uri:
+ url: "http://localhost:{{ promtail_port }}/ready"
+ method: GET
+ status_code: 200
+ retries: 20
+ delay: 2
+ register: promtail_ready
+ until: promtail_ready.status == 200
+
+# Polls up to 30 times, 2 seconds apart, until Grafana answers 200 on /api/health.
+- name: Wait for Grafana to be ready
+ uri:
+ url: "http://localhost:{{ grafana_port }}/api/health"
+ method: GET
+ status_code: 200
+ retries: 30
+ delay: 2
+ register: grafana_ready
+ until: grafana_ready.status == 200
+
+# Uses the ansible_default_ipv4 fact, so fact gathering must be enabled for the play.
+- name: Display deployment status
+ debug:
+ msg: |
+ Monitoring stack deployed successfully!
+
+ Access URLs:
+ - Grafana: http://{{ ansible_default_ipv4.address }}:{{ grafana_port }}
+ - Loki: http://{{ ansible_default_ipv4.address }}:{{ loki_port }}
+ - Promtail: http://{{ ansible_default_ipv4.address }}:{{ promtail_port }}
+
+ Credentials:
+ - Username: {{ grafana_admin_user }}
+ - Password: (stored in .env)
+```
+
+### 6.4 Templates
+
+**File**: `ansible/roles/monitoring/templates/docker-compose.yml.j2`
+
+```yaml
+# Compose project for the monitoring stack (rendered by Ansible).
+# No top-level `version` key: Compose v2 ignores it and warns that it is obsolete.
+
+services:
+  loki:
+    image: grafana/loki:{{ loki_version }}
+    container_name: loki
+    ports:
+      - "{{ loki_port }}:3100"
+    command: -config.file=/etc/loki/config.yml
+    volumes:
+      - ./loki/config.yml:/etc/loki/config.yml:ro
+      - loki-data:/tmp/loki
+    networks:
+      - logging
+    deploy:
+      resources:
+        limits:
+          cpus: '{{ loki_cpu_limit }}'
+          memory: {{ loki_memory_limit }}
+    healthcheck:
+      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+    restart: unless-stopped
+
+  promtail:
+    image: grafana/promtail:{{ promtail_version }}
+    container_name: promtail
+    # Published so the role's readiness check and the advertised Promtail URL
+    # (both on the host at promtail_port) can reach the container.
+    ports:
+      - "{{ promtail_port }}:9080"
+    command: -config.file=/etc/promtail/config.yml
+    volumes:
+      - ./promtail/config.yml:/etc/promtail/config.yml:ro
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+      - /var/lib/docker/containers:/var/lib/docker/containers:ro
+    networks:
+      - logging
+    depends_on:
+      loki:
+        condition: service_healthy
+    deploy:
+      resources:
+        limits:
+          cpus: '{{ promtail_cpu_limit }}'
+          memory: {{ promtail_memory_limit }}
+    healthcheck:
+      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:9080/ready || exit 1"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+    restart: unless-stopped
+
+  grafana:
+    image: grafana/grafana:{{ grafana_version }}
+    container_name: grafana
+    ports:
+      - "{{ grafana_port }}:3000"
+    environment:
+      - GF_AUTH_ANONYMOUS_ENABLED={{ 'true' if grafana_anonymous_enabled else 'false' }}
+{% if grafana_anonymous_enabled %}
+      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
+{% endif %}
+      # Admin credentials are substituted by Compose from the .env file.
+      - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER}
+      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD}
+      - GF_SERVER_ROOT_URL=http://localhost:{{ grafana_port }}
+    volumes:
+      - grafana-data:/var/lib/grafana
+      - ./grafana/provisioning:/etc/grafana/provisioning:ro
+    networks:
+      - logging
+    depends_on:
+      loki:
+        condition: service_healthy
+    deploy:
+      resources:
+        limits:
+          cpus: '{{ grafana_cpu_limit }}'
+          memory: {{ grafana_memory_limit }}
+    healthcheck:
+      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+    restart: unless-stopped
+
+{% if python_app_enabled %}
+  app-python:
+    build:
+      context: ../app_python
+      dockerfile: Dockerfile
+    container_name: devops-python-app
+    ports:
+      - "{{ python_app_port }}:5000"
+    environment:
+      - PORT=5000
+      - LOG_LEVEL={{ python_app_log_level }}
+    networks:
+      - logging
+    # Promtail only scrapes containers labelled logging=promtail.
+    labels:
+      logging: "promtail"
+      app: "devops-python"
+    healthcheck:
+      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:5000/health || exit 1"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+    restart: unless-stopped
+    depends_on:
+      promtail:
+        condition: service_healthy
+{% endif %}
+
+networks:
+  logging:
+    driver: bridge
+
+volumes:
+  loki-data:
+  grafana-data:
+```
+
+**File**: `ansible/roles/monitoring/templates/loki-config.yml.j2`
+
+```yaml
+# Loki {{ loki_version }} Configuration
+# Generated by Ansible
+# Single-node setup: filesystem storage under /tmp/loki, in-memory ring,
+# TSDB index, compactor-driven retention.
+
+auth_enabled: false
+
+server:
+  http_listen_port: 3100
+  grpc_listen_port: 9096
+
+common:
+  path_prefix: /tmp/loki
+  storage:
+    filesystem:
+      chunks_directory: /tmp/loki/chunks
+      rules_directory: /tmp/loki/rules
+  replication_factor: 1
+  ring:
+    instance_addr: 127.0.0.1
+    kvstore:
+      store: inmemory
+
+query_range:
+  results_cache:
+    cache:
+      embedded_cache:
+        enabled: true
+        max_size_mb: 100
+
+schema_config:
+  configs:
+    - from: 2020-10-24
+      store: tsdb
+      object_store: filesystem
+      schema: {{ loki_schema_version }}
+      index:
+        prefix: index_
+        period: 24h
+
+storage_config:
+  tsdb_shipper:
+    active_index_directory: /tmp/loki/tsdb-index
+    cache_location: /tmp/loki/tsdb-cache
+    cache_ttl: 24h
+  filesystem:
+    directory: /tmp/loki/chunks
+
+compactor:
+  working_directory: /tmp/loki/boltdb-shipper-compactor
+  # Loki 3.0 removed compactor.shared_store; with retention enabled the
+  # compactor instead requires delete_request_store to be set.
+  delete_request_store: filesystem
+  compaction_interval: {{ loki_compaction_interval }}
+  retention_enabled: true
+  retention_delete_delay: 2h
+  retention_delete_worker_count: 150
+
+limits_config:
+  retention_period: {{ loki_retention_period }}
+  reject_old_samples: true
+  reject_old_samples_max_age: {{ loki_retention_period }}
+  ingestion_rate_mb: 4
+  ingestion_burst_size_mb: 6
+
+analytics:
+  reporting_enabled: false
+```
+
+**File**: `ansible/roles/monitoring/templates/promtail-config.yml.j2`
+
+```yaml
+# Promtail {{ promtail_version }} Configuration
+# Generated by Ansible
+# Discovers containers through the Docker socket, keeps only those labelled
+# logging=promtail, parses their JSON log lines and pushes them to Loki.
+
+server:
+  http_listen_port: 9080
+  grpc_listen_port: 0
+
+positions:
+  filename: /tmp/positions.yaml
+
+clients:
+  # Loki is addressed over the compose network on its container port (3100);
+  # loki_port is only the port published on the host and may differ.
+  - url: http://loki:3100/loki/api/v1/push
+
+scrape_configs:
+  - job_name: docker
+    docker_sd_configs:
+      - host: unix:///var/run/docker.sock
+        refresh_interval: 5s
+        filters:
+          - name: label
+            values: ["logging=promtail"]
+
+    relabel_configs:
+      # Docker reports container names with a leading slash; capture the rest.
+      - source_labels: ['__meta_docker_container_name']
+        regex: '/(.*)'
+        target_label: 'container'
+      - source_labels: ['__meta_docker_container_label_app']
+        target_label: 'app'
+      - replacement: 'docker'
+        target_label: 'job'
+
+    pipeline_stages:
+      - json:
+          expressions:
+            level: level
+            timestamp: timestamp
+            message: message
+            method: method
+            path: path
+            status_code: status_code
+      # Only level and method are promoted to labels; the other fields stay extracted values.
+      - labels:
+          level:
+          method:
+      - timestamp:
+          source: timestamp
+          format: RFC3339Nano
+          fallback_formats:
+            - RFC3339
+```
+
+**File**: `ansible/roles/monitoring/templates/env.j2`
+
+```bash
+# Environment variables for Monitoring Stack
+# Generated by Ansible - DO NOT EDIT MANUALLY
+
+GRAFANA_ADMIN_USER={{ grafana_admin_user }}
+GRAFANA_ADMIN_PASSWORD={{ grafana_admin_password }}
+```
+
+### 6.5 Handlers
+
+**File**: `ansible/roles/monitoring/handlers/main.yml`
+
+```yaml
+---
+# Runs when a task notifies "Restart monitoring stack".
+- name: Restart monitoring stack
+  community.docker.docker_compose_v2:
+    state: restarted
+    project_src: "{{ monitoring_dir }}"
+```
+
+### 6.6 Meta Dependencies
+
+**File**: `ansible/roles/monitoring/meta/main.yml`
+
+```yaml
+---
+# Role metadata: prerequisite roles and Ansible Galaxy information.
+
+# The docker role runs first unless docker_install is set to false.
+dependencies:
+  - role: docker
+    when: docker_install | default(true)
+
+galaxy_info:
+  author: Selivanov George
+  description: Ansible role for deploying Loki monitoring stack
+  company: Innopolis University
+  license: MIT
+  min_ansible_version: '2.16'
+  platforms:
+    - name: Ubuntu
+      versions:
+        - focal
+        - jammy
+    - name: Debian
+      versions:
+        - bullseye
+        - bookworm
+  galaxy_tags:
+    - loki
+    - grafana
+    - monitoring
+    - logging
+    - observability
+```
+
+### 6.7 Deployment Playbook
+
+**File**: `ansible/playbooks/deploy-monitoring.yml`
+
+```yaml
+---
+# Applies the monitoring role to every inventory host with privilege
+# escalation, then prints where the services can be reached.
+- name: Deploy Loki Monitoring Stack
+ hosts: all
+ become: true
+ vars:
+ # Override defaults here
+ grafana_anonymous_enabled: false
+ loki_retention_period: "168h"
+ python_app_enabled: true
+
+ roles:
+ - role: monitoring
+ tags:
+ - monitoring
+ - loki
+
+ # The message reads grafana_port / loki_port / promtail_port / grafana_admin_user
+ # and the ansible_default_ipv4 fact.
+ # NOTE(review): presumably the port and user variables come from the role defaults - confirm they are visible at play level.
+ post_tasks:
+ - name: Display access information
+ debug:
+ msg: |
+ ========================================
+ Monitoring Stack Deployed Successfully!
+ ========================================
+
+ Services:
+ - Grafana: http://{{ ansible_default_ipv4.address }}:{{ grafana_port }}
+ - Loki API: http://{{ ansible_default_ipv4.address }}:{{ loki_port }}
+ - Promtail: http://{{ ansible_default_ipv4.address }}:{{ promtail_port }}
+
+ Credentials:
+ - Username: {{ grafana_admin_user }}
+ - Password: (check .env file on target host)
+
+ Next Steps:
+ 1. Access Grafana and verify Loki datasource
+ 2. Navigate to Explore and query logs: {job="docker"}
+ 3. Create dashboards based on Lab 7 requirements
+
+ ========================================
+```
+
+### 6.8 Variables for Group Vars
+
+**File**: `ansible/group_vars/all.yml` (add these)
+
+```yaml
+# Monitoring Stack Configuration
+monitoring_stack_enabled: true
+loki_version: "3.0.0"
+promtail_version: "3.0.0"
+grafana_version: "11.3.1"
+
+# Security: Use Ansible Vault for passwords
+vault_grafana_password: !vault |
+ $ANSIBLE_VAULT;1.1;AES256
+ # ... encrypted password ...
+
+# Or use plain text for development (NOT RECOMMENDED)
+# grafana_admin_password: "secure_password_here"
+```
+
+### 6.9 Usage Instructions
+
+**Deploy Monitoring Stack**:
+
+```bash
+cd ansible
+
+# Run playbook
+ansible-playbook -i inventory/hosts.ini playbooks/deploy-monitoring.yml
+
+# With vault password
+ansible-playbook -i inventory/hosts.ini playbooks/deploy-monitoring.yml --ask-vault-pass
+
+# Dry run (check mode)
+ansible-playbook -i inventory/hosts.ini playbooks/deploy-monitoring.yml --check
+
+# Only setup tasks
+ansible-playbook -i inventory/hosts.ini playbooks/deploy-monitoring.yml --tags setup
+
+# Only deployment tasks
+ansible-playbook -i inventory/hosts.ini playbooks/deploy-monitoring.yml --tags deploy
+```
+
+**Test Idempotency**:
+
+```bash
+# Run twice
+ansible-playbook -i inventory/hosts.ini playbooks/deploy-monitoring.yml
+# First run: changed > 0
+ansible-playbook -i inventory/hosts.ini playbooks/deploy-monitoring.yml
+# Second run: changed = 0 (idempotent)
+```
+
+**Expected Output** (first run):
+```
+PLAY RECAP *************************************************************
+localhost : ok=15 changed=10 unreachable=0 failed=0 skipped=0
+```
+
+**Expected Output** (second run - idempotent):
+```
+PLAY RECAP *************************************************************
+localhost : ok=15 changed=0 unreachable=0 failed=0 skipped=0
+```
\ No newline at end of file
diff --git a/monitoring/docs/LAB08.md b/monitoring/docs/LAB08.md
new file mode 100644
index 0000000000..b091b063d9
--- /dev/null
+++ b/monitoring/docs/LAB08.md
@@ -0,0 +1,289 @@
+# Lab 8: Metrics & Monitoring with Prometheus
+
+**Student**: Selivanov George
+**Date**: March 19, 2026
+
+## 1. Overview
+
+This lab extends the existing observability stack from Lab 7 (Loki + Promtail + Grafana) with full metrics monitoring using Prometheus.
+
+Implemented scope:
+- Python app instrumentation with `prometheus_client`
+- `/metrics` endpoint with RED metrics and app-specific metrics
+- Prometheus 3.9 deployment and scrape configuration
+- Grafana integration with Prometheus datasource
+- Pre-provisioned dashboards (logs + metrics)
+- Production hardening: health checks, resource limits, retention, persistence
+- Ansible automation updated for full stack (bonus)
+
+## 2. Architecture
+
+### 2.1 Metrics Flow
+
+```text
+app-python (/metrics)
+ |
+ | scrape every 15s
+ v
+ Prometheus (TSDB, 15d/10GB retention)
+ |
+ | PromQL
+ v
+ Grafana dashboards
+```
+
+### 2.2 Full Observability Stack
+
+```text
+Docker containers -> Promtail -> Loki -> Grafana (logs)
+app-python /metrics -> Prometheus -> Grafana (metrics)
+```
+
+## 3. Application Instrumentation
+
+### 3.1 Dependency Added
+
+File updated:
+- `app_python/requirements.txt`
+
+Added package:
+- `prometheus-client==0.23.1`
+
+### 3.2 Metrics Implemented
+
+File updated:
+- `app_python/app.py`
+
+HTTP RED metrics:
+- Counter: `http_requests_total{method,endpoint,status_code}`
+- Histogram: `http_request_duration_seconds{method,endpoint}`
+- Gauge: `http_requests_in_progress`
+
+Application-specific metrics:
+- Counter: `devops_info_endpoint_calls_total{endpoint}`
+- Histogram: `devops_info_system_collection_seconds`
+
+### 3.3 Endpoints
+
+Implemented:
+- `GET /metrics` returns Prometheus exposition format
+
+Updated endpoint catalog (`GET /` response) to include `/metrics`.
+
+### 3.4 Instrumentation Approach
+
+- Middleware records:
+ - request start time
+ - in-progress gauge increment/decrement
+ - response status code
+ - histogram observation
+ - counter increment with labels
+- Endpoint labels are normalized using route path when available.
+
+## 4. Prometheus Setup
+
+### 4.1 Docker Compose Changes
+
+File updated:
+- `monitoring/docker-compose.yml`
+
+Added service:
+- `prometheus` with image `prom/prometheus:v3.9.0`
+- Port mapping: `9090:9090`
+- Config mount: `./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro`
+- Data volume: `prometheus-data:/prometheus`
+- Retention flags:
+ - `--storage.tsdb.retention.time=15d`
+ - `--storage.tsdb.retention.size=10GB`
+
+### 4.2 Prometheus Configuration
+
+File created:
+- `monitoring/prometheus/prometheus.yml`
+
+Configured jobs:
+- `prometheus` -> `localhost:9090`
+- `app` -> `app-python:5000`, path `/metrics`
+- `loki` -> `loki:3100`, path `/metrics`
+- `grafana` -> `grafana:3000`, path `/metrics`
+
+Global intervals:
+- scrape interval: `15s`
+- evaluation interval: `15s`
+
+## 5. Grafana Dashboards
+
+### 5.1 Datasource Provisioning
+
+Files created:
+- `monitoring/grafana/provisioning/datasources/prometheus.yml`
+- `monitoring/grafana/provisioning/dashboards/dashboards.yml`
+
+Grafana now auto-loads:
+- Loki datasource
+- Prometheus datasource
+- Dashboards from `/var/lib/grafana/dashboards`
+
+### 5.2 Dashboard Files
+
+Files created:
+- `monitoring/grafana/dashboards/grafana-app-dashboard.json`
+- `monitoring/grafana/dashboards/grafana-logs-dashboard.json`
+
+### 5.3 Metrics Dashboard Panels (7)
+
+`grafana-app-dashboard.json` includes:
+1. Request Rate by Endpoint
+2. Error Rate (5xx)
+3. Request Duration p95
+4. Request Duration Heatmap
+5. Active Requests
+6. Status Code Distribution
+7. App Uptime
+
+Note: Label name is `status_code` (not `status`) because the implementation follows lab requirement labels: `method`, `endpoint`, `status_code`.
+
+## 6. Production Configuration
+
+### 6.1 Health Checks
+
+Configured in compose for:
+- Prometheus: `/-/healthy`
+- Loki: `/ready`
+- Promtail: `/ready`
+- Grafana: `/api/health`
+- App: `/health`
+
+### 6.2 Resource Limits
+
+Configured:
+- Prometheus: `1G`, `1.0 CPU`
+- Loki: `1G`, `1.0 CPU`
+- Grafana: `512M`, `0.5 CPU`
+- App: `256M`, `0.5 CPU`
+
+### 6.3 Data Retention
+
+Configured:
+- Prometheus: `15d`, `10GB`
+- Loki: existing retention from Lab 7 remains active (`168h`)
+
+### 6.4 Persistence
+
+Volumes:
+- `prometheus-data`
+- `loki-data`
+- `grafana-data`
+- `promtail-data`
+
+## 7. PromQL Examples (RED + Ops)
+
+1. Request rate by endpoint:
+```promql
+sum(rate(http_requests_total[5m])) by (endpoint)
+```
+
+2. 5xx error rate:
+```promql
+sum(rate(http_requests_total{status_code=~"5.."}[5m]))
+```
+
+3. p95 latency:
+```promql
+histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))
+```
+
+4. Current active requests:
+```promql
+http_requests_in_progress
+```
+
+5. Status code distribution:
+```promql
+sum by (status_code) (rate(http_requests_total[5m]))
+```
+
+6. Service uptime status:
+```promql
+up{job="app"}
+```
+
+7. Endpoint business usage:
+```promql
+sum(rate(devops_info_endpoint_calls_total[5m])) by (endpoint)
+```
+
+## 8. Testing Results
+
+### 8.1 Automated Validation Performed
+
+1. Python tests:
+- Command: `python -m pytest -q`
+- Result: **30 passed**
+
+2. Lint:
+- Command: `python -m ruff check .`
+- Result: **All checks passed**
+
+3. Docker Compose syntax:
+- Command: `docker compose config` in `monitoring/`
+- Result: **Valid**
+- Note: compose warns `version` key is obsolete (non-blocking)
+
+4. Ansible syntax check:
+- Could not run because `ansible-playbook` is not installed in this environment.
+
+## 9. Metrics vs Logs (Lab 7 Comparison)
+
+Use **metrics** when you need:
+- trends over time
+- SLO/SLA tracking
+- threshold alerting
+- low-cost aggregation
+
+Use **logs** when you need:
+- request-level details
+- stack traces and payload context
+- forensic debugging
+- exact event timelines
+
+Best practice: use both together (implemented in this stack).
+
+## 10. Challenges & Solutions
+
+1. Missing test tooling in local Python runtime:
+- Issue: `pytest` module missing
+- Fix: configured venv and installed dependencies via `requirements.txt`
+
+2. Label schema mismatch risk (`status` vs `status_code`):
+- Issue: dashboards/examples often use `status`
+- Fix: standardized to `status_code` across instrumentation and dashboard queries
+
+3. Full stack automation gap in role:
+- Issue: existing role provisioned only Loki datasource
+- Fix: added Prometheus config templating, datasource provisioning, and dashboard provisioning
+
+4. Local Ansible validation unavailable:
+- Issue: `ansible-playbook` command not found
+- Fix: provided manual verification algorithm below
+
+## 11. Bonus — Ansible Automation Implemented
+
+### 11.1 Role Enhancements
+
+Updated role:
+- `ansible/roles/monitoring/defaults/main.yml`
+- `ansible/roles/monitoring/tasks/setup.yml`
+- `ansible/roles/monitoring/tasks/deploy.yml`
+- `ansible/roles/monitoring/templates/docker-compose.yml.j2`
+- `ansible/roles/monitoring/templates/prometheus.yml.j2`
+- `ansible/roles/monitoring/templates/grafana/datasources.yml.j2`
+- `ansible/roles/monitoring/templates/grafana/dashboards.yml.j2`
+- `ansible/roles/monitoring/files/grafana-app-dashboard.json`
+- `ansible/roles/monitoring/files/grafana-logs-dashboard.json`
+
+Capabilities added:
+- Prometheus vars and templated scrape config
+- Grafana auto-provisioning for Loki + Prometheus datasources
+- Auto-provisioning of logs + metrics dashboards
+- Readiness checks for Prometheus and datasource verification
\ No newline at end of file
diff --git a/monitoring/generate-test-logs.ps1 b/monitoring/generate-test-logs.ps1
new file mode 100644
index 0000000000..eac06f700e
--- /dev/null
+++ b/monitoring/generate-test-logs.ps1
@@ -0,0 +1,76 @@
+# Lab 7 - Generate Test Logs (PowerShell)
+# Fires a fixed mix of HTTP requests at the app so there are logs to inspect.
+
+Write-Host "=========================================" -ForegroundColor Cyan
+Write-Host "Generating Test Traffic for Lab 7" -ForegroundColor Cyan
+Write-Host "=========================================" -ForegroundColor Cyan
+Write-Host ""
+
+$baseUrl = "http://localhost:8000"
+
+Write-Host "1. Generating successful requests to /..." -ForegroundColor Yellow
+foreach ($n in 1..20) {
+    Invoke-WebRequest -Uri "$baseUrl/" -UseBasicParsing -ErrorAction SilentlyContinue | Out-Null
+    Write-Host "." -NoNewline
+}
+Write-Host " ✓ Done (20 requests)" -ForegroundColor Green
+
+Write-Host ""
+Write-Host "2. Generating health check requests..." -ForegroundColor Yellow
+foreach ($n in 1..20) {
+    Invoke-WebRequest -Uri "$baseUrl/health" -UseBasicParsing -ErrorAction SilentlyContinue | Out-Null
+    Write-Host "." -NoNewline
+}
+Write-Host " ✓ Done (20 requests)" -ForegroundColor Green
+
+Write-Host ""
+Write-Host "3. Generating 404 errors..." -ForegroundColor Yellow
+foreach ($n in 1..10) {
+    Invoke-WebRequest -Uri "$baseUrl/nonexistent-endpoint" -UseBasicParsing -ErrorAction SilentlyContinue | Out-Null
+    Write-Host "." -NoNewline
+}
+Write-Host " ✓ Done (10 requests)" -ForegroundColor Green
+
+Write-Host ""
+Write-Host "4. Generating requests with different user agents..." -ForegroundColor Yellow
+$userAgents = @(
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/91.0",
+    "curl/7.68.0",
+    "PostmanRuntime/7.28.0",
+    "Python-requests/2.26.0"
+)
+
+foreach ($ua in $userAgents) {
+    Invoke-WebRequest -Uri "$baseUrl/" -UserAgent $ua -UseBasicParsing -ErrorAction SilentlyContinue | Out-Null
+    Write-Host " Request with UA: $ua"
+}
+Write-Host " ✓ Done (4 requests)" -ForegroundColor Green
+
+Write-Host ""
+Write-Host "5. Rapid fire test (100 requests)..." -ForegroundColor Yellow
+# One background job per request; the job objects are collected from the loop output.
+$pending = foreach ($n in 1..100) {
+    Start-Job -ArgumentList $baseUrl -ScriptBlock {
+        param($url)
+        $null = Invoke-WebRequest -Uri $url -UseBasicParsing -ErrorAction SilentlyContinue
+    }
+}
+$pending | Wait-Job | Remove-Job
+Write-Host " ✓ Done (100 concurrent requests)" -ForegroundColor Green
+
+Write-Host ""
+Write-Host "=========================================" -ForegroundColor Cyan
+Write-Host "Test Summary" -ForegroundColor Cyan
+Write-Host "=========================================" -ForegroundColor Cyan
+Write-Host "Total requests generated: 154"
+Write-Host "- Successful (200): 124"
+Write-Host "- Not Found (404): 10"
+Write-Host "- Health checks: 20"
+Write-Host ""
+Write-Host "Check logs in:" -ForegroundColor Green
+Write-Host "1. Docker: docker logs devops-python-app"
+Write-Host "2. Grafana Explore: http://localhost:3000/explore"
+Write-Host " Query: {app=`"devops-python`"}"
+Write-Host ""
+Write-Host "Wait 10-15 seconds for logs to be ingested by Loki"
+Write-Host "=========================================" -ForegroundColor Cyan
diff --git a/monitoring/generate-test-logs.sh b/monitoring/generate-test-logs.sh
new file mode 100644
index 0000000000..a9471b6f5d
--- /dev/null
+++ b/monitoring/generate-test-logs.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+# Lab 7 - Generate Test Logs
+# This script generates various types of log entries for testing
+
+echo "========================================="
+echo "Generating Test Traffic for Lab 7"
+echo "========================================="
+echo ""
+
+BASE_URL="http://localhost:8000"
+
+echo "1. Generating successful requests to /..."
+for i in {1..20}; do
+ curl -s "$BASE_URL/" > /dev/null
+ echo -n "."
+done
+echo " ✓ Done (20 requests)"
+
+echo ""
+echo "2. Generating health check requests..."
+for i in {1..20}; do
+ curl -s "$BASE_URL/health" > /dev/null
+ echo -n "."
+done
+echo " ✓ Done (20 requests)"
+
+echo ""
+echo "3. Generating 404 errors..."
+for i in {1..10}; do
+ curl -s "$BASE_URL/nonexistent-endpoint" > /dev/null
+ echo -n "."
+done
+echo " ✓ Done (10 requests)"
+
+echo ""
+echo "4. Generating requests with different user agents..."
+user_agents=(
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/91.0"
+ "curl/7.68.0"
+ "PostmanRuntime/7.28.0"
+ "Python-requests/2.26.0"
+)
+
+for ua in "${user_agents[@]}"; do
+ curl -s -H "User-Agent: $ua" "$BASE_URL/" > /dev/null
+ echo " Request with UA: $ua"
+done
+echo " ✓ Done (4 requests)"
+
+echo ""
+echo "5. Rapid fire test (100 requests)..."
+for i in {1..100}; do
+ curl -s "$BASE_URL/" > /dev/null &
+done
+wait
+echo " ✓ Done (100 concurrent requests)"
+
+echo ""
+echo "========================================="
+echo "Test Summary"
+echo "========================================="
+echo "Total requests generated: 174"
+echo "- Successful (200): 144"
+echo "- Not Found (404): 10"
+echo "- Health checks: 20"
+echo ""
+echo "Check logs in:"
+echo "1. Docker: docker logs devops-python-app"
+echo "2. Grafana Explore: http://localhost:3000/explore"
+echo " Query: {app=\"devops-python\"}"
+echo ""
+echo "Wait 10-15 seconds for logs to be ingested by Loki"
+echo "========================================="
diff --git a/monitoring/grafana/dashboards/grafana-app-dashboard.json b/monitoring/grafana/dashboards/grafana-app-dashboard.json
new file mode 100644
index 0000000000..5cd68f9867
--- /dev/null
+++ b/monitoring/grafana/dashboards/grafana-app-dashboard.json
@@ -0,0 +1,326 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": {
+ "type": "grafana",
+ "uid": "-- Grafana --"
+ },
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "fiscalYearStartMonth": 0,
+ "graphTooltip": 0,
+ "id": null,
+ "links": [],
+ "liveNow": false,
+ "panels": [
+ {
+ "datasource": "Prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "unit": "reqps"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 0
+ },
+ "id": 1,
+ "options": {
+ "legend": {
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "expr": "sum(rate(http_requests_total[5m])) by (endpoint)",
+ "legendFormat": "{{endpoint}}",
+ "refId": "A"
+ }
+ ],
+ "title": "Request Rate by Endpoint",
+ "type": "timeseries"
+ },
+ {
+ "datasource": "Prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "unit": "reqps"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 0
+ },
+ "id": 2,
+ "options": {
+ "legend": {
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m]))",
+ "legendFormat": "5xx",
+ "refId": "A"
+ }
+ ],
+ "title": "Error Rate (5xx)",
+ "type": "timeseries"
+ },
+ {
+ "datasource": "Prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "unit": "s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 8
+ },
+ "id": 3,
+ "options": {
+ "legend": {
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))",
+ "legendFormat": "p95",
+ "refId": "A"
+ }
+ ],
+ "title": "Request Duration p95",
+ "type": "timeseries"
+ },
+ {
+ "datasource": "Prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 8
+ },
+ "id": 4,
+ "targets": [
+ {
+ "expr": "sum by (le) (rate(http_request_duration_seconds_bucket[5m]))",
+ "format": "heatmap",
+ "legendFormat": "{{le}}",
+ "refId": "A"
+ }
+ ],
+ "title": "Request Duration Heatmap",
+ "type": "heatmap"
+ },
+ {
+ "datasource": "Prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 0,
+ "y": 16
+ },
+ "id": 5,
+ "options": {
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true
+ },
+ "targets": [
+ {
+ "expr": "http_requests_in_progress",
+ "refId": "A"
+ }
+ ],
+ "title": "Active Requests",
+ "type": "gauge"
+ },
+ {
+ "datasource": "Prometheus",
+ "fieldConfig": {
+ "defaults": {},
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 8,
+ "y": 16
+ },
+ "id": 6,
+ "options": {
+ "displayLabels": [
+ "name",
+ "percent"
+ ],
+ "legend": {
+ "displayMode": "list",
+ "placement": "right",
+ "showLegend": true
+ },
+ "pieType": "pie",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "expr": "sum by (status_code) (rate(http_requests_total[5m]))",
+ "legendFormat": "{{status_code}}",
+ "refId": "A"
+ }
+ ],
+ "title": "Status Code Distribution",
+ "type": "piechart"
+ },
+ {
+ "datasource": "Prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "mappings": [
+ {
+ "options": {
+ "0": {
+ "text": "DOWN"
+ },
+ "1": {
+ "text": "UP"
+ }
+ },
+ "type": "value"
+ }
+ ],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "red",
+ "value": null
+ },
+ {
+ "color": "green",
+ "value": 1
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 16,
+ "y": 16
+ },
+ "id": 7,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "center",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "value"
+ },
+ "targets": [
+ {
+ "expr": "up{job=\"app\"}",
+ "refId": "A"
+ }
+ ],
+ "title": "App Uptime",
+ "type": "stat"
+ }
+ ],
+ "refresh": "10s",
+ "schemaVersion": 39,
+ "style": "dark",
+ "tags": [
+ "devops",
+ "prometheus",
+ "lab08"
+ ],
+ "templating": {
+ "list": []
+ },
+ "time": {
+ "from": "now-15m",
+ "to": "now"
+ },
+ "timepicker": {},
+ "timezone": "",
+ "title": "DevOps App Metrics",
+ "uid": "devops-app-metrics",
+ "version": 1,
+ "weekStart": ""
+}
diff --git a/monitoring/grafana/dashboards/grafana-logs-dashboard.json b/monitoring/grafana/dashboards/grafana-logs-dashboard.json
new file mode 100644
index 0000000000..8b44d1edcc
--- /dev/null
+++ b/monitoring/grafana/dashboards/grafana-logs-dashboard.json
@@ -0,0 +1,76 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": {
+ "type": "grafana",
+ "uid": "-- Grafana --"
+ },
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "fiscalYearStartMonth": 0,
+ "graphTooltip": 0,
+ "id": null,
+ "links": [],
+ "liveNow": false,
+ "panels": [
+ {
+ "datasource": "Loki",
+ "gridPos": {
+ "h": 16,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 1,
+ "options": {
+ "dedupStrategy": "none",
+ "enableLogDetails": true,
+ "prettifyLogMessage": false,
+ "showCommonLabels": false,
+ "showLabels": true,
+ "showTime": true,
+ "sortOrder": "Descending",
+ "wrapLogMessage": false
+ },
+ "targets": [
+ {
+ "expr": "{job=\"docker\"}",
+ "queryType": "range",
+ "refId": "A"
+ }
+ ],
+ "title": "Container Logs",
+ "type": "logs"
+ }
+ ],
+ "refresh": "10s",
+ "schemaVersion": 39,
+ "style": "dark",
+ "tags": [
+ "devops",
+ "loki",
+ "lab07"
+ ],
+ "templating": {
+ "list": []
+ },
+ "time": {
+ "from": "now-15m",
+ "to": "now"
+ },
+ "timepicker": {},
+ "timezone": "",
+ "title": "DevOps Logs",
+ "uid": "devops-logs",
+ "version": 1,
+ "weekStart": ""
+}
diff --git a/monitoring/grafana/provisioning/dashboards/dashboards.yml b/monitoring/grafana/provisioning/dashboards/dashboards.yml
new file mode 100644
index 0000000000..7435f09d71
--- /dev/null
+++ b/monitoring/grafana/provisioning/dashboards/dashboards.yml
@@ -0,0 +1,12 @@
+apiVersion: 1
+
+providers:
+ - name: 'default'
+ orgId: 1
+ folder: ''
+ type: file
+ disableDeletion: false
+ updateIntervalSeconds: 10
+ allowUiUpdates: true
+ options:
+ path: /var/lib/grafana/dashboards
diff --git a/monitoring/grafana/provisioning/datasources/loki.yml b/monitoring/grafana/provisioning/datasources/loki.yml
new file mode 100644
index 0000000000..e6f033cf5e
--- /dev/null
+++ b/monitoring/grafana/provisioning/datasources/loki.yml
@@ -0,0 +1,21 @@
+# Grafana datasource provisioning for Loki
+# This file automatically configures the Loki datasource on Grafana startup
+apiVersion: 1
+
+datasources:
+  - name: Loki
+    type: loki
+    # Fixed UID so that derivedFields[].datasourceUid below resolves to this datasource
+    uid: loki
+    access: proxy
+    url: http://loki:3100
+    isDefault: true
+    jsonData:
+      maxLines: 1000
+      derivedFields:
+        # Extract trace IDs if available
+        - datasourceUid: loki
+          matcherRegex: "trace_id=(\\w+)"
+          name: TraceID
+          url: "$${__value.raw}"
+    editable: true
diff --git a/monitoring/grafana/provisioning/datasources/prometheus.yml b/monitoring/grafana/provisioning/datasources/prometheus.yml
new file mode 100644
index 0000000000..17b63c049a
--- /dev/null
+++ b/monitoring/grafana/provisioning/datasources/prometheus.yml
@@ -0,0 +1,9 @@
+apiVersion: 1
+
+datasources:
+ - name: Prometheus
+ type: prometheus
+ access: proxy
+ url: http://prometheus:9090
+ isDefault: false
+ editable: true
diff --git a/monitoring/loki/config.yml b/monitoring/loki/config.yml
new file mode 100644
index 0000000000..145cc0feb3
--- /dev/null
+++ b/monitoring/loki/config.yml
@@ -0,0 +1,77 @@
+# Loki 3.0 Configuration with TSDB and 7-day retention
+# Documentation: https://grafana.com/docs/loki/latest/configure/
+
+auth_enabled: false
+
+server:
+ http_listen_port: 3100
+ grpc_listen_port: 9096
+
+# Common configuration shared across components
+common:
+ path_prefix: /tmp/loki
+ storage:
+ filesystem:
+ chunks_directory: /tmp/loki/chunks
+ rules_directory: /tmp/loki/rules
+ replication_factor: 1
+ ring:
+ instance_addr: 127.0.0.1
+ kvstore:
+ store: inmemory
+
+# Query configuration
+query_range:
+ results_cache:
+ cache:
+ embedded_cache:
+ enabled: true
+ max_size_mb: 100
+
+# Schema configuration with TSDB (faster than boltdb-shipper in Loki 3.0)
+schema_config:
+ configs:
+ - from: 2020-10-24
+ store: tsdb
+ object_store: filesystem
+ schema: v13
+ index:
+ prefix: index_
+ period: 24h
+
+# Storage configuration
+storage_config:
+ tsdb_shipper:
+ active_index_directory: /tmp/loki/tsdb-index
+ cache_location: /tmp/loki/tsdb-cache
+ cache_ttl: 24h
+ filesystem:
+ directory: /tmp/loki/chunks
+
+# Compactor configuration (required for retention; Loki 3.x removed shared_store and needs delete_request_store instead)
+compactor:
+  working_directory: /tmp/loki/boltdb-shipper-compactor
+  delete_request_store: filesystem
+  compaction_interval: 10m
+  retention_enabled: true
+  retention_delete_delay: 2h
+  retention_delete_worker_count: 150
+
+# Limits configuration with 7-day (168h) retention
+limits_config:
+ retention_period: 168h
+ reject_old_samples: true
+ reject_old_samples_max_age: 168h
+ ingestion_rate_mb: 4
+ ingestion_burst_size_mb: 6
+ max_label_name_length: 1024
+ max_label_value_length: 2048
+ max_label_names_per_series: 30
+
+# Runtime configuration
+runtime_config:
+ file: /tmp/loki/runtime-config.yaml
+
+# Analytics disabled for privacy
+analytics:
+ reporting_enabled: false
diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml
new file mode 100644
index 0000000000..26a4b69a73
--- /dev/null
+++ b/monitoring/prometheus/prometheus.yml
@@ -0,0 +1,23 @@
+global:
+ scrape_interval: 15s
+ evaluation_interval: 15s
+
+scrape_configs:
+ - job_name: 'prometheus'
+ static_configs:
+ - targets: ['localhost:9090']
+
+ - job_name: 'app'
+ static_configs:
+ - targets: ['app-python:5000']
+ metrics_path: '/metrics'
+
+ - job_name: 'loki'
+ static_configs:
+ - targets: ['loki:3100']
+ metrics_path: '/metrics'
+
+ - job_name: 'grafana'
+ static_configs:
+ - targets: ['grafana:3000']
+ metrics_path: '/metrics'
diff --git a/monitoring/promtail/config.yml b/monitoring/promtail/config.yml
new file mode 100644
index 0000000000..33c36ee10d
--- /dev/null
+++ b/monitoring/promtail/config.yml
@@ -0,0 +1,77 @@
+# Promtail 3.0 Configuration for Docker log collection
+# Documentation: https://grafana.com/docs/loki/latest/send-data/promtail/
+
+server:
+ http_listen_port: 9080
+ grpc_listen_port: 0
+
+# Position file to track which logs have been read
+positions:
+ filename: /tmp/positions.yaml
+
+# Loki client configuration
+clients:
+ - url: http://loki:3100/loki/api/v1/push
+
+# Scrape configurations
+scrape_configs:
+ # Docker service discovery configuration
+ - job_name: docker
+ docker_sd_configs:
+ - host: unix:///var/run/docker.sock
+ refresh_interval: 5s
+ filters:
+ - name: label
+ values: ["logging=promtail"]
+
+ relabel_configs:
+ # Extract container name and remove leading '/'
+ - source_labels: ['__meta_docker_container_name']
+ regex: '/(.*)'
+ target_label: 'container'
+
+ # Extract container ID (short version)
+ - source_labels: ['__meta_docker_container_id']
+ regex: '([a-zA-Z0-9]{12}).*'
+ target_label: 'container_id'
+
+ # Extract app label if present
+ - source_labels: ['__meta_docker_container_label_app']
+ target_label: 'app'
+
+      # Extract Docker Compose service name
+ - source_labels: ['__meta_docker_container_label_com_docker_compose_service']
+ target_label: 'compose_service'
+
+ # Add job label
+ - replacement: 'docker'
+ target_label: 'job'
+
+ # Pipeline stages for log processing
+ pipeline_stages:
+ # Parse JSON logs if they are JSON
+ - json:
+ expressions:
+ level: level
+ timestamp: timestamp
+ message: message
+ method: method
+ path: path
+ status_code: status_code
+
+ # Extract labels from JSON fields
+ - labels:
+ level:
+ method:
+
+ # Set timestamp from JSON if available
+ - timestamp:
+ source: timestamp
+ format: RFC3339Nano
+ fallback_formats:
+ - RFC3339
+ - '2006-01-02T15:04:05.999999999Z07:00'
+
+ # Output stage for debugging (comment out in production)
+ # - output:
+ # source: message
diff --git a/monitoring/verify-stack.ps1 b/monitoring/verify-stack.ps1
new file mode 100644
index 0000000000..5014448066
--- /dev/null
+++ b/monitoring/verify-stack.ps1
@@ -0,0 +1,209 @@
+# Lab 8 - Monitoring Stack Testing Script (PowerShell)
+# This script tests observability components: Prometheus, Loki, Promtail, Grafana, and app metrics
+
+Write-Host "=========================================" -ForegroundColor Cyan
+Write-Host "Lab 8 - Observability Stack Verification" -ForegroundColor Cyan
+Write-Host "=========================================" -ForegroundColor Cyan
+Write-Host ""
+
+function Test-Endpoint {
+ param(
+ [string]$Url,
+ [int]$ExpectedStatus,
+ [string]$Name
+ )
+
+ Write-Host "Testing $Name... " -NoNewline
+ try {
+ $response = Invoke-WebRequest -Uri $Url -UseBasicParsing -TimeoutSec 5 -ErrorAction Stop
+ if ($response.StatusCode -eq $ExpectedStatus) {
+ Write-Host "✓ (HTTP $($response.StatusCode))" -ForegroundColor Green
+ return $true
+ } else {
+ Write-Host "✗ (HTTP $($response.StatusCode), expected $ExpectedStatus)" -ForegroundColor Red
+ return $false
+ }
+ } catch {
+ Write-Host "✗ (Failed to connect)" -ForegroundColor Red
+ return $false
+ }
+}
+
+Write-Host "1. Checking Docker Compose services..." -ForegroundColor Yellow
+Write-Host "---------------------------------------"
+Push-Location $PSScriptRoot
+docker compose ps --format table
+Write-Host ""
+
+Write-Host "2. Testing service endpoints..." -ForegroundColor Yellow
+Write-Host "---------------------------------------"
+
+# Test all endpoints
+$endpoints = @(
+ @{Url="http://localhost:3100/ready"; Status=200; Name="Loki /ready"}
+ @{Url="http://localhost:3100/metrics"; Status=200; Name="Loki /metrics"}
+ @{Url="http://localhost:9090/-/healthy"; Status=200; Name="Prometheus /-/healthy"}
+ @{Url="http://localhost:9090/targets"; Status=200; Name="Prometheus /targets"}
+ @{Url="http://localhost:9080/ready"; Status=200; Name="Promtail /ready"}
+ @{Url="http://localhost:9080/targets"; Status=200; Name="Promtail /targets"}
+ @{Url="http://localhost:3000/api/health"; Status=200; Name="Grafana /api/health"}
+ @{Url="http://localhost:8000/"; Status=200; Name="Python App /"}
+ @{Url="http://localhost:8000/health"; Status=200; Name="Python App /health"}
+ @{Url="http://localhost:8000/metrics"; Status=200; Name="Python App /metrics"}
+)
+
+foreach ($endpoint in $endpoints) {
+    $null = Test-Endpoint -Url $endpoint.Url -ExpectedStatus $endpoint.Status -Name $endpoint.Name  # discard the returned bool so "True"/"False" is not echoed
+}
+
+Write-Host ""
+Write-Host "3. Checking Promtail targets..." -ForegroundColor Yellow
+Write-Host "---------------------------------------"
+try {
+ $targetsResponse = Invoke-RestMethod -Uri "http://localhost:9080/targets" -UseBasicParsing
+ $targetCount = $targetsResponse.activeTargets.Count
+ Write-Host "Active targets: $targetCount"
+
+ if ($targetCount -gt 0) {
+ Write-Host "✓ Promtail is collecting logs from $targetCount targets" -ForegroundColor Green
+ Write-Host ""
+ Write-Host "Target details:"
+ $targetsResponse.activeTargets | Select-Object -First 3 | ForEach-Object {
+ Write-Host " - Container: $($_.labels.container)" -ForegroundColor Cyan
+ Write-Host " App: $($_.labels.app)" -ForegroundColor Cyan
+ }
+ } else {
+ Write-Host "✗ No active targets found" -ForegroundColor Red
+ Write-Host "Check if containers have the 'logging=promtail' label"
+ }
+} catch {
+ Write-Host "✗ Failed to query Promtail targets" -ForegroundColor Red
+}
+
+Write-Host ""
+Write-Host "4. Checking Loki labels..." -ForegroundColor Yellow
+Write-Host "---------------------------------------"
+try {
+ $labelsResponse = Invoke-RestMethod -Uri "http://localhost:3100/loki/api/v1/labels" -UseBasicParsing
+ if ($labelsResponse.data.Count -gt 0) {
+ Write-Host "Available labels in Loki:"
+ $labelsResponse.data | Select-Object -First 10 | ForEach-Object {
+ Write-Host " - $_" -ForegroundColor Cyan
+ }
+ Write-Host "✓ Loki has labels configured" -ForegroundColor Green
+ } else {
+ Write-Host "⚠ No labels found yet (logs may not have been ingested)" -ForegroundColor Yellow
+ }
+} catch {
+ Write-Host "⚠ Failed to query Loki labels" -ForegroundColor Yellow
+}
+
+Write-Host ""
+Write-Host "5. Checking Docker container logs (JSON format)..." -ForegroundColor Yellow
+Write-Host "---------------------------------------"
+$pythonAppLogs = docker logs devops-python-app --tail 3 2>&1
+if ($pythonAppLogs) {
+ Write-Host "Sample logs from Python app:"
+ $pythonAppLogs | ForEach-Object {
+ Write-Host " $_" -ForegroundColor Gray
+ }
+
+ # Check if JSON
+ try {
+ $lastLog = docker logs devops-python-app --tail 1 2>&1 | Out-String
+ $null = $lastLog | ConvertFrom-Json
+ Write-Host "✓ Python app is logging in JSON format" -ForegroundColor Green
+ } catch {
+ Write-Host "⚠ Python app logs may not be in JSON format" -ForegroundColor Yellow
+ }
+} else {
+ Write-Host "⚠ Python app container not found" -ForegroundColor Yellow
+}
+
+Write-Host ""
+Write-Host "6. Testing Loki queries..." -ForegroundColor Yellow
+Write-Host "---------------------------------------"
+try {
+ $queryUrl = "http://localhost:3100/loki/api/v1/query?query={job=`"docker`"}&limit=5"
+ $queryResponse = Invoke-RestMethod -Uri $queryUrl -UseBasicParsing
+ $resultCount = $queryResponse.data.result.Count
+
+ if ($resultCount -gt 0) {
+ Write-Host "✓ Query returned $resultCount log streams" -ForegroundColor Green
+ } else {
+ Write-Host "⚠ No logs found (may need to generate some traffic first)" -ForegroundColor Yellow
+ }
+} catch {
+ Write-Host "⚠ Failed to query Loki" -ForegroundColor Yellow
+}
+
+Write-Host ""
+Write-Host "7. Generating test traffic..." -ForegroundColor Yellow
+Write-Host "---------------------------------------"
+Write-Host "Sending 20 requests to Python app..."
+
+1..10 | ForEach-Object {
+ $null = Invoke-WebRequest -Uri "http://localhost:8000/" -UseBasicParsing -ErrorAction SilentlyContinue
+ $null = Invoke-WebRequest -Uri "http://localhost:8000/health" -UseBasicParsing -ErrorAction SilentlyContinue
+}
+
+Write-Host "✓ Generated 20 requests" -ForegroundColor Green
+Write-Host "Waiting 10 seconds for logs to be ingested..."
+Start-Sleep -Seconds 10
+
+# Query again
+try {
+ $queryUrl = "http://localhost:3100/loki/api/v1/query?query={app=`"devops-python`"}&limit=5"
+ $queryResponseAfter = Invoke-RestMethod -Uri $queryUrl -UseBasicParsing
+ $resultCountAfter = $queryResponseAfter.data.result.Count
+
+ if ($resultCountAfter -gt 0) {
+ Write-Host "✓ Query returned $resultCountAfter log streams from Python app" -ForegroundColor Green
+ } else {
+ Write-Host "⚠ Still no logs from Python app" -ForegroundColor Yellow
+ }
+} catch {
+ Write-Host "⚠ Failed to query Loki after traffic generation" -ForegroundColor Yellow
+}
+
+Write-Host ""
+Write-Host "8. Checking Prometheus targets..." -ForegroundColor Yellow
+Write-Host "---------------------------------------"
+try {
+ $upQuery = Invoke-RestMethod -Uri "http://localhost:9090/api/v1/query?query=up" -UseBasicParsing
+ $upCount = $upQuery.data.result.Count
+ if ($upCount -gt 0) {
+ Write-Host "✓ Prometheus up query returned $upCount target series" -ForegroundColor Green
+ } else {
+ Write-Host "✗ Prometheus up query returned no data" -ForegroundColor Red
+ }
+} catch {
+ Write-Host "✗ Failed to query Prometheus" -ForegroundColor Red
+}
+
+Write-Host ""
+Write-Host "9. Checking resource usage..." -ForegroundColor Yellow
+Write-Host "---------------------------------------"
+docker stats --no-stream --format "table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}"
+
+Write-Host ""
+Write-Host "=========================================" -ForegroundColor Cyan
+Write-Host "Verification Summary" -ForegroundColor Cyan
+Write-Host "=========================================" -ForegroundColor Cyan
+Write-Host ""
+Write-Host "Next Steps:" -ForegroundColor Green
+Write-Host "1. Access Grafana: http://localhost:3000"
+Write-Host " - Login: admin / (your password from .env)"
+Write-Host "2. Access Prometheus: http://localhost:9090/targets"
+Write-Host "3. In Grafana Explore run Loki query: {job=`"docker`"}"
+Write-Host "4. In Grafana Explore run PromQL query: sum(rate(http_requests_total[5m]))"
+Write-Host "5. Take screenshots for documentation"
+Write-Host ""
+Write-Host "Useful commands:"
+Write-Host " - View logs: docker compose logs -f [service]"
+Write-Host " - Restart: docker compose restart [service]"
+Write-Host " - Stop all: docker compose down"
+Write-Host ""
+Write-Host "=========================================" -ForegroundColor Cyan
+
+Pop-Location
diff --git a/monitoring/verify-stack.sh b/monitoring/verify-stack.sh
new file mode 100644
index 0000000000..c752ffb78c
--- /dev/null
+++ b/monitoring/verify-stack.sh
@@ -0,0 +1,209 @@
+#!/bin/bash
+# Lab 8 - Monitoring Stack Testing Script
+# This script tests observability components: Prometheus, Loki, Promtail, Grafana, and app metrics
+
+set -e # Exit on error
+
+echo "========================================="
+echo "Lab 8 - Observability Stack Verification"
+echo "========================================="
+echo ""
+
+# Colors for output
+GREEN='\033[0;32m'
+RED='\033[0;31m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# Function to print status
+print_status() {
+ if [ $1 -eq 0 ]; then
+ echo -e "${GREEN}✓${NC} $2"
+ else
+ echo -e "${RED}✗${NC} $2"
+ fi
+}
+
+# Probe an HTTP endpoint. Usage: test_endpoint URL EXPECTED_CODE NAME
+# Prints a ✓/✗ line; returns 0 when the status code equals EXPECTED_CODE, else 1.
+test_endpoint() {
+    local url=$1 expected=$2 name=$3 response
+    echo -n "Testing $name... "
+    # curl already prints 000 via -w when the connection fails, so assign the
+    # fallback rather than appending a second "000" to the captured output.
+    response=$(curl -s -w "%{http_code}" -o /dev/null "$url" 2>/dev/null) || response="000"
+
+    if [ "$response" = "$expected" ]; then
+        echo -e "${GREEN}✓${NC} (HTTP $response)"
+        return 0
+    else
+        echo -e "${RED}✗${NC} (HTTP $response, expected $expected)"
+        return 1
+    fi
+}
+
+echo "1. Checking Docker Compose services..."
+echo "---------------------------------------"
+cd "$(dirname "$0")"
+
+if docker compose ps --format json > /dev/null 2>&1; then
+ services=$(docker compose ps --format json | jq -r '.[].Service' 2>/dev/null || docker compose ps --services)
+ echo "Services detected: $services"
+
+ # Check each service status
+ docker compose ps --format table
+ echo ""
+else
+ echo -e "${RED}✗${NC} Docker Compose not running or not in correct directory"
+ echo "Please run this script from the monitoring directory"
+ exit 1
+fi
+
+echo ""
+echo "2. Testing service endpoints..."
+echo "---------------------------------------"
+
+# Test Loki
+test_endpoint "http://localhost:3100/ready" "200" "Loki /ready"
+test_endpoint "http://localhost:3100/metrics" "200" "Loki /metrics"
+
+# Test Prometheus
+test_endpoint "http://localhost:9090/-/healthy" "200" "Prometheus /-/healthy"
+test_endpoint "http://localhost:9090/targets" "200" "Prometheus /targets"
+
+# Test Promtail
+test_endpoint "http://localhost:9080/ready" "200" "Promtail /ready"
+test_endpoint "http://localhost:9080/targets" "200" "Promtail /targets"
+
+# Test Grafana
+test_endpoint "http://localhost:3000/api/health" "200" "Grafana /api/health"
+
+# Test Python App
+test_endpoint "http://localhost:8000/" "200" "Python App /"
+test_endpoint "http://localhost:8000/health" "200" "Python App /health"
+test_endpoint "http://localhost:8000/metrics" "200" "Python App /metrics"
+
+echo ""
+echo "3. Checking Promtail targets..."
+echo "---------------------------------------"
+targets=$(curl -s http://localhost:9080/targets 2>/dev/null | jq '.activeTargets | length' 2>/dev/null || echo "0")
+echo "Active targets: $targets"
+
+if [ "$targets" -gt 0 ]; then
+ echo -e "${GREEN}✓${NC} Promtail is collecting logs from $targets targets"
+ echo ""
+ echo "Target details:"
+ curl -s http://localhost:9080/targets | jq '.activeTargets[] | {labels: .labels, discoveredLabels: .discoveredLabels}' | head -30
+else
+ echo -e "${RED}✗${NC} No active targets found"
+ echo "Check if containers have the 'logging=promtail' label"
+fi
+
+echo ""
+echo "4. Checking Loki labels..."
+echo "---------------------------------------"
+labels=$(curl -s http://localhost:3100/loki/api/v1/labels 2>/dev/null | jq -r '.data[]' 2>/dev/null || echo "")
+if [ -n "$labels" ]; then
+ echo "Available labels in Loki:"
+ echo "$labels" | head -20
+ echo -e "${GREEN}✓${NC} Loki has labels configured"
+else
+ echo -e "${YELLOW}⚠${NC} No labels found yet (logs may not have been ingested)"
+fi
+
+echo ""
+echo "5. Checking Docker container logs (JSON format)..."
+echo "---------------------------------------"
+if docker ps --format "{{.Names}}" | grep -q "devops-python-app"; then
+ echo "Sample log from Python app:"
+ docker logs devops-python-app 2>&1 | tail -3
+
+ # Check if logs are JSON
+ if docker logs devops-python-app 2>&1 | tail -1 | jq . > /dev/null 2>&1; then
+ echo -e "${GREEN}✓${NC} Python app is logging in JSON format"
+ else
+ echo -e "${YELLOW}⚠${NC} Python app logs may not be in JSON format"
+ fi
+else
+ echo -e "${YELLOW}⚠${NC} Python app container not found"
+fi
+
+echo ""
+echo "6. Testing Loki queries..."
+echo "---------------------------------------"
+
+# Query all logs from docker job
+echo "Query: {job=\"docker\"}"
+query_result=$(curl -s -G "http://localhost:3100/loki/api/v1/query" \
+ --data-urlencode 'query={job="docker"}' \
+ --data-urlencode 'limit=5' 2>/dev/null | jq '.data.result | length' 2>/dev/null || echo "0")
+
+if [ "$query_result" -gt 0 ]; then
+ echo -e "${GREEN}✓${NC} Query returned $query_result log streams"
+else
+ echo -e "${YELLOW}⚠${NC} No logs found (may need to generate some traffic first)"
+fi
+
+echo ""
+echo "7. Generating test traffic..."
+echo "---------------------------------------"
+echo "Sending 20 requests to Python app..."
+
+for i in {1..10}; do
+ curl -s http://localhost:8000/ > /dev/null 2>&1
+ curl -s http://localhost:8000/health > /dev/null 2>&1
+done
+
+echo -e "${GREEN}✓${NC} Generated 20 requests"
+echo "Wait 10 seconds for logs to be ingested..."
+sleep 10
+
+# Query again after generating traffic
+echo ""
+echo "Query after generating traffic: {app=\"devops-python\"}"
+query_result_after=$(curl -s -G "http://localhost:3100/loki/api/v1/query" \
+ --data-urlencode 'query={app="devops-python"}' \
+ --data-urlencode 'limit=5' 2>/dev/null | jq '.data.result | length' 2>/dev/null || echo "0")
+
+if [ "$query_result_after" -gt 0 ]; then
+ echo -e "${GREEN}✓${NC} Query returned $query_result_after log streams from Python app"
+else
+ echo -e "${YELLOW}⚠${NC} Still no logs from Python app"
+fi
+
+echo ""
+echo "8. Checking Prometheus targets..."
+echo "---------------------------------------"
+up_targets=$(curl -s http://localhost:9090/api/v1/query --data-urlencode 'query=up' 2>/dev/null | jq -r '.data.result | length' 2>/dev/null || echo "0")
+echo "Targets visible in Prometheus up query: $up_targets"
+
+if [ "$up_targets" -gt 0 ]; then
+ echo -e "${GREEN}✓${NC} Prometheus can query targets"
+else
+ echo -e "${RED}✗${NC} Prometheus target query returned no data"
+fi
+
+echo ""
+echo "9. Checking resource usage..."
+echo "---------------------------------------"
+docker stats --no-stream --format "table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}" | grep -E "prometheus|loki|promtail|grafana|devops-python" || true  # grep exits 1 on no match; do not abort under 'set -e'
+
+echo ""
+echo "========================================="
+echo "Verification Summary"
+echo "========================================="
+echo ""
+echo "Next Steps:"
+echo "1. Access Grafana: http://localhost:3000"
+echo " - Login: admin / (your password from .env)"
+echo "2. Access Prometheus: http://localhost:9090/targets"
+echo "3. In Grafana Explore run Loki query: {job=\"docker\"}"
+echo "4. In Grafana Explore run PromQL query: sum(rate(http_requests_total[5m]))"
+echo "5. Take screenshots for documentation"
+echo ""
+echo "Useful commands:"
+echo " - View logs: docker compose logs -f [service]"
+echo " - Restart: docker compose restart [service]"
+echo " - Stop all: docker compose down"
+echo ""
+echo "========================================="
diff --git a/pulumi/.gitignore b/pulumi/.gitignore
new file mode 100644
index 0000000000..091fa9fbcd
--- /dev/null
+++ b/pulumi/.gitignore
@@ -0,0 +1,34 @@
+# Pulumi
+*.pyc
+__pycache__/
+venv/
+.venv/
+*.egg-info/
+
+# Cloud credentials
+*.pem
+*.key
+*.json
+credentials
+key.json
+service-account-key.json
+
+# Pulumi state and config
+Pulumi.*.yaml
+!Pulumi.yaml
+!Pulumi.dev.yaml.example
+
+# Environment variables
+.env
+.env.local
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS
+.DS_Store
+Thumbs.db
diff --git a/pulumi/Pulumi.dev.yaml.example b/pulumi/Pulumi.dev.yaml.example
new file mode 100644
index 0000000000..40439f3004
--- /dev/null
+++ b/pulumi/Pulumi.dev.yaml.example
@@ -0,0 +1,23 @@
+config:
+ # Yandex Cloud Configuration
+ # PLACEHOLDER: Replace with your actual values
+ yandex:cloudId: "b1g1234567890abcdefg" # Your cloud ID
+ yandex:folderId: "b1g0987654321zyxwvut" # Your folder ID
+ yandex:zone: "ru-central1-a"
+ yandex:token: "" # Leave empty, will use service account key
+
+ # Project Configuration
+ devops-lab04-pulumi:vmName: "devops-lab04-vm-pulumi"
+ devops-lab04-pulumi:vmUser: "ubuntu"
+ devops-lab04-pulumi:sshPublicKey: | # PLACEHOLDER: Paste your SSH public key
+ ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQC... your-email@example.com
+
+ # VM Resources (Free Tier Compatible)
+ devops-lab04-pulumi:vmCores: "2"
+ devops-lab04-pulumi:vmMemory: "1"
+ devops-lab04-pulumi:vmCoreFraction: "20"
+ devops-lab04-pulumi:diskSize: "10"
+ devops-lab04-pulumi:diskType: "network-hdd"
+
+ # Security
+ devops-lab04-pulumi:allowSshFromCidr: "0.0.0.0/0" # WARNING: Change to your IP!
diff --git a/pulumi/Pulumi.yaml b/pulumi/Pulumi.yaml
new file mode 100644
index 0000000000..5ad7b69cbe
--- /dev/null
+++ b/pulumi/Pulumi.yaml
@@ -0,0 +1,3 @@
+name: devops-lab04-pulumi
+runtime: python
+description: Infrastructure as Code for DevOps Lab 04 using Pulumi
diff --git a/pulumi/__main__.py b/pulumi/__main__.py
new file mode 100644
index 0000000000..0a6444c28d
--- /dev/null
+++ b/pulumi/__main__.py
@@ -0,0 +1,162 @@
+"""
+Pulumi Infrastructure as Code for DevOps Lab 04
+Cloud Provider: Yandex Cloud
+Purpose: Provision a VM for Ansible configuration (Lab 05)
+"""
+
+import pulumi
+import pulumi_yandex as yandex
+
+# Configuration
+config = pulumi.Config()
+
+# Yandex Cloud Configuration
+zone = config.get("yandex:zone") or "ru-central1-a"
+
+# VM Configuration
+vm_name = config.get("vmName") or "devops-lab04-vm-pulumi"
+vm_user = config.get("vmUser") or "ubuntu"
+ssh_public_key = config.require("sshPublicKey")
+
+# VM Resources
+vm_cores = config.get_int("vmCores") or 2
+vm_memory = config.get_int("vmMemory") or 1
+vm_core_fraction = config.get_int("vmCoreFraction") or 20
+disk_size = config.get_int("diskSize") or 10
+disk_type = config.get("diskType") or "network-hdd"
+
+# Security
+allow_ssh_from_cidr = config.get("allowSshFromCidr") or "0.0.0.0/0"
+
+# Data source: Find latest Ubuntu 24.04 LTS image
+ubuntu_image = yandex.get_compute_image(
+ family="ubuntu-2404-lts",
+)
+
+# VPC Network
+network = yandex.VpcNetwork(
+ "devops-network",
+ name="devops-network-pulumi",
+ description="Network for DevOps course lab infrastructure (Pulumi)",
+)
+
+# Subnet
+subnet = yandex.VpcSubnet(
+ "devops-subnet",
+ name="devops-subnet-pulumi",
+ description="Subnet for DevOps VMs (Pulumi)",
+ v4_cidr_blocks=["10.129.0.0/24"],
+ zone=zone,
+ network_id=network.id,
+)
+
+# Security Group (Firewall Rules)
+security_group = yandex.VpcSecurityGroup(
+ "devops-sg",
+ name="devops-security-group-pulumi",
+ description="Security group for DevOps VM - allows SSH (Pulumi)",
+ network_id=network.id,
+ ingress=[
+ # Allow SSH
+ yandex.VpcSecurityGroupIngressArgs(
+ protocol="TCP",
+ description="Allow SSH",
+ v4_cidr_blocks=[allow_ssh_from_cidr],
+ port=22,
+ ),
+ # Allow HTTP
+ yandex.VpcSecurityGroupIngressArgs(
+ protocol="TCP",
+ description="Allow HTTP",
+ v4_cidr_blocks=["0.0.0.0/0"],
+ port=80,
+ ),
+ # Allow HTTPS
+ yandex.VpcSecurityGroupIngressArgs(
+ protocol="TCP",
+ description="Allow HTTPS",
+ v4_cidr_blocks=["0.0.0.0/0"],
+ port=443,
+ ),
+ ],
+ egress=[
+ # Allow all outbound traffic
+ yandex.VpcSecurityGroupEgressArgs(
+ protocol="ANY",
+ description="Allow all outbound traffic",
+ v4_cidr_blocks=["0.0.0.0/0"],
+ from_port=0,
+ to_port=65535,
+ ),
+ ],
+)
+
+# Cloud-init configuration
+cloud_init = f"""#cloud-config
+users:
+ - name: {vm_user}
+ groups: sudo
+ shell: /bin/bash
+ sudo: ['ALL=(ALL) NOPASSWD:ALL']
+ ssh_authorized_keys:
+ - {ssh_public_key}
+package_update: true
+package_upgrade: true
+packages:
+ - curl
+ - wget
+ - git
+ - vim
+runcmd:
+ - echo "VM provisioned by Pulumi for DevOps Lab 04" > /etc/motd
+"""
+
+# Compute Instance (Virtual Machine)
+vm = yandex.ComputeInstance(
+ "devops-vm",
+ name=vm_name,
+ platform_id="standard-v2",
+ zone=zone,
+ hostname="devops-lab04-pulumi",
+ resources=yandex.ComputeInstanceResourcesArgs(
+ cores=vm_cores,
+ memory=vm_memory,
+ core_fraction=vm_core_fraction,
+ ),
+ boot_disk=yandex.ComputeInstanceBootDiskArgs(
+ initialize_params=yandex.ComputeInstanceBootDiskInitializeParamsArgs(
+ image_id=ubuntu_image.id,
+ size=disk_size,
+ type=disk_type,
+ ),
+ ),
+ network_interfaces=[
+ yandex.ComputeInstanceNetworkInterfaceArgs(
+ subnet_id=subnet.id,
+ nat=True, # Assign public IP
+ security_group_ids=[security_group.id],
+ )
+ ],
+ metadata={
+ "user-data": cloud_init,
+ },
+ labels={
+ "environment": "lab04",
+ "managed_by": "pulumi",
+ "purpose": "devops-course",
+ },
+)
+
+# Exports (Outputs)
+pulumi.export("vm_id", vm.id)
+pulumi.export("vm_name", vm.name)
+pulumi.export("vm_fqdn", vm.fqdn)
+pulumi.export("vm_public_ip", vm.network_interfaces[0].nat_ip_address)
+pulumi.export("vm_private_ip", vm.network_interfaces[0].ip_address)
+pulumi.export("ssh_connection", vm.network_interfaces[0].nat_ip_address.apply(
+ lambda ip: f"ssh {vm_user}@{ip}"
+))
+pulumi.export("vm_zone", vm.zone)
+pulumi.export("network_id", network.id)
+pulumi.export("subnet_id", subnet.id)
+pulumi.export("security_group_id", security_group.id)
diff --git a/pulumi/requirements.txt b/pulumi/requirements.txt
new file mode 100644
index 0000000000..ad106a5476
--- /dev/null
+++ b/pulumi/requirements.txt
@@ -0,0 +1,2 @@
+pulumi>=3.0.0,<4.0.0
+pulumi-yandex>=0.13.0
diff --git a/terraform/.gitignore b/terraform/.gitignore
new file mode 100644
index 0000000000..dc4846bb70
--- /dev/null
+++ b/terraform/.gitignore
@@ -0,0 +1,34 @@
+# Terraform state, local provider cache and variable files (may contain secrets)
+*.tfstate
+*.tfstate.*
+.terraform/
+terraform.tfvars
+*.tfvars
+# NOTE: .terraform.lock.hcl is deliberately NOT ignored - commit it so that
+# every checkout installs the same provider versions.
+
+# Crash log files
+crash.log
+crash.*.log
+
+# Cloud credentials
+*.pem
+*.key
+*.json
+credentials
+key.json
+service-account-key.json
+
+# Environment variables
+.env
+.env.local
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS
+.DS_Store
+Thumbs.db
diff --git a/terraform/.tflint.hcl b/terraform/.tflint.hcl
new file mode 100644
index 0000000000..379b2f43ef
--- /dev/null
+++ b/terraform/.tflint.hcl
@@ -0,0 +1,42 @@
+# TFLint Configuration for DevOps Lab 04
+# Enables the bundled Terraform ruleset with its "recommended" preset and then
+# switches the rules below on explicitly.
+
+plugin "terraform" {
+ enabled = true
+ preset = "recommended"
+}
+
+# Enforce a naming convention for blocks (resources, variables, outputs, ...)
+rule "terraform_naming_convention" {
+ enabled = true
+}
+
+# Disallow deprecated interpolation-only expressions such as "${var.x}"
+rule "terraform_deprecated_interpolation" {
+ enabled = true
+}
+
+# Every output must have a description
+rule "terraform_documented_outputs" {
+ enabled = true
+}
+
+# Every variable must have a description
+rule "terraform_documented_variables" {
+ enabled = true
+}
+
+# Every variable must declare a type
+rule "terraform_typed_variables" {
+ enabled = true
+}
+
+# Flag variables, locals and data sources that are declared but never used
+rule "terraform_unused_declarations" {
+ enabled = true
+}
+
+# Require "#" comments instead of "//"
+rule "terraform_comment_syntax" {
+ enabled = true
+}
+
+# Require a required_version constraint in the terraform block
+rule "terraform_required_version" {
+ enabled = true
+}
+
+# Require version constraints for all providers via required_providers
+rule "terraform_required_providers" {
+ enabled = true
+}
diff --git a/terraform/main.tf b/terraform/main.tf
new file mode 100644
index 0000000000..de7e1e1802
--- /dev/null
+++ b/terraform/main.tf
@@ -0,0 +1,137 @@
+# Terraform configuration for DevOps Lab 04
+# Cloud Provider: Yandex Cloud
+# Purpose: Provision a VM for Ansible configuration (Lab 05)
+
+# Toolchain constraints: Terraform CLI 1.9.0 or newer, Yandex Cloud provider
+# 0.130 or newer within the 0.x series.
+terraform {
+  required_providers {
+    yandex = {
+      version = "~> 0.130"
+      source  = "yandex-cloud/yandex"
+    }
+  }
+
+  required_version = ">= 1.9.0"
+}
+
+# Yandex Cloud provider: authenticates with a service account key file and
+# targets the cloud, folder and default zone supplied through variables.
+provider "yandex" {
+  zone                     = var.zone
+  folder_id                = var.folder_id
+  cloud_id                 = var.cloud_id
+  service_account_key_file = var.service_account_key_file
+}
+
+# Image lookup by family name (var.vm_image_family)
+data "yandex_compute_image" "ubuntu" {
+  family = var.vm_image_family
+}
+
+# Dedicated VPC network for the lab
+resource "yandex_vpc_network" "devops_network" {
+  description = "Network for DevOps course lab infrastructure"
+  name        = "devops-network"
+}
+
+# A single /24 subnet inside that network, placed in the configured zone
+resource "yandex_vpc_subnet" "devops_subnet" {
+  network_id     = yandex_vpc_network.devops_network.id
+  zone           = var.zone
+  v4_cidr_blocks = ["10.128.0.0/24"]
+  description    = "Subnet for DevOps VMs"
+  name           = "devops-subnet"
+}
+
+# Security group (firewall rules): SSH from the configured CIDR only,
+# HTTP/HTTPS from anywhere, unrestricted outbound traffic.
+resource "yandex_vpc_security_group" "devops_sg" {
+  network_id  = yandex_vpc_network.devops_network.id
+  name        = "devops-security-group"
+  description = "Security group for DevOps VM - allows SSH"
+
+  # SSH, restricted to var.allow_ssh_from_cidr
+  ingress {
+    description    = "Allow SSH"
+    protocol       = "TCP"
+    port           = 22
+    v4_cidr_blocks = [var.allow_ssh_from_cidr]
+  }
+
+  # HTTP, open to everyone (for future web applications)
+  ingress {
+    description    = "Allow HTTP"
+    protocol       = "TCP"
+    port           = 80
+    v4_cidr_blocks = ["0.0.0.0/0"]
+  }
+
+  # HTTPS, open to everyone (for future web applications)
+  ingress {
+    description    = "Allow HTTPS"
+    protocol       = "TCP"
+    port           = 443
+    v4_cidr_blocks = ["0.0.0.0/0"]
+  }
+
+  # Outbound: any protocol, any port, any destination
+  egress {
+    description    = "Allow all outbound traffic"
+    protocol       = "ANY"
+    from_port      = 0
+    to_port        = 65535
+    v4_cidr_blocks = ["0.0.0.0/0"]
+  }
+}
+
+# Compute Instance (Virtual Machine): booted from the looked-up image, attached
+# to the lab subnet with a public (NAT) address and guarded by the security group.
+resource "yandex_compute_instance" "devops_vm" {
+  name        = var.vm_name
+  platform_id = "standard-v2"
+  zone        = var.zone
+  hostname    = "devops-lab04"
+
+  resources {
+    cores         = var.vm_cores
+    memory        = var.vm_memory
+    core_fraction = var.vm_core_fraction # 20% for free tier
+  }
+
+  boot_disk {
+    initialize_params {
+      image_id = data.yandex_compute_image.ubuntu.id
+      size     = var.disk_size
+      type     = var.disk_type
+    }
+  }
+
+  network_interface {
+    subnet_id          = yandex_vpc_subnet.devops_subnet.id
+    nat                = true # Assign public IP
+    security_group_ids = [yandex_vpc_security_group.devops_sg.id]
+  }
+
+  metadata = {
+    ssh-keys = "${var.vm_user}:${file(var.ssh_public_key_path)}"
+
+    # The login user is created by the explicit "users:" list below, so the
+    # public key is installed through cloud-init as well. Keys delivered only
+    # via the ssh-keys metadata entry may not be applied to a user declared
+    # this way - NOTE(review): confirm against the image's cloud-init setup.
+    # trimspace() drops the trailing newline of the key file.
+    user-data = <<-EOT
+      #cloud-config
+      users:
+        - name: ${var.vm_user}
+          groups: sudo
+          shell: /bin/bash
+          sudo: ['ALL=(ALL) NOPASSWD:ALL']
+          ssh_authorized_keys:
+            - ${trimspace(file(var.ssh_public_key_path))}
+      package_update: true
+      package_upgrade: true
+      packages:
+        - curl
+        - wget
+        - git
+        - vim
+      runcmd:
+        - echo "VM provisioned by Terraform for DevOps Lab 04" > /etc/motd
+    EOT
+  }
+
+  labels = {
+    environment = "lab04"
+    managed_by  = "terraform"
+    purpose     = "devops-course"
+  }
+}
diff --git a/terraform/outputs.tf b/terraform/outputs.tf
new file mode 100644
index 0000000000..7705a3ba37
--- /dev/null
+++ b/terraform/outputs.tf
@@ -0,0 +1,51 @@
+# Values exported by the DevOps Lab 04 stack
+
+# --- Virtual machine ---
+
+output "vm_id" {
+  value       = yandex_compute_instance.devops_vm.id
+  description = "ID of the created VM"
+}
+
+output "vm_name" {
+  value       = yandex_compute_instance.devops_vm.name
+  description = "Name of the VM"
+}
+
+output "vm_fqdn" {
+  value       = yandex_compute_instance.devops_vm.fqdn
+  description = "Fully qualified domain name of the VM"
+}
+
+# Address of the first network interface's NAT (nat = true in main.tf)
+output "vm_public_ip" {
+  value       = yandex_compute_instance.devops_vm.network_interface[0].nat_ip_address
+  description = "Public IP address of the VM"
+}
+
+output "vm_private_ip" {
+  value       = yandex_compute_instance.devops_vm.network_interface[0].ip_address
+  description = "Private IP address of the VM"
+}
+
+# Ready-to-paste command: "ssh <vm_user>@<public ip>"
+output "ssh_connection" {
+  value       = format("ssh %s@%s", var.vm_user, yandex_compute_instance.devops_vm.network_interface[0].nat_ip_address)
+  description = "SSH connection command"
+}
+
+output "vm_zone" {
+  value       = yandex_compute_instance.devops_vm.zone
+  description = "Zone where VM is deployed"
+}
+
+# --- Networking ---
+
+output "network_id" {
+  value       = yandex_vpc_network.devops_network.id
+  description = "ID of the VPC network"
+}
+
+output "subnet_id" {
+  value       = yandex_vpc_subnet.devops_subnet.id
+  description = "ID of the subnet"
+}
+
+output "security_group_id" {
+  value       = yandex_vpc_security_group.devops_sg.id
+  description = "ID of the security group"
+}
diff --git a/terraform/terraform.tfvars.example b/terraform/terraform.tfvars.example
new file mode 100644
index 0000000000..3bc15a3308
--- /dev/null
+++ b/terraform/terraform.tfvars.example
@@ -0,0 +1,35 @@
+# Example terraform.tfvars file
+# Copy this to terraform.tfvars and fill in your actual values
+# NEVER commit terraform.tfvars to Git!
+
+# Yandex Cloud Configuration
+# Get these from: https://console.cloud.yandex.com/
+cloud_id = "b1g1234567890abcdefg" # PLACEHOLDER: Replace with your cloud ID
+folder_id = "b1g0987654321zyxwvut" # PLACEHOLDER: Replace with your folder ID
+
+# Service Account Key
+# Generate from: https://console.cloud.yandex.com/iam/service-accounts
+service_account_key_file = "key.json" # PLACEHOLDER: Path to your service account key
+
+# Zone Configuration
+zone = "ru-central1-a" # Options: ru-central1-a, ru-central1-b, ru-central1-d (ru-central1-c is deprecated)
+
+# VM Configuration
+vm_name = "devops-lab04-vm"
+vm_user = "ubuntu"
+
+# SSH Key (generate if needed: ssh-keygen -t rsa -b 4096)
+ssh_public_key_path = "~/.ssh/id_rsa.pub" # PLACEHOLDER: Update if your key is elsewhere
+
+# VM Resources (Free Tier Compatible)
+vm_cores = 2
+vm_memory = 1
+vm_core_fraction = 20 # 20% for free tier
+disk_size = 10
+disk_type = "network-hdd"
+
+# Security Configuration
+# IMPORTANT: Change to your IP for security!
+# Find your IP: curl ifconfig.me
+# Then set to: "YOUR_IP/32"
+allow_ssh_from_cidr = "0.0.0.0/0" # WARNING: Allows SSH from anywhere!
diff --git a/terraform/variables.tf b/terraform/variables.tf
new file mode 100644
index 0000000000..8fac6f63b3
--- /dev/null
+++ b/terraform/variables.tf
@@ -0,0 +1,89 @@
+# Input variables for the Yandex Cloud lab infrastructure
+
+# --- Cloud account and authentication ---
+
+# Get this from: https://console.cloud.yandex.com/cloud
+variable "cloud_id" {
+  type        = string
+  description = "Yandex Cloud ID"
+}
+
+# Get this from: https://console.cloud.yandex.com/cloud
+variable "folder_id" {
+  type        = string
+  description = "Yandex Cloud Folder ID"
+}
+
+variable "zone" {
+  type        = string
+  default     = "ru-central1-a"
+  description = "Yandex Cloud zone"
+}
+
+# Generate this from: https://console.cloud.yandex.com/iam/service-accounts
+variable "service_account_key_file" {
+  type        = string
+  default     = "key.json"
+  description = "Path to service account key JSON file"
+}
+
+# --- VM identity and access ---
+
+variable "vm_name" {
+  type        = string
+  default     = "devops-lab04-vm"
+  description = "Name of the virtual machine"
+}
+
+variable "vm_user" {
+  type        = string
+  default     = "ubuntu"
+  description = "Default user for SSH access"
+}
+
+# Generate key pair if not exists: ssh-keygen -t rsa -b 4096
+variable "ssh_public_key_path" {
+  type        = string
+  default     = "~/.ssh/id_rsa.pub"
+  description = "Path to SSH public key for VM access"
+}
+
+variable "vm_image_family" {
+  type        = string
+  default     = "ubuntu-2404-lts"
+  description = "OS image family for the VM"
+}
+
+# --- VM sizing ---
+
+variable "vm_cores" {
+  type        = number
+  default     = 2
+  description = "Number of CPU cores"
+}
+
+variable "vm_memory" {
+  type        = number
+  default     = 1
+  description = "RAM in GB"
+}
+
+variable "vm_core_fraction" {
+  type        = number
+  default     = 20
+  description = "CPU core fraction (20% for free tier)"
+}
+
+variable "disk_size" {
+  type        = number
+  default     = 10
+  description = "Boot disk size in GB"
+}
+
+variable "disk_type" {
+  type        = string
+  default     = "network-hdd"
+  description = "Boot disk type"
+}
+
+# Source range allowed to reach the VM's SSH port (used by the SSH ingress rule).
+# WARNING: the default allows SSH from anywhere - change it to your IP for production!
+# Find your IP: curl ifconfig.me
+# Then set to: "YOUR_IP/32"
+variable "allow_ssh_from_cidr" {
+  description = "CIDR block allowed to SSH (your IP for security)"
+  type        = string
+  default     = "0.0.0.0/0"
+
+  # Fail at plan time on a malformed value instead of at apply time
+  validation {
+    condition     = can(cidrhost(var.allow_ssh_from_cidr, 0))
+    error_message = "The allow_ssh_from_cidr value must be a valid CIDR block, for example \"203.0.113.10/32\"."
+  }
+}