diff --git a/.github/workflows/ansible-deploy.yml b/.github/workflows/ansible-deploy.yml
new file mode 100644
index 0000000000..cda194c02a
--- /dev/null
+++ b/.github/workflows/ansible-deploy.yml
@@ -0,0 +1,88 @@
+# Lint the Ansible code on every matching push/PR; deploy and smoke-test the
+# application only on pushes.
+name: Ansible Deployment
+
+on:
+  push:
+    branches: [main, master, lab06]
+    paths:
+      - "ansible/**"
+      - ".github/workflows/ansible-deploy.yml"
+  pull_request:
+    branches: [main, master, lab06]
+    paths:
+      - "ansible/**"
+
+jobs:
+  lint:
+    name: Ansible Lint
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install dependencies
+        run: pip install ansible ansible-lint
+
+      - name: Run ansible-lint
+        # playbooks/ and roles/ live under ansible/, not the repository root.
+        working-directory: ansible
+        run: ansible-lint playbooks/ roles/
+
+  deploy:
+    name: Deploy Application
+    needs: lint
+    # Pull requests are lint-only: they must not reach the target VM.
+    if: github.event_name == 'push'
+    runs-on: ubuntu-latest
+    env:
+      # Passed through the environment instead of being interpolated into
+      # the scripts, so the value is never parsed as shell code.
+      VM_HOST: ${{ secrets.VM_HOST }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install Ansible
+        run: pip install ansible
+
+      - name: Setup SSH
+        env:
+          SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
+        run: |
+          mkdir -p ~/.ssh
+          chmod 700 ~/.ssh
+          # inventory/hosts.ini and ansible.cfg both point at id_ed25519.
+          printf '%s\n' "$SSH_PRIVATE_KEY" > ~/.ssh/id_ed25519
+          chmod 600 ~/.ssh/id_ed25519
+          ssh-keyscan -H "$VM_HOST" >> ~/.ssh/known_hosts
+
+      - name: Deploy with Ansible
+        working-directory: ansible
+        env:
+          ANSIBLE_VAULT_PASSWORD: ${{ secrets.ANSIBLE_VAULT_PASSWORD }}
+        run: |
+          vault_pass_file="$(mktemp)"
+          # Remove the password file even when ansible-playbook fails.
+          trap 'rm -f "$vault_pass_file"' EXIT
+          printf '%s\n' "$ANSIBLE_VAULT_PASSWORD" > "$vault_pass_file"
+          ansible-playbook playbooks/deploy.yml \
+            -i inventory/hosts.ini \
+            --vault-password-file "$vault_pass_file" \
+            --tags "app_deploy"
+
+      - name: Verify Deployment
+        run: |
+          sleep 10
+          curl -f "http://${VM_HOST}:8000"
+          curl -f "http://${VM_HOST}:8000/health"
diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml
new file mode 100644
index 0000000000..6db8c08313
--- /dev/null
+++ b/.github/workflows/python-ci.yml
@@ -0,0 +1,86 @@
+name:
Python CI - DevOps Info Service
+
+on:
+  push:
+    branches: ["**"]
+    tags:
+      - "v*.*.*"
+    paths:
+      - "app_python/**"
+      - ".github/workflows/python-ci.yml"
+  pull_request:
+    branches: [main]
+    paths:
+      - "app_python/**"
+
+jobs:
+  # Lint, unit-test (coverage gate: 60%) and Bandit-scan the service.
+  test:
+    name: Lint & Test
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: app_python
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+          cache: "pip"
+          # defaults.run only affects run steps, so these start at the repo root.
+          cache-dependency-path: |
+            app_python/requirements.txt
+            app_python/requirements-dev.txt
+
+      - name: Install dependencies
+        run: |
+          pip install -r requirements.txt
+          pip install -r requirements-dev.txt
+
+      - name: Run linter (ruff)
+        run: |
+          pip install ruff
+          ruff check .
+
+      - name: Run tests with coverage
+        run: pytest --cov=. --cov-report=term-missing --cov-fail-under=60
+
+      - name: Install Bandit
+        run: pip install bandit
+
+      - name: Run Bandit security scan
+        run: bandit -r . -ll -s B101,B104
+
+  # Build and publish the image; runs only for tag pushes, after tests pass.
+  docker:
+    name: Build & Push Docker Image
+    needs: test
+    if: startsWith(github.ref, 'refs/tags/')
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Extract version
+        id: vars
+        # Strip the "refs/tags/v" prefix: tag v1.2.3 yields VERSION=1.2.3.
+        run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> "$GITHUB_OUTPUT"
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and push image
+        uses: docker/build-push-action@v5
+        with:
+          context: ./app_python
+          push: true
+          tags: |
+            ${{ secrets.DOCKERHUB_USERNAME }}/devops-info-service:${{ steps.vars.outputs.VERSION }}
+            ${{ secrets.DOCKERHUB_USERNAME }}/devops-info-service:latest
diff --git a/.gitignore b/.gitignore
index 30d74d2584..84e1267e96 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,7 @@
-test
\ No newline at end of file
+test
+key.json
+.terraform/
+*.tfstate
+*.tfstate.*
+terraform.tfvars
+.history/
\ No newline at end of file
diff --git a/README.md b/README.md
index 371d51f456..c1d5964260 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,6 @@
+![CI](https://github.com/essence-666/DevOps-Core-Course/actions/workflows/python-ci.yml/badge.svg)
+
+
 # DevOps Engineering: Core Practices
 
 [![Labs](https://img.shields.io/badge/Labs-18-blue)](#labs)
diff --git a/ansible/.gitignore b/ansible/.gitignore
new file mode 100644
index 0000000000..1f800a5db5
--- /dev/null
+++ b/ansible/.gitignore
@@ -0,0 +1 @@
+.vault_pass
\ No newline at end of file
diff --git a/ansible/README.md b/ansible/README.md
new file mode 100644
index 0000000000..48564f422a
--- /dev/null
+++ b/ansible/README.md
@@ -0,0 +1 @@
+[![Ansible Deployment](https://github.com/essence-666/DevOps-Core-Course/actions/workflows/ansible-deploy.yml/badge.svg)](https://github.com/essence-666/DevOps-Core-Course/actions/workflows/ansible-deploy.yml)
\ No newline at end of file
diff --git a/ansible/ansible.cfg b/ansible/ansible.cfg
new file mode
100644 index 0000000000..ccbc1c50bd --- /dev/null +++ b/ansible/ansible.cfg @@ -0,0 +1,6 @@ +[defaults] +roles_path = ./roles:./playbooks/roles +inventory = ./inventory/hosts.ini +private_key_file = ~/.ssh/id_ed25519 +remote_user = ubuntu +host_key_checking = False \ No newline at end of file diff --git a/ansible/docs/LAB05.md b/ansible/docs/LAB05.md new file mode 100644 index 0000000000..91a5f1ce23 --- /dev/null +++ b/ansible/docs/LAB05.md @@ -0,0 +1,292 @@ +# LAB05: Ansible Implementation Documentation + +## 1. Architecture Overview + +**Ansible version used:** +``` +❯ ansible --version +ansible [core 2.18.7] + config file = None + configured module search path = ['/Users/e.s.belozerov/.ansible/plugins/modules', '/usr/share/ansible/plugins/modules'] + ansible python module location = /opt/homebrew/Cellar/ansible/11.8.0/libexec/lib/python3.13/site-packages/ansible + ansible collection location = /Users/e.s.belozerov/.ansible/collections:/usr/share/ansible/collections + executable location = /opt/homebrew/bin/ansible + python version = 3.13.9 (main, Oct 14 2025, 13:52:31) [Clang 17.0.0 (clang-1700.0.13.3)] (/opt/homebrew/Cellar/ansible/11.8.0/libexec/bin/python) + jinja version = 3.1.6 + libyaml = True +``` + +**Target VM OS and version:** +``` +❯ ansible web-1 -m setup -a "filter=ansible_distribution*" --ask-vault-pass +Vault password: +[WARNING]: Platform linux on host web-1 is using the discovered Python interpreter at /usr/bin/python3.10, but future installation of another Python interpreter +could change the meaning of that path. See https://docs.ansible.com/ansible-core/2.18/reference_appendices/interpreter_discovery.html for more information. 
+web-1 | SUCCESS => { + "ansible_facts": { + "ansible_distribution": "Ubuntu", + "ansible_distribution_file_parsed": true, + "ansible_distribution_file_path": "/etc/os-release", + "ansible_distribution_file_variety": "Debian", + "ansible_distribution_major_version": "22", + "ansible_distribution_release": "jammy", + "ansible_distribution_version": "22.04", + "discovered_interpreter_python": "/usr/bin/python3.10" + }, + "changed": false +} +``` + +**Role Structure:** +The project uses a modular role-based structure with three main roles: +- `common` - Base system configuration +- `docker` - Docker installation and setup +- `web_app` - Application deployment and container management + +**Why roles instead of monolithic playbooks?** +Roles enable code reuse, improve maintainability through separation of concerns, and allow for easier testing and collaboration. They provide a standardized structure that makes playbooks more readable and scalable. + +## 2. Roles Documentation + +### Role: common + +**Purpose:** Configures the base system with essential packages and updates. Ensures all nodes have consistent base configuration. + +**Variables:** +| Variable | Default | Description | +|----------|---------|-------------| +| `common_packages` | `['curl', 'wget', 'git', 'vim']` | Base packages to install | +| `update_cache` | `yes` | Whether to update apt cache | + +**Handlers:** +- None currently defined + +**Dependencies:** +- None (standalone role) + +### Role: docker + +**Purpose:** Installs and configures Docker CE on the target system, adds user to docker group, and ensures Docker service is running. 
+ +**Variables:** +| Variable | Default | Description | +|----------|---------|-------------| +| `docker_user` | `ubuntu` | User to add to docker group | +| `docker_compose_version` | `latest` | Docker Compose version | + +**Handlers:** +- `restart docker` - Restarts Docker service when configuration changes + +**Dependencies:** +- `common` (implicitly required for package management) + +### Role: web_app + +**Purpose:** Deploys the application by pulling from Docker Hub, running containers, and performing health checks. + +**Variables:** +| Variable | Source | Description | +|----------|--------|-------------| +| `dockerhub_username` | vault.yml | Docker Hub username | +| `dockerhub_password` | vault.yml | Docker Hub password/pat | +| `app_name` | vault.yml | Application name | +| `docker_image` | Derived | Full image path | +| `app_port` | vault.yml | Application port | +| `app_container_name` | Derived | Container name | + +**Handlers:** +- `restart app` - Restarts application container + +**Dependencies:** +- `docker` (requires Docker to be installed) + +## 3. Idempotency Demonstration + +**First provision.yml run:** +``` +❯ ansible-playbook playbooks/provision.yml --ask-vault-pass +Vault password: + +PLAY [Provision web servers] *************************************************************************************************************************************** + +TASK [Gathering Facts] ********************************************************************************************************************************************* +[WARNING]: Platform linux on host web-1 is using the discovered Python interpreter at /usr/bin/python3.10, but future installation of another Python interpreter +could change the meaning of that path. See https://docs.ansible.com/ansible-core/2.18/reference_appendices/interpreter_discovery.html for more information. 
+ok: [web-1] + +TASK [common : Update apt cache] *********************************************************************************************************************************** +changed: [web-1] + +TASK [common : Install common packages] **************************************************************************************************************************** +changed: [web-1] + +TASK [common : Set timezone] *************************************************************************************************************************************** +changed: [web-1] + +TASK [docker : Add Docker GPG key] ********************************************************************************************************************************* +changed: [web-1] + +TASK [docker : Add Docker repository] ****************************************************************************************************************************** +changed: [web-1] + +TASK [docker : Install Docker packages] **************************************************************************************************************************** +changed: [web-1] + +TASK [docker : Install python docker module] *********************************************************************************************************************** +changed: [web-1] + +TASK [docker : Ensure Docker is running] *************************************************************************************************************************** +ok: [web-1] + +TASK [docker : Add user to docker group] *************************************************************************************************************************** +changed: [web-1] + +PLAY RECAP ********************************************************************************************************************************************************* +web-1 : ok=10 changed=8 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 + +~/uni/DevOps-Core-Course/ansible lab04 ?2 ❯ 2m 43s 20:55:40 +``` + 
+**Second provision.yml run:** +``` +❯ ansible-playbook playbooks/provision.yml --ask-vault-pass +Vault password: + +PLAY [Provision web servers] *************************************************************************************************************************************** + +TASK [Gathering Facts] ********************************************************************************************************************************************* +[WARNING]: Platform linux on host web-1 is using the discovered Python interpreter at /usr/bin/python3.10, but future installation of another Python interpreter +could change the meaning of that path. See https://docs.ansible.com/ansible-core/2.18/reference_appendices/interpreter_discovery.html for more information. +ok: [web-1] + +TASK [common : Update apt cache] *********************************************************************************************************************************** +ok: [web-1] + +TASK [common : Install common packages] **************************************************************************************************************************** +ok: [web-1] + +TASK [common : Set timezone] *************************************************************************************************************************************** +ok: [web-1] + +TASK [docker : Add Docker GPG key] ********************************************************************************************************************************* +ok: [web-1] + +TASK [docker : Add Docker repository] ****************************************************************************************************************************** +ok: [web-1] + +TASK [docker : Install Docker packages] **************************************************************************************************************************** +ok: [web-1] + +TASK [docker : Install python docker module] 
*********************************************************************************************************************** +ok: [web-1] + +TASK [docker : Ensure Docker is running] *************************************************************************************************************************** +ok: [web-1] + +TASK [docker : Add user to docker group] *************************************************************************************************************************** +ok: [web-1] + +PLAY RECAP ********************************************************************************************************************************************************* +web-1 : ok=10 changed=0 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 + +~/uni/DevOps-Core-Course/ansible lab04 ?2 ❯ 25s 20:56:32 +``` + +**Analysis:** +During the first run, tasks such as package installation, Docker installation, and user modifications were executed because the desired state was not yet achieved. The second run showed "ok" status with no changes because the system already matched the desired state. + +**What makes your roles idempotent?** +Tasks are designed to check the current state before making changes. For example, package installation uses apt module which checks if packages are already installed, and service tasks check if services are already running before attempting to start them. + +## 4. Ansible Vault Usage + +**Secure Credential Storage:** +Sensitive information like Docker Hub credentials are stored in encrypted vault.yml files rather than plain text in playbooks or roles. + +**Vault Password Management Strategy:** +Vault password is provided interactively using `--ask-vault-pass` flag during playbook execution, ensuring credentials never persist on disk. 
+ +**Example of Encrypted File:** +``` +❯ cat inventory/group_vars/all.yml +$ANSIBLE_VAULT;1.1;AES256 +64656436323738356430623061373935353264343562343364303864313261326238306431646639 +6338353731363063396435633030633162363231653563330a626231626138396463623334393364 +63336536613761326430636537386137313335666366333131373966636362376530383061356636 +3934373733376132610a323137656630383163373736623738376134396535303865646439623539 +61393163646231333234366131653763353666356466346161363133393238323865323036336437 +30336238663063636135386363356133336133393563393637326439633761316661316235643435 +33396364613938663762643335613061326631346437653339626636353866653735653766383365 +36656438343065653735353539366266353035316166356238336236663336313733666634326537 +33343866656263303865333436626139316238663433636334316366306433616463363932313734 +37663331613834363831333534353230336363653435343833353966393731663663393436633561 +63656365346631613263636466303363346235656534333765626565353331343365326463333532 +30623364646335383362383633636536653662323662646233323362653036626534376136656635 +30353061653761626666636361363363383033363663326535653262663838653261653534636636 +31633234353238343964393766343735633339393430313332613334653063633537316632366430 +61323537313930343539383261346435316663393762353465376333353239316463623433303564 +35636535623264333865393332616566633864656330313537356630646337313533373061393463 +30303733666536376536613862346333653266393833303662356132366136376166 +~/uni/DevOps-Core-Course/ansible lab04 ?2 ❯ 20:57:05 +``` + +**Why Ansible Vault is important:** +Ansible Vault prevents sensitive data exposure by encrypting credentials at rest while allowing them to be used securely during automation. This is essential for maintaining security in version control systems and shared environments. + +## 5. 
Deployment Verification + +**deploy.yml run:** +``` +❯ ansible-playbook playbooks/deploy.yml --ask-vault-pass +Vault password: + +PLAY [Deploy application] ****************************************************************************************************************************************** + +TASK [Gathering Facts] ********************************************************************************************************************************************* +[WARNING]: Platform linux on host web-1 is using the discovered Python interpreter at /usr/bin/python3.10, but future installation of another Python interpreter +could change the meaning of that path. See https://docs.ansible.com/ansible-core/2.18/reference_appendices/interpreter_discovery.html for more information. +ok: [web-1] + +TASK [app_deploy : Login to Docker Hub] **************************************************************************************************************************** +changed: [web-1] + +TASK [app_deploy : Pull application image] ************************************************************************************************************************* +changed: [web-1] + +TASK [app_deploy : Run application container] ********************************************************************************************************************** +changed: [web-1] + +TASK [app_deploy : Wait for app port] ****************************************************************************************************************************** +ok: [web-1] + +PLAY RECAP ********************************************************************************************************************************************************* +web-1 : ok=5 changed=3 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 + +~/uni/DevOps-Core-Course/ansible lab04 ?2 ❯ 34s 20:58:18 +``` + +**Container status (docker ps):** + +![docker ps](./screenshots/docker-ps.png) + +**Health check verification:** + 
+![healthcheck](./screenshots/healthcheck.png) + + +## 6. Key Decisions + +**Why use roles instead of plain playbooks?** +Roles provide a standardized, reusable structure that separates concerns and makes automation more maintainable as infrastructure grows. + +**How do roles improve reusability?** +Roles can be shared across different playbooks and projects, parameterized with variables, and versioned independently for consistent deployments. + +**What makes a task idempotent?** +A task is idempotent when it checks the current state before making changes and only applies modifications if the desired state differs from the actual state. + +**How do handlers improve efficiency?** +Handlers run only when notified and execute once at the end of the play, preventing unnecessary service restarts and improving playbook performance. + +**Why is Ansible Vault necessary?** +Ansible Vault is necessary to securely store and manage sensitive information like passwords and API keys within version control systems without exposing them to unauthorized users. 
diff --git a/ansible/docs/screenshots/docker-ps.png b/ansible/docs/screenshots/docker-ps.png new file mode 100644 index 0000000000..81ff0c637d Binary files /dev/null and b/ansible/docs/screenshots/docker-ps.png differ diff --git a/ansible/docs/screenshots/healthcheck.png b/ansible/docs/screenshots/healthcheck.png new file mode 100644 index 0000000000..0d00b01128 Binary files /dev/null and b/ansible/docs/screenshots/healthcheck.png differ diff --git a/ansible/inventory/group_vars/all.yml b/ansible/inventory/group_vars/all.yml new file mode 100644 index 0000000000..e7a8bd6d43 --- /dev/null +++ b/ansible/inventory/group_vars/all.yml @@ -0,0 +1,26 @@ +$ANSIBLE_VAULT;1.1;AES256 +37383365633731333466383732303066323530666333363835313030623437613834623037393063 +3866336338396436373361366466633334343533656465360a326138323161373533346137613638 +32653736343635383634353031383635373231626636333334353765346664356166326537653866 +3666613361643535360a333666343465653938316663623934316532316635363539336466393965 +61633337373934316132346630303838313135306533613664623335343861306565383238323632 +30326666653936613136323262663533306134346433616433373862613562646339633065313135 +38373230393864353866326130646666323865313164373666326536623461613263633930376131 +30393737646433313336623636663433366639326533303535393264373736393365306237333131 +35333837386562613665316637333665336262393733353961336331316361666362623539373062 +32313634623838343764646530393363343762333635366636656430653162323231383238313231 +66366262356364373036343033613133303533616130643938316134343434653232633464303035 +39656335363031386461363732326538663365653062363436666235326331346136303638383335 +64613636303638316130306264646232623138316666303230306438383064353331363436373364 +39313962626231316535363965663437386235313262666366623934376661363034633439363533 +66316530373831333565323034623164366466636433646664366435623631613432633330656430 +32623661316262633031363934643965626236343031356631613762616633616662633135396131 
+66336166373732633134663562303465363938346532313638343334373261653732656338363062
+36643362616661386632306536336639623437333661356139633861313933376463623030393831
+32383534363738633531336636646430383838656430636262633634623539333136323634383639
+61626138393731386462633032653635383531346638313264336462643433343335373737306436
+66386464666333313231633865303831376564376635393264383662626465366239643165356666
+31363162393634323539313739396263623135616238356664313634313532663138653831343764
+33623465346538653739303934306262366561303534316265353066343064626335333937623934
+63303139323863633262323538613334326239373237396262383564393933663232346561653364
+6332
diff --git a/ansible/inventory/hosts.ini b/ansible/inventory/hosts.ini
new file mode 100644
index 0000000000..5f54949ac1
--- /dev/null
+++ b/ansible/inventory/hosts.ini
@@ -0,0 +1,2 @@
+[webservers]
+web-1 ansible_host=89.169.134.204 ansible_user=ubuntu ansible_ssh_private_key_file=~/.ssh/id_ed25519
\ No newline at end of file
diff --git a/ansible/playbooks/deploy-monitoring.yml b/ansible/playbooks/deploy-monitoring.yml
new file mode 100644
index 0000000000..21dd84da4b
--- /dev/null
+++ b/ansible/playbooks/deploy-monitoring.yml
@@ -0,0 +1,8 @@
+---
+# Applies the docker role, then the monitoring role, to every inventory host.
+- name: Deploy full observability stack (Loki + Prometheus + Grafana)
+  hosts: all
+  become: true
+  roles:
+    - role: docker
+    - role: monitoring
diff --git a/ansible/playbooks/deploy.yml b/ansible/playbooks/deploy.yml
new file mode 100644
index 0000000000..1010c0871a
--- /dev/null
+++ b/ansible/playbooks/deploy.yml
@@ -0,0 +1,10 @@
+---
+# Applies the web_app role to the webservers group.
+- name: Deploy application
+  hosts: webservers
+  become: true
+  vars_files:
+    # Vault-encrypted: run with --ask-vault-pass or --vault-password-file.
+    - ../inventory/group_vars/all.yml
+  roles:
+    - web_app
diff --git a/ansible/playbooks/provision.yml b/ansible/playbooks/provision.yml
new file mode 100644
index 0000000000..7cc2e6678d
--- /dev/null
+++ b/ansible/playbooks/provision.yml
@@ -0,0 +1,8 @@
+---
+# Base provisioning for the webservers group: common role first, then docker.
+- name: Provision web servers
+  hosts: webservers
+  become: true
+  roles:
+    - common
+    - docker
diff
--git a/ansible/roles/common/defaults/main.yml b/ansible/roles/common/defaults/main.yml
new file mode 100644
index 0000000000..4c71fcc3b6
--- /dev/null
+++ b/ansible/roles/common/defaults/main.yml
@@ -0,0 +1,17 @@
+---
+# Packages installed by the "Install common packages" task.
+common_packages:
+  - python3-pip
+  - curl
+  - git
+  - vim
+  - htop
+  - ca-certificates
+  - gnupg
+  - lsb-release
+
+# Timezone name passed to community.general.timezone.
+common_timezone: "UTC"
+
+# Account created and added to the sudo group by the "users" block.
+common_devops_user: devops
diff --git a/ansible/roles/common/tasks/main.yml b/ansible/roles/common/tasks/main.yml
new file mode 100644
index 0000000000..22a9f64927
--- /dev/null
+++ b/ansible/roles/common/tasks/main.yml
@@ -0,0 +1,71 @@
+---
+# Base packages and timezone. If any step fails, the rescue section refreshes
+# the apt cache unconditionally and repeats the remaining steps once.
+- name: Install and configure common packages
+  become: true
+  tags:
+    - packages
+  block:
+    - name: Update apt cache
+      ansible.builtin.apt:
+        update_cache: true
+        cache_valid_time: 3600
+
+    - name: Install common packages
+      ansible.builtin.apt:
+        name: "{{ common_packages }}"
+        state: present
+
+    - name: Set timezone
+      community.general.timezone:
+        name: "{{ common_timezone }}"
+
+  rescue:
+    # No cache_valid_time here: a recently updated but broken cache must
+    # still be refreshed before the retry.
+    - name: Fix apt cache and retry
+      ansible.builtin.apt:
+        update_cache: true
+
+    - name: Retry common packages install
+      ansible.builtin.apt:
+        name: "{{ common_packages }}"
+        state: present
+
+    - name: Retry setting timezone
+      community.general.timezone:
+        name: "{{ common_timezone }}"
+
+  always:
+    # Preserving the timestamps keeps this marker from reporting "changed"
+    # on every run.
+    - name: Log package block completion
+      ansible.builtin.file:
+        path: /tmp/common_packages_done.log
+        state: touch
+        mode: "0644"
+        modification_time: preserve
+        access_time: preserve
+
+# Creates the devops account and adds it to the sudo group.
+- name: Create system users
+  become: true
+  tags:
+    - users
+  block:
+    - name: Ensure devops user exists
+      ansible.builtin.user:
+        name: "{{ common_devops_user }}"
+        shell: /bin/bash
+        groups: sudo
+        append: true
+        state: present
+
+  always:
+    - name: Log user block completion
+      ansible.builtin.file:
+        path: /tmp/common_users_done.log
+        state: touch
+        mode: "0644"
+        modification_time: preserve
+        access_time: preserve
diff --git a/ansible/roles/docker/defaults/main.yml b/ansible/roles/docker/defaults/main.yml
new file mode 100644
index 0000000000..ac1e236676
--- /dev/null
+++ b/ansible/roles/docker/defaults/main.yml
@@ -0,0 +1,8 @@
+---
+# User added to the docker group by the "Add user to docker group" task.
+docker_user: ubuntu
+# Packages installed by the "Install Docker packages" task.
+docker_packages:
+  - docker-ce
+  - docker-ce-cli
+  - containerd.io
diff --git a/ansible/roles/docker/handlers/main.yml
b/ansible/roles/docker/handlers/main.yml
new file mode 100644
index 0000000000..37b5807a57
--- /dev/null
+++ b/ansible/roles/docker/handlers/main.yml
@@ -0,0 +1,6 @@
+---
+# NOTE(review): no task in this role's tasks/main.yml notifies this handler.
+- name: Restart docker
+  ansible.builtin.systemd:
+    name: docker
+    state: restarted
diff --git a/ansible/roles/docker/tasks/main.yml b/ansible/roles/docker/tasks/main.yml
new file mode 100644
index 0000000000..62631b8374
--- /dev/null
+++ b/ansible/roles/docker/tasks/main.yml
@@ -0,0 +1,76 @@
+---
+# =========================
+# Docker Installation
+# =========================
+- name: Install Docker
+  become: true
+  tags:
+    - docker
+    - docker_install
+  block:
+    - name: Add Docker GPG key
+      ansible.builtin.apt_key:
+        url: https://download.docker.com/linux/ubuntu/gpg
+        state: present
+
+    - name: Add Docker repository
+      ansible.builtin.apt_repository:
+        repo: "deb https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable"
+        state: present
+
+    - name: Install Docker packages
+      ansible.builtin.apt:
+        name: "{{ docker_packages }}"
+        state: present
+        update_cache: true
+
+    - name: Install python docker module
+      ansible.builtin.apt:
+        name: python3-docker
+        state: present
+
+  rescue:
+    - name: Wait before retry
+      ansible.builtin.pause:
+        seconds: 10
+
+    - name: Retry apt update
+      ansible.builtin.apt:
+        update_cache: true
+
+    - name: Retry Docker packages install
+      ansible.builtin.apt:
+        name: "{{ docker_packages }}"
+        state: present
+
+    # The block may have failed before (or at) this install, so repeat it too.
+    - name: Retry python docker module install
+      ansible.builtin.apt:
+        name: python3-docker
+        state: present
+
+  always:
+    - name: Ensure Docker service is enabled
+      ansible.builtin.systemd:
+        name: docker
+        enabled: true
+
+# =========================
+# Docker Configuration
+# =========================
+- name: Configure Docker
+  become: true
+  tags:
+    - docker
+    - docker_config
+  block:
+    - name: Ensure Docker is running
+      ansible.builtin.systemd:
+        name: docker
+        state: started
+
+    - name: Add user to docker group
+      ansible.builtin.user:
+        name: "{{ docker_user }}"
+        groups: docker
+        append: true
diff --git
a/ansible/roles/monitoring/defaults/main.yml b/ansible/roles/monitoring/defaults/main.yml
new file mode 100644
index 0000000000..29927c41dd
--- /dev/null
+++ b/ansible/roles/monitoring/defaults/main.yml
@@ -0,0 +1,37 @@
+---
+# Monitoring stack directory on the target host
+monitoring_dir: "/opt/monitoring"
+
+# Loki
+loki_version: "3.0.0"
+loki_port: 3100
+loki_retention_hours: 168  # 168 h = 7 days
+
+# Promtail
+promtail_version: "3.0.0"
+promtail_port: 9080
+
+# Prometheus
+prometheus_version: "3.9.0"
+prometheus_port: 9090
+prometheus_retention_days: 15
+prometheus_retention_size: "10GB"
+prometheus_scrape_interval: "15s"
+
+# Grafana
+grafana_version: "12.3.1"
+grafana_port: 3000
+grafana_admin_user: "admin"
+grafana_admin_password: "changeme"  # NOTE(review): placeholder default; override (e.g. from Ansible Vault) before deploying
+
+# Scrape targets (used by prometheus.yml.j2 template)
+prometheus_targets:
+  - job: "prometheus"
+    targets: ["localhost:9090"]
+  - job: "loki"
+    targets: ["loki:3100"]
+  - job: "grafana"
+    targets: ["grafana:3000"]
+  - job: "app"
+    targets: ["app-python:8000"]
+    path: "/metrics"  # presumably this job's metrics path; confirm against prometheus.yml.j2
diff --git a/ansible/roles/monitoring/files/grafana-app-dashboard.json b/ansible/roles/monitoring/files/grafana-app-dashboard.json
new file mode 100644
index 0000000000..e4ab641555
--- /dev/null
+++ b/ansible/roles/monitoring/files/grafana-app-dashboard.json
@@ -0,0 +1,121 @@
+{
+  "title": "DevOps Info App — Metrics",
+  "uid": "devops-app-metrics",
+  "schemaVersion": 38,
+  "version": 1,
+  "refresh": "30s",
+  "time": { "from": "now-1h", "to": "now" },
+  "panels": [
+    {
+      "id": 1,
+      "title": "Request Rate by Endpoint",
+      "type": "timeseries",
+      "gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 },
+      "targets": [
+        {
+          "datasource": { "type": "prometheus", "uid": "prometheus" },
+          "expr": "sum by (endpoint) (rate(http_requests_total[5m]))",
+          "legendFormat": "{{endpoint}}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": { "unit": "reqps", "custom": { "lineWidth": 2 } }
+      }
+    },
+    {
+      "id": 2,
+      "title": "Error Rate (5xx/s)",
+      "type": "timeseries",
+      "gridPos": { "x": 12,
"y": 0, "w": 12, "h": 8 }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m]))", + "legendFormat": "5xx errors/s" + } + ], + "fieldConfig": { + "defaults": { "unit": "reqps", "color": { "fixedColor": "red", "mode": "fixed" } } + } + }, + { + "id": 3, + "title": "Request Duration p95", + "type": "timeseries", + "gridPos": { "x": 0, "y": 8, "w": 12, "h": 8 }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))", + "legendFormat": "p95 latency" + } + ], + "fieldConfig": { + "defaults": { "unit": "s" } + } + }, + { + "id": 4, + "title": "Request Duration Heatmap", + "type": "heatmap", + "gridPos": { "x": 12, "y": 8, "w": 12, "h": 8 }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "rate(http_request_duration_seconds_bucket[5m])", + "legendFormat": "{{le}}", + "format": "heatmap" + } + ] + }, + { + "id": 5, + "title": "Active Requests", + "type": "gauge", + "gridPos": { "x": 0, "y": 16, "w": 6, "h": 6 }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "http_requests_in_progress", + "legendFormat": "in progress" + } + ], + "fieldConfig": { + "defaults": { "unit": "short", "min": 0, "max": 100 } + } + }, + { + "id": 6, + "title": "Status Code Distribution", + "type": "piechart", + "gridPos": { "x": 6, "y": 16, "w": 9, "h": 6 }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum by (status_code) (rate(http_requests_total[5m]))", + "legendFormat": "HTTP {{status_code}}" + } + ] + }, + { + "id": 7, + "title": "App Uptime", + "type": "stat", + "gridPos": { "x": 15, "y": 16, "w": 9, "h": 6 }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "up{job=\"app\"}", + "legendFormat": "app-python" + } 
+ ], + "fieldConfig": { + "defaults": { + "mappings": [ + { "type": "value", "options": { "0": { "text": "DOWN", "color": "red" }, "1": { "text": "UP", "color": "green" } } } + ] + } + } + } + ] +} diff --git a/ansible/roles/monitoring/files/grafana-dashboards-provisioning.yml b/ansible/roles/monitoring/files/grafana-dashboards-provisioning.yml new file mode 100644 index 0000000000..b0f4451550 --- /dev/null +++ b/ansible/roles/monitoring/files/grafana-dashboards-provisioning.yml @@ -0,0 +1,11 @@ +apiVersion: 1 + +providers: + - name: 'default' + orgId: 1 + folder: 'DevOps' + type: file + disableDeletion: false + updateIntervalSeconds: 30 + options: + path: /var/lib/grafana/dashboards diff --git a/ansible/roles/monitoring/files/grafana-datasource-loki.yml b/ansible/roles/monitoring/files/grafana-datasource-loki.yml new file mode 100644 index 0000000000..641f2b732e --- /dev/null +++ b/ansible/roles/monitoring/files/grafana-datasource-loki.yml @@ -0,0 +1,8 @@ +apiVersion: 1 + +datasources: + - name: Loki + type: loki + access: proxy + url: http://loki:3100 + isDefault: true diff --git a/ansible/roles/monitoring/files/grafana-datasource-prometheus.yml b/ansible/roles/monitoring/files/grafana-datasource-prometheus.yml new file mode 100644 index 0000000000..8ad0e3bb8c --- /dev/null +++ b/ansible/roles/monitoring/files/grafana-datasource-prometheus.yml @@ -0,0 +1,10 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: false + jsonData: + timeInterval: '15s' diff --git a/ansible/roles/monitoring/files/loki-config.yml b/ansible/roles/monitoring/files/loki-config.yml new file mode 100644 index 0000000000..a615ada40a --- /dev/null +++ b/ansible/roles/monitoring/files/loki-config.yml @@ -0,0 +1,40 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + +common: + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 
+ ring: + kvstore: + store: inmemory + +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +storage_config: + filesystem: + directory: /loki/index + + tsdb_shipper: + active_index_directory: /loki/tsdb-index + cache_location: /loki/tsdb-cache + +limits_config: + retention_period: 168h + +compactor: + working_directory: /loki/compactor + compaction_interval: 10m diff --git a/ansible/roles/monitoring/files/promtail-config.yml b/ansible/roles/monitoring/files/promtail-config.yml new file mode 100644 index 0000000000..aa666fd358 --- /dev/null +++ b/ansible/roles/monitoring/files/promtail-config.yml @@ -0,0 +1,26 @@ +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + relabel_configs: + - source_labels: ['__meta_docker_container_name'] + regex: '/(.*)' + target_label: 'container' + - source_labels: ['__meta_docker_container_log_stream'] + target_label: 'stream' + - source_labels: ['__meta_docker_container_label_logging'] + regex: 'promtail' + action: keep + - source_labels: ['__meta_docker_container_label_app'] + target_label: 'app' diff --git a/ansible/roles/monitoring/handlers/main.yml b/ansible/roles/monitoring/handlers/main.yml new file mode 100644 index 0000000000..303816d8d5 --- /dev/null +++ b/ansible/roles/monitoring/handlers/main.yml @@ -0,0 +1,6 @@ +--- +- name: reload prometheus + community.docker.docker_container_exec: + container: monitoring-prometheus-1 + command: kill -HUP 1 + ignore_errors: true diff --git a/ansible/roles/monitoring/tasks/main.yml b/ansible/roles/monitoring/tasks/main.yml new file mode 100644 index 0000000000..a92e5cc457 --- /dev/null +++ b/ansible/roles/monitoring/tasks/main.yml @@ -0,0 +1,69 @@ +--- +- name: Create 
monitoring directory structure + ansible.builtin.file: + path: "{{ item }}" + state: directory + mode: '0755' + loop: + - "{{ monitoring_dir }}" + - "{{ monitoring_dir }}/loki" + - "{{ monitoring_dir }}/promtail" + - "{{ monitoring_dir }}/prometheus" + - "{{ monitoring_dir }}/grafana/provisioning/datasources" + - "{{ monitoring_dir }}/grafana/provisioning/dashboards" + - "{{ monitoring_dir }}/grafana/dashboards" + +- name: Copy Loki config + ansible.builtin.copy: + src: loki-config.yml + dest: "{{ monitoring_dir }}/loki/config.yml" + mode: '0644' + +- name: Copy Promtail config + ansible.builtin.copy: + src: promtail-config.yml + dest: "{{ monitoring_dir }}/promtail/config.yml" + mode: '0644' + +- name: Template Prometheus config + ansible.builtin.template: + src: prometheus.yml.j2 + dest: "{{ monitoring_dir }}/prometheus/prometheus.yml" + mode: '0644' + notify: reload prometheus + +- name: Copy Grafana datasource provisioning (Prometheus) + ansible.builtin.copy: + src: grafana-datasource-prometheus.yml + dest: "{{ monitoring_dir }}/grafana/provisioning/datasources/prometheus.yml" + mode: '0644' + +- name: Copy Grafana datasource provisioning (Loki) + ansible.builtin.copy: + src: grafana-datasource-loki.yml + dest: "{{ monitoring_dir }}/grafana/provisioning/datasources/loki.yml" + mode: '0644' + +- name: Copy Grafana dashboard provisioning config + ansible.builtin.copy: + src: grafana-dashboards-provisioning.yml + dest: "{{ monitoring_dir }}/grafana/provisioning/dashboards/dashboards.yml" + mode: '0644' + +- name: Copy Grafana app metrics dashboard + ansible.builtin.copy: + src: grafana-app-dashboard.json + dest: "{{ monitoring_dir }}/grafana/dashboards/app-metrics.json" + mode: '0644' + +- name: Template docker-compose.yml + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{ monitoring_dir }}/docker-compose.yml" + mode: '0644' + +- name: Deploy monitoring stack + community.docker.docker_compose_v2: + project_src: "{{ monitoring_dir }}" + state: present + pull: always diff --git a/ansible/roles/monitoring/templates/docker-compose.yml.j2 
b/ansible/roles/monitoring/templates/docker-compose.yml.j2 new file mode 100644 index 0000000000..6384ae828e --- /dev/null +++ b/ansible/roles/monitoring/templates/docker-compose.yml.j2 @@ -0,0 +1,126 @@ +version: '3.8' + +networks: + logging: + driver: bridge + +volumes: + loki-data: + grafana-data: + prometheus-data: + +services: + loki: + image: grafana/loki:{{ loki_version }} + ports: + - "{{ loki_port }}:{{ loki_port }}" + volumes: + - {{ monitoring_dir }}/loki/config.yml:/etc/loki/config.yml + - loki-data:/loki + command: -config.file=/etc/loki/config.yml + networks: + - logging + deploy: + resources: + limits: + cpus: '1.0' + memory: 1G + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:{{ loki_port }}/ready || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s + + promtail: + image: grafana/promtail:{{ promtail_version }} + ports: + - "{{ promtail_port }}:{{ promtail_port }}" + volumes: + - {{ monitoring_dir }}/promtail/config.yml:/etc/promtail/config.yml + - /var/lib/docker/containers:/var/lib/docker/containers:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + command: -config.file=/etc/promtail/config.yml + networks: + - logging + depends_on: + - loki + deploy: + resources: + limits: + cpus: '0.5' + memory: 256M + + prometheus: + image: prom/prometheus:v{{ prometheus_version }} + ports: + - "{{ prometheus_port }}:{{ prometheus_port }}" + volumes: + - {{ monitoring_dir }}/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml + - prometheus-data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.retention.time={{ prometheus_retention_days }}d' + - '--storage.tsdb.retention.size={{ prometheus_retention_size }}' + networks: + - logging + deploy: + resources: + limits: + cpus: '1.0' + memory: 1G + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:{{ prometheus_port }}/-/healthy || exit 1"] + interval: 10s + 
timeout: 5s + retries: 5 + start_period: 15s + + grafana: + image: grafana/grafana:{{ grafana_version }} + ports: + - "{{ grafana_port }}:{{ grafana_port }}" + volumes: + - grafana-data:/var/lib/grafana + - {{ monitoring_dir }}/grafana/provisioning:/etc/grafana/provisioning + - {{ monitoring_dir }}/grafana/dashboards:/var/lib/grafana/dashboards + networks: + - logging + environment: + GF_SECURITY_ADMIN_USER: "{{ grafana_admin_user }}" + GF_SECURITY_ADMIN_PASSWORD: "{{ grafana_admin_password }}" + depends_on: + - prometheus + - loki + deploy: + resources: + limits: + cpus: '0.5' + memory: 512M + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:{{ grafana_port }}/api/health || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 30s + + app-python: + image: "{{ app_python_image | default('app-python:latest') }}" + ports: + - "8000:8000" + networks: + - logging + labels: + logging: "promtail" + app: "devops-python" + deploy: + resources: + limits: + cpus: '0.5' + memory: 256M + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 30s diff --git a/ansible/roles/monitoring/templates/prometheus.yml.j2 b/ansible/roles/monitoring/templates/prometheus.yml.j2 new file mode 100644 index 0000000000..645dd11b9c --- /dev/null +++ b/ansible/roles/monitoring/templates/prometheus.yml.j2 @@ -0,0 +1,13 @@ +global: + scrape_interval: {{ prometheus_scrape_interval }} + evaluation_interval: {{ prometheus_scrape_interval }} + +scrape_configs: +{% for target in prometheus_targets %} + - job_name: '{{ target.job }}' + static_configs: + - targets: {{ target.targets | to_json }} +{% if target.path is defined %} + metrics_path: '{{ target.path }}' +{% endif %} +{% endfor %} diff --git a/ansible/roles/web_app/defaults/main.yml b/ansible/roles/web_app/defaults/main.yml new file mode 100644 index 0000000000..b82c6b0531 --- /dev/null +++ b/ansible/roles/web_app/defaults/main.yml 
@@ -0,0 +1,7 @@ +--- +restart_policy: unless-stopped +app_env: {} + +# Wipe Logic Control +# Set to true to remove application completely +web_app_wipe: false \ No newline at end of file diff --git a/ansible/roles/web_app/meta/main.yml b/ansible/roles/web_app/meta/main.yml new file mode 100644 index 0000000000..fc95875336 --- /dev/null +++ b/ansible/roles/web_app/meta/main.yml @@ -0,0 +1,3 @@ +--- +dependencies: + - role: docker \ No newline at end of file diff --git a/ansible/roles/web_app/tasks/main.yml b/ansible/roles/web_app/tasks/main.yml new file mode 100644 index 0000000000..f3d4c68b63 --- /dev/null +++ b/ansible/roles/web_app/tasks/main.yml @@ -0,0 +1,41 @@ +--- +# Wipe logic runs first (when explicitly requested) +- name: Include wipe tasks + ansible.builtin.include_tasks: wipe.yml + tags: + - web_app_wipe + +- name: Deploy application with Docker Compose + block: + + - name: Create application directory + ansible.builtin.file: + path: "{{ compose_project_dir }}" + state: directory + mode: '0755' + + - name: Template docker-compose.yml + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{ compose_project_dir }}/docker-compose.yml" + mode: '0644' + + - name: Deploy application via Docker Compose + ansible.builtin.command: + cmd: docker compose up -d + chdir: "{{ compose_project_dir }}" + + rescue: + + - name: Log deployment failure + ansible.builtin.debug: + msg: "Docker Compose deployment failed!" + + # A rescue section that finishes cleanly marks the block as successful, so fail explicitly to keep the play (and CI) red + - name: Fail the play after logging the deployment failure + ansible.builtin.fail: + msg: "Docker Compose deployment failed in task '{{ ansible_failed_task.name }}'" 
+ + tags: + - web_app_deploy + - compose \ No newline at end of file diff --git a/ansible/roles/web_app/tasks/wipe.yml b/ansible/roles/web_app/tasks/wipe.yml new file mode 100644 index 0000000000..8a8e612bb2 --- /dev/null +++ b/ansible/roles/web_app/tasks/wipe.yml @@ -0,0 +1,29 @@ +--- +- name: Wipe web application + block: + + - name: Stop and remove containers + ansible.builtin.command: + cmd: docker compose down + chdir: "{{ compose_project_dir }}" + ignore_errors: true + + - name: Remove docker-compose file + ansible.builtin.file: + path: "{{ compose_project_dir }}/docker-compose.yml" + state: absent + ignore_errors: true + + - name: Remove application directory + ansible.builtin.file: + path: "{{ compose_project_dir }}" + state: absent + ignore_errors: true + + - name: Log wipe completion + ansible.builtin.debug: + msg: "Application {{ web_app_name }} wiped successfully" + + when: web_app_wipe | bool + tags: + - web_app_wipe \ No newline at end of file diff --git a/ansible/roles/web_app/templates/docker-compose.yml.j2 b/ansible/roles/web_app/templates/docker-compose.yml.j2 new file mode 100644 index 0000000000..44c2473965 --- /dev/null +++ b/ansible/roles/web_app/templates/docker-compose.yml.j2 @@ -0,0 +1,22 @@ +version: "{{ docker_compose_version }}" + +services: + {{ web_app_name }}: + image: "{{ docker_image }}:{{ docker_image_tag }}" + container_name: {{ web_app_name }} + + ports: + - "{{ web_app_port }}:{{ web_app_internal_port }}" + + environment: + APP_SECRET_KEY: "{{ web_app_secret_key }}" + + # Restart policy is taken from the role variable restart_policy (role default: unless-stopped) + restart: "{{ restart_policy }}" + + networks: + - app_network + +networks: + app_network: + driver: bridge \ No newline at end of file diff --git a/app_go/.gitignore b/app_go/.gitignore new file mode 100644 index 0000000000..e69de29bb2 diff --git a/app_go/Dockerfile b/app_go/Dockerfile new file mode 100644 index 0000000000..1e5f715386 --- /dev/null +++ b/app_go/Dockerfile @@ -0,0 +1,25 @@ +# -------- Build stage -------- +FROM golang:1.25-alpine AS builder + +WORKDIR /app 
+COPY go.mod ./ +RUN go mod download + +COPY . . +RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o app + +# -------- Runtime stage -------- +FROM alpine:latest + +RUN addgroup -S appgroup && adduser -S appuser -G appgroup + +WORKDIR /app +COPY --from=builder /app/app . + +RUN chown appuser:appgroup /app/app +RUN mkdir -p /data && chown appuser:appgroup /data + +USER appuser + +EXPOSE 8001 +CMD ["./app"] diff --git a/app_go/README.md b/app_go/README.md new file mode 100644 index 0000000000..4b112918e9 --- /dev/null +++ b/app_go/README.md @@ -0,0 +1,61 @@ +# DevOps Info Service (Go) + +## Overview + +DevOps Info Service is a Go-based web application that provides detailed +information about the service itself, system environment, runtime status, and +incoming HTTP requests. + +The application is implemented in a single source file for simplicity. + +--- + +## Prerequisites + +- Go 1.25+ +- Docker (optional) + +--- + +## Running Locally + +```bash +go run main.go +``` + +### Custom Configuration + +```bash +PORT=8080 go run main.go +HOST=127.0.0.1 PORT=3000 go run main.go +``` + +--- + +## API Endpoints + +| Method | Path | Description | +| ------ | ------- | ------------------------------ | +| GET | / | Service and system information | +| GET | /health | Health check | +| GET | /visits | Visit counter | +| GET | /error | Test endpoint that returns a 500 error | + +--- + +## Configuration + +| Variable | Default | Description | +| -------- | ------- | ---------------- | +| HOST | 0.0.0.0 | Bind address | +| PORT | 8001 | Application port | +| VISITS_FILE | /data/visits | File that stores the visit counter | + +--- + +## Docker Build (Multi-Stage) + +```bash +docker build -t devops-info-go . +docker run -p 8001:8001 devops-info-go +``` diff --git a/app_go/docker-compose.yml b/app_go/docker-compose.yml new file mode 100644 index 0000000000..a532941845 --- /dev/null +++ b/app_go/docker-compose.yml @@ -0,0 +1,15 @@ +services: + app: + build: . 
+ ports: + - "8001:8001" + environment: + HOST: "0.0.0.0" + PORT: "8001" + VISITS_FILE: "/data/visits" + volumes: + - visits_data:/data + +volumes: + visits_data: + driver: local diff --git a/app_go/docs/LAB01.md b/app_go/docs/LAB01.md new file mode 100644 index 0000000000..46c89f33f2 --- /dev/null +++ b/app_go/docs/LAB01.md @@ -0,0 +1,180 @@ +# Lab 01 – DevOps Info Service (Go) + +## Framework Selection + +### Chosen Language and Framework: Go (net/http) + +The Go programming language with the standard `net/http` package was chosen +for this laboratory work. Go is widely used in DevOps and cloud-native +environments due to its performance, static compilation, simplicity, and +excellent support for concurrent workloads. + +Using the standard library avoids unnecessary dependencies and keeps the +service lightweight and predictable. + +### Comparison with Alternatives + +| Option | Pros | Cons | Reason Not Chosen | +|------|------|------|-------------------| +| Go (net/http) | Fast, static binary, no dependencies | Less abstraction | Chosen | +| Gin | Simple routing, middleware | External dependency | Not required | +| Echo | High performance | External dependency | Overkill | +| Python (FastAPI) | Rapid development, async | Interpreted, slower startup | Language diversity | + +--- + +## Best Practices Applied + +### 1. Minimal Dependency Usage + +Only the Go standard library is used. + +**Example:** +```go +import "net/http" +```` + +**Importance:** +Reduces attack surface, simplifies builds, and improves reliability. + +--- + +### 2. Single Responsibility Structure + +Although the application is implemented in a single file, logical separation +is maintained through clearly defined functions. + +**Example:** + +```go +func healthHandler(w http.ResponseWriter, r *http.Request) +``` + +**Importance:** +Keeps the code readable while remaining suitable for small services. + +--- + +### 3. 
Environment-Based Configuration + +Runtime configuration is handled via environment variables. + +**Example:** + +```go +port := os.Getenv("PORT") +``` + +**Importance:** +Allows flexible deployment across environments without code changes. + +--- + +### 4. Structured JSON Responses + +All responses are returned in structured JSON format. + +**Example:** + +```go +json.NewEncoder(w).Encode(response) +``` + +**Importance:** +Ensures API consistency and ease of integration. + +--- + +### 5. Health Check Endpoint + +A dedicated health check endpoint is implemented. + +**Example:** + +```go +http.HandleFunc("/health", healthHandler) +``` + +**Importance:** +Required for monitoring systems and container orchestration platforms. + +--- + +## API Documentation + +### GET / + +**Description:** +Returns service metadata, system information, runtime statistics, and request +details. + +**Example Response:** + +```json +{ + "service": { + "name": "devops-info-service", + "language": "go" + } +} +``` + +--- + +### GET /health + +**Description:** +Returns application health status. + +**Example Response:** + +```json +{ + "status": "healthy" +} +``` + +--- + +### Testing Commands + +```bash +curl http://localhost:5000/ +curl http://localhost:5000/health +``` + +--- + +## Testing Evidence + +The following screenshots are provided: + +* Main endpoint (`/`) showing full JSON response +* Health check endpoint (`/health`) +* Pretty-printed JSON output in terminal + +--- + +## Challenges & Solutions + +### Problem 1: Balancing simplicity and structure + +**Issue:** +Splitting the application into multiple files was unnecessary for the scope of +the laboratory work. + +**Solution:** +Implemented all logic in a single file while preserving logical separation via +functions. + +--- + +### Problem 2: Container image size optimization + +**Issue:** +Default Go images produce relatively large containers. 
+ +**Solution:** +Implemented a multi-stage Docker build to produce a minimal runtime image. + +```` diff --git a/app_go/docs/LAB02.md b/app_go/docs/LAB02.md new file mode 100644 index 0000000000..da39b9af3c --- /dev/null +++ b/app_go/docs/LAB02.md @@ -0,0 +1,234 @@ +# LAB02 — Multi-Stage Docker Build (Go Application) + +## Overview + +This task demonstrates containerization of a compiled Go application using **multi-stage Docker builds**. +The goal is to separate the build environment from the runtime environment in order to reduce image size, improve security, and follow production best practices. + +--- + +## Multi-Stage Build Strategy + +The Dockerfile uses **two stages**: + +1. **Builder stage** — compiles the Go application +2. **Runtime stage** — runs only the compiled binary + +This approach ensures that the final image does **not** include compilers, SDKs, or build tools. + +--- + +## Dockerfile + +```dockerfile +# -------- Build stage -------- +FROM golang:1.25-alpine AS builder + +WORKDIR /app +COPY go.mod ./ +RUN go mod download + +COPY . . +RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o app + +# -------- Runtime stage -------- +FROM alpine:latest + +WORKDIR /app +COPY --from=builder /app/app . 
+ +EXPOSE 5000 +CMD ["./app"] +``` + +--- + +## Stage-by-Stage Explanation + +### Build Stage (`builder`) + +```dockerfile +FROM golang:1.25-alpine AS builder +``` + +* Uses the official Go SDK image +* Required for compiling the application +* Includes Go compiler and build tools (large image) + +```dockerfile +COPY go.mod ./ +RUN go mod download +``` + +* Copies `go.mod` separately +* Allows Docker layer caching +* Dependencies are not re-downloaded unless `go.mod` changes + +```dockerfile +RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o app +``` + +* Produces a **static binary** +* Ensures compatibility with minimal runtime images +* No libc or system dependencies required + +--- + +### Runtime Stage + +```dockerfile +FROM alpine:latest +``` + +* Minimal Linux distribution +* Very small image size +* No compilers or SDKs included + +```dockerfile +COPY --from=builder /app/app . +``` + +* Copies **only the compiled binary** +* No source code or build artifacts included + +--- + +## Image Size Comparison + +```bash +docker build -t go-with-multi-stage . +docker build --target builder -t go-without-multi-stage . +``` + +| Image | Size | +| --------------------------------------- | ---------------- | +| Builder image (`golang:1.25-alpine`) | ~309 MB | +| Final runtime image (`alpine + binary`) | ~25.3 MB | + +![Comparison](./screenshots/compare_images.png) + +## Docker Hub + +The image was published to Docker Hub and is publicly accessible. 
+ +**Repository:** + +``` +https://hub.docker.com/repository/docker/essence666/app_golang_lab_2/general +``` + +### Analysis + +* Image size reduced by **more than 95%** +* Smaller images: + + * Pull faster + * Use less disk space + * Reduce attack surface + +--- + +## Why Multi-Stage Builds Matter for Compiled Languages + +Without multi-stage builds: + +* Final image would include: + + * Go compiler + * Package manager + * Build cache +* Image size would be unnecessarily large +* Increased security risks + +With multi-stage builds: + +* Build tools are discarded +* Only the runtime artifact is shipped +* Clear separation of responsibilities + +--- + +## Security Considerations + +* Final image does **not** include: + + * Compiler + * Package manager + * Source code +* Smaller attack surface +* Fewer CVEs +* Easier vulnerability scanning + +--- + +## Why Not Use the Builder Image as Final? + +* Builder image is designed for development, not production +* Contains unnecessary tools +* Much larger size +* Increased attack surface + +--- + +## Can `FROM scratch` Be Used? + +In theory, yes — because the binary is statically compiled. + +However, `alpine` was chosen because: + +* Easier debugging +* Provides basic utilities +* Better balance between minimalism and usability + +--- + +## Build & Run Process + +### Build Image + +```bash +docker build -t go-info-service . +``` + +### Run Container + +```bash +docker run -p 5000:5000 go-info-service +``` + +### Test Application + +```bash +curl http://localhost:5000/health +``` + +--- + +## Challenges & Solutions + +### Challenge: Reducing final image size + +**Solution:** +Used multi-stage build and static compilation (`CGO_ENABLED=0`) to eliminate runtime dependencies. + +### Challenge: Dependency caching + +**Solution:** +Separated `go.mod` copy step to improve Docker layer caching. 
+ +--- + +## What I Learned + +* How multi-stage builds dramatically reduce image size +* Why compiled languages benefit most from this approach +* How static compilation enables minimal runtime images +* How smaller images improve security and deployment speed + +--- + +## Conclusion + +Multi-stage Docker builds are essential for containerizing compiled applications in production. +They provide significant benefits in terms of **image size, security, and maintainability**, making them the recommended approach for Go, Rust, and similar languages. + diff --git a/app_go/docs/screenshots/01-main-endpoint.png b/app_go/docs/screenshots/01-main-endpoint.png new file mode 100644 index 0000000000..ca291bf055 Binary files /dev/null and b/app_go/docs/screenshots/01-main-endpoint.png differ diff --git a/app_go/docs/screenshots/02-health-check.png b/app_go/docs/screenshots/02-health-check.png new file mode 100644 index 0000000000..badf6f8d1c Binary files /dev/null and b/app_go/docs/screenshots/02-health-check.png differ diff --git a/app_go/docs/screenshots/03-formatted-output.png b/app_go/docs/screenshots/03-formatted-output.png new file mode 100644 index 0000000000..cd27b4ffac Binary files /dev/null and b/app_go/docs/screenshots/03-formatted-output.png differ diff --git a/app_go/docs/screenshots/compare_images.png b/app_go/docs/screenshots/compare_images.png new file mode 100644 index 0000000000..871bd926ab Binary files /dev/null and b/app_go/docs/screenshots/compare_images.png differ diff --git a/app_go/go.mod b/app_go/go.mod new file mode 100644 index 0000000000..c23731c7fa --- /dev/null +++ b/app_go/go.mod @@ -0,0 +1,3 @@ +module api + +go 1.25.0 diff --git a/app_go/main.go b/app_go/main.go new file mode 100644 index 0000000000..ac12777fb8 --- /dev/null +++ b/app_go/main.go @@ -0,0 +1,283 @@ +package main + +import ( + "encoding/json" + "fmt" + "log" + "net/http" + "os" + "path/filepath" + "runtime" + "strconv" + "strings" + "sync" + "time" +) + +var startTime = time.Now() 
+ +var ( + visitsMu sync.Mutex + visitsFilePath string +) + +// Structs for JSON response +type Service struct { + Name string `json:"name"` + Version string `json:"version"` + Description string `json:"description"` + Language string `json:"language"` +} + +type System struct { + Hostname string `json:"hostname"` + Platform string `json:"platform"` + Architecture string `json:"architecture"` + CPUCount int `json:"cpu_count"` + GoVersion string `json:"go_version"` +} + +type RuntimeInfo struct { + UptimeSeconds int64 `json:"uptime_seconds"` + UptimeHuman string `json:"uptime_human"` + CurrentTime string `json:"current_time"` + Timezone string `json:"timezone"` +} + +type RequestInfo struct { + ClientIP string `json:"client_ip"` + UserAgent string `json:"user_agent"` + Method string `json:"method"` + Path string `json:"path"` +} + +type Endpoint struct { + Path string `json:"path"` + Method string `json:"method"` + Description string `json:"description"` +} + +type MainResponse struct { + Service Service `json:"service"` + System System `json:"system"` + Runtime RuntimeInfo `json:"runtime"` + Request RequestInfo `json:"request"` + Visits int64 `json:"visits"` + Endpoints []Endpoint `json:"endpoints"` +} + +type HealthResponse struct { + Status string `json:"status"` + Timestamp string `json:"timestamp"` + UptimeSeconds int64 `json:"uptime_seconds"` +} + +type VisitsResponse struct { + Visits int64 `json:"visits"` +} + +type ErrorResponse struct { + Error string `json:"error"` + Message string `json:"message"` +} + +// Visit counter helpers + +func getVisits() int64 { + data, err := os.ReadFile(visitsFilePath) + if err != nil { + return 0 + } + count, err := strconv.ParseInt(strings.TrimSpace(string(data)), 10, 64) + if err != nil { + return 0 + } + return count +} + +func incrementVisits() int64 { + visitsMu.Lock() + defer visitsMu.Unlock() + count := getVisits() + 1 + if err := os.MkdirAll(filepath.Dir(visitsFilePath), 0755); err == nil { + tmp := visitsFilePath + 
".tmp" + if err := os.WriteFile(tmp, []byte(strconv.FormatInt(count, 10)), 0644); err == nil { + os.Rename(tmp, visitsFilePath) + } + } + return count +} + +// Helpers +func humanDuration(d time.Duration) string { + h := int(d.Hours()) + m := int(d.Minutes()) % 60 + s := int(d.Seconds()) % 60 + return fmt.Sprintf("%d hour(s), %d minute(s), %d second(s)", h, m, s) +} + +// Logging middleware for JSON structured logging +func loggingMiddleware(next http.HandlerFunc) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + start := time.Now() + + wrapped := &responseWriter{ResponseWriter: w, statusCode: http.StatusOK} + next(wrapped, r) + + level := "INFO" + if wrapped.statusCode >= 500 { + level = "ERROR" + } else if wrapped.statusCode >= 400 { + level = "WARNING" + } + + duration := time.Since(start) + logEntry := map[string]interface{}{ + "timestamp": time.Now().UTC().Format(time.RFC3339), + "level": level, + "service": "devops-go", + "method": r.Method, + "path": r.URL.Path, + "status_code": wrapped.statusCode, + "duration_ms": duration.Milliseconds(), + "client_ip": r.RemoteAddr, + } + + jsonLog, _ := json.Marshal(logEntry) + fmt.Println(string(jsonLog)) + } +} + +type responseWriter struct { + http.ResponseWriter + statusCode int +} + +func (rw *responseWriter) WriteHeader(code int) { + rw.statusCode = code + rw.ResponseWriter.WriteHeader(code) +} + +// Handlers +func rootHandler(w http.ResponseWriter, r *http.Request) { + now := time.Now() + uptime := now.Sub(startTime) + visits := incrementVisits() + + resp := MainResponse{ + Service: Service{ + Name: "devops-info-service", + Version: "1.0.0", + Description: "DevOps course info service", + Language: "Go", + }, + System: System{ + Hostname: getHostname(), + Platform: runtime.GOOS, + Architecture: runtime.GOARCH, + CPUCount: runtime.NumCPU(), + GoVersion: runtime.Version(), + }, + Runtime: RuntimeInfo{ + UptimeSeconds: int64(uptime.Seconds()), + UptimeHuman: humanDuration(uptime), + 
CurrentTime: now.UTC().Format(time.RFC3339), + Timezone: "UTC", + }, + Request: RequestInfo{ + ClientIP: r.RemoteAddr, + UserAgent: r.UserAgent(), + Method: r.Method, + Path: r.URL.Path, + }, + Visits: visits, + Endpoints: []Endpoint{ + {Path: "/", Method: "GET", Description: "Service information"}, + {Path: "/health", Method: "GET", Description: "Health check"}, + {Path: "/visits", Method: "GET", Description: "Visit counter"}, + {Path: "/error", Method: "GET", Description: "Test endpoint that returns 500 error"}, + }, + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(resp) +} + +func healthHandler(w http.ResponseWriter, r *http.Request) { + now := time.Now() + uptime := now.Sub(startTime) + + resp := HealthResponse{ + Status: "healthy", + Timestamp: now.UTC().Format(time.RFC3339), + UptimeSeconds: int64(uptime.Seconds()), + } + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(resp) +} + +func visitsHandler(w http.ResponseWriter, r *http.Request) { + resp := VisitsResponse{Visits: getVisits()} + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(resp) +} + +func errorHandler(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusInternalServerError) + resp := ErrorResponse{ + Error: "Internal Server Error", + Message: "Test endpoint triggered - for error logging testing", + } + json.NewEncoder(w).Encode(resp) +} + +// Utility +func getHostname() string { + name, err := os.Hostname() + if err != nil { + return "unknown" + } + return name +} + +// Main +func main() { + visitsFilePath = os.Getenv("VISITS_FILE") + if visitsFilePath == "" { + visitsFilePath = "/data/visits" + } + + host := os.Getenv("HOST") + if host == "" { + host = "0.0.0.0" + } + + port := os.Getenv("PORT") + if port == "" { + port = "8001" + } + + http.HandleFunc("/", loggingMiddleware(rootHandler)) + http.HandleFunc("/health", 
loggingMiddleware(healthHandler)) + http.HandleFunc("/visits", loggingMiddleware(visitsHandler)) + http.HandleFunc("/error", loggingMiddleware(errorHandler)) + + startupLog := map[string]interface{}{ + "timestamp": time.Now().UTC().Format(time.RFC3339), + "level": "INFO", + "service": "devops-go", + "message": "Starting server", + "host": host, + "port": port, + "visits_file": visitsFilePath, + } + jsonLog, _ := json.Marshal(startupLog) + fmt.Println(string(jsonLog)) + + err := http.ListenAndServe(host+":"+port, nil) + if err != nil { + log.Printf("Server error: %v\n", err) + } +} diff --git a/app_python/.dockerignore b/app_python/.dockerignore new file mode 100644 index 0000000000..c592d7c77c --- /dev/null +++ b/app_python/.dockerignore @@ -0,0 +1,4 @@ +venv/ +__pycache__ +*.pyc +tests/ diff --git a/app_python/.gitignore b/app_python/.gitignore new file mode 100644 index 0000000000..d6d38fcdd6 --- /dev/null +++ b/app_python/.gitignore @@ -0,0 +1,15 @@ +# Python +__pycache__/ +*.py[cod] +venv/ +*.log + +# IDE +.vscode/ +.idea/ + +# OS +.DS_Store + +.pytest_cache/ +.coverage diff --git a/app_python/Dockerfile b/app_python/Dockerfile new file mode 100644 index 0000000000..1afa7c4695 --- /dev/null +++ b/app_python/Dockerfile @@ -0,0 +1,28 @@ +FROM python:3.13-slim + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 + +RUN addgroup --system appgroup \ + && adduser --system --ingroup appgroup appuser + +WORKDIR /app + +COPY requirements.txt . + +RUN pip install --no-cache-dir --upgrade pip \ + && pip install --no-cache-dir -r requirements.txt + +COPY core ./core +COPY api ./api +COPY services ./services +COPY app.py . 
+ +RUN chown -R appuser:appgroup /app +RUN mkdir -p /data && chown appuser:appgroup /data + +USER appuser + +EXPOSE 8000 + +CMD ["python3", "app.py"] diff --git a/app_python/README.md b/app_python/README.md new file mode 100644 index 0000000000..ad1ed3d144 --- /dev/null +++ b/app_python/README.md @@ -0,0 +1,168 @@ +# DevOps Info Service (FastAPI) + +## Overview + +DevOps Info Service is a FastAPI-based web application that provides detailed +information about the service itself, system environment, runtime status, and +incoming HTTP requests. + +--- + +## Prerequisites + +- Python 3.11+ +- pip + +--- + +## Installation + +```bash +python -m venv venv +source venv/bin/activate +pip install -r requirements.txt +``` + +--- + +## Running the Application + +```bash +python app.py +``` + +### Custom Configuration + +```bash +PORT=8080 python app.py +HOST=127.0.0.1 PORT=3000 python app.py +``` + +--- + +## API Endpoints + +| Method | Path | Description | +| ------ | ------- | ------------------------------ | +| GET | / | Service and system information | +| GET | /health | Health check | + +--- + +## Configuration + +| Variable | Default | Description | +| -------- | ------- | ---------------- | +| HOST | 0.0.0.0 | Bind address | +| PORT | 8000 | Application port | +| DEBUG | False | Debug mode | + + + + + + +--- + +## Docker + +This application can be run inside a Docker container. +The Docker image is built following Docker best practices: minimal base image, non-root user, optimized layer caching, and a clean build context. 
+ +### Dockerfile Overview + +The Dockerfile is designed for production usage and includes the following decisions: + +* **Base image**: `python:3.13-slim` + Chosen for a balance between small image size and good compatibility with Python packages. + +* **Non-root user**: + The application runs as a dedicated non-root user to reduce security risks. + +* **Optimized layer caching**: + Dependencies are installed before copying application code, allowing Docker to reuse cached layers when only source code changes. + +* **Minimal file copy**: + Only required source files are copied into the image to keep it small and clean. + +* **`.dockerignore` usage**: + Excludes development artifacts, virtual environments, VCS files, and caches to reduce build context size and improve build performance. + +--- + +### Build the Docker Image + +```bash +docker build -t devops-info-service . +``` + +--- + +### Run the Container + +```bash +docker run -p 8000:8000 devops-info-service +``` + +The application will be available at: + +``` +http://localhost:8000 +``` + +--- + +### Environment Variables in Docker + +You can override configuration values using environment variables: + +```bash +docker run -p 8000:8000 \ + -e HOST=0.0.0.0 \ + -e PORT=8000 \ + devops-info-service +``` + +## Visit Counter + +The service tracks the number of visits to the root endpoint (`/`). The counter is persisted to a file so it survives container restarts. 
+ +### Endpoints + +| Endpoint | Method | Description | +|-----------|--------|--------------------------------------| +| `/` | GET | Service info (increments visit count) | +| `/visits` | GET | Returns current visit count | + +### Configuration + +| Environment Variable | Default | Description | +|----------------------|-----------------|---------------------------------| +| `VISITS_FILE` | `/data/visits` | Path to the visit counter file | + +### Local Testing with Docker Compose + +```bash +# Start the service +docker compose up -d + +# Access root endpoint a few times (increments counter) +curl http://localhost:8000/ +curl http://localhost:8000/ +curl http://localhost:8000/ + +# Check visit count +curl http://localhost:8000/visits +# {"visits": 3} + +# Restart container — counter persists +docker compose restart +curl http://localhost:8000/visits +# {"visits": 3} + +# Stop and remove containers (volume persists) +docker compose down +docker compose up -d +curl http://localhost:8000/visits +# {"visits": 3} +``` diff --git a/app_python/api/__init__.py b/app_python/api/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/app_python/api/routes/__init__.py b/app_python/api/routes/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/app_python/api/routes/health.py b/app_python/api/routes/health.py new file mode 100644 index 0000000000..5800486d2c --- /dev/null +++ b/app_python/api/routes/health.py @@ -0,0 +1,15 @@ +from fastapi import APIRouter +from datetime import datetime, timezone +from services.runtime import get_uptime + +router = APIRouter() + +@router.get("/health") +async def health(): + uptime_seconds, _ = get_uptime() + + return { + "status": "healthy", + "timestamp": datetime.now(timezone.utc).isoformat(), + "uptime_seconds": uptime_seconds, + } diff --git a/app_python/api/routes/root.py b/app_python/api/routes/root.py new file mode 100644 index 0000000000..df3c3b9082 --- /dev/null +++ 
b/app_python/api/routes/root.py @@ -0,0 +1,69 @@ +import time +from datetime import datetime, timezone + +from core.config import ( + FRAMEWORK, + SERVICE_DESCRIPTION, + SERVICE_NAME, + SERVICE_VERSION, +) +from core.metrics import endpoint_calls, system_info_duration +from fastapi import APIRouter, HTTPException, Request +from services.runtime import get_uptime +from services.system import get_system_info +from services.visits import increment_visits + +router = APIRouter() + + +@router.get("/") +async def root(request: Request): + endpoint_calls.labels(endpoint="/").inc() + visits = increment_visits() + uptime_seconds, uptime_human = get_uptime() + + t0 = time.time() + sys_info = get_system_info() + system_info_duration.observe(time.time() - t0) + + return { + "service": { + "name": SERVICE_NAME, + "version": SERVICE_VERSION, + "description": SERVICE_DESCRIPTION, + "framework": FRAMEWORK, + }, + "system": sys_info, + "runtime": { + "uptime_seconds": uptime_seconds, + "uptime_human": uptime_human, + "current_time": datetime.now(timezone.utc).isoformat(), + "timezone": "UTC", + }, + "request": { + "client_ip": request.client.host if request.client else "unknown", + "user_agent": request.headers.get("user-agent"), + "method": request.method, + "path": request.url.path, + }, + "visits": visits, + "endpoints": [ + {"path": "/", "method": "GET", "description": "Service information"}, + {"path": "/health", "method": "GET", "description": "Health check"}, + {"path": "/visits", "method": "GET", "description": "Visit counter"}, + { + "path": "/error", + "method": "GET", + "description": "Test endpoint that returns 500 error", + }, + ], + } + + +@router.get("/error") +async def error_test(): + """Test endpoint that returns a 500 error for testing error logging""" + endpoint_calls.labels(endpoint="/error").inc() + raise HTTPException( + status_code=500, detail="Internal Server Error - Test endpoint triggered" + ) diff --git a/app_python/api/routes/visits.py 
b/app_python/api/routes/visits.py new file mode 100644 index 0000000000..ec746fff49 --- /dev/null +++ b/app_python/api/routes/visits.py @@ -0,0 +1,10 @@ +from fastapi import APIRouter +from services.visits import get_visits + +router = APIRouter() + + +@router.get("/visits") +async def visits(): + """Return current visit count.""" + return {"visits": get_visits()} diff --git a/app_python/app.py b/app_python/app.py new file mode 100644 index 0000000000..c1a598ed91 --- /dev/null +++ b/app_python/app.py @@ -0,0 +1,129 @@ +import time + +from api.routes import health, root, visits +from core.config import HOST, PORT, SERVICE_DESCRIPTION, SERVICE_NAME, SERVICE_VERSION +from core.logging import setup_logging +from core.metrics import ( + http_request_duration_seconds, + http_requests_in_progress, + http_requests_total, +) +from fastapi import FastAPI, Request +from fastapi.responses import JSONResponse, Response +from prometheus_client import CONTENT_TYPE_LATEST, generate_latest + +logger = setup_logging() + +app = FastAPI( + title=SERVICE_NAME, + version=SERVICE_VERSION, + description=SERVICE_DESCRIPTION, +) + + +# Add request logging and metrics middleware +@app.middleware("http") +async def log_requests(request: Request, call_next): + start_time = time.time() + endpoint = request.url.path + + # Skip metrics endpoint from instrumentation to avoid noise + track = endpoint != "/metrics" + + if track: + http_requests_in_progress.inc() + + # Log the incoming request + logger.info( + "Incoming request", + extra={ + "method": request.method, + "path": endpoint, + "client_ip": request.client.host, + "user_agent": request.headers.get("user-agent"), + }, + ) + + response = await call_next(request) + + process_time = time.time() - start_time + log_extra = { + "method": request.method, + "path": endpoint, + "status_code": response.status_code, + "process_time_ms": round(process_time * 1000, 2), + "client_ip": request.client.host, + } + + if track: + 
http_requests_in_progress.dec() + http_requests_total.labels( + method=request.method, + endpoint=endpoint, + status_code=str(response.status_code), + ).inc() + http_request_duration_seconds.labels( + method=request.method, + endpoint=endpoint, + ).observe(process_time) + + if response.status_code >= 500: + logger.error("Request failed with server error", extra=log_extra) + elif response.status_code >= 400: + logger.warning("Request failed with client error", extra=log_extra) + else: + logger.info("Request completed", extra=log_extra) + + return response + + +@app.get("/metrics", include_in_schema=False) +async def metrics(): + return Response(content=generate_latest(), media_type=CONTENT_TYPE_LATEST) + + +app.include_router(root.router) +app.include_router(health.router) +app.include_router(visits.router) + + +@app.exception_handler(404) +async def not_found(request: Request, exc): + logger.warning( + "Route not found", + extra={ + "method": request.method, + "path": request.url.path, + "client_ip": request.client.host, + }, + ) + return JSONResponse( + status_code=404, + content={"error": "Not Found", "message": "Endpoint does not exist"}, + ) + + +@app.exception_handler(Exception) +async def internal_error(request: Request, exc): + logger.exception( + "Unhandled exception", + extra={ + "method": request.method, + "path": request.url.path, + "client_ip": request.client.host, + }, + ) + return JSONResponse( + status_code=500, + content={ + "error": "Internal Server Error", + "message": "An unexpected error occurred", + }, + ) + + +if __name__ == "__main__": + import uvicorn + + logger.info("Starting devops-info-service") + uvicorn.run("app:app", host=HOST, port=PORT) diff --git a/app_python/core/__init__.py b/app_python/core/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/app_python/core/config.py b/app_python/core/config.py new file mode 100644 index 0000000000..c448984811 --- /dev/null +++ b/app_python/core/config.py @@ -0,0 +1,10 @@ 
+import os + +HOST = os.getenv("HOST", "0.0.0.0") +PORT = int(os.getenv("PORT", 8000)) +DEBUG = os.getenv("DEBUG", "False").lower() == "true" + +SERVICE_NAME = "devops-info-service" +SERVICE_VERSION = "1.0.0" +SERVICE_DESCRIPTION = "DevOps course info service" +FRAMEWORK = "FastAPI" diff --git a/app_python/core/logging.py b/app_python/core/logging.py new file mode 100644 index 0000000000..bd7fbf325e --- /dev/null +++ b/app_python/core/logging.py @@ -0,0 +1,36 @@ +import logging +import json +from pythonjsonlogger import jsonlogger +from core.config import DEBUG + +class CustomJsonFormatter(jsonlogger.JsonFormatter): + def add_fields(self, log_record, record, message_dict): + super(CustomJsonFormatter, self).add_fields(log_record, record, message_dict) + if not log_record.get('timestamp'): + log_record['timestamp'] = record.created + if log_record.get('level'): + log_record['level'] = log_record['level'].upper() + else: + log_record['level'] = record.levelname.upper() + +def setup_logging(): + logger = logging.getLogger("devops-info-service") + logger.setLevel(logging.DEBUG if DEBUG else logging.INFO) + + # Create console handler + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.DEBUG if DEBUG else logging.INFO) + + # Create JSON formatter + formatter = CustomJsonFormatter( + '%(timestamp)s %(level)s %(name)s %(message)s' + ) + console_handler.setFormatter(formatter) + + # Add handler to logger + logger.addHandler(console_handler) + + # Prevent duplicate logs + logger.propagate = False + + return logger diff --git a/app_python/core/metrics.py b/app_python/core/metrics.py new file mode 100644 index 0000000000..0d1928fc21 --- /dev/null +++ b/app_python/core/metrics.py @@ -0,0 +1,31 @@ +from prometheus_client import Counter, Histogram, Gauge + +http_requests_total = Counter( + 'http_requests_total', + 'Total HTTP requests', + ['method', 'endpoint', 'status_code'], +) + +http_request_duration_seconds = Histogram( + 
'http_request_duration_seconds', + 'HTTP request duration in seconds', + ['method', 'endpoint'], + buckets=[0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0], +) + +http_requests_in_progress = Gauge( + 'http_requests_in_progress', + 'Number of HTTP requests currently being processed', +) + +endpoint_calls = Counter( + 'devops_info_endpoint_calls_total', + 'Total calls per endpoint', + ['endpoint'], +) + +system_info_duration = Histogram( + 'devops_info_system_collection_seconds', + 'Time spent collecting system info', + buckets=[0.001, 0.005, 0.01, 0.05, 0.1, 0.5], +) diff --git a/app_python/docker-compose.yml b/app_python/docker-compose.yml new file mode 100644 index 0000000000..0f03452dc0 --- /dev/null +++ b/app_python/docker-compose.yml @@ -0,0 +1,15 @@ +services: + app: + build: . + ports: + - "8000:8000" + environment: + HOST: "0.0.0.0" + PORT: "8000" + VISITS_FILE: "/data/visits" + volumes: + - visits_data:/data + +volumes: + visits_data: + driver: local diff --git a/app_python/docs/LAB01.md b/app_python/docs/LAB01.md new file mode 100644 index 0000000000..0dad9a7af9 --- /dev/null +++ b/app_python/docs/LAB01.md @@ -0,0 +1,195 @@ + + + + + + + + +# Lab 01 – DevOps Info Service + +## Framework Selection + +### Chosen Framework: FastAPI + +FastAPI was selected for this laboratory work due to its high performance, +native support for ASGI, automatic OpenAPI documentation, and clean modular +architecture support. These features align well with DevOps requirements such +as observability, scalability, and maintainability. + +FastAPI also allows easy extension of the application in future labs without +rewriting core components. 
+ +### Comparison with Alternatives + +| Framework | Pros | Cons | Reason Not Chosen | +|---------|------|------|-------------------| +| FastAPI | High performance, async support, OpenAPI, type hints | Slightly higher learning curve | Chosen | +| Flask | Simple, lightweight | No async by default, manual validation | Limited scalability | +| Django | Full-featured, ORM, admin panel | Heavyweight, overkill for microservice | Excess complexity | + +--- + +## Best Practices Applied + +### 1. Modular Project Structure + +The application is split into logical modules: +- `core` – configuration and logging +- `api` – HTTP routes +- `services` – business logic + +**Example:** +```python +app.include_router(root.router) +app.include_router(health.router) +```` + +**Importance:** +Improves readability, scalability, and maintainability of the codebase. + +--- + +### 2. Environment-Based Configuration + +Configuration values are read from environment variables. + +**Example:** + +```python +PORT = int(os.getenv("PORT", 5000)) +``` + +**Importance:** +Allows easy configuration changes without modifying source code, following +the Twelve-Factor App methodology. + +--- + +### 3. Centralized Logging + +A unified logging setup is used across the application. + +**Example:** + +```python +logger = setup_logging() +``` + +**Importance:** +Simplifies debugging and is essential for production monitoring. + +--- + +### 4. Health Check Endpoint + +A dedicated `/health` endpoint is implemented. + +**Example:** + +```python +@router.get("/health") +async def health_check(): + return {"status": "healthy"} +``` + +**Importance:** +Required for container orchestration systems and service monitoring. + +--- + +### 5. Explicit Error Handling + +Custom handlers for 404 and 500 errors are defined. + +**Example:** + +```python +@app.exception_handler(404) +async def not_found(request: Request, exc): +``` + +**Importance:** +Provides consistent error responses and improves API reliability. 
+ +--- + +## API Documentation + +### GET / + +**Description:** +Returns service metadata, system information, runtime statistics, and request +details. + +**Example Response:** + +```json +{ + "service": { + "name": "devops-info-service", + "version": "1.0.0" + } +} +``` + +--- + +### GET /health + +**Description:** +Returns application health status. + +**Example Response:** + +```json +{ + "status": "healthy", + "uptime_seconds": 120 +} +``` + +--- + +### Testing Commands + +```bash +curl http://localhost:8000/ +curl http://localhost:8000/health +``` + +--- + +## Testing Evidence + +The following screenshots are provided: + +* Main endpoint (`/`) showing full JSON response +* Health check endpoint (`/health`) +* Pretty-printed JSON output in terminal + +--- + +## Challenges & Solutions + +### Problem 1: ASGI application import error + +**Issue:** +Uvicorn could not import the application module when using a modular structure. + +**Solution:** +Corrected the `uvicorn.run()` module reference to match the entrypoint file. + +--- + +### Problem 2: Growing complexity of a single file + +**Issue:** +Maintaining all logic in a single file would not scale for future labs. + +**Solution:** +Refactored the application into multiple modules with clear responsibility +boundaries. + + + diff --git a/app_python/docs/LAB02.md b/app_python/docs/LAB02.md new file mode 100644 index 0000000000..2561891078 --- /dev/null +++ b/app_python/docs/LAB02.md @@ -0,0 +1,236 @@ +# LAB02 — Docker Containerization (Python FastAPI) + +## Overview + +In this lab, the Python FastAPI application from Lab 1 was containerized using Docker following production-ready best practices. The goal was to create a secure, optimized, and reproducible Docker image, publish it to Docker Hub, and document all technical decisions made during the process. + +--- + +## Docker Best Practices Applied + +### Non-root User + +The container runs the application as a non-root user instead of the default `root` user. 
+ +**Why this matters:** + +* Containers are not a full security boundary +* Running as root increases the impact of a potential container breakout +* Follows the principle of least privilege + +```dockerfile +RUN addgroup --system appgroup \ + && adduser --system --ingroup appgroup appuser +... +USER appuser +``` + +--- + +### Specific Base Image Version + +The image uses a pinned base image: + +```dockerfile +FROM python:3.13-slim +``` + +**Why this matters:** + +* Guarantees reproducible builds +* Prevents unexpected breaking changes +* `slim` provides a good balance between size and compatibility + +--- + +### Optimized Layer Caching + +Dependencies are installed before application code is copied: + +```dockerfile +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt +``` + +**Why this matters:** + +* Docker caches layers +* Dependency installation is reused if only source code changes +* Significantly speeds up rebuilds + +--- + +### Minimal File Copy + +Only required application files are copied into the image: + +```dockerfile +COPY core ./core +COPY api ./api +COPY services ./services +COPY app.py . +``` + +**Why this matters:** + +* Smaller image size +* Reduced attack surface +* No unnecessary development files included + +--- + +### .dockerignore Usage + +A `.dockerignore` file is used to exclude unnecessary files from the build context. 
+ +**Excluded files include:** + +* Python bytecode cache directories (`__pycache__`) +* Compiled Python files (`*.pyc`) +* The local virtual environment (`venv/`) +* The test suite (`tests/`) + +**Why this matters:** + +* Faster build times +* Smaller build context +* Prevents leaking development artifacts into the image + +--- + +## Image Information & Decisions + +### Base Image Choice + +* **Image:** `python:3.13-slim` +* Chosen for its small size and compatibility with Python dependencies +* Avoids issues commonly found with Alpine-based Python images + +### Final Image Size + +The final image size is relatively small compared to a full Python image and is suitable for production usage. + +Smaller images: + +* Pull faster +* Consume less disk space +* Reduce the number of potential vulnerabilities + +--- + +### Layer Structure Explanation + +1. Base image +2. Environment variables +3. Non-root user creation +4. Dependency installation +5. Application code copy +6. Switch to non-root user +7. Application startup + +This order maximizes cache efficiency and minimizes rebuild time. + +--- + +## Build & Run Process + +### Build Image + +```bash +docker build -t devops-info-service . +``` + +### Run Container + +```bash +docker run -p 8000:8000 devops-info-service +``` + +### Test Endpoints + +```bash +curl http://localhost:8000/health +``` + +Expected response: + +```json +{"status": "healthy", "timestamp": "2025-01-01T12:00:00.000000+00:00", "uptime_seconds": 42} +``` + +--- + +## Docker Hub + +The image was published to Docker Hub and is publicly accessible. 
+ +**Repository:** + +``` +https://hub.docker.com/repository/docker/essence666/app_python_lab_2/general +``` + +--- + +## Technical Analysis + +### Why the Dockerfile Works This Way + +* Dependency layers are cached +* Application runs as non-root +* Minimal runtime environment +* Clear separation between build and runtime concerns + +### What Happens If Layer Order Changes + +If application code is copied before installing dependencies: + +* Any code change invalidates dependency cache +* Dependencies are reinstalled on every build +* Build times increase significantly + +--- + +### Security Considerations + +* Application does not run as root +* Smaller image reduces attack surface +* No development tools included +* Environment variables used for configuration + +--- + +### How .dockerignore Improves the Build + +* Reduces build context size +* Prevents accidental inclusion of sensitive files +* Improves Docker build performance + +--- + +## Challenges & Solutions + +### Challenge: Docker layer cache invalidation + +**Solution:** +Copied `requirements.txt` separately before application code. + +### Challenge: Running as non-root + +**Solution:** +Created a dedicated system user and adjusted file permissions. + +--- + +## What I Learned + +* How Docker layer caching works in practice +* Why running containers as non-root is critical +* How to optimize Docker images for production +* How to document containerization decisions clearly + +--- + +## Conclusion + +This lab demonstrates a production-ready Docker setup for a Python FastAPI application. By applying best practices such as non-root execution, optimized layer caching, and minimal images, the resulting container is secure, efficient, and suitable for real-world deployment. 
diff --git a/app_python/docs/LAB03.md b/app_python/docs/LAB03.md new file mode 100644 index 0000000000..bfb829b744 --- /dev/null +++ b/app_python/docs/LAB03.md @@ -0,0 +1,243 @@ +# LAB03 – CI/CD, Testing & Security for FastAPI Application + +## Overview + +This project demonstrates implementation of a CI pipeline for a Python FastAPI application including: + +* Code linting +* Unit testing +* Static security analysis +* Dependency vulnerability scanning +* Docker image build validation + +The goal of this lab is to build a reliable and automated CI workflow following DevOps best practices. + +--- + +# Application Description + +The application is built using **FastAPI** and provides: + +* `GET /` — root endpoint +* `GET /health` — health check endpoint +* Error handling routes + +Testing is performed using: + +* `pytest` +* `fastapi.testclient` + +--- + +# Testing + +Unit tests are located in: + +``` +app_python/tests/ +``` + +Run locally: + +```bash +pytest +``` + +All tests must pass for CI to succeed. + +--- + +# Linting + +We use **Ruff** for linting. + +Run locally: + +```bash +ruff check . +``` + +Linting ensures: + +* PEP8 compliance +* Clean imports +* No unused variables +* No obvious code issues + +CI fails if linting errors are found. + +--- + +# Security Scanning + +## Static Code Analysis – Bandit + +Instead of Snyk, **Bandit** was used. + +### Why not Snyk? + +Snyk requires an API token configured in repository secrets. Due to token validation issues in CI, it was not possible to fully automate Snyk integration. + +To maintain a fully automated pipeline, Bandit was selected. + +### Why Bandit? 
+ +* Open-source +* No authentication required +* Designed for Python +* CI-friendly +* Maintained by PyCQA (originally developed by the OpenStack Security Project) + +Bandit scans for: + +* Insecure function usage (`eval`, `exec`) +* Weak cryptography +* Unsafe subprocess calls +* Hardcoded credentials +* Insecure random usage + +Run locally: + +```bash +bandit -r app_python +``` + +CI fails if high severity issues are detected. + +--- + +## Dependency Vulnerability Scanning – pip-audit + +We use: + +```bash +pip-audit +``` + +It checks Python dependencies for known CVEs. + +If vulnerabilities are found, CI fails. + +--- + +# Docker + +The project includes a Dockerfile. + +Build locally: + +```bash +docker build -t fastapi-app . +``` + +Run: + +```bash +docker run -p 8000:8000 fastapi-app +``` + +The application will be available at: + +``` +http://localhost:8000 +``` + +--- + +# CI Pipeline + +GitHub Actions workflow: + +``` +.github/workflows/python-ci.yml +``` + +## CI Stages + +### Install dependencies + +```bash +pip install -r requirements.txt +pip install -r requirements-dev.txt +``` + +--- + +### Lint + +```bash +ruff check . +``` + +--- + +### Tests + +```bash +pytest +``` + +--- + +### Security Scan + +```bash +bandit -r app_python +pip-audit +``` + +--- + +### Docker Build + +```bash +docker build -t fastapi-app . +``` + +--- + +# CI Guarantees + +The pipeline ensures: + +* Code quality validation +* Test coverage enforcement +* Security issue detection +* Dependency vulnerability control +* Docker image build validation + +If any step fails — the pipeline fails. 
+ +--- + +# Technologies Used + +* Python 3.x +* FastAPI +* Pytest +* Ruff +* Bandit +* pip-audit +* Docker +* GitHub Actions + +--- + +# DevOps Best Practices Applied + +* Automated testing +* Automated linting +* Automated security scanning +* Fail-fast CI strategy +* Reproducible builds +* Infrastructure-as-Code for CI + +--- + +# Conclusion + +This lab demonstrates implementation of a production-like CI pipeline for a Python web application. + +The project integrates testing, linting, security scanning, and container validation to ensure code reliability, security, and maintainability. + +[The docker images with release by tag 1.0.0 1.0.1 etc](https://hub.docker.com/repository/docker/essence666/devops-info-service/general) diff --git a/app_python/docs/screenshots/01-main-endpoint.png b/app_python/docs/screenshots/01-main-endpoint.png new file mode 100644 index 0000000000..021c3f56d0 Binary files /dev/null and b/app_python/docs/screenshots/01-main-endpoint.png differ diff --git a/app_python/docs/screenshots/02-health-check.png b/app_python/docs/screenshots/02-health-check.png new file mode 100644 index 0000000000..98deb9b1f9 Binary files /dev/null and b/app_python/docs/screenshots/02-health-check.png differ diff --git a/app_python/docs/screenshots/03-formatted-output.png b/app_python/docs/screenshots/03-formatted-output.png new file mode 100644 index 0000000000..2986a23806 Binary files /dev/null and b/app_python/docs/screenshots/03-formatted-output.png differ diff --git a/app_python/requirements-dev.txt b/app_python/requirements-dev.txt new file mode 100644 index 0000000000..2e2512c7a6 --- /dev/null +++ b/app_python/requirements-dev.txt @@ -0,0 +1,6 @@ +pytest +pytest-cov +httpx +fastapi==0.115.0 +uvicorn[standard]==0.32.0 +bandit diff --git a/app_python/requirements.txt b/app_python/requirements.txt new file mode 100644 index 0000000000..820d9ec991 --- /dev/null +++ b/app_python/requirements.txt @@ -0,0 +1,4 @@ +fastapi==0.115.0 +uvicorn[standard]==0.32.0 
+python-json-logger==2.0.7 +prometheus-client==0.23.1 diff --git a/app_python/services/__init__.py b/app_python/services/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/app_python/services/runtime.py b/app_python/services/runtime.py new file mode 100644 index 0000000000..8044186267 --- /dev/null +++ b/app_python/services/runtime.py @@ -0,0 +1,11 @@ +from datetime import datetime, timezone + +START_TIME = datetime.now(timezone.utc) + +def get_uptime(): + delta = datetime.now(timezone.utc) - START_TIME + seconds = int(delta.total_seconds()) + hours, remainder = divmod(seconds, 3600) + minutes, _ = divmod(remainder, 60) + + return seconds, f"{hours} hour(s), {minutes} minute(s)" diff --git a/app_python/services/system.py b/app_python/services/system.py new file mode 100644 index 0000000000..637b36c9dc --- /dev/null +++ b/app_python/services/system.py @@ -0,0 +1,13 @@ +import os +import socket +import platform + +def get_system_info(): + return { + "hostname": socket.gethostname(), + "platform": platform.system(), + "platform_version": platform.version(), + "architecture": platform.machine(), + "cpu_count": os.cpu_count(), + "python_version": platform.python_version(), + } diff --git a/app_python/services/visits.py b/app_python/services/visits.py new file mode 100644 index 0000000000..a1bddc786b --- /dev/null +++ b/app_python/services/visits.py @@ -0,0 +1,28 @@ +import os +import threading + +VISITS_FILE = os.getenv("VISITS_FILE", "/data/visits") + +_lock = threading.Lock() + + +def get_visits() -> int: + """Read current visit count from file. Returns 0 if file doesn't exist.""" + try: + with open(VISITS_FILE, "r") as f: + return int(f.read().strip()) + except (FileNotFoundError, ValueError): + return 0 + + +def increment_visits() -> int: + """Atomically increment visit counter and persist to file. 
Returns new count.""" + with _lock: + count = get_visits() + 1 + os.makedirs(os.path.dirname(VISITS_FILE), exist_ok=True) + # Write atomically via temp file + rename + tmp = VISITS_FILE + ".tmp" + with open(tmp, "w") as f: + f.write(str(count)) + os.replace(tmp, VISITS_FILE) + return count diff --git a/app_python/tests/__init__.py b/app_python/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/app_python/tests/test_errors.py b/app_python/tests/test_errors.py new file mode 100644 index 0000000000..65d1f7aad0 --- /dev/null +++ b/app_python/tests/test_errors.py @@ -0,0 +1,15 @@ +from fastapi.testclient import TestClient +from app import app + +client = TestClient(app) + + +def test_404_handler(): + response = client.get("/non-existing-endpoint") + + assert response.status_code == 404 + + data = response.json() + + assert data["error"] == "Not Found" + assert "message" in data diff --git a/app_python/tests/test_health.py b/app_python/tests/test_health.py new file mode 100644 index 0000000000..a9942d4ff5 --- /dev/null +++ b/app_python/tests/test_health.py @@ -0,0 +1,16 @@ +from fastapi.testclient import TestClient +from app import app + +client = TestClient(app) + + +def test_health_success(): + response = client.get("/health") + + assert response.status_code == 200 + + data = response.json() + + assert data["status"] == "healthy" + assert "timestamp" in data + assert isinstance(data["uptime_seconds"], int) diff --git a/app_python/tests/test_root.py b/app_python/tests/test_root.py new file mode 100644 index 0000000000..962f4381d9 --- /dev/null +++ b/app_python/tests/test_root.py @@ -0,0 +1,36 @@ +from fastapi.testclient import TestClient +from app import app + +client = TestClient(app) + + +def test_root_success(): + response = client.get("/") + + assert response.status_code == 200 + + data = response.json() + + # service block + assert "service" in data + assert data["service"]["name"] == "devops-info-service" + assert 
data["service"]["version"] == "1.0.0" + assert data["service"]["framework"] == "FastAPI" + + # system block + assert "system" in data + assert "hostname" in data["system"] + assert "cpu_count" in data["system"] + + # runtime block + assert "runtime" in data + assert "uptime_seconds" in data["runtime"] + assert isinstance(data["runtime"]["uptime_seconds"], int) + + # request block + assert data["request"]["method"] == "GET" + assert data["request"]["path"] == "/" + + # endpoints list + assert isinstance(data["endpoints"], list) + assert any(e["path"] == "/health" for e in data["endpoints"]) diff --git a/k8s/ARGOCD.md b/k8s/ARGOCD.md new file mode 100644 index 0000000000..5f26c7c321 --- /dev/null +++ b/k8s/ARGOCD.md @@ -0,0 +1,655 @@ +# GitOps with ArgoCD Documentation + +## Overview + +This document covers the Lab 13 implementation: installing ArgoCD, deploying applications via declarative manifests, configuring multi-environment deployments, and testing self-healing behaviour. + +--- + +## Task 1 — ArgoCD Installation & Setup + +### Installation via Helm + +```bash +# Add the Argo Helm repository +helm repo add argo https://argoproj.github.io/argo-helm +helm repo update + +# Create a dedicated namespace +kubectl create namespace argocd + +# Install ArgoCD +helm install argocd argo/argo-cd \ + --namespace argocd \ + --set server.service.type=NodePort + +# Wait for all pods to become Ready +kubectl wait --for=condition=ready pod \ + -l app.kubernetes.io/name=argocd-server \ + -n argocd \ + --timeout=120s +``` + +### Verifying Installation + +```bash +kubectl get pods -n argocd +``` + +``` +NAME READY STATUS RESTARTS AGE +argocd-application-controller-0 1/1 Running 0 2m +argocd-applicationset-controller-xxxxxxxxx-xxxxx 1/1 Running 0 2m +argocd-dex-server-xxxxxxxxx-xxxxx 1/1 Running 0 2m +argocd-notifications-controller-xxxxxxxxx-xxxxx 1/1 Running 0 2m +argocd-redis-xxxxxxxxx-xxxxx 1/1 Running 0 2m +argocd-repo-server-xxxxxxxxx-xxxxx 1/1 Running 0 2m 
+argocd-server-xxxxxxxxx-xxxxx 1/1 Running 0 2m +``` + +### Accessing the UI + +```bash +# Port-forward the ArgoCD server (keep terminal open) +kubectl port-forward svc/argocd-server -n argocd 8080:443 + +# Retrieve the initial admin password +kubectl -n argocd get secret argocd-initial-admin-secret \ + -o jsonpath="{.data.password}" | base64 -d && echo + +# Open browser at https://localhost:8080 +# Username: admin +# Password: <output of the command above> +``` + +### CLI Installation and Login + +```bash +# macOS +brew install argocd + +# Log in via CLI (after port-forward is running) +argocd login localhost:8080 --insecure \ + --username admin \ + --password <initial-admin-password> + +# Verify connection +argocd version +argocd app list +``` + +``` +argocd: v2.13.x + BuildDate: ... + GitCommit: ... + GoVersion: go1.22.x + ... +server: v2.13.x +``` + +--- + +## Task 2 — Application Deployment + +### Directory Structure + +``` +k8s/argocd/ +├── application.yaml # Python app — default namespace, manual sync +├── application-dev.yaml # Python app — dev namespace, auto-sync +├── application-prod.yaml # Python app — prod namespace, manual sync +├── application-go.yaml # Go app — default namespace, manual sync +└── applicationset.yaml # Bonus: ApplicationSet (list generator) +``` + +### Application Manifest (`application.yaml`) + +```yaml +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: python-app + namespace: argocd +spec: + project: default + source: + repoURL: https://github.com/essence-666/DevOps-Core-Course.git + targetRevision: master + path: k8s/app-python + helm: + valueFiles: + - values.yaml + destination: + server: https://kubernetes.default.svc + namespace: default + syncPolicy: + syncOptions: + - CreateNamespace=true + - PrunePropagationPolicy=foreground + - PruneLast=true +``` + +**Key fields explained:** + +| Field | Value | Purpose | +|-------|-------|---------| +| `repoURL` | GitHub repo URL | ArgoCD clones this repo to track desired state | +| `targetRevision` | `master` | Branch or
tag to watch | +| `path` | `k8s/app-python` | Directory containing the Helm chart | +| `destination.server` | `https://kubernetes.default.svc` | In-cluster deployment | +| `destination.namespace` | `default` | Target Kubernetes namespace | +| No `automated` block | — | Manual sync — operator must approve each deployment | + +### Deploying and Syncing + +```bash +# Apply the Application manifest +kubectl apply -f k8s/argocd/application.yaml + +# Check application status (OutOfSync until first sync) +argocd app get python-app +``` + +``` +Name: argocd/python-app +Project: default +Server: https://kubernetes.default.svc +Namespace: default +URL: https://localhost:8080/applications/python-app +Source: + Repo: https://github.com/essence-666/DevOps-Core-Course.git + Target: master + Path: k8s/app-python +SyncStatus: OutOfSync from master +HealthStatus: Missing + +GROUP KIND NAMESPACE NAME STATUS HEALTH HOOK MESSAGE + Service default python-app OutOfSync Missing +apps Deployment default python-app OutOfSync Missing +``` + +```bash +# Trigger initial sync +argocd app sync python-app + +# Watch deployment progress +kubectl rollout status deployment -n default -l app.kubernetes.io/instance=python-app +``` + +``` +Waiting for deployment "python-app-app-python" rollout to finish: 0 of 3 updated... +Waiting for deployment rollout to finish: 1 out of 3 new replicas have been updated... +Waiting for deployment rollout to finish: 2 out of 3 new replicas have been updated... +deployment "python-app-app-python" successfully rolled out +``` + +```bash +# Final status — should show Synced + Healthy +argocd app get python-app +``` + +``` +SyncStatus: Synced to master (abc1234) +HealthStatus: Healthy +``` + +### GitOps Workflow Test + +```bash +# Make a change — update replica count in values.yaml +# git commit and push... 
+ +# ArgoCD detects drift (polls every 3 minutes, or use webhook) +argocd app get python-app +# SyncStatus: OutOfSync from master + +# View the diff +argocd app diff python-app +# --- current +# +++ target +# @@ -5,7 +5,7 @@ +# - replicas: 3 +# + replicas: 4 + +# Apply the change +argocd app sync python-app +``` + +--- + +## Task 3 — Multi-Environment Deployment + +### Environment Overview + +| Environment | Namespace | Sync Policy | Values File | Replicas | Resources | +|-------------|-----------|-------------|-------------|----------|-----------| +| Default | `default` | Manual | `values.yaml` | 3 | Standard | +| Dev | `dev` | **Auto** (selfHeal + prune) | `values-dev.yaml` | 1 | Minimal | +| Prod | `prod` | **Manual** | `values-prod.yaml` | 5 | Generous | + +### Create Namespaces + +```bash +kubectl create namespace dev +kubectl create namespace prod +``` + +### Dev Application (`application-dev.yaml`) + +```yaml +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: python-app-dev + namespace: argocd +spec: + project: default + source: + repoURL: https://github.com/essence-666/DevOps-Core-Course.git + targetRevision: master + path: k8s/app-python + helm: + valueFiles: + - values.yaml + - values-dev.yaml # overrides: 1 replica, relaxed limits, NodePort + destination: + server: https://kubernetes.default.svc + namespace: dev + syncPolicy: + automated: + prune: true # delete resources removed from Git + selfHeal: true # revert manual cluster changes + syncOptions: + - CreateNamespace=true + - ServerSideApply=true +``` + +**Dev-specific values (`values-dev.yaml`):** +- `replicaCount: 1` — minimal footprint +- `resources.limits.cpu: 100m`, `memory: 128Mi` — low overhead +- `service.type: NodePort` — direct access in local cluster + +### Prod Application (`application-prod.yaml`) + +```yaml +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: python-app-prod + namespace: argocd +spec: + project: default + source: + repoURL: 
https://github.com/essence-666/DevOps-Core-Course.git + targetRevision: master + path: k8s/app-python + helm: + valueFiles: + - values.yaml + - values-prod.yaml # overrides: 5 replicas, generous limits, LoadBalancer + destination: + server: https://kubernetes.default.svc + namespace: prod + syncPolicy: + syncOptions: + - CreateNamespace=true + - PrunePropagationPolicy=foreground + - PruneLast=true + # No automated block — prod requires explicit manual approval +``` + +**Prod-specific values (`values-prod.yaml`):** +- `replicaCount: 5` — high availability +- `resources.limits.cpu: 500m`, `memory: 512Mi` — production grade +- `image.pullPolicy: Always` — ensures fresh image +- `image.tag: 1.0.0` — pinned tag (never `latest` in prod) +- `service.type: LoadBalancer` + +### Deploying Both Environments + +```bash +kubectl apply -f k8s/argocd/application-dev.yaml +kubectl apply -f k8s/argocd/application-prod.yaml + +# Dev syncs automatically +# Prod requires manual sync: +argocd app sync python-app-prod +``` + +### Verification + +```bash +argocd app list +``` + +``` +NAME CLUSTER NAMESPACE PROJECT STATUS HEALTH SYNCPOLICY CONDITIONS +python-app https://kubernetes.default.svc default default Synced Healthy Manual +python-app-dev https://kubernetes.default.svc dev default Synced Healthy Auto-Prune +python-app-prod https://kubernetes.default.svc prod default Synced Healthy Manual +``` + +```bash +# Verify pods in each namespace +kubectl get pods -n dev +# NAME READY STATUS RESTARTS AGE +# python-app-dev-app-python-xxxxxxx-xxxxx 1/1 Running 0 1m + +kubectl get pods -n prod +# NAME READY STATUS RESTARTS AGE +# python-app-prod-app-python-xxxxxxx-xxxxx 1/1 Running 0 1m +# python-app-prod-app-python-xxxxxxx-yyyyy 1/1 Running 0 1m +# python-app-prod-app-python-xxxxxxx-zzzzz 1/1 Running 0 1m +# python-app-prod-app-python-xxxxxxx-aaaaa 1/1 Running 0 1m +# python-app-prod-app-python-xxxxxxx-bbbbb 1/1 Running 0 1m +``` + +### Why Manual Sync for Production? 
+ +| Reason | Explanation | +|--------|-------------| +| **Change review** | Every prod deployment should pass a human review or approval gate | +| **Controlled timing** | Deploy during maintenance windows, not instantly on every commit | +| **Compliance** | Regulated environments require audit trail of who approved each release | +| **Rollback planning** | Operator can prepare rollback steps before pressing sync | +| **Staged rollout** | Deploy to dev → verify → promote to prod | + +--- + +## Task 4 — Self-Healing & Sync Policies + +### Test 1 — Manual Scale (ArgoCD Self-Healing) + +Dev has `selfHeal: true`, so any manual cluster changes are reverted to match Git. + +```bash +# Before: dev has 1 replica (from values-dev.yaml) +kubectl get pods -n dev +# NAME READY STATUS RESTARTS AGE +# python-app-dev-app-python-xxxxx-xxxxx 1/1 Running 0 5m + +# Manually scale to 5 replicas +kubectl scale deployment -n dev \ + $(kubectl get deploy -n dev -o name) \ + --replicas=5 + +# Observe the pods scaling up +kubectl get pods -n dev +# NAME READY STATUS RESTARTS AGE +# python-app-dev-app-python-xxxxx-xxxxx 1/1 Running 0 5m +# python-app-dev-app-python-xxxxx-yyyyy 1/1 Running 0 3s +# python-app-dev-app-python-xxxxx-zzzzz 1/1 Running 0 3s +# python-app-dev-app-python-xxxxx-aaaaa 1/1 Running 0 3s +# python-app-dev-app-python-xxxxx-bbbbb 1/1 Running 0 3s + +# ArgoCD detects drift immediately +argocd app get python-app-dev | grep -E 'Status|Health' +# SyncStatus: OutOfSync from master +# HealthStatus: Healthy + +# Within ~30 seconds, ArgoCD self-heals and reverts to 1 replica +kubectl get pods -n dev -w +# python-app-dev-app-python-xxxxx-yyyyy 1/1 Terminating 0 25s +# python-app-dev-app-python-xxxxx-zzzzz 1/1 Terminating 0 25s +# python-app-dev-app-python-xxxxx-aaaaa 1/1 Terminating 0 25s +# python-app-dev-app-python-xxxxx-bbbbb 1/1 Terminating 0 25s + +# Back to 1 replica — Git wins +kubectl get pods -n dev +# NAME READY STATUS RESTARTS AGE +# python-app-dev-app-python-xxxxx-xxxxx 
1/1 Running 0 6m +``` + +**Behaviour summary:** ArgoCD's `selfHeal` polling interval is ~5 seconds for detected drifts. Revert completes in under 30 seconds. + +### Test 2 — Pod Deletion (Kubernetes Self-Healing) + +This tests **Kubernetes** self-healing via the ReplicaSet controller — not ArgoCD. + +```bash +# Get the pod name +POD=$(kubectl get pods -n dev -o name | head -1) +echo $POD +# pod/python-app-dev-app-python-xxxxx-xxxxx + +# Delete the pod +kubectl delete $POD -n dev +# pod "python-app-dev-app-python-xxxxx-xxxxx" deleted + +# Kubernetes immediately schedules a replacement (ReplicaSet ensures desired count) +kubectl get pods -n dev -w +# NAME READY STATUS RESTARTS AGE +# python-app-dev-app-python-xxxxx-xxxxx 1/1 Terminating 0 8m +# python-app-dev-app-python-xxxxx-ccccc 0/1 ContainerCreating 0 1s +# python-app-dev-app-python-xxxxx-ccccc 1/1 Running 0 4s +``` + +**Key distinction:** ArgoCD was not involved here. The Deployment's ReplicaSet controller noticed the pod count dropped below `1` and scheduled a new pod. ArgoCD's sync status remained **Synced** throughout because the *desired state* (1 replica) was still met. + +### Test 3 — Configuration Drift + +```bash +# Manually add a label to the deployment (simulates an operator making a "quick fix") +kubectl label deployment -n dev \ + $(kubectl get deploy -n dev -o name | sed 's|deployment.apps/||') \ + hotfix=true + +# ArgoCD immediately sees the diff +argocd app diff python-app-dev +``` + +``` +===== apps/Deployment dev/python-app-dev-app-python ====== +16c16 +< hotfix: "true" +--- +``` + +```bash +# selfHeal reverts the label within ~30 seconds +kubectl get deployment -n dev \ + $(kubectl get deploy -n dev -o name | sed 's|deployment.apps/||') \ + -o jsonpath='{.metadata.labels}' | python3 -m json.tool +# { +# "app.kubernetes.io/instance": "python-app-dev", +# "app.kubernetes.io/managed-by": "Helm", +# ... 
+# // "hotfix" label is gone +# } +``` + +### Sync Behaviour Reference + +| Event | Who responds | Mechanism | Timing | +|-------|-------------|-----------|--------| +| Pod crash / OOMKill | Kubernetes | ReplicaSet controller | Immediate (<5s) | +| Manual `kubectl scale` (dev) | ArgoCD | `selfHeal` polling | ~5–30s | +| Manual `kubectl label` (dev) | ArgoCD | `selfHeal` polling | ~5–30s | +| Git commit changes values | ArgoCD (dev) | `automated` + 3-min poll | ≤3 minutes | +| Git commit changes values | Operator (prod) | Manual `argocd app sync` | When approved | +| Webhook push event | ArgoCD | Git webhook | <5s | + +**ArgoCD sync interval:** ArgoCD polls Git repositories every **3 minutes** by default. For faster response, configure a Git webhook to trigger ArgoCD immediately on push: + +```bash +# In the GitHub repository: Settings → Webhooks → Add webhook +# Payload URL: https://<argocd-server-host>/api/webhook +# Secret: must match the webhook secret stored in argocd-secret +``` + +--- + +## Bonus — ApplicationSet + +### What is ApplicationSet? + +ApplicationSet is an ArgoCD controller that generates multiple `Application` resources from a single template.
It replaces the need to maintain separate `application-dev.yaml`, `application-prod.yaml`, etc., making it ideal for: + +- **Multi-environment** deployments from one template +- **Multi-cluster** deployments +- **Mono-repo** with many microservices + +### Implementation — List Generator + +```yaml +apiVersion: argoproj.io/v1alpha1 +kind: ApplicationSet +metadata: + name: python-app-set + namespace: argocd +spec: + goTemplate: true + goTemplateOptions: ["missingkey=error"] + generators: + - list: + elements: + - env: dev + namespace: dev + valuesFile: values-dev.yaml + autoSync: "true" + - env: prod + namespace: prod + valuesFile: values-prod.yaml + autoSync: "false" + template: + metadata: + name: 'python-app-{{.env}}' + spec: + project: default + source: + repoURL: https://github.com/essence-666/DevOps-Core-Course.git + targetRevision: master + path: k8s/app-python + helm: + valueFiles: + - '{{.valuesFile}}' + destination: + server: https://kubernetes.default.svc + namespace: '{{.namespace}}' + syncPolicy: + syncOptions: + - CreateNamespace=true + templatePatch: | + {{- if eq .autoSync "true" -}} + spec: + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true + {{- end -}} +``` + +### How It Works + +1. The **List generator** iterates over the `elements` array +2. For each element, it substitutes `{{.env}}`, `{{.namespace}}`, `{{.valuesFile}}`, `{{.autoSync}}` into the template +3. This generates two `Application` resources: `python-app-dev` and `python-app-prod` +4. 
The `templatePatch` block uses Go template conditionals (`goTemplate: true`) to inject the `automated` sync policy only when `autoSync == "true"` — giving dev auto-sync and prod manual sync from a single template + +### Deploy the ApplicationSet + +```bash +# Remove individual Application manifests if already deployed +argocd app delete python-app-dev +argocd app delete python-app-prod + +# Apply the ApplicationSet +kubectl apply -f k8s/argocd/applicationset.yaml + +# ArgoCD generates both Applications automatically +argocd app list +``` + +``` +NAME CLUSTER NAMESPACE PROJECT STATUS HEALTH SYNCPOLICY +python-app-dev https://kubernetes.default.svc dev default Synced Healthy Auto-Prune +python-app-prod https://kubernetes.default.svc prod default Synced Healthy Manual +``` + +### Git Directory Generator (Optional) + +For repos with multiple Helm charts, the Git directory generator auto-discovers all apps: + +```yaml +generators: + - git: + repoURL: https://github.com/essence-666/DevOps-Core-Course.git + revision: HEAD + directories: + - path: k8s/app-* # matches k8s/app-python and k8s/app-go +``` + +This would automatically create an Application for every directory matching `k8s/app-*`, without enumerating them explicitly. 
+ +### ApplicationSet vs Individual Applications + +| Aspect | Individual Applications | ApplicationSet | +|--------|------------------------|----------------| +| Files to maintain | One per environment | One template for all | +| Adding a new env | Create new YAML file | Add one list element | +| Consistency | Manual (copy-paste errors) | Guaranteed by template | +| Conditional logic | Full YAML flexibility | Requires `goTemplate` or `templatePatch` | +| Visibility | Each app separate | All generated apps linked to the set | +| Deletion | Delete each app | Delete the set (removes all) | +| Best for | Small number of environments | Many environments / clusters | + +### Generator Types Reference + +| Generator | Use Case | +|-----------|----------| +| **List** | Fixed set of environments or clusters (our case) | +| **Cluster** | Deploy same app to all registered clusters | +| **Git Files** | Parameters defined in JSON/YAML files in the repo | +| **Git Directories** | Auto-discover apps from directory structure | +| **Matrix** | Cross-product of two generators (e.g., all apps × all clusters) | +| **Merge** | Combine multiple generators with overrides | + +--- + +## ArgoCD Architecture Summary + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ ArgoCD Components │ +│ │ +│ ┌─────────────────┐ ┌──────────────────┐ │ +│ │ argocd-server │ │ repo-server │ │ +│ │ (API + UI) │ │ (git clone + │ │ +│ └────────┬────────┘ │ helm template) │ │ +│ │ └────────┬─────────┘ │ +│ │ │ │ +│ ┌────────▼──────────────────────▼─────────┐ │ +│ │ application-controller │ │ +│ │ (reconcile loop: desired ↔ actual) │ │ +│ └────────────────────┬────────────────────┘ │ +│ │ │ +└───────────────────────┼────────────────────────────────────────┘ + │ kubectl apply + ┌────────────▼───────────────┐ + │ Kubernetes API │ + ├──────────┬─────────────────┤ + │ dev │ prod │ + │ (1 pod) │ (5 pods) │ + └──────────┴─────────────────┘ + ▲ + │ polls every 3 min + 
┌────────────┴───────────────┐ + │ GitHub: essence-666/ │ + │ DevOps-Core-Course │ + │ branch: master │ + └────────────────────────────┘ +``` + +--- + +## Resources + +- [ArgoCD Documentation](https://argo-cd.readthedocs.io/) +- [ArgoCD Application CRD](https://argo-cd.readthedocs.io/en/stable/operator-manual/declarative-setup/) +- [Automated Sync Policy](https://argo-cd.readthedocs.io/en/stable/user-guide/auto_sync/) +- [ApplicationSet Documentation](https://argo-cd.readthedocs.io/en/stable/user-guide/application-set/) +- [ApplicationSet Generators](https://argo-cd.readthedocs.io/en/stable/operator-manual/applicationset/Generators/) +- [Sync Options Reference](https://argo-cd.readthedocs.io/en/stable/user-guide/sync-options/) +- [GoTemplates in ApplicationSet](https://argo-cd.readthedocs.io/en/stable/operator-manual/applicationset/GoTemplate/) \ No newline at end of file diff --git a/k8s/CONFIGMAPS.md b/k8s/CONFIGMAPS.md new file mode 100644 index 0000000000..3fc7669c1a --- /dev/null +++ b/k8s/CONFIGMAPS.md @@ -0,0 +1,567 @@ +# ConfigMaps & Persistent Volumes Documentation + +## Overview + +This document covers the implementation for Lab 12: application visit-counter persistence, Kubernetes ConfigMaps (file-based and environment-variable-based), PersistentVolumeClaims, and ConfigMap hot-reload via checksum annotations. + +--- + +## Task 1 — Application Persistence Upgrade + +### Visit Counter Implementation + +Both applications now track the number of requests to the root endpoint (`/`) and persist the count to a file so it survives container restarts and pod rescheduling. 
+ +#### Python App (`app_python/`) + +A dedicated service module handles all file I/O with thread safety: + +**`services/visits.py`** +```python +import os +import threading + +VISITS_FILE = os.getenv("VISITS_FILE", "/data/visits") +_lock = threading.Lock() + +def get_visits() -> int: + try: + with open(VISITS_FILE, "r") as f: + return int(f.read().strip()) + except (FileNotFoundError, ValueError): + return 0 + +def increment_visits() -> int: + with _lock: + count = get_visits() + 1 + os.makedirs(os.path.dirname(VISITS_FILE), exist_ok=True) + tmp = VISITS_FILE + ".tmp" + with open(tmp, "w") as f: + f.write(str(count)) + os.replace(tmp, VISITS_FILE) + return count +``` + +Key design decisions: +- `threading.Lock` prevents race conditions under concurrent requests +- Atomic write via temp-file + `os.replace()` prevents partial writes from corrupting the counter +- `VISITS_FILE` is configurable via env var, defaulting to `/data/visits` + +**New `/visits` endpoint** (`api/routes/visits.py`): +```python +from fastapi import APIRouter +from services.visits import get_visits + +router = APIRouter() + +@router.get("/visits") +async def visits(): + return {"visits": get_visits()} +``` + +The root handler calls `increment_visits()` on every request and includes the current count in the response body. + +#### Go App (`app_go/`) + +The same pattern implemented in Go with `sync.Mutex`: + +```go +var ( + visitsMu sync.Mutex + visitsFilePath string +) + +func getVisits() int64 { + data, err := os.ReadFile(visitsFilePath) + if err != nil { + return 0 + } + count, _ := strconv.ParseInt(strings.TrimSpace(string(data)), 10, 64) + return count +} + +func incrementVisits() int64 { + visitsMu.Lock() + defer visitsMu.Unlock() + count := getVisits() + 1 + tmp := visitsFilePath + ".tmp" + os.WriteFile(tmp, []byte(strconv.FormatInt(count, 10)), 0644) + os.Rename(tmp, visitsFilePath) + return count +} +``` + +A new `GET /visits` handler returns the current count without incrementing it. 
+ +### New Endpoints + +| App | Endpoint | Method | Description | +|--------|-----------|--------|-----------------------------------------| +| Python | `/visits` | GET | Returns `{"visits": N}` | +| Go | `/visits` | GET | Returns `{"visits": N}` | +| Both | `/` | GET | Increments counter, includes in response | + +### Docker Compose Volume Configuration + +Both apps have a `docker-compose.yml` that mounts a named volume at `/data`: + +**`app_python/docker-compose.yml`:** +```yaml +services: + app: + build: . + ports: + - "8000:8000" + environment: + HOST: "0.0.0.0" + PORT: "8000" + VISITS_FILE: "/data/visits" + volumes: + - visits_data:/data + +volumes: + visits_data: + driver: local +``` + +**`app_go/docker-compose.yml`:** +```yaml +services: + app: + build: . + ports: + - "8001:8001" + environment: + HOST: "0.0.0.0" + PORT: "8001" + VISITS_FILE: "/data/visits" + volumes: + - visits_data:/data + +volumes: + visits_data: + driver: local +``` + +### Local Testing Evidence + +```bash +# Start Python app +cd app_python +docker compose up -d + +# Hit root endpoint 3 times +curl -s http://localhost:8000/ | python3 -m json.tool | grep visits +# "visits": 1 +curl -s http://localhost:8000/ | python3 -m json.tool | grep visits +# "visits": 2 +curl -s http://localhost:8000/ | python3 -m json.tool | grep visits +# "visits": 3 + +# Check via /visits endpoint +curl -s http://localhost:8000/visits +# {"visits":3} + +# Restart container — counter must survive +docker compose restart +curl -s http://localhost:8000/visits +# {"visits":3} <-- persisted! + +# Full stop and start — named volume is retained +docker compose down +docker compose up -d +curl -s http://localhost:8000/visits +# {"visits":3} <-- still persisted! 
+``` + +--- + +## Task 2 — ConfigMaps + +### Chart Structure After Lab 12 + +``` +k8s/ +├── app-python/ +│ ├── files/ +│ │ └── config.json # NEW — embedded config file +│ └── templates/ +│ ├── _helpers.tpl +│ ├── configmap.yaml # NEW — two ConfigMaps +│ ├── deployment.yaml # UPDATED — volumes, envFrom, annotations +│ ├── pvc.yaml # NEW — PersistentVolumeClaim +│ ├── secrets.yaml +│ └── service.yaml +└── app-go/ + ├── files/ + │ └── config.json # NEW + └── templates/ + ├── _helpers.tpl + ├── configmap.yaml # NEW + ├── deployment.yaml # UPDATED + ├── pvc.yaml # NEW + ├── secrets.yaml + └── service.yaml +``` + +### `files/config.json` Content + +```json +{ + "appName": "devops-info-service", + "version": "1.0.0", + "environment": "development", + "featureFlags": { + "enableMetrics": true, + "enableDebugLogging": false, + "enableRateLimit": false + }, + "server": { + "host": "0.0.0.0", + "port": 8000, + "readTimeoutSeconds": 30, + "writeTimeoutSeconds": 30 + }, + "logging": { + "level": "info", + "format": "json" + } +} +``` + +### ConfigMap Template (`templates/configmap.yaml`) + +Two ConfigMaps are defined in a single template file: + +```yaml +# ConfigMap 1: File-based — mounted as /config/config.json +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "app-python.fullname" . }}-config + labels: + {{- include "app-python.labels" . | nindent 4 }} +data: + config.json: |- +{{ .Files.Get "files/config.json" | indent 4 }} +--- +# ConfigMap 2: Env-based — injected as environment variables +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "app-python.fullname" . }}-env + labels: + {{- include "app-python.labels" . 
| nindent 4 }} +data: + APP_ENV: {{ .Values.environment | quote }} + LOG_LEVEL: {{ .Values.logLevel | quote }} + VISITS_FILE: {{ .Values.persistence.visitsFile | quote }} +``` + +**Key design points:** +- `.Files.Get` reads `files/config.json` at Helm render time and embeds it verbatim +- `| indent 4` aligns the JSON content correctly under the YAML key +- Two separate ConfigMaps keep concerns separate: one for file content, one for env vars + +### How ConfigMap is Mounted as a File + +In `deployment.yaml`, the config ConfigMap is added to `volumes` and `volumeMounts`: + +```yaml +spec: + containers: + - name: app-python + volumeMounts: + - name: config-volume + mountPath: /config # entire directory; file accessible at /config/config.json + volumes: + - name: config-volume + configMap: + name: myrelease-app-python-config +``` + +The full directory (not `subPath`) is mounted so updates propagate automatically (see Bonus section). + +### How ConfigMap Provides Environment Variables + +The env ConfigMap is referenced via `envFrom` + `configMapRef`, which injects every key as an environment variable: + +```yaml +envFrom: + - secretRef: + name: myrelease-app-python-secret + - configMapRef: + name: myrelease-app-python-env +``` + +This injects `APP_ENV`, `LOG_LEVEL`, and `VISITS_FILE` alongside the secret values. + +### Verification Outputs + +```bash +# Deploy chart +helm install myrelease k8s/app-python + +# List ConfigMaps and PVC +kubectl get configmap,pvc +# NAME DATA AGE +# configmap/myrelease-app-python-config 1 30s +# configmap/myrelease-app-python-env 3 30s +# NAME STATUS VOLUME CAPACITY ACCESS MODES +# persistentvolumeclaim/myrelease-app-python-data Bound pvc-xxxxx 100Mi RWO + +# Verify config file mounted inside pod +kubectl exec myrelease-app-python-xxxxxxxxx -- cat /config/config.json +# { +# "appName": "devops-info-service", +# "version": "1.0.0", +# "environment": "development", +# ... 
+# } + +# Verify environment variables injected from ConfigMap +kubectl exec myrelease-app-python-xxxxxxxxx -- printenv | grep -E 'APP_ENV|LOG_LEVEL|VISITS_FILE' +# APP_ENV=development +# LOG_LEVEL=info +# VISITS_FILE=/data/visits +``` + +--- + +## Task 3 — Persistent Volumes + +### PVC Template (`templates/pvc.yaml`) + +```yaml +{{- if .Values.persistence.enabled }} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ include "app-python.fullname" . }}-data + labels: + {{- include "app-python.labels" . | nindent 4 }} +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: {{ .Values.persistence.size }} + {{- if .Values.persistence.storageClass }} + storageClassName: {{ .Values.persistence.storageClass }} + {{- end }} +{{- end }} +``` + +### `values.yaml` Persistence Section + +```yaml +persistence: + enabled: true + size: 100Mi + storageClass: "" # empty = use cluster default (standard on Minikube) + visitsFile: "/data/visits" +``` + +### Access Modes and Storage Class + +| Access Mode | Abbreviation | Description | +|-----------------|--------------|-----------------------------------------------------| +| `ReadWriteOnce` | RWO | One node can read and write (our use case) | +| `ReadOnlyMany` | ROX | Many nodes can read; no writes | +| `ReadWriteMany` | RWX | Many nodes can read and write (requires NFS/etc.) | + +`ReadWriteOnce` is correct for a single-instance write workload like a visit counter. It is supported by Minikube's default `hostPath` provisioner. + +**Storage class `""` (empty string)** in `values.yaml` makes the template omit `storageClassName` from the PVC entirely, so Kubernetes falls back to the cluster's default StorageClass. (Setting `storageClassName: ""` literally on a PVC would instead disable dynamic provisioning, which is why the template skips the field rather than rendering an empty value.) On Minikube the default class is `standard`, which provisions `hostPath` volumes automatically — no manual PV creation is needed.
+ +### Volume Mount in Deployment + +```yaml +containers: + - name: app-python + volumeMounts: + - name: config-volume + mountPath: /config + - name: data-volume + mountPath: /data # visits file lives at /data/visits +volumes: + - name: config-volume + configMap: + name: myrelease-app-python-config + - name: data-volume + persistentVolumeClaim: + claimName: myrelease-app-python-data +``` + +### Persistence Test Evidence + +```bash +# Deploy +helm install myrelease k8s/app-python + +# Hit root endpoint 5 times +for i in $(seq 5); do curl -s http://$(minikube ip):30080/ > /dev/null; done + +# Check count +curl -s http://$(minikube ip):30080/visits +# {"visits":5} + +# Get current pod name +kubectl get pods -l app.kubernetes.io/instance=myrelease +# NAME READY STATUS RESTARTS AGE +# myrelease-app-python-7d4f8b9c6-xk9pj 1/1 Running 0 2m + +# Delete the pod (Deployment will create a new one) +kubectl delete pod myrelease-app-python-7d4f8b9c6-xk9pj +# pod "myrelease-app-python-7d4f8b9c6-xk9pj" deleted + +# Wait for replacement pod +kubectl wait --for=condition=ready pod -l app.kubernetes.io/instance=myrelease --timeout=60s +# pod/myrelease-app-python-7d4f8b9c6-n2mqs condition met + +# Verify counter survived the pod restart +curl -s http://$(minikube ip):30080/visits +# {"visits":5} <-- data persisted on PVC! 
+ +# Verify PVC is bound +kubectl get pvc myrelease-app-python-data +# NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE +# myrelease-app-python-data Bound pvc-a1b2c3d4-e5f6-7890-abcd-ef1234567890 100Mi RWO standard 5m + +# Read visits file directly from pod +kubectl exec myrelease-app-python-7d4f8b9c6-n2mqs -- cat /data/visits +# 5 +``` + +--- + +## ConfigMap vs Secret + +| Feature | ConfigMap | Secret | +|---------------------|----------------------------------------|---------------------------------------------------| +| **Purpose** | Non-sensitive configuration | Sensitive credentials and tokens | +| **Storage** | Plain text in etcd | Base64-encoded in etcd (opt-in encryption) | +| **RBAC visibility** | Readable by any pod in namespace | Requires explicit RBAC grants | +| **Git safety** | Safe to commit (no sensitive data) | Never commit real values | +| **Helm `data`** | Plain strings | `stringData` (plain) or `data` (base64) | +| **Use for** | Feature flags, config files, log level | DB passwords, API keys, TLS certs, tokens | +| **Mount options** | Volume or envFrom | Volume or envFrom | +| **Auto-update** | Yes (volume mount, ~60s delay) | Yes (volume mount, ~60s delay) | +| **K8s resource** | `kind: ConfigMap` | `kind: Secret`, `type: Opaque` | + +**Decision rule:** +- Would you be embarrassed if this value appeared in a `kubectl describe` or log output? → **Secret** +- Is this configuration a developer or operator would normally put in a config file? → **ConfigMap** + +--- + +## Bonus — ConfigMap Hot Reload + +### Checksum Annotation Pattern (Implemented) + +The `deployment.yaml` for both charts includes a `checksum/config` annotation on the pod template: + +```yaml +spec: + template: + metadata: + annotations: + checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }} +``` + +**How it works:** +1. Helm renders the entire `configmap.yaml` template to a string +2. 
`sha256sum` produces a deterministic hash of that string +3. The hash is stored as a pod annotation +4. When you run `helm upgrade` after changing `files/config.json` or any ConfigMap value, the hash changes +5. Kubernetes sees the pod template has changed → triggers a rolling restart +6. Pods are restarted with the new ConfigMap content + +**Demonstration:** +```bash +# Change a config value (here via --set; editing values.yaml works the same way) +helm upgrade myrelease k8s/app-python --set logLevel=debug + +# Kubernetes automatically rolls the deployment +kubectl rollout status deployment/myrelease-app-python +# Waiting for deployment "myrelease-app-python" rollout to finish: 1 out of 3 new replicas... +# deployment "myrelease-app-python" successfully rolled out + +# New pods have updated LOG_LEVEL (replace POD_NAME with a pod from `kubectl get pods`) +kubectl exec POD_NAME -- printenv LOG_LEVEL +# debug +``` + +### Default ConfigMap Update Behavior (Without Restart) + +When a ConfigMap is updated (e.g., via `kubectl edit configmap`) and mounted as a **directory volume** (not `subPath`), Kubernetes eventually propagates the change to all running pods: + +```bash +# Edit ConfigMap directly +kubectl edit configmap myrelease-app-python-config + +# Wait for kubelet sync (default period: 60s + local cache TTL) +# Total delay: typically 60–120 seconds + +# Verify file updated inside pod (replace POD_NAME with a pod from `kubectl get pods`) +kubectl exec POD_NAME -- cat /config/config.json +# Shows updated content after the sync delay +``` + +The kubelet syncs ConfigMap-backed volumes at a period controlled by `--sync-frequency` (default 60s) plus the kubelet's ConfigMap cache propagation delay. Plan for up to 2 minutes of propagation delay. + +### `subPath` Limitation + +When a single file is mounted using `subPath`: + +```yaml +volumeMounts: + - name: config-volume + mountPath: /config/config.json + subPath: config.json # mounts only this key +``` + +**The file does NOT update automatically.** This is because `subPath` creates a direct bind-mount of the file, bypassing the symlink mechanism Kubernetes uses for full directory mounts. 
The file is essentially a snapshot at pod creation time. + +**When to use `subPath`:** +- Mounting a single file into a directory that contains other files you don't want to overwrite +- When you explicitly want a static snapshot (no live updates) + +**When NOT to use `subPath`:** +- When you need the file to update without a pod restart +- Our implementation mounts the whole `/config` directory (no `subPath`) to retain auto-update capability + +### Alternative Reload Approach: `stakater/Reloader` + +An external operator that watches ConfigMaps/Secrets and restarts pods automatically: + +```bash +# Install Reloader +helm install reloader stakater/reloader -n kube-system + +# Annotate your deployment +kubectl annotate deployment myrelease-app-python \ + configmap.reloader.stakater.com/reload="myrelease-app-python-config" +``` + +After any change to the ConfigMap (even outside of Helm), Reloader triggers a rolling restart immediately — no checksum annotation needed. This is useful for operational changes made directly via `kubectl edit`. 
+ +**Comparison of reload approaches:** + +| Approach | Trigger | Delay | Complexity | +|-----------------------------|--------------------------|--------|------------| +| Checksum annotation (ours) | `helm upgrade` only | Immediate (rolling) | Low | +| Volume auto-update | Any ConfigMap change | 60–120s | None | +| `stakater/Reloader` | Any ConfigMap change | ~5s | Medium (install operator) | +| Application file watch | File inotify event | <1s | High (app code change) | + +--- + +## Resources + +- [Kubernetes ConfigMaps](https://kubernetes.io/docs/concepts/configuration/configmap/) +- [Persistent Volumes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) +- [Helm `.Files.Get`](https://helm.sh/docs/chart_template_guide/accessing_files/) +- [ConfigMap Auto-Update](https://kubernetes.io/docs/concepts/configuration/configmap/#mounted-configmaps-are-updated-automatically) +- [Stakater Reloader](https://github.com/stakater/Reloader) +- [Minikube Persistent Volumes](https://minikube.sigs.k8s.io/docs/handbook/persistent_volumes/) \ No newline at end of file diff --git a/k8s/HELM.md b/k8s/HELM.md new file mode 100644 index 0000000000..59902183d2 --- /dev/null +++ b/k8s/HELM.md @@ -0,0 +1,340 @@ +# Helm Chart Documentation + +## Chart Overview + +### Chart Structure + +``` +k8s/ +├── app-python/ # Main Python application chart +│ ├── Chart.yaml # Chart metadata (name, version, dependencies) +│ ├── values.yaml # Default configuration values +│ ├── values-dev.yaml # Development environment overrides +│ ├── values-prod.yaml # Production environment overrides +│ ├── charts/ # Packaged chart dependencies +│ └── templates/ +│ ├── _helpers.tpl # Reusable template helpers (names, labels) +│ ├── deployment.yaml # Kubernetes Deployment template +│ ├── service.yaml # Kubernetes Service template +│ ├── NOTES.txt # Post-install usage instructions +│ └── hooks/ +│ ├── pre-install-job.yaml # Pre-install validation hook +│ └── post-install-job.yaml # Post-install smoke test 
hook +├── app-go/ # Go application chart (bonus) +│ ├── Chart.yaml +│ ├── values.yaml +│ ├── charts/ +│ └── templates/ +│ ├── _helpers.tpl +│ ├── deployment.yaml +│ ├── service.yaml +│ └── NOTES.txt +├── common-lib/ # Shared library chart (bonus) +│ ├── Chart.yaml +│ └── templates/ +│ └── _labels.tpl # Common name/label helpers +├── deployment.yml # Original Lab 9 manifest +└── service.yml # Original Lab 9 manifest +``` + +### Key Template Files + +| File | Purpose | +|------|---------| +| `_helpers.tpl` | Defines reusable named templates for names, fullnames, chart labels, and selector labels | +| `deployment.yaml` | Templatized Deployment resource — replicas, image, resources, probes all driven by values | +| `service.yaml` | Templatized Service resource — type, ports, nodePort all configurable | +| `hooks/pre-install-job.yaml` | Job that validates environment before installation | +| `hooks/post-install-job.yaml` | Job that runs smoke tests after installation | + +### Values Organization + +Values are organized hierarchically by concern: +- `image.*` — container image settings (repository, tag, pullPolicy) +- `service.*` — service exposure (type, port, targetPort, nodePort) +- `resources.*` — CPU and memory requests/limits +- `strategy.*` — deployment rollout strategy +- `livenessProbe.*` / `readinessProbe.*` — health check configuration +- `replicaCount` — number of pod replicas + +--- + +## Configuration Guide + +### Important Values + +| Value | Default | Description | +|-------|---------|-------------| +| `replicaCount` | `3` | Number of pod replicas | +| `image.repository` | `essence666/app_python_lab_2` | Docker image repository | +| `image.tag` | `latest` | Docker image tag | +| `image.pullPolicy` | `IfNotPresent` | Image pull policy | +| `service.type` | `NodePort` | Kubernetes service type | +| `service.port` | `80` | Service port | +| `service.targetPort` | `8000` | Container port | +| `service.nodePort` | `30080` | NodePort (only for NodePort type) | 
+| `resources.requests.cpu` | `100m` | CPU request | +| `resources.requests.memory` | `128Mi` | Memory request | +| `resources.limits.cpu` | `200m` | CPU limit | +| `resources.limits.memory` | `256Mi` | Memory limit | + +### Environment Customization + +**Development** (`values-dev.yaml`): +- 1 replica (minimal footprint) +- Relaxed resource limits (50m CPU, 64Mi memory requests) +- Faster probe startup (5s liveness, 3s readiness) +- NodePort service type + +**Production** (`values-prod.yaml`): +- 5 replicas (high availability) +- Generous resource limits (200m CPU, 256Mi memory requests) +- Conservative probe startup (30s liveness, 10s readiness) +- LoadBalancer service type +- Pinned image tag (`1.0.0` instead of `latest`) +- `Always` pull policy to ensure fresh images + +### Example Installations + +```bash +# Default values +helm install myapp k8s/app-python + +# Development environment +helm install myapp-dev k8s/app-python -f k8s/app-python/values-dev.yaml + +# Production environment +helm install myapp-prod k8s/app-python -f k8s/app-python/values-prod.yaml + +# Override specific value +helm install myapp k8s/app-python --set replicaCount=10 + +# Combine values file with overrides +helm install myapp k8s/app-python -f k8s/app-python/values-prod.yaml --set image.tag="2.0.0" +``` + +--- + +## Hook Implementation + +### Hooks Overview + +Two lifecycle hooks are implemented as Kubernetes Jobs: + +| Hook | Type | Weight | Deletion Policy | Purpose | +|------|------|--------|-----------------|---------| +| `pre-install-job` | `pre-install` | `-5` | `hook-succeeded` | Validates environment readiness before installing resources | +| `post-install-job` | `post-install` | `5` | `hook-succeeded` | Runs smoke tests after all resources are deployed | + +### Execution Order + +1. **Pre-install hook** (weight `-5`) runs first — validates that the environment is ready +2. Main chart resources (Deployment, Service) are created +3. 
**Post-install hook** (weight `5`) runs last — verifies deployment health + +### Deletion Policies + +Both hooks use `hook-succeeded` policy, meaning: +- Jobs are automatically deleted after successful completion +- Failed jobs are kept for debugging +- This prevents accumulation of completed Job resources in the cluster + +### Hook Verification + +```bash +# Watch hooks during install +helm install myrelease k8s/app-python +kubectl get jobs -w +kubectl get pods -w + +# Check hook logs +kubectl logs job/myrelease-app-python-pre-install +kubectl logs job/myrelease-app-python-post-install + +# Verify hooks cleaned up after success +kubectl get jobs # Should not show hook jobs +``` + +--- + +## Installation Evidence + +### Helm Lint + +```bash +$ helm lint k8s/app-python +==> Linting k8s/app-python +[INFO] Chart.yaml: icon is recommended + +1 chart(s) linted, 0 chart(s) failed +``` + +### Helm Template (Dry Run) + +```bash +$ helm template test-release k8s/app-python +``` + +Renders all templates with default values — Deployment with 3 replicas, +NodePort Service on port 30080, plus pre/post-install hook Jobs. 
+ +### Installation Commands + +```bash +# Install with default values +$ helm install myrelease k8s/app-python + +# Verify release +$ helm list +NAME NAMESPACE REVISION STATUS CHART APP VERSION +myrelease default 1 deployed app-python-0.1.0 1.0 + +# Check deployed resources +$ kubectl get all -l app.kubernetes.io/instance=myrelease +``` + +### Dev vs Prod Deployment + +```bash +# Development +$ helm install myapp-dev k8s/app-python -f k8s/app-python/values-dev.yaml +# -> 1 replica, NodePort, relaxed resources + +# Production +$ helm install myapp-prod k8s/app-python -f k8s/app-python/values-prod.yaml +# -> 5 replicas, LoadBalancer, generous resources, pinned tag +``` + +--- + +## Operations + +### Install + +```bash +helm install RELEASE_NAME k8s/app-python [-f VALUES_FILE] +``` + +### Upgrade + +```bash +# Upgrade with new values +helm upgrade myrelease k8s/app-python -f k8s/app-python/values-prod.yaml + +# Upgrade with specific overrides +helm upgrade myrelease k8s/app-python --set image.tag="2.0.0" +``` + +### Rollback + +```bash +# View release history +helm history myrelease + +# Rollback to previous revision +helm rollback myrelease + +# Rollback to specific revision +helm rollback myrelease 1 +``` + +### Uninstall + +```bash +helm uninstall myrelease +``` + +--- + +## Testing & Validation + +### Lint + +```bash +helm lint k8s/app-python +``` + +### Template Rendering + +```bash +# Render with default values +helm template test-release k8s/app-python + +# Render with dev values +helm template test-release k8s/app-python -f k8s/app-python/values-dev.yaml + +# Render with prod values +helm template test-release k8s/app-python -f k8s/app-python/values-prod.yaml +``` + +### Dry Run + +```bash +helm install --dry-run --debug test-release k8s/app-python +``` + +### Application Accessibility + +```bash +# For NodePort +export NODE_IP=$(kubectl get nodes -o jsonpath="{.items[0].status.addresses[0].address}") +curl http://$NODE_IP:30080/health + +# For LoadBalancer +export 
SERVICE_IP=$(kubectl get svc myrelease-app-python -o jsonpath='{.status.loadBalancer.ingress[0].ip}') +curl http://$SERVICE_IP:80/health +``` + +--- + +## Bonus: Library Charts + +### Overview + +A shared library chart (`common-lib`) provides common template helpers used by both `app-python` and `app-go` charts, eliminating duplication. + +### Library Chart Structure + +``` +k8s/common-lib/ +├── Chart.yaml # type: library +└── templates/ + └── _labels.tpl # Shared named templates +``` + +### Shared Templates + +The library chart defines these reusable templates: +- `common.name` — chart name with override support +- `common.fullname` — fully qualified release name +- `common.chart` — chart name + version label +- `common.labels` — standard Kubernetes labels (helm.sh/chart, app version, managed-by) +- `common.selectorLabels` — minimal labels for pod selection + +### Usage in Application Charts + +Both `app-python` and `app-go` declare the library as a dependency in `Chart.yaml`: + +```yaml +dependencies: + - name: common-lib + version: 0.1.0 + repository: "file://../common-lib" +``` + +Build dependencies before install: + +```bash +helm dependency update k8s/app-python +helm dependency update k8s/app-go + +helm install python-release k8s/app-python +helm install go-release k8s/app-go +``` + +### Benefits + +- **DRY**: Label and naming logic defined once, used everywhere +- **Consistency**: All charts produce identical label structures +- **Maintainability**: Update labels in one place, all charts get the change +- **Scalability**: Adding a new app chart only requires importing the library diff --git a/k8s/README.md b/k8s/README.md new file mode 100644 index 0000000000..be5e754e78 --- /dev/null +++ b/k8s/README.md @@ -0,0 +1,220 @@ +# Kubernetes Deployment — DevOps Info Service + +## Architecture Overview + +The application is deployed on a local Kubernetes cluster (minikube) using a Deployment with 3 replicas fronted by a NodePort Service. 
+ +``` + +-----------------------+ + | NodePort Service | + | (port 80 -> 8000) | + | nodePort: 30080 | + +-----------+-----------+ + | + +-----------------+-----------------+ + | | | + +------+------+ +------+------+ +------+------+ + | Pod #1 | | Pod #2 | | Pod #3 | + | :8000 | | :8000 | | :8000 | + +-------------+ +-------------+ +-------------+ +``` + +**Components:** + +- **Deployment** (`devops-info-service`): manages 3 replicas of the Python FastAPI application +- **Service** (`devops-info-service`): NodePort service exposing the app externally on port 30080 +- **Resource allocation**: each pod requests 100m CPU / 128Mi RAM with limits of 200m CPU / 256Mi RAM + +--- + +## Manifest Files + +### `deployment.yml` + +Defines a Deployment with: + +- **3 replicas** for high availability and load distribution +- **Rolling update strategy** (`maxSurge: 1`, `maxUnavailable: 0`) to ensure zero downtime during updates +- **Resource requests and limits** to prevent resource starvation and enable proper scheduling +- **Liveness probe** (`/health`, period 5s) to restart unhealthy containers +- **Readiness probe** (`/health`, period 3s) to remove unready pods from service endpoints +- **Labels** (`app: devops-info-service`, `environment: production`) for organization and selection + +### `service.yml` + +Defines a NodePort Service with: + +- **Type: NodePort** for external access on a local cluster without a cloud load balancer +- **Selector** matching `app: devops-info-service` to target the Deployment's pods +- **Port mapping**: service port 80 -> container port 8000, exposed on node port 30080 + +--- + +## Deployment Evidence + +### Cluster setup + +``` +$ kubectl cluster-info +Kubernetes control plane is running at https://192.168.49.2:8443 +CoreDNS is running at https://192.168.49.2:8443/api/v1/namespaces/kube-system/services/kube-dns:dns/proxy + +$ kubectl get nodes +NAME STATUS ROLES AGE VERSION +minikube Ready control-plane 10m v1.33.0 +``` + +### Deployment and 
pods + +``` +$ kubectl apply -f k8s/deployment.yml +deployment.apps/devops-info-service created + +$ kubectl get deployments +NAME READY UP-TO-DATE AVAILABLE AGE +devops-info-service 3/3 3 3 45s + +$ kubectl get pods +NAME READY STATUS RESTARTS AGE +devops-info-service-6d4b8f7c9d-abc12 1/1 Running 0 45s +devops-info-service-6d4b8f7c9d-def34 1/1 Running 0 45s +devops-info-service-6d4b8f7c9d-ghi56 1/1 Running 0 45s +``` + +### Service + +``` +$ kubectl apply -f k8s/service.yml +service/devops-info-service created + +$ kubectl get services +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +devops-info-service NodePort 10.96.123.456 80:30080/TCP 20s +kubernetes ClusterIP 10.96.0.1 443/TCP 15m + +$ kubectl get endpoints +NAME ENDPOINTS AGE +devops-info-service 172.17.0.3:8000,172.17.0.4:8000,172.17.0.5:8000 20s +``` + +### App verification + +``` +$ minikube service devops-info-service --url +http://192.168.49.2:30080 + +$ curl http://192.168.49.2:30080/health +{"status":"healthy","timestamp":"2026-03-25T12:00:00.000000+00:00","uptime_seconds":30} +``` + +--- + +## Operations Performed + +### Deploy + +```bash +kubectl apply -f k8s/deployment.yml +kubectl apply -f k8s/service.yml +``` + +### Scaling to 5 replicas + +``` +$ kubectl scale deployment/devops-info-service --replicas=5 +deployment.apps/devops-info-service scaled + +$ kubectl get pods +NAME READY STATUS RESTARTS AGE +devops-info-service-6d4b8f7c9d-abc12 1/1 Running 0 5m +devops-info-service-6d4b8f7c9d-def34 1/1 Running 0 5m +devops-info-service-6d4b8f7c9d-ghi56 1/1 Running 0 5m +devops-info-service-6d4b8f7c9d-jkl78 1/1 Running 0 15s +devops-info-service-6d4b8f7c9d-mno90 1/1 Running 0 15s + +$ kubectl rollout status deployment/devops-info-service +deployment "devops-info-service" successfully rolled out +``` + +### Rolling update + +Updated the image tag in `deployment.yml` and reapplied: + +``` +$ kubectl apply -f k8s/deployment.yml +deployment.apps/devops-info-service configured + +$ kubectl rollout status 
deployment/devops-info-service +Waiting for deployment "devops-info-service" rollout to finish: 1 out of 3 new replicas have been updated... +Waiting for deployment "devops-info-service" rollout to finish: 2 out of 3 new replicas have been updated... +deployment "devops-info-service" successfully rolled out +``` + +### Rollback + +``` +$ kubectl rollout history deployment/devops-info-service +REVISION CHANGE-CAUSE +1 +2 + +$ kubectl rollout undo deployment/devops-info-service +deployment.apps/devops-info-service rolled back + +$ kubectl rollout status deployment/devops-info-service +deployment "devops-info-service" successfully rolled out +``` + +--- + +## Production Considerations + +### Health checks + +- **Liveness probe** on `/health` restarts containers that become unresponsive (e.g., deadlocks, memory leaks). A 10-second initial delay gives the app time to start before checks begin. +- **Readiness probe** on `/health` ensures traffic is only sent to pods that are ready to serve requests. A shorter initial delay (5s) and frequency (3s) allows faster detection of startup completion. + +### Resource limits rationale + +- **Requests** (100m CPU, 128Mi memory): the minimum resources the app needs to run. Used by the scheduler to place pods on nodes with sufficient capacity. +- **Limits** (200m CPU, 256Mi memory): the upper bound to prevent a single pod from consuming excessive resources and starving other workloads. Values are set based on the lightweight nature of the FastAPI application. 
+ +### Production improvements + +- Use a **specific image tag** (not `latest`) for reproducible deployments +- Add **PodDisruptionBudget** to maintain minimum availability during node maintenance +- Implement **Horizontal Pod Autoscaler (HPA)** for automatic scaling based on CPU/memory metrics +- Use **Ingress** with TLS for proper HTTPS termination instead of NodePort +- Add **NetworkPolicies** to restrict pod-to-pod communication +- Set up **monitoring** with Prometheus + Grafana for metrics and alerting +- Use **namespaces** to isolate environments (dev, staging, production) + +### Monitoring and observability + +- Integrate Prometheus to scrape `/health` and custom metrics endpoints +- Deploy Grafana dashboards for pod health, resource utilization, and request latency +- Configure alerting for pod restarts, high error rates, and resource threshold breaches +- Use `kubectl logs` and centralized logging (EFK/Loki stack) for debugging + +--- + +## Challenges & Solutions + +### Challenge: choosing probe configuration values + +Initially unsure about `initialDelaySeconds` and `periodSeconds` values. Investigated by observing the app's startup time locally in Docker — it starts in under 3 seconds. Set liveness initial delay to 10s (generous buffer) and readiness to 5s. Used `kubectl describe pod` and events to verify probes were passing. + +### Challenge: understanding Service selector matching + +Needed to ensure the Service's `selector` labels exactly matched the pod template's `labels` in the Deployment. Used `kubectl get endpoints` to verify the Service was correctly discovering all pod IPs. + +### Challenge: resource limit sizing + +Started with conservative limits. Used `kubectl top pods` (after enabling metrics-server) to observe actual resource usage and confirmed the limits were appropriate for the lightweight application. 
+ +### What I learned + +- Kubernetes uses declarative configuration — you define the desired state and the control plane converges to it +- Health probes are essential for self-healing and traffic management +- Labels and selectors are the core mechanism for connecting resources (Deployments to Pods, Services to Pods) +- Rolling updates with `maxUnavailable: 0` ensure zero downtime by only removing old pods after new ones are ready diff --git a/k8s/ROLLOUTS.md b/k8s/ROLLOUTS.md new file mode 100644 index 0000000000..cb9ff2052b --- /dev/null +++ b/k8s/ROLLOUTS.md @@ -0,0 +1,318 @@ +# Argo Rollouts — Progressive Delivery + +## 1. Argo Rollouts Setup + +### Installation + +```bash +# Create namespace and install controller +kubectl create namespace argo-rollouts +kubectl apply -n argo-rollouts -f https://github.com/argoproj/argo-rollouts/releases/latest/download/install.yaml + +# Install kubectl plugin (macOS) +brew install argoproj/tap/kubectl-argo-rollouts + +# Verify +kubectl argo rollouts version +kubectl get pods -n argo-rollouts +``` + +**Expected output:** +``` +NAME READY STATUS RESTARTS AGE +argo-rollouts-xxxxxxx-xxxxx 1/1 Running 0 30s +``` + +### Dashboard + +```bash +# Install dashboard +kubectl apply -n argo-rollouts -f https://github.com/argoproj/argo-rollouts/releases/latest/download/dashboard-install.yaml + +# Access at http://localhost:3100 +kubectl port-forward svc/argo-rollouts-dashboard -n argo-rollouts 3100:3100 +``` + +### Rollout vs Deployment — Key Differences + +| Field | Deployment | Rollout | +|-------|-----------|---------| +| `apiVersion` | `apps/v1` | `argoproj.io/v1alpha1` | +| `kind` | `Deployment` | `Rollout` | +| `spec.strategy` | `RollingUpdate` / `Recreate` | `canary` / `blueGreen` | +| Traffic shifting | ❌ Not supported | ✅ Weighted percentage | +| Manual promotion | ❌ Not supported | ✅ `pause: {}` | +| Auto-rollback on metrics | ❌ Not supported | ✅ AnalysisTemplate | +| Preview environment | ❌ Not supported | ✅ `previewService` 
(blueGreen) | + +The pod template spec (`spec.template`) is identical between Deployment and Rollout. The Rollout CRD is a drop-in replacement with an extended `spec.strategy` field. + +--- + +## 2. Canary Deployment + +### Strategy Configuration + +The canary strategy in `values.yaml` defines progressive traffic steps: + +```yaml +rollout: + enabled: true + strategy: canary + canary: + steps: + - setWeight: 20 # Route 20% traffic to canary + - pause: {} # Wait for manual promotion + - setWeight: 40 # Automatic after promotion + - pause: + duration: "30s" # Auto-advance after 30s + - setWeight: 60 + - pause: + duration: "30s" + - setWeight: 80 + - pause: + duration: "30s" + # Implicit setWeight: 100 at end +``` + +The `pause: {}` (empty pause) halts the rollout indefinitely until a manual `promote` command. Timed pauses advance automatically. + +### Deploy and Manage + +```bash +# Install / upgrade with canary rollout +helm upgrade --install app-python ./k8s/app-python + +# Watch rollout in real time (ROLLOUT_NAME is the Rollout resource name) +kubectl argo rollouts get rollout ROLLOUT_NAME -w + +# Update the image to trigger a new rollout +helm upgrade app-python ./k8s/app-python --set image.tag=v2 + +# Promote past the manual pause (step 2 → step 3) +kubectl argo rollouts promote ROLLOUT_NAME + +# Abort and roll back to stable +kubectl argo rollouts abort ROLLOUT_NAME + +# Retry an aborted rollout +kubectl argo rollouts retry rollout ROLLOUT_NAME +``` + +### Rollout Progression (Dashboard) + +The Argo Rollouts dashboard at `http://localhost:3100` shows: + +1. **Progressing** — stable pods running, canary pods at 20% +2. **Paused** — waiting for manual promotion (yellow indicator) +3. **Progressing** — traffic advancing 40% → 60% → 80% automatically +4. 
**Healthy** — 100% traffic on new version, old pods terminating + +### Rollback Test + +```bash +# Trigger rollout with bad image +helm upgrade app-python ./k8s/app-python --set image.tag=broken + +# While paused at 20%, abort the rollout +kubectl argo rollouts abort app-python-app-python + +# Traffic shifts immediately back to 0% canary / 100% stable +kubectl argo rollouts get rollout app-python-app-python +# STATUS: Degraded (aborted) — stable revision still serving 100% +``` + +Aborting a canary does not step traffic back down gradually: all traffic returns to the stable revision immediately. + +--- + +## 3. Blue-Green Deployment + +### Strategy Configuration + +```yaml +rollout: + enabled: true + strategy: blueGreen + blueGreen: + autoPromotionEnabled: false # Require manual promotion + autoPromotionSeconds: null # Or set seconds for auto-promote +``` + +The `values-bluegreen.yaml` file contains these overrides. Apply with: + +```bash +helm upgrade --install app-python ./k8s/app-python -f k8s/app-python/values-bluegreen.yaml +``` + +### Services + +Blue-green requires two services: + +| Service | Purpose | Source | +|---------|---------|--------| +| `RELEASE_NAME-app-python` | **Active** — production traffic | `service.yaml` | +| `RELEASE_NAME-app-python-preview` | **Preview** — new version for testing | `preview-service.yaml` | + +The Rollout controller automatically updates the `selector` on each service when switching blue↔green. The active service selector points to the stable (blue) ReplicaSet; the preview service points to the new (green) ReplicaSet. + +### Blue-Green Flow + +```bash +# 1. Deploy initial version (blue becomes active) +helm upgrade --install app-python ./k8s/app-python -f k8s/app-python/values-bluegreen.yaml + +# 2. Trigger green deployment +helm upgrade app-python ./k8s/app-python \ + -f k8s/app-python/values-bluegreen.yaml \ + --set image.tag=v2 + +# 3. Watch rollout — green pods start, blue stays active +kubectl argo rollouts get rollout app-python-app-python -w + +# 4. 
Test the new version via preview service +kubectl port-forward svc/app-python-app-python-preview 8081:80 +curl http://localhost:8081/health + +# 5. Promote green to active (instant traffic switch) +kubectl argo rollouts promote app-python-app-python + +# 6. Active service now routes to green; blue pods remain briefly for rollback +``` + +### Instant Rollback + +```bash +# After promotion, roll back to previous revision +kubectl argo rollouts undo app-python-app-python + +# Active service selector switches back to blue immediately +# Zero traffic is lost — no gradual shifting needed +``` + +Blue-green rollback is effectively instantaneous (service selector update), compared to canary rollback which must drain traffic percentages. + +--- + +## 4. Strategy Comparison + +### When to Use Canary + +- **Gradual confidence building** — expose a small percentage first, monitor errors/latency +- **Long-running requests** — existing connections complete normally on old pods +- **Limited resources** — no need to double replica count +- **Metrics-driven promotion** — combine with AnalysisTemplate to auto-promote when SLOs are met + +### When to Use Blue-Green + +- **Instant rollback requirement** — production incident recovery in seconds, not minutes +- **Complete pre-production testing** — full environment available via preview service before any production traffic +- **Database schema changes** — both versions run simultaneously, giving time to verify compatibility +- **Stateless services** — works best when sessions aren't pinned to specific pods + +### Pros and Cons + +| | Canary | Blue-Green | +|--|--------|-----------| +| **Traffic switch** | Gradual (%, controllable) | Instant (all-or-nothing) | +| **Resources** | ~1x (shared) | ~2x during rollout | +| **Rollback speed** | Gradual (drain steps) | Instant (selector swap) | +| **Pre-prod testing** | ❌ Live users see canary | ✅ Preview service | +| **Complexity** | Medium (step config) | Low (two services) | +| **Best for** 
| Web APIs, gradual SLO validation | Microservices, critical paths | + +### Recommendation + +- Use **canary** for most web service deployments — lower resource cost and metrics-driven automation make it ideal for continuous delivery pipelines. +- Use **blue-green** when you need pre-production sign-off (e.g., QA team approval) or when instant rollback is a hard requirement (payment services, auth services). + +--- + +## 5. CLI Commands Reference + +### Status and Monitoring + +```bash +# Get rollout status +kubectl argo rollouts get rollout ROLLOUT_NAME + +# Watch status in real time +kubectl argo rollouts get rollout ROLLOUT_NAME -w + +# List all rollouts in namespace +kubectl argo rollouts list rollouts + +# View rollout history +kubectl argo rollouts history rollout ROLLOUT_NAME +``` + +### Promotion and Control + +```bash +# Promote to next step (or full promotion) +kubectl argo rollouts promote ROLLOUT_NAME + +# Promote fully, skipping all remaining pauses +kubectl argo rollouts promote ROLLOUT_NAME --full + +# Abort current rollout (returns traffic to stable) +kubectl argo rollouts abort ROLLOUT_NAME + +# Retry an aborted or degraded rollout +kubectl argo rollouts retry rollout ROLLOUT_NAME + +# Roll back to previous revision +kubectl argo rollouts undo ROLLOUT_NAME + +# Roll back to specific revision +kubectl argo rollouts undo ROLLOUT_NAME --to-revision=2 +``` + +### Analysis + +```bash +# List analysis runs for a rollout +kubectl get analysisruns -l rollout=ROLLOUT_NAME + +# Get analysis run details +kubectl argo rollouts get analysisrun ANALYSISRUN_NAME + +# Manually terminate an analysis run +kubectl argo rollouts terminate analysisrun ANALYSISRUN_NAME +``` + +### Dashboard + +```bash +# Port-forward to dashboard (http://localhost:3100) +kubectl port-forward svc/argo-rollouts-dashboard -n argo-rollouts 3100:3100 +``` + +--- + +## 6. 
Bonus — Automated Analysis + +The `analysis-template.yaml` defines an `AnalysisTemplate` that calls the `/health` endpoint to verify the canary pods are healthy before advancing: + +```yaml +rollout: + analysis: + enabled: true # Enable in values.yaml to activate +``` + +The analysis template (`success-rate`) runs 3 health checks at 10-second intervals during the canary phase. If more than 1 check returns a non-`"ok"` status, the rollout is automatically aborted and traffic returns to the stable revision. + +```bash +# Enable analysis in canary rollout +helm upgrade app-python ./k8s/app-python --set rollout.analysis.enabled=true + +# Deploy a new version — analysis runs automatically after the 20% step +helm upgrade app-python ./k8s/app-python \ + --set rollout.analysis.enabled=true \ + --set image.tag=v2 + +# Simulate failure: deploy a version that returns an error on /health +# Analysis detects failureLimit exceeded → rollout auto-aborts +kubectl argo rollouts get rollout app-python-app-python +# STATUS: Degraded — auto-rolled back due to failed analysis +``` diff --git a/k8s/SECRETS.md b/k8s/SECRETS.md new file mode 100644 index 0000000000..b3ca7b1d03 --- /dev/null +++ b/k8s/SECRETS.md @@ -0,0 +1,535 @@ +# Secrets Management Documentation + +## Overview + +This document covers the secret management implementation for Lab 11, including native Kubernetes Secrets, Helm-based secret integration, and HashiCorp Vault sidecar injection. 
+ +--- + +## Task 1 — Kubernetes Secrets Fundamentals + +### Creating a Secret via kubectl + +```bash +kubectl create secret generic app-credentials \ + --from-literal=username=admin \ + --from-literal=password=supersecret123 +``` + +Output: +``` +secret/app-credentials created +``` + +### Viewing the Secret (YAML format) + +```bash +kubectl get secret app-credentials -o yaml +``` + +Output: +```yaml +apiVersion: v1 +data: + password: c3VwZXJzZWNyZXQxMjM= + username: YWRtaW4= +kind: Secret +metadata: + creationTimestamp: "2025-01-01T00:00:00Z" + name: app-credentials + namespace: default + resourceVersion: "12345" + uid: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx +type: Opaque +``` + +### Decoding Base64 Values + +```bash +# Decode username +echo "YWRtaW4=" | base64 -d +# Output: admin + +# Decode password +echo "c3VwZXJzZWNyZXQxMjM=" | base64 -d +# Output: supersecret123 +``` + +### Base64 Encoding vs Encryption + +**Base64 encoding** is NOT encryption. It is a simple binary-to-text encoding scheme that is completely reversible without any key. Anyone who can read the Secret object from the Kubernetes API can immediately decode the values. + +**Encryption** requires a secret key. Without the key, the ciphertext is computationally infeasible to reverse. + +| Property | Base64 | Encryption (AES-256) | +|---------------------|-----------------|-------------------------| +| Reversible | Always | Only with the key | +| Purpose | Safe transport | Confidentiality | +| Security | None | Strong | +| K8s default | ✅ Yes | ❌ No (opt-in) | + +### Security Implications + +**Are Kubernetes Secrets encrypted at rest by default?** +No. By default, Secrets are stored in **plain text** in etcd (only base64-encoded in the API response). Anyone with direct etcd access can read all Secrets. + +**etcd Encryption at Rest** (`EncryptionConfiguration`) is an opt-in feature that encrypts Secret data before writing to etcd using providers like `aescbc`, `aesgcm`, or `secretbox`. 
It should be enabled in any production cluster. + +**Production Recommendations:** +- Enable etcd encryption at rest +- Use RBAC to strictly limit who can `get`/`list` Secrets +- Audit all access to Secrets via audit logs +- Prefer external secret managers (Vault, AWS Secrets Manager, GCP Secret Manager) +- Never commit real secret values to Git + +--- + +## Task 2 — Helm-Managed Secrets + +### Chart Structure + +After Lab 11 changes, both charts include a `secrets.yaml` template: + +``` +k8s/ +├── app-python/ +│ ├── templates/ +│ │ ├── _helpers.tpl # Now includes app-python.envVars named template +│ │ ├── deployment.yaml # Now consumes secret via envFrom + optional Vault annotations +│ │ ├── secrets.yaml # NEW — Secret resource template +│ │ └── service.yaml +│ └── values.yaml # Now includes secrets, vault, environment, logLevel sections +└── app-go/ + ├── templates/ + │ ├── _helpers.tpl # Now includes app-go.envVars named template + │ ├── deployment.yaml # Now consumes secret via envFrom + optional Vault annotations + │ ├── secrets.yaml # NEW — Secret resource template + │ └── service.yaml + └── values.yaml # Now includes secrets, vault, environment, logLevel sections +``` + +### Secret Template (`templates/secrets.yaml`) + +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "app-python.fullname" . }}-secret + labels: + {{- include "app-python.labels" . | nindent 4 }} +type: Opaque +stringData: + APP_SECRET_KEY: {{ .Values.secrets.secretKey | quote }} + APP_DATABASE_PASSWORD: {{ .Values.secrets.databasePassword | quote }} +``` + +**Why `stringData` instead of `data`?** +`stringData` accepts plain text values — Kubernetes automatically base64-encodes them at apply time. This avoids having to pre-encode values manually in `values.yaml`, keeping them human-readable. 
+ +### Secret Values in `values.yaml` + +```yaml +secrets: + secretKey: "change-me-secret-key" + databasePassword: "change-me-db-password" +``` + +> **Important:** These are placeholder values. Real secrets must be injected via `--set` at deploy time or through an external secret manager — never committed to Git. + +### Consuming Secrets in Deployment + +The deployment uses `envFrom` to bulk-import all Secret keys as environment variables: + +```yaml +containers: + - name: app-python + envFrom: + - secretRef: + name: myrelease-app-python-secret + env: + - name: APP_ENV + value: "development" + - name: LOG_LEVEL + value: "info" +``` + +### Verifying Secret Injection in Pod + +```bash +# Exec into the running pod +kubectl exec -it <pod-name> -- /bin/sh + +# List environment variables — secrets are present but you should avoid printing them in logs +printenv | grep APP_SECRET_KEY +# Output: APP_SECRET_KEY=change-me-secret-key + +printenv | grep APP_DATABASE_PASSWORD +# Output: APP_DATABASE_PASSWORD=change-me-db-password +``` + +### Secrets are NOT visible in `kubectl describe pod` + +```bash +kubectl describe pod <pod-name> +``` + +The `describe` output shows the secret reference but **not** the actual values: + +``` +Environment Variables from: + myrelease-app-python-secret Secret Optional: false +``` + +--- + +## Task 3 — Resource Management + +Resource requests and limits are defined in `values.yaml` and referenced in `deployment.yaml` via `toYaml`: + +### app-python Resource Configuration + +```yaml +resources: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "256Mi" + cpu: "200m" +``` + +### app-go Resource Configuration + +```yaml +resources: + requests: + memory: "64Mi" + cpu: "50m" + limits: + memory: "128Mi" + cpu: "100m" +``` + +### Requests vs Limits + +| Concept | Description | Effect when exceeded | +|-----------|-----------------------------------------------------------------------------|------------------------------------| +| `request` | Minimum resources
guaranteed to the container by the scheduler | Pod may not be scheduled if unmet | +| `limit` | Maximum resources the container is allowed to consume | CPU throttled; memory → OOMKilled | + +**CPU units:** `100m` = 0.1 CPU core (millicores). CPU limits throttle the process — it does not get killed. + +**Memory units:** `128Mi` = 128 mebibytes. Memory limits are enforced strictly — exceeding the limit triggers an OOMKill. + +### Choosing Appropriate Values + +1. **Start with profiling** — measure actual usage under realistic load with `kubectl top pods` +2. **Request ≈ typical usage** — the scheduler uses requests for placement decisions +3. **Limit = 2–4× request** — allows burst headroom without runaway consumption +4. **Go app** uses less memory than Python due to lower runtime overhead → smaller values + +--- + +## Task 4 — HashiCorp Vault Integration + +### Installing Vault via Helm + +```bash +# Add the HashiCorp Helm repository +helm repo add hashicorp https://helm.releases.hashicorp.com +helm repo update + +# Install Vault in dev mode (NOT for production) +helm install vault hashicorp/vault \ + --set "server.dev.enabled=true" \ + --set "injector.enabled=true" +``` + +### Verifying Vault Pods + +```bash +kubectl get pods -l app.kubernetes.io/name=vault +``` + +``` +NAME READY STATUS RESTARTS AGE +vault-0 1/1 Running 0 2m +vault-agent-injector-xxxxxxxxx-xxxxx 1/1 Running 0 2m +``` + +### Configuring Vault (KV Secrets Engine) + +```bash +# Exec into Vault pod +kubectl exec -it vault-0 -- /bin/sh + +# Enable KV v2 secrets engine +vault secrets enable -path=secret kv-v2 + +# Store secrets for app-python +vault kv put secret/app-python/config \ + secret_key="my-super-secret-key" \ + database_password="prod-db-password-123" + +# Store secrets for app-go +vault kv put secret/app-go/config \ + secret_key="go-app-secret-key" \ + database_password="go-db-password-456" + +# Verify +vault kv get secret/app-python/config +``` + +Output: +``` +====== Secret Path ====== 
+secret/data/app-python/config + +======= Metadata ======= +Key Value +--- ----- +created_time 2025-01-01T00:00:00.000000000Z +version 1 + +====== Data ====== +Key Value +--- ----- +database_password prod-db-password-123 +secret_key my-super-secret-key +``` + +### Configuring Kubernetes Authentication + +```bash +# Enable Kubernetes auth method +vault auth enable kubernetes + +# Configure it with the cluster's API server address +vault write auth/kubernetes/config \ + kubernetes_host="https://$KUBERNETES_PORT_443_TCP_ADDR:443" +``` + +### Creating Policy and Role + +```bash +# Create a policy that grants read access to app-python secrets +vault policy write app-python-policy - < -o jsonpath='{.spec.containers[*].name}' +# Output: app-python vault-agent + +# The injected secret file is available at /vault/secrets/config +kubectl exec -c app-python -- cat /vault/secrets/config +# Output: +# APP_SECRET_KEY=my-super-secret-key +# APP_DATABASE_PASSWORD=prod-db-password-123 +``` + +### Sidecar Injection Pattern Explained + +Vault Agent injection works via a **mutating admission webhook**: + +1. The `vault-agent-injector` pod registers a webhook with the Kubernetes API server +2. When a pod spec with `vault.hashicorp.com/agent-inject: "true"` is submitted, the webhook intercepts it +3. The injector mutates the pod spec, adding an **init container** (fetches secrets before app starts) and a **sidecar container** (keeps secrets fresh via lease renewal) +4. Both containers share a volume mounted at `/vault/secrets/` with the app container +5. 
Secrets are written as files — templates control the format + +``` +┌─────────────────────────────────────────────┐ +│ Pod │ +│ │ +│ ┌──────────────┐ /vault/secrets/ │ +│ │ vault-agent │ ──────────────────────┐ │ +│ │ (sidecar) │ │ │ +│ └──────────────┘ ▼ │ +│ │ ┌──────────┐ │ +│ │ Vault API │ Shared │ │ +│ ▼ │ Volume │ │ +│ ┌──────────────┐ └────┬─────┘ │ +│ │ HashiCorp │ │ │ +│ │ Vault Server │ ┌──────▼─────┐ │ +│ └──────────────┘ │ app-python │ │ +│ │ (reads file)│ │ +│ └────────────┘ │ +└─────────────────────────────────────────────┘ +``` + +--- + +## Bonus — Vault Agent Templates & Named Templates + +### Template Annotation for Custom Format + +The `vault.hashicorp.com/agent-inject-template-*` annotation controls how Vault Agent renders the secret file. Our charts use it to write secrets in `.env` key=value format: + +```yaml +vault.hashicorp.com/agent-inject-template-config: | + {{- with secret "secret/data/app-python/config" -}} + APP_SECRET_KEY={{ .Data.data.secret_key }} + APP_DATABASE_PASSWORD={{ .Data.data.database_password }} + {{- end -}} +``` + +This produces `/vault/secrets/config`: +``` +APP_SECRET_KEY=my-super-secret-key +APP_DATABASE_PASSWORD=prod-db-password-123 +``` + +### Dynamic Secret Rotation + +Vault Agent handles secret rotation automatically: + +- Vault leases have a **TTL** (set to `24h` in our role) +- The Vault Agent sidecar **renews the lease** before expiry +- When a lease cannot be renewed (e.g., the secret was rotated), the agent re-authenticates and re-fetches +- The `vault.hashicorp.com/agent-inject-command` annotation can trigger a signal or script when secrets change: + +```yaml +vault.hashicorp.com/agent-inject-command-config: "kill -HUP 1" +``` + +This sends `SIGHUP` to PID 1 (the app process), which can be used to trigger a graceful config reload. 
+ +### Named Templates in `_helpers.tpl` (DRY Principle) + +Both charts define a named template for common, non-sensitive environment variables: + +**`app-python/templates/_helpers.tpl`:** +``` +{{/* +Common environment variables (named template for DRY principle) +*/}} +{{- define "app-python.envVars" -}} +- name: APP_ENV + value: {{ .Values.environment | default "development" | quote }} +- name: LOG_LEVEL + value: {{ .Values.logLevel | default "info" | quote }} +{{- end }} +``` + +**Usage in `deployment.yaml`:** +```yaml +env: + {{- include "app-python.envVars" . | nindent 12 }} +``` + +**Benefits:** +- **DRY:** Environment variable definitions live in one place +- **Reusable:** Any template (Deployment, Job, CronJob) can `include` it +- **Consistent:** Changing a variable name updates all consumers at once +- **Testable:** `helm template` renders the named template output for inspection + +--- + +## Security Analysis — K8s Secrets vs Vault + +### Comparison + +| Feature | Kubernetes Secrets | HashiCorp Vault | +|--------------------------------|-----------------------------|-------------------------------------| +| Storage | etcd (base64, opt-in encrypt) | Encrypted storage backend | +| Encryption at rest | Opt-in (`EncryptionConfig`) | Always on | +| Access control | RBAC (coarse-grained) | Fine-grained policies per path | +| Secret rotation | Manual (re-apply manifest) | Dynamic secrets + auto-rotation | +| Audit logging | K8s audit log | Dedicated audit log per operation | +| Dynamic secrets | ❌ No | ✅ Yes (DB, PKI, AWS, etc.) 
| +| Leases / TTL | ❌ No | ✅ Yes | +| Multi-cluster / multi-cloud | ❌ Per-cluster | ✅ Central secret store | +| Operational complexity | Low | Medium-High | +| GitOps-friendly | ⚠️ Only with Sealed Secrets | ✅ Reference by path, not value | + +### When to Use Kubernetes Secrets + +- Simple applications with few secrets +- Development and staging environments +- When etcd encryption + RBAC is sufficient +- When team size and audit requirements are low +- When operational simplicity outweighs security sophistication + +### When to Use HashiCorp Vault + +- Production workloads with strict compliance requirements (PCI-DSS, SOC 2, HIPAA) +- Dynamic secrets needed (short-lived DB credentials, X.509 certificates) +- Multiple clusters or cloud providers sharing a common secret store +- Detailed audit trails required per secret access +- Secret rotation without application restarts +- Large teams where fine-grained access control matters + +### Production Recommendations + +1. **Never store real secrets in Git** — use `--set` flags, Sealed Secrets, or Vault +2. **Enable etcd encryption at rest** if using K8s Secrets in production +3. **Use Vault in production** for any sensitive workload; K8s Secrets for dev/CI +4. **Apply least-privilege RBAC** — no service account should have `list secrets` in production +5. **Rotate secrets regularly** — Vault automates this; K8s Secrets require manual rotation +6. **Use `stringData` in Helm charts** — avoids base64 in values files +7. 
**Never log secret values** — ensure application code does not print env vars to stdout \ No newline at end of file diff --git a/k8s/app-go/Chart.yaml b/k8s/app-go/Chart.yaml new file mode 100644 index 0000000000..9ef6f14d9b --- /dev/null +++ b/k8s/app-go/Chart.yaml @@ -0,0 +1,18 @@ +apiVersion: v2 +name: app-go +description: Helm chart for the Go application +type: application +version: 0.1.0 +appVersion: "1.0" +keywords: + - go + - web + - devops +maintainers: + - name: essence-666 +sources: + - https://github.com/essence-666/DevOps-Core-Course +dependencies: + - name: common-lib + version: 0.1.0 + repository: "file://../common-lib" diff --git a/k8s/app-go/files/config.json b/k8s/app-go/files/config.json new file mode 100644 index 0000000000..ba832fdaaa --- /dev/null +++ b/k8s/app-go/files/config.json @@ -0,0 +1,19 @@ +{ + "appName": "devops-info-service-go", + "version": "1.0.0", + "environment": "development", + "featureFlags": { + "enableDebugLogging": false, + "enableRateLimit": false + }, + "server": { + "host": "0.0.0.0", + "port": 8080, + "readTimeoutSeconds": 30, + "writeTimeoutSeconds": 30 + }, + "logging": { + "level": "info", + "format": "json" + } +} diff --git a/k8s/app-go/templates/NOTES.txt b/k8s/app-go/templates/NOTES.txt new file mode 100644 index 0000000000..906935b882 --- /dev/null +++ b/k8s/app-go/templates/NOTES.txt @@ -0,0 +1,14 @@ +Thank you for installing {{ .Chart.Name }}! + +Release: {{ .Release.Name }} +Namespace: {{ .Release.Namespace }} + +{{- if eq .Values.service.type "NodePort" }} +Access the application via NodePort: + export NODE_IP=$(kubectl get nodes -o jsonpath="{.items[0].status.addresses[0].address}") + echo http://$NODE_IP:{{ .Values.service.nodePort }} +{{- else if eq .Values.service.type "LoadBalancer" }} +Access the application via LoadBalancer: + export SERVICE_IP=$(kubectl get svc {{ include "app-go.fullname" . 
}} -o jsonpath='{.status.loadBalancer.ingress[0].ip}') + echo http://$SERVICE_IP:{{ .Values.service.port }} +{{- end }} diff --git a/k8s/app-go/templates/_helpers.tpl b/k8s/app-go/templates/_helpers.tpl new file mode 100644 index 0000000000..37093d5ad0 --- /dev/null +++ b/k8s/app-go/templates/_helpers.tpl @@ -0,0 +1,35 @@ +{{/* +Re-export common library templates for the app-go chart. +These wrap the common-lib definitions so they can be used +with chart-specific names if needed. +*/}} + +{{- define "app-go.name" -}} +{{- include "common.name" . }} +{{- end }} + +{{- define "app-go.fullname" -}} +{{- include "common.fullname" . }} +{{- end }} + +{{- define "app-go.chart" -}} +{{- include "common.chart" . }} +{{- end }} + +{{- define "app-go.labels" -}} +{{- include "common.labels" . }} +{{- end }} + +{{- define "app-go.selectorLabels" -}} +{{- include "common.selectorLabels" . }} +{{- end }} + +{{/* +Common environment variables (named template for DRY principle) +*/}} +{{- define "app-go.envVars" -}} +- name: APP_ENV + value: {{ .Values.environment | default "development" | quote }} +- name: LOG_LEVEL + value: {{ .Values.logLevel | default "info" | quote }} +{{- end }} diff --git a/k8s/app-go/templates/configmap.yaml b/k8s/app-go/templates/configmap.yaml new file mode 100644 index 0000000000..d404c515e8 --- /dev/null +++ b/k8s/app-go/templates/configmap.yaml @@ -0,0 +1,24 @@ +{{- /* +ConfigMap 1: File-based configuration (mounted as /config/config.json) +ConfigMap 2: Environment variable configuration (injected as env vars) +*/ -}} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "app-go.fullname" . }}-config + labels: + {{- include "app-go.labels" . | nindent 4 }} +data: + config.json: |- +{{ .Files.Get "files/config.json" | indent 4 }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "app-go.fullname" . }}-env + labels: + {{- include "app-go.labels" . 
| nindent 4 }} +data: + APP_ENV: {{ .Values.environment | quote }} + LOG_LEVEL: {{ .Values.logLevel | quote }} + VISITS_FILE: {{ .Values.persistence.visitsFile | quote }} diff --git a/k8s/app-go/templates/deployment.yaml b/k8s/app-go/templates/deployment.yaml new file mode 100644 index 0000000000..3d78c77844 --- /dev/null +++ b/k8s/app-go/templates/deployment.yaml @@ -0,0 +1,71 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "app-go.fullname" . }} + labels: + {{- include "app-go.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + {{- include "app-go.selectorLabels" . | nindent 6 }} + strategy: + type: {{ .Values.strategy.type }} + {{- if eq .Values.strategy.type "RollingUpdate" }} + rollingUpdate: + maxSurge: {{ .Values.strategy.rollingUpdate.maxSurge }} + maxUnavailable: {{ .Values.strategy.rollingUpdate.maxUnavailable }} + {{- end }} + template: + metadata: + labels: + {{- include "app-go.selectorLabels" . | nindent 8 }} + annotations: + checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }} + {{- if .Values.vault.enabled }} + vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/role: {{ .Values.vault.role | quote }} + vault.hashicorp.com/agent-inject-secret-config: {{ .Values.vault.secretPath | quote }} + vault.hashicorp.com/agent-inject-template-config: | + {{`{{`}}- with secret "{{ .Values.vault.secretPath }}" -{{`}}`}} + APP_SECRET_KEY={{`{{`}} .Data.data.secret_key {{`}}`}} + APP_DATABASE_PASSWORD={{`{{`}} .Data.data.database_password {{`}}`}} + {{`{{`}}- end -{{`}}`}} + {{- end }} + spec: + containers: + - name: {{ .Chart.Name }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + ports: + - containerPort: {{ .Values.service.targetPort }} + protocol: TCP + envFrom: + - secretRef: + name: {{ include "app-go.fullname" . 
}}-secret + - configMapRef: + name: {{ include "app-go.fullname" . }}-env + env: + {{- include "app-go.envVars" . | nindent 12 }} + volumeMounts: + - name: config-volume + mountPath: /config + {{- if .Values.persistence.enabled }} + - name: data-volume + mountPath: /data + {{- end }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + livenessProbe: + {{- toYaml .Values.livenessProbe | nindent 12 }} + readinessProbe: + {{- toYaml .Values.readinessProbe | nindent 12 }} + volumes: + - name: config-volume + configMap: + name: {{ include "app-go.fullname" . }}-config + {{- if .Values.persistence.enabled }} + - name: data-volume + persistentVolumeClaim: + claimName: {{ include "app-go.fullname" . }}-data + {{- end }} diff --git a/k8s/app-go/templates/pvc.yaml b/k8s/app-go/templates/pvc.yaml new file mode 100644 index 0000000000..75b6753fcc --- /dev/null +++ b/k8s/app-go/templates/pvc.yaml @@ -0,0 +1,17 @@ +{{- if .Values.persistence.enabled }} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ include "app-go.fullname" . }}-data + labels: + {{- include "app-go.labels" . | nindent 4 }} +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: {{ .Values.persistence.size }} + {{- if .Values.persistence.storageClass }} + storageClassName: {{ .Values.persistence.storageClass }} + {{- end }} +{{- end }} diff --git a/k8s/app-go/templates/secrets.yaml b/k8s/app-go/templates/secrets.yaml new file mode 100644 index 0000000000..4fbcb75dfc --- /dev/null +++ b/k8s/app-go/templates/secrets.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "app-go.fullname" . }}-secret + labels: + {{- include "app-go.labels" . 
| nindent 4 }} +type: Opaque +stringData: + APP_SECRET_KEY: {{ .Values.secrets.secretKey | quote }} + APP_DATABASE_PASSWORD: {{ .Values.secrets.databasePassword | quote }} diff --git a/k8s/app-go/templates/service.yaml b/k8s/app-go/templates/service.yaml new file mode 100644 index 0000000000..798ea8a783 --- /dev/null +++ b/k8s/app-go/templates/service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "app-go.fullname" . }} + labels: + {{- include "app-go.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + selector: + {{- include "app-go.selectorLabels" . | nindent 4 }} + ports: + - protocol: TCP + port: {{ .Values.service.port }} + targetPort: {{ .Values.service.targetPort }} + {{- if and (eq .Values.service.type "NodePort") .Values.service.nodePort }} + nodePort: {{ .Values.service.nodePort }} + {{- end }} diff --git a/k8s/app-go/values.yaml b/k8s/app-go/values.yaml new file mode 100644 index 0000000000..f946341c9c --- /dev/null +++ b/k8s/app-go/values.yaml @@ -0,0 +1,73 @@ +replicaCount: 3 + +image: + repository: essence666/app_go + tag: "latest" + pullPolicy: IfNotPresent + +service: + type: NodePort + port: 80 + targetPort: 8080 + nodePort: 30081 + +resources: + requests: + memory: "64Mi" + cpu: "50m" + limits: + memory: "128Mi" + cpu: "100m" + +strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + +livenessProbe: + httpGet: + path: /health + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + +readinessProbe: + httpGet: + path: /health + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 3 + timeoutSeconds: 3 + failureThreshold: 3 + +nameOverride: "" +fullnameOverride: "" + +secrets: + secretKey: "change-me-secret-key" + databasePassword: "change-me-db-password" + +vault: + enabled: false + role: "app-go" + secretPath: "secret/data/app-go/config" + templateFormat: "env" + +environment: "development" +logLevel: "info" + +configmap: 
+ appName: "devops-info-service-go" + environment: "development" + featureFlags: + enableDebugLogging: false + enableRateLimit: false + +persistence: + enabled: true + size: 100Mi + storageClass: "" + visitsFile: "/data/visits" diff --git a/k8s/app-python/Chart.yaml b/k8s/app-python/Chart.yaml new file mode 100644 index 0000000000..a16cc889a3 --- /dev/null +++ b/k8s/app-python/Chart.yaml @@ -0,0 +1,18 @@ +apiVersion: v2 +name: app-python +description: Helm chart for the DevOps Info Service Python application +type: application +version: 0.1.0 +appVersion: "1.0" +keywords: + - python + - web + - devops +maintainers: + - name: essence-666 +sources: + - https://github.com/essence-666/DevOps-Core-Course +dependencies: + - name: common-lib + version: 0.1.0 + repository: "file://../common-lib" diff --git a/k8s/app-python/files/config.json b/k8s/app-python/files/config.json new file mode 100644 index 0000000000..9a7547d7aa --- /dev/null +++ b/k8s/app-python/files/config.json @@ -0,0 +1,20 @@ +{ + "appName": "devops-info-service", + "version": "1.0.0", + "environment": "development", + "featureFlags": { + "enableMetrics": true, + "enableDebugLogging": false, + "enableRateLimit": false + }, + "server": { + "host": "0.0.0.0", + "port": 8000, + "readTimeoutSeconds": 30, + "writeTimeoutSeconds": 30 + }, + "logging": { + "level": "info", + "format": "json" + } +} diff --git a/k8s/app-python/templates/NOTES.txt b/k8s/app-python/templates/NOTES.txt new file mode 100644 index 0000000000..a7fb056fec --- /dev/null +++ b/k8s/app-python/templates/NOTES.txt @@ -0,0 +1,14 @@ +Thank you for installing {{ .Chart.Name }}! 
+ +Release: {{ .Release.Name }} +Namespace: {{ .Release.Namespace }} + +{{- if eq .Values.service.type "NodePort" }} +Access the application via NodePort: + export NODE_IP=$(kubectl get nodes -o jsonpath="{.items[0].status.addresses[0].address}") + echo http://$NODE_IP:{{ .Values.service.nodePort }} +{{- else if eq .Values.service.type "LoadBalancer" }} +Access the application via LoadBalancer: + export SERVICE_IP=$(kubectl get svc {{ include "app-python.fullname" . }} -o jsonpath='{.status.loadBalancer.ingress[0].ip}') + echo http://$SERVICE_IP:{{ .Values.service.port }} +{{- end }} diff --git a/k8s/app-python/templates/_helpers.tpl b/k8s/app-python/templates/_helpers.tpl new file mode 100644 index 0000000000..4d1f98ae5e --- /dev/null +++ b/k8s/app-python/templates/_helpers.tpl @@ -0,0 +1,53 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "app-python.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +*/}} +{{- define "app-python.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "app-python.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "app-python.labels" -}} +helm.sh/chart: {{ include "app-python.chart" . }} +{{ include "app-python.selectorLabels" . }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "app-python.selectorLabels" -}} +app.kubernetes.io/name: {{ include "app-python.name" . 
}} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Common environment variables (named template for DRY principle) +*/}} +{{- define "app-python.envVars" -}} +- name: APP_ENV + value: {{ .Values.environment | default "development" | quote }} +- name: LOG_LEVEL + value: {{ .Values.logLevel | default "info" | quote }} +{{- end }} diff --git a/k8s/app-python/templates/analysis-template.yaml b/k8s/app-python/templates/analysis-template.yaml new file mode 100644 index 0000000000..41cde78cc0 --- /dev/null +++ b/k8s/app-python/templates/analysis-template.yaml @@ -0,0 +1,19 @@ +{{- if and .Values.rollout.enabled .Values.rollout.analysis.enabled }} +apiVersion: argoproj.io/v1alpha1 +kind: AnalysisTemplate +metadata: + name: {{ include "app-python.fullname" . }}-success-rate + labels: + {{- include "app-python.labels" . | nindent 4 }} +spec: + metrics: + - name: health-check + provider: + web: + url: "http://{{ include "app-python.fullname" . }}.{{ .Release.Namespace }}.svc:{{ .Values.service.port }}/health" + jsonPath: "{$.status}" + successCondition: result == "ok" + interval: 10s + count: 3 + failureLimit: 1 +{{- end }} diff --git a/k8s/app-python/templates/configmap.yaml b/k8s/app-python/templates/configmap.yaml new file mode 100644 index 0000000000..824786fc70 --- /dev/null +++ b/k8s/app-python/templates/configmap.yaml @@ -0,0 +1,24 @@ +{{- /* +ConfigMap 1: File-based configuration (mounted as /config/config.json) +ConfigMap 2: Environment variable configuration (injected as env vars) +*/ -}} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "app-python.fullname" . }}-config + labels: + {{- include "app-python.labels" . | nindent 4 }} +data: + config.json: |- +{{ .Files.Get "files/config.json" | indent 4 }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "app-python.fullname" . }}-env + labels: + {{- include "app-python.labels" .
| nindent 4 }} +data: + APP_ENV: {{ .Values.environment | quote }} + LOG_LEVEL: {{ .Values.logLevel | quote }} + VISITS_FILE: {{ .Values.persistence.visitsFile | quote }} diff --git a/k8s/app-python/templates/deployment.yaml b/k8s/app-python/templates/deployment.yaml new file mode 100644 index 0000000000..60c7ba5f35 --- /dev/null +++ b/k8s/app-python/templates/deployment.yaml @@ -0,0 +1,73 @@ +{{- if not .Values.rollout.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "app-python.fullname" . }} + labels: + {{- include "app-python.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + {{- include "app-python.selectorLabels" . | nindent 6 }} + strategy: + type: {{ .Values.strategy.type }} + {{- if eq .Values.strategy.type "RollingUpdate" }} + rollingUpdate: + maxSurge: {{ .Values.strategy.rollingUpdate.maxSurge }} + maxUnavailable: {{ .Values.strategy.rollingUpdate.maxUnavailable }} + {{- end }} + template: + metadata: + labels: + {{- include "app-python.selectorLabels" . | nindent 8 }} + annotations: + checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . 
| sha256sum }} + {{- if .Values.vault.enabled }} + vault.hashicorp.com/agent-inject: "true" + vault.hashicorp.com/role: {{ .Values.vault.role | quote }} + vault.hashicorp.com/agent-inject-secret-config: {{ .Values.vault.secretPath | quote }} + vault.hashicorp.com/agent-inject-template-config: | + {{`{{`}}- with secret "{{ .Values.vault.secretPath }}" -{{`}}`}} + APP_SECRET_KEY={{`{{`}} .Data.data.secret_key {{`}}`}} + APP_DATABASE_PASSWORD={{`{{`}} .Data.data.database_password {{`}}`}} + {{`{{`}}- end -{{`}}`}} + {{- end }} + spec: + containers: + - name: {{ .Chart.Name }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + ports: + - containerPort: {{ .Values.service.targetPort }} + protocol: TCP + envFrom: + - secretRef: + name: {{ include "app-python.fullname" . }}-secret + - configMapRef: + name: {{ include "app-python.fullname" . }}-env + env: + {{- include "app-python.envVars" . | nindent 12 }} + volumeMounts: + - name: config-volume + mountPath: /config + {{- if .Values.persistence.enabled }} + - name: data-volume + mountPath: /data + {{- end }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + livenessProbe: + {{- toYaml .Values.livenessProbe | nindent 12 }} + readinessProbe: + {{- toYaml .Values.readinessProbe | nindent 12 }} + volumes: + - name: config-volume + configMap: + name: {{ include "app-python.fullname" . }}-config + {{- if .Values.persistence.enabled }} + - name: data-volume + persistentVolumeClaim: + claimName: {{ include "app-python.fullname" . }}-data + {{- end }} +{{- end }} diff --git a/k8s/app-python/templates/hooks/post-install-job.yaml b/k8s/app-python/templates/hooks/post-install-job.yaml new file mode 100644 index 0000000000..7b8543ccf0 --- /dev/null +++ b/k8s/app-python/templates/hooks/post-install-job.yaml @@ -0,0 +1,20 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: "{{ include "app-python.fullname" . 
}}-post-install" + labels: + {{- include "app-python.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": post-install + "helm.sh/hook-weight": "5" + "helm.sh/hook-delete-policy": hook-succeeded +spec: + template: + metadata: + name: "{{ include "app-python.fullname" . }}-post-install" + spec: + restartPolicy: Never + containers: + - name: post-install-job + image: busybox:1.36  # pinned: a bare "busybox" resolves to :latest and makes hook runs non-reproducible + command: ['sh', '-c', 'echo "Post-install smoke test started" && echo "Verifying deployment health..." && sleep 5 && echo "Smoke test passed. Deployment successful."'] diff --git a/k8s/app-python/templates/hooks/pre-install-job.yaml b/k8s/app-python/templates/hooks/pre-install-job.yaml new file mode 100644 index 0000000000..990e036814 --- /dev/null +++ b/k8s/app-python/templates/hooks/pre-install-job.yaml @@ -0,0 +1,20 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: "{{ include "app-python.fullname" . }}-pre-install" + labels: + {{- include "app-python.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": pre-install + "helm.sh/hook-weight": "-5" + "helm.sh/hook-delete-policy": hook-succeeded +spec: + template: + metadata: + name: "{{ include "app-python.fullname" . }}-pre-install" + spec: + restartPolicy: Never + containers: + - name: pre-install-job + image: busybox:1.36  # pinned: a bare "busybox" resolves to :latest and makes hook runs non-reproducible + command: ['sh', '-c', 'echo "Pre-install validation started" && echo "Checking environment readiness..." && sleep 5 && echo "Environment validated. Proceeding with installation."'] diff --git a/k8s/app-python/templates/preview-service.yaml b/k8s/app-python/templates/preview-service.yaml new file mode 100644 index 0000000000..7ecd2dac14 --- /dev/null +++ b/k8s/app-python/templates/preview-service.yaml @@ -0,0 +1,16 @@ +{{- if and .Values.rollout.enabled (eq .Values.rollout.strategy "blueGreen") }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "app-python.fullname" . }}-preview + labels: + {{- include "app-python.labels" . 
| nindent 4 }} +spec: + type: {{ .Values.service.type }} + selector: + {{- include "app-python.selectorLabels" . | nindent 4 }} + ports: + - protocol: TCP + port: {{ .Values.service.port }} + targetPort: {{ .Values.service.targetPort }} +{{- end }} diff --git a/k8s/app-python/templates/pvc.yaml b/k8s/app-python/templates/pvc.yaml new file mode 100644 index 0000000000..1d145caf47 --- /dev/null +++ b/k8s/app-python/templates/pvc.yaml @@ -0,0 +1,17 @@ +{{- if .Values.persistence.enabled }} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ include "app-python.fullname" . }}-data + labels: + {{- include "app-python.labels" . | nindent 4 }} +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: {{ .Values.persistence.size }} + {{- if .Values.persistence.storageClass }} + storageClassName: {{ .Values.persistence.storageClass }} + {{- end }} +{{- end }} diff --git a/k8s/app-python/templates/rollout.yaml b/k8s/app-python/templates/rollout.yaml new file mode 100644 index 0000000000..b44f1f5d5a --- /dev/null +++ b/k8s/app-python/templates/rollout.yaml @@ -0,0 +1,86 @@ +{{- if .Values.rollout.enabled }} +apiVersion: argoproj.io/v1alpha1 +kind: Rollout +metadata: + name: {{ include "app-python.fullname" . }} + labels: + {{- include "app-python.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + {{- include "app-python.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "app-python.selectorLabels" . | nindent 8 }} + annotations: + checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }} + spec: + containers: + - name: {{ .Chart.Name }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + ports: + - containerPort: {{ .Values.service.targetPort }} + protocol: TCP + envFrom: + - secretRef: + name: {{ include "app-python.fullname" . 
}}-secret + - configMapRef: + name: {{ include "app-python.fullname" . }}-env + env: + {{- include "app-python.envVars" . | nindent 12 }} + volumeMounts: + - name: config-volume + mountPath: /config + {{- if .Values.persistence.enabled }} + - name: data-volume + mountPath: /data + {{- end }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + livenessProbe: + {{- toYaml .Values.livenessProbe | nindent 12 }} + readinessProbe: + {{- toYaml .Values.readinessProbe | nindent 12 }} + volumes: + - name: config-volume + configMap: + name: {{ include "app-python.fullname" . }}-config + {{- if .Values.persistence.enabled }} + - name: data-volume + persistentVolumeClaim: + claimName: {{ include "app-python.fullname" . }}-data + {{- end }} + strategy: + {{- if eq .Values.rollout.strategy "canary" }} + canary: + steps: + {{- if .Values.rollout.analysis.enabled }} + - setWeight: 20 + - analysis: + templates: + - templateName: {{ include "app-python.fullname" . }}-success-rate + - setWeight: 40 + - pause: + duration: "30s" + - setWeight: 60 + - pause: + duration: "30s" + - setWeight: 80 + - pause: + duration: "30s" + {{- else }} + {{- toYaml .Values.rollout.canary.steps | nindent 8 }} + {{- end }} + {{- else if eq .Values.rollout.strategy "blueGreen" }} + blueGreen: + activeService: {{ include "app-python.fullname" . }} + previewService: {{ include "app-python.fullname" . }}-preview + autoPromotionEnabled: {{ .Values.rollout.blueGreen.autoPromotionEnabled }} + {{- if .Values.rollout.blueGreen.autoPromotionSeconds }} + autoPromotionSeconds: {{ .Values.rollout.blueGreen.autoPromotionSeconds }} + {{- end }} + {{- end }} +{{- end }} diff --git a/k8s/app-python/templates/secrets.yaml b/k8s/app-python/templates/secrets.yaml new file mode 100644 index 0000000000..54184a77fe --- /dev/null +++ b/k8s/app-python/templates/secrets.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "app-python.fullname" . 
}}-secret + labels: + {{- include "app-python.labels" . | nindent 4 }} +type: Opaque +stringData: + APP_SECRET_KEY: {{ .Values.secrets.secretKey | quote }} + APP_DATABASE_PASSWORD: {{ .Values.secrets.databasePassword | quote }} diff --git a/k8s/app-python/templates/service.yaml b/k8s/app-python/templates/service.yaml new file mode 100644 index 0000000000..10062f7aa5 --- /dev/null +++ b/k8s/app-python/templates/service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "app-python.fullname" . }} + labels: + {{- include "app-python.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + selector: + {{- include "app-python.selectorLabels" . | nindent 4 }} + ports: + - protocol: TCP + port: {{ .Values.service.port }} + targetPort: {{ .Values.service.targetPort }} + {{- if and (eq .Values.service.type "NodePort") .Values.service.nodePort }} + nodePort: {{ .Values.service.nodePort }} + {{- end }} diff --git a/k8s/app-python/values-bluegreen.yaml b/k8s/app-python/values-bluegreen.yaml new file mode 100644 index 0000000000..c077d792b8 --- /dev/null +++ b/k8s/app-python/values-bluegreen.yaml @@ -0,0 +1,8 @@ +rollout: + enabled: true + strategy: blueGreen + blueGreen: + autoPromotionEnabled: false + autoPromotionSeconds: null + analysis: + enabled: false diff --git a/k8s/app-python/values-dev.yaml b/k8s/app-python/values-dev.yaml new file mode 100644 index 0000000000..02e51d67b3 --- /dev/null +++ b/k8s/app-python/values-dev.yaml @@ -0,0 +1,34 @@ +replicaCount: 1 + +image: + tag: "latest" + +resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 50m + memory: 64Mi + +service: + type: NodePort + nodePort: 30080 + +livenessProbe: + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 3 + httpGet: + path: /health + port: 8000 + +readinessProbe: + initialDelaySeconds: 3 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + httpGet: + path: /health + port: 8000 diff --git 
a/k8s/app-python/values-prod.yaml b/k8s/app-python/values-prod.yaml new file mode 100644 index 0000000000..c1623fd1c5 --- /dev/null +++ b/k8s/app-python/values-prod.yaml @@ -0,0 +1,36 @@ +replicaCount: 5 + +image: + tag: "1.0.0" + pullPolicy: Always + +resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 200m + memory: 256Mi + +service: + type: LoadBalancer + port: 80 + targetPort: 8000 + +livenessProbe: + initialDelaySeconds: 30 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + httpGet: + path: /health + port: 8000 + +readinessProbe: + initialDelaySeconds: 10 + periodSeconds: 3 + timeoutSeconds: 3 + failureThreshold: 3 + httpGet: + path: /health + port: 8000 diff --git a/k8s/app-python/values.yaml b/k8s/app-python/values.yaml new file mode 100644 index 0000000000..23768af5b1 --- /dev/null +++ b/k8s/app-python/values.yaml @@ -0,0 +1,96 @@ +replicaCount: 3 + +image: + repository: essence666/app_python_lab_2 + tag: "latest" + pullPolicy: IfNotPresent + +service: + type: NodePort + port: 80 + targetPort: 8000 + nodePort: 30080 + +resources: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "256Mi" + cpu: "200m" + +strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + +livenessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + +readinessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 5 + periodSeconds: 3 + timeoutSeconds: 3 + failureThreshold: 3 + +nameOverride: "" +fullnameOverride: "" + +secrets: + secretKey: "change-me-secret-key" + databasePassword: "change-me-db-password" + +vault: + enabled: false + role: "app-python" + secretPath: "secret/data/app-python/config" + templateFormat: "env" + +environment: "development" +logLevel: "info" + +configmap: + appName: "devops-info-service" + environment: "development" + featureFlags: + enableMetrics: true + enableDebugLogging: false + 
enableRateLimit: false + +persistence: + enabled: true + size: 100Mi + storageClass: "" + visitsFile: "/data/visits" + +rollout: + enabled: true + strategy: canary # canary or blueGreen + canary: + steps: + - setWeight: 20 + - pause: {} + - setWeight: 40 + - pause: + duration: "30s" + - setWeight: 60 + - pause: + duration: "30s" + - setWeight: 80 + - pause: + duration: "30s" + blueGreen: + autoPromotionEnabled: false + autoPromotionSeconds: null + analysis: + enabled: false diff --git a/k8s/argocd/application-dev.yaml b/k8s/argocd/application-dev.yaml new file mode 100644 index 0000000000..2e67a4ea67 --- /dev/null +++ b/k8s/argocd/application-dev.yaml @@ -0,0 +1,28 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: python-app-dev + namespace: argocd + labels: + app: python-app + env: dev +spec: + project: default + source: + repoURL: https://github.com/essence-666/DevOps-Core-Course.git + targetRevision: master + path: k8s/app-python + helm: + valueFiles: + - values.yaml + - values-dev.yaml + destination: + server: https://kubernetes.default.svc + namespace: dev + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true + - ServerSideApply=true diff --git a/k8s/argocd/application-go.yaml b/k8s/argocd/application-go.yaml new file mode 100644 index 0000000000..afb9ab9bb0 --- /dev/null +++ b/k8s/argocd/application-go.yaml @@ -0,0 +1,25 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: go-app + namespace: argocd + labels: + app: go-app + environment: default +spec: + project: default + source: + repoURL: https://github.com/essence-666/DevOps-Core-Course.git + targetRevision: master + path: k8s/app-go + helm: + valueFiles: + - values.yaml + destination: + server: https://kubernetes.default.svc + namespace: default + syncPolicy: + syncOptions: + - CreateNamespace=true + - PrunePropagationPolicy=foreground + - PruneLast=true diff --git a/k8s/argocd/application-prod.yaml 
b/k8s/argocd/application-prod.yaml new file mode 100644 index 0000000000..d02439f5ca --- /dev/null +++ b/k8s/argocd/application-prod.yaml @@ -0,0 +1,24 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: python-app-prod + namespace: argocd +spec: + project: default + source: + repoURL: https://github.com/essence-666/DevOps-Core-Course.git + targetRevision: master + path: k8s/app-python + helm: + valueFiles: + - values.yaml + - values-prod.yaml + destination: + server: https://kubernetes.default.svc + namespace: prod + syncPolicy: + syncOptions: + - CreateNamespace=true + - PrunePropagationPolicy=foreground + - PruneLast=true + # No automated block — prod requires explicit manual sync approval diff --git a/k8s/argocd/application.yaml b/k8s/argocd/application.yaml new file mode 100644 index 0000000000..91008ae813 --- /dev/null +++ b/k8s/argocd/application.yaml @@ -0,0 +1,30 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: python-app + namespace: argocd + labels: + app.kubernetes.io/name: python-app + app.kubernetes.io/part-of: devops-core-course +spec: + project: default + + source: + repoURL: https://github.com/essence-666/DevOps-Core-Course.git + targetRevision: master + path: k8s/app-python + helm: + valueFiles: + - values.yaml + + destination: + server: https://kubernetes.default.svc + namespace: default + + # Manual sync — operator must trigger sync explicitly via UI or CLI. + # No `automated` block means ArgoCD will detect drift but not apply it automatically. 
+ syncPolicy: + syncOptions: + - CreateNamespace=true + - PrunePropagationPolicy=foreground + - PruneLast=true diff --git a/k8s/argocd/applicationset.yaml b/k8s/argocd/applicationset.yaml new file mode 100644 index 0000000000..cb2fe11355 --- /dev/null +++ b/k8s/argocd/applicationset.yaml @@ -0,0 +1,47 @@ +apiVersion: argoproj.io/v1alpha1 +kind: ApplicationSet +metadata: + name: python-app-set + namespace: argocd +spec: + goTemplate: true + goTemplateOptions: ["missingkey=error"] + generators: + - list: + elements: + - env: dev + namespace: dev + valuesFile: values-dev.yaml + autoSync: "true" + - env: prod + namespace: prod + valuesFile: values-prod.yaml + autoSync: "false" + template: + metadata: + name: 'python-app-{{.env}}' + spec: + project: default + source: + repoURL: https://github.com/essence-666/DevOps-Core-Course.git + targetRevision: master + path: k8s/app-python + helm: + valueFiles: + - '{{.valuesFile}}' + destination: + server: https://kubernetes.default.svc + namespace: '{{.namespace}}' + syncPolicy: + syncOptions: + - CreateNamespace=true + templatePatch: | + {{- if eq .autoSync "true" -}} + spec: + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true + {{- end -}} diff --git a/k8s/common-lib/Chart.yaml b/k8s/common-lib/Chart.yaml new file mode 100644 index 0000000000..79d46cf53b --- /dev/null +++ b/k8s/common-lib/Chart.yaml @@ -0,0 +1,5 @@ +apiVersion: v2 +name: common-lib +description: Common templates for all applications +type: library +version: 0.1.0 diff --git a/k8s/common-lib/templates/_labels.tpl b/k8s/common-lib/templates/_labels.tpl new file mode 100644 index 0000000000..0377c1c008 --- /dev/null +++ b/k8s/common-lib/templates/_labels.tpl @@ -0,0 +1,43 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "common.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. 
+*/}} +{{- define "common.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "common.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "common.labels" -}} +helm.sh/chart: {{ include "common.chart" . }} +{{ include "common.selectorLabels" . }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "common.selectorLabels" -}} +app.kubernetes.io/name: {{ include "common.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} diff --git a/k8s/deployment.yml b/k8s/deployment.yml new file mode 100644 index 0000000000..b3bda9304b --- /dev/null +++ b/k8s/deployment.yml @@ -0,0 +1,52 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: devops-info-service + labels: + app: devops-info-service + environment: production +spec: + replicas: 3 + selector: + matchLabels: + app: devops-info-service + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + template: + metadata: + labels: + app: devops-info-service + environment: production + spec: + containers: + - name: devops-info-service + image: essence666/app_python_lab_2:latest + ports: + - containerPort: 8000 + protocol: TCP + resources: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "256Mi" + cpu: "200m" + livenessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 5 + 
periodSeconds: 3 + timeoutSeconds: 3 + failureThreshold: 3 diff --git a/k8s/service.yml b/k8s/service.yml new file mode 100644 index 0000000000..9d814d787f --- /dev/null +++ b/k8s/service.yml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: devops-info-service + labels: + app: devops-info-service +spec: + type: NodePort + selector: + app: devops-info-service + ports: + - protocol: TCP + port: 80 + targetPort: 8000 + nodePort: 30080 diff --git a/labs/lab05.md b/labs/lab05.md index a76d4960aa..281d5c62d0 100644 --- a/labs/lab05.md +++ b/labs/lab05.md @@ -94,7 +94,7 @@ ansible/ │ │ │ └── main.yml │ │ └── defaults/ │ │ └── main.yml -│ └── app_deploy/ # Application deployment +│ └── web_app/ # Application deployment │ ├── tasks/ │ │ └── main.yml │ ├── handlers/ @@ -523,7 +523,7 @@ vault_password_file = .vault_pass #### 3.2 Create Application Deployment Role -Create `roles/app_deploy/tasks/main.yml`: +Create `roles/web_app/tasks/main.yml`: **Required Tasks:** 1. Log in to Docker Hub (using vaulted credentials) @@ -538,10 +538,10 @@ Create `roles/app_deploy/tasks/main.yml`: 6. Wait for application to be ready (port check) 7. Verify health endpoint -**Create `roles/app_deploy/handlers/main.yml`:** +**Create `roles/web_app/handlers/main.yml`:** - Handler to restart application container -**Create `roles/app_deploy/defaults/main.yml`:** +**Create `roles/web_app/defaults/main.yml`:** - Default port - Default restart policy - Default environment variables @@ -611,7 +611,7 @@ Create `playbooks/deploy.yml`: become: yes roles: - - app_deploy + - web_app ``` #### 3.4 Run Deployment @@ -652,7 +652,7 @@ Create `ansible/docs/LAB05.md` with these sections: #### 2. Roles Documentation -For each role (common, docker, app_deploy): +For each role (common, docker, web_app): - **Purpose**: What does this role do? - **Variables**: Key variables and defaults - **Handlers**: What handlers are defined? @@ -856,7 +856,7 @@ Ansible has official plugins for major clouds. 
**Setup & Structure (2 pts):** - [ ] Proper role-based directory structure created -- [ ] All three roles created (common, docker, app_deploy) +- [ ] All three roles created (common, docker, web_app) - [ ] Each role has appropriate tasks, handlers, and defaults - [ ] Ansible.cfg configured correctly - [ ] Inventory configured and connectivity tested diff --git a/monitoring/.gitignore b/monitoring/.gitignore new file mode 100644 index 0000000000..445e931cda --- /dev/null +++ b/monitoring/.gitignore @@ -0,0 +1,3 @@ +.env +*.log +tmp/ diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml new file mode 100644 index 0000000000..c34e6d7b7c --- /dev/null +++ b/monitoring/docker-compose.yml @@ -0,0 +1,168 @@ +version: '3.8' + +networks: + logging: + driver: bridge + +volumes: + loki-data: + grafana-data: + prometheus-data: + +services: + loki: + image: grafana/loki:3.0.0 + ports: + - "3100:3100" + volumes: + - ./loki/config.yml:/etc/loki/config.yml + - loki-data:/loki + command: -config.file=/etc/loki/config.yml + networks: + - logging + deploy: + resources: + limits: + cpus: '1.0' + memory: 1G + reservations: + cpus: '0.25' + memory: 256M + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s + + promtail: + image: grafana/promtail:3.0.0 + ports: + - "9080:9080" + volumes: + - ./promtail/config.yml:/etc/promtail/config.yml + - /var/lib/docker/containers:/var/lib/docker/containers:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + command: -config.file=/etc/promtail/config.yml + networks: + - logging + depends_on: + - loki + deploy: + resources: + limits: + cpus: '0.5' + memory: 256M + reservations: + cpus: '0.25' + memory: 128M + + prometheus: + image: prom/prometheus:v3.9.0 + ports: + - "9090:9090" + volumes: + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml + - prometheus-data:/prometheus + command: + - 
'--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.retention.time=15d' + - '--storage.tsdb.retention.size=10GB' + networks: + - logging + deploy: + resources: + limits: + cpus: '1.0' + memory: 1G + reservations: + cpus: '0.25' + memory: 256M + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:9090/-/healthy || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 15s + + grafana: + image: grafana/grafana:12.3.1 + ports: + - "3000:3000" + volumes: + - grafana-data:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning + - ./grafana/dashboards:/var/lib/grafana/dashboards + networks: + - logging + env_file: + - .env + depends_on: + - prometheus + - loki + deploy: + resources: + limits: + cpus: '0.5' + memory: 512M + reservations: + cpus: '0.25' + memory: 256M + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:3000/api/health || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 30s + + app-python: + build: + context: ../app_python + dockerfile: Dockerfile + ports: + - "8000:8000" + networks: + - logging + labels: + logging: "promtail" + app: "devops-python" + deploy: + resources: + limits: + cpus: '0.5' + memory: 256M + reservations: + cpus: '0.25' + memory: 128M + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 30s + + app-go: + build: + context: ../app_go + dockerfile: Dockerfile + ports: + - "8001:8001" + networks: + - logging + labels: + logging: "promtail" + app: "devops-go" + deploy: + resources: + limits: + cpus: '0.5' + memory: 256M + reservations: + cpus: '0.25' + memory: 128M + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:8001/health || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 30s diff --git a/monitoring/docs/LAB07.md b/monitoring/docs/LAB07.md new file mode 100644 index 0000000000..00bf8f2e41 --- /dev/null 
+++ b/monitoring/docs/LAB07.md @@ -0,0 +1,618 @@ +# Lab 7 — Observability & Logging with Loki Stack + + +## Table of Contents + +1. [Architecture](#1-architecture) +2. [Setup Guide](#2-setup-guide) +3. [Configuration](#3-configuration) +4. [Application Logging](#4-application-logging) +5. [Dashboard](#5-dashboard) +6. [Production Config](#6-production-config) +7. [Testing](#7-testing) +8. [Challenges](#8-challenges) + +--- + +## 1. Architecture + +### 1.1 Component Overview + +The logging stack consists of three main components: + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Docker Containers │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │ App Python │ │ App Go │ │ Other Containers │ │ +│ │ (port 8000)│ │ (port 8001)│ │ │ │ +│ └──────┬──────┘ └──────┬──────┘ └────────────┬────────────┘ │ +│ │ │ │ │ +│ └────────────────┼───────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────┐│ +│ │ Promtail ││ +│ │ (Log Collector, port 9080) ││ +│ │ - Discovers containers via Docker socket ││ +│ │ - Extracts labels (container name, app) ││ +│ │ - Forwards logs to Loki ││ +│ └─────────────────────────────────────────────────────────────┘│ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────┐│ +│ │ Loki ││ +│ │ (Log Storage, port 3100) ││ +│ │ - TSDB storage backend ││ +│ │ - 7 days retention ││ +│ │ - Schema v13 ││ +│ └─────────────────────────────────────────────────────────────┘│ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────┐│ +│ │ Grafana ││ +│ │ (Visualization, port 3000) ││ +│ │ - Loki data source ││ +│ │ - Log dashboards ││ +│ │ - LogQL queries ││ +│ └─────────────────────────────────────────────────────────────┘│ +└─────────────────────────────────────────────────────────────────┘ +``` + +### 1.2 Data Flow + +1. **Applications** write JSON-formatted logs to stdout/stderr +2. 
**Docker** captures container logs in `/var/lib/docker/containers` +3. **Promtail** discovers containers via Docker socket, reads logs, adds labels +4. **Loki** receives and stores logs with indexes for fast querying +5. **Grafana** queries Loki using LogQL and displays results + + +## 2. Setup Guide + +### 2.1 Project Structure + +``` +monitoring/ +├── docker-compose.yml # Main orchestration file +├── .env # Grafana secrets (NOT in git) +├── .gitignore # Excludes .env +├── loki/ +│ └── config.yml # Loki configuration +├── promtail/ +│ └── config.yml # Promtail configuration +└── docs/ + └── LAB07.md # This documentation +``` + +### 2.2 Deployment Steps + +**Step 1: Clone and navigate to monitoring directory** +```bash +cd DevOps-Core-Course/monitoring +``` + +**Step 2: Verify configuration files** +```bash +ls -la +# Should show: docker-compose.yml, .env, loki/, promtail/ +``` + +**Step 3: Deploy the stack** +```bash +docker compose up -d --build +``` + +**Step 4: Verify services** +```bash +docker compose ps +``` + + +![Docker Compose PS](./screenshots/docker-compose-ps.png) + +**Step 5: Verify service health** +```bash +# Check Loki readiness +curl http://localhost:3100/ready + +# Check Promtail targets +curl http://localhost:9080/targets + +# Check Grafana health +curl http://localhost:3000/api/health +``` + +--- + +## 3. 
Configuration + +### 3.1 Docker Compose (`docker-compose.yml`) + +Key configuration decisions: + +| Service | Image | Port | Purpose | +|---------|-------|------|---------| +| loki | grafana/loki:3.0.0 | 3100 | Log storage with TSDB | +| promtail | grafana/promtail:3.0.0 | 9080 | Log collection | +| grafana | grafana/grafana:12.3.1 | 3000 | Visualization | +| app-python | custom build | 8000 | Python FastAPI app | +| app-go | custom build | 8001 | Go HTTP app | + +**Network configuration:** +- All services share `logging` network (bridge driver) +- Enables inter-service communication (promtail→loki, grafana→loki) + +**Volumes:** +- `loki-data`: Persistent Loki storage +- `grafana-data`: Persistent Grafana dashboards and datasources + +### 3.2 Loki Configuration (`loki/config.yml`) + +```yaml +auth_enabled: false # Single-tenant mode + +server: + http_listen_port: 3100 + +common: + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +schema_config: + configs: + - from: 2024-01-01 + store: tsdb # TSDB for 10x faster queries + object_store: filesystem + schema: v13 # Latest schema for Loki 3.0+ + index: + prefix: index_ + period: 24h + +limits_config: + retention_period: 168h # 7 days retention + +compactor: + working_directory: /loki/compactor + compaction_interval: 10m +``` + +**Key decisions:** +- **TSDB storage**: 10x faster queries than boltdb-shipper +- **Schema v13**: Required for Loki 3.0+ features +- **168h retention**: 7 days balance between storage and debugging needs +- **Compactor**: Required for retention to work properly + +### 3.3 Promtail Configuration (`promtail/config.yml`) + +```yaml +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: 
unix:///var/run/docker.sock + refresh_interval: 5s + relabel_configs: + - source_labels: ['__meta_docker_container_name'] + regex: '/(.*)' + target_label: 'container' + - source_labels: ['__meta_docker_container_log_stream'] + target_label: 'stream' + - source_labels: ['__meta_docker_container_label_logging'] + regex: 'promtail' + action: keep + - source_labels: ['__meta_docker_container_label_app'] + target_label: 'app' +``` + +**Key features:** +- **Docker service discovery**: Automatically discovers new containers +- **Label extraction**: Container name, stream, custom app label +- **Filtering**: Only scrapes containers with `logging=promtail` label + + +![Promtail Targets](./screenshots/promtail-targets.png) + +--- + +## 4. Application Logging + +### 4.1 Python App JSON Logging + +**File:** `app_python/core/logging.py` + +```python +import logging +import json +from pythonjsonlogger import jsonlogger + +class CustomJsonFormatter(jsonlogger.JsonFormatter): + def add_fields(self, log_record, record, message_dict): + super(CustomJsonFormatter, self).add_fields(log_record, record, message_dict) + if not log_record.get('timestamp'): + log_record['timestamp'] = record.created + if log_record.get('level'): + log_record['level'] = log_record['level'].upper() + else: + log_record['level'] = record.levelname.upper() + +def setup_logging(): + logger = logging.getLogger("devops-info-service") + logger.setLevel(logging.DEBUG if DEBUG else logging.INFO) + + console_handler = logging.StreamHandler() + formatter = CustomJsonFormatter( + '%(timestamp)s %(level)s %(name)s %(message)s' + ) + console_handler.setFormatter(formatter) + logger.addHandler(console_handler) + logger.propagate = False + return logger +``` + +**Logged events:** +- Application startup +- HTTP requests (method, path, client_ip) +- Response status and processing time +- Errors and exceptions + +### 4.2 Go App Logging + +The Go application logs in JSON format natively through its response structure. 
+ +### 4.3 Error Endpoints + +Both applications have `/error` endpoints for testing error logging: + +| Application | Endpoint | Response | +|-------------|----------|----------| +| Python | `GET /error` | HTTP 500 with HTTPException | +| Go | `GET /error` | HTTP 500 with JSON error response | + + +![JSON Logs Example](./screenshots/json-logs-example.png) + +--- + +## 5. Dashboard + +### 5.1 LogQL Query Reference + +| Query | Description | +|-------|-------------| +| `{app=~"devops-.*"}` | All logs from both apps | +| `{app="devops-python"} \| json` | Parse Python app JSON logs | +| `{app=~"devops-.*"} \| json \| level="ERROR"` | Only errors | +| `sum by (app) (rate({app=~"devops-.*"} [1m]))` | Request rate per app | +| `sum by (level) (count_over_time({app=~"devops-.*"} \| json [5m]))` | Log level distribution | + +### 5.2 Dashboard Panels + +#### Panel 1: Logs Table +- **Type:** Logs visualization +- **Query:** `{app=~"devops-.*"}` +- **Purpose:** View recent logs from all applications + + +![Logs Table Panel](./screenshots/panel-1-logs.png) + +#### Panel 2: Request Rate +- **Type:** Time series graph +- **Query:** `sum by (app) (rate({app=~"devops-.*"} [1m]))` +- **Purpose:** Monitor request volume per application + + +![Request Rate Panel](./screenshots/panel-2-rate.png) + +#### Panel 3: Error Logs +- **Type:** Logs visualization +- **Query:** `{app=~"devops-.*"} | json | level="ERROR"` +- **Purpose:** Quick access to error logs only + + +![Error Logs Panel](./screenshots/panel-3-errors.png) + +#### Panel 4: Log Level Distribution +- **Type:** Pie chart / Stat +- **Query:** `sum by (level) (count_over_time({app=~"devops-.*"} | json [5m]))` +- **Purpose:** Understand log severity distribution + + +![Log Level Distribution Panel](./screenshots/panel-4-distribution.png) + +### 5.3 Full Dashboard + + +![Full Dashboard](./screenshots/full-dashboard.png) + +--- + +## 6. 
Production Config + +### 6.1 Resource Limits + +All services have resource constraints: + +| Service | CPU Limit | Memory Limit | CPU Reservation | Memory Reservation | +|---------|-----------|--------------|-----------------|-------------------| +| loki | 0.5 | 512M | 0.25 | 256M | +| promtail | 0.5 | 256M | 0.25 | 128M | +| grafana | 1.0 | 1G | 0.5 | 512M | +| app-python | 0.5 | 512M | 0.25 | 256M | +| app-go | 0.5 | 512M | 0.25 | 256M | + +### 6.2 Security Measures + +**Grafana authentication:** +- Anonymous access: **DISABLED** +- Admin password: Stored in `.env` file (not in git) +- Embedding: **DISABLED** + +**.env file contents (example — the real password is never committed):** +``` +GF_AUTH_ANONYMOUS_ENABLED=false +GF_SECURITY_ADMIN_USER=admin +GF_SECURITY_ADMIN_PASSWORD=<your-strong-password> +GF_SECURITY_ALLOW_EMBEDDING=false +``` + + +![Grafana Login](./screenshots/grafana-secure.png) + +### 6.3 Health Checks + +| Service | Health Check Endpoint | Interval | Timeout | +|---------|----------------------|----------|---------| +| loki | `http://localhost:3100/ready` | 10s | 5s | +| grafana | `http://localhost:3000/api/health` | 10s | 5s | +| app-python | `http://localhost:5000/health` | 10s | 5s | +| app-go | `http://localhost:5000/health` | 10s | 5s | + + +![Health Checks](./screenshots/docker-compose-ps.png) + +--- + +## 7.
Testing + +### 7.1 Verification Commands + +```bash +# Check all services running +docker compose ps + +# Test Loki +curl http://localhost:3100/ready + +# Test Promtail +curl http://localhost:9080/targets + +# Test Grafana +curl http://localhost:3000/api/health +``` + +### 7.2 Generate Test Logs + +```bash +# Generate normal traffic +for i in {1..20}; do curl http://localhost:8000/; done +for i in {1..20}; do curl http://localhost:8000/health; done + +# Generate errors +for i in {1..5}; do curl http://localhost:8000/error; done +for i in {1..5}; do curl http://localhost:8001/error; done +``` + +### 7.3 LogQL Queries to Test + +```logql +# All Python app logs +{app="devops-python"} + +# All Go app logs +{app="devops-go"} + +# Errors only +{app=~"devops-.*"} |= "ERROR" + +# JSON parsed logs with method filter +{app=~"devops-.*"} | json | method="GET" + +# Request rate +sum by (app) (rate({app=~"devops-.*"}[1m])) +``` + +--- + +## 8. Challenges + +### 8.1 Issues Encountered + +1. **Error endpoint missing** + - **Problem:** Need endpoints that return 500 for error logging tests + - **Solution:** Added `/error` endpoints to both Python and Go applications + +### 8.2 Lessons Learned + +- TSDB storage in Loki 3.0 provides significant performance improvements +- JSON structured logging enables powerful LogQL queries +- Docker service discovery in Promtail simplifies container log collection +- Resource limits prevent monitoring stack from consuming all resources + +--- + +## Checklist Completion + +- [x] Loki, Promtail, Grafana running via Docker Compose +- [x] Loki data source configured in Grafana +- [x] Python app logging in JSON format +- [x] Go app integrated with logging labels +- [x] Logs visible in Grafana from all containers +- [x] Dashboard with 4+ panels created +- [x] LogQL queries working for different scenarios +- [x] Resource limits on all services +- [x] Health checks added +- [x] Grafana secured (no anonymous access) +- [x] Complete documentation with 
screenshots +- [x] All configuration files in repo + +--- + +## Appendix: Configuration Files + +### A.1 Full docker-compose.yml + +```yaml +version: '3.8' + +networks: + logging: + driver: bridge + +volumes: + loki-data: + grafana-data: + +services: + loki: + image: grafana/loki:3.0.0 + ports: + - "3100:3100" + volumes: + - ./loki/config.yml:/etc/loki/config.yml + - loki-data:/loki + command: -config.file=/etc/loki/config.yml + networks: + - logging + deploy: + resources: + limits: + cpus: '0.5' + memory: 512M + reservations: + cpus: '0.25' + memory: 256M + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s + + promtail: + image: grafana/promtail:3.0.0 + ports: + - "9080:9080" + volumes: + - ./promtail/config.yml:/etc/promtail/config.yml + - /var/lib/docker/containers:/var/lib/docker/containers:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + command: -config.file=/etc/promtail/config.yml + networks: + - logging + depends_on: + - loki + deploy: + resources: + limits: + cpus: '0.5' + memory: 256M + reservations: + cpus: '0.25' + memory: 128M + + grafana: + image: grafana/grafana:12.3.1 + ports: + - "3000:3000" + volumes: + - grafana-data:/var/lib/grafana + networks: + - logging + env_file: + - .env + deploy: + resources: + limits: + cpus: '1.0' + memory: 1G + reservations: + cpus: '0.5' + memory: 512M + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:3000/api/health || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 30s + + app-python: + build: + context: ../app_python + dockerfile: Dockerfile + ports: + - "8000:5000" + networks: + - logging + labels: + logging: "promtail" + app: "devops-python" + deploy: + resources: + limits: + cpus: '0.5' + memory: 512M + reservations: + cpus: '0.25' + memory: 256M + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:5000/health || exit 1"] + interval: 10s + 
timeout: 5s + retries: 5 + start_period: 30s + + app-go: + build: + context: ../app_go + dockerfile: Dockerfile + ports: + - "8001:5000" + networks: + - logging + labels: + logging: "promtail" + app: "devops-go" + deploy: + resources: + limits: + cpus: '0.5' + memory: 512M + reservations: + cpus: '0.25' + memory: 256M + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:5000/health || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 30s +``` + +--- diff --git a/monitoring/docs/LAB08.md b/monitoring/docs/LAB08.md new file mode 100644 index 0000000000..cd39008fb4 --- /dev/null +++ b/monitoring/docs/LAB08.md @@ -0,0 +1,209 @@ +# Lab 08 — Metrics & Monitoring with Prometheus + +## 1. Architecture + +``` +┌─────────────────────────────────────────────────────┐ +│ Docker Network: logging │ +│ │ +│ ┌──────────────┐ scrape /metrics ┌───────────┐ │ +│ │ app-python │ ──────────────────► │Prometheus │ │ +│ │ :8000 │ │ :9090 │ │ +│ └──────────────┘ └─────┬─────┘ │ +│ │ query │ +│ ┌──────────────┐ scrape /metrics ▼ │ +│ │ Loki │ ──────────────────► ┌───────────┐ │ +│ │ :3100 │ │ Grafana │ │ +│ └──────────────┘ │ :3000 │ │ +│ └───────────┘ │ +│ ┌──────────────┐ scrape /metrics ▲ │ +│ │ Prometheus │ ──────────────────┘ │ │ +│ │ (self) │ logs │ │ +│ └──────────────┘ ┌──────────┐ ──────────┘ │ +│ │Promtail │ │ +│ │ :9080 │ │ +│ └──────────┘ │ +└─────────────────────────────────────────────────────┘ +``` + +Metric flow: **App → Prometheus (pull/scrape) → Grafana (query/visualise)** + +--- + +## 2. 
Application Instrumentation + +### `/metrics` endpoint output + +![metrics endpoint](screenshots/metrics.png) + +### Metrics defined in `app_python/core/metrics.py` + +| Metric | Type | Labels | Purpose (RED) | +|--------|------|--------|---------------| +| `http_requests_total` | Counter | method, endpoint, status_code | **Rate** & **Errors** | +| `http_request_duration_seconds` | Histogram | method, endpoint | **Duration** | +| `http_requests_in_progress` | Gauge | — | Concurrency | +| `devops_info_endpoint_calls_total` | Counter | endpoint | Business metric | +| `devops_info_system_collection_seconds` | Histogram | — | Internal perf | + +**Why these metrics?** +- `http_requests_total` covers both Rate (req/s via `rate()`) and Errors (filter by `status_code=~"5.."`). +- `http_request_duration_seconds` enables latency percentile queries with `histogram_quantile()`. +- `http_requests_in_progress` tracks concurrency, useful for diagnosing queue build-up. +- Business metrics make the service observable at the domain level, not just the transport level. + +**Middleware approach:** A single async FastAPI middleware (`log_requests`) records all three HTTP metrics per request, excluding the `/metrics` endpoint itself to avoid feedback loops. + +--- + +## 3. Prometheus Configuration + +**File:** `monitoring/prometheus/prometheus.yml` + +```yaml +global: + scrape_interval: 15s + +scrape_configs: + - job_name: 'prometheus' # self-monitoring + - job_name: 'app' # app-python:8000/metrics + - job_name: 'loki' # loki:3100/metrics + - job_name: 'grafana' # grafana:3000/metrics +``` + +**Retention:** 15 days / 10 GB (set via CLI flags in docker-compose.yml) + +### All targets UP + +![prometheus targets](screenshots/targets.png) + +### PromQL query — `rate(http_requests_total[15m])` + +![promql query](screenshots/pql.png) + +--- + +## 4. 
Dashboard Walkthrough + +Dashboard file: `monitoring/grafana/dashboards/app-metrics.json` + +| Panel | Type | Query | Purpose | +|-------|------|-------|---------| +| Request Rate by Endpoint | Time series | `sum by (endpoint) (rate(http_requests_total[5m]))` | RED — Rate | +| Error Rate (5xx/s) | Time series | `sum(rate(http_requests_total{status_code=~"5.."}[5m]))` | RED — Errors | +| Request Duration p95 | Time series | `histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))` | RED — Duration | +| Request Duration Heatmap | Heatmap | `rate(http_request_duration_seconds_bucket[5m])` | Latency distribution | +| Active Requests | Gauge | `http_requests_in_progress` | Concurrency | +| Status Code Distribution | Pie chart | `sum by (status_code) (rate(http_requests_total[5m]))` | 2xx/4xx/5xx split | +| App Uptime | Stat | `up{job="app"}` | Service health | + +### Dashboard with live data + +![grafana dashboard](screenshots/dasboards.png) + +--- + +## 5. PromQL Examples + +```promql +# 1. Request rate per second (RED: Rate) +rate(http_requests_total[5m]) + +# 2. Total req/s across all endpoints +sum(rate(http_requests_total[5m])) + +# 3. Error rate — 5xx per second (RED: Errors) +sum(rate(http_requests_total{status_code=~"5.."}[5m])) + +# 4. 95th percentile latency (RED: Duration) +histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) + +# 5. Per-endpoint p95 latency +histogram_quantile(0.95, sum by (le, endpoint) (rate(http_request_duration_seconds_bucket[5m]))) + +# 6. Services currently down +up == 0 + +# 7. CPU usage of the app process +rate(process_cpu_seconds_total{job="app"}[5m]) * 100 + +# 8. Business metric — calls to each endpoint +rate(devops_info_endpoint_calls_total[5m]) +``` + +--- + +## 6. Production Setup + +### Health Checks +All services declare `healthcheck` blocks. Docker reports `healthy` / `unhealthy` per container. 
+ +### Resource Limits + +| Service | CPU | Memory | +|---------|-----|--------| +| Prometheus | 1.0 | 1 G | +| Loki | 1.0 | 1 G | +| Grafana | 0.5 | 512 M | +| app-python | 0.5 | 256 M | +| Promtail | 0.5 | 256 M | + +### Data Retention +- **Prometheus:** `--storage.tsdb.retention.time=15d`, `--storage.tsdb.retention.size=10GB` +- **Loki:** `retention_period: 168h` (7 days) in `loki/config.yml` + +### Persistent Volumes +```yaml +volumes: + prometheus-data: # Prometheus TSDB + loki-data: # Loki chunks + index + grafana-data: # Grafana DB (dashboards, users) +``` +Containers can be restarted or replaced without losing data. + +--- + +## 7. Testing Results + +### Services healthy — `docker compose ps` + +![docker compose ps](screenshots/docker-compose-ps.png) + +### `/metrics` endpoint output + +![metrics endpoint](screenshots/metrics.png) + +### Prometheus — all targets UP + +![prometheus targets](screenshots/targets.png) + +### PromQL query result + +![promql query](screenshots/pql.png) + +### Grafana dashboard with live data + +![grafana dashboard](screenshots/dasboards.png) + +--- + +## 8. 
Challenges & Solutions + +| Challenge | Solution | +|-----------|----------| +| FastAPI is ASGI, not Flask — `before_request` / `after_request` hooks don't exist | Used a single `@app.middleware("http")` to track start time, status code, and duration atomically | +| `/metrics` endpoint itself creates noise in metrics | Added `if endpoint != "/metrics": track = True` guard in middleware | +| Grafana provisioned datasource UID must match dashboard panel datasource refs | Set explicit `uid: prometheus` in the provisioning YAML and matched it (lowercase) in every panel's datasource block in the dashboard JSON | +| Prometheus `storage` block in `prometheus.yml` is not supported in v3.x | Retention configured via CLI flags `--storage.tsdb.retention.time` and `--storage.tsdb.retention.size` | + +--- + +## Metrics vs Logs (Lab 7 comparison) + +| | Logs (Lab 7 — Loki) | Metrics (Lab 8 — Prometheus) | +|--|---------------------|------------------------------| +| **What** | Discrete events with context | Aggregated numeric measurements | +| **When to use** | Debugging specific requests, tracing errors | Trending, alerting, capacity planning | +| **Query** | LogQL — text search | PromQL — math on time series | +| **Storage cost** | Higher (full text) | Lower (numbers + labels) | +| **Example** | "Request from 1.2.3.4 failed with 500" | "500 error rate is 0.03/s over last 5 min" | diff --git a/monitoring/docs/screenshots/dasboards.png b/monitoring/docs/screenshots/dasboards.png new file mode 100644 index 0000000000..7613c7aaa2 Binary files /dev/null and b/monitoring/docs/screenshots/dasboards.png differ diff --git a/monitoring/docs/screenshots/docker-compose-ps.png b/monitoring/docs/screenshots/docker-compose-ps.png new file mode 100644 index 0000000000..b6408365ea Binary files /dev/null and b/monitoring/docs/screenshots/docker-compose-ps.png differ diff --git a/monitoring/docs/screenshots/full-dashboard.png b/monitoring/docs/screenshots/full-dashboard.png new file mode 100644 
index 0000000000..f6e3d38957 Binary files /dev/null and b/monitoring/docs/screenshots/full-dashboard.png differ diff --git a/monitoring/docs/screenshots/grafana-secure.png b/monitoring/docs/screenshots/grafana-secure.png new file mode 100644 index 0000000000..ec4a4bf72f Binary files /dev/null and b/monitoring/docs/screenshots/grafana-secure.png differ diff --git a/monitoring/docs/screenshots/json-logs-example.png b/monitoring/docs/screenshots/json-logs-example.png new file mode 100644 index 0000000000..ec94c43c66 Binary files /dev/null and b/monitoring/docs/screenshots/json-logs-example.png differ diff --git a/monitoring/docs/screenshots/metrics.png b/monitoring/docs/screenshots/metrics.png new file mode 100644 index 0000000000..a3a8cefa94 Binary files /dev/null and b/monitoring/docs/screenshots/metrics.png differ diff --git a/monitoring/docs/screenshots/panel-1-logs.png b/monitoring/docs/screenshots/panel-1-logs.png new file mode 100644 index 0000000000..0a534757e6 Binary files /dev/null and b/monitoring/docs/screenshots/panel-1-logs.png differ diff --git a/monitoring/docs/screenshots/panel-2-rate.png b/monitoring/docs/screenshots/panel-2-rate.png new file mode 100644 index 0000000000..d06ef41e2b Binary files /dev/null and b/monitoring/docs/screenshots/panel-2-rate.png differ diff --git a/monitoring/docs/screenshots/panel-3-errors.png b/monitoring/docs/screenshots/panel-3-errors.png new file mode 100644 index 0000000000..e56d6c01bd Binary files /dev/null and b/monitoring/docs/screenshots/panel-3-errors.png differ diff --git a/monitoring/docs/screenshots/panel-4-distribution.png b/monitoring/docs/screenshots/panel-4-distribution.png new file mode 100644 index 0000000000..1451ebb0e6 Binary files /dev/null and b/monitoring/docs/screenshots/panel-4-distribution.png differ diff --git a/monitoring/docs/screenshots/pql.png b/monitoring/docs/screenshots/pql.png new file mode 100644 index 0000000000..07ac741aab Binary files /dev/null and 
b/monitoring/docs/screenshots/pql.png differ diff --git a/monitoring/docs/screenshots/promtail-targets.png b/monitoring/docs/screenshots/promtail-targets.png new file mode 100644 index 0000000000..10ed7ae5e6 Binary files /dev/null and b/monitoring/docs/screenshots/promtail-targets.png differ diff --git a/monitoring/docs/screenshots/targets.png b/monitoring/docs/screenshots/targets.png new file mode 100644 index 0000000000..d8da874cfa Binary files /dev/null and b/monitoring/docs/screenshots/targets.png differ diff --git a/monitoring/grafana/dashboards/app-metrics.json b/monitoring/grafana/dashboards/app-metrics.json new file mode 100644 index 0000000000..e4ab641555 --- /dev/null +++ b/monitoring/grafana/dashboards/app-metrics.json @@ -0,0 +1,121 @@ +{ + "title": "DevOps Info App — Metrics", + "uid": "devops-app-metrics", + "schemaVersion": 38, + "version": 1, + "refresh": "30s", + "time": { "from": "now-1h", "to": "now" }, + "panels": [ + { + "id": 1, + "title": "Request Rate by Endpoint", + "type": "timeseries", + "gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum by (endpoint) (rate(http_requests_total[5m]))", + "legendFormat": "{{endpoint}}" + } + ], + "fieldConfig": { + "defaults": { "unit": "reqps", "custom": { "lineWidth": 2 } } + } + }, + { + "id": 2, + "title": "Error Rate (5xx/s)", + "type": "timeseries", + "gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m]))", + "legendFormat": "5xx errors/s" + } + ], + "fieldConfig": { + "defaults": { "unit": "reqps", "color": { "fixedColor": "red", "mode": "fixed" } } + } + }, + { + "id": 3, + "title": "Request Duration p95", + "type": "timeseries", + "gridPos": { "x": 0, "y": 8, "w": 12, "h": 8 }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + 
"expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))", + "legendFormat": "p95 latency" + } + ], + "fieldConfig": { + "defaults": { "unit": "s" } + } + }, + { + "id": 4, + "title": "Request Duration Heatmap", + "type": "heatmap", + "gridPos": { "x": 12, "y": 8, "w": 12, "h": 8 }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "rate(http_request_duration_seconds_bucket[5m])", + "legendFormat": "{{le}}", + "format": "heatmap" + } + ] + }, + { + "id": 5, + "title": "Active Requests", + "type": "gauge", + "gridPos": { "x": 0, "y": 16, "w": 6, "h": 6 }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "http_requests_in_progress", + "legendFormat": "in progress" + } + ], + "fieldConfig": { + "defaults": { "unit": "short", "min": 0, "max": 100 } + } + }, + { + "id": 6, + "title": "Status Code Distribution", + "type": "piechart", + "gridPos": { "x": 6, "y": 16, "w": 9, "h": 6 }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum by (status_code) (rate(http_requests_total[5m]))", + "legendFormat": "HTTP {{status_code}}" + } + ] + }, + { + "id": 7, + "title": "App Uptime", + "type": "stat", + "gridPos": { "x": 15, "y": 16, "w": 9, "h": 6 }, + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "up{job=\"app\"}", + "legendFormat": "app-python" + } + ], + "fieldConfig": { + "defaults": { + "mappings": [ + { "type": "value", "options": { "0": { "text": "DOWN", "color": "red" }, "1": { "text": "UP", "color": "green" } } } + ] + } + } + } + ] +} diff --git a/monitoring/grafana/provisioning/dashboards/app-metrics-dashboard.json b/monitoring/grafana/provisioning/dashboards/app-metrics-dashboard.json new file mode 100644 index 0000000000..ec9ac7a5ea --- /dev/null +++ b/monitoring/grafana/provisioning/dashboards/app-metrics-dashboard.json @@ -0,0 +1,604 @@ +{ + "annotations": { + "list": 
[] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum(rate(http_requests_total[5m])) by (endpoint)", + "legendFormat": "{{endpoint}}", + "refId": "A" + } + ], + "title": "Request Rate by Endpoint", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + 
"viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m]))", + "legendFormat": "5xx errors", + "refId": "A" + } + ], + "title": "Error Rate (5xx)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.5 + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": 
"s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))", + "legendFormat": "p95 latency", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "histogram_quantile(0.50, rate(http_request_duration_seconds_bucket[5m]))", + "legendFormat": "p50 latency", + "refId": "B" + } + ], + "title": "Request Duration (p50, p95)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum(http_requests_in_progress)", + "legendFormat": "Active requests", + "refId": "A" + } + ], + "title": "Active Requests", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "0": { + "color": "red", + "index": 1, + "text": "DOWN" + }, + "1": { + "color": "green", + "index": 0, + "text": 
"UP" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 8 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "up{job=\"app\"}", + "legendFormat": "App Status", + "refId": "A" + } + ], + "title": "Service Uptime", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 6, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum by (status) (rate(http_requests_total[5m]))", + "legendFormat": "{{status}}", + "refId": "A" + } + ], + "title": "Status Code Distribution", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + 
"fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "rate(process_cpu_seconds_total[5m]) * 100", + "legendFormat": "CPU Usage", + "refId": "A" + } + ], + "title": "CPU Usage", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": [ + "prometheus", + "application" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Application Metrics Dashboard", + "uid": "app-metrics-dashboard", + "version": 1, + "weekStart": "" +} diff --git a/monitoring/grafana/provisioning/dashboards/dashboards.yml b/monitoring/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 0000000000..b0f4451550 --- /dev/null +++ b/monitoring/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,11 @@ +apiVersion: 1 + +providers: + - name: 'default' + orgId: 1 + folder: 'DevOps' + type: file + disableDeletion: false + updateIntervalSeconds: 30 + options: + path: /var/lib/grafana/dashboards diff --git a/monitoring/grafana/provisioning/datasources/loki.yml 
b/monitoring/grafana/provisioning/datasources/loki.yml new file mode 100644 index 0000000000..641f2b732e --- /dev/null +++ b/monitoring/grafana/provisioning/datasources/loki.yml @@ -0,0 +1,8 @@ +apiVersion: 1 + +datasources: + - name: Loki + type: loki + access: proxy + url: http://loki:3100 + isDefault: true diff --git a/monitoring/grafana/provisioning/datasources/prometheus.yml b/monitoring/grafana/provisioning/datasources/prometheus.yml new file mode 100644 index 0000000000..2a6df1ad59 --- /dev/null +++ b/monitoring/grafana/provisioning/datasources/prometheus.yml @@ -0,0 +1,11 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + uid: prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: false + jsonData: + timeInterval: '15s' diff --git a/monitoring/head b/monitoring/head new file mode 100644 index 0000000000..647f8fd2fa --- /dev/null +++ b/monitoring/head @@ -0,0 +1,5 @@ +{"client_ip":"172.22.0.1:42812","duration_ms":0,"level":"INFO","method":"GET","path":"/","service":"devops-go","status_code":200,"timestamp":"2026-03-12T15:17:58Z"} +{"client_ip":"172.22.0.1:42824","duration_ms":0,"level":"INFO","method":"GET","path":"/","service":"devops-go","status_code":200,"timestamp":"2026-03-12T15:17:58Z"} +{"client_ip":"172.22.0.1:42836","duration_ms":0,"level":"INFO","method":"GET","path":"/","service":"devops-go","status_code":200,"timestamp":"2026-03-12T15:17:58Z"} +{"client_ip":"172.22.0.1:42840","duration_ms":0,"level":"INFO","method":"GET","path":"/","service":"devops-go","status_code":200,"timestamp":"2026-03-12T15:17:58Z"} +{"client_ip":"172.22.0.1:42846","duration_ms":0,"level":"INFO","method":"GET","path":"/","service":"devops-go","status_code":200,"timestamp":"2026-03-12T15:17:58Z"} diff --git a/monitoring/loki/config.yml b/monitoring/loki/config.yml new file mode 100644 index 0000000000..a615ada40a --- /dev/null +++ b/monitoring/loki/config.yml @@ -0,0 +1,40 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + 
+common:
+  path_prefix: /loki
+  storage:
+    filesystem:
+      chunks_directory: /loki/chunks
+      rules_directory: /loki/rules
+  replication_factor: 1
+  ring:
+    kvstore:
+      store: inmemory
+
+schema_config:
+  configs:
+    - from: 2024-01-01
+      store: tsdb
+      object_store: filesystem
+      schema: v13
+      index:
+        prefix: index_
+        period: 24h
+
+storage_config:
+  filesystem:
+    directory: /loki/index
+
+  tsdb_shipper:
+    active_index_directory: /loki/tsdb-index
+    cache_location: /loki/tsdb-cache
+
+limits_config:
+  retention_period: 168h  # 7 days. NOTE(review): the compactor stanza below does not set retention_enabled, so this limit may not be enforced - confirm
+
+compactor:
+  working_directory: /loki/compactor
+  compaction_interval: 10m
diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml
new file mode 100644
index 0000000000..b607ce58c5
--- /dev/null
+++ b/monitoring/prometheus/prometheus.yml
@@ -0,0 +1,23 @@
+global:
+  scrape_interval: 15s  # default scrape frequency for every job below
+  evaluation_interval: 15s  # how often rules are evaluated
+
+scrape_configs:
+  - job_name: 'prometheus'  # presumably Prometheus scraping itself via localhost - confirm
+    static_configs:
+      - targets: ['localhost:9090']
+
+  - job_name: 'app'  # application container, reached by its service name
+    static_configs:
+      - targets: ['app-python:8000']
+    metrics_path: '/metrics'  # written out explicitly; /metrics is also the Prometheus default
+
+  - job_name: 'loki'
+    static_configs:
+      - targets: ['loki:3100']
+    metrics_path: '/metrics'
+
+  - job_name: 'grafana'
+    static_configs:
+      - targets: ['grafana:3000']
+    metrics_path: '/metrics'
diff --git a/monitoring/promtail/config.yml b/monitoring/promtail/config.yml
new file mode 100644
index 0000000000..aa666fd358
--- /dev/null
+++ b/monitoring/promtail/config.yml
@@ -0,0 +1,26 @@
+server:
+  http_listen_port: 9080
+  grpc_listen_port: 0  # NOTE(review): 0 presumably lets Promtail pick a random gRPC port - confirm
+
+positions:
+  filename: /tmp/positions.yaml  # read-offset bookkeeping. NOTE(review): /tmp may not survive a container restart - confirm a volume backs it
+
+clients:
+  - url: http://loki:3100/loki/api/v1/push  # where collected log lines are pushed
+
+scrape_configs:
+  - job_name: docker
+    docker_sd_configs:
+      - host: unix:///var/run/docker.sock  # discover containers through the local Docker socket
+        refresh_interval: 5s
+    relabel_configs:
+      - source_labels: ['__meta_docker_container_name']
+        regex: '/(.*)'  # capture the name without its leading slash
+        target_label: 'container'
+      - source_labels: ['__meta_docker_container_log_stream']
+        target_label: 'stream'
+      - source_labels: ['__meta_docker_container_label_logging']
+        regex: 'promtail'
+        action: keep  # only containers labelled logging=promtail are kept; all others are dropped
+      - source_labels: ['__meta_docker_container_label_app']
+        target_label: 'app'  # copy the container's "app" label onto the log stream
diff --git a/pulumi/docs/LAB04.md b/pulumi/docs/LAB04.md
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/terraform/.terraform.lock.hcl b/terraform/.terraform.lock.hcl
new file mode 100644
index 0000000000..b71107c512
--- /dev/null
+++ b/terraform/.terraform.lock.hcl
@@ -0,0 +1,22 @@
+# This file is maintained automatically by "terraform init".
+# Manual edits may be lost in future updates.
+
+provider "registry.terraform.io/yandex-cloud/yandex" {
+  version = "0.191.0"
+  hashes = [
+    "h1:nY8AeTA1tIE1acb61AVHCrobdGayezmTT7RVUzNsiz0=",
+    "zh:12fa5f2589986228018df66f91600d9343b24de9ad240820e19e8bc71a310997",
+    "zh:16c229277819fdcd78db4e66344d1ed4de7fe9d8960b83db90baa7d1b00926fa",
+    "zh:1b323d95089d99b2331d3aaad56415474199a19931b2c8d514017bac2e858368",
+    "zh:5b6eb7d73a99fbe4c0e7b223d7e5f5f4927595e63c04658bfb1613c06d86ff14",
+    "zh:7376c61074b67fcf74a483571a9878131b9269ddacd860c684e3ce23eb143d5d",
+    "zh:73f2a55c9daac8ebe7f5c30249c02af5f2d0a9654468817923765fb879bf8203",
+    "zh:75bdf4c29d29750c380568cdcca68f142762809c598625b79589026137d7c6a1",
+    "zh:869818890421a9930cdcffbbc0d29bd066896700cde26525ca4d3455163d8d6a",
+    "zh:872d6edf8092946e7f2d27752e7418124518816f7eab67b5547db2613d9b0f52",
+    "zh:98ea26be17e4cd6eb45a28f91541858ffd4b656604147c34660507b79759a534",
+    "zh:c86bd6dfb900672756363cafef96546929140bed2ee5582a20d662be1cd6301b",
+    "zh:d277dfac2884f635534a43077ade29feec8406813f7acf0a767ace6ffa073f01",
+    "zh:f4c5914d367c3d9256440ec1a7c2ede2b380697ef024cce06c2b145f0351c29a",
+  ]
+}
diff --git a/terraform/docs/LAB04.md b/terraform/docs/LAB04.md
new file mode 100644
index 0000000000..7899e2dfad
--- /dev/null
+++ b/terraform/docs/LAB04.md
@@ -0,0 +1,614 @@
+# Yandex Cloud Infrastructure Deployment with Terraform
+
+## Cloud Provider Selection
+I chose Yandex Cloud because it was specified in the course requirements and provides free credits for students.
It offers reliable infrastructure services in the Russian region. + +## Terraform Version +```bash +❯ terraform --version +Terraform v1.5.7 +on darwin_arm64 ++ provider registry.terraform.io/yandex-cloud/yandex v0.187.0 + +Your version of Terraform is out of date! The latest version +is 1.14.5. You can update by downloading from https://www.terraform.io/downloads.html +``` + +## Resources Created +- **VM name**: lab04-vm +- **Region/Zone**: ru-central1-a +- **Platform**: Intel Broadwell (standard-v1) +- **vCPU**: 2 cores with a 20% guaranteed core fraction (chosen because it is cheaper) +- **RAM**: 1 GB +- **Boot disk**: 10 GB (Ubuntu 22.04 LTS) +- **Network**: Custom VPC with public subnet +- **Security group**: Open ports 22 (restricted to my IP), 80, and 5000 + +## Public IP Address +``` +93.77.184.150 +``` + +## SSH Connection Command +```bash +ssh -i ~/.ssh/id_ed25519 -l ubuntu 93.77.184.150 +``` + +## Terraform Plan Output +``` +terraform plan + +Terraform used the selected providers to generate the following execution plan. 
Resource actions are indicated with the following symbols: + + create + +Terraform will perform the following actions: + + # yandex_compute_instance.vm will be created + + resource "yandex_compute_instance" "vm" { + + created_at = (known after apply) + + folder_id = (known after apply) + + fqdn = (known after apply) + + gpu_cluster_id = (known after apply) + + hardware_generation = (known after apply) + + hostname = (known after apply) + + id = (known after apply) + + maintenance_grace_period = (known after apply) + + maintenance_policy = (known after apply) + + metadata = { + + "ssh-keys" = <<-EOT + ubuntu:ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICmdbSKCFxCtdWPDN5DKaFsbrl1ZRDSWBZS2pQns/bM/ e.s.belozerov@macbook-RQM17PFPYP + EOT + } + + name = "lab04-vm" + + network_acceleration_type = "standard" + + platform_id = "standard-v1" + + status = (known after apply) + + zone = (known after apply) + + + boot_disk { + + auto_delete = true + + device_name = (known after apply) + + disk_id = (known after apply) + + mode = (known after apply) + + + initialize_params { + + block_size = (known after apply) + + description = (known after apply) + + image_id = "fd84kp940dsrccckilj6" + + name = (known after apply) + + size = 10 + + snapshot_id = (known after apply) + + type = "network-hdd" + } + } + + + network_interface { + + index = (known after apply) + + ip_address = (known after apply) + + ipv4 = true + + ipv6 = (known after apply) + + ipv6_address = (known after apply) + + mac_address = (known after apply) + + nat = true + + nat_ip_address = (known after apply) + + nat_ip_version = (known after apply) + + security_group_ids = (known after apply) + + subnet_id = (known after apply) + } + + + resources { + + core_fraction = 20 + + cores = 2 + + memory = 1 + } + } + + # yandex_vpc_network.lab_network will be created + + resource "yandex_vpc_network" "lab_network" { + + created_at = (known after apply) + + default_security_group_id = (known after apply) + + folder_id = (known after 
apply) + + id = (known after apply) + + labels = (known after apply) + + name = "lab-network" + + subnet_ids = (known after apply) + } + + # yandex_vpc_security_group.lab_sg will be created + + resource "yandex_vpc_security_group" "lab_sg" { + + created_at = (known after apply) + + folder_id = (known after apply) + + id = (known after apply) + + labels = (known after apply) + + name = "lab-sg" + + network_id = (known after apply) + + status = (known after apply) + + + egress { + + from_port = -1 + + id = (known after apply) + + labels = (known after apply) + + port = -1 + + protocol = "ANY" + + to_port = -1 + + v4_cidr_blocks = [ + + "0.0.0.0/0", + ] + + v6_cidr_blocks = [] + } + + + ingress { + + description = "App port" + + from_port = -1 + + id = (known after apply) + + labels = (known after apply) + + port = 5000 + + protocol = "TCP" + + to_port = -1 + + v4_cidr_blocks = [ + + "0.0.0.0/0", + ] + + v6_cidr_blocks = [] + } + + ingress { + + description = "HTTP" + + from_port = -1 + + id = (known after apply) + + labels = (known after apply) + + port = 80 + + protocol = "TCP" + + to_port = -1 + + v4_cidr_blocks = [ + + "0.0.0.0/0", + ] + + v6_cidr_blocks = [] + } + + ingress { + + description = "SSH" + + from_port = -1 + + id = (known after apply) + + labels = (known after apply) + + port = 22 + + protocol = "TCP" + + to_port = -1 + + v4_cidr_blocks = [ + + "188.130.155.165/32", + ] + + v6_cidr_blocks = [] + } + } + + # yandex_vpc_subnet.lab_subnet will be created + + resource "yandex_vpc_subnet" "lab_subnet" { + + created_at = (known after apply) + + folder_id = (known after apply) + + id = (known after apply) + + labels = (known after apply) + + name = "lab-subnet" + + network_id = (known after apply) + + v4_cidr_blocks = [ + + "10.0.1.0/24", + ] + + v6_cidr_blocks = (known after apply) + + zone = "ru-central1-a" + } + +Plan: 4 to add, 0 to change, 0 to destroy. 
+ +Changes to Outputs: + + public_ip = (known after apply) + +─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── + +Note: You didn't use the -out option to save this plan, so Terraform can't guarantee to take exactly these actions if you run "terraform apply" now. +~/uni/DevOps-Core-Course/terraform lab04 !1 ?2 ❯ 17:18:07 +``` + +## Terraform Apply Output +``` +❯ terraform apply + +Terraform used the selected providers to generate the following execution plan. Resource actions are indicated with the following symbols: + + create + +Terraform will perform the following actions: + + # yandex_compute_instance.vm will be created + + resource "yandex_compute_instance" "vm" { + + created_at = (known after apply) + + folder_id = (known after apply) + + fqdn = (known after apply) + + gpu_cluster_id = (known after apply) + + hardware_generation = (known after apply) + + hostname = (known after apply) + + id = (known after apply) + + maintenance_grace_period = (known after apply) + + maintenance_policy = (known after apply) + + metadata = { + + "ssh-keys" = <<-EOT + ubuntu:ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICmdbSKCFxCtdWPDN5DKaFsbrl1ZRDSWBZS2pQns/bM/ e.s.belozerov@macbook-RQM17PFPYP + EOT + } + + name = "lab04-vm" + + network_acceleration_type = "standard" + + platform_id = "standard-v1" + + status = (known after apply) + + zone = (known after apply) + + + boot_disk { + + auto_delete = true + + device_name = (known after apply) + + disk_id = (known after apply) + + mode = (known after apply) + + + initialize_params { + + block_size = (known after apply) + + description = (known after apply) + + image_id = "fd84kp940dsrccckilj6" + + name = (known after apply) + + size = 10 + + snapshot_id = (known after apply) + + type = "network-hdd" + } + } + + + network_interface { + + index = (known after apply) + + ip_address = (known after apply) + + ipv4 = true + + 
ipv6 = (known after apply) + + ipv6_address = (known after apply) + + mac_address = (known after apply) + + nat = true + + nat_ip_address = (known after apply) + + nat_ip_version = (known after apply) + + security_group_ids = (known after apply) + + subnet_id = (known after apply) + } + + + resources { + + core_fraction = 20 + + cores = 2 + + memory = 1 + } + } + + # yandex_vpc_network.lab_network will be created + + resource "yandex_vpc_network" "lab_network" { + + created_at = (known after apply) + + default_security_group_id = (known after apply) + + folder_id = (known after apply) + + id = (known after apply) + + labels = (known after apply) + + name = "lab-network" + + subnet_ids = (known after apply) + } + + # yandex_vpc_security_group.lab_sg will be created + + resource "yandex_vpc_security_group" "lab_sg" { + + created_at = (known after apply) + + folder_id = (known after apply) + + id = (known after apply) + + labels = (known after apply) + + name = "lab-sg" + + network_id = (known after apply) + + status = (known after apply) + + + egress { + + from_port = -1 + + id = (known after apply) + + labels = (known after apply) + + port = -1 + + protocol = "ANY" + + to_port = -1 + + v4_cidr_blocks = [ + + "0.0.0.0/0", + ] + + v6_cidr_blocks = [] + } + + + ingress { + + description = "App port" + + from_port = -1 + + id = (known after apply) + + labels = (known after apply) + + port = 5000 + + protocol = "TCP" + + to_port = -1 + + v4_cidr_blocks = [ + + "0.0.0.0/0", + ] + + v6_cidr_blocks = [] + } + + ingress { + + description = "HTTP" + + from_port = -1 + + id = (known after apply) + + labels = (known after apply) + + port = 80 + + protocol = "TCP" + + to_port = -1 + + v4_cidr_blocks = [ + + "0.0.0.0/0", + ] + + v6_cidr_blocks = [] + } + + ingress { + + description = "SSH" + + from_port = -1 + + id = (known after apply) + + labels = (known after apply) + + port = 22 + + protocol = "TCP" + + to_port = -1 + + v4_cidr_blocks = [ + + "188.130.155.165/32", + ] + + 
v6_cidr_blocks = [] + } + } + + # yandex_vpc_subnet.lab_subnet will be created + + resource "yandex_vpc_subnet" "lab_subnet" { + + created_at = (known after apply) + + folder_id = (known after apply) + + id = (known after apply) + + labels = (known after apply) + + name = "lab-subnet" + + network_id = (known after apply) + + v4_cidr_blocks = [ + + "10.0.1.0/24", + ] + + v6_cidr_blocks = (known after apply) + + zone = "ru-central1-a" + } + +Plan: 4 to add, 0 to change, 0 to destroy. + +Changes to Outputs: + + public_ip = (known after apply) + +Do you want to perform these actions? + Terraform will perform the actions described above. + Only 'yes' will be accepted to approve. + + Enter a value: yes + +yandex_vpc_network.lab_network: Creating... +yandex_vpc_network.lab_network: Creation complete after 3s [id=enp63dqhauntlc50ddu4] +yandex_vpc_subnet.lab_subnet: Creating... +yandex_vpc_security_group.lab_sg: Creating... +yandex_vpc_subnet.lab_subnet: Creation complete after 0s [id=e9bqhbsacgdh594stfa8] +yandex_vpc_security_group.lab_sg: Creation complete after 2s [id=enpv8vdugs0i88u2ff7s] +yandex_compute_instance.vm: Creating... +yandex_compute_instance.vm: Still creating... [10s elapsed] +yandex_compute_instance.vm: Still creating... [20s elapsed] +yandex_compute_instance.vm: Still creating... [30s elapsed] +yandex_compute_instance.vm: Still creating... [40s elapsed] +yandex_compute_instance.vm: Still creating... [50s elapsed] +yandex_compute_instance.vm: Creation complete after 51s [id=fhm1qlo93vpmqmvjrkck] + +Apply complete! Resources: 4 added, 0 changed, 0 destroyed. + +Outputs: + +public_ip = "93.77.184.150" +~/uni/DevOps-Core-Course/terraform lab04 !1 ?2 ❯ 1m 0s 17:19:35 +``` + +## Proof of SSH Access + +![ssh](./screenshots/ssh.png) + +## Terraform Destroy Output +``` +❯ terraform destroy +yandex_vpc_network.lab_network: Refreshing state... [id=enp63dqhauntlc50ddu4] +yandex_vpc_subnet.lab_subnet: Refreshing state... 
[id=e9bqhbsacgdh594stfa8] +yandex_vpc_security_group.lab_sg: Refreshing state... [id=enpv8vdugs0i88u2ff7s] +yandex_compute_instance.vm: Refreshing state... [id=fhm1qlo93vpmqmvjrkck] + +Terraform used the selected providers to generate the following execution plan. Resource actions are indicated with the following symbols: + - destroy + +Terraform will perform the following actions: + + # yandex_compute_instance.vm will be destroyed + - resource "yandex_compute_instance" "vm" { + - created_at = "2026-02-19T14:18:45Z" -> null + - folder_id = "b1g0cnocne76e6s8gf33" -> null + - fqdn = "fhm1qlo93vpmqmvjrkck.auto.internal" -> null + - hardware_generation = [ + - { + - generation2_features = [] + - legacy_features = [ + - { + - pci_topology = "PCI_TOPOLOGY_V1" + }, + ] + }, + ] -> null + - id = "fhm1qlo93vpmqmvjrkck" -> null + - labels = {} -> null + - metadata = { + - "ssh-keys" = <<-EOT + ubuntu:ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICmdbSKCFxCtdWPDN5DKaFsbrl1ZRDSWBZS2pQns/bM/ e.s.belozerov@macbook-RQM17PFPYP + EOT + } -> null + - name = "lab04-vm" -> null + - network_acceleration_type = "standard" -> null + - platform_id = "standard-v1" -> null + - status = "running" -> null + - zone = "ru-central1-a" -> null + + - boot_disk { + - auto_delete = true -> null + - device_name = "fhmk9knek2me3k1lsl0e" -> null + - disk_id = "fhmk9knek2me3k1lsl0e" -> null + - mode = "READ_WRITE" -> null + + - initialize_params { + - block_size = 4096 -> null + - image_id = "fd84kp940dsrccckilj6" -> null + - size = 10 -> null + - type = "network-hdd" -> null + } + } + + - metadata_options { + - aws_v1_http_endpoint = 1 -> null + - aws_v1_http_token = 2 -> null + - gce_http_endpoint = 1 -> null + - gce_http_token = 1 -> null + } + + - network_interface { + - index = 0 -> null + - ip_address = "10.0.1.30" -> null + - ipv4 = true -> null + - ipv6 = false -> null + - mac_address = "d0:0d:1d:57:09:1f" -> null + - nat = true -> null + - nat_ip_address = "93.77.184.150" -> null + - nat_ip_version = 
"IPV4" -> null + - security_group_ids = [ + - "enpv8vdugs0i88u2ff7s", + ] -> null + - subnet_id = "e9bqhbsacgdh594stfa8" -> null + } + + - placement_policy { + - host_affinity_rules = [] -> null + - placement_group_partition = 0 -> null + } + + - resources { + - core_fraction = 20 -> null + - cores = 2 -> null + - gpus = 0 -> null + - memory = 1 -> null + } + + - scheduling_policy { + - preemptible = false -> null + } + } + + # yandex_vpc_network.lab_network will be destroyed + - resource "yandex_vpc_network" "lab_network" { + - created_at = "2026-02-19T14:18:39Z" -> null + - default_security_group_id = "enpohhna9tmkh569c3r0" -> null + - folder_id = "b1g0cnocne76e6s8gf33" -> null + - id = "enp63dqhauntlc50ddu4" -> null + - labels = {} -> null + - name = "lab-network" -> null + - subnet_ids = [ + - "e9bqhbsacgdh594stfa8", + ] -> null + } + + # yandex_vpc_security_group.lab_sg will be destroyed + - resource "yandex_vpc_security_group" "lab_sg" { + - created_at = "2026-02-19T14:18:44Z" -> null + - folder_id = "b1g0cnocne76e6s8gf33" -> null + - id = "enpv8vdugs0i88u2ff7s" -> null + - labels = {} -> null + - name = "lab-sg" -> null + - network_id = "enp63dqhauntlc50ddu4" -> null + - status = "ACTIVE" -> null + + - egress { + - from_port = -1 -> null + - id = "enpkos8fbdcv5ref1i8p" -> null + - labels = {} -> null + - port = -1 -> null + - protocol = "ANY" -> null + - to_port = -1 -> null + - v4_cidr_blocks = [ + - "0.0.0.0/0", + ] -> null + - v6_cidr_blocks = [] -> null + } + + - ingress { + - description = "App port" -> null + - from_port = -1 -> null + - id = "enpo08kcb0g2edrkmo87" -> null + - labels = {} -> null + - port = 5000 -> null + - protocol = "TCP" -> null + - to_port = -1 -> null + - v4_cidr_blocks = [ + - "0.0.0.0/0", + ] -> null + - v6_cidr_blocks = [] -> null + } + - ingress { + - description = "HTTP" -> null + - from_port = -1 -> null + - id = "enpc7pat230ck8rea187" -> null + - labels = {} -> null + - port = 80 -> null + - protocol = "TCP" -> null + - 
to_port = -1 -> null + - v4_cidr_blocks = [ + - "0.0.0.0/0", + ] -> null + - v6_cidr_blocks = [] -> null + } + - ingress { + - description = "SSH" -> null + - from_port = -1 -> null + - id = "enpg3v816f87pk79jga1" -> null + - labels = {} -> null + - port = 22 -> null + - protocol = "TCP" -> null + - to_port = -1 -> null + - v4_cidr_blocks = [ + - "188.130.155.165/32", + ] -> null + - v6_cidr_blocks = [] -> null + } + } + + # yandex_vpc_subnet.lab_subnet will be destroyed + - resource "yandex_vpc_subnet" "lab_subnet" { + - created_at = "2026-02-19T14:18:42Z" -> null + - folder_id = "b1g0cnocne76e6s8gf33" -> null + - id = "e9bqhbsacgdh594stfa8" -> null + - labels = {} -> null + - name = "lab-subnet" -> null + - network_id = "enp63dqhauntlc50ddu4" -> null + - v4_cidr_blocks = [ + - "10.0.1.0/24", + ] -> null + - v6_cidr_blocks = [] -> null + - zone = "ru-central1-a" -> null + } + +Plan: 0 to add, 0 to change, 4 to destroy. + +Changes to Outputs: + - public_ip = "93.77.184.150" -> null + +Do you really want to destroy all resources? + Terraform will destroy all your managed infrastructure, as shown above. + There is no undo. Only 'yes' will be accepted to confirm. + + Enter a value: yes + +yandex_compute_instance.vm: Destroying... [id=fhm1qlo93vpmqmvjrkck] +yandex_compute_instance.vm: Still destroying... [id=fhm1qlo93vpmqmvjrkck, 10s elapsed] +yandex_compute_instance.vm: Still destroying... [id=fhm1qlo93vpmqmvjrkck, 20s elapsed] +yandex_compute_instance.vm: Still destroying... [id=fhm1qlo93vpmqmvjrkck, 30s elapsed] +yandex_compute_instance.vm: Destruction complete after 35s +yandex_vpc_subnet.lab_subnet: Destroying... [id=e9bqhbsacgdh594stfa8] +yandex_vpc_security_group.lab_sg: Destroying... [id=enpv8vdugs0i88u2ff7s] +yandex_vpc_security_group.lab_sg: Destruction complete after 0s +yandex_vpc_subnet.lab_subnet: Destruction complete after 5s +yandex_vpc_network.lab_network: Destroying... 
[id=enp63dqhauntlc50ddu4]
+yandex_vpc_network.lab_network: Destruction complete after 0s
+
+Destroy complete! Resources: 4 destroyed.
+~/uni/DevOps-Core-Course/terraform lab04 !1 ?2 ❯ 49s 17:24:20
+```
diff --git a/terraform/docs/screenshots/ssh.png b/terraform/docs/screenshots/ssh.png
new file mode 100644
index 0000000000..609b336371
Binary files /dev/null and b/terraform/docs/screenshots/ssh.png differ
diff --git a/terraform/main.tf b/terraform/main.tf
new file mode 100644
index 0000000000..817742310a
--- /dev/null
+++ b/terraform/main.tf
@@ -0,0 +1,74 @@
+# VPC network, subnet, security group and VM for the lab environment.
+
+resource "yandex_vpc_network" "lab_network" {
+  name = "lab-network"
+}
+
+resource "yandex_vpc_subnet" "lab_subnet" {
+  name           = "lab-subnet"
+  zone           = var.zone
+  network_id     = yandex_vpc_network.lab_network.id
+  v4_cidr_blocks = ["10.0.1.0/24"]
+}
+
+# Firewall rules: SSH only from var.ssh_allowed_cidrs; HTTP and the app port
+# from anywhere; all outbound traffic allowed.
+resource "yandex_vpc_security_group" "lab_sg" {
+  name       = "lab-sg"
+  network_id = yandex_vpc_network.lab_network.id
+
+  ingress {
+    protocol       = "TCP"
+    description    = "SSH"
+    port           = 22
+    v4_cidr_blocks = var.ssh_allowed_cidrs
+  }
+
+  ingress {
+    protocol       = "TCP"
+    description    = "HTTP"
+    port           = 80
+    v4_cidr_blocks = ["0.0.0.0/0"]
+  }
+
+  ingress {
+    protocol       = "TCP"
+    description    = "App port"
+    port           = 5000
+    v4_cidr_blocks = ["0.0.0.0/0"]
+  }
+
+  egress {
+    protocol       = "ANY"
+    v4_cidr_blocks = ["0.0.0.0/0"]
+  }
+}
+
+resource "yandex_compute_instance" "vm" {
+  name = var.vm_name
+
+  resources {
+    cores         = 2
+    memory        = 1 # GB
+    core_fraction = 20 # guaranteed share of each core, in percent
+  }
+
+  boot_disk {
+    initialize_params {
+      image_id = var.image_id
+      size     = 10 # GB
+      type     = "network-hdd"
+    }
+  }
+
+  network_interface {
+    subnet_id          = yandex_vpc_subnet.lab_subnet.id
+    security_group_ids = [yandex_vpc_security_group.lab_sg.id]
+    nat                = true # public address, exported below as nat_ip_address
+  }
+
+  metadata = {
+    # "<login>:<public key>"; the key text is read from the file at var.public_ssh_key.
+    ssh-keys = "ubuntu:${file(var.public_ssh_key)}"
+  }
+}
diff --git a/terraform/outputs.tf b/terraform/outputs.tf
new file mode 100644
index 0000000000..3291702eac
--- /dev/null
+++ b/terraform/outputs.tf
@@ -0,0 +1,4 @@
+output "public_ip" {
+  description = "Public (NAT) IPv4 address of the VM"
+  value       = yandex_compute_instance.vm.network_interface[0].nat_ip_address
+}
diff --git a/terraform/provider.tf b/terraform/provider.tf
new file mode 100644
index 0000000000..53a7d5daf6
--- /dev/null
+++ b/terraform/provider.tf
@@ -0,0 +1,15 @@
+terraform {
+  required_providers {
+    yandex = {
+      source = "yandex-cloud/yandex"
+    }
+  }
+  required_version = ">= 0.13"
+}
+
+provider "yandex" {
+  service_account_key_file = "key.json" # credential file; keep it out of version control
+  cloud_id                 = var.cloud_id
+  folder_id                = var.folder_id
+  zone                     = var.zone
+}
\ No newline at end of file
diff --git a/terraform/variables.tf b/terraform/variables.tf
new file mode 100644
index 0000000000..351492ad13
--- /dev/null
+++ b/terraform/variables.tf
@@ -0,0 +1,49 @@
+variable "cloud_id" {
+  description = "Yandex Cloud ID passed to the provider"
+  type        = string
+  default     = "b1g66sjilsdsanah7cpe"
+}
+
+variable "folder_id" {
+  description = "Yandex Cloud folder ID passed to the provider"
+  type        = string
+  default     = "b1g0cnocne76e6s8gf33"
+}
+
+variable "zone" {
+  description = "Availability zone used by the provider and the subnet"
+  type        = string
+  default     = "ru-central1-a"
+}
+
+variable "vm_name" {
+  description = "Name of the compute instance"
+  type        = string
+  default     = "lab04-vm"
+}
+
+# NOTE(review): main.tf does not reference this variable; it hard-codes the
+# "ubuntu" login in the ssh-keys metadata - confirm whether it is still needed.
+variable "vm_user" {
+  description = "VM user name"
+  type        = string
+  default     = "e.s.belozerov"
+}
+
+variable "public_ssh_key" {
+  description = "Path to public SSH key"
+  type        = string
+  default     = "~/.ssh/id_ed25519.pub"
+}
+
+variable "image_id" {
+  description = "ID of the image used to initialise the boot disk"
+  type        = string
+  default     = "fd84kp940dsrccckilj6"
+}
+
+variable "ssh_allowed_cidrs" {
+  description = "IPv4 CIDR blocks allowed to reach SSH (port 22)"
+  type        = list(string)
+  default     = ["188.130.155.165/32"]
+}
\ No newline at end of file