From 9289e61c15573f59778331c2842d21476b6f5211 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Fri, 29 May 2026 19:54:32 +0300 Subject: [PATCH 1/6] Tests --- spread.yaml | 4 +- .../integration/ha_tests/test_stereo_mode.py | 3 +- tests/integration/helpers.py | 3 +- .../integration/high_availability/__init__.py | 0 .../integration/high_availability/conftest.py | 37 ++++ .../high_availability/test_upgrade.py | 196 ++++++++++++++++++ tests/integration/pyproject.toml | 72 ------- tests/spread/test_upgrade.py/task.yaml | 7 + 8 files changed, 246 insertions(+), 76 deletions(-) create mode 100644 tests/integration/high_availability/__init__.py create mode 100644 tests/integration/high_availability/conftest.py create mode 100644 tests/integration/high_availability/test_upgrade.py delete mode 100644 tests/integration/pyproject.toml create mode 100644 tests/spread/test_upgrade.py/task.yaml diff --git a/spread.yaml b/spread.yaml index fa77fac..91317c5 100644 --- a/spread.yaml +++ b/spread.yaml @@ -43,7 +43,7 @@ backends: CONCIERGE_EXTRA_SNAPS: charmcraft CONCIERGE_EXTRA_DEBS: pipx systems: - - ubuntu-22.04: + - ubuntu-24.04: username: runner prepare: | systemctl disable --now unattended-upgrades.service @@ -97,7 +97,7 @@ backends: LANDSCAPE_ACCOUNT_NAME: '$(HOST: echo $LANDSCAPE_ACCOUNT_NAME)' LANDSCAPE_REGISTRATION_KEY: '$(HOST: echo $LANDSCAPE_REGISTRATION_KEY)' systems: - - ubuntu-22.04: + - ubuntu-24.04: username: runner - ubuntu-24.04-arm: username: runner diff --git a/tests/integration/ha_tests/test_stereo_mode.py b/tests/integration/ha_tests/test_stereo_mode.py index f24f986..5d453b2 100644 --- a/tests/integration/ha_tests/test_stereo_mode.py +++ b/tests/integration/ha_tests/test_stereo_mode.py @@ -18,12 +18,13 @@ import logging import pytest -from constants import RAFT_PARTNER_PREFIX from pysyncobj.utility import TcpUtility from pytest_operator.plugin import OpsTest from tenacity import Retrying, stop_after_delay, wait_fixed from yaml import safe_load +from constants import RAFT_PARTNER_PREFIX + from ..helpers import APPLICATION_NAME, DATABASE_APP_NAME from .helpers import APPLICATION_NAME as TEST_APP_NAME from .helpers import ( diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 8a1266d..7688be4 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -9,7 +9,6 @@ import psycopg2 import requests import yaml -from constants import PEER from juju.model import Model from pytest_operator.plugin import OpsTest from tenacity import ( @@ -18,6 +17,8 @@ wait_exponential, ) +from constants import PEER + CHARM_BASE = "ubuntu@22.04" METADATA = yaml.safe_load(Path("./metadata.yaml").read_text()) DATABASE_APP_NAME = "postgresql" diff --git a/tests/integration/high_availability/__init__.py b/tests/integration/high_availability/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration/high_availability/conftest.py b/tests/integration/high_availability/conftest.py new file mode 100644 index 0000000..7d02b89 --- /dev/null +++ b/tests/integration/high_availability/conftest.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +# Copyright 2022 Canonical Ltd. +# See LICENSE file for licensing details. + +import logging + +import pytest +from tenacity import Retrying, stop_after_attempt + +from .high_availability_helpers_new import get_app_leader + +logger = logging.getLogger(__name__) + +DB_TEST_APP_NAME = "postgresql-test-app" + + +@pytest.fixture() +def continuous_writes(juju): + """Starts continuous writes to the MySQL cluster for a test and clear the writes at the end.""" + application_unit = get_app_leader(juju, DB_TEST_APP_NAME) + + logger.info("Clearing continuous writes") + juju.run(unit=application_unit, action="clear-continuous-writes", wait=120).raise_on_failure() + + logger.info("Starting continuous writes") + + for attempt in Retrying(stop=stop_after_attempt(10), reraise=True): + with attempt: + result = juju.run(unit=application_unit, action="start-continuous-writes") + result.raise_on_failure() + + assert result.results["result"] == "True" + + yield + + logger.info("Clearing continuous writes") + juju.run(unit=application_unit, action="clear-continuous-writes", wait=120).raise_on_failure() diff --git a/tests/integration/high_availability/test_upgrade.py b/tests/integration/high_availability/test_upgrade.py new file mode 100644 index 0000000..b4e0508 --- /dev/null +++ b/tests/integration/high_availability/test_upgrade.py @@ -0,0 +1,196 @@ +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details. + +import logging +import platform +import shutil +import zipfile +from pathlib import Path + +import jubilant +import tomli +import tomli_w +from jubilant import Juju + +from .high_availability_helpers_new import ( + check_db_units_writes_increment, + count_switchovers, + get_app_leader, + get_app_units, + wait_for_apps_status, +) + +DB_APP_NAME = "postgresql" +WATCHER_APP_NAME = "postgresql-watcher" +DB_TEST_APP_NAME = "postgresql-test-app" + +MINUTE_SECS = 60 + +logging.getLogger("jubilant.wait").setLevel(logging.WARNING) + + +def test_deploy_latest(juju: Juju) -> None: + """Simple test to ensure that the PostgreSQL and application charms get deployed.""" + logging.info("Deploying PostgreSQL cluster") + juju.deploy( + charm=DB_APP_NAME, + app=DB_APP_NAME, + base="ubuntu@24.04", + channel="16/edge", + config={"profile": "testing", "synchronous-mode-strict": False}, + num_units=2, + ) + juju.deploy( + charm=WATCHER_APP_NAME, + app=WATCHER_APP_NAME, + base="ubuntu@24.04", + channel="16/edge", + config={"profile": "testing"}, + num_units=1, + ) + juju.deploy( + charm=DB_TEST_APP_NAME, + app=DB_TEST_APP_NAME, + base="ubuntu@24.04", + channel="latest/edge", + num_units=1, + ) + + juju.integrate( + f"{DB_APP_NAME}:watcher-offer", + f"{WATCHER_APP_NAME}:watcher", + ) + juju.integrate( + f"{DB_APP_NAME}:database", + f"{DB_TEST_APP_NAME}:database", + ) + + logging.info("Wait for applications to become active") + juju.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME, DB_TEST_APP_NAME), + timeout=20 * MINUTE_SECS, + ) + + +def test_pre_refresh_check(juju: Juju) -> None: + """Test that the pre-refresh-check action runs successfully.""" + db_leader = get_app_leader(juju, DB_APP_NAME) + + logging.info("Run pre-refresh-check action") + juju.run(unit=db_leader, action="pre-refresh-check") + + juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS) + + +def test_upgrade_from_edge(juju: Juju, charm: str, continuous_writes) -> None: + """Update the second cluster.""" + logging.info("Ensure continuous writes are incrementing") + check_db_units_writes_increment(juju, DB_APP_NAME) + + initial_number_of_switchovers = count_switchovers(juju, DB_APP_NAME) + + logging.info("Refresh the charm") + juju.refresh(app=DB_APP_NAME, path=charm) + logging.info("Wait for refresh to block as paused or incompatible") + try: + juju.wait(lambda status: status.apps[DB_APP_NAME].is_blocked, timeout=5 * MINUTE_SECS) + + units = get_app_units(juju, DB_APP_NAME) + unit_names = sorted(units.keys()) + + if "Refresh incompatible" in juju.status().apps[DB_APP_NAME].app_status.message: + logging.info("Application refresh is blocked due to incompatibility") + juju.run( + unit=unit_names[-1], + action="force-refresh-start", + params={"check-compatibility": False}, + wait=5 * MINUTE_SECS, + ) + + juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS) + + logging.info("Run resume-refresh action") + juju.run(unit=unit_names[1], action="resume-refresh", wait=5 * MINUTE_SECS) + except TimeoutError: + logging.info("Upgrade completed without snap refresh (charm.py upgrade only)") + assert juju.status().apps[DB_APP_NAME].is_active + + logging.info("Wait for upgrade to complete") + juju.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME), + timeout=20 * MINUTE_SECS, + ) + + logging.info("Wait for upgrade to complete") + juju.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME), + timeout=20 * MINUTE_SECS, + ) + + logging.info("Ensure continuous writes are incrementing") + check_db_units_writes_increment(juju, DB_APP_NAME) + + logging.info("checking the number of switchovers") + final_number_of_switchovers = count_switchovers(juju, DB_APP_NAME) + assert (final_number_of_switchovers - initial_number_of_switchovers) <= 2, ( + "Number of switchovers is greater than 2" + ) + + +def test_fail_and_rollback(juju: Juju, charm: str, continuous_writes) -> None: + """Test an upgrade failure and its rollback.""" + db_app_leader = get_app_leader(juju, DB_APP_NAME) + db_app_units = get_app_units(juju, DB_APP_NAME) + + logging.info("Run pre-refresh-check action") + juju.run(unit=db_app_leader, action="pre-refresh-check") + + juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS) + + tmp_folder = Path("tmp") + tmp_folder.mkdir(exist_ok=True) + tmp_folder_charm = Path(tmp_folder, charm).absolute() + + shutil.copy(charm, tmp_folder_charm) + + logging.info("Inject dependency fault") + inject_dependency_fault(juju, DB_APP_NAME, tmp_folder_charm) + + logging.info("Refresh the charm") + juju.refresh(app=DB_APP_NAME, path=tmp_folder_charm) + + logging.info("Wait for upgrade to fail on leader") + juju.wait( + ready=wait_for_apps_status(jubilant.any_blocked, DB_APP_NAME), + timeout=10 * MINUTE_SECS, + ) + + logging.info("Ensure continuous writes on all units") + check_db_units_writes_increment(juju, DB_APP_NAME, list(db_app_units)) + + logging.info("Re-refresh the charm") + juju.refresh(app=DB_APP_NAME, path=charm) + + logging.info("Wait for upgrade to complete") + juju.wait( + ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME), timeout=20 * MINUTE_SECS + ) + + logging.info("Ensure continuous writes after rollback procedure") + check_db_units_writes_increment(juju, DB_APP_NAME, list(db_app_units)) + + # Remove fault charm file + tmp_folder_charm.unlink() + + +def inject_dependency_fault(juju: Juju, app_name: str, charm_file: str | Path) -> None: + """Inject a dependency fault into the PostgreSQL charm.""" + with Path("refresh_versions.toml").open("rb") as file: + versions = tomli.load(file) + + versions["charm"] = "16/0.0.0" + versions["snap"]["revisions"][platform.machine()] = "1" + + # Overwrite refresh_versions.toml with incompatible version. + with zipfile.ZipFile(charm_file, mode="a") as charm_zip: + charm_zip.writestr("refresh_versions.toml", tomli_w.dumps(versions)) diff --git a/tests/integration/pyproject.toml b/tests/integration/pyproject.toml deleted file mode 100644 index f2bc1ab..0000000 --- a/tests/integration/pyproject.toml +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright 2026 Canonical Ltd. -# See LICENSE file for licensing details. - -# Linting tools configuration -[tool.ruff] -# preview and explicit preview are enabled for CPY001 -preview = true -target-version = "py310" -src = ["."] -line-length = 99 - -[tool.ruff.lint] -explicit-preview-rules = true -select = [ - "A", - "E", - "W", - "F", - "C", - "N", - "D", - "I001", - "B", - "CPY001", - "RUF", - "S", - "SIM", - "UP", - "TC", -] -extend-ignore = [ - "D203", - "D204", - "D213", - "D215", - "D400", - "D404", - "D406", - "D407", - "D408", - "D409", - "D413", - "B904", -] -# Ignore E501 because using black creates errors with this -# Ignore D107 Missing docstring in __init__ -ignore = ["E501", "D107"] - -[tool.ruff.lint.per-file-ignores] -"*" = [ - "D100", - "D101", - "D102", - "D103", - "D104", - # Asserts - "B011", - # Disable security checks for tests - "S", -] - -[tool.ruff.lint.flake8-copyright] -# Check for properly formatted copyright header in each file -author = "Canonical Ltd." -notice-rgx = "Copyright\\s\\d{4}([-,]\\d{4})*\\s+" -min-file-size = 1 - -[tool.ruff.lint.mccabe] -max-complexity = 10 - -[tool.ruff.lint.pydocstyle] -convention = "google" diff --git a/tests/spread/test_upgrade.py/task.yaml b/tests/spread/test_upgrade.py/task.yaml new file mode 100644 index 0000000..f99ac69 --- /dev/null +++ b/tests/spread/test_upgrade.py/task.yaml @@ -0,0 +1,7 @@ +summary: test_upgrade.py +environment: + TEST_MODULE: high_availability/test_upgrade.py +execute: | + tox run -e integration -- "tests/integration/$TEST_MODULE" --model testing --alluredir="$SPREAD_TASK/allure-results" +artifacts: + - allure-results From e3147cf5003a895eca3ffae00d531dafc9677bd0 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Sat, 30 May 2026 02:21:48 +0300 Subject: [PATCH 2/6] Bump snaps and test helpers --- refresh_versions.toml | 6 +- tests/integration/conftest.py | 18 ++ tests/integration/helpers.py | 31 +++ .../high_availability_helpers_new.py | 260 ++++++++++++++++++ 4 files changed, 312 insertions(+), 3 deletions(-) create mode 100644 tests/integration/high_availability/high_availability_helpers_new.py diff --git a/refresh_versions.toml b/refresh_versions.toml index 25bb7b2..c25bcbc 100644 --- a/refresh_versions.toml +++ b/refresh_versions.toml @@ -1,11 +1,11 @@ charm_major = 1 -workload = "16.13" +workload = "16.14" [snap] name = "charmed-postgresql" [snap.revisions] # amd64 -x86_64 = "332" +x86_64 = "360" # arm64 -aarch64 = "331" +aarch64 = "359" diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 580a011..0b2e553 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -2,6 +2,7 @@ # See LICENSE file for licensing details. import logging +import jubilant import pytest from . import architecture @@ -15,3 +16,20 @@ def charm(): # juju bundle files expect local charms to begin with `./` or `/` to distinguish them from # Charmhub charms. return f"./postgresql-watcher_ubuntu@24.04-{architecture.architecture}.charm" + + +@pytest.fixture(scope="module") +def juju(request: pytest.FixtureRequest): + """Pytest fixture that wraps :meth:`jubilant.with_model`. + + This adds command line parameter ``--keep-models`` (see help for details). + """ + model = request.config.getoption("--model") + keep_models = bool(request.config.getoption("--keep-models")) + + if model: + juju = jubilant.Juju(model=model) + yield juju + else: + with jubilant.temp_model(keep=keep_models) as juju: + yield juju diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 7688be4..ae11816 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -195,3 +195,34 @@ async def run_command_on_unit(ops_test: OpsTest, unit_name: str, command: str) - f"Expected command '{command}' to succeed instead it failed: {return_code}" ) return stdout + + +### Ported Mysql jubilant helpers + + +def execute_queries_on_unit( + unit_address: str, username: str, password: str, queries: list[str], database: str +) -> list: + """Execute given PostgreSQL queries on a unit. + + Args: + unit_address: The public IP address of the unit to execute the queries on + username: The PostgreSQL username + password: The PostgreSQL password + queries: A list of queries to execute + database: Database to execute in + + Returns: + A list of rows that were potentially queried + """ + with ( + psycopg2.connect( + f"dbname='{database}' user='{username}' host='{unit_address}' password='{password}' connect_timeout=10" + ) as connection, + connection.cursor() as cursor, + ): + for query in queries: + cursor.execute(query) + output = list(itertools.chain(*cursor.fetchall())) + + return output diff --git a/tests/integration/high_availability/high_availability_helpers_new.py b/tests/integration/high_availability/high_availability_helpers_new.py new file mode 100644 index 0000000..27e0bcc --- /dev/null +++ b/tests/integration/high_availability/high_availability_helpers_new.py @@ -0,0 +1,260 @@ +#!/usr/bin/env python3 +# Copyright 2025 Canonical Ltd. +# See LICENSE file for licensing details. + +import json +import subprocess +from collections.abc import Callable + +import jubilant +import requests +from jubilant import Juju +from jubilant.statustypes import Status, UnitStatus +from tenacity import Retrying, stop_after_delay, wait_fixed + +from constants import PEER + +from ..helpers import execute_queries_on_unit + +MINUTE_SECS = 60 +SERVER_CONFIG_USERNAME = "operator" + +JujuModelStatusFn = Callable[[Status], bool] +JujuAppsStatusFn = Callable[[Status, str], bool] + + +def check_db_units_writes_increment( + juju: Juju, + app_name: str, + app_units: list[str] | None = None, + db_name: str = "postgresql_test_app_database", +) -> None: + """Ensure that continuous writes is incrementing on all units. + + Also, ensure that all continuous writes up to the max written value is available + on all units (ensure that no committed data is lost). + """ + if not app_units: + app_units = get_app_units(juju, app_name) + + app_primary = get_db_primary_unit(juju, app_name) + app_max_value = get_db_max_written_value(juju, app_name, app_primary, db_name) + + for unit_name in app_units: + for attempt in Retrying( + reraise=True, + stop=stop_after_delay(5 * MINUTE_SECS), + wait=wait_fixed(10), + ): + with attempt: + unit_max_value = get_db_max_written_value(juju, app_name, unit_name, db_name) + assert unit_max_value > app_max_value, "Writes not incrementing" + app_max_value = unit_max_value + + +def get_app_leader(juju: Juju, app_name: str) -> str: + """Get the leader unit for the given application.""" + model_status = juju.status() + app_status = model_status.apps[app_name] + for name, status in app_status.units.items(): + if status.leader: + return name + + raise Exception("No leader unit found") + + +def get_app_name(juju: Juju, charm_name: str) -> str | None: + """Get the application name for the given charm.""" + model_status = juju.status() + app_statuses = model_status.apps + for name, status in app_statuses.items(): + if status.charm_name == charm_name: + return name + + raise Exception("No application name found") + + +def get_app_units(juju: Juju, app_name: str) -> dict[str, UnitStatus]: + """Get the units for the given application.""" + model_status = juju.status() + app_status = model_status.apps[app_name] + return app_status.units + + +def get_unit_by_number(juju: Juju, app_name: str, unit_number: int) -> str: + """Get unit by number.""" + model_status = juju.status() + app_status = model_status.apps[app_name] + for name in app_status.units: + if name == f"{app_name}/{unit_number}": + return name + + raise Exception("No application unit found") + + +def get_unit_ip(juju: Juju, app_name: str, unit_name: str) -> str: + """Get the application unit IP.""" + model_status = juju.status() + app_status = model_status.apps[app_name] + for name, status in app_status.units.items(): + if name == unit_name: + return status.public_address + + raise Exception("No application unit found") + + +def get_unit_info(juju: Juju, unit_name: str) -> dict: + """Return a dictionary with the show-unit data.""" + output = subprocess.check_output( + ["juju", "show-unit", f"--model={juju.model}", "--format=json", unit_name], + text=True, + ) + + return json.loads(output) + + +def get_unit_status_log(juju: Juju, unit_name: str, log_lines: int = 0) -> list[dict]: + """Get the status log for a unit. + + Args: + juju: The juju instance to use. + unit_name: The name of the unit to retrieve the status log for + log_lines: The number of status logs to retrieve (optional) + """ + # fmt: off + output = subprocess.check_output( + ["juju", "show-status-log", f"--model={juju.model}", "--format=json", unit_name, "-n", f"{log_lines}"], + text=True, + ) + + return json.loads(output) + + +def get_relation_data(juju: Juju, app_name: str, rel_name: str) -> list[dict]: + """Returns a list that contains the relation-data. + + Args: + juju: The juju instance to use. + app_name: The name of the application + rel_name: name of the relation to get connection data from + + Returns: + A list that contains the relation-data + """ + app_leader = get_app_leader(juju, app_name) + app_leader_info = get_unit_info(juju, app_leader) + if not app_leader_info: + raise ValueError(f"No unit info could be grabbed for unit {app_leader}") + + relation_data = [ + value + for value in app_leader_info[app_leader]["relation-info"] + if value["endpoint"] == rel_name + ] + if not relation_data: + raise ValueError(f"No relation data could be grabbed for relation {rel_name}") + + return relation_data + + +def get_db_unit_name(instance_label: str) -> str: + """Builds a Juju unit name out of a MySQL instance label.""" + return "/".join(instance_label.rsplit("-", 1)) + + +def get_db_primary_unit(juju: Juju, app_name: str) -> str: + """Get the current primary node of the cluster.""" + postgresql_primary = get_app_leader(juju, app_name) + task = juju.run(unit=postgresql_primary, action="get-primary", wait=5 * MINUTE_SECS) + task.raise_on_failure() + + primary = task.results.get("primary") + if primary != "None": + return primary + + raise Exception("No primary node found") + + +def get_db_standby_leader_unit(juju: Juju, app_name: str) -> str: + """Get the current standby node of the cluster.""" + unit_address = get_unit_ip(juju, app_name, get_app_leader(juju, app_name)) + + for member in requests.get(f"https://{unit_address}:8008/cluster", verify=False).json()[ + "members" + ]: + if member["role"] == "standby_leader": + return member["name"][::-1].replace("-", "/")[::-1] + + raise Exception("No standby primary node found") + + +def get_db_max_written_value( + juju: Juju, app_name: str, unit_name: str, db_name: str = "postgresql_test_app_database" +) -> int: + """Retrieve the max written value in the PostgreSQL database. + + Args: + juju: The Juju model. + app_name: The application name. + unit_name: The unit name. + db_name: The database to connect to. + """ + password = get_user_password(juju, app_name, SERVER_CONFIG_USERNAME) + + output = execute_queries_on_unit( + get_unit_ip(juju, app_name, unit_name), + SERVER_CONFIG_USERNAME, + password, + ["SELECT MAX(number) FROM continuous_writes;"], + db_name, + ) + return output[0] + + +def wait_for_apps_status(jubilant_status_func: JujuAppsStatusFn, *apps: str) -> JujuModelStatusFn: + """Waits for Juju agents to be idle, and for applications to reach a certain status. + + Args: + jubilant_status_func: The Juju apps status function to wait for. + apps: The applications to wait for. + + Returns: + Juju model status function. + """ + return lambda status: all(( + jubilant.all_agents_idle(status, *apps), + jubilant_status_func(status, *apps), + )) + + +def wait_for_unit_status(app_name: str, unit_name: str, unit_status: str) -> JujuModelStatusFn: + """Returns whether a Juju unit to have a specific status.""" + return lambda status: ( + status.apps[app_name].units[unit_name].workload_status.current == unit_status + ) + + +def wait_for_unit_message(app_name: str, unit_name: str, unit_message: str) -> JujuModelStatusFn: + """Returns whether a Juju unit to have a specific message.""" + return lambda status: ( + status.apps[app_name].units[unit_name].workload_status.message == unit_message + ) + + +# PG helpers + + +def get_user_password(juju: Juju, app_name: str, user: str) -> str | None: + """Get a system user's password.""" + for secret in juju.secrets(): + if secret.label == f"{PEER}.{app_name}.app": + revealed_secret = juju.show_secret(secret.uri, reveal=True) + return revealed_secret.content.get(f"{user}-password") + + +def count_switchovers(juju: Juju, app_name: str) -> int: + """Return the number of performed switchovers.""" + app_primary = get_db_primary_unit(juju, app_name) + unit_address = get_unit_ip(juju, app_name, app_primary) + switchover_history_info = requests.get(f"https://{unit_address}:8008/history", verify=False) + return len(switchover_history_info.json()) From 3eee089145eedad7999a5cb94887706a8fef1a4f Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Sat, 30 May 2026 06:19:01 +0300 Subject: [PATCH 3/6] Stop machine before removing --- tests/integration/ha_tests/test_stereo_mode.py | 18 +++++++++++------- tests/integration/helpers.py | 12 ++++++++++++ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/tests/integration/ha_tests/test_stereo_mode.py b/tests/integration/ha_tests/test_stereo_mode.py index 5d453b2..79d94f9 100644 --- a/tests/integration/ha_tests/test_stereo_mode.py +++ b/tests/integration/ha_tests/test_stereo_mode.py @@ -25,7 +25,7 @@ from constants import RAFT_PARTNER_PREFIX -from ..helpers import APPLICATION_NAME, DATABASE_APP_NAME +from ..helpers import APPLICATION_NAME, DATABASE_APP_NAME, get_machine_from_unit, stop_machine from .helpers import APPLICATION_NAME as TEST_APP_NAME from .helpers import ( are_writes_increasing, @@ -261,6 +261,7 @@ async def test_replica_shutdown_with_watcher(ops_test: OpsTest, continuous_write logger.info(f"Shutting down replica: {replica}") # Shutdown the replica + await stop_machine(ops_test, await get_machine_from_unit(ops_test, replica)) await ops_test.model.destroy_unit(replica, force=True, destroy_storage=False, max_wait=1500) # Wait for the cluster to stabilize after unit removal @@ -340,6 +341,7 @@ async def test_primary_shutdown_with_watcher(ops_test: OpsTest, continuous_write logger.info(f"Shutting down primary: {original_primary}") # Shutdown the primary + await stop_machine(ops_test, await get_machine_from_unit(ops_test, original_primary)) await ops_test.model.destroy_unit( original_primary, force=True, destroy_storage=False, max_wait=1500 ) @@ -434,6 +436,7 @@ async def test_watcher_shutdown_no_outage(ops_test: OpsTest, continuous_writes) # Remove the watcher watcher_unit = ops_test.model.applications[WATCHER_APP_NAME].units[0] + await stop_machine(ops_test, await get_machine_from_unit(ops_test, watcher_unit.name)) await ops_test.model.destroy_unit(watcher_unit.name, force=True, max_wait=300) # Verify writes continue without interruption @@ -520,12 +523,13 @@ async def test_primary_network_isolation_with_watcher( # Wait for cluster to stabilize with restored network # The old primary may take time to rejoin after getting a new IP address, # so we use raise_on_error=False and wait longer - await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME], - timeout=900, - idle_period=30, - raise_on_error=False, # Old primary may be in error while rejoining - ) + async with ops_test.fast_forward(fast_interval="60s"): + await ops_test.model.wait_for_idle( + apps=[DATABASE_APP_NAME], + timeout=900, + idle_period=30, + raise_on_error=False, # Old primary may be in error while rejoining + ) # Wait for the old primary to rejoin as replica # This can take a while as it needs to recover with a new IP diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index ae11816..7a4c0db 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -4,6 +4,7 @@ import itertools import json import logging +import subprocess from pathlib import Path import psycopg2 @@ -197,6 +198,17 @@ async def run_command_on_unit(ops_test: OpsTest, unit_name: str, command: str) - return stdout +async def stop_machine(ops_test: OpsTest, machine_name: str) -> None: + """Stop the machine where a unit run on. + + Args: + ops_test: The ops test framework instance + machine_name: The name of the machine to stop + """ + stop_machine_command = f"lxc stop {machine_name}" + subprocess.check_call(stop_machine_command.split()) + + ### Ported Mysql jubilant helpers From f9e9d01632d2d8c442c545feff6af301d38ccb8e Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Sat, 30 May 2026 06:30:17 +0300 Subject: [PATCH 4/6] Upgrade test --- .../high_availability/test_upgrade.py | 54 +++++++------------ 1 file changed, 20 insertions(+), 34 deletions(-) diff --git a/tests/integration/high_availability/test_upgrade.py b/tests/integration/high_availability/test_upgrade.py index b4e0508..be19cf5 100644 --- a/tests/integration/high_availability/test_upgrade.py +++ b/tests/integration/high_availability/test_upgrade.py @@ -14,7 +14,6 @@ from .high_availability_helpers_new import ( check_db_units_writes_increment, - count_switchovers, get_app_leader, get_app_units, wait_for_apps_status, @@ -67,17 +66,19 @@ def test_deploy_latest(juju: Juju) -> None: logging.info("Wait for applications to become active") juju.wait( - ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME, DB_TEST_APP_NAME), + ready=wait_for_apps_status( + jubilant.all_active, DB_APP_NAME, DB_TEST_APP_NAME, WATCHER_APP_NAME + ), timeout=20 * MINUTE_SECS, ) def test_pre_refresh_check(juju: Juju) -> None: """Test that the pre-refresh-check action runs successfully.""" - db_leader = get_app_leader(juju, DB_APP_NAME) + watcher_leader = get_app_leader(juju, WATCHER_APP_NAME) logging.info("Run pre-refresh-check action") - juju.run(unit=db_leader, action="pre-refresh-check") + juju.run(unit=watcher_leader, action="pre-refresh-check") juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS) @@ -87,18 +88,16 @@ def test_upgrade_from_edge(juju: Juju, charm: str, continuous_writes) -> None: logging.info("Ensure continuous writes are incrementing") check_db_units_writes_increment(juju, DB_APP_NAME) - initial_number_of_switchovers = count_switchovers(juju, DB_APP_NAME) - logging.info("Refresh the charm") - juju.refresh(app=DB_APP_NAME, path=charm) + juju.refresh(app=WATCHER_APP_NAME, path=charm) logging.info("Wait for refresh to block as paused or incompatible") try: - juju.wait(lambda status: status.apps[DB_APP_NAME].is_blocked, timeout=5 * MINUTE_SECS) + juju.wait(lambda status: status.apps[WATCHER_APP_NAME].is_blocked, timeout=5 * MINUTE_SECS) - units = get_app_units(juju, DB_APP_NAME) + units = get_app_units(juju, WATCHER_APP_NAME) unit_names = sorted(units.keys()) - if "Refresh incompatible" in juju.status().apps[DB_APP_NAME].app_status.message: + if "Refresh incompatible" in juju.status().apps[WATCHER_APP_NAME].app_status.message: logging.info("Application refresh is blocked due to incompatibility") juju.run( unit=unit_names[-1], @@ -113,37 +112,24 @@ def test_upgrade_from_edge(juju: Juju, charm: str, continuous_writes) -> None: juju.run(unit=unit_names[1], action="resume-refresh", wait=5 * MINUTE_SECS) except TimeoutError: logging.info("Upgrade completed without snap refresh (charm.py upgrade only)") - assert juju.status().apps[DB_APP_NAME].is_active - - logging.info("Wait for upgrade to complete") - juju.wait( - ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME), - timeout=20 * MINUTE_SECS, - ) + assert juju.status().apps[WATCHER_APP_NAME].is_active logging.info("Wait for upgrade to complete") juju.wait( - ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME), + ready=wait_for_apps_status(jubilant.all_active, WATCHER_APP_NAME), timeout=20 * MINUTE_SECS, ) logging.info("Ensure continuous writes are incrementing") check_db_units_writes_increment(juju, DB_APP_NAME) - logging.info("checking the number of switchovers") - final_number_of_switchovers = count_switchovers(juju, DB_APP_NAME) - assert (final_number_of_switchovers - initial_number_of_switchovers) <= 2, ( - "Number of switchovers is greater than 2" - ) - def test_fail_and_rollback(juju: Juju, charm: str, continuous_writes) -> None: """Test an upgrade failure and its rollback.""" - db_app_leader = get_app_leader(juju, DB_APP_NAME) - db_app_units = get_app_units(juju, DB_APP_NAME) + watcher_app_leader = get_app_leader(juju, WATCHER_APP_NAME) logging.info("Run pre-refresh-check action") - juju.run(unit=db_app_leader, action="pre-refresh-check") + juju.run(unit=watcher_app_leader, action="pre-refresh-check") juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS) @@ -154,30 +140,30 @@ def test_fail_and_rollback(juju: Juju, charm: str, continuous_writes) -> None: shutil.copy(charm, tmp_folder_charm) logging.info("Inject dependency fault") - inject_dependency_fault(juju, DB_APP_NAME, tmp_folder_charm) + inject_dependency_fault(juju, WATCHER_APP_NAME, tmp_folder_charm) logging.info("Refresh the charm") - juju.refresh(app=DB_APP_NAME, path=tmp_folder_charm) + juju.refresh(app=WATCHER_APP_NAME, path=tmp_folder_charm) logging.info("Wait for upgrade to fail on leader") juju.wait( - ready=wait_for_apps_status(jubilant.any_blocked, DB_APP_NAME), + ready=wait_for_apps_status(jubilant.any_blocked, WATCHER_APP_NAME), timeout=10 * MINUTE_SECS, ) logging.info("Ensure continuous writes on all units") - check_db_units_writes_increment(juju, DB_APP_NAME, list(db_app_units)) + check_db_units_writes_increment(juju, DB_APP_NAME) logging.info("Re-refresh the charm") - juju.refresh(app=DB_APP_NAME, path=charm) + juju.refresh(app=WATCHER_APP_NAME, path=charm) logging.info("Wait for upgrade to complete") juju.wait( - ready=wait_for_apps_status(jubilant.all_active, DB_APP_NAME), timeout=20 * MINUTE_SECS + ready=wait_for_apps_status(jubilant.all_active, WATCHER_APP_NAME), timeout=20 * MINUTE_SECS ) logging.info("Ensure continuous writes after rollback procedure") - check_db_units_writes_increment(juju, DB_APP_NAME, list(db_app_units)) + check_db_units_writes_increment(juju, DB_APP_NAME) # Remove fault charm file tmp_folder_charm.unlink() From cd7f9cac9a8954e616f702b8ee4a7d3403a84c41 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Sat, 30 May 2026 06:46:03 +0300 Subject: [PATCH 5/6] Unused envvars --- spread.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/spread.yaml b/spread.yaml index 91317c5..1a10d2d 100644 --- a/spread.yaml +++ b/spread.yaml @@ -89,13 +89,6 @@ backends: # Manually pass specific environment variables environment: CI: '$(HOST: echo $CI)' - AWS_ACCESS_KEY: '$(HOST: echo $AWS_ACCESS_KEY)' - AWS_SECRET_KEY: '$(HOST: echo $AWS_SECRET_KEY)' - GCP_ACCESS_KEY: '$(HOST: echo $GCP_ACCESS_KEY)' - GCP_SECRET_KEY: '$(HOST: echo $GCP_SECRET_KEY)' - UBUNTU_PRO_TOKEN: '$(HOST: echo $UBUNTU_PRO_TOKEN)' - LANDSCAPE_ACCOUNT_NAME: '$(HOST: echo $LANDSCAPE_ACCOUNT_NAME)' - LANDSCAPE_REGISTRATION_KEY: '$(HOST: echo $LANDSCAPE_REGISTRATION_KEY)' systems: - ubuntu-24.04: username: runner From 8231132ad4a7118ad8183234bd38d1cfd31a40df Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Sat, 30 May 2026 07:37:50 +0300 Subject: [PATCH 6/6] No multiple units during upgrade --- .../high_availability_helpers_new.py | 120 ------------------ .../high_availability/test_upgrade.py | 3 - 2 files changed, 123 deletions(-) diff --git a/tests/integration/high_availability/high_availability_helpers_new.py b/tests/integration/high_availability/high_availability_helpers_new.py index 27e0bcc..6f87aa2 100644 --- a/tests/integration/high_availability/high_availability_helpers_new.py +++ b/tests/integration/high_availability/high_availability_helpers_new.py @@ -1,13 +1,9 @@ #!/usr/bin/env python3 # Copyright 2025 Canonical Ltd. # See LICENSE file for licensing details. - -import json -import subprocess from collections.abc import Callable import jubilant -import requests from jubilant import Juju from jubilant.statustypes import Status, UnitStatus from tenacity import Retrying, stop_after_delay, wait_fixed @@ -63,17 +59,6 @@ def get_app_leader(juju: Juju, app_name: str) -> str: raise Exception("No leader unit found") -def get_app_name(juju: Juju, charm_name: str) -> str | None: - """Get the application name for the given charm.""" - model_status = juju.status() - app_statuses = model_status.apps - for name, status in app_statuses.items(): - if status.charm_name == charm_name: - return name - - raise Exception("No application name found") - - def get_app_units(juju: Juju, app_name: str) -> dict[str, UnitStatus]: """Get the units for the given application.""" model_status = juju.status() @@ -81,17 +66,6 @@ def get_app_units(juju: Juju, app_name: str) -> dict[str, UnitStatus]: return app_status.units -def get_unit_by_number(juju: Juju, app_name: str, unit_number: int) -> str: - """Get unit by number.""" - model_status = juju.status() - app_status = model_status.apps[app_name] - for name in app_status.units: - if name == f"{app_name}/{unit_number}": - return name - - raise Exception("No application unit found") - - def get_unit_ip(juju: Juju, app_name: str, unit_name: str) -> str: """Get the application unit IP.""" model_status = juju.status() @@ -103,65 +77,6 @@ def get_unit_ip(juju: Juju, app_name: str, unit_name: str) -> str: raise Exception("No application unit found") -def get_unit_info(juju: Juju, unit_name: str) -> dict: - """Return a dictionary with the show-unit data.""" - output = subprocess.check_output( - ["juju", "show-unit", f"--model={juju.model}", "--format=json", unit_name], - text=True, - ) - - return json.loads(output) - - -def get_unit_status_log(juju: Juju, unit_name: str, log_lines: int = 0) -> list[dict]: - """Get the status log for a unit. - - Args: - juju: The juju instance to use. - unit_name: The name of the unit to retrieve the status log for - log_lines: The number of status logs to retrieve (optional) - """ - # fmt: off - output = subprocess.check_output( - ["juju", "show-status-log", f"--model={juju.model}", "--format=json", unit_name, "-n", f"{log_lines}"], - text=True, - ) - - return json.loads(output) - - -def get_relation_data(juju: Juju, app_name: str, rel_name: str) -> list[dict]: - """Returns a list that contains the relation-data. - - Args: - juju: The juju instance to use. - app_name: The name of the application - rel_name: name of the relation to get connection data from - - Returns: - A list that contains the relation-data - """ - app_leader = get_app_leader(juju, app_name) - app_leader_info = get_unit_info(juju, app_leader) - if not app_leader_info: - raise ValueError(f"No unit info could be grabbed for unit {app_leader}") - - relation_data = [ - value - for value in app_leader_info[app_leader]["relation-info"] - if value["endpoint"] == rel_name - ] - if not relation_data: - raise ValueError(f"No relation data could be grabbed for relation {rel_name}") - - return relation_data - - -def get_db_unit_name(instance_label: str) -> str: - """Builds a Juju unit name out of a MySQL instance label.""" - return "/".join(instance_label.rsplit("-", 1)) - - def get_db_primary_unit(juju: Juju, app_name: str) -> str: """Get the current primary node of the cluster.""" postgresql_primary = get_app_leader(juju, app_name) @@ -175,19 +90,6 @@ def get_db_primary_unit(juju: Juju, app_name: str) -> str: raise Exception("No primary node found") -def get_db_standby_leader_unit(juju: Juju, app_name: str) -> str: - """Get the current standby node of the cluster.""" - unit_address = get_unit_ip(juju, app_name, get_app_leader(juju, app_name)) - - for member in requests.get(f"https://{unit_address}:8008/cluster", verify=False).json()[ - "members" - ]: - if member["role"] == "standby_leader": - return member["name"][::-1].replace("-", "/")[::-1] - - raise Exception("No standby primary node found") - - def get_db_max_written_value( juju: Juju, app_name: str, unit_name: str, db_name: str = "postgresql_test_app_database" ) -> int: @@ -227,20 +129,6 @@ def wait_for_apps_status(jubilant_status_func: JujuAppsStatusFn, *apps: str) -> )) -def wait_for_unit_status(app_name: str, unit_name: str, unit_status: str) -> JujuModelStatusFn: - """Returns whether a Juju unit to have a specific status.""" - return lambda status: ( - status.apps[app_name].units[unit_name].workload_status.current == unit_status - ) - - -def wait_for_unit_message(app_name: str, unit_name: str, unit_message: str) -> JujuModelStatusFn: - """Returns whether a Juju unit to have a specific message.""" - return lambda status: ( - status.apps[app_name].units[unit_name].workload_status.message == unit_message - ) - - # PG helpers @@ -250,11 +138,3 @@ def get_user_password(juju: Juju, app_name: str, user: str) -> str | None: if secret.label == f"{PEER}.{app_name}.app": revealed_secret = juju.show_secret(secret.uri, reveal=True) return revealed_secret.content.get(f"{user}-password") - - -def count_switchovers(juju: Juju, app_name: str) -> int: - """Return the number of performed switchovers.""" - app_primary = get_db_primary_unit(juju, app_name) - unit_address = get_unit_ip(juju, app_name, app_primary) - switchover_history_info = requests.get(f"https://{unit_address}:8008/history", verify=False) - return len(switchover_history_info.json()) diff --git a/tests/integration/high_availability/test_upgrade.py b/tests/integration/high_availability/test_upgrade.py index be19cf5..8b5c420 100644 --- a/tests/integration/high_availability/test_upgrade.py +++ b/tests/integration/high_availability/test_upgrade.py @@ -107,9 +107,6 @@ def test_upgrade_from_edge(juju: Juju, charm: str, continuous_writes) -> None: ) juju.wait(jubilant.all_agents_idle, timeout=5 * MINUTE_SECS) - - logging.info("Run resume-refresh action") - juju.run(unit=unit_names[1], action="resume-refresh", wait=5 * MINUTE_SECS) except TimeoutError: logging.info("Upgrade completed without snap refresh (charm.py upgrade only)") assert juju.status().apps[WATCHER_APP_NAME].is_active