Skip to content

Commit bd996d4

Browse files
authored
Merge pull request #2705 from cloudfoundry/upgrade-spec-improvements
Fix ~75% flake rate in upgrade-mysql and upgrade-postgres pipelines
2 parents: 3d9de1d + 89da57d; commit: bd996d4

7 files changed

Lines changed: 119 additions & 26 deletions

File tree

ci/bats/tasks/destroy-director.sh

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,8 @@ export BOSH_CLIENT_SECRET
2828

2929
set +e
3030

31-
bosh-cli deployments --column name --json | jq -r ".Tables[0].Rows[].name" | xargs -n1 -I % bosh-cli -n -d % delete-deployment
31+
bosh-cli deployments --column name --json \
32+
| jq -r ".Tables[0].Rows[].name" \
33+
| xargs -n1 -I % bosh-cli -n -d '%' delete-deployment --force
3234
bosh-cli clean-up -n --all
3335
bosh-cli delete-env -n director-state/director.yml -l director-state/director-creds.yml

ci/pipeline.yml

Lines changed: 4 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -804,16 +804,9 @@ jobs:
804804
GCP_JSON_KEY: ((gcp_json_key))
805805
DEPLOY_ARGS: |-
806806
-o bosh-deployment/external-ip-not-recommended.yml
807-
- task: sleep-300-seconds
807+
- task: wait-for-agents
808808
image: integration-image
809-
config:
810-
platform: linux
811-
run:
812-
path: /bin/sh
813-
args:
814-
- -exc
815-
- |
816-
sleep 300
809+
file: bosh-ci/ci/tasks/wait-for-agents.yml
817810
- task: recreate-zookeeper
818811
image: integration-image
819812
file: bosh-ci/ci/tasks/deploy-zookeeper.yml
@@ -915,16 +908,9 @@ jobs:
915908
DEPLOY_ARGS: |-
916909
-o bosh-deployment/external-ip-not-recommended.yml
917910
-o bosh-deployment/misc/external-db.yml
918-
- task: sleep-300-seconds
911+
- task: wait-for-agents
919912
image: integration-image
920-
config:
921-
platform: linux
922-
run:
923-
path: /bin/sh
924-
args:
925-
- -exc
926-
- |
927-
sleep 300
913+
file: bosh-ci/ci/tasks/wait-for-agents.yml
928914
- task: recreate-zookeeper
929915
image: integration-image
930916
file: bosh-ci/ci/tasks/deploy-zookeeper.yml

ci/tasks/deploy-zookeeper.sh

Lines changed: 37 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -17,15 +17,47 @@ function get_bosh_environment {
1717
mv bosh-cli/bosh-cli-* /usr/local/bin/bosh-cli
1818
chmod +x /usr/local/bin/bosh-cli
1919

20-
export BOSH_ENVIRONMENT=$(get_bosh_environment)
21-
export BOSH_CLIENT=admin
22-
export BOSH_CLIENT_SECRET=$(bosh-cli int director-state/director-creds.yml --path /admin_password)
23-
export BOSH_CA_CERT=$(bosh-cli int director-state/director-creds.yml --path /director_ssl/ca)
20+
export BOSH_CLIENT="admin"
21+
BOSH_CLIENT_SECRET=$(bosh-cli int director-state/director-creds.yml --path /admin_password)
22+
BOSH_ENVIRONMENT=$(get_bosh_environment)
23+
BOSH_CA_CERT=$(bosh-cli int director-state/director-creds.yml --path /director_ssl/ca)
24+
export BOSH_ENVIRONMENT
25+
export BOSH_CA_CERT
26+
export BOSH_CLIENT_SECRET
2427
export BOSH_NON_INTERACTIVE=true
2528

2629
bosh-cli update-cloud-config "bosh-deployment/${CPI}/cloud-config.yml" \
2730
--vars-file director-state/director-vars.json
2831

2932
bosh-cli upload-stemcell stemcell/*.tgz
30-
bosh-cli -d zookeeper deploy --recreate "${bosh_repo_dir}/ci/tasks/deploy-zookeeper/zookeeper-manifest.yml"
33+
34+
MAX_DEPLOY_ATTEMPTS=${MAX_DEPLOY_ATTEMPTS:-3}
35+
DEPLOY_RETRY_DELAY=${DEPLOY_RETRY_DELAY:-60}
36+
37+
for attempt in $(seq 1 "$MAX_DEPLOY_ATTEMPTS"); do
38+
echo "Deploy attempt ${attempt}/${MAX_DEPLOY_ATTEMPTS}..."
39+
set +e
40+
# DEPLOY_EXTRA_ARGS is intentionally unquoted to allow multiple space-separated arguments
41+
# shellcheck disable=SC2086
42+
bosh-cli -d zookeeper deploy --recreate ${DEPLOY_EXTRA_ARGS:-} \
43+
"${bosh_repo_dir}/ci/tasks/deploy-zookeeper/zookeeper-manifest.yml"
44+
deploy_exit=$?
45+
set -e
46+
47+
if [ $deploy_exit -eq 0 ]; then
48+
echo "Deploy succeeded on attempt ${attempt}."
49+
break
50+
fi
51+
52+
echo "Deploy failed on attempt ${attempt}."
53+
echo "Current VM state:"
54+
bosh-cli -d zookeeper vms || true
55+
if [ "${attempt}" -eq "${MAX_DEPLOY_ATTEMPTS}" ]; then
56+
echo "Deploy failed after ${MAX_DEPLOY_ATTEMPTS} attempts."
57+
exit 1
58+
fi
59+
echo "Waiting ${DEPLOY_RETRY_DELAY}s before retry..."
60+
sleep "${DEPLOY_RETRY_DELAY}"
61+
done
62+
3163
bosh-cli -d zookeeper run-errand smoke-tests

ci/tasks/deploy-zookeeper.yml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,3 +14,6 @@ run:
1414

1515
params:
1616
CPI:
17+
DEPLOY_EXTRA_ARGS:
18+
MAX_DEPLOY_ATTEMPTS:
19+
DEPLOY_RETRY_DELAY:

ci/tasks/deploy-zookeeper/zookeeper-manifest.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,15 +12,15 @@ stemcells:
1212
version: latest
1313

1414
update:
15-
canaries: 2
15+
canaries: 1
1616
max_in_flight: 1
1717
canary_watch_time: 5000-60000
1818
update_watch_time: 5000-60000
1919

2020
instance_groups:
2121
- name: zookeeper
2222
azs: [z1, z2, z3]
23-
instances: 5
23+
instances: 3
2424
jobs:
2525
- name: zookeeper
2626
release: zookeeper

ci/tasks/wait-for-agents.sh

Lines changed: 59 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,59 @@
1+
#!/usr/bin/env bash
2+
set -e
3+
4+
state_path() { bosh-cli int director-state/director.yml --path="$1" ; }
5+
6+
function get_bosh_environment {
7+
if [[ -z $(state_path /instance_groups/name=bosh/networks/name=public/static_ips/0 2>/dev/null) ]]; then
8+
state_path /instance_groups/name=bosh/networks/name=default/static_ips/0 2>/dev/null
9+
else
10+
state_path /instance_groups/name=bosh/networks/name=public/static_ips/0 2>/dev/null
11+
fi
12+
}
13+
14+
mv bosh-cli/bosh-cli-* /usr/local/bin/bosh-cli
15+
chmod +x /usr/local/bin/bosh-cli
16+
17+
export BOSH_CLIENT="admin"
18+
BOSH_CLIENT_SECRET=$(bosh-cli int director-state/director-creds.yml --path /admin_password)
19+
BOSH_ENVIRONMENT=$(get_bosh_environment)
20+
BOSH_CA_CERT=$(bosh-cli int director-state/director-creds.yml --path /director_ssl/ca)
21+
export BOSH_ENVIRONMENT
22+
export BOSH_CA_CERT
23+
export BOSH_CLIENT_SECRET
24+
export BOSH_NON_INTERACTIVE=true
25+
26+
MAX_ATTEMPTS=60
27+
SLEEP_INTERVAL=10
28+
TOTAL_TIMEOUT=$((MAX_ATTEMPTS * SLEEP_INTERVAL))
29+
30+
echo "Waiting up to ${TOTAL_TIMEOUT}s for all zookeeper agents to become responsive..."
31+
32+
for i in $(seq 1 "${MAX_ATTEMPTS}"); do
33+
set +e
34+
vms_json=$(bosh-cli -d zookeeper vms --json)
35+
exit_code=$?
36+
set -e
37+
38+
if [ $exit_code -eq 0 ]; then
39+
zookeeper_instances_json=$(echo "${vms_json}" | jq -r '[.Tables[0].Rows[] | select(.instance | startswith("zookeeper/"))]')
40+
total=$(echo "${zookeeper_instances_json}" | jq -r '. | length')
41+
running=$(echo "${zookeeper_instances_json}" | jq -r '[.[] | select(.process_state == "running")] | length')
42+
43+
echo " Attempt $i/${MAX_ATTEMPTS}: ${running}/${total} agents responsive"
44+
45+
if [ "$running" -eq "$total" ] && [ "$total" -gt 0 ]; then
46+
echo "All ${total} agents are responsive after $(((i - 1) * SLEEP_INTERVAL)) seconds."
47+
exit 0
48+
fi
49+
else
50+
echo " Attempt $i/${MAX_ATTEMPTS}: bosh vms failed (director may still be starting)"
51+
fi
52+
53+
sleep "${SLEEP_INTERVAL}"
54+
done
55+
56+
echo "ERROR: Not all agents became responsive within ${TOTAL_TIMEOUT}s."
57+
echo "Final VM state:"
58+
bosh-cli -d zookeeper vms || true
59+
exit 1

ci/tasks/wait-for-agents.yml

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,11 @@
1+
---
2+
platform: linux
3+
4+
inputs:
5+
- name: director-state
6+
- name: bosh-ci
7+
- name: bosh-cli
8+
- name: bosh-deployment
9+
10+
run:
11+
path: bosh-ci/ci/tasks/wait-for-agents.sh

0 commit comments

Comments
 (0)