Skip to content

Commit 2083b56

Browse files
authored
Merge pull request #739 from OpenHistoricalMap/staging
Tiler monitoring pipeline, server migration & operational improvements
2 parents d5cc78c + 995dd23 commit 2083b56

40 files changed

Lines changed: 4694 additions & 142 deletions

.github/workflows/chartpress.yaml

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ on:
44
branches:
55
- 'main'
66
- 'staging'
7-
- 'k8s_deploy'
7+
- 'tiler_monitoring'
88
jobs:
99
build:
1010
runs-on: ubuntu-22.04
@@ -71,7 +71,7 @@ jobs:
7171
OHM_SLACK_WEBHOOK_URL: ${{ secrets.OHM_SLACK_WEBHOOK_URL }}
7272
################ Staging secrets ################
7373
- name: Staging - substitute secrets
74-
if: github.ref == 'refs/heads/staging' || github.ref == 'refs/heads/k8s_deploy'
74+
if: github.ref == 'refs/heads/staging' || github.ref == 'refs/heads/tiler_monitoring'
7575
uses: bluwy/substitute-string-action@v1
7676
with:
7777
_input-file: 'values.staging.template.yaml'
@@ -189,46 +189,46 @@ jobs:
189189
PRODUCTION_OPENSTREETMAP_AUTH_SECRET: ${{ secrets.PRODUCTION_OPENSTREETMAP_AUTH_SECRET }}
190190

191191
- name: AWS Credentials
192-
if: github.ref == 'refs/heads/staging' || github.ref == 'refs/heads/main' || github.ref == 'refs/heads/k8s_deploy'
192+
if: github.ref == 'refs/heads/staging' || github.ref == 'refs/heads/main' || github.ref == 'refs/heads/tiler_monitoring'
193193
uses: aws-actions/configure-aws-credentials@v1
194194
with:
195195
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
196196
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
197197
aws-region: us-east-1
198198
- name: Setup Kubectl and Helm Dependencies
199-
if: github.ref == 'refs/heads/staging' || github.ref == 'refs/heads/main' || github.ref == 'refs/heads/k8s_deploy'
199+
if: github.ref == 'refs/heads/staging' || github.ref == 'refs/heads/main' || github.ref == 'refs/heads/tiler_monitoring'
200200
run: |
201201
sudo pip install awscli --ignore-installed six
202-
sudo curl -L -o /usr/bin/kubectl https://dl.k8s.io/release/v1.33.0/bin/linux/amd64/kubectl
202+
sudo curl -L -o /usr/bin/kubectl https://amazon-eks.s3.us-west-2.amazonaws.com/1.17.7/2020-07-08/bin/linux/amd64/kubectl
203203
sudo chmod +x /usr/bin/kubectl
204-
sudo curl -o /usr/bin/aws-iam-authenticator https://amazon-eks.s3.us-west-2.amazonaws.com/1.21.2/2021-07-05/bin/linux/amd64/aws-iam-authenticator
204+
sudo curl -o /usr/bin/aws-iam-authenticator https://amazon-eks.s3.us-west-2.amazonaws.com/1.17.7/2020-07-08/bin/linux/amd64/aws-iam-authenticator
205205
sudo chmod +x /usr/bin/aws-iam-authenticator
206-
curl -L https://get.helm.sh/helm-v3.17.3-linux-amd64.tar.gz -o helm.tar.gz
206+
curl -L https://get.helm.sh/helm-v3.14.4-linux-amd64.tar.gz -o helm.tar.gz
207207
tar -xvzf helm.tar.gz
208208
sudo mv linux-amd64/helm /usr/local/bin/
209209
sudo chmod +x /usr/local/bin/helm
210210
helm version
211211
212212
- name: Update kube-config staging
213-
if: github.ref == 'refs/heads/staging' || github.ref == 'refs/heads/k8s_deploy'
213+
if: github.ref == 'refs/heads/staging' || github.ref == 'refs/heads/tiler_monitoring'
214214
run: aws eks --region us-east-1 update-kubeconfig --name osmseed-staging
215215
- name: Update kube-config prod
216216
if: github.ref == 'refs/heads/main'
217217
run: aws eks --region us-east-1 update-kubeconfig --name osmseed-production-v2
218218
- name: Add Helm repository
219-
if: github.ref == 'refs/heads/staging' || github.ref == 'refs/heads/main' || github.ref == 'refs/heads/k8s_deploy'
219+
if: github.ref == 'refs/heads/staging' || github.ref == 'refs/heads/main' || github.ref == 'refs/heads/tiler_monitoring'
220220
run: |
221221
helm repo add osm-seed https://osm-seed.github.io/osm-seed-chart/
222222
helm repo update
223223
- name: Install helm dependencies for
224-
if: github.ref == 'refs/heads/staging' || github.ref == 'refs/heads/main' || github.ref == 'refs/heads/k8s_deploy'
224+
if: github.ref == 'refs/heads/staging' || github.ref == 'refs/heads/main' || github.ref == 'refs/heads/tiler_monitoring'
225225
run: cd ohm && helm dep up
226226
# Staging
227227
- name: Staging - helm deploy
228-
if: github.ref == 'refs/heads/staging' || github.ref == 'refs/heads/k8s_deploy'
229-
run: helm upgrade --install staging --wait --timeout 10m ohm/ -f values.staging.yaml -f ohm/values.yaml
228+
if: github.ref == 'refs/heads/staging' || github.ref == 'refs/heads/tiler_monitoring'
229+
run: helm upgrade --install staging --wait ohm/ -f values.staging.yaml -f ohm/values.yaml
230230
# Production
231231
- name: Production - helm deploy
232232
if: github.ref == 'refs/heads/main'
233-
run: helm upgrade --install production --wait --timeout 10m ohm/ -f values.production.yaml -f ohm/values.yaml
233+
run: helm upgrade --install production --wait ohm/ -f values.production.yaml -f ohm/values.yaml
234234

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ hetzner/*/.envs.*.production
3333
hetzner/traefik/cloudflare-ips.txt
3434
hetzner/traefik/traefik.yml
3535
.vscode/
36+
.claude/
3637
imposm3.json
37-
3838
cachedir_reimport/
3939
config_reimport.json
4040
imposm3_reimport.json

compose/tiler.yml

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -71,21 +71,24 @@ services:
7171
# - ohm_network
7272

7373

74-
# tiler-monitor:
75-
# image: rub21/tiler-monitor:v1
76-
# build:
77-
# context: ../images/tiler-monitor
78-
# dockerfile: Dockerfile
79-
# volumes:
80-
# - /var/run/docker.sock:/var/run/docker.sock
81-
# - ../images/tiler-monitor:/app
82-
# - ../hetzner:/app/hetzner
83-
# environment:
84-
# - DOCKER_CONFIG_ENVIRONMENT=staging
85-
# env_file:
86-
# - ../envs/.env.tiler
87-
# stdin_open: true
88-
# tty: true
74+
tiler-monitor:
75+
image: rub21/tiler-monitor:v2
76+
build:
77+
context: ../images/tiler-monitor
78+
dockerfile: Dockerfile
79+
volumes:
80+
- /var/run/docker.sock:/var/run/docker.sock
81+
- ../hetzner:/app/hetzner
82+
- tiler_monitor_data:/data
83+
ports:
84+
- "8001:8001"
85+
environment:
86+
- TILER_MONITORING_DOCKER_CONFIG_ENVIRONMENT=staging
87+
env_file:
88+
- ../envs/.env.tiler
89+
restart: always
90+
networks:
91+
- ohm_network
8992

9093
networks:
9194
ohm_network:
@@ -99,3 +102,7 @@ volumes:
99102
tiler_imposm_data:
100103
driver: local
101104
name: tiler_imposm
105+
106+
tiler_monitor_data:
107+
driver: local
108+
name: tiler_monitor

hetzner/nominatim/nominatim.base.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ services:
55
volumes:
66
- nominatim_data:/var/lib/postgresql/16/main
77
environment:
8-
PBF_URL: http://s3.amazonaws.com/planet.openhistoricalmap.org/planet/planet-250716_0101.osm.pbf
8+
PBF_URL: http://s3.amazonaws.com/planet.openhistoricalmap.org/planet/planet-260408_0307.osm.pbf
99
REPLICATION_URL: http://planet.openhistoricalmap.org.s3.amazonaws.com/replication/minute
1010
REPLICATION_UPDATE_INTERVAL: 60
1111
REPLICATION_RECHECK_INTERVAL: 30

hetzner/osmcha/osmcha.base.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ services:
9191
volumes:
9292
- staticfiles:/srv/www/static/django:ro
9393
- ${PWD}/hetzner/osmcha/script/update.sh:/app/update.sh
94+
- ${PWD}/hetzner/osmcha/script/backfill_changesets.py:/app/osmchadjango/changeset/management/commands/backfill_changesets.py
9495
command: sh /app/update.sh
9596

9697
volumes:

hetzner/osmcha/osmcha.production.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@ services:
44
container_name: osmcha_ohmx_adiff
55
environment:
66
- API_URL=https://api.${OHM_DOMAIN}
7-
- PLANET_PBF_URL=https://s3.amazonaws.com/planet.openhistoricalmap.org/planet/planet-260106_0350.osm.pbf
7+
- PLANET_PBF_URL=https://s3.amazonaws.com/planet.openhistoricalmap.org/planet/planet-260408_0307.osm.pbf
88
- MINUTE_REPLICATION_URL=https://planet.openhistoricalmap.org/?prefix=replication/minute/
99
# Add OSMX_INITIAL_SEQNUM to start from a specific sequence number
10-
# - OSMX_INITIAL_SEQNUM=1881020
10+
# - OSMX_INITIAL_SEQNUM=1900000
1111
- AWS_S3_BUCKET=planet.openhistoricalmap.org
1212
env_file:
1313
- ./.env.osmcha
@@ -22,5 +22,5 @@ services:
2222
- ${PWD}/images/ohmx-adiff-builder/update.sh:/app/update.sh
2323
networks:
2424
- ohm_network
25-
cpus: '8.0'
26-
mem_limit: 20G
25+
# cpus: '8.0'
26+
# mem_limit: 20G
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
"""Django management command to backfill changesets, customized from the original one."""
2+
3+
from datetime import date, datetime, timedelta
4+
from django.core.management.base import BaseCommand
5+
6+
from osmchadjango.changeset.models import Changeset
7+
from osmchadjango.changeset.tasks import create_changeset
8+
9+
class Command(BaseCommand):
10+
help = """Backfill missing changesets by ID range or date range.
11+
Use --start_id for specific ID, --step=1/-1 for direction."""
12+
13+
def add_arguments(self, parser):
14+
parser.add_argument("--start_date", type=str)
15+
parser.add_argument("--end_date", type=str)
16+
parser.add_argument("--start_id", type=int)
17+
parser.add_argument("--limit", type=int, default=50)
18+
parser.add_argument("--step", type=int, default=1)
19+
20+
def handle(self, *args, **options):
21+
# Date defaults
22+
try:
23+
start_date = date.fromisoformat(options["start_date"])
24+
except:
25+
start_date = date.today() - timedelta(days=1)
26+
27+
try:
28+
end_date = date.fromisoformat(options["end_date"])
29+
except:
30+
end_date = datetime.now().date()
31+
32+
# Priority: ID mode > date mode
33+
if options["start_id"]:
34+
start_id = options["start_id"]
35+
# Get local context (recent IDs around start_id)
36+
cl = list(Changeset.objects.filter(id__gte=start_id - 100).values_list("id", flat=True))
37+
max_id = max(cl) if cl else start_id
38+
min_id = min(cl) if cl else start_id + options["step"]
39+
self.stdout.write(f"Backfilling from ID {start_id}, range {max_id}–{min_id}")
40+
41+
current_id = start_id
42+
count = 0
43+
while count < options["limit"]:
44+
if current_id not in cl:
45+
try:
46+
create_changeset(current_id)
47+
self.stdout.write(f"✅ {current_id}")
48+
except Exception as e:
49+
self.stdout.write(f"✗ {current_id}: {e}")
50+
current_id += options["step"]
51+
count += 1
52+
else:
53+
# Original date-based logic + empty list FIX
54+
cl_qs = Changeset.objects.filter(date__gte=start_date, date__lte=end_date).values_list("id", flat=True)
55+
cl = list(cl_qs)
56+
57+
if not cl:
58+
self.stdout.write("No changesets in date range. Use --start_id=25")
59+
return
60+
61+
max_id = max(cl)
62+
min_id = min(cl)
63+
self.stdout.write(f"Found range: {min_id}–{max_id}")
64+
65+
current_id = max_id + 1
66+
count = 0
67+
while current_id < min_id and count < options["limit"]:
68+
try:
69+
create_changeset(current_id)
70+
self.stdout.write(f"✅ {current_id}")
71+
except Exception as e:
72+
self.stdout.write(f"✗ {current_id}: {e}")
73+
current_id += 1
74+
count += 1
75+
76+
self.stdout.write("✅ Complete")
77+

hetzner/overpass/overpass.base.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,10 @@ services:
99
environment:
1010
OVERPASS_META: attic
1111
OVERPASS_MODE: init
12-
OVERPASS_PLANET_URL: https://s3.amazonaws.com/planet.openhistoricalmap.org/planet/planet-260216_0301.osm.pbf
12+
OVERPASS_PLANET_URL: https://s3.amazonaws.com/planet.openhistoricalmap.org/planet/planet-260408_0307.osm.pbf
1313
OVERPASS_DIFF_URL: http://s3.amazonaws.com/planet.openhistoricalmap.org/replication/minute
1414
OVERPASS_RULES_LOAD: 10
15-
OVERPASS_REPLICATION_SEQUENCE_NUMBER: 1796000
15+
OVERPASS_REPLICATION_SEQUENCE_NUMBER: 1900000
1616
OVERPASS_ALLOW_DUPLICATE_QUERIES: yes
1717
restart: always
1818
## Disable healthcheck during initialization phase to prevent premature restarts
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
services:
22
overpass:
3-
mem_limit: 16g
4-
cpus: "8.0"
3+
# mem_limit: 16g
4+
# cpus: "8.0"

hetzner/tiler/config/postgresql.production.conf

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,18 @@
22
# CONNECTIONS AND AUTHENTICATION
33
#------------------------------------------------------------------------------
44
listen_addresses = '*' # Allow connections from any network interface
5-
max_connections = 200 # Increase if you expect more concurrent connections
5+
max_connections = 300 # 4 Tegola replicas × 56 conn + refresh/imposm headroom
66
superuser_reserved_connections = 5 # Reserve connections for superusers
77

88
#------------------------------------------------------------------------------
99
# RESOURCE USAGE
1010
#------------------------------------------------------------------------------
1111

1212
# - Memory Configuration -
13-
shared_buffers = 10GB # ~25% of 40GB container limit
14-
work_mem = 256MB # Memory for each sort/hash operation; be cautious with many parallel queries
15-
maintenance_work_mem = 4GB # Larger memory for VACUUM / CREATE INDEX / ALTER
16-
effective_cache_size = 26GB # ~65% of 40GB container limit
13+
shared_buffers = 16GB # ~13% of 125GB RAM
14+
work_mem = 256MB # Memory for each sort/hash operation; balanced for 300 max connections
15+
maintenance_work_mem = 8GB # Larger memory for VACUUM / CREATE INDEX / ALTER / REFRESH
16+
effective_cache_size = 80GB # ~65% of 125GB RAM
1717

1818
# - Disk Optimization for SSD (if using SSD) -
1919
random_page_cost = 1.0 # Lower cost for random I/O on SSD
@@ -32,7 +32,7 @@ synchronous_commit = off # Improves write performance, risk of
3232
#------------------------------------------------------------------------------
3333
# AUTOVACUUM SETTINGS
3434
#------------------------------------------------------------------------------
35-
autovacuum_max_workers = 6 # More parallel vacuum workers for busy systems
35+
autovacuum_max_workers = 8 # More parallel vacuum workers for busy systems
3636
autovacuum_naptime = 30s # How often the autovacuum daemon checks for work
3737
autovacuum_vacuum_cost_limit = -1 # Let PostgreSQL adjust vacuum cost dynamically
3838

@@ -42,26 +42,27 @@ autovacuum_vacuum_cost_limit = -1 # Let PostgreSQL adjust vacuum cost d
4242
effective_io_concurrency = 300 # For SSD; helps the planner estimate IO concurrency
4343
parallel_tuple_cost = 0.001 # Lower cost to encourage parallelization
4444
parallel_setup_cost = 100 # Lower to encourage more parallel plans
45-
max_worker_processes = 25 # Match 25 CPUs container limit
46-
max_parallel_workers_per_gather = 6 # Max workers that can help a single query
47-
max_parallel_workers = 25 # Total number of parallel workers across all queries
45+
max_worker_processes = 36 # Match 36 CPUs
46+
max_parallel_workers_per_gather = 8 # Max workers that can help a single query
47+
max_parallel_workers = 36 # Total number of parallel workers across all queries
4848

4949
#------------------------------------------------------------------------------
5050
# LOGGING
5151
#------------------------------------------------------------------------------
5252
logging_collector = off # Disable log collection
53+
log_min_messages = fatal # Only log FATAL and PANIC (crashes, OOM kills)
54+
log_min_error_statement = panic # Do not log SQL statements even on errors
5355
log_statement = 'none' # Do not log any statements
5456
log_duration = off # Disable logging query duration
5557
log_min_duration_statement = -1 # Disable logging slow queries
56-
log_error_verbosity = terse # Minimal error messages
58+
log_error_verbosity = default # Show enough detail for debugging errors
5759
log_autovacuum_min_duration = -1 # Do not log autovacuum runs
58-
log_connections = on # Do not log new connections
59-
log_disconnections = on # Do not log disconnections
60+
log_connections = off # Do not log connections
61+
log_disconnections = off # Do not log disconnections
6062
log_lock_waits = off # Do not log lock waits
6163
log_temp_files = -1 # Do not log temporary file creation
6264
log_checkpoints = off # Do not log checkpoints
6365
log_replication_commands = off # Do not log replication-related commands
64-
log_directory = '/dev/null' # Redirect logs to /dev/null (no storage)
6566
#------------------------------------------------------------------------------
6667
# CLIENT CONNECTION DEFAULTS
6768
#------------------------------------------------------------------------------

0 commit comments

Comments
 (0)