Skip to content

Commit c914c82

Browse files
ambledclaude
andcommitted
Implement robust lock file cleanup (v0.3.23)
- Added atexit.register() for guaranteed cleanup on normal exit
- Added SIGTERM and SIGINT signal handlers for graceful shutdown
- Lock file now always cleaned up even on crash or interruption
- Prevents stuck lock files from blocking subsequent runs
- Tracks creation with _lock_file_created flag
- Centralized cleanup logic in cleanup_lock_file()
- Removed 9 manual os.remove(LOCK_FILE) calls

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent c4e371f commit c914c82

3 files changed

Lines changed: 51 additions & 11 deletions

File tree

CHANGELOG.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,19 @@
22

33
## [Unreleased]
44

5+
## [0.3.23] - 2025-12-15
6+
7+
### Fixed
8+
- **Robust lock file cleanup**: Implemented guaranteed lock file cleanup using signal handlers and atexit
9+
- Added `atexit.register()` to ensure lock file cleanup on normal exit
10+
- Added SIGTERM and SIGINT signal handlers for graceful shutdown on interruption (Ctrl+C, kill)
11+
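- Lock file is now cleaned up when WNM exits through `sys.exit()`, an unhandled exception, SIGTERM, or SIGINT (Ctrl+C); it is not cleaned up on SIGKILL, `os._exit()`, or a fatal interpreter error, where neither `atexit` hooks nor Python signal handlers run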
12+
- Prevents stuck lock files from blocking subsequent runs
13+
- Tracks lock file creation with `_lock_file_created` flag to only clean up files created by this process
14+
- Centralized cleanup logic in `cleanup_lock_file()` function
15+
- Removed 9 manual `os.remove(LOCK_FILE)` calls - now handled automatically
16+
- `--remove_lockfile` flag still available for manual cleanup if needed
17+
518
## [0.3.22] - 2025-12-14
619

720
### Fixed

src/wnm/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
"""A service to manage a cluster of decentralized Autonomi nodes"""
22

3-
__version__ = "0.3.22"
3+
__version__ = "0.3.23"

src/wnm/__main__.py

Lines changed: 37 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1+
import atexit
12
import json
23
import logging
34
import os
5+
import signal
46
import sys
57
import time
68

@@ -33,9 +35,39 @@
3335
# A storage place for ant node data
3436
Workers = []
3537

38+
# Track whether we created the lock file
39+
_lock_file_created = False
40+
3641
# Detect ANM
3742

3843

44+
def cleanup_lock_file():
    """Remove the lock file if, and only if, this process created it.

    This function is registered with ``atexit`` and is also called from
    ``signal_handler``, so it can run more than once in the same process
    (the signal handler calls it and then ``sys.exit`` triggers the atexit
    hook).  To make repeated calls harmless, the ``_lock_file_created`` flag
    is cleared as soon as the file is known to be gone; a later call then
    returns immediately and can never delete a lock file that a different
    process created in the meantime.

    Errors are logged, never raised, so cleanup cannot mask the original
    exit status.
    """
    global _lock_file_created

    # Never touch a lock file that this process did not create.
    if not _lock_file_created:
        return

    try:
        # Attempt the removal directly instead of checking os.path.exists()
        # first: the check-then-remove sequence is racy.
        os.remove(LOCK_FILE)
    except FileNotFoundError:
        # Already gone (e.g. removed externally); we no longer own a lock.
        _lock_file_created = False
    except OSError as e:
        # PermissionError is a subclass of OSError, so one clause covers both.
        # The flag stays set so that a later cleanup pass may retry.
        logging.error(f"Error removing lock file during cleanup: {e}")
    else:
        _lock_file_created = False
        logging.debug("Lock file removed during cleanup")
53+
54+
55+
def signal_handler(signum, frame):
    """Log the received signal, remove the lock file, and exit with status 1.

    Args:
        signum: Number of the signal being handled.
        frame: Interrupted stack frame supplied by the ``signal`` module;
            not used here.

    Raises:
        SystemExit: Always, with exit code 1.
    """
    # Translate the numeric signal into its symbolic name for the log line.
    signal_name = signal.Signals(signum).name
    logging.info(f"Received {signal_name}, cleaning up...")

    # Remove the lock explicitly before leaving.
    cleanup_lock_file()

    # Equivalent to sys.exit(1): unwinds normally so exit hooks still run.
    raise SystemExit(1)
61+
62+
63+
# Register signal handlers for graceful shutdown
64+
signal.signal(signal.SIGTERM, signal_handler)
65+
signal.signal(signal.SIGINT, signal_handler)
66+
67+
# Register cleanup function to run on normal exit
68+
atexit.register(cleanup_lock_file)
69+
70+
3971
# Make a decision about what to do (new implementation using DecisionEngine)
4072
def choose_action(machine_config, metrics, dry_run):
4173
"""Plan and execute actions using DecisionEngine and ActionExecutor.
@@ -138,9 +170,12 @@ def main():
138170
sys.exit(1)
139171

140172
# We're starting, so lets create a lock file
173+
global _lock_file_created
141174
try:
142175
with open(LOCK_FILE, "w") as file:
143176
file.write(str(int(time.time())))
177+
_lock_file_created = True
178+
logging.debug(f"Lock file created: {LOCK_FILE}")
144179
except (PermissionError, OSError) as e:
145180
logging.error(f"Unable to create lock file: {e}")
146181
sys.exit(1)
@@ -150,7 +185,6 @@ def main():
150185
if not options.confirm:
151186
logging.error("Database migration requires --confirm flag for safety")
152187
logging.info("Use: wnm --force_action wnm-db-migration --confirm")
153-
os.remove(LOCK_FILE)
154188
sys.exit(1)
155189

156190
# Import migration utilities
@@ -162,7 +196,6 @@ def main():
162196
if not pending:
163197
logging.info("Database is already up to date!")
164198
logging.info(f"Current revision: {current}")
165-
os.remove(LOCK_FILE)
166199
sys.exit(0)
167200

168201
logging.info("=" * 70)
@@ -176,13 +209,11 @@ def main():
176209
run_migrations(engine, options.dbpath)
177210
logging.info("Database migration completed successfully!")
178211
logging.info("=" * 70)
179-
os.remove(LOCK_FILE)
180212
sys.exit(0)
181213
except Exception as e:
182214
logging.error(f"Migration failed: {e}")
183215
logging.error("Please restore from backup and report this issue.")
184216
logging.info("=" * 70)
185-
os.remove(LOCK_FILE)
186217
sys.exit(1)
187218

188219
# Config should have loaded the machine_config
@@ -208,8 +239,7 @@ def main():
208239
logging.info("Configuration updated successfully")
209240
else:
210241
logging.info("No configuration changes detected")
211-
# Clean up and exit immediately
212-
os.remove(LOCK_FILE)
242+
# Exit immediately (atexit will clean up lock file)
213243
sys.exit(0)
214244

215245
# Check for config updates
@@ -324,7 +354,6 @@ def main():
324354
# Handle --init flag: exit after initialization (and optional survey)
325355
if options.init:
326356
logging.info("Initialization complete")
327-
os.remove(LOCK_FILE)
328357
sys.exit(0)
329358

330359
# Check for reports
@@ -373,15 +402,13 @@ def main():
373402
report_output = f"Unknown report type: {options.report}"
374403

375404
print(report_output)
376-
os.remove(LOCK_FILE)
377405
sys.exit(0)
378406

379407
# Check for forced actions
380408
if options.force_action:
381409
# Teardown requires confirmation for safety
382410
if options.force_action == "teardown" and not options.confirm:
383411
logging.error("Teardown requires --confirm flag for safety")
384-
os.remove(LOCK_FILE)
385412
sys.exit(1)
386413

387414
logging.info(f"Executing forced action: {options.force_action}")
@@ -399,7 +426,7 @@ def main():
399426

400427
logging.info("Action: " + json.dumps(this_action, indent=2))
401428

402-
os.remove(LOCK_FILE)
429+
# Exit normally (atexit will clean up lock file)
403430
sys.exit(0)
404431

405432

0 commit comments

Comments
 (0)