Skip to content

Commit 1ddb16e

Browse files
committed
Add file-based heartbeating
1 parent 3d040fd commit 1ddb16e

3 files changed

Lines changed: 109 additions & 6 deletions

File tree

docker/agent/main.go

Lines changed: 88 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,13 +43,26 @@ type HealthStats struct {
4343
DiskUsage float64 `json:"diskusage"`
4444
}
4545

46+
// ANSI color helpers for user-facing log messages.
//
// These are SGR escape sequences. They are declared as constants because they
// are fixed values that nothing should reassign at runtime.
const (
	colorReset  = "\033[0m"  // resets all attributes
	colorRed    = "\033[31m" // SGR 31
	colorGreen  = "\033[32m" // SGR 32
	colorOrange = "\033[33m" // SGR 33 (standard "yellow" slot)
)

// colored wraps msg in the given color escape sequence and appends the reset
// sequence so the color does not bleed into whatever is printed next.
func colored(color, msg string) string {
	return color + msg + colorReset
}
57+
4658
var (
4759
workerURL = os.Getenv("WORKER_URL")
4860
sessionID = os.Getenv("SESSION_ID")
4961
logFilePath = "/home/steam/gmodserver/garrysmod/console.log"
5062
pidFilePath = "/home/steam/gmodserver/garrysmod/gmod.pid"
5163
metadataDir = "/home/steam/metadata"
52-
scriptDir = "/home/steam/gmodserver/garrysmod/lua/gluadev"
64+
scriptDir = "/home/steam/gmodserver/garrysmod/lua/gluadev"
65+
heartbeatPath = "/home/steam/gmodserver/garrysmod/data/gluadev/heartbeat.txt"
5366
scriptCount = 0
5467

5568
gameBranch string
@@ -91,9 +104,10 @@ func main() {
91104
go tailLogs(ctx, writeChan)
92105
go sendHealthStats(ctx, writeChan)
93106
go monitorGameProcess(ctx, pid, writeChan, cancel)
107+
go monitorHeartbeat(ctx, pid, writeChan, cancel)
94108

95109
listenForCommands(ctx, conn)
96-
shutdown(writeChan, cancel)
110+
shutdown(writeChan, cancel, "")
97111
}
98112

99113
func getGameVersionString() string {
@@ -107,6 +121,7 @@ func getGameVersionString() string {
107121
"x86-64": "sixty-four",
108122
"prerelease": "prerelease",
109123
"dev": "dev",
124+
"network-test": "network-test",
110125
}
111126

112127
if version, ok := versionNameMap[versionName]; ok {
@@ -196,7 +211,7 @@ func monitorGameProcess(ctx context.Context, pid int, writeChan chan<- WebSocket
196211
process, err := os.FindProcess(pid)
197212
if err != nil {
198213
log.Printf("Could not find process with PID %d: %v. Shutting down.", pid, err)
199-
shutdown(writeChan, cancel)
214+
shutdown(writeChan, cancel, colored(colorRed, "Game server process not found."))
200215
return
201216
}
202217

@@ -208,13 +223,78 @@ func monitorGameProcess(ctx context.Context, pid int, writeChan chan<- WebSocket
208223
err := process.Signal(syscall.Signal(0))
209224
if err != nil {
210225
log.Printf("Game server process (PID: %d) is no longer running (err: %v). Shutting down.", pid, err)
211-
shutdown(writeChan, cancel)
226+
shutdown(writeChan, cancel, colored(colorOrange, "Game server process exited."))
227+
return
228+
}
229+
}
230+
}
231+
}
232+
233+
func waitForFile(ctx context.Context, path string, timeout time.Duration) bool {
234+
ticker := time.NewTicker(1 * time.Second)
235+
defer ticker.Stop()
236+
237+
deadline := time.After(timeout)
238+
for {
239+
select {
240+
case <-ctx.Done():
241+
return false
242+
case <-deadline:
243+
return false
244+
case <-ticker.C:
245+
if _, err := os.Stat(path); err == nil {
246+
return true
247+
}
248+
}
249+
}
250+
}
251+
252+
func monitorHeartbeat(ctx context.Context, pid int, writeChan chan<- WebSocketMessageOut, cancel context.CancelFunc) {
253+
if !waitForFile(ctx, heartbeatPath, 15*time.Second) {
254+
log.Println("Heartbeat file never appeared after 15s, server is unresponsive.")
255+
killStaleServer(pid, 15, writeChan, cancel)
256+
return
257+
}
258+
259+
log.Println("Heartbeat file found, monitoring started.")
260+
261+
ticker := time.NewTicker(1 * time.Second)
262+
defer ticker.Stop()
263+
264+
lastGoodHeartbeat := time.Now()
265+
266+
for {
267+
select {
268+
case <-ctx.Done():
269+
return
270+
case <-ticker.C:
271+
if data, err := os.ReadFile(heartbeatPath); err == nil {
272+
if ts, err := strconv.ParseInt(strings.TrimSpace(string(data)), 10, 64); err == nil {
273+
if time.Now().Unix()-ts <= 15 {
274+
lastGoodHeartbeat = time.Now()
275+
}
276+
}
277+
}
278+
279+
if time.Since(lastGoodHeartbeat) >= 15*time.Second {
280+
killStaleServer(pid, int64(time.Since(lastGoodHeartbeat).Seconds()), writeChan, cancel)
212281
return
213282
}
214283
}
215284
}
216285
}
217286

287+
func killStaleServer(pid int, age int64, writeChan chan<- WebSocketMessageOut, cancel context.CancelFunc) {
288+
log.Printf("Server heartbeat lost (%ds stale). Killing PID %d.", age, pid)
289+
290+
process, err := os.FindProcess(pid)
291+
if err == nil {
292+
process.Signal(syscall.SIGKILL)
293+
}
294+
295+
shutdown(writeChan, cancel, colored(colorRed, fmt.Sprintf("Server heartbeat lost (%ds ago). Killing unresponsive server 🔪", age)))
296+
}
297+
218298
func sendMetadata(writeChan chan<- WebSocketMessageOut) {
219299
message := WebSocketMessageOut{
220300
Type: "METADATA",
@@ -227,11 +307,14 @@ func sendMetadata(writeChan chan<- WebSocketMessageOut) {
227307
writeChan <- message
228308
}
229309

230-
func shutdown(writeChan chan<- WebSocketMessageOut, cancel context.CancelFunc) {
310+
func shutdown(writeChan chan<- WebSocketMessageOut, cancel context.CancelFunc, message string) {
231311
shutdownOnce.Do(func() {
232312
log.Println("Initiating shutdown sequence...")
233313
cancel()
234314

315+
if message != "" {
316+
writeChan <- WebSocketMessageOut{Type: "LOG", Payload: message}
317+
}
235318
writeChan <- WebSocketMessageOut{Type: "AGENT_SHUTDOWN", Payload: "Agent is shutting down."}
236319

237320
close(writeChan)

docker/entrypoint.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ echo "Starting the server with gamemode: $gamemode"
1313
pidfile="gmod.pid"
1414

1515
# Dirs
16-
mkdir -p "$server/data"
16+
mkdir -p "$server/data/gluadev"
1717
mkdir -p "$server/lua/gluadev"
1818
touch "$server/console.log"
1919

docker/harness.lua

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@ local Harness = {
55
--- The directory in `lua/` where scripts are saved and executed
66
scriptDir = "gluadev",
77

8+
--- The directory in `data/` where ops-level files are stored
9+
dataDir = "gluadev",
10+
811
--- Script names that have already been seen and executed
912
--- (This is poopy but will be fine for now)
1013
seenScripts = {},
@@ -64,13 +67,19 @@ end
6467
do
6568
local ipairs = ipairs
6669
local include = include
70+
local os_time = os.time
6771
local file_Find = file.Find
72+
local file_Write = file.Write
73+
local timer_Create = timer.Create
6874
local timer_Simple = timer.Simple
6975
local ProtectedCall = ProtectedCall
7076

77+
local dataDir = Harness.dataDir
7178
local scriptDir = Harness.scriptDir
7279
local seenScripts = Harness.seenScripts
7380

81+
--- Logs and executes the given file
82+
--- @param filename string The filename within scriptDir to process
7483
local function processFile( filename )
7584
if seenScripts[filename] then return end
7685

@@ -82,6 +91,7 @@ do
8291
ProtectedCall( include, scriptPath )
8392
end
8493

94+
--- Creates the timer that watches for new script creation
8595
function Harness.CreateScriptWatcher()
8696
local findString = scriptDir .. "/*.lua"
8797

@@ -97,10 +107,20 @@ do
97107

98108
timer_Simple( 0.25, tick )
99109
end
110+
111+
--- Starts the heartbeat: writes the current `os.time()` value to
--- `heartbeat.txt` inside `dataDir` right away, then again every second
function Harness.StartHeartbeat()
    local heartbeatFile = dataDir .. "/" .. "heartbeat.txt"

    --- Overwrites the heartbeat file with the current timestamp
    local function beat()
        -- file.Write is documented as taking string content, so convert
        -- explicitly instead of relying on implicit number coercion
        file_Write( heartbeatFile, tostring( os_time() ) )
    end

    -- Beat once immediately so the file exists before the first timer tick
    beat()

    -- 1 second interval, 0 repetitions = repeat indefinitely
    timer_Create( "GLuaDev_Heartbeat", 1, 0, beat )
end
100119
end
101120

102121
--- Starts the harness: creates the script watcher, then starts the heartbeat
function Harness:Init()
103122
self.CreateScriptWatcher()
123+
self.StartHeartbeat()
104124
end
105125

106126
Harness:Init()

0 commit comments

Comments
 (0)