Skip to content

Commit 890910f

Browse files
Added server, client, and proxy
0 parents  commit 890910f

23 files changed

Lines changed: 3562 additions & 0 deletions

File tree

.github/workflows/release.yml

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
# Release workflow: cross-compiles the server and client binaries for every
# supported OS/arch, archives them together with the example env files, and
# publishes everything as a GitHub release.
#
# Triggers: pushing a tag matching "v*", or a manual workflow_dispatch run
# (which must supply the release version explicitly, since there is no tag).
name: Release

on:
  push:
    tags:
      - "v*"
  workflow_dispatch:
    inputs:
      version:
        description: "Release version (e.g. v1.2.3)"
        required: true

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        include:
          - { goos: linux, goarch: amd64, suffix: linux-amd64 }
          - { goos: linux, goarch: arm64, suffix: linux-arm64 }
          - { goos: linux, goarch: arm, goarm: "7", suffix: linux-armv7 }
          - { goos: linux, goarch: "386", suffix: linux-386 }
          - { goos: darwin, goarch: amd64, suffix: darwin-amd64 }
          - { goos: darwin, goarch: arm64, suffix: darwin-arm64 }
          - { goos: windows, goarch: amd64, suffix: windows-amd64 }
          - { goos: windows, goarch: "386", suffix: windows-386 }
          - { goos: windows, goarch: arm64, suffix: windows-arm64 }

    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-go@v5
        with:
          go-version: "1.26"

      - name: Build
        env:
          GOOS: ${{ matrix.goos }}
          GOARCH: ${{ matrix.goarch }}
          # Empty (and harmless) for every matrix entry except linux-armv7.
          GOARM: ${{ matrix.goarm }}
          CGO_ENABLED: "0"
          # Tag builds use the ref name; manual runs use the supplied input.
          VERSION: ${{ github.event.inputs.version || github.ref_name }}
        run: |
          EXT=""
          if [ "$GOOS" = "windows" ]; then EXT=".exe"; fi
          go build -trimpath -ldflags="-s -w -X main.version=${VERSION}" \
            -o dist/server${EXT} ./src/server
          go build -trimpath -ldflags="-s -w -X main.version=${VERSION}" \
            -o dist/client${EXT} ./src/client

      - name: Archive (unix)
        if: matrix.goos != 'windows'
        env:
          VERSION: ${{ github.event.inputs.version || github.ref_name }}
        run: |
          tar -czf distributed-llama-${{ matrix.suffix }}-${VERSION}.tar.gz \
            dist/server dist/client \
            examples/env/.env.server \
            examples/env/.env.client

      - name: Archive (windows)
        if: matrix.goos == 'windows'
        env:
          VERSION: ${{ github.event.inputs.version || github.ref_name }}
        run: |
          zip distributed-llama-${{ matrix.suffix }}-${VERSION}.zip \
            dist/server.exe dist/client.exe \
            examples/env/.env.server \
            examples/env/.env.client

      - uses: actions/upload-artifact@v4
        with:
          name: release-${{ matrix.suffix }}
          # Archives are named distributed-llama-<suffix>-<version>.<ext>, so
          # the glob must allow the "-<version>" segment. The previous pattern
          # "distributed-llama-<suffix>.*" required a literal dot immediately
          # after the suffix and therefore matched nothing.
          path: distributed-llama-${{ matrix.suffix }}-*

  release:
    needs: build
    runs-on: ubuntu-latest
    permissions:
      contents: write
    steps:
      - uses: actions/download-artifact@v4
        with:
          pattern: release-*
          merge-multiple: true
          path: dist/

      - name: Create GitHub release
        uses: softprops/action-gh-release@v2
        with:
          tag_name: ${{ github.event.inputs.version || github.ref_name }}
          files: dist/*
          generate_release_notes: true

.gitignore

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
# TLS certificate/key bundles (generated by `make gen-cert`)
*.pem
# Build output
bin/
.claude/

# Compiled binaries and libraries
*.exe
*.exe~
*.dll
*.so
*.dylib

# Test and coverage artifacts
*.test
*.out
coverage.*
*.coverprofile
profile.cov

# Go workspace files (developer-local)
go.work
go.work.sum

LICENSE

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2026 Marian-Sergiu Nistor
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

Makefile

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
# Build, codegen, and certificate helpers for distributed-llama.
.PHONY: proto install-tools build-server build-client build gen-cert clean

# Remove half-written targets when a recipe fails so they never look up to date.
.DELETE_ON_ERROR:

PROTO_DIR = src/proto
PROTO_FILE = $(PROTO_DIR)/inference.proto

# Windows sets OS=Windows_NT; binaries need the .exe suffix there.
ifeq ($(OS),Windows_NT)
EXT = .exe
else
EXT =
endif

SERVER_BIN = bin/server/server$(EXT)
CLIENT_BIN = bin/client/client$(EXT)

# Regenerate Go gRPC stubs from the protobuf definition
# (requires protoc plus the plugins installed by `make install-tools`).
proto: $(PROTO_FILE)
	protoc \
		--go_out=. \
		--go_opt=module=distributed-llama \
		--go-grpc_out=. \
		--go-grpc_opt=module=distributed-llama \
		$(PROTO_FILE)

# One-time install of the protoc Go code generators.
install-tools:
	go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
	go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest

build: build-server build-client

# `mkdir -p` already succeeds when the directory exists, so no `|| true` —
# suppressing its exit status would also hide real failures (e.g. permissions).
build-server:
	mkdir -p bin/server
	go build -o $(SERVER_BIN) ./src/server

build-client:
	mkdir -p bin/client
	go build -o $(CLIENT_BIN) ./src/client

# LAN IP of the server machine; embedded in the certificate's SANs.
SERVER_IP ?= 127.0.0.1

# Generate a self-signed cert+key bundle (shared.pem) for mutual TLS.
# MSYS_NO_PATHCONV stops Git-Bash on Windows from mangling the -subj path.
gen-cert:
	printf '[req]\ndistinguished_name=dn\n[dn]\n' > _openssl.cnf
	OPENSSL_CONF=_openssl.cnf MSYS_NO_PATHCONV=1 openssl req -x509 -newkey ec -pkeyopt ec_paramgen_curve:P-256 \
		-keyout _key.pem -out _cert.pem -days 3650 -nodes \
		-subj "/CN=distributed-llama" \
		-addext "subjectAltName=IP:$(SERVER_IP),IP:127.0.0.1,DNS:localhost"
	cat _cert.pem _key.pem > shared.pem
	rm -f _openssl.cnf _cert.pem _key.pem
	@echo "shared.pem generated. Copy it to all client machines."

clean:
	rm -rf bin/ generated/inference/

README.md

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
# distributed-llama.cpp

Runs LLM inference across multiple machines. A central **server** accepts OpenAI-compatible HTTP requests and routes them to **client** nodes, each running llama.cpp in Docker. All communication is secured with mutual TLS using a single shared certificate.

---

## How it works

### Server

The server does two things: it manages clients over gRPC, and it exposes an OpenAI-compatible HTTP API.

When a client connects, the server dials back to it (dial-back pattern), checks whether it already has the model cached, and if not streams the model file to it in 256 KB chunks. It then tells the client to start its llama.cpp Docker container. The connection stays open for the lifetime of the client session: the server uses it to send commands and the client uses it to heartbeat.

When an inference request arrives over HTTP, the server polls all connected clients for their current capacity (how many concurrent in-flight requests they have free), sorts by most available slots and lowest average latency, and forwards the request to the best one. If all clients are at capacity it still routes to one, chosen at random.

### Client

The client connects to the server, registers itself, and waits for commands. It runs a local gRPC server (the **agent**) on a separate port so the server can dial back for capacity queries and inference forwarding.

After receiving the model (or confirming it's already cached), the client pulls the appropriate llama.cpp Docker image and starts a container with `--parallel MAX_SLOTS`, mounting the model file read-only. It polls llama.cpp's `/health` endpoint every 2 seconds until the model is fully loaded, then sends a single hardcoded warmup prompt. Once the warmup completes, the client marks itself ready and begins reporting capacity to the server.

Capacity is queried directly from llama.cpp's `/slots` endpoint, which returns one object per slot with a `state` field. The client counts idle slots (state = 0) and returns that number to the server for routing decisions.

The client sends a heartbeat to the server every 10 seconds. After 10 consecutive failures it stops the Docker container and triggers a reconnect. If the server shuts down cleanly (sends `StopContainer` then closes the stream), the client exits instead of reconnecting.

GPU mode is detected automatically via `nvidia-smi`. It can be overridden with the `GPU_MODE` env var.

### Security

All gRPC communication uses mutual TLS. Both server and clients share a single `.pem` file containing a self-signed certificate and private key. Each side verifies the other's certificate against the same CA. The server's dial-back to client agent addresses skips hostname verification (since client IPs are not in the certificate SANs) but still validates the certificate chain.

---

## Project structure

```
src/
  server/     coordinator gRPC service + HTTP API
  client/     inference node agent
  shared/     mTLS, config, model streaming
  proto/      protobuf definitions
generated/    generated gRPC code (do not edit)
tst/
  e2e/        end-to-end tests
bin/
  server/     server binary + .env
  client/     client binary + .env
```

---

## Setup

### 1. Install tools

Install [protoc](https://github.com/protocolbuffers/protobuf/releases), then:

```bash
make install-tools
```

### 2. Generate gRPC code

```bash
make proto
```

This generates Go code from `src/proto/inference.proto` into `generated/inference/`.

### 3. Generate the shared TLS certificate

```bash
make gen-cert SERVER_IP=192.168.1.100
```

Replace `192.168.1.100` with the LAN IP of the machine running the server. This creates `shared.pem`. Copy it to every client machine alongside the client binary.

For local testing only:

```bash
make gen-cert SERVER_IP=127.0.0.1
```

### 4. Configure

Copy the example env files and fill them in:

```bash
cp examples/env/.env.server bin/server/.env
cp examples/env/.env.client bin/client/.env
```

### 5. Build

```bash
make build
```

Produces `bin/server/server(.exe)` and `bin/client/client(.exe)`.

---

## Configuration

### Server

| Variable     | Default        | Description                                          |
| ------------ | -------------- | ---------------------------------------------------- |
| `MODEL_PATH` | -              | Path to the `.gguf` model file to serve to clients   |
| `PEM_PATH`   | `./shared.pem` | Path to the shared mTLS certificate                  |
| `GRPC_PORT`  | `50051`        | Port for the gRPC coordinator (clients connect here) |
| `HTTP_PORT`  | `8181`         | Port for the OpenAI-compatible HTTP API              |

### Client

| Variable            | Default        | Description                                                                    |
| ------------------- | -------------- | ------------------------------------------------------------------------------ |
| `SERVER_ADDR`       | -              | Server address in `host:port` format, e.g. `192.168.1.100:50051`               |
| `PEM_PATH`          | `./shared.pem` | Path to the shared mTLS certificate                                            |
| `MODEL_STORAGE_DIR` | `./models`     | Directory where downloaded model files are stored (created automatically)      |
| `GPU_MODE`          | `auto`         | `auto` detects NVIDIA GPU via `nvidia-smi`, `gpu` forces GPU, `cpu` forces CPU |
| `AGENT_PORT`        | `50052`        | Port the client's gRPC agent listens on (server dials back here)               |
| `LLAMA_PORT`        | `18080`        | Host port mapped to the llama.cpp Docker container                             |
| `MAX_SLOTS`         | `16`           | Maximum number of concurrent inference requests this client will accept        |

---

## Running

Each binary reads `.env` from its working directory.

```bash
cd bin/server && ./server
cd bin/client && ./client
```

Place `shared.pem` at the path specified via the configuration. On client machines, the model is downloaded automatically on first connect.

---

## Testing

The e2e test sends a chat completion request to a running server and verifies the response contains at least one character.

**The server (and at least one client) must already be running before the test is executed.**

```bash
go test ./tst/e2e/ -v -timeout 5m -count=1
```

By default the test hits `http://localhost:8181`. Override with `SERVER_ADDR`:

```bash
SERVER_ADDR=http://192.168.1.100:8181 go test ./tst/e2e/ -v -timeout 5m -count=1
```

The `-timeout 5m` flag is recommended since inference can take time depending on hardware and model size.

---

## Deployment layout (multi-machine)

```
Server machine
├── bin/server/server
├── bin/server/.env         (MODEL_PATH points to your .gguf file)
├── bin/server/shared.pem
└── bin/server/models/
    └── your-model.gguf

Client machine(s)
├── bin/client/client
├── bin/client/.env         (SERVER_ADDR points to server machine)
└── bin/client/shared.pem   (same file as on server)
```

The model is transferred automatically from server to client on first connect. Subsequent restarts skip the transfer if the local file matches the server's file size.

---

## License

MIT - see [LICENSE](LICENSE).

examples/env/.env.client

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
# Address of the coordinator server's gRPC port (host:port).
SERVER_ADDR=192.168.1.100:50051
# Shared mTLS certificate bundle (the same shared.pem as on the server).
PEM_PATH=./shared.pem
# Where downloaded model files are stored (created automatically).
MODEL_STORAGE_DIR=./models
# auto = detect NVIDIA GPU via nvidia-smi; gpu/cpu force the mode.
GPU_MODE=auto
# Port this client's gRPC agent listens on (the server dials back here).
AGENT_PORT=50052
# Host port mapped to the llama.cpp Docker container.
LLAMA_PORT=18080
# Maximum number of concurrent inference requests this client accepts.
MAX_SLOTS=16

examples/env/.env.server

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
# Path to the .gguf model file streamed to clients on first connect.
MODEL_PATH=./models/Qwen3.5-4B-Q4_0.gguf
# Shared mTLS certificate bundle (generated by `make gen-cert`).
PEM_PATH=./shared.pem
# Port for the gRPC coordinator (clients connect here).
GRPC_PORT=50051
# Port for the OpenAI-compatible HTTP API.
HTTP_PORT=8181

0 commit comments

Comments
 (0)