Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
.devcontainer
models
backends
volumes
examples/chatbot-ui/models
backend/go/image/stablediffusion-ggml/build/
backend/go/*/build
Expand Down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ go-bert
LocalAI
/local-ai
/local-ai-launcher
# Root-level build artifacts when running `go build ./...` against
# Go backend packages whose main lives under backend/go/.
/cloud-proxy
/local-store
# prevent above rules from omitting the helm chart
!charts/*
# prevent above rules from omitting the api/localai folder
Expand Down
10 changes: 9 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -1064,6 +1064,7 @@ BACKEND_DS4 = ds4|ds4|.|false|false
# Golang backends
BACKEND_PIPER = piper|golang|.|false|true
BACKEND_LOCAL_STORE = local-store|golang|.|false|true
BACKEND_CLOUD_PROXY = cloud-proxy|golang|.|false|true
BACKEND_HUGGINGFACE = huggingface|golang|.|false|true
BACKEND_SILERO_VAD = silero-vad|golang|.|false|true
BACKEND_STABLEDIFFUSION_GGML = stablediffusion-ggml|golang|.|--progress=plain|true
Expand Down Expand Up @@ -1149,6 +1150,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_TURBOQUANT)))
$(eval $(call generate-docker-build-target,$(BACKEND_DS4)))
$(eval $(call generate-docker-build-target,$(BACKEND_PIPER)))
$(eval $(call generate-docker-build-target,$(BACKEND_LOCAL_STORE)))
$(eval $(call generate-docker-build-target,$(BACKEND_CLOUD_PROXY)))
$(eval $(call generate-docker-build-target,$(BACKEND_HUGGINGFACE)))
$(eval $(call generate-docker-build-target,$(BACKEND_SILERO_VAD)))
$(eval $(call generate-docker-build-target,$(BACKEND_STABLEDIFFUSION_GGML)))
Expand Down Expand Up @@ -1201,7 +1203,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_SHERPA_ONNX)))
docker-save-%: backend-images
docker save local-ai-backend:$* -o backend-images/$*.tar

docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-liquid-audio docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx
docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-liquid-audio docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx docker-build-cloud-proxy

########################################################
### Mock Backend for E2E Tests
Expand All @@ -1213,6 +1215,12 @@ build-mock-backend: protogen-go
clean-mock-backend:
rm -f tests/e2e/mock-backend/mock-backend

# Build the cloud-proxy Go backend (./backend/go/cloud-proxy) and place the
# binary under tests/e2e/mock-backend/. Depends on protogen-go, so that
# target runs first.
build-cloud-proxy-backend: protogen-go
$(GOCMD) build -o tests/e2e/mock-backend/cloud-proxy ./backend/go/cloud-proxy

# Remove the cloud-proxy binary written by build-cloud-proxy-backend.
# rm -f keeps this a no-op when the binary does not exist.
clean-cloud-proxy-backend:
rm -f tests/e2e/mock-backend/cloud-proxy

########################################################
### UI E2E Test Server
########################################################
Expand Down
151 changes: 151 additions & 0 deletions backend/backend.proto
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,22 @@ service Backend {

rpc Rerank(RerankRequest) returns (RerankResult) {}

// TokenClassify runs a token-classification (NER) model on the
// supplied text and returns each detected entity span. Used by the
// PII redactor's optional NER tier — the regex tier still handles
// formatted hits cheaply, while this catches names, locations, and
// other unformatted PII that regex misses.
rpc TokenClassify(TokenClassifyRequest) returns (TokenClassifyResponse) {}

// Score evaluates the model's joint log-probability of each
// supplied candidate continuation given a shared prompt. The
// prompt's KV cache is computed once and reused across candidates.
// Used for routing-policy multi-label classification, reranking,
// calibrated confidence, and reward-model scoring — any task where
// the consumer wants the model's confidence in a pre-specified
// continuation rather than a generated one.
rpc Score(ScoreRequest) returns (ScoreResponse) {}

rpc GetMetrics(MetricsRequest) returns (MetricsResponse);

rpc VAD(VADRequest) returns (VADResponse) {}
Expand Down Expand Up @@ -68,6 +84,23 @@ service Backend {
rpc QuantizationProgress(QuantizationProgressRequest) returns (stream QuantizationProgressUpdate) {}
rpc StopQuantization(QuantizationStopRequest) returns (Result) {}

// Forward proxies a raw HTTP request to an upstream provider. The
// cloud-proxy backend implements this for passthrough-mode model
// configs: the client wire format is preserved end-to-end (no
// translation through internal proto), which means new provider
// fields work the day they ship. Translation-mode proxies use the
// standard Predict/PredictStream RPCs instead. Backends that don't
// support this return UNIMPLEMENTED.
//
// The request is bidirectionally streamed so large bodies can flow
// without buffering. In practice the first ForwardRequest carries
// path, method, headers, and the initial body chunk; subsequent
// messages append body chunks. The first ForwardReply carries the
// upstream status and response headers; subsequent messages stream
// body chunks (SSE frames or chunked transfer). Cancellation of the
// gRPC context closes the upstream connection.
rpc Forward(stream ForwardRequest) returns (stream ForwardReply) {}

}

// Define the empty request
Expand All @@ -81,6 +114,76 @@ message MetricsResponse {
int32 prompt_tokens_processed = 5;
}

// TokenClassifyRequest carries the text to classify plus an optional
// score threshold. The transformers backend interprets threshold as
// the minimum confidence to include in the response; 0 = include all.
message TokenClassifyRequest {
// Text to run token classification over.
string text = 1;
// Minimum confidence an entity needs in order to be returned.
// 0 disables filtering (every detected entity is included).
float threshold = 2;
}

// TokenClassifyEntity is one detected entity span. Byte offsets are
// into the original UTF-8 text — start..end is a half-open range
// (start inclusive, end exclusive) that addresses the substring
// corresponding to entity_group.
//
// entity_group follows HuggingFace's aggregated-tag convention (e.g.
// "PER", "LOC", "ORG", or a PII-specific label like "EMAIL" /
// "SSN" depending on the model). The redactor's per-pattern action
// map keys off this string.
message TokenClassifyEntity {
// Label assigned to the span; the set of labels depends on the model.
string entity_group = 1;
// Byte offset of the first byte of the span (inclusive).
int32 start = 2;
// Byte offset one past the last byte of the span (exclusive).
int32 end = 3;
// Confidence for this span.
// NOTE(review): presumably the value compared against
// TokenClassifyRequest.threshold — confirm with the backend.
float score = 4;
// NOTE(review): presumably the substring of the input at start..end —
// confirm; not specified in this file.
string text = 5;
}

// TokenClassifyResponse is the result of the TokenClassify RPC: the
// entity spans detected in TokenClassifyRequest.text.
message TokenClassifyResponse {
// Detected entity spans.
// NOTE(review): ordering of the entries is not specified here.
repeated TokenClassifyEntity entities = 1;
}

// ScoreRequest carries one shared prompt and one or more continuations
// to score against it. The backend tokenises the prompt once and reuses
// the resulting KV cache across all candidates in this request.
message ScoreRequest {
// Shared prompt that every candidate is scored as a continuation of.
string prompt = 1;
// Candidate continuations to score. ScoreResponse.candidates is
// matched to this list by index.
repeated string candidates = 2;
// Return per-token logprobs for each candidate when true. Default
// false to keep the wire response small; the joint log_prob field
// covers the common ranking case.
bool include_token_logprobs = 3;
// When true, the response also populates length_normalized_log_prob
// (joint log-prob divided by candidate token count). Useful when
// candidates differ in length and the consumer wants a per-token
// measure comparable across them (PMI-style scoring).
bool length_normalize = 4;
}

// CandidateScore is one row in the ScoreResponse. Rows are matched by
// index to the entries of ScoreRequest.candidates.
message CandidateScore {
// Sum of log P(token_i | prompt, candidate_token_<i) across the
// candidate's tokens, i.e. the joint log-probability of the whole
// candidate given the prompt. The primary ranking signal.
double log_prob = 1;
// log_prob / num_tokens — populated when length_normalize=true on
// the request.
double length_normalized_log_prob = 2;
// Per-token detail — populated when include_token_logprobs=true on
// the request.
repeated TokenLogProb tokens = 3;
// Number of tokens the backend tokenised this candidate into, after
// any backend-specific normalisation (e.g. leading-space handling).
// This is the divisor used for length_normalized_log_prob.
int32 num_tokens = 4;
}

// TokenLogProb is one entry of CandidateScore.tokens: a single token of
// a candidate together with its log-probability.
message TokenLogProb {
// The token.
// NOTE(review): whether this is the decoded text piece or a raw
// vocabulary entry is not specified here — confirm with the backend.
string token = 1;
// Log-probability of this token.
// NOTE(review): presumably the per-token term summed into
// CandidateScore.log_prob — confirm.
double log_prob = 2;
}

// ScoreResponse is the result of the Score RPC.
message ScoreResponse {
// One row per entry of ScoreRequest.candidates, matched by index.
repeated CandidateScore candidates = 1;
}

message RerankRequest {
string query = 1;
repeated string documents = 2;
Expand Down Expand Up @@ -325,6 +428,25 @@ message ModelOptions {
// applied verbatim to the backend's engine constructor (e.g. vLLM AsyncEngineArgs).
// Unknown keys produce an error at LoadModel time.
string EngineArgs = 73;

// Proxy carries the cloud-proxy backend's per-model configuration.
// Empty for non-proxy backends.
ProxyOptions Proxy = 74;
}

// ProxyOptions configures the cloud-proxy backend. upstream_url and
// mode are always meaningful; provider only matters in translate mode.
// The two api_key_* fields are mutually exclusive and resolved by the
// backend at LoadModel — core forwards the references rather than the
// plaintext key.
message ProxyOptions {
// Base URL of the upstream provider. In the Forward RPC,
// ForwardRequest.path is appended to this value.
string upstream_url = 1;
// Proxy mode.
// NOTE(review): the accepted string values are not visible in this
// file; the surrounding comments refer to a passthrough mode and a
// translate mode — confirm the exact spellings.
string mode = 2;
// Upstream provider identifier; only consulted in translate mode.
string provider = 3;
// Reference to the API key by environment variable.
// NOTE(review): presumably the variable's name, not its value —
// confirm. Mutually exclusive with api_key_file.
string api_key_env = 4;
// Reference to the API key by file.
// NOTE(review): presumably a path readable by the backend — confirm.
// Mutually exclusive with api_key_env.
string api_key_file = 5;
// NOTE(review): presumably the model name to use on the upstream
// side — confirm; not specified in this file.
string upstream_model = 6;
// Request timeout, in seconds.
// NOTE(review): the meaning of 0 (no timeout vs. a backend default)
// is not specified here — confirm.
int32 request_timeout_seconds = 7;
}

message Result {
Expand Down Expand Up @@ -1002,3 +1124,32 @@ message QuantizationStopRequest {
string job_id = 1;
}

// ForwardHeader is one HTTP header on the request or response. Headers
// like Authorization are typically injected by the backend (from the
// resolved API key) rather than passed through from the client.
message ForwardHeader {
// Header name.
string name = 1;
// Header value.
// NOTE(review): how repeated headers are represented (multiple
// ForwardHeader entries vs. one joined value) is not specified here.
string value = 2;
}

// ForwardRequest is a streamed HTTP request to the upstream. The first
// message carries path/method/headers and may also carry the initial
// body chunk; subsequent messages carry body_chunk only. All fields
// except body_chunk are honoured on the first message and ignored
// thereafter.
message ForwardRequest {
string path = 1; // e.g. "/v1/chat/completions" — appended to the model's upstream_url
string method = 2; // usually "POST"
// Request headers; honoured on the first message only.
repeated ForwardHeader headers = 3;
// A piece of the request body. Chunks from successive messages are
// appended in stream order to form the full body.
bytes body_chunk = 4;
}

// ForwardReply is a streamed HTTP response from the upstream. First
// message carries status/headers; subsequent messages carry body_chunk
// only. SSE responses arrive as a sequence of body_chunk frames; the
// caller is responsible for any parsing.
message ForwardReply {
// Upstream status; carried on the first message.
// NOTE(review): presumably the numeric HTTP status code — confirm.
int32 status = 1;
// Upstream response headers; carried on the first message.
repeated ForwardHeader headers = 2;
// A piece of the response body, delivered in stream order.
// NOTE(review): whether the first message may also carry a body
// chunk is not stated here — confirm with the backend.
bytes body_chunk = 3;
}

Loading
Loading