|
| 1 | +// |
| 2 | +// dbmem-lembed.c |
| 3 | +// sqlitememory |
| 4 | +// |
| 5 | +// Created by Marco Bambini on 04/02/26. |
| 6 | +// |
| 7 | + |
| 8 | +#include "dbmem-lembed.h" |
| 9 | +#include "llama.h" |
| 10 | +#include "ggml.h" |
| 11 | + |
| 12 | +#include <math.h> |
| 13 | +#include <string.h> |
| 14 | + |
| 15 | +#define DEFAULT_CHUNK_SIZE 256 |
| 16 | +#define DEFAULT_OVERLAP 32 |
| 17 | +#define MAX_CONTEXT_SIZE 8192 |
| 18 | +#define MAX_CHUNKS 256 // Pre-allocate for up to this many chunks |
| 19 | + |
// Local embedding engine: wraps a llama.cpp model/context pair plus
// reusable buffers so repeated embedding calls avoid per-call allocation.
struct dbmem_local_engine_t {
    // Model and context (owned; released in dbmem_local_engine_free)
    struct llama_model *model;
    struct llama_context *ctx;
    const struct llama_vocab *vocab;    // borrowed from model, not freed separately
    enum llama_pooling_type pooling;    // pooling mode the context was created with
    llama_memory_t mem;                 // memory handle; cleared between chunks

    // Model info (cached at init)
    int n_embd;                         // embedding vector dimension
    int n_ctx;                          // context window size in tokens
    bool is_encoder_only;               // true for BERT-style encoder-only models

    // Settings
    int chunk_size;                     // tokens per chunk (DEFAULT_CHUNK_SIZE)
    int overlap;                        // tokens shared between adjacent chunks
    bool json_output;
    bool normalize;                     // Whether to L2 normalize embeddings

    // Reusable buffers (avoid repeated allocations)
    llama_token *tokens;
    int tokens_capacity;                // capacity in tokens (MAX_CONTEXT_SIZE)

    // Pre-allocated embedding storage, reused across calls. Results that fit
    // in the pool point directly into it, so pool-backed embeddings are
    // overwritten by the next dbmem_local_compute_embedding call.
    float *embedding_pool;              // Pool for all chunk embeddings
    int embedding_pool_size;            // Number of n_embd-sized vectors that fit

    // Statistics (monotonically increasing over the engine's lifetime)
    int64_t total_tokens_processed;
    int64_t total_embeddings_generated;
};
| 51 | + |
| 52 | +// MARK: - |
| 53 | + |
| 54 | +// L2 normalize an embedding vector (with loop unrolling for better pipelining) |
| 55 | +static void dbmem_embedding_normalize (float *vec, int n) { |
| 56 | + float sum = 0.0f; |
| 57 | + int i = 0; |
| 58 | + |
| 59 | + // Process 4 elements at a time for better CPU pipelining |
| 60 | + for (; i + 3 < n; i += 4) { |
| 61 | + sum += vec[i] * vec[i] + vec[i+1] * vec[i+1] + |
| 62 | + vec[i+2] * vec[i+2] + vec[i+3] * vec[i+3]; |
| 63 | + } |
| 64 | + // Handle remainder |
| 65 | + for (; i < n; i++) { |
| 66 | + sum += vec[i] * vec[i]; |
| 67 | + } |
| 68 | + |
| 69 | + float norm = sqrtf(sum); |
| 70 | + if (norm > 1e-12f) { |
| 71 | + float inv_norm = 1.0f / norm; |
| 72 | + i = 0; |
| 73 | + for (; i + 3 < n; i += 4) { |
| 74 | + vec[i] *= inv_norm; |
| 75 | + vec[i+1] *= inv_norm; |
| 76 | + vec[i+2] *= inv_norm; |
| 77 | + vec[i+3] *= inv_norm; |
| 78 | + } |
| 79 | + for (; i < n; i++) { |
| 80 | + vec[i] *= inv_norm; |
| 81 | + } |
| 82 | + } |
| 83 | +} |
| 84 | + |
// Copy an n-element embedding vector from src into dst.
// Regions must not overlap (memcpy semantics).
static inline void dbmem_embedding_copy (float *dst, const float *src, int n) {
    memcpy(dst, src, (size_t)n * sizeof *dst);
}
| 89 | + |
| 90 | +// Free embedding result structure |
| 91 | +static void dbmem_embedding_free( embedding_result_t *result) { |
| 92 | + if (result == NULL) return; |
| 93 | + |
| 94 | + if (result->chunks != NULL) { |
| 95 | + // Only free individual embeddings if they weren't from the pool |
| 96 | + if (!result->used_pool) { |
| 97 | + for (int i = 0; i < result->n_chunks; i++) { |
| 98 | + if (result->chunks[i].embedding != NULL) { |
| 99 | + dbmem_free(result->chunks[i].embedding); |
| 100 | + } |
| 101 | + } |
| 102 | + } |
| 103 | + dbmem_free(result->chunks); |
| 104 | + result->chunks = NULL; |
| 105 | + } |
| 106 | + result->n_chunks = 0; |
| 107 | + result->used_pool = false; |
| 108 | +} |
| 109 | + |
| 110 | +// MARK: - |
| 111 | + |
// Create a local embedding engine from a GGUF model file.
// On failure: writes a description into err_msg, releases any partially
// built state via dbmem_local_engine_free, and returns NULL.
dbmem_local_engine_t *dbmem_local_engine_init (const char *model_path, char err_msg[DBMEM_MAXERROR_SIZE]) {
    dbmem_local_engine_t *engine = (dbmem_local_engine_t *)dbmem_zeroalloc(sizeof(dbmem_local_engine_t));
    if (!engine) return NULL;

    engine->chunk_size = DEFAULT_CHUNK_SIZE;
    engine->overlap = DEFAULT_OVERLAP;
    engine->json_output = false;

    // Initialize backend
    // NOTE(review): paired with llama_backend_free() in
    // dbmem_local_engine_free — if several engines can be alive at once,
    // the first teardown shuts the backend down for all of them; confirm
    // single-engine usage or move backend init/free to process scope.
    llama_backend_init();

    // Load model
    struct llama_model_params model_params = llama_model_default_params();
    engine->model = llama_model_load_from_file(model_path, model_params);
    if (!engine->model) {
        snprintf(err_msg, DBMEM_MAXERROR_SIZE, "Failed to load model: %s", model_path);
        goto cleanup;
    }

    // Create an embeddings context; batch sizes match the context window so
    // a full context worth of tokens can be encoded in a single batch.
    struct llama_context_params ctx_params = llama_context_default_params();
    ctx_params.embeddings = true;
    ctx_params.n_ctx = MAX_CONTEXT_SIZE;
    ctx_params.n_batch = MAX_CONTEXT_SIZE;
    ctx_params.n_ubatch = MAX_CONTEXT_SIZE;

    engine->ctx = llama_init_from_model(engine->model, ctx_params);
    if (!engine->ctx) {
        snprintf(err_msg, DBMEM_MAXERROR_SIZE, "Failed to create context");
        goto cleanup;
    }

    // Cache model info so hot paths don't re-query the llama API
    engine->vocab = llama_model_get_vocab(engine->model);
    engine->n_embd = llama_model_n_embd(engine->model);
    engine->n_ctx = llama_n_ctx(engine->ctx);
    engine->pooling = llama_pooling_type(engine->ctx);
    engine->mem = llama_get_memory(engine->ctx);

    // Classify model architecture (encoder-only models are BERT-style)
    bool has_encoder = llama_model_has_encoder(engine->model);
    bool has_decoder = llama_model_has_decoder(engine->model);
    engine->is_encoder_only = has_encoder && !has_decoder;

    // Debug
    // NOTE(review): library code printing to stdout may be unwanted in
    // embedded/server contexts — consider routing through a log callback.
    printf("[INFO] Architecture: %s\n", engine->is_encoder_only ? "encoder-only (BERT-style)" : (has_encoder && has_decoder ? "encoder-decoder" : "decoder-only (GPT-style)"));
    printf("[INFO] Embedding dimension: %d\n", engine->n_embd);
    printf("[INFO] Max context: %d tokens\n", engine->n_ctx);

    // Allocate reusable token buffer (sized to the maximum context)
    engine->tokens_capacity = MAX_CONTEXT_SIZE;
    engine->tokens = (llama_token *)dbmem_alloc(sizeof(llama_token) * engine->tokens_capacity);
    if (!engine->tokens) {
        snprintf(err_msg, DBMEM_MAXERROR_SIZE, "Failed to allocate token buffer");
        goto cleanup;
    }

    // Pre-allocate embedding pool to avoid malloc/free per chunk
    engine->embedding_pool_size = MAX_CHUNKS;
    engine->embedding_pool = (float *)dbmem_alloc(sizeof(float) * engine->n_embd * MAX_CHUNKS);
    if (!engine->embedding_pool) {
        snprintf(err_msg, DBMEM_MAXERROR_SIZE, "Failed to allocate embedding pool");
        goto cleanup;
    }

    // Default settings
    engine->normalize = true; // L2 normalize by default
    engine->total_tokens_processed = 0;
    engine->total_embeddings_generated = 0;

    return engine;

cleanup:
    // Releases whatever was successfully built above; tolerates NULL fields.
    dbmem_local_engine_free(engine);
    return NULL;
}
| 188 | + |
// Pre-warm the model by running a dummy inference so lazily compiled
// Metal/GPU shaders are built before the first real embedding request.
// Always returns true: warmup is best-effort and a failure here is not
// treated as fatal (the encode result is deliberately ignored).
bool dbmem_local_engine_warmup (dbmem_local_engine_t *engine) {
    // Pre-warm the model by running a dummy inference
    // This ensures all Metal/GPU shaders are compiled before actual use

    const char *warmup_text = "Warmup";
    int warmup_tokens = llama_tokenize(engine->vocab, warmup_text, (int32_t)strlen(warmup_text), engine->tokens, engine->tokens_capacity, true, true);
    if (warmup_tokens > 0) {
        // NOTE(review): pos/seq_id/logits left NULL — assumes this llama.h
        // version auto-fills positions and sequence ids for such batches;
        // confirm against the pinned llama.cpp revision.
        struct llama_batch batch = {
            .n_tokens = warmup_tokens,
            .token = engine->tokens,
            .embd = NULL,
            .pos = NULL,
            .n_seq_id = NULL,
            .seq_id = NULL,
            .logits = NULL,
        };
        llama_encode(engine->ctx, batch);

        // Drop the warmup state so it cannot influence the next real encode.
        if (engine->mem != NULL) {
            llama_memory_clear(engine->mem, true);
        }
    }

    return true;
}
| 214 | + |
| 215 | +int dbmem_local_compute_embedding (dbmem_local_engine_t *engine, const char *text, int text_len, embedding_result_t *result) { |
| 216 | + memset(result, 0, sizeof(embedding_result_t)); |
| 217 | + if (text_len == -1) text_len = (int)strlen(text); |
| 218 | + if (text_len == 0) return 0; |
| 219 | + |
| 220 | + // Tokenize |
| 221 | + int n_tokens = llama_tokenize(engine->vocab, text, text_len, engine->tokens, engine->tokens_capacity, true, true); |
| 222 | + if (n_tokens < 0) { |
| 223 | + snprintf(result->err_msg, DBMEM_MAXERROR_SIZE, "Tokenization failed (text too long?)"); |
| 224 | + return -1; |
| 225 | + } |
| 226 | + |
| 227 | + // Calculate chunks |
| 228 | + int chunk_size = engine->chunk_size; |
| 229 | + int overlap = engine->overlap; |
| 230 | + int step = chunk_size - overlap; |
| 231 | + if (step < 1) step = 1; |
| 232 | + int n_chunks = (n_tokens <= chunk_size) ? 1 : 1 + (n_tokens - chunk_size + step - 1) / step; |
| 233 | + |
| 234 | + // Check if we exceed pre-allocated pool |
| 235 | + bool use_pool = (n_chunks <= engine->embedding_pool_size); |
| 236 | + |
| 237 | + // Setup result |
| 238 | + result->total_tokens = n_tokens; |
| 239 | + result->total_chars = text_len; |
| 240 | + result->n_embd = engine->n_embd; |
| 241 | + result->chunk_size = chunk_size; |
| 242 | + result->overlap = overlap; |
| 243 | + result->n_chunks = n_chunks; |
| 244 | + result->used_pool = use_pool; |
| 245 | + result->chunks = (chunk_result_t *)dbmem_zeroalloc(n_chunks * sizeof(chunk_result_t)); |
| 246 | + if (!result->chunks) { |
| 247 | + snprintf(result->err_msg, DBMEM_MAXERROR_SIZE, "Failed to allocate chunks"); |
| 248 | + return -1; |
| 249 | + } |
| 250 | + |
| 251 | + // Calculate average chars per token for position estimation |
| 252 | + float avg_chars_per_token = (n_tokens > 0) ? (float)text_len / n_tokens : 1.0f; |
| 253 | + |
| 254 | + // Process each chunk |
| 255 | + for (int chunk_idx = 0; chunk_idx < n_chunks; chunk_idx++) { |
| 256 | + int token_start = chunk_idx * step; |
| 257 | + int token_length = chunk_size; |
| 258 | + |
| 259 | + if (token_start + token_length > n_tokens) { |
| 260 | + token_length = n_tokens - token_start; |
| 261 | + } |
| 262 | + |
| 263 | + // Create batch |
| 264 | + struct llama_batch batch = { |
| 265 | + .n_tokens = token_length, |
| 266 | + .token = &engine->tokens[token_start], |
| 267 | + .embd = NULL, |
| 268 | + .pos = NULL, |
| 269 | + .n_seq_id = NULL, |
| 270 | + .seq_id = NULL, |
| 271 | + .logits = NULL, |
| 272 | + }; |
| 273 | + |
| 274 | + // Clear memory |
| 275 | + if (engine->mem != NULL) { |
| 276 | + llama_memory_clear(engine->mem, true); |
| 277 | + } |
| 278 | + |
| 279 | + // Always use llama_encode() for embedding generation |
| 280 | + int ret = llama_encode(engine->ctx, batch); |
| 281 | + |
| 282 | + if (ret != 0) { |
| 283 | + snprintf(result->err_msg, DBMEM_MAXERROR_SIZE, "llama_encode failed for chunk %d", chunk_idx); |
| 284 | + dbmem_embedding_free(result); |
| 285 | + return -1; |
| 286 | + } |
| 287 | + |
| 288 | + // Get embeddings |
| 289 | + const float *emb_ptr = NULL; |
| 290 | + if (engine->pooling == LLAMA_POOLING_TYPE_NONE) { |
| 291 | + emb_ptr = llama_get_embeddings_ith(engine->ctx, token_length - 1); |
| 292 | + } else { |
| 293 | + emb_ptr = llama_get_embeddings_seq(engine->ctx, 0); |
| 294 | + } |
| 295 | + |
| 296 | + if (!emb_ptr) { |
| 297 | + snprintf(result->err_msg, DBMEM_MAXERROR_SIZE, "Failed to get embeddings for chunk %d", chunk_idx); |
| 298 | + dbmem_embedding_free(result); |
| 299 | + return -1; |
| 300 | + } |
| 301 | + |
| 302 | + // Store result |
| 303 | + chunk_result_t *chunk = &result->chunks[chunk_idx]; |
| 304 | + chunk->index = chunk_idx; |
| 305 | + chunk->token_start = token_start; |
| 306 | + chunk->token_length = token_length; |
| 307 | + chunk->char_start = (int)(token_start * avg_chars_per_token); |
| 308 | + chunk->char_length = (int)(token_length * avg_chars_per_token); |
| 309 | + |
| 310 | + if (chunk->char_start > text_len) chunk->char_start = text_len; |
| 311 | + if (chunk->char_start + chunk->char_length > text_len) { |
| 312 | + chunk->char_length = text_len - chunk->char_start; |
| 313 | + } |
| 314 | + |
| 315 | + // Use pre-allocated pool if possible, otherwise malloc |
| 316 | + if (use_pool) { |
| 317 | + chunk->embedding = &engine->embedding_pool[chunk_idx * engine->n_embd]; |
| 318 | + } else { |
| 319 | + chunk->embedding = (float *)dbmem_alloc(sizeof(float) * engine->n_embd); |
| 320 | + if (!chunk->embedding) { |
| 321 | + snprintf(result->err_msg, DBMEM_MAXERROR_SIZE, "Failed to allocate embedding"); |
| 322 | + dbmem_embedding_free(result); |
| 323 | + return -1; |
| 324 | + } |
| 325 | + } |
| 326 | + |
| 327 | + // Copy embedding (using SIMD-optimized copy) |
| 328 | + dbmem_embedding_copy(chunk->embedding, emb_ptr, engine->n_embd); |
| 329 | + |
| 330 | + // Normalize if enabled |
| 331 | + if (engine->normalize) { |
| 332 | + dbmem_embedding_normalize(chunk->embedding, engine->n_embd); |
| 333 | + } |
| 334 | + |
| 335 | + // Update statistics |
| 336 | + engine->total_tokens_processed += token_length; |
| 337 | + engine->total_embeddings_generated++; |
| 338 | + } |
| 339 | + |
| 340 | + return 0; |
| 341 | +} |
| 342 | + |
| 343 | +void dbmem_local_engine_free (dbmem_local_engine_t *engine) { |
| 344 | + if (!engine) return; |
| 345 | + |
| 346 | + if (engine->embedding_pool) { |
| 347 | + dbmem_free(engine->embedding_pool); |
| 348 | + engine->embedding_pool = NULL; |
| 349 | + } |
| 350 | + if (engine->tokens) { |
| 351 | + dbmem_free(engine->tokens); |
| 352 | + engine->tokens = NULL; |
| 353 | + } |
| 354 | + if (engine->ctx) { |
| 355 | + llama_free(engine->ctx); |
| 356 | + engine->ctx = NULL; |
| 357 | + } |
| 358 | + if (engine->model) { |
| 359 | + llama_model_free(engine->model); |
| 360 | + engine->model = NULL; |
| 361 | + } |
| 362 | + llama_backend_free(); |
| 363 | +} |
0 commit comments