Commit 1423a1f

First version
1 parent 79da23d commit 1423a1f

14 files changed

Lines changed: 9592 additions & 0 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
.DS_Store

.gitmodules

Lines changed: 3 additions & 0 deletions
[submodule "modules/llama.cpp"]
	path = modules/llama.cpp
	url = https://github.com/ggml-org/llama.cpp

modules/llama.cpp

Submodule llama.cpp added at 8abcc70

src/dbmem-lembed.c

Lines changed: 363 additions & 0 deletions
//
//  dbmem-lembed.c
//  sqlitememory
//
//  Created by Marco Bambini on 04/02/26.
//

#include "dbmem-lembed.h"
#include "llama.h"
#include "ggml.h"

#include <stdio.h>      // printf/snprintf used below
#include <math.h>
#include <string.h>

#define DEFAULT_CHUNK_SIZE  256
#define DEFAULT_OVERLAP     32
#define MAX_CONTEXT_SIZE    8192
#define MAX_CHUNKS          256     // Pre-allocate for up to this many chunks

struct dbmem_local_engine_t {
    // Model and context
    struct llama_model *model;
    struct llama_context *ctx;
    const struct llama_vocab *vocab;
    enum llama_pooling_type pooling;
    llama_memory_t mem;

    // Model info
    int n_embd;
    int n_ctx;
    bool is_encoder_only;

    // Settings
    int chunk_size;
    int overlap;
    bool json_output;
    bool normalize;             // Whether to L2 normalize embeddings

    // Reusable buffers (avoid repeated allocations)
    llama_token *tokens;
    int tokens_capacity;

    // Pre-allocated embedding storage
    float *embedding_pool;      // Pool for all chunk embeddings
    int embedding_pool_size;    // Number of embeddings that fit

    // Statistics
    int64_t total_tokens_processed;
    int64_t total_embeddings_generated;
};

// MARK: -

// L2 normalize an embedding vector (with loop unrolling for better pipelining)
static void dbmem_embedding_normalize (float *vec, int n) {
    float sum = 0.0f;
    int i = 0;

    // Process 4 elements at a time for better CPU pipelining
    for (; i + 3 < n; i += 4) {
        sum += vec[i] * vec[i] + vec[i+1] * vec[i+1] +
               vec[i+2] * vec[i+2] + vec[i+3] * vec[i+3];
    }
    // Handle remainder
    for (; i < n; i++) {
        sum += vec[i] * vec[i];
    }

    float norm = sqrtf(sum);
    if (norm > 1e-12f) {
        float inv_norm = 1.0f / norm;
        i = 0;
        for (; i + 3 < n; i += 4) {
            vec[i]   *= inv_norm;
            vec[i+1] *= inv_norm;
            vec[i+2] *= inv_norm;
            vec[i+3] *= inv_norm;
        }
        for (; i < n; i++) {
            vec[i] *= inv_norm;
        }
    }
}
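
// Note: once vectors are unit length, cosine similarity between two embeddings
// reduces to a plain dot product; this is the payoff of normalizing by default.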

// Copy embedding data
static inline void dbmem_embedding_copy (float *dst, const float *src, int n) {
    memcpy(dst, src, sizeof(float) * n);
}

// Free embedding result structure
static void dbmem_embedding_free (embedding_result_t *result) {
    if (result == NULL) return;

    if (result->chunks != NULL) {
        // Only free individual embeddings if they weren't from the pool
        if (!result->used_pool) {
            for (int i = 0; i < result->n_chunks; i++) {
                if (result->chunks[i].embedding != NULL) {
                    dbmem_free(result->chunks[i].embedding);
                }
            }
        }
        dbmem_free(result->chunks);
        result->chunks = NULL;
    }
    result->n_chunks = 0;
    result->used_pool = false;
}

// MARK: -

dbmem_local_engine_t *dbmem_local_engine_init (const char *model_path, char err_msg[DBMEM_MAXERROR_SIZE]) {
    dbmem_local_engine_t *engine = (dbmem_local_engine_t *)dbmem_zeroalloc(sizeof(dbmem_local_engine_t));
    if (!engine) return NULL;

    engine->chunk_size = DEFAULT_CHUNK_SIZE;
    engine->overlap = DEFAULT_OVERLAP;
    engine->json_output = false;

    // Initialize backend
    llama_backend_init();

    // Load model
    struct llama_model_params model_params = llama_model_default_params();
    engine->model = llama_model_load_from_file(model_path, model_params);
    if (!engine->model) {
        snprintf(err_msg, DBMEM_MAXERROR_SIZE, "Failed to load model: %s", model_path);
        goto cleanup;
    }

    // Create context
    struct llama_context_params ctx_params = llama_context_default_params();
    ctx_params.embeddings = true;
    ctx_params.n_ctx = MAX_CONTEXT_SIZE;
    ctx_params.n_batch = MAX_CONTEXT_SIZE;
    ctx_params.n_ubatch = MAX_CONTEXT_SIZE;
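    // llama.cpp evaluates an embedding sequence in a single micro-batch, so
    // n_batch/n_ubatch are sized to the full context window up front.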

    engine->ctx = llama_init_from_model(engine->model, ctx_params);
    if (!engine->ctx) {
        snprintf(err_msg, DBMEM_MAXERROR_SIZE, "Failed to create context");
        goto cleanup;
    }

    // Get model info
    engine->vocab = llama_model_get_vocab(engine->model);
    engine->n_embd = llama_model_n_embd(engine->model);
    engine->n_ctx = llama_n_ctx(engine->ctx);
    engine->pooling = llama_pooling_type(engine->ctx);
    engine->mem = llama_get_memory(engine->ctx);

    // Determine model architecture
    bool has_encoder = llama_model_has_encoder(engine->model);
    bool has_decoder = llama_model_has_decoder(engine->model);
    engine->is_encoder_only = has_encoder && !has_decoder;

    // Debug
    printf("[INFO] Architecture: %s\n", engine->is_encoder_only ? "encoder-only (BERT-style)" : (has_encoder && has_decoder ? "encoder-decoder" : "decoder-only (GPT-style)"));
    printf("[INFO] Embedding dimension: %d\n", engine->n_embd);
    printf("[INFO] Max context: %d tokens\n", engine->n_ctx);

    // Allocate token buffer
    engine->tokens_capacity = MAX_CONTEXT_SIZE;
    engine->tokens = (llama_token *)dbmem_alloc(sizeof(llama_token) * engine->tokens_capacity);
    if (!engine->tokens) {
        snprintf(err_msg, DBMEM_MAXERROR_SIZE, "Failed to allocate token buffer");
        goto cleanup;
    }

    // Pre-allocate embedding pool to avoid malloc/free per chunk
    engine->embedding_pool_size = MAX_CHUNKS;
    engine->embedding_pool = (float *)dbmem_alloc(sizeof(float) * engine->n_embd * MAX_CHUNKS);
    if (!engine->embedding_pool) {
        snprintf(err_msg, DBMEM_MAXERROR_SIZE, "Failed to allocate embedding pool");
        goto cleanup;
    }

    // Default settings
    engine->normalize = true;   // L2 normalize by default
    engine->total_tokens_processed = 0;
    engine->total_embeddings_generated = 0;

    return engine;

cleanup:
    dbmem_local_engine_free(engine);
    return NULL;
}

bool dbmem_local_engine_warmup (dbmem_local_engine_t *engine) {
    // Pre-warm the model by running a dummy inference.
    // This ensures all Metal/GPU shaders are compiled before actual use.

    const char *warmup_text = "Warmup";
    int warmup_tokens = llama_tokenize(engine->vocab, warmup_text, (int32_t)strlen(warmup_text), engine->tokens, engine->tokens_capacity, true, true);
    if (warmup_tokens > 0) {
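        // Zero-filled fields mirror llama_batch_get_one(): llama.cpp substitutes
        // sequential positions and sequence id 0 when pos/seq_id are left NULL.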
        struct llama_batch batch = {
            .n_tokens = warmup_tokens,
            .token = engine->tokens,
            .embd = NULL,
            .pos = NULL,
            .n_seq_id = NULL,
            .seq_id = NULL,
            .logits = NULL,
        };
        llama_encode(engine->ctx, batch);

        if (engine->mem != NULL) {
            llama_memory_clear(engine->mem, true);
        }
    }

    return true;
}

int dbmem_local_compute_embedding (dbmem_local_engine_t *engine, const char *text, int text_len, embedding_result_t *result) {
    memset(result, 0, sizeof(embedding_result_t));
    if (text_len == -1) text_len = (int)strlen(text);
    if (text_len == 0) return 0;

    // Tokenize
    int n_tokens = llama_tokenize(engine->vocab, text, text_len, engine->tokens, engine->tokens_capacity, true, true);
    if (n_tokens < 0) {
        snprintf(result->err_msg, DBMEM_MAXERROR_SIZE, "Tokenization failed (text too long?)");
        return -1;
    }

    // Calculate chunks
    int chunk_size = engine->chunk_size;
    int overlap = engine->overlap;
    int step = chunk_size - overlap;
    if (step < 1) step = 1;
    int n_chunks = (n_tokens <= chunk_size) ? 1 : 1 + (n_tokens - chunk_size + step - 1) / step;
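    // Worked example (illustrative numbers): chunk_size = 256 and overlap = 32
    // give step = 224; for n_tokens = 600 this yields
    // n_chunks = 1 + ceil((600 - 256) / 224) = 3, covering token windows
    // [0,256), [224,480), and [448,600).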

    // Check if we exceed pre-allocated pool
    bool use_pool = (n_chunks <= engine->embedding_pool_size);

    // Setup result
    result->total_tokens = n_tokens;
    result->total_chars = text_len;
    result->n_embd = engine->n_embd;
    result->chunk_size = chunk_size;
    result->overlap = overlap;
    result->n_chunks = n_chunks;
    result->used_pool = use_pool;
    result->chunks = (chunk_result_t *)dbmem_zeroalloc(n_chunks * sizeof(chunk_result_t));
    if (!result->chunks) {
        snprintf(result->err_msg, DBMEM_MAXERROR_SIZE, "Failed to allocate chunks");
        return -1;
    }

    // Calculate average chars per token for position estimation
    float avg_chars_per_token = (n_tokens > 0) ? (float)text_len / n_tokens : 1.0f;

    // Process each chunk
    for (int chunk_idx = 0; chunk_idx < n_chunks; chunk_idx++) {
        int token_start = chunk_idx * step;
        int token_length = chunk_size;

        if (token_start + token_length > n_tokens) {
            token_length = n_tokens - token_start;
        }

        // Create batch
        struct llama_batch batch = {
            .n_tokens = token_length,
            .token = &engine->tokens[token_start],
            .embd = NULL,
            .pos = NULL,
            .n_seq_id = NULL,
            .seq_id = NULL,
            .logits = NULL,
        };

        // Clear memory
        if (engine->mem != NULL) {
            llama_memory_clear(engine->mem, true);
        }

        // Always use llama_encode() for embedding generation
        int ret = llama_encode(engine->ctx, batch);

        if (ret != 0) {
            snprintf(result->err_msg, DBMEM_MAXERROR_SIZE, "llama_encode failed for chunk %d", chunk_idx);
            dbmem_embedding_free(result);
            return -1;
        }

        // Get embeddings
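        // With pooling NONE the context exposes one embedding per token, so the
        // last token's vector is used; any other pooling mode collapses the whole
        // sequence into a single embedding addressed by sequence id.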
        const float *emb_ptr = NULL;
        if (engine->pooling == LLAMA_POOLING_TYPE_NONE) {
            emb_ptr = llama_get_embeddings_ith(engine->ctx, token_length - 1);
        } else {
            emb_ptr = llama_get_embeddings_seq(engine->ctx, 0);
        }

        if (!emb_ptr) {
            snprintf(result->err_msg, DBMEM_MAXERROR_SIZE, "Failed to get embeddings for chunk %d", chunk_idx);
            dbmem_embedding_free(result);
            return -1;
        }

        // Store result
        chunk_result_t *chunk = &result->chunks[chunk_idx];
        chunk->index = chunk_idx;
        chunk->token_start = token_start;
        chunk->token_length = token_length;
        chunk->char_start = (int)(token_start * avg_chars_per_token);
        chunk->char_length = (int)(token_length * avg_chars_per_token);

        if (chunk->char_start > text_len) chunk->char_start = text_len;
        if (chunk->char_start + chunk->char_length > text_len) {
            chunk->char_length = text_len - chunk->char_start;
        }

        // Use pre-allocated pool if possible, otherwise malloc
        if (use_pool) {
            chunk->embedding = &engine->embedding_pool[chunk_idx * engine->n_embd];
        } else {
            chunk->embedding = (float *)dbmem_alloc(sizeof(float) * engine->n_embd);
            if (!chunk->embedding) {
                snprintf(result->err_msg, DBMEM_MAXERROR_SIZE, "Failed to allocate embedding");
                dbmem_embedding_free(result);
                return -1;
            }
        }

        // Copy embedding out of the llama context (memcpy is typically vectorized)
        dbmem_embedding_copy(chunk->embedding, emb_ptr, engine->n_embd);

        // Normalize if enabled
        if (engine->normalize) {
            dbmem_embedding_normalize(chunk->embedding, engine->n_embd);
        }

        // Update statistics
        engine->total_tokens_processed += token_length;
        engine->total_embeddings_generated++;
    }

    return 0;
}

void dbmem_local_engine_free (dbmem_local_engine_t *engine) {
    if (!engine) return;

    if (engine->embedding_pool) {
        dbmem_free(engine->embedding_pool);
        engine->embedding_pool = NULL;
    }
    if (engine->tokens) {
        dbmem_free(engine->tokens);
        engine->tokens = NULL;
    }
    if (engine->ctx) {
        llama_free(engine->ctx);
        engine->ctx = NULL;
    }
    if (engine->model) {
        llama_model_free(engine->model);
        engine->model = NULL;
    }
    llama_backend_free();
}
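
For reference, a minimal sketch of how this engine might be driven end to end. It uses only the functions visible in this file; the model path is a placeholder, and the public mechanism for releasing an embedding_result_t is an assumption (dbmem_embedding_free above is file-local):

#include <stdio.h>
#include "dbmem-lembed.h"

int main(void) {
    char err_msg[DBMEM_MAXERROR_SIZE];

    // Hypothetical GGUF path; any embedding model supported by llama.cpp
    dbmem_local_engine_t *engine = dbmem_local_engine_init("model.gguf", err_msg);
    if (!engine) {
        fprintf(stderr, "init failed: %s\n", err_msg);
        return 1;
    }

    // Optional: compile GPU shaders up front so the first real call is fast
    dbmem_local_engine_warmup(engine);

    embedding_result_t result;
    if (dbmem_local_compute_embedding(engine, "Hello, embeddings!", -1, &result) == 0) {
        printf("%d chunk(s), %d dimensions each\n", result.n_chunks, result.n_embd);
        // NOTE: releasing `result` is assumed to go through a public counterpart
        // of the file-local dbmem_embedding_free(), presumably in dbmem-lembed.h.
    } else {
        fprintf(stderr, "embedding failed: %s\n", result.err_msg);
    }

    dbmem_local_engine_free(engine);
    return 0;
}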
