Skip to content

Commit 557515b

Browse files
authored
graph : utilize ggml_build_forward_select() to avoid reallocations (ggml-org#18898)
* graph : avoid branches between embedding and token inputs
* models : make deepstack graphs (e.g. Qwen3 VL) have constant topology
* ci : enable -DGGML_SCHED_NO_REALLOC=ON for server CI
* cont : pad token embeddings to n_embd_inp
1 parent cb6caca commit 557515b

7 files changed

Lines changed: 69 additions & 53 deletions

File tree

.github/workflows/server.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ jobs:
7272
- name: Build
7373
id: cmake_build
7474
run: |
75-
cmake -B build -DLLAMA_BUILD_BORINGSSL=ON
75+
cmake -B build -DLLAMA_BUILD_BORINGSSL=ON -DGGML_SCHED_NO_REALLOC=ON
7676
cmake --build build --config ${{ matrix.build_type }} -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
7777
7878
- name: Python setup
@@ -108,7 +108,7 @@ jobs:
108108
- name: Build
109109
id: cmake_build
110110
run: |
111-
cmake -B build -DLLAMA_BUILD_BORINGSSL=ON
111+
cmake -B build -DLLAMA_BUILD_BORINGSSL=ON -DGGML_SCHED_NO_REALLOC=ON
112112
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
113113
114114
- name: Python setup

src/llama-context.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2903,7 +2903,7 @@ void llama_context::opt_epoch_iter(
29032903
};
29042904
ctx_compute_opt = ggml_init(params);
29052905
}
2906-
ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_tokens(), res->get_logits());
2906+
ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_inp_tokens(), res->get_logits());
29072907
ggml_opt_alloc(opt_ctx, train);
29082908

29092909
res->set_inputs(&ubatch);

src/llama-graph.cpp

Lines changed: 48 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
2323
}
2424

2525
if (ubatch->embd) {
26-
const int64_t n_embd = embd->ne[0];
26+
GGML_ASSERT(n_embd == embd->ne[0]);
27+
2728
const int64_t n_tokens = ubatch->n_tokens;
2829

2930
ggml_backend_tensor_set(embd, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(embd));
@@ -33,8 +34,8 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
3334
bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) {
3435
bool res = true;
3536

36-
res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
37-
res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[1] == params.ubatch.n_tokens);
37+
res &= (!params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
38+
res &= (!params.ubatch.embd) || (embd && embd->ne[1] == params.ubatch.n_tokens);
3839

3940
return res;
4041
}
@@ -634,7 +635,8 @@ int64_t llm_graph_result::get_max_nodes() const {
634635
}
635636

636637
void llm_graph_result::reset() {
637-
t_tokens = nullptr;
638+
t_inp_tokens = nullptr;
639+
t_inp_embd = nullptr;
638640
t_logits = nullptr;
639641
t_embd = nullptr;
640642
t_embd_pooled = nullptr;
@@ -1338,17 +1340,29 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
13381340

13391341
// input embeddings with optional lora
13401342
ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
1341-
const int64_t n_embd = hparams.n_embd_inp();
1343+
const int64_t n_embd_inp = hparams.n_embd_inp();
1344+
const int64_t n_embd = hparams.n_embd;
1345+
1346+
assert(n_embd_inp >= n_embd);
1347+
1348+
auto inp = std::make_unique<llm_graph_input_embd>(n_embd_inp);
1349+
1350+
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
1351+
cb(inp->tokens, "inp_tokens", -1);
1352+
ggml_set_input(inp->tokens);
1353+
res->t_inp_tokens = inp->tokens;
13421354

1343-
auto inp = std::make_unique<llm_graph_input_embd>();
1355+
inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_inp, ubatch.n_tokens);
1356+
cb(inp->embd, "inp_embd", -1);
1357+
ggml_set_input(inp->embd);
13441358

1345-
ggml_tensor * cur = nullptr;
1359+
// select one of the 2 inputs, based on the batch contents
1360+
// ref: https://github.com/ggml-org/llama.cpp/pull/18550
1361+
std::array<ggml_tensor *, 2> inps;
13461362

1347-
if (ubatch.token) {
1348-
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
1349-
//cb(inp->tokens, "inp_tokens", -1);
1350-
ggml_set_input(inp->tokens);
1351-
res->t_tokens = inp->tokens;
1363+
// token embeddings path (ubatch.token != nullptr)
1364+
{
1365+
auto & cur = inps[0];
13521366

13531367
cur = ggml_get_rows(ctx0, tok_embd, inp->tokens);
13541368

@@ -1369,19 +1383,36 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
13691383

13701384
cur = ggml_add(ctx0, cur, inpL_delta);
13711385
}
1372-
} else {
1373-
inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
1374-
ggml_set_input(inp->embd);
1386+
1387+
if (n_embd_inp != n_embd) {
1388+
cur = ggml_pad(ctx0, cur, hparams.n_embd_inp() - n_embd, 0, 0, 0);
1389+
}
1390+
}
1391+
1392+
// vector embeddings path (ubatch.embd != nullptr)
1393+
{
1394+
auto & cur = inps[1];
13751395

13761396
cur = inp->embd;
13771397
}
13781398

1399+
assert(ggml_are_same_shape (inps[0], inps[1]));
1400+
assert(ggml_are_same_stride(inps[0], inps[1]));
1401+
1402+
ggml_tensor * cur = ggml_build_forward_select(gf, inps.data(), inps.size(), ubatch.token ? 0 : 1);
1403+
1404+
if (n_embd_inp != n_embd) {
1405+
cur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0);
1406+
}
1407+
1408+
res->t_inp_embd = cur;
1409+
13791410
// For Granite architecture
13801411
if (hparams.f_embedding_scale != 0.0f) {
13811412
cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale);
13821413
}
13831414

1384-
cb(cur, "inp_embd", -1);
1415+
cb(cur, "embd", -1);
13851416

13861417
res->add_input(std::move(inp));
13871418

@@ -1480,7 +1511,7 @@ ggml_tensor * llm_graph_context::build_inp_cross_embd() const {
14801511
//}
14811512

14821513
const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd_inp();
1483-
const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
1514+
const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
14841515

14851516
cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc);
14861517
ggml_set_input(cur);

src/llama-graph.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
106106

107107
class llm_graph_input_embd : public llm_graph_input_i {
108108
public:
109-
llm_graph_input_embd() = default;
109+
llm_graph_input_embd(int64_t n_embd) : n_embd(n_embd) {}
110110
virtual ~llm_graph_input_embd() = default;
111111

112112
void set_input(const llama_ubatch * ubatch) override;
@@ -115,6 +115,8 @@ class llm_graph_input_embd : public llm_graph_input_i {
115115

116116
ggml_tensor * tokens = nullptr; // I32 [n_batch]
117117
ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
118+
119+
const int64_t n_embd = 0;
118120
};
119121

120122
class llm_graph_input_pos : public llm_graph_input_i {
@@ -566,7 +568,7 @@ class llm_graph_result {
566568

567569
virtual ~llm_graph_result() = default;
568570

569-
ggml_tensor * get_tokens() const { return t_tokens; }
571+
ggml_tensor * get_inp_tokens() const { return t_inp_tokens; }
570572
ggml_tensor * get_logits() const { return t_logits; }
571573
ggml_tensor * get_embd() const { return t_embd; }
572574
ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
@@ -593,7 +595,8 @@ class llm_graph_result {
593595
void set_params(const llm_graph_params & params);
594596

595597
// important graph nodes
596-
ggml_tensor * t_tokens = nullptr;
598+
ggml_tensor * t_inp_tokens = nullptr;
599+
ggml_tensor * t_inp_embd = nullptr; // [n_embd_inp, n_tokens]
597600
ggml_tensor * t_logits = nullptr;
598601
ggml_tensor * t_embd = nullptr;
599602
ggml_tensor * t_embd_pooled = nullptr;

src/models/gemma3n-iswa.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -245,12 +245,12 @@ ggml_tensor * llm_build_gemma3n_iswa::view_2d_slice(ggml_tensor * x, int idx) {
245245
// equivalent to get_per_layer_inputs() in python code
246246
// output shape: [n_embd_altup, n_layer, n_tokens]
247247
ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
248-
auto inp = std::make_unique<llm_graph_input_embd>();
248+
auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
249249
ggml_tensor * inp_per_layer;
250250
if (ubatch.token) {
251251
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
252252
ggml_set_input(inp->tokens);
253-
res->t_tokens = inp->tokens;
253+
res->t_inp_tokens = inp->tokens;
254254
inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens);
255255
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
256256
inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup));

src/models/qwen3vl-moe.cpp

Lines changed: 5 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22

33
llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
44
const size_t n_deepstack_layers = hparams.n_deepstack_layers;
5-
const int64_t n_embd = hparams.n_embd;
5+
6+
const int64_t n_embd = hparams.n_embd;
67
const int64_t n_embd_head = hparams.n_embd_head_v;
78

89
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -16,17 +17,6 @@ llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_
1617
int sections[4];
1718
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
1819

19-
std::vector<ggml_tensor *> deepstack_features(n_deepstack_layers, nullptr);
20-
21-
if (ubatch.embd) {
22-
// Image input: split main embd and deepstack embds
23-
ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], 0);
24-
for (size_t i = 0; i < n_deepstack_layers; i++) {
25-
deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float));
26-
}
27-
inpL = inpL_main;
28-
}
29-
3020
// inp_pos - contains the positions
3121
ggml_tensor * inp_pos = build_inp_pos();
3222

@@ -120,8 +110,9 @@ llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_
120110
cur = build_cvec(cur, il);
121111
cb(cur, "l_out", il);
122112

123-
if (ubatch.embd && (size_t)il < n_deepstack_layers) {
124-
cur = ggml_add(ctx0, cur, deepstack_features[il]);
113+
if (il < (int) n_deepstack_layers) {
114+
ggml_tensor * ds = ggml_view_2d(ctx0, res->t_inp_embd, n_embd, n_tokens, res->t_inp_embd->nb[1], (il + 1) * n_embd * sizeof(float));
115+
cur = ggml_add(ctx0, cur, ds);
125116
cb(cur, "deepstack_out", il);
126117
}
127118

src/models/qwen3vl.cpp

Lines changed: 5 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22

33
llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
44
const size_t n_deepstack_layers = hparams.n_deepstack_layers;
5-
const int64_t n_embd = hparams.n_embd;
5+
6+
const int64_t n_embd = hparams.n_embd;
67
const int64_t n_embd_head = hparams.n_embd_head_v;
78

89
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -16,17 +17,6 @@ llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_
1617
int sections[4];
1718
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
1819

19-
std::vector<ggml_tensor *> deepstack_features(n_deepstack_layers, nullptr);
20-
21-
if (ubatch.embd) {
22-
// Image input: split main embd and deepstack embds
23-
ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], 0);
24-
for (size_t i = 0; i < n_deepstack_layers; i++) {
25-
deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float));
26-
}
27-
inpL = inpL_main;
28-
}
29-
3020
// inp_pos - contains the positions
3121
ggml_tensor * inp_pos = build_inp_pos();
3222

@@ -113,8 +103,9 @@ llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_
113103
cur = build_cvec(cur, il);
114104
cb(cur, "l_out", il);
115105

116-
if (ubatch.embd && (size_t)il < n_deepstack_layers) {
117-
cur = ggml_add(ctx0, cur, deepstack_features[il]);
106+
if (il < (int) n_deepstack_layers) {
107+
ggml_tensor * ds = ggml_view_2d(ctx0, res->t_inp_embd, n_embd, n_tokens, res->t_inp_embd->nb[1], (il + 1) * n_embd * sizeof(float));
108+
cur = ggml_add(ctx0, cur, ds);
118109
cb(cur, "deepstack_out", il);
119110
}
120111

0 commit comments

Comments (0)