Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 20 additions & 5 deletions src/llama-graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2011,6 +2011,7 @@ ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_no_cache * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * wo_s,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
Expand Down Expand Up @@ -2044,7 +2045,7 @@ ggml_tensor * llm_graph_context::build_attn(
cb(cur, "kqv_out", il);

if (wo) {
cur = build_lora_mm(wo, cur);
cur = build_lora_mm(wo, cur, wo_s);
}

if (wo_b) {
Expand Down Expand Up @@ -2095,6 +2096,7 @@ ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_kv * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * wo_s,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
Expand Down Expand Up @@ -2146,10 +2148,15 @@ ggml_tensor * llm_graph_context::build_attn(
}

if (wo) {
cur = build_lora_mm(wo, cur);
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) {
// GLM4, GLM4_MOE, and JAIS2 seem to have numerical issues with half-precision accumulators
cur = build_lora_mm(wo, cur);
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
if (wo_s) {
cur = ggml_mul(ctx0, cur, wo_s);
}
} else {
cur = build_lora_mm(wo, cur, wo_s);
}
}

Expand Down Expand Up @@ -2193,6 +2200,7 @@ ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_k * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * wo_s,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
Expand Down Expand Up @@ -2227,10 +2235,15 @@ ggml_tensor * llm_graph_context::build_attn(
cb(cur, "kqv_out", il);

if (wo) {
cur = build_lora_mm(wo, cur);
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
// GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
cur = build_lora_mm(wo, cur);
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
if (wo_s) {
cur = ggml_mul(ctx0, cur, wo_s);
}
} else {
cur = build_lora_mm(wo, cur, wo_s);
Comment on lines 2238 to +2246
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe a follow-up PR could fix the order of the build_lora_mm arguments (e.g. cur, wo, wo_s) and add an optional precision argument, to avoid this branching.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it will be more manageable after merging #21245

}
}

Expand All @@ -2245,6 +2258,7 @@ ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_kv_iswa * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * wo_s,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
Expand Down Expand Up @@ -2313,7 +2327,7 @@ ggml_tensor * llm_graph_context::build_attn(
}

if (wo) {
cur = build_lora_mm(wo, cur);
cur = build_lora_mm(wo, cur, wo_s);
}

if (wo_b) {
Expand Down Expand Up @@ -2344,6 +2358,7 @@ ggml_tensor * llm_graph_context::build_attn(
llm_graph_input_attn_cross * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * wo_s,
ggml_tensor * q_cur,
ggml_tensor * k_cur,
ggml_tensor * v_cur,
Expand All @@ -2368,7 +2383,7 @@ ggml_tensor * llm_graph_context::build_attn(
cb(cur, "kqv_out", il);

if (wo) {
cur = build_lora_mm(wo, cur);
cur = build_lora_mm(wo, cur, wo_s);
}

if (wo_b) {
Expand Down
5 changes: 5 additions & 0 deletions src/llama-graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -892,6 +892,7 @@ struct llm_graph_context {
llm_graph_input_attn_no_cache * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * wo_s,
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
Expand All @@ -907,6 +908,7 @@ struct llm_graph_context {
llm_graph_input_attn_kv * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * wo_s,
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
Expand All @@ -922,6 +924,7 @@ struct llm_graph_context {
llm_graph_input_attn_k * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * wo_s,
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
Expand All @@ -938,6 +941,7 @@ struct llm_graph_context {
llm_graph_input_attn_kv_iswa * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * wo_s,
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
Expand All @@ -953,6 +957,7 @@ struct llm_graph_context {
llm_graph_input_attn_cross * inp,
ggml_tensor * wo,
ggml_tensor * wo_b,
ggml_tensor * wo_s,
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
Expand Down
4 changes: 2 additions & 2 deletions src/models/afmoe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

cur = build_attn(inp_attn,
NULL, NULL, // wo will be applied after gating
NULL, NULL, NULL, // wo will be applied after gating
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);

Expand All @@ -91,7 +91,7 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
cb(cur, "attn_gated", il);

// now apply output projection
cur = build_lora_mm(model.layers[il].wo, cur);
cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
cb(cur, "attn_o_proj", il);
}

Expand Down
4 changes: 1 addition & 3 deletions src/models/apertus.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
#include "models.h"



llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v();

Expand Down Expand Up @@ -62,7 +60,7 @@ llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_
cb(Vcur, "Vcur_pos", il);

cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}
Expand Down
3 changes: 1 addition & 2 deletions src/models/arcee.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#include "models.h"


llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v();

Expand Down Expand Up @@ -78,7 +77,7 @@ llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);

cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}
Expand Down
2 changes: 1 addition & 1 deletion src/models/arctic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_pa
cb(Vcur, "Vcur", il);

cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

Expand Down
3 changes: 1 addition & 2 deletions src/models/baichuan.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#include "models.h"


llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v();

Expand Down Expand Up @@ -67,7 +66,7 @@ llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);

cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

Expand Down
2 changes: 1 addition & 1 deletion src/models/bailingmoe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_
cb(Vcur, "Vcur", il);

cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
}

Expand Down
2 changes: 1 addition & 1 deletion src/models/bailingmoe2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const ll
cb(Vcur, "Vcur", il);

cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}

Expand Down
2 changes: 1 addition & 1 deletion src/models/bert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);

cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
cb(cur, "kqv_out", il);
}
Expand Down
2 changes: 1 addition & 1 deletion src/models/bitnet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
cb(Vcur, "Vcur", il);

cur = build_attn(inp_attn,
NULL, NULL,
NULL, NULL, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);

cur = build_norm(cur,
Expand Down
2 changes: 1 addition & 1 deletion src/models/bloom.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_para
cb(Vcur, "Vcur", il);

cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

Expand Down
2 changes: 1 addition & 1 deletion src/models/chameleon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ llm_build_chameleon::llm_build_chameleon(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);

cur = build_attn(inp_attn,
model.layers[il].wo, nullptr,
model.layers[il].wo, nullptr, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

Expand Down
2 changes: 1 addition & 1 deletion src/models/chatglm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ llm_build_chatglm::llm_build_chatglm(const llama_model & model, const llm_graph_
cb(Vcur, "Vcur", il);

cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

Expand Down
2 changes: 1 addition & 1 deletion src/models/codeshell.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ llm_build_codeshell::llm_build_codeshell(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);

cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

Expand Down
6 changes: 4 additions & 2 deletions src/models/cogvlm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,20 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa

for (int il = 0; il < n_layer; ++il) {
// get either the text or image weight tensors
ggml_tensor *wqkv, *wo;
ggml_tensor *wqkv, *wo, *wo_s;
ggml_tensor *ffn_gate, *ffn_down, *ffn_up;

if (is_text) {
wqkv = model.layers[il].wqkv;
wo = model.layers[il].wo;
wo_s = model.layers[il].wo_s;
ffn_gate = model.layers[il].ffn_gate;
ffn_down = model.layers[il].ffn_down;
ffn_up = model.layers[il].ffn_up;
} else {
wqkv = model.layers[il].visexp_attn_wqkv;
wo = model.layers[il].visexp_attn_wo;
wo_s = nullptr;
ffn_gate = model.layers[il].visexp_ffn_gate;
ffn_down = model.layers[il].visexp_ffn_down;
ffn_up = model.layers[il].visexp_ffn_up;
Expand All @@ -64,7 +66,7 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa
Kcur = ggml_rope(ctx0, Kcur, inp_pos, n_embd_head, rope_type);

cur = build_attn(inp_attn,
wo, nullptr,
wo, nullptr, wo_s,
Qcur, Kcur, Vcur,
nullptr, nullptr, nullptr,
kq_scale, il);
Expand Down
2 changes: 1 addition & 1 deletion src/models/cohere2-iswa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const
cb(Vcur, "Vcur", il);

cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

Expand Down
2 changes: 1 addition & 1 deletion src/models/command-r.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ llm_build_command_r::llm_build_command_r(const llama_model & model, const llm_gr
cb(Vcur, "Vcur", il);

cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}
if (il == n_layer - 1 && inp_out_ids) {
Expand Down
2 changes: 1 addition & 1 deletion src/models/dbrx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);

cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}

Expand Down
4 changes: 1 addition & 3 deletions src/models/deci.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
#include "models.h"



llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v();

Expand Down Expand Up @@ -80,7 +78,7 @@ llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params
cb(Vcur, "Vcur", il);

cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
}
if (il == n_layer - 1 && inp_out_ids) {
Expand Down
2 changes: 1 addition & 1 deletion src/models/deepseek.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_grap
cb(Vcur, "Vcur", il);

cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
}
if (il == n_layer - 1 && inp_out_ids) {
Expand Down
6 changes: 3 additions & 3 deletions src/models/deepseek2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
cb(Kcur, "k_pe", il);

cur = build_attn(inp_attn_kv,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
}
Expand Down Expand Up @@ -182,7 +182,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr

// note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
cur = build_attn(inp_attn_k,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
} else {
ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
Expand Down Expand Up @@ -219,7 +219,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr

// note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
cur = build_attn(inp_attn_kv,
model.layers[il].wo, NULL,
model.layers[il].wo, NULL, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
}
}
Expand Down
Loading