Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions .github/workflows/build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,15 @@ jobs:

- name: Xmake Build & Install
run: |
xmake
xmake install

xmake -y
xmake install -y

- name: Install System Dependencies (Ubuntu)
if: runner.os == 'Linux'
run: |
sudo apt-get update
sudo apt-get install -y libopenblas-dev

- name: Install Python
run: |
cd python
Expand Down
61 changes: 61 additions & 0 deletions chat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import os
from transformers import AutoTokenizer
from python.llaisys.models.qwen2 import Qwen2
from python.llaisys.libllaisys import DeviceType

MODEL_PATH = "/home/koishiyo/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/ad9f0ae0864d7fbcd1cd905e3c6c5b069cc8b562"

def main():
    """Interactive streaming chat REPL driven by the LLAISYS Qwen2 engine.

    Keeps a server-side KV cache across turns: only the newly typed turn is
    tokenized and fed to the engine, never the whole conversation.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = Qwen2(MODEL_PATH, device=DeviceType.CPU)

    print("\n🚀 LLAISYS 有状态流式推理引擎已启动!")
    is_first_turn = True

    while True:
        user_input = input("🧑 你: ")
        if user_input.strip().lower() in ('quit', 'exit'):
            break
        if not user_input.strip():
            continue

        if is_first_turn:
            # First turn: build the full prompt with the official chat template.
            messages = [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": user_input}
            ]
            prompt_str = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            is_first_turn = False
        else:
            # Later turns: re-append the <|im_end|> that terminated the previous
            # assistant reply so the engine-side KV cache stays consistent.
            prompt_str = f"<|im_end|>\n<|im_start|>user\n{user_input}<|im_end|>\n<|im_start|>assistant\n"

        # add_special_tokens=False is mandatory: the template text already
        # contains the control tokens, and extra ones would corrupt the cache.
        new_input_tokens = tokenizer.encode(prompt_str, add_special_tokens=False)

        print("🤖 AI: ", end="", flush=True)

        all_generated_tokens = []
        printed_text_len = 0

        for next_token in model.stream_generate(
            new_input_tokens,
            max_new_tokens=400,
            temperature=0.7,
            top_p=0.9
        ):
            all_generated_tokens.append(next_token)

            # Re-decode the whole generation each step and print only the new
            # suffix, so multi-token (e.g. CJK) characters are emitted intact.
            current_text = tokenizer.decode(all_generated_tokens, skip_special_tokens=True)

            # BUGFIX: a trailing U+FFFD means the last token stopped in the
            # middle of a multi-byte character — hold the text back until the
            # remaining token(s) arrive instead of printing garbage.
            if current_text.endswith("\ufffd"):
                continue

            new_text = current_text[printed_text_len:]
            if new_text:
                print(new_text, end="", flush=True)
                printed_text_len = len(current_text)

        print("\n")  # end of this turn


if __name__ == "__main__":
    main()
174 changes: 174 additions & 0 deletions chat_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
import uvicorn
import json
from fastapi import FastAPI
from fastapi.responses import HTMLResponse, StreamingResponse
from pydantic import BaseModel

# ==========================================
# 1. 导入你的模型和分词器
# ==========================================
from llaisys.models.qwen2 import Qwen2
from transformers import AutoTokenizer

# NOTE(review): replace this path with the real Qwen2 model folder on your machine.
MODEL_PATH = "/home/koishiyo/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/ad9f0ae0864d7fbcd1cd905e3c6c5b069cc8b562"

# Heavy work happens at import time: tokenizer load, then the C++ engine
# loads the full model weights into memory.
print(f"正在加载分词器: {MODEL_PATH}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

print("正在将大模型权重载入 C++ 引擎内存,请稍候...")
model = Qwen2(MODEL_PATH)
print("模型加载完毕!LLAISYS 引擎启动成功!")

# Initialize FastAPI.
app = FastAPI(title="LLAISYS Chatbot")

class ChatRequest(BaseModel):
    # JSON body of POST /v1/chat/completions: a single user message.
    message: str

# ==========================================
# 2. Front-end HTML + CSS + JS (streaming typewriter effect)
# ==========================================
# Single-page chat UI served verbatim by get_ui(). The embedded JS reads the
# /v1/chat/completions SSE stream and appends each delta to the last bubble.
# NOTE: everything inside the string (including its JS comments) is runtime
# content sent to the browser — do not translate or reformat it.
html_content = """
<!DOCTYPE html>
<html>
<head>
<title>LLAISYS 纯 C++ 推理引擎</title>
<meta charset="utf-8">
<style>
body { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; background-color: #f0f2f5; margin: 0; display: flex; justify-content: center; height: 100vh; align-items: center; }
.chat-container { width: 100%; max-width: 800px; height: 90vh; background: white; border-radius: 12px; box-shadow: 0 4px 20px rgba(0,0,0,0.1); display: flex; flex-direction: column; overflow: hidden; }
.header { background: #1a73e8; color: white; padding: 20px; text-align: center; font-size: 1.2em; font-weight: bold; }
.chat-box { flex: 1; padding: 20px; overflow-y: auto; display: flex; flex-direction: column; gap: 15px; }
.message { max-width: 80%; padding: 12px 16px; border-radius: 8px; line-height: 1.6; word-wrap: break-word; }
.user-message { background: #e3f2fd; color: #0d47a1; align-self: flex-end; border-bottom-right-radius: 0; }
.bot-message { background: #f1f3f4; color: #202124; align-self: flex-start; border-bottom-left-radius: 0; }
.input-area { display: flex; padding: 20px; background: #fff; border-top: 1px solid #e0e0e0; }
input[type="text"] { flex: 1; padding: 12px; border: 1px solid #ccc; border-radius: 24px; outline: none; font-size: 16px; }
button { margin-left: 10px; padding: 12px 24px; background: #1a73e8; color: white; border: none; border-radius: 24px; cursor: pointer; font-size: 16px; font-weight: bold; transition: 0.2s;}
button:hover { background: #1557b0; }
</style>
</head>
<body>
<div class="chat-container">
<div class="header">LLAISYS - 纯 C++ 驱动的 AI 助手</div>
<div class="chat-box" id="chat-box">
<div class="message bot-message">你好!我是由你从零手写的 LLAISYS 推理引擎驱动的。我们来聊天吧!</div>
</div>
<div class="input-area">
<input type="text" id="user-input" placeholder="输入你想说的话..." onkeypress="handleKeyPress(event)">
<button onclick="sendMessage()">发送</button>
</div>
</div>

<script>
const chatBox = document.getElementById('chat-box');
const userInput = document.getElementById('user-input');

function appendMessage(text, className, id = null) {
const msgDiv = document.createElement('div');
msgDiv.className = `message ${className}`;
msgDiv.innerText = text;
if (id) msgDiv.id = id;
chatBox.appendChild(msgDiv);
chatBox.scrollTop = chatBox.scrollHeight;
return msgDiv;
}

async function sendMessage() {
const text = userInput.value.trim();
if (!text) return;

// 显示用户消息
appendMessage(text, 'user-message');
userInput.value = '';

// 创建一个空的气泡,准备接收流式字符
const loadingId = 'loading-' + Date.now();
const botMsgDiv = appendMessage('', 'bot-message', loadingId);

try {
const response = await fetch('/v1/chat/completions', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ message: text })
});

// 流式读取 SSE 数据
const reader = response.body.getReader();
const decoder = new TextDecoder('utf-8');
let fullText = "";

while (true) {
const { done, value } = await reader.read();
if (done) break;

const chunk = decoder.decode(value, { stream: true });
const lines = chunk.split('\\n');

for (let line of lines) {
if (line.startsWith('data: ')) {
const data = JSON.parse(line.slice(6));
fullText += data.delta;
botMsgDiv.innerText = fullText; // 逐字更新气泡
chatBox.scrollTop = chatBox.scrollHeight; // 自动滚到底部
}
}
}
} catch (error) {
botMsgDiv.innerText = '网络连接失败,请检查 C++ 引擎状态。';
}
}

function handleKeyPress(e) { if (e.key === 'Enter') sendMessage(); }
</script>
</body>
</html>
"""

@app.get("/")
async def get_ui():
    """Serve the single-page chat front-end."""
    page = HTMLResponse(content=html_content)
    return page

# ==========================================
# 3. 后端流式推理接口 (真正呼叫 C++ 底层)
# ==========================================
@app.post("/v1/chat/completions")
async def chat_api(req: ChatRequest):
    """Stream a chat completion as Server-Sent Events.

    The request body carries a single user message; the response is a
    text/event-stream where each event is a JSON object {"delta": "..."}
    containing only the newly generated text.
    """

    def stream_generator():
        try:
            # 1. Build the prompt with the official chat template so the
            #    control tokens are placed exactly as the model expects.
            messages = [
                {"role": "system", "content": "你是一个乐于助人的AI助手。"},
                {"role": "user", "content": req.message}
            ]
            prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            # BUGFIX: add_special_tokens=False — the template text already
            # contains the special tokens; letting encode() add more would
            # corrupt the prompt (same convention as the CLI client).
            input_ids = tokenizer.encode(prompt_text, add_special_tokens=False)

            # 2. Call the engine's streaming generator; the sampling
            #    parameters are forwarded to the native sampler.
            generated_ids = []
            emitted_len = 0
            for token_id in model.stream_generate(
                inputs=input_ids,
                max_new_tokens=512,
                temperature=0.8,
                top_p=0.9,
                top_k=50
            ):
                generated_ids.append(token_id)

                # BUGFIX: decode the whole generation and emit only the new
                # suffix. Decoding one token at a time splits multi-token
                # (e.g. CJK) characters and streams U+FFFD garbage.
                text = tokenizer.decode(generated_ids, skip_special_tokens=True)
                if text.endswith("\ufffd"):
                    # Last token stopped mid-character; wait for the rest.
                    continue

                delta = text[emitted_len:]
                if delta:
                    emitted_len = len(text)
                    # 3. Wrap the delta in Server-Sent Events (SSE) framing.
                    yield f"data: {json.dumps({'delta': delta}, ensure_ascii=False)}\n\n"

        except Exception as e:
            # Surface engine failures to the client instead of dropping the stream.
            print(f"推理时发生错误: {e}")
            yield f"data: {json.dumps({'delta': '[Engine Error]'}, ensure_ascii=False)}\n\n"

    return StreamingResponse(stream_generator(), media_type="text/event-stream")

if __name__ == "__main__":
    # Launch the development server (single process, localhost only).
    uvicorn.run(app, host="127.0.0.1", port=8000)
92 changes: 57 additions & 35 deletions include/llaisys/models/qwen2.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,40 +3,62 @@

#include "../tensor.h"

__C {
/* Qwen2 model hyper-parameters (filled by the loader). */
struct LlaisysQwen2Meta {
    llaisysDataType_t dtype;
    /* nlayer: layers, hs: hidden size, nh: attention heads, nkvh: KV heads,
       dh: head dim, di: FFN intermediate size, maxseq: max sequence length,
       voc: vocabulary size */
    size_t nlayer, hs, nh, nkvh, dh, di, maxseq, voc;
    float epsilon, theta; /* RMSNorm epsilon, RoPE theta */
    int64_t end_token;    /* EOS token id */
};

/* Weight tensors for one model; pointer members are per-layer arrays. */
struct LlaisysQwen2Weights {
    llaisysTensor_t in_embed;
    llaisysTensor_t out_embed;
    llaisysTensor_t out_norm_w; // a.k.a. model.norm.weight
    llaisysTensor_t *attn_norm_w; // a.k.a. input_layernorm.weight
    llaisysTensor_t *attn_q_w;
    llaisysTensor_t *attn_q_b;
    llaisysTensor_t *attn_k_w;
    llaisysTensor_t *attn_k_b;
    llaisysTensor_t *attn_v_w;
    llaisysTensor_t *attn_v_b;
    llaisysTensor_t *attn_o_w;
    llaisysTensor_t *mlp_norm_w; // a.k.a. post_attention_layernorm.weight
    llaisysTensor_t *mlp_gate_w;
    llaisysTensor_t *mlp_up_w;
    llaisysTensor_t *mlp_down_w;
};

/* Opaque model handle. */
struct LlaisysQwen2Model;

/* Create a model instance on the given device(s); caller owns the handle. */
__export struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice);

/* Destroy the model and release everything it owns. */
__export void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model * model);

/* Expose the weight container so the caller can fill the tensor data. */
__export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model);

/* Run inference over ntoken ids; presumably returns the next token id
   (int64_t return) — TODO confirm against the implementation. */
__export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken);
#ifdef __cplusplus
extern "C" {
#endif

// Model hyper-parameter metadata.
typedef struct {
    int dtype; // 0=F32, 1=F16...
               // NOTE(review): this was llaisysDataType_t before — confirm the
               // int encoding matches the Python/ctypes side of the ABI.
    size_t nlayer; // number of transformer layers
    size_t hs; // Hidden Size
    size_t nh; // Num Attention Heads
    size_t nkvh; // Num KV Heads
    size_t dh; // Head Dim (hs / nh)
    size_t di; // Intermediate Size (FFN)
    size_t maxseq; // Max Position Embeddings
    size_t voc; // Vocab Size
    float epsilon; // RMS Norm Epsilon
    float theta; // RoPE Theta
    int64_t end_token; // EOS Token ID
} LlaisysQwen2Meta;

// Weight pointer container (arrays allocated on the C++ side, data filled
// in from the Python side).
typedef struct {
    llaisysTensor_t in_embed;
    llaisysTensor_t out_embed;
    llaisysTensor_t out_norm_w;

    // The following are arrays of tensors, each of length nlayer.
    llaisysTensor_t *attn_norm_w;
    llaisysTensor_t *attn_q_w;
    llaisysTensor_t *attn_q_b;
    llaisysTensor_t *attn_k_w;
    llaisysTensor_t *attn_k_b;
    llaisysTensor_t *attn_v_w;
    llaisysTensor_t *attn_v_b;
    llaisysTensor_t *attn_o_w; // Qwen typically has no o_proj bias

    llaisysTensor_t *mlp_norm_w;
    llaisysTensor_t *mlp_gate_w;
    llaisysTensor_t *mlp_up_w;
    llaisysTensor_t *mlp_down_w;
} LlaisysQwen2Weights;

// Opaque model handle.
struct LlaisysQwen2Model;

// Exported API.
__export struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice);

__export void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model *model);

__export LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model *model);

// Run inference over ntoken ids with the given sampling parameters
// (temperature / top_p / top_k feed the sampler); presumably returns the
// sampled next token id — TODO confirm against the implementation.
__export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model *model, int64_t *token_ids, size_t ntoken, float temperature, float top_p, size_t top_k);

#ifdef __cplusplus
}
#endif

#endif // LLAISYS_MODELS_QWEN2_H
Loading