Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions .github/workflows/build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,15 @@ jobs:

- name: Xmake Build & Install
run: |
xmake
xmake install

xmake -y
xmake install -y

- name: Install System Dependencies (Ubuntu)
if: runner.os == 'Linux'
run: |
sudo apt-get update
sudo apt-get install -y libopenblas-dev

- name: Install Python
run: |
cd python
Expand Down
61 changes: 61 additions & 0 deletions chat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import os
from transformers import AutoTokenizer
from python.llaisys.models.qwen2 import Qwen2
from python.llaisys.libllaisys import DeviceType

MODEL_PATH = "/home/koishiyo/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/ad9f0ae0864d7fbcd1cd905e3c6c5b069cc8b562"

def main():
    """Interactive streaming chat REPL driven by the LLAISYS Qwen2 engine.

    Keeps a server-side KV cache across turns: only the newly typed turn is
    tokenized and fed to the engine, never the whole conversation.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = Qwen2(MODEL_PATH, device=DeviceType.CPU)

    print("\n🚀 LLAISYS 有状态流式推理引擎已启动!")
    is_first_turn = True

    while True:
        user_input = input("🧑 你: ")
        if user_input.strip().lower() in ('quit', 'exit'):
            break
        if not user_input.strip():
            continue

        if is_first_turn:
            # First turn: build the full prompt with the official chat template.
            messages = [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": user_input}
            ]
            prompt_str = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            is_first_turn = False
        else:
            # Later turns: re-append the <|im_end|> that terminated the previous
            # assistant reply so the engine-side KV cache stays consistent.
            prompt_str = f"<|im_end|>\n<|im_start|>user\n{user_input}<|im_end|>\n<|im_start|>assistant\n"

        # add_special_tokens=False is mandatory: the template text already
        # contains the control tokens, and extra ones would corrupt the cache.
        new_input_tokens = tokenizer.encode(prompt_str, add_special_tokens=False)

        print("🤖 AI: ", end="", flush=True)

        all_generated_tokens = []
        printed_text_len = 0

        for next_token in model.stream_generate(
            new_input_tokens,
            max_new_tokens=400,
            temperature=0.7,
            top_p=0.9
        ):
            all_generated_tokens.append(next_token)

            # Re-decode the whole generation each step and print only the new
            # suffix, so multi-token (e.g. CJK) characters are emitted intact.
            current_text = tokenizer.decode(all_generated_tokens, skip_special_tokens=True)

            # BUGFIX: a trailing U+FFFD means the last token stopped in the
            # middle of a multi-byte character — hold the text back until the
            # remaining token(s) arrive instead of printing garbage.
            if current_text.endswith("\ufffd"):
                continue

            new_text = current_text[printed_text_len:]
            if new_text:
                print(new_text, end="", flush=True)
                printed_text_len = len(current_text)

        print("\n")  # end of this turn


if __name__ == "__main__":
    main()
174 changes: 174 additions & 0 deletions chat_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
import uvicorn
import json
from fastapi import FastAPI
from fastapi.responses import HTMLResponse, StreamingResponse
from pydantic import BaseModel

# ==========================================
# 1. 导入你的模型和分词器
# ==========================================
from llaisys.models.qwen2 import Qwen2
from transformers import AutoTokenizer

# NOTE(review): replace this path with the real Qwen2 model folder on your machine.
MODEL_PATH = "/home/koishiyo/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/ad9f0ae0864d7fbcd1cd905e3c6c5b069cc8b562"

# Heavy work happens at import time: tokenizer load, then the C++ engine
# loads the full model weights into memory.
print(f"正在加载分词器: {MODEL_PATH}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

print("正在将大模型权重载入 C++ 引擎内存,请稍候...")
model = Qwen2(MODEL_PATH)
print("模型加载完毕!LLAISYS 引擎启动成功!")

# Initialize FastAPI.
app = FastAPI(title="LLAISYS Chatbot")

class ChatRequest(BaseModel):
    # JSON body of POST /v1/chat/completions: a single user message.
    message: str

# ==========================================
# 2. Front-end HTML + CSS + JS (streaming typewriter effect)
# ==========================================
# Single-page chat UI served verbatim by get_ui(). The embedded JS reads the
# /v1/chat/completions SSE stream and appends each delta to the last bubble.
# NOTE: everything inside the string (including its JS comments) is runtime
# content sent to the browser — do not translate or reformat it.
html_content = """
<!DOCTYPE html>
<html>
<head>
<title>LLAISYS 纯 C++ 推理引擎</title>
<meta charset="utf-8">
<style>
body { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; background-color: #f0f2f5; margin: 0; display: flex; justify-content: center; height: 100vh; align-items: center; }
.chat-container { width: 100%; max-width: 800px; height: 90vh; background: white; border-radius: 12px; box-shadow: 0 4px 20px rgba(0,0,0,0.1); display: flex; flex-direction: column; overflow: hidden; }
.header { background: #1a73e8; color: white; padding: 20px; text-align: center; font-size: 1.2em; font-weight: bold; }
.chat-box { flex: 1; padding: 20px; overflow-y: auto; display: flex; flex-direction: column; gap: 15px; }
.message { max-width: 80%; padding: 12px 16px; border-radius: 8px; line-height: 1.6; word-wrap: break-word; }
.user-message { background: #e3f2fd; color: #0d47a1; align-self: flex-end; border-bottom-right-radius: 0; }
.bot-message { background: #f1f3f4; color: #202124; align-self: flex-start; border-bottom-left-radius: 0; }
.input-area { display: flex; padding: 20px; background: #fff; border-top: 1px solid #e0e0e0; }
input[type="text"] { flex: 1; padding: 12px; border: 1px solid #ccc; border-radius: 24px; outline: none; font-size: 16px; }
button { margin-left: 10px; padding: 12px 24px; background: #1a73e8; color: white; border: none; border-radius: 24px; cursor: pointer; font-size: 16px; font-weight: bold; transition: 0.2s;}
button:hover { background: #1557b0; }
</style>
</head>
<body>
<div class="chat-container">
<div class="header">LLAISYS - 纯 C++ 驱动的 AI 助手</div>
<div class="chat-box" id="chat-box">
<div class="message bot-message">你好!我是由你从零手写的 LLAISYS 推理引擎驱动的。我们来聊天吧!</div>
</div>
<div class="input-area">
<input type="text" id="user-input" placeholder="输入你想说的话..." onkeypress="handleKeyPress(event)">
<button onclick="sendMessage()">发送</button>
</div>
</div>

<script>
const chatBox = document.getElementById('chat-box');
const userInput = document.getElementById('user-input');

function appendMessage(text, className, id = null) {
const msgDiv = document.createElement('div');
msgDiv.className = `message ${className}`;
msgDiv.innerText = text;
if (id) msgDiv.id = id;
chatBox.appendChild(msgDiv);
chatBox.scrollTop = chatBox.scrollHeight;
return msgDiv;
}

async function sendMessage() {
const text = userInput.value.trim();
if (!text) return;

// 显示用户消息
appendMessage(text, 'user-message');
userInput.value = '';

// 创建一个空的气泡,准备接收流式字符
const loadingId = 'loading-' + Date.now();
const botMsgDiv = appendMessage('', 'bot-message', loadingId);

try {
const response = await fetch('/v1/chat/completions', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ message: text })
});

// 流式读取 SSE 数据
const reader = response.body.getReader();
const decoder = new TextDecoder('utf-8');
let fullText = "";

while (true) {
const { done, value } = await reader.read();
if (done) break;

const chunk = decoder.decode(value, { stream: true });
const lines = chunk.split('\\n');

for (let line of lines) {
if (line.startsWith('data: ')) {
const data = JSON.parse(line.slice(6));
fullText += data.delta;
botMsgDiv.innerText = fullText; // 逐字更新气泡
chatBox.scrollTop = chatBox.scrollHeight; // 自动滚到底部
}
}
}
} catch (error) {
botMsgDiv.innerText = '网络连接失败,请检查 C++ 引擎状态。';
}
}

function handleKeyPress(e) { if (e.key === 'Enter') sendMessage(); }
</script>
</body>
</html>
"""

@app.get("/")
async def get_ui():
    """Serve the single-page chat front-end."""
    page = HTMLResponse(content=html_content)
    return page

# ==========================================
# 3. 后端流式推理接口 (真正呼叫 C++ 底层)
# ==========================================
@app.post("/v1/chat/completions")
async def chat_api(req: ChatRequest):
    """Stream a chat completion as Server-Sent Events.

    The request body carries a single user message; the response is a
    text/event-stream where each event is a JSON object {"delta": "..."}
    containing only the newly generated text.
    """

    def stream_generator():
        try:
            # 1. Build the prompt with the official chat template so the
            #    control tokens are placed exactly as the model expects.
            messages = [
                {"role": "system", "content": "你是一个乐于助人的AI助手。"},
                {"role": "user", "content": req.message}
            ]
            prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            # BUGFIX: add_special_tokens=False — the template text already
            # contains the special tokens; letting encode() add more would
            # corrupt the prompt (same convention as the CLI client).
            input_ids = tokenizer.encode(prompt_text, add_special_tokens=False)

            # 2. Call the engine's streaming generator; the sampling
            #    parameters are forwarded to the native sampler.
            generated_ids = []
            emitted_len = 0
            for token_id in model.stream_generate(
                inputs=input_ids,
                max_new_tokens=512,
                temperature=0.8,
                top_p=0.9,
                top_k=50
            ):
                generated_ids.append(token_id)

                # BUGFIX: decode the whole generation and emit only the new
                # suffix. Decoding one token at a time splits multi-token
                # (e.g. CJK) characters and streams U+FFFD garbage.
                text = tokenizer.decode(generated_ids, skip_special_tokens=True)
                if text.endswith("\ufffd"):
                    # Last token stopped mid-character; wait for the rest.
                    continue

                delta = text[emitted_len:]
                if delta:
                    emitted_len = len(text)
                    # 3. Wrap the delta in Server-Sent Events (SSE) framing.
                    yield f"data: {json.dumps({'delta': delta}, ensure_ascii=False)}\n\n"

        except Exception as e:
            # Surface engine failures to the client instead of dropping the stream.
            print(f"推理时发生错误: {e}")
            yield f"data: {json.dumps({'delta': '[Engine Error]'}, ensure_ascii=False)}\n\n"

    return StreamingResponse(stream_generator(), media_type="text/event-stream")

if __name__ == "__main__":
    # Launch the development server (single process, localhost only).
    uvicorn.run(app, host="127.0.0.1", port=8000)
92 changes: 57 additions & 35 deletions include/llaisys/models/qwen2.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,40 +3,62 @@

#include "../tensor.h"

__C {
/* Qwen2 model hyper-parameters (filled by the loader). */
struct LlaisysQwen2Meta {
    llaisysDataType_t dtype;
    /* nlayer: layers, hs: hidden size, nh: attention heads, nkvh: KV heads,
       dh: head dim, di: FFN intermediate size, maxseq: max sequence length,
       voc: vocabulary size */
    size_t nlayer, hs, nh, nkvh, dh, di, maxseq, voc;
    float epsilon, theta; /* RMSNorm epsilon, RoPE theta */
    int64_t end_token;    /* EOS token id */
};

/* Weight tensors for one model; pointer members are per-layer arrays. */
struct LlaisysQwen2Weights {
    llaisysTensor_t in_embed;
    llaisysTensor_t out_embed;
    llaisysTensor_t out_norm_w; // a.k.a. model.norm.weight
    llaisysTensor_t *attn_norm_w; // a.k.a. input_layernorm.weight
    llaisysTensor_t *attn_q_w;
    llaisysTensor_t *attn_q_b;
    llaisysTensor_t *attn_k_w;
    llaisysTensor_t *attn_k_b;
    llaisysTensor_t *attn_v_w;
    llaisysTensor_t *attn_v_b;
    llaisysTensor_t *attn_o_w;
    llaisysTensor_t *mlp_norm_w; // a.k.a. post_attention_layernorm.weight
    llaisysTensor_t *mlp_gate_w;
    llaisysTensor_t *mlp_up_w;
    llaisysTensor_t *mlp_down_w;
};

/* Opaque model handle. */
struct LlaisysQwen2Model;

/* Create a model instance on the given device(s); caller owns the handle. */
__export struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice);

/* Destroy the model and release everything it owns. */
__export void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model * model);

/* Expose the weight container so the caller can fill the tensor data. */
__export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model);

/* Run inference over ntoken ids; presumably returns the next token id
   (int64_t return) — TODO confirm against the implementation. */
__export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken);
#ifdef __cplusplus
extern "C" {
#endif

// Model hyper-parameter metadata.
typedef struct {
    int dtype; // 0=F32, 1=F16...
               // NOTE(review): this was llaisysDataType_t before — confirm the
               // int encoding matches the Python/ctypes side of the ABI.
    size_t nlayer; // number of transformer layers
    size_t hs; // Hidden Size
    size_t nh; // Num Attention Heads
    size_t nkvh; // Num KV Heads
    size_t dh; // Head Dim (hs / nh)
    size_t di; // Intermediate Size (FFN)
    size_t maxseq; // Max Position Embeddings
    size_t voc; // Vocab Size
    float epsilon; // RMS Norm Epsilon
    float theta; // RoPE Theta
    int64_t end_token; // EOS Token ID
} LlaisysQwen2Meta;

// Weight pointer container (arrays allocated on the C++ side, data filled
// in from the Python side).
typedef struct {
    llaisysTensor_t in_embed;
    llaisysTensor_t out_embed;
    llaisysTensor_t out_norm_w;

    // The following are arrays of tensors, each of length nlayer.
    llaisysTensor_t *attn_norm_w;
    llaisysTensor_t *attn_q_w;
    llaisysTensor_t *attn_q_b;
    llaisysTensor_t *attn_k_w;
    llaisysTensor_t *attn_k_b;
    llaisysTensor_t *attn_v_w;
    llaisysTensor_t *attn_v_b;
    llaisysTensor_t *attn_o_w; // Qwen typically has no o_proj bias

    llaisysTensor_t *mlp_norm_w;
    llaisysTensor_t *mlp_gate_w;
    llaisysTensor_t *mlp_up_w;
    llaisysTensor_t *mlp_down_w;
} LlaisysQwen2Weights;

// Opaque model handle.
struct LlaisysQwen2Model;

// Exported API.
__export struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice);

__export void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model *model);

__export LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model *model);

// Run inference over ntoken ids with the given sampling parameters
// (temperature / top_p / top_k feed the sampler); presumably returns the
// sampled next token id — TODO confirm against the implementation.
__export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model *model, int64_t *token_ids, size_t ntoken, float temperature, float top_p, size_t top_k);

#ifdef __cplusplus
}
#endif

#endif // LLAISYS_MODELS_QWEN2_H
Loading