TutorBot/main.py at main · Rookiecoder-jsjs/TutorBot · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
"""中文 LLM 教学路由器 - 主入口"""

import yaml
from pathlib import Path

from src.core.simple_classifier import classify, classify_with_llm, SubjectType, SUBJECT_CONFIG, SubjectCategory, get_category
from src.core.model_router import ModelRouter
from src.core.narrator import generate_script, generate_svg_animation, generate_highlight_html, parse_solution
from src.services.deepseek_client import DeepSeekClient
from src.services.qwen_client import QwenClient
from src.services.minimax_tts import MinimaxTTS


def load_config(path: str = "config.yaml") -> dict:
    """Load the YAML configuration file.

    Args:
        path: Location of the YAML file. A relative path (such as the
            default ``config.yaml``) is resolved against the current
            working directory.

    Returns:
        The parsed top-level YAML document. An empty file yields ``{}``
        instead of ``None`` so the declared ``dict`` return type holds.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    with open(path, "r", encoding="utf-8") as f:
        loaded = yaml.safe_load(f)
    # safe_load returns None for an empty document; normalise it so callers
    # see a missing-key error rather than "'NoneType' is not subscriptable".
    return {} if loaded is None else loaded


def process(input_path: str, output_dir: str = "output"):
    """Run the full pipeline for one question.

    Pipeline: image or text -> classification -> solving -> narration
    script -> speech audio + animation. Progress is printed to stdout.

    Args:
        input_path: Path to the question. A path ending in ``.png``,
            ``.jpg`` or ``.jpeg`` (case-insensitive) is passed to
            ``qwen.ocr``; anything else is read as UTF-8 text.
        output_dir: Directory receiving ``audio.mp3``, the animation file
            (``animation.svg`` or ``animation.html``) and ``solution.md``.
            It is created, including missing parents, if it does not exist.

    Returns:
        A dict with the keys ``"solution"`` (the solver output), ``"audio"``
        and ``"animation"`` (output file paths as strings).

    Raises:
        KeyError: If the loaded config lacks one of the API credentials
            read below, or ``SUBJECT_CONFIG`` has no entry for the subject.
    """
    cfg = load_config()
    out_dir = Path(output_dir)
    # parents=True so a nested output directory such as "runs/today" works.
    out_dir.mkdir(parents=True, exist_ok=True)

    qwen = QwenClient(cfg["dashscope_api_key"])
    deepseek = DeepSeekClient(cfg["deepseek_api_key"])
    router = ModelRouter(deepseek, qwen)
    tts = MinimaxTTS(cfg["minimax_api_key"], cfg["minimax_group_id"])

    # 1. OCR (only when the input is an image)
    if input_path.lower().endswith((".png", ".jpg", ".jpeg")):
        print("[OCR] 正在识别图片...")
        question = qwen.ocr(input_path)
        print(f"[OCR] 识别结果:\n{question}\n")
    else:
        question = Path(input_path).read_text(encoding="utf-8")

    # 2. Classification
    # NOTE: the key is already required above to build the Qwen client, so
    # the plain classify() fallback only runs when the key is present but
    # empty.
    dashscope_key = cfg.get("dashscope_api_key", "")
    if dashscope_key:
        subject = classify_with_llm(question, dashscope_key)
    else:
        subject = classify(question)
    subject_cfg = SUBJECT_CONFIG[subject]
    label = subject_cfg["label"]
    print(f"[分类] 判定为: {label}")

    # 3. Solving
    print("[求解] 调用模型中...")
    solution = router.solve(question, subject)
    parsed = parse_solution(solution)
    print("[求解] 完成\n")

    thinking = parsed["thinking"]
    if thinking:
        print(f"[思路]\n{thinking}\n")
    answer = parsed["answer"]
    if answer:
        print(f"[答案]\n{answer}\n")

    # 4. Narration script
    script = generate_script(solution, subject)

    # 5. Text-to-speech
    voice = subject_cfg["voice"]
    audio_path = str(out_dir / "audio.mp3")
    print("[TTS] 生成语音中...")
    tts.synthesize(script, voice=voice, output_path=audio_path)
    print(f"[TTS] 已保存: {audio_path}")

    # 6. Animation: SVG for subjects configured with "svg", HTML otherwise
    anim_type = subject_cfg["anim_type"]
    if anim_type == "svg":
        svg = generate_svg_animation(solution)
        anim_path = str(out_dir / "animation.svg")
        Path(anim_path).write_text(svg, encoding="utf-8")
        print(f"[动画] SVG 已保存: {anim_path}")
    else:
        html = generate_highlight_html(solution)
        anim_path = str(out_dir / "animation.html")
        Path(anim_path).write_text(html, encoding="utf-8")
        print(f"[动画] HTML 已保存: {anim_path}")

    # Persist the raw solver output
    sol_path = str(out_dir / "solution.md")
    Path(sol_path).write_text(solution, encoding="utf-8")
    print(f"[结果] 已保存: {sol_path}")

    return {"solution": solution, "audio": audio_path, "animation": anim_path}


if __name__ == "__main__":
    # Script entry point: expects exactly one required positional argument,
    # the path to an image or text file containing the question.
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        print("用法: python main.py <图片或文本文件路径>")
        raise SystemExit(1)
    process(cli_args[0])