Skip to content

Commit 8a34c8f

Browse files
committed
fix qwen3.5_stream
1 parent 00eb78e commit 8a34c8f

3 files changed

Lines changed: 170 additions & 84 deletions

File tree

lightllm/server/api_models.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ class ToolCall(BaseModel):
8787

8888
id: Optional[str] = None
8989
index: Optional[int] = None
90-
type: Literal["function"] = "function"
90+
type: Optional[Literal["function"]] = None
9191
function: FunctionResponse
9292

9393

lightllm/server/api_openai.py

Lines changed: 93 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,22 @@ def _process_tools_stream(index: int, delta: str, parser_dict: Dict, request: Ch
160160
return normal_text, calls
161161

162162

163+
def _split_tool_argument_delta(arguments: Optional[str]) -> List[str]:
164+
"""Split a complete JSON argument string into OpenAI-style deltas."""
165+
if not arguments:
166+
return []
167+
if len(arguments) <= 2:
168+
return [arguments]
169+
if arguments[0] in "{[" and arguments[-1] in "}]":
170+
middle = arguments[1:-1]
171+
chunks = [arguments[0]]
172+
if middle:
173+
chunks.append(middle)
174+
chunks.append(arguments[-1])
175+
return [chunk for chunk in chunks if chunk]
176+
return [arguments]
177+
178+
163179
async def chat_completions_impl(request: ChatCompletionRequest, raw_request: Request) -> Response:
164180
from .api_http import g_objs
165181

@@ -342,6 +358,7 @@ async def chat_completions_impl(request: ChatCompletionRequest, raw_request: Req
342358
ToolCall(
343359
id=tool_id,
344360
index=getattr(call_info, "tool_index", None),
361+
type="function",
345362
function=FunctionResponse(name=call_info.name, arguments=call_info.parameters),
346363
)
347364
)
@@ -408,7 +425,7 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
408425
choices=[choice_data],
409426
model=request.model,
410427
)
411-
yield f"data: {chunk.model_dump_json()}\n\n"
428+
yield f"data: {chunk.model_dump_json(exclude_none=True)}\n\n"
412429

413430
if request.tool_choice != "none" and request.tools:
414431
# parse_increment => returns (normal_text, calls)
@@ -417,7 +434,7 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
417434
)
418435

419436
# 1) if there's normal_text, output it as normal content
420-
if normal_text:
437+
if normal_text and (normal_text.strip() or not has_emitted_tool_calls[sub_req_id]):
421438
choice_data = ChatCompletionStreamResponseChoice(
422439
index=choice_index,
423440
delta=DeltaMessage(role="assistant", content=normal_text),
@@ -429,7 +446,7 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
429446
choices=[choice_data],
430447
model=request.model,
431448
)
432-
yield f"data: {chunk.model_dump_json()}\n\n"
449+
yield f"data: {chunk.model_dump_json(exclude_none=True)}\n\n"
433450

434451
# 2) if we found calls, we output them as separate chunk(s)
435452
history_tool_calls_cnt = _get_history_tool_calls_cnt(request)
@@ -456,7 +473,8 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
456473
call_item.parameters = remaining_call
457474

458475
tool_parser = getattr(g_objs.args, "tool_call_parser", None) or "llama3"
459-
id_key = (choice_index, call_item.tool_index)
476+
stream_index = getattr(call_item, "tool_index", None)
477+
id_key = (choice_index, stream_index)
460478
if call_item.name:
461479
if id_key not in stream_tool_call_ids:
462480
stream_tool_call_ids[id_key] = _process_tool_call_id(
@@ -468,26 +486,74 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
468486
tool_call_id = stream_tool_call_ids.get(id_key)
469487
function_name = None
470488

471-
tool_call = ToolCall(
472-
id=tool_call_id,
473-
index=getattr(call_item, "tool_index", None),
474-
function=FunctionResponse(
475-
name=function_name,
476-
arguments=call_item.parameters,
477-
),
478-
)
479-
choice_data = ChatCompletionStreamResponseChoice(
480-
index=choice_index,
481-
delta=DeltaMessage(role="assistant", tool_calls=[tool_call]),
482-
finish_reason=None,
483-
)
484-
chunk = ChatCompletionStreamResponse(
485-
id=group_request_id,
486-
created=created_time,
487-
choices=[choice_data],
488-
model=request.model,
489-
)
490-
yield f"data: {chunk.model_dump_json()}\n\n"
489+
is_tool_head = call_item.name is not None
490+
491+
if is_tool_head and call_item.parameters:
492+
head_tool_call = ToolCall(
493+
id=tool_call_id,
494+
index=stream_index,
495+
type="function",
496+
function=FunctionResponse(
497+
name=function_name,
498+
arguments="",
499+
),
500+
)
501+
head_choice = ChatCompletionStreamResponseChoice(
502+
index=choice_index,
503+
delta=DeltaMessage(tool_calls=[head_tool_call]),
504+
finish_reason=None,
505+
)
506+
head_chunk = ChatCompletionStreamResponse(
507+
id=group_request_id,
508+
created=created_time,
509+
choices=[head_choice],
510+
model=request.model,
511+
)
512+
yield f"data: {head_chunk.model_dump_json(exclude_none=True)}\n\n"
513+
514+
for arg_delta in _split_tool_argument_delta(call_item.parameters):
515+
arg_tool_call = ToolCall(
516+
index=stream_index,
517+
function=FunctionResponse(arguments=arg_delta),
518+
)
519+
arg_choice = ChatCompletionStreamResponseChoice(
520+
index=choice_index,
521+
delta=DeltaMessage(tool_calls=[arg_tool_call]),
522+
finish_reason=None,
523+
)
524+
arg_chunk = ChatCompletionStreamResponse(
525+
id=group_request_id,
526+
created=created_time,
527+
choices=[arg_choice],
528+
model=request.model,
529+
)
530+
yield f"data: {arg_chunk.model_dump_json(exclude_none=True)}\n\n"
531+
else:
532+
tool_call = ToolCall(
533+
id=tool_call_id if is_tool_head else None,
534+
index=stream_index,
535+
type="function" if is_tool_head else None,
536+
function=FunctionResponse(
537+
name=function_name,
538+
arguments=(
539+
(call_item.parameters if call_item.parameters is not None else "")
540+
if is_tool_head
541+
else call_item.parameters
542+
),
543+
),
544+
)
545+
choice_data = ChatCompletionStreamResponseChoice(
546+
index=choice_index,
547+
delta=DeltaMessage(tool_calls=[tool_call]),
548+
finish_reason=None,
549+
)
550+
chunk = ChatCompletionStreamResponse(
551+
id=group_request_id,
552+
created=created_time,
553+
choices=[choice_data],
554+
model=request.model,
555+
)
556+
yield f"data: {chunk.model_dump_json(exclude_none=True)}\n\n"
491557
else:
492558
delta_message = DeltaMessage(role="assistant", content=delta)
493559
stream_choice = ChatCompletionStreamResponseChoice(
@@ -499,7 +565,7 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
499565
model=request.model,
500566
choices=[stream_choice],
501567
)
502-
yield f"data: {stream_resp.model_dump_json()}\n\n"
568+
yield f"data: {stream_resp.model_dump_json(exclude_none=True)}\n\n"
503569

504570
# Emit a per-choice final empty chunk with finish_reason.
505571
if current_finish_reason is not None:
@@ -516,7 +582,7 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
516582
model=request.model,
517583
choices=[final_choice],
518584
)
519-
yield f"data: {final_chunk.model_dump_json()}\n\n"
585+
yield f"data: {final_chunk.model_dump_json(exclude_none=True)}\n\n"
520586

521587
if request.stream_options and request.stream_options.include_usage:
522588
usage = UsageInfo(
@@ -531,7 +597,7 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
531597
model=request.model,
532598
usage=usage,
533599
)
534-
yield f"data: {usage_chunk.model_dump_json()}\n\n"
600+
yield f"data: {usage_chunk.model_dump_json(exclude_none=True)}\n\n"
535601

536602
background_tasks = BackgroundTasks()
537603
return StreamingResponse(stream_results(), media_type="text/event-stream", background=background_tasks)

lightllm/server/function_call_parser.py

Lines changed: 76 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1846,6 +1846,48 @@ def _parse_function_call(self, function_str: str, tools: List[Tool]) -> Optional
18461846
parameters=json.dumps(param_dict, ensure_ascii=False),
18471847
)
18481848

1849+
def _build_partial_arguments_json(self, func_name: str, partial_body: str, tools: List[Tool]) -> Optional[str]:
    """Build the current argument JSON from a partial XML tool-call body.

    Scans *partial_body* with ``self.parameter_regex`` (each match appears
    to carry ``name>value`` text, split below on the first ``>`` — TODO
    confirm against the regex definition), converts the values via
    ``self._convert_param_value``, and returns them serialized as a JSON
    object string. Returns ``None`` when no parameter has produced any
    streamable content yet.
    """
    param_matches = self.parameter_regex.findall(partial_body)
    if not param_matches:
        # No <parameter=...> tags seen yet in the streamed body.
        return None

    param_config = self._get_param_config(func_name, tools)
    param_dict = {}
    has_visible_value = False

    for match in param_matches:
        try:
            # Name and value are separated by the first '>' in the match.
            idx = match.index(">")
        except ValueError:
            # Malformed match without a separator — skip it.
            continue

        param_name = match[:idx].strip()
        param_value = match[idx + 1 :]
        # Trim a single leading/trailing newline that the XML layout adds.
        if param_value.startswith("\n"):
            param_value = param_value[1:]
        if param_value.endswith("\n"):
            param_value = param_value[:-1]

        if param_value.strip():
            has_visible_value = True
        elif (
            f"<parameter={param_name}>" in partial_body
            and f"<parameter={param_name}>{param_value}</parameter>" in partial_body
        ):
            # Closed empty-string parameter. We can safely emit it.
            has_visible_value = True
        else:
            # Parameter tag is present but its value has not started streaming yet.
            continue

        param_dict[param_name] = self._convert_param_value(param_value, param_name, param_config, func_name)

    if not param_dict and not has_visible_value:
        # Every discovered parameter is still waiting for its value.
        return None

    return json.dumps(param_dict, ensure_ascii=False)
1890+
18491891
def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
18501892
idx = text.find(self.bot_token)
18511893
normal_text = text[:idx].strip() if idx != -1 else text
@@ -1865,79 +1907,57 @@ def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult
18651907
func_str = match[0] if match[0] else match[1]
18661908
item = self._parse_function_call(func_str, tools)
18671909
if item:
1910+
item.tool_index = len(calls)
18681911
calls.append(item)
18691912

18701913
return StreamingParseResult(normal_text=normal_text, calls=calls)
18711914

18721915
def parse_streaming_increment(self, new_text: str, tools: List[Tool]) -> StreamingParseResult:
    """Streaming incremental parsing for Qwen3-Coder XML tool calls.

    Accumulates *new_text* into ``self._buffer`` and repeatedly consumes
    complete ``bot_token``..``eot_token`` blocks from it (presumably the
    model's tool-call open/close markers — confirm against the parser's
    constants). Text outside those blocks is returned as ``normal_text``;
    each fully closed block is parsed into ``ToolCallItem``s with
    monotonically increasing ``tool_index`` values tracked across calls
    via ``self.current_tool_id``.
    """
    self._buffer += new_text
    # Lazily cache the tool-name lookup table on first use.
    if not hasattr(self, "_tool_indices"):
        self._tool_indices = self._get_tool_indices(tools)

    normal_text = ""
    calls: List[ToolCallItem] = []

    # Drain as many complete tool-call blocks from the buffer as possible.
    while True:
        current_text = self._buffer
        tool_call_start = current_text.find(self.bot_token)

        if tool_call_start == -1:
            # A partial bot_token may be split across chunks: hold the
            # buffer and wait for more text rather than emitting it.
            partial_len = self._ends_with_partial_token(current_text, self.bot_token)
            if partial_len:
                return StreamingParseResult(normal_text=normal_text, calls=calls)
            if current_text:
                # No tool-call marker: everything is plain content
                # (stray eot_tokens are stripped).
                normal_text += current_text.replace(self.eot_token, "")
            self._buffer = ""
            return StreamingParseResult(normal_text=normal_text, calls=calls)

        if tool_call_start > 0:
            # Flush the plain text that precedes the tool-call marker.
            normal_text += current_text[:tool_call_start]
            self._buffer = current_text[tool_call_start:]
            current_text = self._buffer

        eot_pos = current_text.find(self.eot_token)
        if eot_pos == -1:
            # Tool call opened but not yet closed — wait for more chunks.
            return StreamingParseResult(normal_text=normal_text, calls=calls)

        # A complete bot..eot block is available: parse it now.
        complete_block = current_text[: eot_pos + len(self.eot_token)]
        func_matches = self.function_regex.findall(complete_block)

        if self.current_tool_id == -1:
            # First tool call of this request.
            self.current_tool_id = 0

        for match in func_matches:
            # The regex appears to have two alternative capture groups;
            # whichever matched holds the function body — TODO confirm.
            func_str = match[0] if match[0] else match[1]
            item = self._parse_function_call(func_str, tools)
            if item:
                item.tool_index = self.current_tool_id
                calls.append(item)
                self.current_tool_id += 1

        # Drop the consumed block and loop for any further blocks.
        self._buffer = current_text[eot_pos + len(self.eot_token) :].lstrip()
19411961

19421962

19431963
class FunctionCallParser:

0 commit comments

Comments
 (0)