 from http import HTTPStatus
 from PIL import Image
 import multiprocessing as mp
-from typing import Any, AsyncGenerator, Optional, Union, List, Dict
+from typing import Any, AsyncGenerator, Optional, Union, List, Dict, Tuple
 from typing import Callable
 from lightllm.server import TokenLoad
 from fastapi import BackgroundTasks, FastAPI, Request, WebSocket, WebSocketDisconnect
@@ -371,16 +371,13 @@ async def chat_completions_impl(request: ChatCompletionRequest, raw_request: Req
         )
         return resp
 
-    if sampling_params.n != 1:
-        return create_error_response(HTTPStatus.BAD_REQUEST, "stream api only support n = 1")
-
     parser_dict = {}
     reasoning_parser_dict = {}
 
     # Streaming case
     async def stream_results() -> AsyncGenerator[bytes, None]:
-        finish_reason = None
-        has_emitted_tool_calls = False
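+        # Track tool-call emission per sub-request, and cache each tool call's id
+        # per (choice, tool) pair so argument-delta chunks reuse the first chunk's id.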
+        has_emitted_tool_calls: Dict[int, bool] = collections.defaultdict(bool)
+        stream_tool_call_ids: Dict[Tuple[int, int], str] = {}
         from .req_id_generator import convert_sub_id_to_group_id
 
         prompt_tokens = 0
@@ -389,18 +386,19 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
             prompt_tokens = metadata["prompt_tokens"]
             completion_tokens += 1
             group_request_id = convert_sub_id_to_group_id(sub_req_id)
-            index = sub_req_id
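+            # Assumes sub-request ids are allocated as group_request_id plus a
+            # per-choice offset (see req_id_generator), so subtraction recovers
+            # the choice index.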
+            choice_index = sub_req_id - group_request_id
+
             delta = request_output
-            finish_reason = finish_status.get_finish_reason()
+            current_finish_reason = finish_status.get_finish_reason()
 
             # Handle reasoning content
             if get_env_start_args().reasoning_parser and request.separate_reasoning:
                 reasoning_text, delta = _process_reasoning_stream(
-                    index, delta, reasoning_parser_dict, request_output, request
+                    choice_index, delta, reasoning_parser_dict, request_output, request
                 )
                 if reasoning_text:
                     choice_data = ChatCompletionStreamResponseChoice(
-                        index=0,
+                        index=choice_index,
                         delta=DeltaMessage(role="assistant", reasoning_content=reasoning_text),
                         finish_reason=None,
                     )
@@ -415,13 +413,13 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
             if request.tool_choice != "none" and request.tools:
                 # parse_increment => returns (normal_text, calls)
                 normal_text, calls = _process_tools_stream(
-                    index=index, delta=delta, parser_dict=parser_dict, request=request
+                    index=choice_index, delta=delta, parser_dict=parser_dict, request=request
                 )
 
                 # 1) if there's normal_text, output it as normal content
                 if normal_text:
                     choice_data = ChatCompletionStreamResponseChoice(
-                        index=0,
+                        index=choice_index,
                         delta=DeltaMessage(role="assistant", content=normal_text),
                         finish_reason=None,
                     )
@@ -435,32 +433,39 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
 
                 # 2) if we found calls, we output them as separate chunk(s)
                 history_tool_calls_cnt = _get_history_tool_calls_cnt(request)
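+                # Incremental tool-call parser for this choice (assumed to have
+                # been populated in parser_dict by _process_tools_stream above).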
+                fc_parser = parser_dict[choice_index]
                 for call_item in calls:
-                    has_emitted_tool_calls = True
+                    has_emitted_tool_calls[sub_req_id] = True
                     # transform call_item -> FunctionResponse + ToolCall
-                    if finish_reason == "stop":
-                        latest_delta_len = 0
-                        if isinstance(call_item.parameters, str):
-                            latest_delta_len = len(call_item.parameters)
-
-                        expected_call = json.dumps(
-                            parser.multi_format_parser.detectors[0].prev_tool_call_arr[index].get("arguments", {}),
-                            ensure_ascii=False,
-                        )
-                        actual_call = parser.multi_format_parser.detectors[0].streamed_args_for_tool[index]
-                        if latest_delta_len > 0:
-                            actual_call = actual_call[:-latest_delta_len]
-                        remaining_call = expected_call.replace(actual_call, "", 1)
-                        call_item.parameters = remaining_call
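+                    # On the final ("stop") chunk, flush any argument text the
+                    # detector parsed but has not yet streamed for this tool call,
+                    # guarding against out-of-range tool indices.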
+                    if current_finish_reason == "stop":
+                        det = fc_parser.detector
+                        ti = call_item.tool_index
+                        if ti >= 0 and ti < len(det.prev_tool_call_arr) and ti < len(det.streamed_args_for_tool):
+                            latest_delta_len = 0
+                            if isinstance(call_item.parameters, str):
+                                latest_delta_len = len(call_item.parameters)
+
+                            expected_call = json.dumps(
+                                det.prev_tool_call_arr[ti].get("arguments", {}),
+                                ensure_ascii=False,
+                            )
+                            actual_call = det.streamed_args_for_tool[ti]
+                            if latest_delta_len > 0:
+                                actual_call = actual_call[:-latest_delta_len]
+                            remaining_call = expected_call.replace(actual_call, "", 1)
+                            call_item.parameters = remaining_call
 
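+                    # Key the cached tool_call id by (choice, tool): the first
+                    # named chunk mints the id, later argument deltas look it up.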
+                    tool_parser = getattr(g_objs.args, "tool_call_parser", None) or "llama3"
+                    id_key = (choice_index, call_item.tool_index)
                     if call_item.name:
-                        # First chunk: include ID and function name
-                        tool_parser = getattr(g_objs.args, "tool_call_parser", None) or "llama3"
-                        tool_call_id = _process_tool_call_id(tool_parser, call_item, history_tool_calls_cnt)
+                        if id_key not in stream_tool_call_ids:
+                            stream_tool_call_ids[id_key] = _process_tool_call_id(
+                                tool_parser, call_item, history_tool_calls_cnt
+                            )
+                        tool_call_id = stream_tool_call_ids[id_key]
                         function_name = call_item.name
                     else:
-                        # Subsequent chunks: null ID and name for argument deltas
-                        tool_call_id = None
+                        tool_call_id = stream_tool_call_ids.get(id_key)
                         function_name = None
 
                     tool_call = ToolCall(
@@ -472,7 +477,7 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
                         ),
                     )
                     choice_data = ChatCompletionStreamResponseChoice(
-                        index=0,
+                        index=choice_index,
                         delta=DeltaMessage(role="assistant", tool_calls=[tool_call]),
                         finish_reason=None,
                     )
@@ -485,7 +490,9 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
                     yield f"data: {chunk.model_dump_json()}\n\n"
             else:
                 delta_message = DeltaMessage(role="assistant", content=delta)
-                stream_choice = ChatCompletionStreamResponseChoice(index=0, delta=delta_message, finish_reason=None)
+                stream_choice = ChatCompletionStreamResponseChoice(
+                    index=choice_index, delta=delta_message, finish_reason=None
+                )
                 stream_resp = ChatCompletionStreamResponse(
                     id=group_request_id,
                     created=created_time,
@@ -494,24 +501,22 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
                 )
                 yield f"data: {stream_resp.model_dump_json()}\n\n"
 
-        # Determine final finish_reason: override to "tool_calls" if tool calls were emitted
-        if has_emitted_tool_calls and finish_reason == "stop":
-            finish_reason = "tool_calls"
-
-        # Final empty chunk containing only finish_reason (and role)
-        if finish_reason is not None:
-            final_choice = ChatCompletionStreamResponseChoice(
-                index=0,
-                delta=DeltaMessage(),
-                finish_reason=finish_reason,
-            )
-            final_chunk = ChatCompletionStreamResponse(
-                id=group_request_id,
-                created=created_time,
-                model=request.model,
-                choices=[final_choice],
-            )
-            yield f"data: {final_chunk.model_dump_json()}\n\n"
+            # Emit a per-choice final empty chunk with finish_reason.
+            if current_finish_reason is not None:
+                if has_emitted_tool_calls[sub_req_id] and current_finish_reason == "stop":
+                    current_finish_reason = "tool_calls"
+                final_choice = ChatCompletionStreamResponseChoice(
+                    index=choice_index,
+                    delta=DeltaMessage(),
+                    finish_reason=current_finish_reason,
+                )
+                final_chunk = ChatCompletionStreamResponse(
+                    id=group_request_id,
+                    created=created_time,
+                    model=request.model,
+                    choices=[final_choice],
+                )
+                yield f"data: {final_chunk.model_dump_json()}\n\n"
 
         if request.stream_options and request.stream_options.include_usage:
             usage = UsageInfo(
@@ -634,9 +639,6 @@ async def _process_prompts_completion(
                 "Streaming is not supported for batch requests",
             )
 
-        if sampling_params.n != 1:
-            return create_error_response(HTTPStatus.BAD_REQUEST, "stream api only support n = 1")
-
         return await _handle_streaming_completion(
             prompts[0], sampling_params, multimodal_params, raw_request, request, created_time
         )
@@ -690,6 +692,7 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
 
         async for sub_req_id, request_output, metadata, finish_status in results_generator:
             group_request_id = convert_sub_id_to_group_id(sub_req_id)
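+            # Same sub-id to choice-index mapping as the chat endpoint above.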
+            choice_index = sub_req_id - group_request_id
             prompt_tokens = metadata["prompt_tokens"]
             completion_tokens += 1
             current_finish_reason = None
@@ -704,7 +707,7 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
                 output_text = prompt_str + output_text
 
             stream_choice = CompletionStreamChoice(
-                index=0,
+                index=choice_index,
                 text=output_text,
                 finish_reason=current_finish_reason,
                 logprobs=None if request.logprobs is None else {},