Skip to content

Commit 1a2b062

Browse files
committed
feat: auto-calculate max_new_tokens to align with vLLM behavior
When max_new_tokens is not specified (None or -1), automatically calculate it as max_req_total_len - prompt_tokens. This aligns with vLLM's behavior, where max_tokens defaults to the remaining context length. Changes: (1) sampling_params.py: default max_new_tokens changed from 16384 to -1; (2) py_sampling_params.py: default max_new_tokens changed from 16384 to None; (3) manager.py: add auto-calculation logic in _check_and_repair_length.
1 parent 391d2ea commit 1a2b062

3 files changed

Lines changed: 23 additions & 6 deletions

File tree

lightllm/server/core/objs/py_sampling_params.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def __init__(
3838
top_k: int = None, # -1 is for all
3939
ignore_eos: bool = False,
4040
image_max_patch_num: int = -1,
41-
max_new_tokens: int = 16384,
41+
max_new_tokens: int = None, # If None, auto-calculate as max_req_total_len - input_len
4242
min_new_tokens: int = 1,
4343
stop_sequences: Optional[Union[str, List[str], List[List[int]]]] = None, # 停止句子条件
4444
skip_special_tokens: bool = True, # whether to skip special tokens when decoding
@@ -141,11 +141,11 @@ def verify(self):
141141
raise ValueError(f"top_p must in (0.0, 1.0], got {self.top_p}")
142142
if self.top_k < -1 or self.top_k == 0:
143143
raise ValueError(f"top_k must be -1 (disable), or at least 1, got {self.top_k}.")
144-
if self.max_new_tokens < 1:
144+
if self.max_new_tokens is not None and self.max_new_tokens < 1:
145145
raise ValueError(f"max_new_tokens must be at least 1, got {self.max_new_tokens}.")
146146
if self.min_new_tokens < 1:
147147
raise ValueError(f"min_new_tokens must be at least 1, got {self.min_new_tokens}.")
148-
if self.min_new_tokens > self.max_new_tokens:
148+
if self.max_new_tokens is not None and self.min_new_tokens > self.max_new_tokens:
149149
raise ValueError(
150150
f"min_new_tokens must <= max_new_tokens, but got min {self.min_new_tokens}, max {self.max_new_tokens}."
151151
)

lightllm/server/core/objs/sampling_params.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,9 @@ def init(self, tokenizer, **kwargs):
345345
self.top_k = kwargs.get("top_k", SamplingParams._top_k)
346346
self.ignore_eos = kwargs.get("ignore_eos", False)
347347
self.image_max_patch_num = kwargs.get("image_max_patch_num", -1)
348-
self.max_new_tokens = kwargs.get("max_new_tokens", 16384)
348+
self.max_new_tokens = kwargs.get(
349+
"max_new_tokens", -1
350+
) # -1 means auto-calculate as max_req_total_len - input_len
349351
self.min_new_tokens = kwargs.get("min_new_tokens", 1)
350352
self.input_penalty = kwargs.get("input_penalty", DEFAULT_INPUT_PENALTY)
351353
self.group_request_id = kwargs.get("group_request_id", -1)
@@ -439,11 +441,11 @@ def verify(self):
439441
raise ValueError(f"top_p must be in (0.0, 1.0], got {self.top_p}")
440442
if self.top_k < -1 or self.top_k == 0:
441443
raise ValueError(f"top_k must be -1 (disable), or at least 1, got {self.top_k}.")
442-
if self.max_new_tokens < 1:
444+
if self.max_new_tokens != -1 and self.max_new_tokens < 1:
443445
raise ValueError(f"max_new_tokens must be at least 1 , got {self.max_new_tokens}.")
444446
if self.min_new_tokens < 1:
445447
raise ValueError(f"min_new_tokens must be at least 1 , got {self.min_new_tokens}.")
446-
if self.min_new_tokens > self.max_new_tokens:
448+
if self.max_new_tokens != -1 and self.min_new_tokens > self.max_new_tokens:
447449
raise ValueError(
448450
f"min_new_tokens must <= max_new_tokens, but got min {self.min_new_tokens}, max {self.max_new_tokens}."
449451
)

lightllm/server/httpserver/manager.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -477,6 +477,21 @@ async def _check_and_repair_length(self, prompt_ids: List[int], sampling_params:
477477
if not prompt_ids:
478478
raise ValueError("prompt_ids is empty")
479479
prompt_tokens = len(prompt_ids)
480+
481+
# If max_new_tokens is None or -1, auto-calculate based on model context length (align with vLLM behavior)
482+
# -1 is used as sentinel for ctypes-based SamplingParams, None for pure Python SamplingParams
483+
if sampling_params.max_new_tokens is None or sampling_params.max_new_tokens == -1:
484+
sampling_params.max_new_tokens = self.max_req_total_len - prompt_tokens
485+
if sampling_params.max_new_tokens < 1:
486+
raise ValueError(
487+
f"the input prompt token len {prompt_tokens} >= max_req_total_len {self.max_req_total_len}, "
488+
f"no space for output tokens"
489+
)
490+
logger.debug(
491+
f"max_new_tokens is unset, auto-calculate to {sampling_params.max_new_tokens} "
492+
f"(max_req_total_len {self.max_req_total_len} - prompt_tokens {prompt_tokens})"
493+
)
494+
480495
if prompt_tokens + sampling_params.max_new_tokens > self.max_req_total_len:
481496
# use long_truncation_mode to truncate long input len req.
482497
if self.args.long_truncation_mode is None:

0 commit comments

Comments (0)