Fixes #103: Responses API `max_output_tokens` bug

RayBytes · RayBytes · commit da0e3c3176e9 · 2026-03-23T23:33:56.000+05:00
diff --git a/DOCKER.md b/DOCKER.md
@@ -25,13 +25,12 @@ Set options in `.env` or pass environment variables:
 - `CHATGPT_LOCAL_REASONING_SUMMARY`: auto|concise|detailed|none
 - `CHATGPT_LOCAL_REASONING_COMPAT`: legacy|o3|think-tags|current
 - `CHATGPT_LOCAL_FAST_MODE`: `true|false` to enable fast mode by default for supported models
-- `CHATGPT_LOCAL_DEBUG_MODEL`: force model override (e.g., `gpt-5.4`)
 - `CHATGPT_LOCAL_CLIENT_ID`: OAuth client id override (rarely needed)
 - `CHATGPT_LOCAL_EXPOSE_REASONING_MODELS`: `true|false` to add reasoning model variants to `/v1/models`
 - `CHATGPT_LOCAL_ENABLE_WEB_SEARCH`: `true|false` to enable default web search tool
 
 ## Logs
-Set `VERBOSE=true` to include extra logging for debugging issues in upstream or chat app requests. Please include and use these logs when submitting bug reports.
+Set `VERBOSE=true` to enable extra logging for troubleshooting upstream or chat app requests. Please include these logs when submitting bug reports.
 
 ## Test
 
diff --git a/chatmock/cli.py b/chatmock/cli.py
@@ -284,7 +284,7 @@ def cmd_serve(
         default_web_search=default_web_search,
     )
 
-    app.run(host=host, debug=False, use_reloader=False, port=port, threaded=True)
+    app.run(host=host, use_reloader=False, port=port, threaded=True)
     return 0
 
 
diff --git a/chatmock/responses_api.py b/chatmock/responses_api.py
@@ -88,6 +88,7 @@ def normalize_responses_payload(
 
     normalized = dict(payload)
     normalized["model"] = normalized_model
+    normalized.pop("max_output_tokens", None)
 
     if "input" in normalized:
         normalized["input"] = canonicalize_responses_input(normalized.get("input"))
diff --git a/chatmock/routes_ollama.py b/chatmock/routes_ollama.py
@@ -250,7 +250,7 @@ def ollama_chat() -> Response:
     input_items = convert_chat_messages_to_responses_input(messages)
 
     model_reasoning = extract_reasoning_from_model_name(model)
-    normalized_model = normalize_model_name(model)
+    normalized_model = normalize_model_name(model, current_app.config.get("DEBUG_MODEL"))
     service_tier_resolution = resolve_service_tier(
         normalized_model,
         request_fast_mode=payload.get("fast_mode"),
@@ -306,7 +306,7 @@ def ollama_chat() -> Response:
             base_tools_only = convert_tools_chat_to_responses(normalize_ollama_tools(tools_req))
             safe_choice = payload.get("tool_choice", "auto")
             upstream2, err2 = start_upstream_request(
-                normalize_model_name(model),
+                normalize_model_name(model, current_app.config.get("DEBUG_MODEL")),
                 input_items,
                 instructions=BASE_INSTRUCTIONS,
                 tools=base_tools_only,
@@ -570,7 +570,7 @@ def _gen():
             full_text = f"<think>{rtxt}</think>" + (full_text or "")
 
     out_json = {
-        "model": normalize_model_name(model),
+        "model": normalize_model_name(model, current_app.config.get("DEBUG_MODEL")),
         "created_at": created_at,
         "message": {"role": "assistant", "content": full_text, **({"tool_calls": tool_calls} if tool_calls else {})},
         "done": True,
diff --git a/chatmock/routes_openai.py b/chatmock/routes_openai.py
@@ -109,7 +109,6 @@ def chat_completions() -> Response:
     reasoning_effort = current_app.config.get("REASONING_EFFORT", "medium")
     reasoning_summary = current_app.config.get("REASONING_SUMMARY", "auto")
     reasoning_compat = current_app.config.get("REASONING_COMPAT", "think-tags")
-    debug_model = current_app.config.get("DEBUG_MODEL")
 
     raw = request.get_data(cache=True, as_text=True) or ""
     if verbose:
@@ -129,7 +128,7 @@ def chat_completions() -> Response:
             return jsonify(err), 400
 
     requested_model = payload.get("model")
-    model = normalize_model_name(requested_model, debug_model)
+    model = normalize_model_name(requested_model, current_app.config.get("DEBUG_MODEL"))
     messages = payload.get("messages")
     if messages is None and isinstance(payload.get("prompt"), str):
         messages = [{"role": "user", "content": payload.get("prompt") or ""}]
@@ -413,7 +412,6 @@ def _extract_usage(evt: Dict[str, Any]) -> Dict[str, int] | None:
 def completions() -> Response:
     verbose = bool(current_app.config.get("VERBOSE"))
     verbose_obfuscation = bool(current_app.config.get("VERBOSE_OBFUSCATION"))
-    debug_model = current_app.config.get("DEBUG_MODEL")
     reasoning_effort = current_app.config.get("REASONING_EFFORT", "medium")
     reasoning_summary = current_app.config.get("REASONING_SUMMARY", "auto")
 
@@ -432,7 +430,7 @@ def completions() -> Response:
         return jsonify(err), 400
 
     requested_model = payload.get("model")
-    model = normalize_model_name(requested_model, debug_model)
+    model = normalize_model_name(requested_model, current_app.config.get("DEBUG_MODEL"))
     prompt = payload.get("prompt")
     if isinstance(prompt, list):
         prompt = "".join([p if isinstance(p, str) else "" for p in prompt])
diff --git a/gui.py b/gui.py
@@ -19,6 +19,7 @@ def run_server(
     reasoning_summary: str = "auto",
     reasoning_compat: str = "think-tags",
     fast_mode: bool = False,
+    debug_model: str | None = None,
     expose_reasoning_models: bool = False,
     default_web_search: bool = False,
 ) -> None:
@@ -27,10 +28,11 @@ def run_server(
         reasoning_summary=reasoning_summary,
         reasoning_compat=reasoning_compat,
         fast_mode=fast_mode,
+        debug_model=debug_model,
         expose_reasoning_models=expose_reasoning_models,
         default_web_search=default_web_search,
     )
-    app.run(host=host, port=port, debug=False, use_reloader=False, threaded=True)
+    app.run(host=host, port=port, use_reloader=False, threaded=True)
 
 
 class ServerProcess(QtCore.QObject):
@@ -45,6 +47,7 @@ def __init__(self) -> None:
         self._summary = "auto"
         self._compat = "think-tags"
         self._fast_mode = False
+        self._debug_model: str | None = None
         self._expose_reasoning_models = False
         self._default_web_search = False
 
@@ -59,6 +62,7 @@ def start(
         summary: str,
         compat: str,
         fast_mode: bool,
+        debug_model: str | None,
         expose_reasoning_models: bool,
         default_web_search: bool,
     ) -> None:
@@ -68,6 +72,7 @@ def start(
         self._effort, self._summary = effort, summary
         self._compat = compat
         self._fast_mode = fast_mode
+        self._debug_model = debug_model
         self._expose_reasoning_models = expose_reasoning_models
         self._default_web_search = default_web_search
         self._proc = QtCore.QProcess()
@@ -80,6 +85,8 @@ def start(
             "--summary", summary,
             "--compat", compat,
         ]
+        if isinstance(debug_model, str) and debug_model.strip():
+            args.extend(["--debug-model", debug_model.strip()])
         if fast_mode:
             args.append("--fast-mode")
         if expose_reasoning_models:
@@ -317,6 +324,12 @@ def __init__(self) -> None:
         self.port_edit.setValidator(QtGui.QIntValidator(1, 65535, self))
         self.port_edit.setMaximumWidth(100)
         form.addWidget(self.port_edit, 0, 3)
+        form.addWidget(QtWidgets.QLabel("Debug Model"), 1, 0)
+        self.debug_model_edit = QtWidgets.QLineEdit("")
+        self.debug_model_edit.setClearButtonEnabled(True)
+        self.debug_model_edit.setPlaceholderText("Optional override, e.g. gpt-5.4")
+        self.debug_model_edit.setSizePolicy(QtWidgets.QSizePolicy.Expanding, QtWidgets.QSizePolicy.Fixed)
+        form.addWidget(self.debug_model_edit, 1, 1, 1, 3)
         form.setColumnStretch(1, 1)
         srv_layout.addLayout(form)
 
@@ -473,6 +486,7 @@ def _start_server(self) -> None:
         summary = self.summary.currentText().strip()
         compat = self.compat.currentText().strip()
         fast_mode = self.fast_mode.isChecked()
+        debug_model = self.debug_model_edit.text().strip() or None
         expose_reasoning_models = self.expose_reasoning_models.isChecked()
         default_web_search = self.enable_web_search.isChecked()
         self.status.setText(f"Starting server at http://{host}:{port} …")
@@ -484,6 +498,7 @@ def _start_server(self) -> None:
             summary,
             compat,
             fast_mode,
+            debug_model,
             expose_reasoning_models,
             default_web_search,
         )
@@ -536,6 +551,7 @@ def main() -> None:
         p.add_argument("--summary", default="auto")
         p.add_argument("--compat", default="think-tags")
         p.add_argument("--fast-mode", action="store_true")
+        p.add_argument("--debug-model")
         p.add_argument("--expose-reasoning-models", action="store_true")
         p.add_argument("--enable-web-search", action="store_true")
         args, _ = p.parse_known_args()
@@ -546,6 +562,7 @@ def main() -> None:
             args.summary,
             args.compat,
             args.fast_mode,
+            args.debug_model,
             args.expose_reasoning_models,
             args.enable_web_search,
         )
diff --git a/scripts/test_responses_cached_tokens.py b/scripts/test_responses_cached_tokens.py
diff --git a/scripts/test_responses_reuse.py b/scripts/test_responses_reuse.py
diff --git a/tests/test_routes.py b/tests/test_routes.py

Original file line number	Diff line number	Diff line change
`@@ -284,7 +284,7 @@ def cmd_serve(`
`284`	`284`	`default_web_search=default_web_search,`
`285`	`285`	`)`
`286`	`286`
`287`		`- app.run(host=host, debug=False, use_reloader=False, port=port, threaded=True)`
	`287`	`+ app.run(host=host, use_reloader=False, port=port, threaded=True)`
`288`	`288`	`return 0`
`289`	`289`
`290`	`290`