Commit 91b91b2

Merge pull request #473 from krissetto/thinking-budgets-google
Google/Gemini thinking budget support

2 parents fce5af9 + 77152b8, commit 91b91b2

8 files changed
Lines changed: 117 additions & 29 deletions
cagent-schema.json
Lines changed: 4 additions & 4 deletions

```diff
@@ -182,7 +182,7 @@
       "description": "Whether to track usage"
     },
     "thinking_budget": {
-      "description": "Controls reasoning effort/budget. For OpenAI: string levels ('minimal', 'low', 'medium', 'high'). For Anthropic: integer token budget (1024-32768)",
+      "description": "Controls reasoning effort/budget. OpenAI: string levels ('minimal','low','medium','high'). Anthropic: integer token budget (1024-32768). Gemini: integer token budget (-1 for dynamic, 0 to disable, 24576 max for most models).",
       "oneOf": [
         {
           "type": "string",
@@ -191,12 +191,12 @@
         },
         {
           "type": "integer",
-          "minimum": 1024,
+          "minimum": -1,
           "maximum": 32768,
-          "description": "Token budget for extended thinking (Anthropic)"
+          "description": "Token budget for extended thinking (Anthropic, Google)"
         }
       ],
-      "examples": ["minimal", "low", "medium", "high", 1024, 32768]
+      "examples": ["minimal", "low", "medium", "high", -1, 0, 1024, 24576, 32768]
    },
    "additionalProperties": false
```
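The widened `oneOf` above can be mirrored in application code. A minimal sketch, assuming the schema's value ranges; the helper `validateThinkingBudget` is hypothetical and not part of cagent:

```go
package main

import "fmt"

// validateThinkingBudget mirrors the schema's oneOf: either an effort
// level string (OpenAI) or an integer token budget in [-1, 32768]
// (Anthropic/Google). Hypothetical helper, not part of cagent.
func validateThinkingBudget(v any) error {
	switch b := v.(type) {
	case string:
		switch b {
		case "minimal", "low", "medium", "high":
			return nil
		}
		return fmt.Errorf("invalid effort level %q", b)
	case int:
		if b < -1 || b > 32768 {
			return fmt.Errorf("token budget %d outside [-1, 32768]", b)
		}
		return nil
	default:
		return fmt.Errorf("thinking_budget must be a string or integer, got %T", v)
	}
}

func main() {
	fmt.Println(validateThinkingBudget("high")) // <nil>
	fmt.Println(validateThinkingBudget(-1))     // <nil>
	fmt.Println(validateThinkingBudget(40000))  // out-of-range error
}
```

Note that per-provider constraints (such as Anthropic's 1024 minimum) are stricter than this shared schema; the schema only enforces the union of the allowed ranges.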

docs/USAGE.md
Lines changed: 46 additions & 17 deletions

````diff
@@ -139,17 +139,17 @@ cagent run ./agent.yaml --command ls
 
 ### Model Properties
 
-| Property            | Type       | Description                                                           | Required |
-|---------------------|------------|-----------------------------------------------------------------------|----------|
-| `provider`          | string     | Provider: `openai`, `anthropic`, `dmr`                                | ✓        |
-| `model`             | string     | Model name (e.g., `gpt-4o`, `claude-sonnet-4-0`)                      | ✓        |
-| `temperature`       | float      | Randomness (0.0-1.0)                                                  | ✗        |
-| `max_tokens`        | integer    | Response length limit                                                 | ✗        |
-| `top_p`             | float      | Nucleus sampling (0.0-1.0)                                            | ✗        |
-| `frequency_penalty` | float      | Repetition penalty (0.0-2.0)                                          | ✗        |
-| `presence_penalty`  | float      | Topic repetition penalty (0.0-2.0)                                    | ✗        |
-| `base_url`          | string     | Custom API endpoint                                                   | ✗        |
-| `thinking_budget`   | string/int | Reasoning effort — OpenAI: effort string, Anthropic: token budget int | ✗        |
+| Property            | Type       | Description                                                                  | Required |
+|---------------------|------------|------------------------------------------------------------------------------|----------|
+| `provider`          | string     | Provider: `openai`, `anthropic`, `google`, `dmr`                             | ✓        |
+| `model`             | string     | Model name (e.g., `gpt-4o`, `claude-sonnet-4-0`, `gemini-2.5-flash`)         | ✓        |
+| `temperature`       | float      | Randomness (0.0-1.0)                                                         | ✗        |
+| `max_tokens`        | integer    | Response length limit                                                        | ✗        |
+| `top_p`             | float      | Nucleus sampling (0.0-1.0)                                                   | ✗        |
+| `frequency_penalty` | float      | Repetition penalty (0.0-2.0)                                                 | ✗        |
+| `presence_penalty`  | float      | Topic repetition penalty (0.0-2.0)                                           | ✗        |
+| `base_url`          | string     | Custom API endpoint                                                          | ✗        |
+| `thinking_budget`   | string/int | Reasoning effort — OpenAI: effort string, Anthropic/Google: token budget int | ✗        |
 
 #### Example
 
@@ -164,15 +164,16 @@ models:
     frequency_penalty: float # Repetition penalty (0.0-2.0)
     presence_penalty: float # Topic repetition penalty (0.0-2.0)
     parallel_tool_calls: boolean
-    thinking_budget: string|integer # OpenAI: effort level string; Anthropic: integer token budget
+    thinking_budget: string|integer # OpenAI: effort level string; Anthropic/Google: integer token budget
 ```
 
 ### Reasoning Effort (thinking_budget)
 
 Determine how much the model should think by setting the `thinking_budget`
 
 - **OpenAI**: use effort levels — `minimal`, `low`, `medium`, `high`
-- **Anthropic**: set an integer token budget. Minimum is 1024; range is 1024–32768; must be strictly less than `max_tokens`. When set, cagent uses Anthropic's Beta Messages API with interleaved thinking enabled.
+- **Anthropic**: set an integer token budget. Range is 1024–32768; must be strictly less than `max_tokens`.
+- **Google (Gemini)**: set an integer token budget. `0` disables thinking; `-1` enables dynamic thinking (model decides). Most models: 0–24576 tokens. Gemini 2.5 Pro: 128–32768 tokens (thinking cannot be disabled).
 
 Examples (OpenAI):
 
@@ -204,6 +205,31 @@ agents:
     instruction: you are a helpful assistant that doesn't think very much
 ```
 
+Examples (Google):
+
+```yaml
+models:
+  gemini-no-thinking:
+    provider: google
+    model: gemini-2.5-flash
+    thinking_budget: 0 # Disable thinking
+
+  gemini-dynamic:
+    provider: google
+    model: gemini-2.5-flash
+    thinking_budget: -1 # Dynamic thinking (model decides)
+
+  gemini-fixed:
+    provider: google
+    model: gemini-2.5-flash
+    thinking_budget: 8192 # Fixed token budget
+
+agents:
+  root:
+    model: gemini-fixed
+    instruction: you are a helpful assistant
+```
+
 #### Interleaved Thinking (Anthropic)
 
 Anthropic's interleaved thinking feature uses the Beta Messages API to provide tool calling during model reasoning. You can control this behavior using the `interleaved_thinking` provider option:
@@ -220,11 +246,14 @@ models:
 
 Notes:
 
-- If an invalid OpenAI effort value is set, the request will fail with a clear error
-- For Anthropic, values < 1024 or ≥ `max_tokens` are ignored (warning logged)
-- When `interleaved_thinking` is enabled, cagent uses Anthropic's Beta Messages API with a default thinking budget of 16384 tokens if not specified
+- **OpenAI**: If an invalid effort value is set, the request will fail with a clear error
+- **Anthropic**: Values < 1024 or ≥ `max_tokens` are ignored (warning logged). When `interleaved_thinking` is enabled, cagent uses Anthropic's Beta Messages API with a default thinking budget of 16384 tokens if not specified
+- **Google**:
+  - Most models support values between -1 and 24576 tokens. Set to `0` to disable, `-1` for dynamic thinking
+  - Gemini 2.5 Pro: supports 128–32768 tokens. Thinking cannot be disabled (minimum 128)
+  - Gemini 2.5 Flash-Lite: supports 512–24576 tokens. Set to `0` to disable, `-1` for dynamic thinking
 - For unsupported providers, `thinking_budget` has no effect
-- Debug logs include the applied effort (e.g., "OpenAI request using thinking_budget", "Anthropic Beta API using thinking_budget")
+- Debug logs include the applied effort (e.g., "OpenAI request using thinking_budget", "Gemini request using thinking_budget")
 
 See `examples/thinking_budget.yaml` for a complete runnable demo.
````
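The per-model ranges in the notes above can be condensed into a single predicate. A hedged sketch: the model names come from the docs, but the helper itself is illustrative, and whether cagent validates these ranges client-side is not shown in this diff:

```go
package main

import "fmt"

// geminiBudgetValid condenses the per-model thinking_budget limits:
// -1 means dynamic thinking, 0 disables it (where allowed), and each
// model family has its own fixed-budget range. Illustrative helper.
func geminiBudgetValid(model string, budget int) bool {
	switch model {
	case "gemini-2.5-pro":
		// Pro cannot disable thinking: only -1 (dynamic) or 128-32768.
		return budget == -1 || (budget >= 128 && budget <= 32768)
	case "gemini-2.5-flash-lite":
		// Flash-Lite: off (0), dynamic (-1), or 512-24576.
		return budget == 0 || budget == -1 || (budget >= 512 && budget <= 24576)
	default:
		// Most models: off, dynamic, or up to 24576 tokens.
		return budget >= -1 && budget <= 24576
	}
}

func main() {
	fmt.Println(geminiBudgetValid("gemini-2.5-pro", 0))      // false
	fmt.Println(geminiBudgetValid("gemini-2.5-flash", 8192)) // true
}
```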

examples/thinking_budget.yaml
Lines changed: 24 additions & 3 deletions

```diff
@@ -9,8 +9,9 @@ agents:
   root:
     model: gpt-5-mini-min # <- try with gpt-5-mini-high
     # model: claude-4-5-sonnet-min # <- try with claude-4-5-sonnet-high
+    # model: gemini-2-5-flash-dynamic-thinking # <- try with -no-thinking, -low or -high variants
     description: a helpful assistant that thinks
-    instruction: you are a helpful assistant
+    instruction: you are a helpful assistant who can also use tools, but only if you need to
     commands:
       demo: "hey i need python code for a mandelbrot fractal"
     toolsets:
@@ -35,6 +36,26 @@ models:
   claude-4-5-sonnet-high:
     provider: anthropic
     model: claude-sonnet-4-5-20250929
-    thinking_budget: 32768 # <- tokens, 32768 is the suggested maximum without batching
+    thinking_budget: 32768 # <- tokens, 32768 is the Anthropic suggested maximum without batching
     provider_opts:
-      interleaved_thinking: true # <- enable interleaved thinking, aka tool calling during model reasoning
+      interleaved_thinking: true # <- enables interleaved thinking, aka tool calling during model reasoning
+
+  gemini-2-5-flash-dynamic-thinking:
+    provider: google
+    model: gemini-2.5-flash
+    thinking_budget: -1 # <- google only, dynamic thinking
+
+  gemini-2-5-flash-no-thinking:
+    provider: google
+    model: gemini-2.5-flash
+    thinking_budget: 0 # <- google only, no thinking
+
+  gemini-2-5-flash-low:
+    provider: google
+    model: gemini-2.5-flash
+    thinking_budget: 1024
+
+  gemini-2-5-flash-high:
+    provider: google
+    model: gemini-2.5-flash
+    thinking_budget: 24576 # <- google's maximum thinking budget for all models except Gemini 2.5 Pro (max 32768)
```

pkg/chat/chat.go
Lines changed: 1 addition & 0 deletions

```diff
@@ -119,6 +119,7 @@ type Usage struct {
 	OutputTokens       int `json:"output_tokens"`
 	CachedInputTokens  int `json:"cached_input_tokens"`
 	CachedOutputTokens int `json:"cached_output_tokens"`
+	ReasoningTokens    int `json:"reasoning_tokens,omitempty"`
 }
 
 // MessageStream interface represents a stream of chat completions
```

pkg/model/provider/gemini/adapter.go
Lines changed: 11 additions & 2 deletions

```diff
@@ -179,20 +179,29 @@ func (g *StreamAdapter) Recv() (chat.MessageStreamResponse, error) {
 			OutputTokens:       int(res.resp.UsageMetadata.CandidatesTokenCount),
 			CachedInputTokens:  int(res.resp.UsageMetadata.CachedContentTokenCount),
 			CachedOutputTokens: 0, // Gemini doesn't provide cached output tokens
+			ReasoningTokens:    int(res.resp.UsageMetadata.ThoughtsTokenCount),
 		}
 	}
 
-	// Handle text content without using Text() to avoid warnings
+	// Handle text and thoughts separately so TUI can render them distinctly
 	var textContent string
+	var reasoningText string
 	for _, candidate := range res.resp.Candidates {
 		if candidate.Content != nil {
 			for _, part := range candidate.Content.Parts {
 				if part.Text != "" {
-					textContent += part.Text
+					if part.Thought {
+						reasoningText += part.Text
+					} else {
+						textContent += part.Text
+					}
 				}
 			}
 		}
 	}
+	if reasoningText != "" {
+		resp.Choices[0].Delta.ReasoningContent = reasoningText
+	}
 	if textContent != "" {
 		resp.Choices[0].Delta.Content = textContent
 	}
```
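The part-splitting loop above can be isolated for testing. A sketch on a simplified part type (a stand-in, not the real `genai.Part`):

```go
package main

import "fmt"

// part is a simplified stand-in for genai.Part: streamed chunks carry a
// Thought flag marking reasoning text.
type part struct {
	Text    string
	Thought bool
}

// splitParts mirrors the adapter's loop: thought parts accumulate into
// reasoning text, everything else into visible content.
func splitParts(parts []part) (text, reasoning string) {
	for _, p := range parts {
		if p.Text == "" {
			continue
		}
		if p.Thought {
			reasoning += p.Text
		} else {
			text += p.Text
		}
	}
	return text, reasoning
}

func main() {
	text, reasoning := splitParts([]part{
		{Text: "Let me check... ", Thought: true},
		{Text: "The answer is 4."},
	})
	fmt.Println(text)      // The answer is 4.
	fmt.Println(reasoning) // Let me check...
}
```

Keeping the two streams separate is what lets the TUI render reasoning in a distinct style instead of interleaving it with the answer.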

pkg/model/provider/gemini/client.go
Lines changed: 24 additions & 0 deletions

```diff
@@ -220,6 +220,30 @@ func (c *Client) buildConfig() *genai.GenerateContentConfig {
 	if c.config.MaxTokens > 0 {
 		config.MaxOutputTokens = int32(c.config.MaxTokens)
 	}
+
+	// Apply thinking budget for Gemini models using token-based configuration.
+	// Per official docs: https://ai.google.dev/gemini-api/docs/thinking
+	// - Set thinkingBudget to 0 to disable thinking
+	// - Set thinkingBudget to -1 for dynamic thinking (model decides)
+	// - Set to a specific value for a fixed token budget;
+	//   maximum is 24576 for all models except Gemini 2.5 Pro (max 32768)
+	if c.config.ThinkingBudget != nil {
+		if config.ThinkingConfig == nil {
+			config.ThinkingConfig = &genai.ThinkingConfig{}
+		}
+		config.ThinkingConfig.IncludeThoughts = true
+		tokens := c.config.ThinkingBudget.Tokens
+		config.ThinkingConfig.ThinkingBudget = genai.Ptr(int32(tokens))
+
+		switch tokens {
+		case 0:
+			slog.Debug("Gemini request with thinking disabled", "budget_tokens", tokens)
+		case -1:
+			slog.Debug("Gemini request with dynamic thinking", "budget_tokens", tokens)
+		default:
+			slog.Debug("Gemini request using thinking_budget", "budget_tokens", tokens)
+		}
+	}
 	return config
 }
```
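Note the `genai.Ptr(int32(tokens))` call: the SDK takes a `*int32` so that nil ("budget not set", provider default) stays distinct from an explicit 0 ("thinking disabled"). A sketch of the same pattern with illustrative stand-in types:

```go
package main

import "fmt"

// ptr mimics genai.Ptr: returning a pointer lets the request distinguish
// "budget not set" (nil) from an explicit 0 ("disabled").
func ptr[T any](v T) *T { return &v }

// thinkingConfig is an illustrative stand-in for genai.ThinkingConfig.
type thinkingConfig struct {
	IncludeThoughts bool
	ThinkingBudget  *int32 // nil = provider default
}

// describeBudget mirrors the debug-log switch in buildConfig.
func describeBudget(tokens int32) string {
	switch tokens {
	case 0:
		return "thinking disabled"
	case -1:
		return "dynamic thinking"
	default:
		return fmt.Sprintf("fixed budget of %d tokens", tokens)
	}
}

func main() {
	cfg := thinkingConfig{IncludeThoughts: true, ThinkingBudget: ptr(int32(8192))}
	fmt.Println(describeBudget(*cfg.ThinkingBudget)) // fixed budget of 8192 tokens
}
```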

pkg/model/provider/oaistream/adapter.go
Lines changed: 4 additions & 0 deletions

```diff
@@ -49,10 +49,14 @@ func (a *StreamAdapter) Recv() (chat.MessageStreamResponse, error) {
 		OutputTokens:       openaiResponse.Usage.CompletionTokens,
 		CachedInputTokens:  0,
 		CachedOutputTokens: 0,
+		ReasoningTokens:    0,
 	}
 	if openaiResponse.Usage.PromptTokensDetails != nil {
 		response.Usage.CachedInputTokens = openaiResponse.Usage.PromptTokensDetails.CachedTokens
 	}
+	if openaiResponse.Usage.CompletionTokensDetails != nil {
+		response.Usage.ReasoningTokens = openaiResponse.Usage.CompletionTokensDetails.ReasoningTokens
+	}
 	// Use the tracked finish reason instead of hardcoding stop
 	finishReason := a.lastFinishReason
 	if finishReason == "" {
```
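The new guard mirrors the existing `PromptTokensDetails` check: the details struct is optional in the response, so dereferencing without a nil check would panic. A reduced sketch with stand-in types:

```go
package main

import "fmt"

// completionTokensDetails and usage are reduced stand-ins for the OpenAI
// response types; the details struct is optional on the wire.
type completionTokensDetails struct {
	ReasoningTokens int
}

type usage struct {
	CompletionTokens        int
	CompletionTokensDetails *completionTokensDetails
}

// reasoningTokens extracts the detail safely, defaulting to 0 when the
// provider omits the struct.
func reasoningTokens(u usage) int {
	if u.CompletionTokensDetails != nil {
		return u.CompletionTokensDetails.ReasoningTokens
	}
	return 0
}

func main() {
	fmt.Println(reasoningTokens(usage{CompletionTokens: 100})) // 0
	fmt.Println(reasoningTokens(usage{
		CompletionTokens:        100,
		CompletionTokensDetails: &completionTokensDetails{ReasoningTokens: 42},
	})) // 42
}
```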

pkg/runtime/runtime.go
Lines changed: 3 additions & 3 deletions

```diff
@@ -481,19 +481,19 @@ func (r *runtime) handleStream(ctx context.Context, stream chat.MessageStream, a
 	if response.Usage != nil {
 		if m != nil {
 			sess.Cost += (float64(response.Usage.InputTokens)*m.Cost.Input +
-				float64(response.Usage.OutputTokens)*m.Cost.Output +
+				float64(response.Usage.OutputTokens+response.Usage.ReasoningTokens)*m.Cost.Output +
 				float64(response.Usage.CachedInputTokens)*m.Cost.CacheRead +
 				float64(response.Usage.CachedOutputTokens)*m.Cost.CacheWrite) / 1e6
 		}
 
 		sess.InputTokens = response.Usage.InputTokens + response.Usage.CachedInputTokens
-		sess.OutputTokens = response.Usage.OutputTokens + response.Usage.CachedOutputTokens
+		sess.OutputTokens = response.Usage.OutputTokens + response.Usage.CachedOutputTokens + response.Usage.ReasoningTokens
 
 		modelName := "unknown"
 		if m != nil {
 			modelName = m.Name
 		}
-		telemetry.RecordTokenUsage(ctx, modelName, int64(response.Usage.InputTokens), int64(response.Usage.OutputTokens), sess.Cost)
+		telemetry.RecordTokenUsage(ctx, modelName, int64(response.Usage.InputTokens), int64(response.Usage.OutputTokens+response.Usage.ReasoningTokens), sess.Cost)
	}
 
 	if len(response.Choices) == 0 {
```
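The updated accounting bills reasoning tokens at the output rate, so sessions that enable thinking show the full spend. A standalone sketch of the arithmetic (field and type names are illustrative; rates are dollars per million tokens, as in the `/ 1e6` above):

```go
package main

import "fmt"

// usage and rates are illustrative stand-ins for chat.Usage and the model
// cost table; rates are dollars per million tokens.
type usage struct{ In, Out, Reasoning, CachedIn, CachedOut int }
type rates struct{ Input, Output, CacheRead, CacheWrite float64 }

// sessionCost mirrors the updated accounting: reasoning tokens are billed
// at the same rate as ordinary output tokens.
func sessionCost(u usage, r rates) float64 {
	return (float64(u.In)*r.Input +
		float64(u.Out+u.Reasoning)*r.Output +
		float64(u.CachedIn)*r.CacheRead +
		float64(u.CachedOut)*r.CacheWrite) / 1e6
}

func main() {
	// 1000 input tokens at $3/M plus 500+500 output-rate tokens at $15/M.
	c := sessionCost(
		usage{In: 1000, Out: 500, Reasoning: 500},
		rates{Input: 3, Output: 15},
	)
	fmt.Println(c) // 0.018
}
```

Before this change the 500 reasoning tokens would have been dropped from both the cost and the telemetry output count.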
