Commit 8c0b2ee

Lumi-node and claude committed
Add model_showdown demo with real Ollama LLM competition
- model_showdown.py: 3 Ollama models compete on coding tasks with LLM judging
- Fix Warrior to properly call LLMProvider.complete() with messages
- Fix Elder to wire up LLMJudge when given an LLM provider
- Fix vendored dynabots_core Ollama provider for Pydantic SDK responses
- Falls back to mock warriors if Ollama not available

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 55cd8bc commit 8c0b2ee

4 files changed

Lines changed: 318 additions & 23 deletions
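Note: the Warrior fix listed in the commit message is one of the 4 changed files but its diff is not shown on this page. Below is a minimal, hypothetical sketch of the call pattern that fix describes; the messages list shape, the temperature keyword, and the ask() helper are assumptions, while LLMResponse.content comes from the provider diff further down.

# Hypothetical sketch only — the actual Warrior diff is not part of this view.
async def ask(provider, system_prompt: str, task: str, temperature: float = 0.2) -> str:
    # Commit message: "call LLMProvider.complete() with messages"
    response = await provider.complete(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": task},
        ],
        temperature=temperature,
    )
    return response.content  # LLMResponse.content, per the provider diff below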

dynabots_core/providers/ollama.py

Lines changed: 20 additions & 11 deletions
@@ -156,30 +156,39 @@ async def complete(
         # Make the request
         response = await self._client.chat(**kwargs)

+        # Handle both dict and Pydantic response objects
+        msg = response.message if hasattr(response, "message") else response.get("message", {})
+        content = msg.content if hasattr(msg, "content") else msg.get("content", "")
+
         # Extract tool calls if present
         tool_calls = None
-        if "message" in response and "tool_calls" in response["message"]:
-            tool_calls = response["message"]["tool_calls"]
+        msg_tools = getattr(msg, "tool_calls", None) or (msg.get("tool_calls") if isinstance(msg, dict) else None)
+        if msg_tools:
+            tool_calls = msg_tools
+
+        # Extract usage
+        prompt_tokens = getattr(response, "prompt_eval_count", 0) or (response.get("prompt_eval_count", 0) if isinstance(response, dict) else 0)
+        completion_tokens = getattr(response, "eval_count", 0) or (response.get("eval_count", 0) if isinstance(response, dict) else 0)

-        # Build response
         return LLMResponse(
-            content=response["message"]["content"],
+            content=content,
             model=self._model,
             tool_calls=tool_calls,
             usage={
-                "prompt_tokens": response.get("prompt_eval_count", 0),
-                "completion_tokens": response.get("eval_count", 0),
-                "total_tokens": (
-                    response.get("prompt_eval_count", 0)
-                    + response.get("eval_count", 0)
-                ),
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": completion_tokens,
+                "total_tokens": prompt_tokens + completion_tokens,
             },
         )

     async def list_models(self) -> List[str]:
         """List available models on the Ollama server."""
         response = await self._client.list()
-        return [model["name"] for model in response["models"]]
+        models = response.models if hasattr(response, "models") else response.get("models", [])
+        return [
+            m.model if hasattr(m, "model") else m.get("name", m.get("model", str(m)))
+            for m in models
+        ]

     async def pull_model(self, model: str) -> None:
         """Pull a model from the Ollama library."""

examples/model_showdown.py

Lines changed: 270 additions & 0 deletions
@@ -0,0 +1,270 @@
"""
Model Showdown - Real LLMs compete in the Arena.

Three different models (or prompting strategies) compete on coding tasks.
An LLM Judge evaluates their output quality. The best model earns Warchief.

Requirements:
    pip install orc-arena ollama

Setup:
    1. Install Ollama: https://ollama.ai
    2. Pull models:
        ollama pull qwen2.5:14b
        ollama pull phi4-mini
        ollama pull gemma3:1b
    3. Run: python examples/model_showdown.py

Works with whatever Ollama models you have installed.
Falls back to mock warriors if Ollama isn't available.
"""

import asyncio
import sys


# ─── Arena Setup ─────────────────────────────────────────────────────

CODING_CHALLENGES = [
    "Write a Python function that checks if a string is a valid palindrome, ignoring spaces and punctuation. Include edge cases.",
    "Write a Python function to find the two numbers in a list that add up to a target sum. Return their indices.",
    "Write a Python class for a simple LRU cache with get() and put() methods. Use O(1) time complexity.",
    "Write a Python function that flattens a deeply nested dictionary into dot-notation keys. Example: {'a': {'b': 1}} -> {'a.b': 1}",
    "Write a Python async function that fetches multiple URLs concurrently with a max concurrency limit of 5.",
]


async def run_with_ollama():
    """Run the showdown with real Ollama models."""
    from dynabots_core.providers import OllamaProvider
    from orc import Warrior, Elder, TheArena
    from orc.judges import LLMJudge

    # Check which models are available
    probe = OllamaProvider(model="qwen2.5:14b")
    try:
        available = await probe.list_models()
    except Exception as e:
        print(f" Ollama probe failed: {type(e).__name__}: {e}")
        return False

    available_names = [m.split(":")[0] for m in available]
    print(f" Ollama models found: {', '.join(available)}\n")

    # Pick 3 models — prefer variety in size/family
    # Order of preference for each warrior slot
    warrior_configs = [
        {
            "name": "Qwen",
            "prefer": ["qwen2.5:14b", "qwen3.5-35b-a3b:latest", "qwen2.5:7b"],
            "prompt": "You are a senior Python engineer. Write clean, efficient, well-documented code. Include type hints.",
            "temp": 0.2,
        },
        {
            "name": "Phi",
            "prefer": ["phi4:latest", "phi4-mini:latest", "phi3:latest"],
            "prompt": "You are an expert Python developer. Write concise, production-ready code with error handling.",
            "temp": 0.3,
        },
        {
            "name": "Gemma",
            "prefer": ["gemma3:1b", "gemma2:latest", "gemma:latest"],
            "prompt": "You are a Python programmer. Write simple, correct code. Focus on readability.",
            "temp": 0.4,
        },
    ]

    # Match warriors to available models
    warriors = []
    used_models = set()
    for config in warrior_configs:
        model = None
        for pref in config["prefer"]:
            if pref in available and pref not in used_models:
                model = pref
                break
        if not model:
            # Fall back to any unused model
            for m in available:
                if m not in used_models:
                    model = m
                    break
        if not model:
            continue

        used_models.add(model)
        llm = OllamaProvider(model=model)
        warriors.append(
            Warrior(
                name=f"{config['name']} ({model})",
                llm_client=llm,
                system_prompt=config["prompt"],
                temperature=config["temp"],
                domains=["coding", "python"],
                capabilities=["code_generation", "problem_solving"],
            )
        )

    if len(warriors) < 2:
        print(" Need at least 2 models. Pull more with: ollama pull <model>")
        return False

    print(f" Warriors entering the Arena:")
    for w in warriors:
        print(f"   - {w.name}")
    print()

    # The Elder uses the strongest available model to judge
    judge_model = warriors[0].llm_client  # Use the first (typically strongest) model
    elder = Elder(
        llm=judge_model,
        evaluation_criteria="correctness, code quality, efficiency, edge case handling",
    )

    # Create the Arena
    arena = TheArena(
        warriors=warriors,
        elder=elder,
        challenge_probability=0.9,  # High probability — we want to see fights
    )

    # ─── The Showdown ────────────────────────────────────────────────

    print("=" * 70)
    print("THE SHOWDOWN BEGINS")
    print("=" * 70)

    for i, challenge in enumerate(CODING_CHALLENGES, 1):
        short = challenge.split(".")[0][:60]
        print(f"\n{'─' * 70}")
        print(f" ROUND {i}: {short}...")
        print(f"{'─' * 70}")

        result = await arena.battle(challenge)

        # Show the winning response (truncated)
        if result.winner_result and result.winner_result.data:
            response = result.winner_result.data.get("response", "")
            duration = result.winner_result.duration_ms or 0

            # Show first few lines of the winning code
            lines = response.strip().split("\n")
            preview = "\n".join(lines[:8])
            if len(lines) > 8:
                preview += f"\n ... ({len(lines) - 8} more lines)"

            print(f"\n Winner: {result.winner} ({duration}ms)")

            if result.was_challenged and result.verdict:
                reason = result.verdict.reasoning.split("\n")[0][:80]
                print(f" Reason: {reason}")

            print(f"\n Code preview:")
            for line in preview.split("\n"):
                print(f"   {line}")

    # ─── Final Leaderboard ───────────────────────────────────────────

    print(f"\n\n{'=' * 70}")
    print("FINAL LEADERBOARD")
    print(f"{'=' * 70}")

    leaderboard = arena.get_leaderboard("coding")
    for i, entry in enumerate(leaderboard, 1):
        crown = " [WARCHIEF]" if entry["is_warlord"] else ""
        print(
            f" {i}. {entry['agent']:30s} "
            f"Rep: {entry['reputation']:.2f} "
            f"W:{entry['wins']} L:{entry['losses']}{crown}"
        )

    # Show trial history
    history = arena.get_trial_history()
    if history:
        print(f"\n Trials fought: {len(history)}")
        for trial in history:
            if trial.verdict:
                print(
                    f"   {trial.verdict.reasoning.split(chr(10))[0][:70]}"
                )

    print(f"\n{'=' * 70}")
    print("Showdown complete!")
    print(f"{'=' * 70}")
    return True


async def run_mock_fallback():
    """Fallback demo with mock warriors (no LLM needed)."""
    from orc import Warrior, Elder, TheArena

    print(" Running with mock warriors (no real LLM).")
    print(" Install Ollama for the real experience: https://ollama.ai\n")

    warriors = [
        Warrior(
            name="GPT-4o (mock)",
            llm_client="gpt-4o",
            system_prompt="Senior engineer",
            domains=["coding", "python"],
            capabilities=["code_generation"],
        ),
        Warrior(
            name="Claude (mock)",
            llm_client="claude-sonnet",
            system_prompt="Expert developer",
            domains=["coding", "python"],
            capabilities=["code_generation"],
        ),
        Warrior(
            name="Qwen (mock)",
            llm_client="qwen2.5",
            system_prompt="Python programmer",
            domains=["coding", "python"],
            capabilities=["code_generation"],
        ),
    ]

    elder = Elder(evaluator_model="metrics")

    arena = TheArena(
        warriors=warriors,
        elder=elder,
        challenge_probability=0.8,
    )

    for challenge in CODING_CHALLENGES[:3]:
        short = challenge.split(".")[0][:50]
        print(f"\n Round: {short}...")
        result = await arena.battle(challenge)
        print(f" Victor: {result.winner}")

    print(f"\n Leaderboard:")
    for entry in arena.get_leaderboard("coding"):
        crown = " [WARCHIEF]" if entry["is_warlord"] else ""
        print(
            f"   {entry['agent']:20s} Rep: {entry['reputation']:.2f} "
            f"W:{entry['wins']} L:{entry['losses']}{crown}"
        )


async def main():
    print()
    print("=" * 70)
    print(" ORC!! MODEL SHOWDOWN")
    print(" Real LLMs compete for the coding throne")
    print("=" * 70)
    print()

    # Try Ollama first, fall back to mock
    try:
        success = await run_with_ollama()
        if not success:
            await run_mock_fallback()
    except Exception as e:
        print(f" Failed to use Ollama: {type(e).__name__}: {e}\n")
        await run_mock_fallback()


if __name__ == "__main__":
    asyncio.run(main())

orc/themed/elder.py

Lines changed: 12 additions & 6 deletions
@@ -30,6 +30,7 @@ def __init__(
         evaluator_model: Optional[str] = None,
         evaluation_criteria: Optional[str] = None,
         judge: Optional[Judge] = None,
+        llm: Optional[Any] = None,
     ):
         """
         Initialize an Elder.
@@ -38,9 +39,11 @@ def __init__(
             evaluator_model: LLM model to use for evaluation (e.g., "claude-3-opus").
             evaluation_criteria: Custom criteria for evaluation.
             judge: Pre-built Judge instance (overrides evaluator_model if provided).
+            llm: LLMProvider instance for LLM-based judging.
         """
         self.evaluator_model = evaluator_model
         self.evaluation_criteria = evaluation_criteria
+        self._llm = llm
         self._judge = judge

     @property
@@ -49,13 +52,16 @@ def judge(self) -> Judge:
         if self._judge is not None:
             return self._judge

-        # If no judge was provided, create a MetricsJudge as fallback
-        if self.evaluator_model is None:
-            # Default to MetricsJudge
-            self._judge = MetricsJudge()
+        # If an LLM provider was given, use LLMJudge
+        if self._llm is not None:
+            criteria = (
+                [c.strip() for c in self.evaluation_criteria.split(",")]
+                if self.evaluation_criteria
+                else None
+            )
+            self._judge = LLMJudge(self._llm, criteria=criteria)
         else:
-            # Would create LLMJudge if we had LLM provider setup
-            # For now, fall back to MetricsJudge
+            # Default to MetricsJudge
            self._judge = MetricsJudge()

         return self._judge
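A short sketch of how the new wiring behaves: passing an LLM provider makes the lazily-built judge an LLMJudge, with the comma-separated criteria string split into a list, while omitting it keeps the MetricsJudge fallback. Import paths follow the modules used elsewhere in this commit.

# Sketch of the new Elder behaviour after this commit.
from dynabots_core.providers import OllamaProvider
from orc import Elder

# With an LLM provider, the judge property builds an LLMJudge and splits the
# criteria string into ["correctness", "code quality", "efficiency"].
elder = Elder(
    llm=OllamaProvider(model="qwen2.5:14b"),
    evaluation_criteria="correctness, code quality, efficiency",
)
print(type(elder.judge).__name__)    # LLMJudge

# Without one, it still falls back to MetricsJudge.
print(type(Elder().judge).__name__)  # MetricsJudge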
