diff --git a/.gitignore b/.gitignore index a781e8f..2b043c0 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ wheels/ # Virtual environments .venv +.ruff-venv # Misc .DS_Store @@ -16,3 +17,11 @@ simulation_results/ # Result files *.json + +# Jupyter notebooks +*.ipynb + +# Syncthing temporary files +.syncthing.*.tmp + +.ruff-venv diff --git a/players/player_10/.syncthing.__init__.py.tmp b/players/player_10/.syncthing.__init__.py.tmp new file mode 100644 index 0000000..a2d1396 Binary files /dev/null and b/players/player_10/.syncthing.__init__.py.tmp differ diff --git a/players/player_10/Analyse_results.ipynb b/players/player_10/Analyse_results.ipynb new file mode 100644 index 0000000..c9deba4 --- /dev/null +++ b/players/player_10/Analyse_results.ipynb @@ -0,0 +1,891 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ac93bef8-0df4-4f98-bc7e-f1fc50bc29cb", + "metadata": {}, + "source": [ + "\"\"\"\n", + "Results analysis and visualization tools for Monte Carlo simulations.\n", + "\n", + "This module provides tools to analyze simulation results and create visualizations\n", + "to understand the performance of different Player10 configurations.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "0264f566-fb53-4e69-974b-d6cd7313bc9d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Matplotlib is building the font cache; this may take a moment.\n" + ] + }, + { + "ename": "ImportError", + "evalue": "attempted relative import with no known parent package", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 15\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mseaborn\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01msns\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msim\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmonte_carlo\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m MonteCarloSimulator, SimulationResult\n", + "\u001b[0;31mImportError\u001b[0m: attempted relative import with no known parent package" + ] + } + ], + "source": [ + "from __future__ import annotations\n", + "\n", + "import argparse\n", + "import os\n", + "from pathlib import Path\n", + "from typing import Any\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "\n", + "from ..sim.monte_carlo import MonteCarloSimulator, SimulationResult" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5efd409-016b-4a02-ab5d-3a507b905b46", + "metadata": {}, + "outputs": [], + "source": [ + "# ----------------------------\n", + "# Analyzer\n", + "# ----------------------------\n", + "\n", + "class ResultsAnalyzer:\n", + " \"\"\"Analyzer for Monte Carlo simulation results (enhanced).\"\"\"\n", + 
"\n", + " def __init__(self, results_file: str | None = None):\n", + " \"\"\"\n", + " Initialize the analyzer.\n", + "\n", + " Args:\n", + " results_file: Path to results JSON file to load\n", + " \"\"\"\n", + " self.simulator = MonteCarloSimulator()\n", + " self.results: list[SimulationResult] = []\n", + " self.metadata: dict[str, Any] = {}\n", + "\n", + " if results_file:\n", + " self.load_results(results_file)\n", + "\n", + " # ---------- IO ----------\n", + "\n", + " def load_results(self, filename: str):\n", + " \"\"\"Load results from a JSON file (produced by MonteCarloSimulator.save_results).\"\"\"\n", + " self.results = self.simulator.load_results(filename)\n", + " self.metadata = self.simulator.last_metadata\n", + " print(f'Loaded {len(self.results)} simulation results')\n", + "\n", + " # ---------- DataFrames ----------\n", + "\n", + " def create_dataframe(self) -> pd.DataFrame:\n", + " \"\"\"\n", + " Convert results to a pandas DataFrame for run-level analysis.\n", + "\n", + " Includes:\n", + " - All important config knobs (altruism prob, tau, epsilons, weights)\n", + " - Run outcomes (scores, lengths, pauses, etc.)\n", + " - Shared component breakdown (prefixed with shared_)\n", + " - Derived feature: length_utilization\n", + " \"\"\"\n", + " data: list[dict[str, Any]] = []\n", + "\n", + " for r in self.results:\n", + " row = {\n", + " # core config knobs\n", + " 'altruism_prob': r.config.altruism_prob,\n", + " 'tau_margin': r.config.tau_margin,\n", + " 'epsilon_fresh': r.config.epsilon_fresh,\n", + " 'epsilon_mono': r.config.epsilon_mono,\n", + " 'subjects': r.config.subjects,\n", + " 'memory_size': r.config.memory_size,\n", + " 'conversation_length_cfg': r.config.conversation_length,\n", + " 'seed': r.config.seed,\n", + "\n", + " # weights / algo params\n", + " 'min_samples_pid': r.config.min_samples_pid,\n", + " 'ewma_alpha': r.config.ewma_alpha,\n", + " 'importance_weight': r.config.importance_weight,\n", + " 'coherence_weight': r.config.coherence_weight,\n", + " 'freshness_weight': r.config.freshness_weight,\n", + " 'monotony_weight': r.config.monotony_weight,\n", + "\n", + " # run-level outcomes\n", + " 'total_score': r.total_score,\n", + " 'player10_score': r.player10_total_mean,\n", + " 'player10_individual': r.player10_individual_mean,\n", + " 'player10_rank': r.player10_rank_mean,\n", + " 'player10_gap_to_best': r.player10_gap_to_best,\n", + " 'player10_instances': r.player10_instances,\n", + " 'best_total_score': r.best_total_score,\n", + " 'conversation_length': r.conversation_length,\n", + " 'early_termination': float(r.early_termination),\n", + " 'pause_count': r.pause_count,\n", + " 'unique_items_used': r.unique_items_used,\n", + " 'execution_time': r.execution_time,\n", + " }\n", + "\n", + " # Include shared score components (flatten)\n", + " for comp, val in (r.score_breakdown or {}).items():\n", + " if comp == 'total':\n", + " continue\n", + " row[f'shared_{comp}'] = val\n", + "\n", + " data.append(row)\n", + "\n", + " df = pd.DataFrame(data)\n", + "\n", + " # Derived features\n", + " if 'conversation_length_cfg' in df and 'conversation_length' in df:\n", + " with np.errstate(divide='ignore', invalid='ignore'):\n", + " df['length_utilization'] = df['conversation_length'] / df['conversation_length_cfg']\n", + "\n", + " return df\n", + "\n", + " def create_player_long(self) -> pd.DataFrame:\n", + " \"\"\"\n", + " Explode SimulationResult.player_metrics into a long-form dataframe.\n", + "\n", + " Columns: seed, config knobs, label, class_name, alias, total, 
shared, individual, rank\n", + " Useful for rank distributions and per-player analyses.\n", + " \"\"\"\n", + " rows: list[dict[str, Any]] = []\n", + " for r in self.results:\n", + " cfg = {\n", + " 'altruism_prob': r.config.altruism_prob,\n", + " 'tau_margin': r.config.tau_margin,\n", + " 'epsilon_fresh': r.config.epsilon_fresh,\n", + " 'epsilon_mono': r.config.epsilon_mono,\n", + " 'seed': r.config.seed,\n", + " }\n", + " for label, m in (r.player_metrics or {}).items():\n", + " rows.append({\n", + " **cfg,\n", + " 'label': label,\n", + " 'class_name': m.get('class_name'),\n", + " 'alias': m.get('alias'),\n", + " 'total': m.get('total'),\n", + " 'shared': m.get('shared'),\n", + " 'individual': m.get('individual'),\n", + " 'rank': m.get('rank'),\n", + " })\n", + " return pd.DataFrame(rows)\n", + "\n", + " # ---------- Statistics helpers ----------\n", + "\n", + " def bootstrap_ci(\n", + " self,\n", + " df: pd.DataFrame,\n", + " group_cols: list[str],\n", + " metric: str,\n", + " B: int = 1000,\n", + " ci: float = 0.95\n", + " ) -> pd.DataFrame:\n", + " \"\"\"\n", + " Bootstrapped mean & CI for (group_cols, metric).\n", + " Returns columns: group_cols..., mean, ci_low, ci_high, n\n", + " \"\"\"\n", + " out = []\n", + " q_lo, q_hi = (1 - ci) / 2, 1 - (1 - ci) / 2\n", + " for key, g in df.groupby(group_cols):\n", + " values = g[metric].dropna().to_numpy()\n", + " if len(values) == 0:\n", + " continue\n", + " boot = []\n", + " for _ in range(B):\n", + " sample = np.random.choice(values, size=len(values), replace=True)\n", + " boot.append(sample.mean())\n", + " lo, hi = np.quantile(boot, [q_lo, q_hi])\n", + " row = {'mean': values.mean(), 'ci_low': float(lo), 'ci_high': float(hi), 'n': len(values)}\n", + " # Attach group key(s)\n", + " if isinstance(key, tuple):\n", + " for c, v in zip(group_cols, key, strict=False):\n", + " row[c] = v\n", + " else:\n", + " row[group_cols[0]] = key\n", + " out.append(row)\n", + " cols = group_cols + ['mean', 'ci_low', 'ci_high', 'n']\n", + " return pd.DataFrame(out)[cols]\n", + "\n", + " def pairwise_altruism_deltas(self, metric: str = 'total_score') -> pd.DataFrame:\n", + " \"\"\"\n", + " Pairwise mean delta & Cohen's d between altruism levels for a metric.\n", + " Returns: a, b, delta_mean, cohens_d, nx, ny\n", + " \"\"\"\n", + " df = self.create_dataframe()\n", + " levels = sorted(df['altruism_prob'].unique())\n", + " rows = []\n", + " for i, a in enumerate(levels):\n", + " for b in levels[i + 1:]:\n", + " x = df[df.altruism_prob == a][metric].dropna()\n", + " y = df[df.altruism_prob == b][metric].dropna()\n", + " if len(x) and len(y):\n", + " delta = y.mean() - x.mean()\n", + " pooled = np.sqrt(\n", + " ((x.var(ddof=1) * (len(x) - 1)) + (y.var(ddof=1) * (len(y) - 1)))\n", + " / (len(x) + len(y) - 2)\n", + " )\n", + " d = delta / pooled if pooled > 0 else np.nan\n", + " rows.append({\n", + " 'a': a, 'b': b,\n", + " 'delta_mean': float(delta),\n", + " 'cohens_d': float(d),\n", + " 'nx': int(len(x)), 'ny': int(len(y)),\n", + " })\n", + " return pd.DataFrame(rows)\n", + "\n", + " # ---------- Plots (existing + new) ----------\n", + "\n", + " def plot_altruism_comparison(self, save_path: str | None = None):\n", + " \"\"\"Create plots comparing different altruism probabilities (existing, kept).\"\"\"\n", + " if not self.results:\n", + " print('No results loaded. 
Please load results first.')\n", + " return\n", + "\n", + " df = self.create_dataframe()\n", + "\n", + " altruism_groups = (\n", + " df.groupby('altruism_prob')\n", + " .agg(\n", + " {\n", + " 'total_score': ['mean', 'std', 'count'],\n", + " 'player10_score': ['mean', 'std'],\n", + " 'conversation_length': 'mean',\n", + " 'early_termination': 'mean',\n", + " 'pause_count': 'mean',\n", + " }\n", + " )\n", + " .round(3)\n", + " )\n", + "\n", + " fig, axes = plt.subplots(2, 2, figsize=(15, 10))\n", + " fig.suptitle('Player10 Altruism Probability Comparison', fontsize=16)\n", + "\n", + " # Plot 1: Total Score vs Altruism Probability\n", + " ax1 = axes[0, 0]\n", + " altruism_probs = altruism_groups.index\n", + " mean_scores = altruism_groups[('total_score', 'mean')]\n", + " std_scores = altruism_groups[('total_score', 'std')]\n", + " ax1.errorbar(altruism_probs, mean_scores, yerr=std_scores, marker='o', capsize=5, capthick=2)\n", + " ax1.set_xlabel('Altruism Probability')\n", + " ax1.set_ylabel('Total Score')\n", + " ax1.set_title('Total Score vs Altruism Probability')\n", + " ax1.grid(True, alpha=0.3)\n", + "\n", + " # Plot 2: Player10 Score vs Altruism Probability\n", + " ax2 = axes[0, 1]\n", + " mean_p10_scores = altruism_groups[('player10_score', 'mean')]\n", + " std_p10_scores = altruism_groups[('player10_score', 'std')]\n", + " ax2.errorbar(\n", + " altruism_probs, mean_p10_scores, yerr=std_p10_scores,\n", + " marker='s', capsize=5, capthick=2, color='orange'\n", + " )\n", + " ax2.set_xlabel('Altruism Probability')\n", + " ax2.set_ylabel('Player10 Score')\n", + " ax2.set_title('Player10 Individual Score vs Altruism Probability')\n", + " ax2.grid(True, alpha=0.3)\n", + "\n", + " # Plot 3: Conversation Length vs Altruism Probability\n", + " ax3 = axes[1, 0]\n", + " conv_lengths = altruism_groups[('conversation_length', 'mean')]\n", + " ax3.plot(altruism_probs, conv_lengths, marker='^')\n", + " ax3.set_xlabel('Altruism Probability')\n", + " ax3.set_ylabel('Average Conversation Length')\n", + " ax3.set_title('Conversation Length vs Altruism Probability')\n", + " ax3.grid(True, alpha=0.3)\n", + "\n", + " # Plot 4: Early Termination Rate vs Altruism Probability\n", + " ax4 = axes[1, 1]\n", + " early_term_rates = altruism_groups[('early_termination', 'mean')]\n", + " ax4.plot(altruism_probs, early_term_rates, marker='d')\n", + " ax4.set_xlabel('Altruism Probability')\n", + " ax4.set_ylabel('Early Termination Rate')\n", + " ax4.set_title('Early Termination Rate vs Altruism Probability')\n", + " ax4.grid(True, alpha=0.3)\n", + "\n", + " plt.tight_layout()\n", + " if save_path:\n", + " plt.savefig(save_path, dpi=300, bbox_inches='tight'); print(f'Plot saved to: {save_path}')\n", + " plt.show()\n", + "\n", + " def plot_parameter_heatmap(\n", + " self, param1: str, param2: str, metric: str = 'total_score', save_path: str | None = None\n", + " ):\n", + " \"\"\"Create a heatmap showing the interaction between two parameters.\"\"\"\n", + " if not self.results:\n", + " print('No results loaded. 
Please load results first.')\n", + " return\n", + "\n", + " df = self.create_dataframe()\n", + " pivot = df.groupby([param1, param2])[metric].mean().unstack()\n", + "\n", + " plt.figure(figsize=(10, 8))\n", + " sns.heatmap(pivot, annot=True, fmt='.2f', cmap='viridis')\n", + " plt.title(f'{metric.title()} Heatmap: {param1} vs {param2}')\n", + " plt.xlabel(param2.replace('_', ' ').title())\n", + " plt.ylabel(param1.replace('_', ' ').title())\n", + "\n", + " if save_path:\n", + " plt.savefig(save_path, dpi=300, bbox_inches='tight'); print(f'Heatmap saved to: {save_path}')\n", + " plt.show()\n", + "\n", + " def plot_score_distributions(self, save_path: str | None = None):\n", + " \"\"\"Plot score distributions for different altruism probabilities.\"\"\"\n", + " if not self.results:\n", + " print('No results loaded. Please load results first.')\n", + " return\n", + "\n", + " df = self.create_dataframe()\n", + " altruism_probs = sorted(df['altruism_prob'].unique())\n", + "\n", + " fig, axes = plt.subplots(1, 2, figsize=(15, 6))\n", + " fig.suptitle('Score Distributions by Altruism Probability', fontsize=16)\n", + "\n", + " # Plot 1: Total Score Distributions\n", + " ax1 = axes[0]\n", + " for prob in altruism_probs:\n", + " scores = df[df['altruism_prob'] == prob]['total_score']\n", + " ax1.hist(scores, alpha=0.6, label=f'Altruism: {prob:.1f}', bins=20)\n", + " ax1.set_xlabel('Total Score'); ax1.set_ylabel('Frequency')\n", + " ax1.set_title('Total Score Distributions'); ax1.legend(); ax1.grid(True, alpha=0.3)\n", + "\n", + " # Plot 2: Player10 Score Distributions\n", + " ax2 = axes[1]\n", + " for prob in altruism_probs:\n", + " scores = df[df['altruism_prob'] == prob]['player10_score']\n", + " ax2.hist(scores, alpha=0.6, label=f'Altruism: {prob:.1f}', bins=20)\n", + " ax2.set_xlabel('Player10 Score'); ax2.set_ylabel('Frequency')\n", + " ax2.set_title('Player10 Individual Score Distributions'); ax2.legend(); ax2.grid(True, alpha=0.3)\n", + "\n", + " plt.tight_layout()\n", + " if save_path:\n", + " plt.savefig(save_path, dpi=300, bbox_inches='tight'); print(f'Distributions plot saved to: {save_path}')\n", + " plt.show()\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45cbdb6f-ded2-42f7-8232-ee810b7f8a8a", + "metadata": {}, + "outputs": [], + "source": [ + "# ----- NEW PLOTS -----\n", + "\n", + " def plot_component_stack(self, save_path: str | None = None):\n", + " \"\"\"Stacked bars of shared component means vs altruism_prob.\"\"\"\n", + " df = self.create_dataframe()\n", + " keep = ['shared_importance', 'shared_coherence', 'shared_freshness', 'shared_nonmonotonousness']\n", + " have = [c for c in keep if c in df.columns]\n", + " if not have:\n", + " print('No shared component breakdown in results.')\n", + " return\n", + " g = df.groupby('altruism_prob')[have].mean().reset_index().sort_values('altruism_prob')\n", + " ax = g.set_index('altruism_prob')[have].plot(kind='bar', stacked=True, figsize=(12, 6))\n", + " ax.set_ylabel('Mean shared component score'); ax.set_xlabel('Altruism probability')\n", + " ax.set_title('Shared score component breakdown vs altruism'); ax.legend(title='Component')\n", + " ax.grid(True, axis='y', alpha=0.3); plt.tight_layout()\n", + " if save_path: plt.savefig(save_path, dpi=300, bbox_inches='tight'); print(f'Saved: {save_path}')\n", + " plt.show()\n", + "\n", + " def plot_pareto_tradeoff(self, save_path: str | None = None):\n", + " \"\"\"Scatter of mean Player10 individual vs mean total score, colored by altruism.\"\"\"\n", + " df = 
self.create_dataframe()\n", + " agg = (df.groupby(['altruism_prob', 'tau_margin', 'epsilon_fresh', 'epsilon_mono'])\n", + " .agg({'total_score': 'mean', 'player10_individual': 'mean', 'early_termination': 'mean'})\n", + " .reset_index())\n", + " plt.figure(figsize=(9, 7))\n", + " s = plt.scatter(\n", + " agg['player10_individual'], agg['total_score'],\n", + " c=agg['altruism_prob'], cmap='viridis', s=60, alpha=0.85\n", + " )\n", + " plt.colorbar(s, label='altruism_prob')\n", + " # Annotate \"risky\" configs\n", + " for _, r in agg.iterrows():\n", + " if r['early_termination'] > 0.30:\n", + " plt.annotate('ET>0.3', (r['player10_individual'], r['total_score']), fontsize=8)\n", + " plt.xlabel('Player10 individual (mean)')\n", + " plt.ylabel('Total score (mean)')\n", + " plt.title('Pareto trade-off: Player10 individual vs Total')\n", + " plt.grid(True, alpha=0.3); plt.tight_layout()\n", + " if save_path: plt.savefig(save_path, dpi=300, bbox_inches='tight'); print(f'Saved: {save_path}')\n", + " plt.show()\n", + "\n", + " def plot_rank_distribution(self, save_path: str | None = None):\n", + " \"\"\"Violin plots of Player10 rank across seeds vs altruism.\"\"\"\n", + " dfp = self.create_player_long()\n", + " if dfp.empty or 'rank' not in dfp:\n", + " print('No per-player metrics available.')\n", + " return\n", + " dfp_p10 = dfp[dfp['class_name'] == 'Player10']\n", + " if dfp_p10.empty:\n", + " print('No Player10 entries in per-player metrics.')\n", + " return\n", + " plt.figure(figsize=(10, 5))\n", + " sns.violinplot(data=dfp_p10, x='altruism_prob', y='rank', inner='quartile', cut=0)\n", + " plt.gca().invert_yaxis() # rank 1 is best\n", + " plt.title('Player10 rank distribution across seeds'); plt.xlabel('Altruism probability')\n", + " plt.ylabel('Rank (lower is better)'); plt.tight_layout()\n", + " if save_path: plt.savefig(save_path, dpi=300, bbox_inches='tight'); print(f'Saved: {save_path}')\n", + " plt.show()\n", + "\n", + " def plot_seed_stability(self, metric: str = 'total_score', save_path: str | None = None):\n", + " \"\"\"Cumulative mean vs number of simulations (sorted by seed) to show stabilization.\"\"\"\n", + " df = self.create_dataframe().sort_values('seed')\n", + " curves: list[tuple[float, np.ndarray]] = []\n", + " for p, g in df.groupby('altruism_prob'):\n", + " means = g[metric].expanding().mean().values\n", + " curves.append((p, means))\n", + "\n", + " plt.figure(figsize=(10, 6))\n", + " for p, means in curves:\n", + " plt.plot(range(1, len(means) + 1), means, label=f'p={p}')\n", + " plt.xlabel('Number of simulations (cumulative)'); plt.ylabel(f'Cumulative mean {metric}')\n", + " plt.title('Seed stability of the estimate'); plt.legend(title='altruism_prob')\n", + " plt.grid(True, alpha=0.3); plt.tight_layout()\n", + " if save_path: plt.savefig(save_path, dpi=300, bbox_inches='tight'); print(f'Saved: {save_path}')\n", + " plt.show()\n", + "\n", + " def plot_correlation_heatmap(self, save_path: str | None = None):\n", + " \"\"\"Correlation among knobs and outcomes.\"\"\"\n", + " df = self.create_dataframe()\n", + " cols = [\n", + " 'altruism_prob', 'tau_margin', 'epsilon_fresh', 'epsilon_mono',\n", + " 'importance_weight', 'coherence_weight', 'freshness_weight', 'monotony_weight',\n", + " 'total_score', 'player10_score', 'early_termination', 'pause_count',\n", + " 'unique_items_used', 'length_utilization'\n", + " ]\n", + " cols = [c for c in cols if c in df.columns]\n", + " corr = df[cols].corr(numeric_only=True)\n", + " plt.figure(figsize=(10, 8))\n", + " sns.heatmap(corr, 
annot=True, fmt='.2f', cmap='coolwarm', center=0)\n", + " plt.title('Correlation matrix: knobs vs outcomes'); plt.tight_layout()\n", + " if save_path: plt.savefig(save_path, dpi=300, bbox_inches='tight'); print(f'Saved: {save_path}')\n", + " plt.show()\n", + "\n", + " def plot_multi_heatmaps(\n", + " self,\n", + " fixed: str = 'altruism_prob',\n", + " metric: str = 'total_score',\n", + " cols: tuple[str, str] = ('tau_margin', 'epsilon_fresh'),\n", + " save_path: str | None = None\n", + " ):\n", + " \"\"\"Small-multiple heatmaps for metric by two parameters, faceted by a fixed param.\"\"\"\n", + " df = self.create_dataframe()\n", + " vals = sorted(df[fixed].unique())\n", + " n = len(vals)\n", + " fig, axes = plt.subplots(1, n, figsize=(6 * n, 5), sharey=True)\n", + " if n == 1:\n", + " axes = [axes]\n", + " for ax, v in zip(axes, vals, strict=False):\n", + " sub = df[df[fixed] == v]\n", + " if sub.empty:\n", + " ax.set_visible(False)\n", + " continue\n", + " pivot = sub.groupby(list(cols))[metric].mean().unstack()\n", + " sns.heatmap(pivot, ax=ax, annot=True, fmt='.2f', cmap='viridis')\n", + " ax.set_title(f'{metric} | {fixed}={v}')\n", + " ax.set_xlabel(cols[1]); ax.set_ylabel(cols[0])\n", + " plt.tight_layout()\n", + " if save_path: plt.savefig(save_path, dpi=300, bbox_inches='tight'); print(f'Saved: {save_path}')\n", + " plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78d49284-708b-4c7a-a0c9-b75cb05453a5", + "metadata": {}, + "outputs": [], + "source": [ + "# ---------- Modeling (optional) ----------\n", + "\n", + " def run_ols(self, metric: str = 'total_score'):\n", + " \"\"\"\n", + " OLS regression of metric on config knobs (robust SE). Requires statsmodels.\n", + " Returns the fitted model.\n", + " \"\"\"\n", + " try:\n", + " import statsmodels.api as sm\n", + " except ImportError:\n", + " print(\"statsmodels not installed. `pip install statsmodels` to use run_ols().\")\n", + " return None\n", + "\n", + " df = self.create_dataframe().dropna(subset=[metric])\n", + " X_cols = [\n", + " 'altruism_prob', 'tau_margin', 'epsilon_fresh', 'epsilon_mono',\n", + " 'importance_weight', 'coherence_weight', 'freshness_weight', 'monotony_weight'\n", + " ]\n", + " X_cols = [c for c in X_cols if c in df.columns]\n", + " X = df[X_cols].copy()\n", + " X = sm.add_constant(X)\n", + " y = df[metric]\n", + " model = sm.OLS(y, X).fit(cov_type='HC3') # robust SE\n", + " print(model.summary())\n", + " return model\n", + "\n", + " def run_logistic_early_term(self):\n", + " \"\"\"\n", + " Logistic regression predicting early termination. Requires statsmodels.\n", + " Returns the fitted model.\n", + " \"\"\"\n", + " try:\n", + " import statsmodels.api as sm\n", + " except ImportError:\n", + " print(\"statsmodels not installed. 
`pip install statsmodels` to use run_logistic_early_term().\")\n", + " return None\n", + "\n", + " df = self.create_dataframe().dropna(subset=['early_termination'])\n", + " X_cols = [\n", + " 'altruism_prob', 'tau_margin', 'epsilon_fresh', 'epsilon_mono',\n", + " 'pause_count', 'unique_items_used', 'conversation_length_cfg'\n", + " ]\n", + " X_cols = [c for c in X_cols if c in df.columns]\n", + " X = df[X_cols].copy()\n", + " X = sm.add_constant(X)\n", + " y = df['early_termination'].astype(int)\n", + " model = sm.Logit(y, X).fit(disp=False)\n", + " print(model.summary())\n", + " return model\n", + "\n", + " # ---------- Config search ----------\n", + "\n", + " def best_configs(\n", + " self,\n", + " objective: str = 'total_score',\n", + " constraints: dict[str, tuple[float | None, float | None]] | None = None,\n", + " top_k: int = 10\n", + " ) -> pd.DataFrame:\n", + " \"\"\"\n", + " Find top configs by objective subject to optional constraints.\n", + " constraints example: {'early_termination': (None, 0.2)} # <= 0.2\n", + " \"\"\"\n", + " df = self.create_dataframe()\n", + " agg = (\n", + " df.groupby(['altruism_prob', 'tau_margin', 'epsilon_fresh', 'epsilon_mono'])\n", + " .agg({\n", + " objective: 'mean',\n", + " 'early_termination': 'mean',\n", + " 'player10_individual': 'mean',\n", + " 'total_score': 'mean'\n", + " })\n", + " .reset_index()\n", + " )\n", + " if constraints:\n", + " mask = pd.Series(True, index=agg.index)\n", + " for col, (lo, hi) in constraints.items():\n", + " if lo is not None:\n", + " mask &= agg[col] >= lo\n", + " if hi is not None:\n", + " mask &= agg[col] <= hi\n", + " agg = agg[mask]\n", + " return agg.sort_values(objective, ascending=False).head(top_k)\n", + "\n", + " # ---------- Quick report ----------\n", + "\n", + " def save_quick_report(self, out_dir: str = 'report_out'):\n", + " \"\"\"Save a set of figures and a short markdown summary to a directory.\"\"\"\n", + " os.makedirs(out_dir, exist_ok=True)\n", + "\n", + " # figures\n", + " self.plot_altruism_comparison(f'{out_dir}/altruism_comparison.png')\n", + " self.plot_component_stack(f'{out_dir}/component_stack.png')\n", + " self.plot_pareto_tradeoff(f'{out_dir}/pareto.png')\n", + " self.plot_rank_distribution(f'{out_dir}/rank_violin.png')\n", + " self.plot_seed_stability(save_path=f'{out_dir}/seed_stability.png')\n", + " self.plot_correlation_heatmap(f'{out_dir}/corr.png')\n", + " self.plot_multi_heatmaps(save_path=f'{out_dir}/multi_heatmaps.png')\n", + "\n", + " # analysis text\n", + " df = self.create_dataframe()\n", + " lines = [\n", + " '# Simulation Summary',\n", + " '',\n", + " f'- Total sims: {len(df)}',\n", + " f'- Unique configs: {df.groupby([\"altruism_prob\",\"tau_margin\",\"epsilon_fresh\",\"epsilon_mono\"]).ngroups}',\n", + " f'- Overall total mean ± std: {df[\"total_score\"].mean():.2f} ± {df[\"total_score\"].std():.2f}',\n", + " f'- Early termination rate: {df[\"early_termination\"].mean():.2f}',\n", + " ]\n", + " (Path(out_dir) / 'SUMMARY.md').write_text('\\n'.join(lines), encoding='utf-8')\n", + " print(f'Report written to {out_dir}/')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25c5656e-9904-4a41-8ed3-585695394245", + "metadata": {}, + "outputs": [], + "source": [ + " # ---------- Text summary (existing, kept with minor tweaks) ----------\n", + "\n", + " def print_detailed_analysis(self):\n", + " \"\"\"Print detailed analysis of the results.\"\"\"\n", + " if not self.results:\n", + " print('No results loaded. 
Please load results first.')\n", + " return\n", + "\n", + " df = self.create_dataframe()\n", + "\n", + " print('=== DETAILED ANALYSIS ===')\n", + " print(f'Total simulations: {len(df)}')\n", + " print(\n", + " f'Unique configurations: {df.groupby([\"altruism_prob\", \"tau_margin\", \"epsilon_fresh\", \"epsilon_mono\"]).ngroups}'\n", + " )\n", + "\n", + " # Overall statistics\n", + " print('\\n=== OVERALL STATISTICS ===')\n", + " print(f'Total Score - Mean: {df[\"total_score\"].mean():.2f}, Std: {df[\"total_score\"].std():.2f}')\n", + " print(f'Player10 Score - Mean: {df[\"player10_score\"].mean():.2f}, Std: {df[\"player10_score\"].std():.2f}')\n", + " if 'player10_individual' in df:\n", + " print(\n", + " f'Player10 Individual - Mean: {df[\"player10_individual\"].mean():.2f}, '\n", + " f'Std: {df[\"player10_individual\"].std():.2f}'\n", + " )\n", + " if 'player10_rank' in df:\n", + " print(\n", + " f'Player10 Rank - Mean: {df[\"player10_rank\"].mean():.2f}, '\n", + " f'Std: {df[\"player10_rank\"].std():.2f}'\n", + " )\n", + " print(\n", + " f'Conversation Length - Mean: {df[\"conversation_length\"].mean():.1f}, '\n", + " f'Std: {df[\"conversation_length\"].std():.1f}'\n", + " )\n", + " print(f'Early Termination Rate: {df[\"early_termination\"].mean():.2f}')\n", + "\n", + " # Best configurations\n", + " print('\\n=== TOP 10 CONFIGURATIONS ===')\n", + " agg_map = {'total_score': ['mean', 'std', 'count'], 'player10_score': 'mean'}\n", + " if 'player10_rank' in df:\n", + " agg_map['player10_rank'] = 'mean'\n", + " if 'player10_individual' in df:\n", + " agg_map['player10_individual'] = 'mean'\n", + "\n", + " top_configs = (\n", + " df.groupby(['altruism_prob', 'tau_margin', 'epsilon_fresh', 'epsilon_mono'])\n", + " .agg(agg_map)\n", + " .round(3)\n", + " )\n", + "\n", + " new_columns = ['total_mean', 'total_std', 'count', 'p10_mean']\n", + " if 'player10_rank' in agg_map:\n", + " new_columns.append('p10_rank_mean')\n", + " if 'player10_individual' in agg_map:\n", + " new_columns.append('p10_individual_mean')\n", + " top_configs.columns = new_columns\n", + " top_configs = top_configs.sort_values('total_mean', ascending=False).head(10)\n", + "\n", + " for i, (config, row) in enumerate(top_configs.iterrows(), 1):\n", + " altruism, tau, fresh, mono = config\n", + " parts = [\n", + " f'{i:2d}. 
Altruism: {altruism:.1f}',\n", + " f'Tau: {tau:.2f}',\n", + " f'Fresh: {fresh:.2f}',\n", + " f'Mono: {mono:.2f}',\n", + " f'Total: {row[\"total_mean\"]:.2f}±{row[\"total_std\"]:.2f}',\n", + " f'P10: {row[\"p10_mean\"]:.2f}',\n", + " ]\n", + " if 'p10_rank_mean' in row:\n", + " parts.append(f'P10 Rank: {row[\"p10_rank_mean\"]:.2f}')\n", + " if 'p10_individual_mean' in row:\n", + " parts.append(f'P10 Individual: {row[\"p10_individual_mean\"]:.2f}')\n", + " print(' -> '.join(parts))\n", + "\n", + " # Altruism analysis\n", + " print('\\n=== ALTRUISM ANALYSIS ===')\n", + " agg_map = {\n", + " 'total_score': ['mean', 'std'],\n", + " 'player10_score': ['mean', 'std'],\n", + " 'conversation_length': 'mean',\n", + " 'early_termination': 'mean',\n", + " }\n", + " if 'player10_rank' in df:\n", + " agg_map['player10_rank'] = ['mean', 'std']\n", + " if 'player10_individual' in df:\n", + " agg_map['player10_individual'] = ['mean', 'std']\n", + "\n", + " altruism_stats = df.groupby('altruism_prob').agg(agg_map).round(3)\n", + " for prob in sorted(df['altruism_prob'].unique()):\n", + " stats = altruism_stats.loc[prob]\n", + " parts = [\n", + " f'Altruism {prob:.1f}:',\n", + " f'Total={stats[(\"total_score\", \"mean\")]:.2f}±{stats[(\"total_score\", \"std\")]:.2f}',\n", + " f'P10={stats[(\"player10_score\", \"mean\")]:.2f}±{stats[(\"player10_score\", \"std\")]:.2f}',\n", + " f'Length={stats[(\"conversation_length\", \"mean\")]:.1f}',\n", + " f'EarlyTerm={stats[(\"early_termination\", \"mean\")]:.2f}',\n", + " ]\n", + " if ('player10_rank', 'mean') in stats:\n", + " parts.append(\n", + " f'P10 Rank={stats[(\"player10_rank\", \"mean\")]:.2f}±{stats[(\"player10_rank\", \"std\")]:.2f}'\n", + " )\n", + " if ('player10_individual', 'mean') in stats:\n", + " parts.append(\n", + " f'P10 Ind={stats[(\"player10_individual\", \"mean\")]:.2f}±{stats[(\"player10_individual\", \"std\")]:.2f}'\n", + " )\n", + " print(' '.join(parts))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a34895e-f309-46f1-a96c-635e623dddbb", + "metadata": {}, + "outputs": [], + "source": [ + "# ---------- Convenience: expose bootstrap/effect sizes quickly ----------\n", + "\n", + " def print_ci_and_effects(self, metric: str = 'total_score'):\n", + " \"\"\"Print bootstrapped CIs per altruism level and pairwise effect sizes.\"\"\"\n", + " df = self.create_dataframe()\n", + " ci = self.bootstrap_ci(df, ['altruism_prob'], metric)\n", + " print('\\n=== BOOTSTRAP CI (by altruism_prob) ===')\n", + " print(ci.sort_values('altruism_prob').to_string(index=False))\n", + " deltas = self.pairwise_altruism_deltas(metric=metric)\n", + " print('\\n=== PAIRWISE DELTAS (a->b) ===')\n", + " print(deltas.to_string(index=False))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a6ef4e6-e198-4694-9545-899e533c8cb3", + "metadata": {}, + "outputs": [], + "source": [ + "# ----------------------------\n", + "# CLI\n", + "# ----------------------------\n", + "\n", + "def main():\n", + " \"\"\"Main function for command-line usage.\"\"\"\n", + " parser = argparse.ArgumentParser(description='Analyze Monte Carlo simulation results')\n", + " parser.add_argument('results_file', help='Path to results JSON file')\n", + "\n", + " parser.add_argument(\n", + " '--plot',\n", + " choices=[\n", + " 'altruism', 'heatmap', 'distributions',\n", + " 'components', 'pareto', 'rank', 'seed', 'corr', 'multi-heatmap'\n", + " ],\n", + " default='altruism',\n", + " help='Type of plot to create',\n", + " )\n", + " parser.add_argument('--param1', 
default='altruism_prob', help='Param for heatmap / multi-heatmap (rows)')\n", + " parser.add_argument('--param2', default='tau_margin', help='Param for heatmap / multi-heatmap (cols)')\n", + " parser.add_argument('--metric', default='total_score', help='Metric for heatmaps / stability')\n", + " parser.add_argument('--fixed', default='altruism_prob', help='Facet for multi-heatmap')\n", + " parser.add_argument('--save', help='Save plot to file')\n", + " parser.add_argument('--analysis', action='store_true', help='Print detailed analysis')\n", + " parser.add_argument('--ci', action='store_true', help='Print bootstrapped CIs and effect sizes')\n", + " parser.add_argument('--report', help='Save a quick report to a directory (path)')\n", + " parser.add_argument('--ols', action='store_true', help='Run OLS on total_score with knobs')\n", + " parser.add_argument('--logit', action='store_true', help='Run logistic regression for early termination')\n", + "\n", + " args = parser.parse_args()\n", + "\n", + " # Load results\n", + " analyzer = ResultsAnalyzer(args.results_file)\n", + "\n", + " # Print analysis tables\n", + " if args.analysis:\n", + " analyzer.print_detailed_analysis()\n", + " if args.ci:\n", + " analyzer.print_ci_and_effects(metric=args.metric)\n", + " if args.report:\n", + " analyzer.save_quick_report(args.report)\n", + "\n", + " # Optional modeling\n", + " if args.ols:\n", + " analyzer.run_ols(metric='total_score')\n", + " if args.logit:\n", + " analyzer.run_logistic_early_term()\n", + "\n", + " # Create plots\n", + " if args.plot == 'altruism':\n", + " analyzer.plot_altruism_comparison(args.save)\n", + " elif args.plot == 'heatmap':\n", + " analyzer.plot_parameter_heatmap(args.param1, args.param2, metric=args.metric, save_path=args.save)\n", + " elif args.plot == 'distributions':\n", + " analyzer.plot_score_distributions(args.save)\n", + " elif args.plot == 'components':\n", + " analyzer.plot_component_stack(args.save)\n", + " elif args.plot == 'pareto':\n", + " analyzer.plot_pareto_tradeoff(args.save)\n", + " elif args.plot == 'rank':\n", + " analyzer.plot_rank_distribution(args.save)\n", + " elif args.plot == 'seed':\n", + " analyzer.plot_seed_stability(metric=args.metric, save_path=args.save)\n", + " elif args.plot == 'corr':\n", + " analyzer.plot_correlation_heatmap(args.save)\n", + " elif args.plot == 'multi-heatmap':\n", + " analyzer.plot_multi_heatmaps(fixed=args.fixed, metric=args.metric, save_path=args.save)\n", + "\n", + " # Done\n", + "\n", + "\n", + "if __name__ == '__main__':\n", + " main()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98c654c5-9e33-440d-a720-89a1adbe29e1", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09df6723-3e92-4b6a-a307-29cf288dc512", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87df6905-7a17-48e0-a101-cf7744261428", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:base] *", + "language": "python", + "name": "conda-base-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/players/player_10/__init__.py b/players/player_10/__init__.py 
index 7e8a23a..001daf9 100644 --- a/players/player_10/__init__.py +++ b/players/player_10/__init__.py @@ -1,8 +1,20 @@ from .agent.player import Player10Agent # Agent-based player for comparison -from .rl.eval_player import EvalPlayer, create_eval_player # RL evaluation player -# Use the trained RL model as Player10 by default -Player10 = EvalPlayer +try: + from .rl.eval_player import EvalPlayer, create_eval_player # RL evaluation player +except Exception: # pragma: no cover - optional dependency + EvalPlayer = None + + def create_eval_player(*_args, **_kwargs): + message = ( + 'Player10 RL evaluation requires the optional torch dependency and a trained model. ' + 'Install torch and ensure models are available to use EvalPlayer.' + ) + raise RuntimeError(message) + + +# Use the original Player10Agent as Player10 by default (instead of EvalPlayer) +Player10 = Player10Agent __all__ = [ 'Player10', diff --git a/players/player_10/analysis b/players/player_10/analysis new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/players/player_10/analysis @@ -0,0 +1 @@ + diff --git a/players/player_10/tools/.syncthing.reporting.py.tmp b/players/player_10/tools/.syncthing.reporting.py.tmp new file mode 100644 index 0000000..3e870ac Binary files /dev/null and b/players/player_10/tools/.syncthing.reporting.py.tmp differ diff --git a/players/player_10/tools/dashboard/builder.py b/players/player_10/tools/dashboard/builder.py index 27410c1..083ce7b 100644 --- a/players/player_10/tools/dashboard/builder.py +++ b/players/player_10/tools/dashboard/builder.py @@ -4,6 +4,7 @@ import json import re +from collections import defaultdict from datetime import datetime from pathlib import Path @@ -34,6 +35,193 @@ def _format_number(value: float | None, digits: int = 2) -> str: return f'{value:.{digits}f}' +COLORWAY = [ + '#3867d6', + '#fa8231', + '#20bf6b', + '#a55eea', + '#fed330', + '#fc5c65', + '#2d98da', +] + +COMPONENT_LABELS = { + 'importance': 'Importance', + 'coherence': 'Coherence', + 'freshness': 'Freshness', + 'nonmonotonousness': 'Monotony relief', +} + + +def _config_value(result, attr: str): + config = getattr(result, 'config', None) + if config is None: + return None + if hasattr(config, attr): + return getattr(config, attr) + if isinstance(config, dict): + return config.get(attr) + return None + + +def _metric_value(result, metric: str): + if metric == 'total_score': + return getattr(result, 'total_score', None) + if metric == 'player10_score': + return getattr(result, 'player10_total_mean', None) + if metric == 'player10_individual': + return getattr(result, 'player10_individual_mean', None) + if metric == 'early_termination': + value = getattr(result, 'early_termination', None) + if value is None: + return None + try: + return float(value) + except (TypeError, ValueError): + return None + return getattr(result, metric, None) + + +def _compute_heatmap_data(results, row_attr: str, col_attr: str, metric: str): + matrix = defaultdict(lambda: defaultdict(list)) + rows: set = set() + cols: set = set() + for result in results: + row_value = _config_value(result, row_attr) + col_value = _config_value(result, col_attr) + metric_value = _metric_value(result, metric) + if row_value is None or col_value is None or metric_value is None: + continue + matrix[row_value][col_value].append(float(metric_value)) + rows.add(row_value) + cols.add(col_value) + if not rows or not cols: + return None + row_order = sorted(rows) + col_order = sorted(cols) + grid: list[list[float | None]] = [] + for row_value in 
row_order: + row_data: list[float | None] = [] + for col_value in col_order: + bucket = matrix.get(row_value, {}).get(col_value, []) + if bucket: + row_data.append(sum(bucket) / len(bucket)) + else: + row_data.append(None) + grid.append(row_data) + return row_order, col_order, grid + + +def _collect_scores_by_altruism(results): + buckets = defaultdict(lambda: {'total': [], 'player10': []}) + for result in results: + altruism = _config_value(result, 'altruism_prob') + if altruism is None: + continue + total_value = _metric_value(result, 'total_score') + if total_value is not None: + buckets[altruism]['total'].append(float(total_value)) + p10_value = _metric_value(result, 'player10_score') + if p10_value is not None: + buckets[altruism]['player10'].append(float(p10_value)) + if not buckets: + return None + return dict(sorted(buckets.items())) + + +def _component_means_by_altruism(results): + sums = defaultdict(lambda: defaultdict(float)) + counts = defaultdict(lambda: defaultdict(int)) + for result in results: + altruism = _config_value(result, 'altruism_prob') + breakdown = getattr(result, 'score_breakdown', None) or {} + if altruism is None: + continue + for key in COMPONENT_LABELS: + value = breakdown.get(key) + if value is None: + continue + try: + value = float(value) + except (TypeError, ValueError): + continue + sums[altruism][key] += value + counts[altruism][key] += 1 + if not sums: + return None + altruism_values = sorted(sums.keys()) + component_series: dict[str, list[float]] = {key: [] for key in COMPONENT_LABELS} + for altruism in altruism_values: + for key in COMPONENT_LABELS: + count = counts[altruism].get(key, 0) + if count: + component_series[key].append(sums[altruism][key] / count) + else: + component_series[key].append(0.0) + return altruism_values, component_series + + +def _aggregate_pareto_points(results): + groups = defaultdict( + lambda: { + 'total_sum': 0.0, + 'total_count': 0, + 'p10_sum': 0.0, + 'p10_count': 0, + 'early_sum': 0.0, + 'early_count': 0, + } + ) + for result in results: + key = ( + _config_value(result, 'altruism_prob'), + _config_value(result, 'tau_margin'), + _config_value(result, 'epsilon_fresh'), + _config_value(result, 'epsilon_mono'), + ) + if any(value is None for value in key): + continue + total_value = _metric_value(result, 'total_score') + if total_value is not None: + groups[key]['total_sum'] += float(total_value) + groups[key]['total_count'] += 1 + p10_value = _metric_value(result, 'player10_individual') + if p10_value is not None: + groups[key]['p10_sum'] += float(p10_value) + groups[key]['p10_count'] += 1 + early_value = _metric_value(result, 'early_termination') + if early_value is not None: + groups[key]['early_sum'] += float(early_value) + groups[key]['early_count'] += 1 + points: list[dict[str, float | int | None]] = [] + for key, data in groups.items(): + if not data['total_count'] or not data['p10_count']: + continue + altruism, tau, fresh, mono = key + point = { + 'altruism': altruism, + 'tau': tau, + 'fresh': fresh, + 'mono': mono, + 'total': data['total_sum'] / data['total_count'], + 'player10': data['p10_sum'] / data['p10_count'], + 'early': (data['early_sum'] / data['early_count']) if data['early_count'] else None, + 'runs': data['total_count'], + } + points.append(point) + if not points: + return None + points.sort(key=lambda item: (item['altruism'], item['tau'], item['fresh'], item['mono'])) + return points + + +def _format_axis_value(value): + if isinstance(value, float): + formatted = f'{value:.3f}' if abs(value) < 1 else 
f'{value:.2f}' + return formatted.rstrip('0').rstrip('.') + return str(value) + + def generate_dashboard( results, analysis, @@ -45,12 +233,17 @@ def generate_dashboard( try: import plotly.graph_objects as go import plotly.io as pio + + from plotly.subplots import make_subplots + except ImportError: return None output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) + analysis = analysis or {} + aggregated = summarize_parameterizations(results) table_rows: list[dict] = [] for row in aggregated: @@ -116,23 +309,48 @@ def generate_dashboard( chart_sections: list[dict[str, str]] = [] if top_rows: - labels = [parameter_label(row['meta']) for row in top_rows] - total_means = [row['mean'] for row in top_rows] - fig_top = go.Figure( - go.Bar( - x=labels, - y=total_means, - text=[f'±{row["std"]:.2f}' for row in top_rows], - textposition='outside', - marker=dict(color='#3867d6'), + fig_top = go.Figure() + rank_labels: list[str] = [] + for idx, row in enumerate(top_rows, start=1): + full_label = parameter_label(row['meta']) + mean_value = row['mean'] + std_value = row.get('std', 0.0) + rank_label = f'#{idx}' + rank_labels.append(rank_label) + fig_top.add_trace( + go.Bar( + x=[mean_value], + y=[rank_label], + orientation='h', + name=full_label, + marker=dict(color=COLORWAY[(idx - 1) % len(COLORWAY)]), + text=[f'{mean_value:.2f} ± {std_value:.2f}'], + textposition='outside', + customdata=[[full_label, std_value]], + hovertemplate=( + '%{customdata[0]}
<br>Mean: %{x:.2f}<br>
Std: %{customdata[1]:.2f}' + ), + ) ) - ) fig_top.update_layout( title='Top Parameterizations by Total Score', - xaxis_title='Parameterization label', - yaxis_title='Mean total score', + xaxis_title='Mean total score', + yaxis_title='Rank', + yaxis=dict(categoryorder='array', categoryarray=rank_labels), + margin=dict(l=0, r=20, t=60, b=40), + height=max(320, 90 * len(rank_labels)), uniformtext_minsize=10, - uniformtext_mode='show', + uniformtext_mode='hide', + legend=dict( + title='Parameterization label', + yanchor='top', + y=1.0, + xanchor='left', + x=1.02, + bgcolor='rgba(255,255,255,0.85)', + bordercolor='rgba(0,0,0,0.1)', + borderwidth=1, + ), ) chart_sections.append( { @@ -224,6 +442,201 @@ def generate_dashboard( }, ) + # Enhanced analysis sections derived from notebook utilities + heatmap_data = _compute_heatmap_data(results, 'altruism_prob', 'tau_margin', 'total_score') + if heatmap_data: + row_values, col_values, matrix = heatmap_data + y_labels = [_format_axis_value(value) for value in row_values] + x_labels = [_format_axis_value(value) for value in col_values] + fig_heatmap = go.Figure( + go.Heatmap( + z=matrix, + x=x_labels, + y=y_labels, + colorscale='Viridis', + colorbar={'title': 'Mean total score'}, + ) + ) + fig_heatmap.update_layout( + title='Total Score Heatmap', + xaxis_title='Tau margin', + yaxis_title='Altruism probability', + margin={'l': 80, 'r': 40, 't': 60, 'b': 60}, + ) + chart_sections.append( + { + 'title': 'Parameter Heatmap', + 'description': 'Average total score for each altruism/tau combination helps spot sweet spots quickly.', + 'html': pio.to_html( + fig_heatmap, + include_plotlyjs=False, + full_html=False, + config={'displaylogo': False}, + default_width='100%', + default_height='420px', + ), + }, + ) + + score_buckets = _collect_scores_by_altruism(results) + if score_buckets: + fig_dist = make_subplots(rows=1, cols=2, subplot_titles=('Total score', 'Player10 score')) + for idx, (prob, values) in enumerate(score_buckets.items()): + label = f'altruism {prob:.2f}' if isinstance(prob, float) else f'altruism {prob}' + color = COLORWAY[idx % len(COLORWAY)] + if values['total']: + fig_dist.add_trace( + go.Histogram( + x=values['total'], + name=label, + legendgroup=label, + marker={'color': color}, + opacity=0.55, + nbinsx=20, + showlegend=True, + ), + row=1, + col=1, + ) + if values['player10']: + fig_dist.add_trace( + go.Histogram( + x=values['player10'], + name=label, + legendgroup=label, + marker={'color': color}, + opacity=0.55, + nbinsx=20, + showlegend=False, + ), + row=1, + col=2, + ) + fig_dist.update_layout( + title_text='Score Distributions by Altruism', + barmode='overlay', + legend={'orientation': 'h', 'y': 1.12, 'x': 0.5, 'xanchor': 'center'}, + xaxis_title='Total score', + xaxis2_title='Player10 score', + yaxis_title='Frequency', + margin={'l': 60, 'r': 20, 't': 80, 'b': 60}, + ) + chart_sections.append( + { + 'title': 'Score Distributions', + 'description': 'Histogram overlays reveal how each altruism setting shifts total and individual score shapes.', + 'html': pio.to_html( + fig_dist, + include_plotlyjs=False, + full_html=False, + config={'displaylogo': False}, + default_width='100%', + default_height='420px', + ), + }, + ) + + component_data = _component_means_by_altruism(results) + if component_data: + altruism_values, component_series = component_data + labels = [_format_axis_value(value) for value in altruism_values] + fig_components = go.Figure() + for idx, (comp_key, comp_label) in enumerate(COMPONENT_LABELS.items()): + values = 
component_series.get(comp_key, []) + if not values: + continue + fig_components.add_trace( + go.Bar( + x=labels, + y=values, + name=comp_label, + marker={'color': COLORWAY[idx % len(COLORWAY)]}, + ) + ) + fig_components.update_layout( + title='Shared Component Breakdown', + barmode='stack', + xaxis_title='Altruism probability', + yaxis_title='Mean shared score', + legend={'orientation': 'h', 'y': 1.1, 'x': 0.5, 'xanchor': 'center'}, + margin={'l': 60, 'r': 20, 't': 80, 'b': 60}, + ) + chart_sections.append( + { + 'title': 'Shared Components', + 'description': 'Stacks quantify how shared scoring components vary with altruism levels.', + 'html': pio.to_html( + fig_components, + include_plotlyjs=False, + full_html=False, + config={'displaylogo': False}, + default_width='100%', + default_height='420px', + ), + }, + ) + + pareto_points = _aggregate_pareto_points(results) + if pareto_points: + customdata = [ + [ + _format_axis_value(point['altruism']), + _format_axis_value(point['tau']), + _format_axis_value(point['fresh']), + _format_axis_value(point['mono']), + (f'{point["early"]:.1%}' if point['early'] is not None else 'n/a'), + point['runs'], + ] + for point in pareto_points + ] + fig_pareto = go.Figure( + go.Scatter( + x=[point['player10'] for point in pareto_points], + y=[point['total'] for point in pareto_points], + mode='markers', + marker={ + 'size': 10, + 'color': [point['altruism'] for point in pareto_points], + 'colorscale': 'Viridis', + 'showscale': True, + 'colorbar': {'title': 'Altruism p'}, + }, + text=['ET>0.3' if (point['early'] or 0) > 0.3 else '' for point in pareto_points], + textposition='top center', + customdata=customdata, + hovertemplate=( + 'Player10 mean: %{x:.2f}
<br>' + 'Total mean: %{y:.2f}<br>
' + 'Altruism: %{customdata[0]}<br>
' + 'Tau margin: %{customdata[1]}<br>
' + 'Epsilon fresh: %{customdata[2]}<br>
' + 'Epsilon mono: %{customdata[3]}<br>
' + 'Early termination: %{customdata[4]}<br>
' + 'Runs: %{customdata[5]}' + ), + ) + ) + fig_pareto.update_layout( + title='Pareto: Player10 vs Total Score', + xaxis_title='Player10 individual mean', + yaxis_title='Total score mean', + margin={'l': 60, 'r': 20, 't': 60, 'b': 60}, + ) + chart_sections.append( + { + 'title': 'Pareto Trade-off', + 'description': 'Scatter highlights where individual gains align with team score, colored by altruism.', + 'html': pio.to_html( + fig_pareto, + include_plotlyjs=False, + full_html=False, + config={'displaylogo': False}, + default_width='100%', + default_height='420px', + ), + }, + ) + total_simulations = analysis.get('total_simulations', len(results)) unique_configs = analysis.get('unique_configurations', len(aggregated)) best_entry = next(iter(analysis.get('best_configurations', [])), None) diff --git a/players/player_10/tools/manual_dashboard.py b/players/player_10/tools/manual_dashboard.py new file mode 100644 index 0000000..2bd84c2 --- /dev/null +++ b/players/player_10/tools/manual_dashboard.py @@ -0,0 +1,346 @@ +"""Generate a dashboard for manual engine experiments. + +This script recreates two lightweight experiment profiles (balanced vs adversarial +supporting casts) across a small set of seeds and toggles Player10's altruism +probability. The per-run outputs are converted into the same shape that the +Plotly dashboard expects, so we can reuse generate_dashboard without relying +on the MonteCarlo simulator assets that aren't available locally. +""" + +from __future__ import annotations + +import json +import statistics as stats +import sys +from collections import Counter +from collections.abc import Sequence +from dataclasses import asdict, dataclass +from pathlib import Path +from types import SimpleNamespace +from typing import Any + +PROJECT_ROOT = Path(__file__).resolve().parents[3] +if str(PROJECT_ROOT) not in sys.path: + sys.path.append(str(PROJECT_ROOT)) + + +@dataclass +class ManualConfig: + """Minimal config stub exposing the knobs used by the dashboard helpers.""" + + altruism_prob: float + tau_margin: float + epsilon_fresh: float + epsilon_mono: float + seed: int + players: dict[str, int] + subjects: int + memory_size: int + conversation_length: int + min_samples_pid: int + ewma_alpha: float + importance_weight: float + coherence_weight: float + freshness_weight: float + monotony_weight: float + + +@dataclass +class ManualResult: + """Container that matches the attributes accessed by the dashboard builder.""" + + config: ManualConfig + total_score: float + best_total_score: float + player_scores: dict[str, float] + player_contributions: dict[str, int] + conversation_length: int + early_termination: bool + pause_count: int + unique_items_used: int + execution_time: float + score_breakdown: dict[str, float] + player_metrics: dict[str, dict[str, float | str | int | None]] + player10_total_mean: float + player10_individual_mean: float + player10_rank_mean: float + player10_gap_to_best: float + player10_instances: int + + +def _build_label_map(engine: Any) -> dict[str, str]: + """Assign stable human-readable labels to player UUIDs.""" + counts: Counter[str] = Counter() + labels: dict[str, str] = {} + + for player in engine.players: + class_name = type(player).__name__ + if class_name == 'Player10Agent': + label = 'Player10' + else: + counts[class_name] += 1 + label = f'{class_name}#{counts[class_name]}' + labels[str(player.id)] = label + + return labels + + +def _rank_players(totals: dict[str, float]) -> dict[str, float]: + """Return 1-based ranks (dense ranking) for each player 
label.""" + sorted_totals = sorted(totals.items(), key=lambda item: item[1], reverse=True) + ranks: dict[str, float] = {} + + current_rank = 1 + previous_value: float | None = None + + for index, (label, value) in enumerate(sorted_totals, start=1): + if previous_value is None or value < previous_value: + current_rank = index + previous_value = value + ranks[label] = float(current_rank) + + return ranks + + +def _build_manual_result( + engine: Any, + seed: int, + altruism: float, + roster: Sequence[type], + subjects: int, + memory_size: int, + conversation_length: int, +) -> ManualResult: + """Run the engine once and transform the output into a dashboard result.""" + from players.player_10.agent import config as p10_config + + output = engine.run(list(roster)) + history = output['history'] + scores = output['scores'] + + label_map = _build_label_map(engine) + + player_scores_dict: dict[str, float] = {} + player_metrics: dict[str, dict[str, float | str | int | None]] = {} + + totals_for_ranking: dict[str, float] = {} + + for entry in scores['player_scores']: + label = label_map[str(entry['id'])] + total = float(entry['scores']['total']) + individual = float(entry['scores']['individual']) + shared = float(entry['scores']['shared']) + + player_scores_dict[label] = total + player_metrics[label] = { + 'class_name': label.split('#')[0], + 'alias': label, + 'total': total, + 'individual': individual, + 'shared': shared, + 'rank': None, # filled in after ranking + } + totals_for_ranking[label] = total + + ranks = _rank_players(totals_for_ranking) + for label, rank in ranks.items(): + player_metrics[label]['rank'] = rank + + player10_total = player_scores_dict['Player10'] + best_total = max(player_scores_dict.values()) + + player_contributions_counts = { + label_map[str(uid)]: len(items) for uid, items in engine.player_contributions.items() + } + + unique_items = {item.id for item in history if item is not None} + pause_count = sum(1 for item in history if item is None) + + config = ManualConfig( + altruism_prob=altruism, + tau_margin=p10_config.TAU_MARGIN, + epsilon_fresh=p10_config.EPSILON_FRESH, + epsilon_mono=p10_config.EPSILON_MONO, + seed=seed, + players=dict(Counter(type(player).__name__ for player in engine.players)), + subjects=subjects, + memory_size=memory_size, + conversation_length=conversation_length, + min_samples_pid=p10_config.MIN_SAMPLES_PID, + ewma_alpha=p10_config.EWMA_ALPHA, + importance_weight=p10_config.IMPORTANCE_WEIGHT, + coherence_weight=p10_config.COHERENCE_WEIGHT, + freshness_weight=p10_config.FRESHNESS_WEIGHT, + monotony_weight=p10_config.MONOTONY_WEIGHT, + ) + + return ManualResult( + config=config, + total_score=float(output['score_breakdown']['total']), + best_total_score=best_total, + player_scores=player_scores_dict, + player_contributions=player_contributions_counts, + conversation_length=len(history), + early_termination=len(history) < conversation_length, + pause_count=pause_count, + unique_items_used=len(unique_items), + execution_time=0.0, + score_breakdown={k: float(v) for k, v in output['score_breakdown'].items()}, + player_metrics=player_metrics, + player10_total_mean=player10_total, + player10_individual_mean=float( + next( + entry['scores']['individual'] + for entry in scores['player_scores'] + if label_map[str(entry['id'])] == 'Player10' + ) + ), + player10_rank_mean=ranks['Player10'], + player10_gap_to_best=best_total - player10_total, + player10_instances=1, + ) + + +def run_manual_experiments() -> tuple[list[ManualResult], dict[str, dict[str, float]]]: 
+ """Return all per-run results plus an aggregate summary per configuration.""" + from core.engine import Engine + from players.pause_player import PausePlayer + from players.player_10.agent import config as p10_config + from players.player_10.agent.player import Player10Agent + from players.random_pause_player import RandomPausePlayer + from players.random_player import RandomPlayer + + subjects = 10 + memory_size = 16 + conversation_length = 40 + seeds = list(range(100, 116)) + + rosters: dict[str, Sequence[type]] = { + 'Balanced support (3 Random)': [Player10Agent, RandomPlayer, RandomPlayer, RandomPlayer], + 'Adversarial mix (Random, RandomPause, Pause)': [ + Player10Agent, + RandomPlayer, + RandomPausePlayer, + PausePlayer, + ], + } + + results: list[ManualResult] = [] + aggregates: dict[str, list[float]] = {} + + original_altruism = p10_config.ALTRUISM_USE_PROB + + for roster_name, roster in rosters.items(): + for altruism_value in (0.0, 0.6): + p10_config.ALTRUISM_USE_PROB = altruism_value + + key = f'{roster_name} | altruism={altruism_value:.1f}' + aggregates[key] = [] + + for seed in seeds: + engine = Engine( + players=list(roster), + player_count=len(roster), + subjects=subjects, + memory_size=memory_size, + conversation_length=conversation_length, + seed=seed, + ) + result = _build_manual_result( + engine, + seed=seed, + altruism=altruism_value, + roster=roster, + subjects=subjects, + memory_size=memory_size, + conversation_length=conversation_length, + ) + results.append(result) + aggregates[key].append(result.total_score) + + # Restore the original altruism probability so we do not affect other tooling + p10_config.ALTRUISM_USE_PROB = original_altruism + + aggregate_summary = { + key: { + 'mean': stats.mean(values), + 'std': stats.pstdev(values) if len(values) > 1 else 0.0, + } + for key, values in aggregates.items() + } + + output_payload = [ + { + 'config': asdict(result.config), + 'total_score': result.total_score, + 'best_total_score': result.best_total_score, + 'player_scores': result.player_scores, + 'player_contributions': result.player_contributions, + 'conversation_length': result.conversation_length, + 'early_termination': result.early_termination, + 'pause_count': result.pause_count, + 'unique_items_used': result.unique_items_used, + 'execution_time': result.execution_time, + 'score_breakdown': result.score_breakdown, + 'player_metrics': result.player_metrics, + 'player10_total_mean': result.player10_total_mean, + 'player10_individual_mean': result.player10_individual_mean, + 'player10_rank_mean': result.player10_rank_mean, + 'player10_gap_to_best': result.player10_gap_to_best, + 'player10_instances': result.player10_instances, + 'altruism_prob': result.config.altruism_prob, + 'seed': result.config.seed, + 'players': result.config.players, + } + for result in results + ] + + output_path_json = Path('players/player_10/results/manual_dashboard_runs.json') + output_path_json.write_text(json.dumps(output_payload, indent=2)) + print(f'Detailed run data written to {output_path_json}') + + return results, aggregate_summary + + +def main(open_browser: bool = False) -> None: + from players.player_10.tools.dashboard import generate_dashboard + + results, summary = run_manual_experiments() + + analysis = { + 'total_simulations': len(results), + 'unique_configurations': len(summary), + 'best_configurations': [ + { + 'label': label, + 'mean_score': stats['mean'], + 'std_score': stats['std'], + } + for label, stats in sorted( + summary.items(), key=lambda item: item[1]['mean'], 
reverse=True + ) + ], + } + + dashboard_config = SimpleNamespace( + name='Manual Engine Experiments', + description='Player10 altruism sensitivity across two roster archetypes.', + output_dir='players/player_10/results', + ) + + output_path = generate_dashboard( + results, + analysis, + dashboard_config, + output_dir='players/player_10/results/dashboards', + open_browser=open_browser, + ) + + if output_path: + print(f'Dashboard written to: {output_path}') + else: + print('Plotly is not installed; dashboard generation skipped.') + + +if __name__ == '__main__': + main(open_browser=False) diff --git a/players/player_10/tools/reporting.py b/players/player_10/tools/reporting.py index 441f266..2a8a023 100644 --- a/players/player_10/tools/reporting.py +++ b/players/player_10/tools/reporting.py @@ -10,16 +10,44 @@ from collections import defaultdict from typing import Any -from ..sim.test_framework import ParameterRange, TestConfiguration +try: + from ..sim.test_framework import ( + ParameterRange, + TestConfiguration, + ) +except ModuleNotFoundError: + ParameterRange = None # type: ignore + TestConfiguration = None # type: ignore + _BASELINE_CONFIG = None +else: + _BASELINE_CONFIG = TestConfiguration(name='baseline_snapshot') -_BASELINE_CONFIG = TestConfiguration(name='baseline_snapshot') - -def _first(range_field: ParameterRange) -> Any: - return range_field.values[0] if range_field.values else None +def _first(range_field) -> Any: + if range_field is None: + return None + return range_field.values[0] if getattr(range_field, 'values', None) else None def _capture_baseline_meta() -> dict[str, Any]: + if _BASELINE_CONFIG is None: + return { + 'altruism_prob': 0.0, + 'tau_margin': 0.0, + 'epsilon_fresh': 0.0, + 'epsilon_mono': 0.0, + 'min_samples_pid': 5, + 'ewma_alpha': 0.0, + 'importance_weight': 1.0, + 'coherence_weight': 1.0, + 'freshness_weight': 1.0, + 'monotony_weight': 1.0, + 'conversation_length': 0, + 'subjects': 0, + 'memory_size': 0, + 'players': {}, + } + players = dict(_BASELINE_CONFIG.player_configs[0]) if _BASELINE_CONFIG.player_configs else {} return { 'altruism_prob': _first(_BASELINE_CONFIG.altruism_probs), diff --git a/pyproject.toml b/pyproject.toml index 9c5bf04..15e3d0b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,8 +8,8 @@ dependencies = [ "pygame>=2.6.1", "openai", "ruff>=0.12.8", - "numpy>=2.3.3", - "torch>=2.8.0", + "numpy", + "torch", ] [tool.ruff] diff --git a/uv.lock b/uv.lock index 23a48b9..7efbbe4 100644 --- a/uv.lock +++ b/uv.lock @@ -61,11 +61,13 @@ dev = [ [package.metadata] requires-dist = [ - { name = "numpy", specifier = ">=2.3.3" }, + + { name = "numpy" }, { name = "openai" }, { name = "pygame", specifier = ">=2.6.1" }, { name = "ruff", specifier = ">=0.12.8" }, - { name = "torch", specifier = ">=2.8.0" }, + { name = "torch" }, + ] [package.metadata.requires-dev]
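
A minimal usage sketch of the ResultsAnalyzer workflow this diff adds. The import path and results file name below are illustrative assumptions, not part of the patch: it presumes the notebook cells are exported to an importable module and that a results JSON written by MonteCarloSimulator.save_results exists on disk.

# Hypothetical module path; in the PR the class lives in Analyse_results.ipynb.
from players.player_10.analyse_results import ResultsAnalyzer

# Illustrative results file produced by MonteCarloSimulator.save_results.
analyzer = ResultsAnalyzer('simulation_results/mc_sweep.json')

# Text summaries: detailed tables, bootstrapped CIs, and pairwise effect sizes.
analyzer.print_detailed_analysis()
analyzer.print_ci_and_effects(metric='total_score')

# Top 5 configs by mean total score, keeping early termination at or below 0.2
# (this constraint form mirrors the best_configs docstring example).
top = analyzer.best_configs(
    objective='total_score',
    constraints={'early_termination': (None, 0.2)},
    top_k=5,
)
print(top)

# Write all figures plus SUMMARY.md into report_out/.
analyzer.save_quick_report('report_out')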