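Import compute_metrics from utils.metrics in 01_train_teacher.py, add scratch tokenization notebook, ignore .DS_Store files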

paramkpr · paramkpr · commit 3512b6f2f299 · 2025-04-22T15:40:42.000-07:00
diff --git a/.DS_Store b/.DS_Store
diff --git a/.gitignore b/.gitignore
@@ -71,4 +71,9 @@ logs/
 *.log 
 
 wandb/
-outputs/
+outputs/
+
+*.DS_Store
+.DS_Store
+.DS_Store
+.DS_Store?
diff --git a/notebooks/test.ipynb b/notebooks/test.ipynb
@@ -0,0 +1,141 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ae8b72691bc94e5b8406681bb5b11d59",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map (num_proc=4):   0%|          | 0/63981 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f04a9293b4a14796aa1050f40b8f0135",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map (num_proc=4):   0%|          | 0/872 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0e7c45fd8b974bc1ac58f95e05d69cb6",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map (num_proc=4):   0%|          | 0/3368 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9968f3bb38ad4d26b78e6d1cc7ae295c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map (num_proc=4):   0%|          | 0/1821 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['idx', 'labels', 'text', 'input_ids', 'attention_mask'],\n",
+       "    num_rows: 63981\n",
+       "})"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from datasets import load_from_disk, Dataset\n",
+    "from transformers import GPT2Tokenizer\n",
+    "\n",
+    "dataset = load_from_disk(\"data/clean\")\n",
+    "\n",
+    "tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")\n",
+    "tokenizer.pad_token = tokenizer.eos_token\n",
+    "\n",
+    "def tokenize_function(examples):\n",
+    "    return tokenizer(examples[\"text\"], padding=True, truncation=True, max_length=32)\n",
+    "\n",
+    "tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=4)\n",
+    "\n",
+    "tokenized_datasets[\"train\"]\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "AttributeError",
+     "evalue": "'list' object has no attribute 'schema'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+      "\u001b[31mAttributeError\u001b[39m                            Traceback (most recent call last)",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[9]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m Dataset(\u001b[43mTable\u001b[49m\u001b[43m(\u001b[49m\u001b[43md\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtrain\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtext\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m)\n",
+      "\u001b[36mFile \u001b[39m\u001b[32m~/projects/SentiSynth/venv311/lib/python3.11/site-packages/datasets/table.py:167\u001b[39m, in \u001b[36mTable.__init__\u001b[39m\u001b[34m(self, table)\u001b[39m\n\u001b[32m    166\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, table: pa.Table):\n\u001b[32m--> \u001b[39m\u001b[32m167\u001b[39m     \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[34;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtable\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    168\u001b[39m     \u001b[38;5;28mself\u001b[39m.table = table\n",
+      "\u001b[36mFile \u001b[39m\u001b[32m~/projects/SentiSynth/venv311/lib/python3.11/site-packages/datasets/table.py:107\u001b[39m, in \u001b[36mIndexedTableMixin.__init__\u001b[39m\u001b[34m(self, table)\u001b[39m\n\u001b[32m    106\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, table: pa.Table):\n\u001b[32m--> \u001b[39m\u001b[32m107\u001b[39m     \u001b[38;5;28mself\u001b[39m._schema: pa.Schema = \u001b[43mtable\u001b[49m\u001b[43m.\u001b[49m\u001b[43mschema\u001b[49m\n\u001b[32m    108\u001b[39m     \u001b[38;5;28mself\u001b[39m._batches: \u001b[38;5;28mlist\u001b[39m[pa.RecordBatch] = [\n\u001b[32m    109\u001b[39m         recordbatch \u001b[38;5;28;01mfor\u001b[39;00m recordbatch \u001b[38;5;129;01min\u001b[39;00m table.to_batches() \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(recordbatch) > \u001b[32m0\u001b[39m\n\u001b[32m    110\u001b[39m     ]\n\u001b[32m    111\u001b[39m     \u001b[38;5;28mself\u001b[39m._offsets: np.ndarray = np.cumsum([\u001b[32m0\u001b[39m] + [\u001b[38;5;28mlen\u001b[39m(b) \u001b[38;5;28;01mfor\u001b[39;00m b \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m._batches], dtype=np.int64)\n",
+      "\u001b[31mAttributeError\u001b[39m: 'list' object has no attribute 'schema'"
+     ]
+    }
+   ],
+   "source": [
+    "Dataset(Table(d[\"train\"][\"text\"]))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python3.11 (sentisynth)",
+   "language": "python",
+   "name": "auctionn"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/src/cli/01_train_teacher.py b/src/cli/01_train_teacher.py
@@ -4,34 +4,21 @@
 import logging
 from pathlib import Path
 
-import numpy as np
-from sklearn.metrics import precision_recall_fscore_support, accuracy_score
 import torch
 from transformers import DataCollatorWithPadding, IntervalStrategy, TrainingArguments, Trainer
 
 from src.models import build_teacher
 from src.data import ClassificationDataModule
 from utils.wandb_setup import setup_wandb
-
+from utils.metrics import compute_metrics
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 logger = logging.getLogger(__name__)
 
 app = typer.Typer()
 
 
-def compute_metrics(p):
-    """Computes metrics for HF Trainer."""
-    preds = np.argmax(p.predictions, axis=1)
-    labels = p.label_ids
-    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary') # Assuming binary
-    acc = accuracy_score(labels, preds)
-    return {
-        'accuracy': acc,
-        'f1': f1,
-        'precision': precision,
-        'recall': recall
-    }
+
 
 
 @app.command()