paramkpr
diff --git a/‎.flake8‎
Lines changed: 5 additions & 0 deletions b/‎.flake8‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 0 deletions b/‎.gitignore‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎configs/deberta_large_sst2.yaml‎
Lines changed: 19 additions & 0 deletions b/‎configs/deberta_large_sst2.yaml‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎notebooks/01_data_exploration.ipynb‎ ‎data/clean/.gitkeep‎notebooks/01_data_exploration.ipynb renamed to data/clean/.gitkeep b/‎notebooks/01_data_exploration.ipynb‎ ‎data/clean/.gitkeep‎notebooks/01_data_exploration.ipynb renamed to data/clean/.gitkeep
diff --git a/‎main.py‎
Lines changed: 49 additions & 0 deletions b/‎main.py‎
Lines changed: 49 additions & 0 deletions
diff --git a/‎notebooks/eda_sst_2.ipynb‎
Lines changed: 494 additions & 0 deletions b/‎notebooks/eda_sst_2.ipynb‎
Lines changed: 494 additions & 0 deletions
diff --git a/‎requirements.txt‎
Lines changed: 14 additions & 1 deletion b/‎requirements.txt‎
Lines changed: 14 additions & 1 deletion
diff --git a/‎scripts/download_dataset.py‎ b/‎scripts/download_dataset.py‎
diff --git a/‎scripts/train_model.py‎ b/‎scripts/train_model.py‎
diff --git a/‎sentisynth/__init__.py‎ b/‎sentisynth/__init__.py‎
@@ -0,0 +1,5 @@
+[flake8]
+max-line-length = 120
+# E203: whitespace before ':' (in conflict with black)
+# W503: line break before binary operator
+ignore = E203, W503
@@ -51,6 +51,8 @@ data/raw/*
 data/processed/*
 data/models/*
 data/results/*
+data/clean/*
+!data/clean/.gitkeep
 !data/raw/.gitkeep
 !data/processed/.gitkeep
 !data/models/.gitkeep
 
@@ -0,0 +1,19 @@
+model_name: microsoft/deberta-v3-large
+dataset_path: data/sst2_dd
+train_split: train
+eval_split: val
+sanity_split: sanity
+max_len: 128
+per_device_train_batch_size: 8
+per_device_eval_batch_size: 32
+gradient_accumulation_steps: 4
+num_train_epochs: 3
+learning_rate: 2.0e-5  # keep the decimal point: PyYAML (YAML 1.1) loads a bare "2e-5" as a string
+warmup_ratio: 0.06
+fp16: true
+logging_steps: 50
+eval_steps: 200
+save_steps: 200
+output_dir: outputs/teacher
+report_to: wandb
+project_name: sst2_teacher
@@ -0,0 +1,49 @@
+import argparse
+import sys
+import os
+
+# Ensure the src directory is in the Python path
+sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))
+
+
+try:
+    from data.clean import run_cleaning_and_split
+except ImportError as e:
+    print(f"Error importing modules: {e}")
+    print("Please ensure your scripts are correctly placed in the 'src' directory and paths are correct.")
+    sys.exit(1)
+
+def main():
+    parser = argparse.ArgumentParser(description="SentiSynth Project Main Entry Point")
+    subparsers = parser.add_subparsers(dest='command', help='Available commands')
+
+    # --- Clean and Split Command ---
+    parser_process = subparsers.add_parser('process_data', help='Clean raw data and create final train/val/sanity splits')
+    parser_process.add_argument('--raw-path', default='./data/raw', help='Path to the raw dataset directory')
+    parser_process.add_argument('--output-path', default='./data/sst2_dd', help='Path to save the final DatasetDict')
+    parser_process.set_defaults(func=lambda args: run_cleaning_and_split(args.raw_path, args.output_path))
+
+    # --- Add other commands here as subparsers ---
+    # Example: Download command
+    # parser_download = subparsers.add_parser('download', help='Download the raw dataset')
+    # parser_download.add_argument('--save-path', default='./data/raw', help='Path to save the raw dataset')
+    # parser_download.set_defaults(func=lambda args: run_download(args.save_path)) # Assuming you create run_download
+
+    # Example: Train command
+    # parser_train = subparsers.add_parser('train', help='Train a model')
+    # parser_train.add_argument('--config', required=True, help='Path to the training configuration file')
+    # ... other training args ...
+    # parser_train.set_defaults(func=lambda args: run_training(args)) # Assuming you create run_training
+
+    # Parse arguments
+    args = parser.parse_args()
+
+    # Execute the function associated with the chosen command
+    if hasattr(args, 'func'):
+        args.func(args)
+    else:
+        # If no command is given, print help
+        parser.print_help()
+
+if __name__ == "__main__":
+    main()
@@ -6,4 +6,17 @@ scikit-learn>=0.24.2
 pandas>=1.3.0
 matplotlib>=3.4.2
 seaborn>=0.11.1
-tqdm>=4.61.2 
+tqdm>=4.61.2
+datasets>=3.5.0
+
+accelerate==1.6.0
+
+# Configuration and Metrics
+PyYAML>=6.0.2
+scikit-learn>=1.6.1
+
+# Logging
+wandb>=0.19.9
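+
+# Build / tokenizer dependencies (presumably for the DeBERTa-v3 SentencePiece tokenizer; confirm)
+Cmake
+sentencepiece
+protobuf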