Commit a5b777e

Merge pull request #240 from Modalities/embedding_annotation_pipeline_at_scale
2 parents 6a43648 + 71024b3, commit a5b777e

21 files changed: 2540 additions & 24 deletions

README.md

Lines changed: 25 additions & 4 deletions
@@ -18,12 +18,16 @@ We use this repository to filter out low-quality documents from the Common Crawl
 3. The classifier is used to filter out low-quality documents from the entire CC dataset. The filtered dataset is then used to train the model(s).

+## Documentation Map
+- [Pipelines: Embedding & Annotation](documentation/pipelines.md) – generate embeddings and run annotation heads at scale.
+- [Aggregation](documentation/aggregation.md) – how scores are combined (mean, max, min, majority, etc.).
+- [Data Format](documentation/data_format.md) – expected JSONL schema & label structure.
+- [Evaluation](documentation/evaluation.md) – metrics and evaluation utilities.

 ## Installation and Development

 Please see [CONTRIBUTING.md](CONTRIBUTING.md)
-
 ## Usage
 Once you have [set up the TGI container](#setting-up-the-tgi-container-with-hugging-face-models), you can proceed to score the documents and train the classifier:
@@ -32,12 +36,28 @@ Once you have [set up the TGI container](#setting-up-the-tgi-container-with-hugg
 python cli.py score_documents --config_file_path path/to/your/config.yaml
 ```

-### 2. How to Train a Classifier
-If you already have the score, you can train a classifier by running
+### 2. Create Embeddings at Scale
+Generate HDF5 embedding files from raw JSONL (see `documentation/pipelines.md` for the full schema):
+```bash
+python cli.py run_embedding_pipeline --config_file_path configs/embedding_job.yaml
+```
+Outputs: one `.h5` file per input file (embeddings + optional labels) under the configured embedding directory.
+
+### 3. How to Train a Classifier
+If you already have scores (e.g. LLM annotations), you can train a classifier by running
 ```script
 python cli.py train_classifier --config_file_path path/to/your/training_config.yaml
 ```
-### 3. Measure Interrater Reliability
+The trained model (and tokenizer) are saved under the `final` subdirectory of the configured output directory.
+
+### 4. Run Annotation Heads on Embeddings
+Apply one or more trained regression / classification heads to previously generated embeddings:
+```bash
+python cli.py run_annotation_pipeline --config_file_path configs/annotation_job.yaml
+```
+Outputs: `${source_filename}.jsonl` files with predicted scores in `annotated_data/`.
+
+### 5. Measure Interrater Reliability
 If you have a dataset with scores annotated by multiple annotators, you can compute interrater-reliability metrics with the `interrater_reliability` command. To compare the scores within a single file (e.g. the human-annotated ground-truth data), run:
 ```script
 python cli.py interrater_reliability data_annotated.jsonl --output_file_path output.json
@@ -50,6 +70,7 @@ You can create plots for the distribution of annotations and the differences bet
 ```script
 python cli.py plot_scores data_annotated_by_model_1.jsonl data_annotated_by_model_2.jsonl --aggregation majority --output_dir outputs
 ```
+
 ## TGI

 This service relies on **TGI containers** (Text Generation Inference), which can be downloaded from [Hugging Face](https://huggingface.co). Follow the steps below to download and run the TGI container.
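The annotation pipeline above emits one JSONL object per document, keyed by the configured `output_keys` (e.g. `document_id` plus one `score_<head>` field per head). A minimal sketch of consuming such output; the record shape and the `score_` prefix are assumptions based on the configs in this commit, not code from the repository:

```python
import json

# Hypothetical sample lines mirroring the assumed annotation-output schema.
sample = [
    '{"document_id": "doc-0001", "score_Gemma_Snowflake": 2.7, "score_Llama_Snowflake": 3.1}',
    '{"document_id": "doc-0002", "score_Gemma_Snowflake": 0.4, "score_Llama_Snowflake": 0.9}',
]

def read_scores(lines, score_prefix="score_"):
    """Parse annotation JSONL lines into (document_id, {head_name: score}) pairs."""
    for line in lines:
        record = json.loads(line)
        scores = {
            key[len(score_prefix):]: value
            for key, value in record.items()
            if key.startswith(score_prefix)
        }
        yield record["document_id"], scores

for doc_id, scores in read_scores(sample):
    print(doc_id, scores)
```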
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
params:
2+
embeddings_directory: /raid/s3/opengptx/jude/repos/ml_filter/data/throughput_analysis/output/validation_embeddings
3+
output_dir: /raid/s3/opengptx/jude/repos/ml_filter/data/throughput_analysis/output/annotations
4+
5+
regression_head_checkpoints:
6+
Gemma_Snowflake: /raid/s3/opengptx/jude/repos/ml_filter/embedding_ablations/training/final #/raid/s3/opengptx/jude/repos/ml_filter/hessanAI/checkpoints/checkpoints/edu-gemma-snowflake-balanced.ckpt
7+
Llama_Snowflake: /raid/s3/opengptx/jude/repos/ml_filter/embedding_ablations/training/final #/raid/s3/opengptx/jude/repos/ml_filter/hessanAI/checkpoints/checkpoints/edu-llama-snowflake-balanced.ckpt
8+
Mistral_Snowflake: /raid/s3/opengptx/jude/repos/ml_filter/embedding_ablations/training/final #/raid/s3/opengptx/jude/repos/ml_filter/hessanAI/checkpoints/checkpoints/edu-mistral-snowflake-balanced.ckpt
9+
batch_size: 1000
10+
hdf5_dataset_name: train
11+
output_keys: ["document_id", "score_Gemma_Snowflake", "score_Llama_Snowflake", "score_Mistral_Snowflake"]
12+
model_dtype: bfloat16
13+
embedding_dtype: bfloat16
14+
label_dtype: bfloat16
15+
compression: gzip
16+
running_on_slurm: false
17+
18+
local_settings:
19+
tasks: 1
20+
workers: 1
21+
local_tasks: 1
22+
local_rank_offset: 0
23+
logging_dir: ${params.output_dir}/logs
24+
25+
slurm_settings: null
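The config above toggles between local and Slurm execution via `running_on_slurm`, with exactly one of `local_settings` / `slurm_settings` non-null. A minimal sketch of that selection logic, operating on a plain dict standing in for the parsed YAML (an illustrative assumption, not code from the repository):

```python
def select_settings(cfg: dict) -> dict:
    """Return the active settings block based on the `running_on_slurm` flag.

    Raises if the flag points at a block that the config left as null.
    Key names follow the config above; the check itself is illustrative.
    """
    if cfg["running_on_slurm"]:
        settings = cfg.get("slurm_settings")
        if settings is None:
            raise ValueError("running_on_slurm is true but slurm_settings is null")
    else:
        settings = cfg.get("local_settings")
        if settings is None:
            raise ValueError("running_on_slurm is false but local_settings is null")
    return settings

local_cfg = {
    "running_on_slurm": False,
    "local_settings": {"tasks": 1, "workers": 1},
    "slurm_settings": None,
}
print(select_settings(local_cfg))  # -> {'tasks': 1, 'workers': 1}
```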
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
params:
2+
embeddings_directory: /raid/s3/opengptx/jude/repos/ml_filter/data/input_dir_hierarchy
3+
output_dir: /raid/s3/opengptx/jude/repos/ml_filter/data/embedding_output_dir/annotations_new/
4+
5+
regression_head_checkpoints:
6+
Gemma_Snowflake: /raid/s3/opengptx/jude/repos/ml_filter/hessanAI/checkpoints/checkpoints/edu-gemma-snowflake-balanced.ckpt
7+
Llama_Snowflake: /raid/s3/opengptx/jude/repos/ml_filter/hessanAI/checkpoints/checkpoints/edu-llama-snowflake-balanced.ckpt
8+
batch_size: 1000
9+
10+
running_on_slurm: true
11+
12+
local_settings: null
13+
14+
slurm_settings:
15+
sbatch_args:
16+
account: "p_gptx"
17+
nodes: 1
18+
ntasks: 1
19+
gres: gpu:1
20+
partition: "capella"
21+
time: "04:00:00"
22+
cpus_per_task: 8
23+
mem_per_cpu_gb: 2
24+
gpus_per_task: 1
25+
job_name: "MMbert_embedder"
26+
output: /data/cat/ws/alju972f-regression_heads/dataset/mmbet_embeddings/mmber_logs/%j.out
27+
error: /data/cat/ws/alju972f-regression_heads/dataset/mmbet_embeddings/mmber_logs/%j.err
28+
qos: "normal"
29+
venv_path: /data/cat/ws/alju972f-regression_heads/envs/env_regression_heads/bin/activate
30+
tasks: 10
31+
workers: 1001
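The `sbatch_args` mapping above presumably gets rendered into `sbatch` command-line flags. A sketch of one plausible rendering; the underscore-to-dash mapping and the `mem_per_cpu_gb` → `--mem-per-cpu=<n>G` handling are assumptions for illustration, not the repository's actual launcher:

```python
def sbatch_flags(sbatch_args: dict) -> list[str]:
    """Render an sbatch_args mapping into sbatch command-line flags.

    Assumed convention: YAML keys with underscores map to dashed long
    options (nodes -> --nodes, job_name -> --job-name); mem_per_cpu_gb is
    special-cased to Slurm's --mem-per-cpu with a G suffix.
    """
    flags = []
    for key, value in sbatch_args.items():
        if key == "mem_per_cpu_gb":
            flags.append(f"--mem-per-cpu={value}G")
        else:
            flags.append(f"--{key.replace('_', '-')}={value}")
    return flags

print(sbatch_flags({"nodes": 1, "gres": "gpu:1", "mem_per_cpu_gb": 2, "time": "04:00:00"}))
# -> ['--nodes=1', '--gres=gpu:1', '--mem-per-cpu=2G', '--time=04:00:00']
```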
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
dataset_name: validation
2+
3+
params:
4+
# File selection
5+
glob_pattern: "**/*.jsonl"
6+
input_dir: /raid/s3/opengptx/jude/repos/ml_filter/data/throughput_analysis/input #/raid/s3/opengptx/abbas/processed_data_natural/${dataset_name}_set
7+
8+
# Output
9+
output_dir: /raid/s3/opengptx/jude/repos/ml_filter/data/throughput_analysis/output
10+
text_field: text
11+
keys_to_index: ["id", "aggregation_type"]
12+
embedding_dir: ${dataset_name}_embeddings
13+
compression: gzip
14+
15+
# Precision
16+
embedding_dtype: float32
17+
label_dtype: int8
18+
model_dtype: bfloat16
19+
20+
# Model and embedding parameters
21+
embedding_model: jhu-clsp/mmBERT-base
22+
batch_size: 128
23+
writer_batch_size: 1000
24+
hdf5_dataset_name: train
25+
save_labels: false
26+
max_length: 8192
27+
padding: true
28+
truncation: true
29+
30+
running_on_slurm: false
31+
32+
local_settings:
33+
tasks: 2
34+
workers: 2
35+
local_tasks: 2
36+
local_rank_offset: 0
37+
38+
slurm_settings: null
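Values like `embedding_dir: ${dataset_name}_embeddings` and `${params.output_dir}/logs` rely on OmegaConf-style variable interpolation. A minimal stdlib stand-in for that resolution, to make the mechanism concrete (the repository presumably uses a full-featured config library instead):

```python
import re

def resolve(value: str, root: dict) -> str:
    """Resolve ${dotted.path} references in a string against a config dict.

    A toy version of OmegaConf-style interpolation: each ${...} is looked
    up as a dot-separated path from the config root and substituted in.
    """
    def lookup(match: re.Match) -> str:
        node = root
        for part in match.group(1).split("."):
            node = node[part]
        return str(node)
    return re.sub(r"\$\{([^}]+)\}", lookup, value)

cfg = {"dataset_name": "validation", "params": {"output_dir": "/data/out"}}
print(resolve("${dataset_name}_embeddings", cfg))  # -> validation_embeddings
print(resolve("${params.output_dir}/logs", cfg))   # -> /data/out/logs
```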
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
dataset_name: training
2+
3+
params:
4+
# File selection
5+
glob_pattern: "**/*.jsonl"
6+
input_dir: /data/cat/ws/alju972f-regression_heads/repos/data/embedding_creation/input #/data/cat/ws/alju972f-regression_heads/dataset/Regression_head/abbas/processed_data_natural/${dataset_name}_set
7+
8+
# Output
9+
output_dir: /data/cat/ws/alju972f-regression_heads/repos/data/embedding_creation/output
10+
keys_to_index: ["id", "aggregation_type"]
11+
text_field: text
12+
embedding_dir: ${dataset_name}_embeddings
13+
compression: gzip
14+
15+
# Precision
16+
embedding_dtype: float32
17+
label_dtype: int8
18+
model_dtype: bfloat16
19+
20+
# Model and embedding parameters
21+
embedding_model: jhu-clsp/mmBERT-base
22+
batch_size: 512
23+
writer_batch_size: 1000
24+
hdf5_dataset_name: train
25+
save_labels: false
26+
max_length: 8192
27+
padding: true
28+
truncation: true
29+
30+
running_on_slurm: true
31+
32+
local_settings: null
33+
34+
slurm_settings:
35+
sbatch_args:
36+
account: "p_gptx"
37+
nodes: 1
38+
ntasks: 1
39+
gres: gpu:4
40+
exclusive: user
41+
partition: "capella"
42+
time: "04:00:00"
43+
cpus_per_task: 32
44+
mem_per_cpu_gb: 8
45+
gpus_per_task: 4
46+
job_name: "MMbert_embedder"
47+
output: ${params.output_dir}/logs/%j.out
48+
error: ${params.output_dir}/logs/%j.err
49+
qos: "normal"
50+
venv_path: /data/cat/ws/alju972f-regression_heads/repos/env/jql_pipeline/bin/activate
51+
tasks: 1
52+
workers: 1
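The `embedding_dtype` choice in these configs directly drives the size of the HDF5 output. A back-of-the-envelope sketch; the 768-dimensional embedding width below is purely an assumed example value, and gzip compression (as configured) will shrink the on-disk size further:

```python
def embedding_bytes(num_docs: int, hidden_dim: int, dtype_bytes: int) -> int:
    """Rough uncompressed size of a (num_docs, hidden_dim) embedding dataset."""
    return num_docs * hidden_dim * dtype_bytes

# 1M documents, 768-dim embeddings at float32 (4 bytes) vs bfloat16 (2 bytes):
fp32 = embedding_bytes(1_000_000, 768, 4)
bf16 = embedding_bytes(1_000_000, 768, 2)
print(f"float32:  {fp32 / 2**30:.2f} GiB")  # ~2.86 GiB
print(f"bfloat16: {bf16 / 2**30:.2f} GiB")  # ~1.43 GiB
```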
