
Commit 71024b3

Merge branch 'master' into embedding_annotation_pipeline_at_scale
2 parents: 2a1bc14 + 6a43648

20 files changed

Lines changed: 2380 additions & 23 deletions

CONTRIBUTING.md

Lines changed: 2 additions & 2 deletions
@@ -20,13 +20,13 @@ This will create a venv with Python3.11 and pip under `.venv`.
 Install the project via
 ```shell
 # ensure that you already created and activated a virtual environment before
-pip install .
+uv pip install .
 ```
 
 For developers, use
 ```shell
 # ensure that you already created and activated a virtual environment before
-pip install -e .[tests,linting]
+uv pip install -e .[tests,linting]
 pre-commit install --install-hooks
 ```
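For reference, a minimal sketch of creating that `.venv` before the install step, assuming `uv` is already installed (the hunk context above says the repo provides its own way to create the venv, so this is only an illustration):

```bash
# Create a Python 3.11 virtual environment under .venv, activate it,
# then install the project through uv's pip interface.
uv venv --python 3.11 .venv
source .venv/bin/activate
uv pip install .
```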

README.md

Lines changed: 13 additions & 0 deletions
@@ -164,6 +164,19 @@ Alternatively, just execute `bash scripts/host_vllm_model.sh $CONTAINER_NAME
 bash scripts/host_vllm_model.sh my_vllm_container 9123 meta-llama/Llama-3.1-8B-Instruct
 ```
 
+##### Mistral
+For Mistral models, make sure to manually set the correct chat template file in `tokenizer_config.json`.
+We tried hosting the model as described by MistralAI,
+```bash
+vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 --tensor-parallel-size 4 --port 8003 --tokenizer_mode mistral --config_format mistral --load_format mistral
+```
+
+but still ran into:
+```bash
+ValueError: Cannot use chat template functions because tokenizer.chat_template is not set and no template argument was passed! For information about writing templates and setting the tokenizer.chat_template attribute, please see the documentation at
+```
+
+
 #### Test the hosted model
 ```bash
 curl http://localhost:port_number/v1/completions \
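One possible workaround for the Mistral chat-template error above is to pass a template explicitly at serve time. `--chat-template` is a standard vLLM flag; the Jinja file path below is a placeholder you would have to supply yourself:

```bash
# Sketch only: serve with an explicit Jinja chat template instead of relying
# on tokenizer_config.json. The .jinja path is hypothetical.
vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
  --tensor-parallel-size 4 --port 8003 \
  --chat-template ./mistral_chat_template.jinja
```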

configs/score_documents/lorem_ipsum.yaml

Lines changed: 4 additions & 2 deletions
@@ -8,7 +8,8 @@ settings:
 
   paths:
     raw_data_file_paths:
-      - data/test_fineweb2_dump.jsonl
+      - data/fineweb_2_500k_both/split/spa_Latn_sampled_500k_0_to_600.jsonl
+      - data/fineweb_2_500k_both/split/srp_Cyrl_sampled_500k_0_to_600.jsonl
     output_directory_path: data/output
     prompt_template_file_path: data/prompts/fineweb_edu/educational_prompt.yaml
     start_indexes:
@@ -48,4 +49,5 @@ document_processor:
   num_processes: 1
   score_metric_name: educational_score
   strings_to_remove: []
-  jq_language_pattern: .metadata.language
+  jq_language_pattern: .language
+  document_id_column: id
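The switch from `.metadata.language` to `.language` implies the language code now sits at the top level of each JSONL record, alongside the new `id` column. A quick sanity check with `jq` (the record below is a made-up illustration, not a line from the shipped data files):

```bash
# Hypothetical record shape assumed by jq_language_pattern and document_id_column:
echo '{"id": "doc-0001", "language": "spa_Latn", "text": "..."}' | jq -r '.language, .id'
# prints:
# spa_Latn
# doc-0001
```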
Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+settings:
+  model_name: google/gemma-2-27b-it
+  num_gpus: 1
+  tokenizer_name_or_path: ${settings.model_name}
+  paths:
+    raw_data_file_paths:
+      - data/debug/deu_Latn_sampled_500k_first_200.jsonl
+    start_indexes:
+      - 150
+    output_directory_path: data/output
+    prompt_template_file_path: data/prompts/pii/pii_prompt.yaml
+llm_rest_client:
+  model_name: ${settings.model_name}
+  max_tokens: 8192
+  sampling_params:
+    max_tokens: 500
+    temperature: 0.7
+    n: 3
+    top_p: 0.9
+  max_pool_connections: 1000
+  max_pool_maxsize: 1000
+  max_retries: 2
+  backoff_factor: 0.4
+  timeout: 100
+  verbose: false
+  num_gpus: ${settings.num_gpus}
+  max_new_tokens: 500
+tokenizer:
+  pretrained_model_name_or_path: ${settings.tokenizer_name_or_path}
+  special_tokens: null
+  add_generation_prompt: true
+prompt_builder:
+  prompt_template_file_path: ${settings.paths.prompt_template_file_path}
+  max_prompt_length: 7690
+document_processor:
+  output_directory_path: ${settings.paths.output_directory_path}
+  queue_size: 1000
+  num_processes: 10
+  score_metric_name: pii_score
+  strings_to_remove: []
+  jq_language_pattern: .language
+  document_id_column: id
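This config (its filename is not shown in this diff view) would presumably be fed to the scoring entry point. A hedged sketch, where the `score_documents` command name, config path, and endpoint URL are all placeholders inferred from the repo layout:

```bash
# Sketch only: actual command name and paths may differ.
python -m ml_filter score_documents \
  --config_file_path configs/score_documents/<pii_config>.yaml \
  --rest_endpoint http://localhost:8003
```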

data/debug/deu_Latn_sampled_500k_first200_id.jsonl

Lines changed: 200 additions & 0 deletions
Large diffs are not rendered by default.

data/debug/deu_Latn_sampled_500k_first_200.jsonl

Lines changed: 200 additions & 0 deletions
Large diffs are not rendered by default.

data/fineweb_2_500k_both/split/spa_Latn_sampled_500k_0_to_600.jsonl

Lines changed: 600 additions & 0 deletions
Large diffs are not rendered by default.

data/fineweb_2_500k_both/split/srp_Cyrl_sampled_500k_0_to_600.jsonl

Lines changed: 600 additions & 0 deletions
Large diffs are not rendered by default.

data/prompts/pii/pii_prompt.yaml

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+prompt: |
+  Below is an extract from a web page. Evaluate the level of Personally Identifiable Information (PII) present in the extract using the additive 5-point scoring system described below. Points are accumulated based on the presence and sensitivity of PII:
+
+  - Add 1 point if the extract contains any potential PII like a name, username, or other identifiers that could be linked to an individual, even if not uniquely identifying on their own.
+  - Add another point if the extract includes direct identifiers that can uniquely identify an individual, such as a full name, email address, phone number, or physical address.
+  - Award a third point if the extract contains sensitive PII like government ID numbers (e.g., SSN, passport), or financial information (e.g., credit card, bank account numbers).
+  - Grant a fourth point if the extract includes highly sensitive PII, such as medical records, biometric data, genetic information, or private communications.
+  - Bestow a fifth point if the extract is dominated by PII, such as a list or database of personal information for multiple individuals.
+  Note: If the extract contains no PII, the score is 0. Information about public figures that is already widely known, or data belonging to fictional characters, should also be scored 0. General discussions about data privacy that do not contain specific individuals' data should be scored based only on any actual PII examples present.
+  The extract:
+  {placeholder}
+  After examining the extract:
+  - Briefly justify your score, up to 100 words.
+  - Conclude with the score using the format: 'PII score: <total points>'
+
+prompt_name: pii_content_filter
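Downstream parsing of the model reply is not shown in this commit; a minimal sketch, assuming the reply ends with the mandated 'PII score: <total points>' line and that `response.txt` is a hypothetical file holding one raw completion:

```bash
# Extract the integer score from the last "PII score: <n>" occurrence.
sed -nE 's/.*PII score:[[:space:]]*([0-9]+).*/\1/p' response.txt | tail -n 1
```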

src/ml_filter/__main__.py

Lines changed: 77 additions & 2 deletions
@@ -21,6 +21,7 @@
 from ml_filter.training.embedding_training_pipeline import run_embedding_head_training_pipeline
 from ml_filter.translate import TranslationServiceType, TranslatorFactory
 from ml_filter.utils.chunk_data import chunk_jsonl
+from ml_filter.utils.get_costs_of_openai_batched_requests import find_and_process_files
 from ml_filter.utils.manipulate_datasets import apply_score_transforms, convert_hf_dataset_to_jsonl, split_dataset
 from ml_filter.utils.manipulate_documents import merge_and_sort_jsonl_files
 from ml_filter.utils.manipulate_prompt import add_target_language_to_prompt
@@ -128,7 +129,18 @@ def main() -> None:
     required=True,
     help="The endpoint for the LLM service.",
 )
-def entry_point_score_documents(config_file_path: Path, rest_endpoint: str, experiment_id: Optional[str] = None):
+@click.option(
+    "--use_llm_rest_client_request_collector",
+    type=bool,
+    default=False,
+    help="Whether to use the LLM REST client request collector to run requests with OpenAI batched API.",
+)
+def entry_point_score_documents(
+    config_file_path: Path,
+    rest_endpoint: str,
+    use_llm_rest_client_request_collector,
+    experiment_id: Optional[str] = None,
+):
     with open(config_file_path, "rb") as f:
         hash_value = hashlib.sha256(f.read()).hexdigest()[:8]
     experiment_id_postfix = datetime.now().strftime("%Y-%m-%d__%H-%M-%S") + f"__{hash_value}"
@@ -137,7 +149,12 @@ def entry_point_score_documents(config_file_path: Path, rest_endpoint: str, experiment_id: Optional[str] = None):
         experiment_id = experiment_id_postfix
     else:
         experiment_id = experiment_id + f"/{experiment_id_postfix}"
-    llm_service = LLMClient(config_file_path=config_file_path, experiment_id=experiment_id, rest_endpoint=rest_endpoint)
+    llm_service = LLMClient(
+        config_file_path=config_file_path,
+        experiment_id=experiment_id,
+        rest_endpoint=rest_endpoint,
+        use_llm_rest_client_request_collector=use_llm_rest_client_request_collector,
+    )
     llm_service.run()
 
 
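The new flag is toggled at the same call site; a hedged sketch (command name as assumed earlier for the PII config, flag per the click definition above):

```bash
# Collect requests for later OpenAI Batch submission instead of scoring live.
python -m ml_filter score_documents \
  --config_file_path configs/score_documents/lorem_ipsum.yaml \
  --rest_endpoint http://localhost:8003 \
  --use_llm_rest_client_request_collector true
```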

@@ -751,6 +768,64 @@ def _get_target_language_codes_list_helper(target_language_codes: str) -> list[str]:
     return [lang_code.strip().lower() for lang_code in target_language_codes.split(",")]
 
 
+@main.command(name="submit_batch_requests")
+@click.argument("input_files", nargs=-1, type=click.Path(exists=True, path_type=Path))
+@click.option(
+    "--model_name",
+    type=str,
+    required=True,
+    help="Name of the OpenAI model to use.",
+)
+@click.option(
+    "--max_requests_per_file",
+    type=int,
+    default=None,
+    help="Maximum number of requests to send per input file.",
+)
+@click.option(
+    "--check_status_only",
+    type=bool,
+    default=False,
+    help="Whether to check the status of existing batch requests only.",
+)
+def submit_collected_requests_to_batched_openai_api_cli(
+    input_files: tuple[Path], model_name: str, max_requests_per_file: int | None, check_status_only: bool
+):
+    """
+    CLI command to submit collected requests to the batched OpenAI API.
+    """
+    from ml_filter.llm_api.openai_batch_request_collector import OpenAIBatchAPIRequestSubmitter
+
+    input_files = [Path(p) for p in input_files]
+    # Not all models are supported: https://community.openai.com/t/error-on-tryng-to-use-batches/935474/7
+    collector = OpenAIBatchAPIRequestSubmitter(
+        input_files=input_files, model_name=model_name, max_requests_per_file=max_requests_per_file
+    )
+    if not check_status_only:
+        collector.submit()
+    else:
+        collector.check_status_maybe_get_results()
+
+
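A hedged usage sketch for the new command, assuming `python -m ml_filter` dispatches to this click group; the input file names are placeholders, and `gpt-4o-mini` is just one example of a Batch-API-eligible model:

```bash
# Submit collected request files as OpenAI batch jobs.
python -m ml_filter submit_batch_requests \
  data/output/requests_part_0.jsonl data/output/requests_part_1.jsonl \
  --model_name gpt-4o-mini \
  --max_requests_per_file 1000

# Later: only poll status (and fetch results when finished).
python -m ml_filter submit_batch_requests \
  data/output/requests_part_0.jsonl data/output/requests_part_1.jsonl \
  --model_name gpt-4o-mini \
  --check_status_only true
```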
+@main.command(name="get_costs_of_openai_batched_requests")
+@click.option(
+    "--root_directory",
+    type=str,
+    required=True,
+    help="The root directory to search recursively.",
+)
+@click.option(
+    "-o",
+    "--output_file",
+    type=str,
+    default="report.md",
+    show_default=True,
+    help="Path to save the markdown report (default: report.md under the root dir).",
+)
+def get_costs_of_openai_batched_requests_cli(root_directory: str, output_file: str):
+    find_and_process_files(root_directory, output_file)
+
+
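And a matching sketch for the cost-report command (the directory and output path are placeholders; option names are per the click definitions above):

```bash
# Walk data/output recursively and write a markdown cost report.
python -m ml_filter get_costs_of_openai_batched_requests \
  --root_directory data/output \
  -o data/output/report.md
```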
 @main.command(name="train_with_embeddings")
 @click.option(
     "--config_file_path",
