Commit a0270e8

Add dpo improvements arguments

1 parent ecd68a4 commit a0270e8

3 files changed: 85 additions & 1 deletion

src/together/cli/api/finetune.py (38 additions, 0 deletions)
@@ -142,6 +142,36 @@ def fine_tuning(ctx: click.Context) -> None:
     default=0.1,
     help="Beta parameter for DPO training (only used when '--training-method' is 'dpo')",
 )
+@click.option(
+    "--dpo-normalize-logratios-by-length",
+    type=bool,
+    default=False,
+    help=(
+        "Whether to normalize logratios by sample length "
+        "(only used when '--training-method' is 'dpo')"
+    ),
+)
+@click.option(
+    "--dpo-reference-free",
+    type=bool,
+    default=False,
+    help="Whether to skip reference logits usage (only used when '--training-method' is 'dpo')",
+)
+@click.option(
+    "--rpo-alpha",
+    type=float,
+    default=0.0,
+    help=(
+        "RPO alpha parameter of DPO training to include NLL in the loss "
+        "(only used when '--training-method' is 'dpo')"
+    ),
+)
+@click.option(
+    "--simpo-gamma",
+    type=float,
+    default=0.1,
+    help="SimPO gamma parameter (only used when '--training-method' is 'dpo')",
+)
 @click.option(
     "--suffix",
     "-s",
@@ -206,6 +236,10 @@ def create(
     train_on_inputs: bool | Literal["auto"],
     training_method: str,
     dpo_beta: float,
+    dpo_normalize_logratios_by_length: bool,
+    dpo_reference_free: bool,
+    rpo_alpha: float,
+    simpo_gamma: float,
     from_checkpoint: str,
 ) -> None:
     """Start fine-tuning"""
@@ -239,6 +273,10 @@ def create(
         train_on_inputs=train_on_inputs,
         training_method=training_method,
         dpo_beta=dpo_beta,
+        dpo_normalize_logratios_by_length=dpo_normalize_logratios_by_length,
+        dpo_reference_free=dpo_reference_free,
+        rpo_alpha=rpo_alpha,
+        simpo_gamma=simpo_gamma,
        from_checkpoint=from_checkpoint,
     )
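
Taken together, the new options extend the `create` command's DPO surface. A hypothetical invocation (the `together fine-tuning create` entry point, the file ID, and the model name are placeholders/assumptions, not part of this diff; Click's BOOL type accepts literals such as `true`/`false` for the two boolean options):

```
together fine-tuning create \
    --training-file file-abc123 \
    --model meta-llama/Meta-Llama-3.1-8B-Instruct-Reference \
    --training-method dpo \
    --dpo-beta 0.1 \
    --dpo-normalize-logratios-by-length true \
    --rpo-alpha 0.5 \
    --simpo-gamma 0.3
```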

src/together/resources/finetune.py (43 additions, 1 deletion)
@@ -72,6 +72,10 @@ def create_finetune_request(
     train_on_inputs: bool | Literal["auto"] | None = None,
     training_method: str = "sft",
     dpo_beta: float | None = None,
+    dpo_normalize_logratios_by_length: bool = False,
+    dpo_reference_free: bool = False,
+    rpo_alpha: float | None = None,
+    simpo_gamma: float | None = None,
     from_checkpoint: str | None = None,
 ) -> FinetuneRequest:
     if model is not None and from_checkpoint is not None:
@@ -182,6 +186,14 @@
 
     if dpo_beta is not None and training_method != "dpo":
         raise ValueError("dpo_beta is only supported for DPO training")
+    if dpo_normalize_logratios_by_length and training_method != "dpo":
+        raise ValueError("dpo_normalize_logratios_by_length=True is only supported for DPO training")
+    if dpo_reference_free and training_method != "dpo":
+        raise ValueError("dpo_reference_free=True is only supported for DPO training")
+    if rpo_alpha is not None and training_method != "dpo":
+        raise ValueError("rpo_alpha is only supported for DPO training")
+    if simpo_gamma is not None and training_method != "dpo":
+        raise ValueError("simpo_gamma is only supported for DPO training")
 
     lr_scheduler: FinetuneLRScheduler
     if lr_scheduler_type == "cosine":
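
One detail worth noting in these guards: the float-valued knobs are compared against None, so an explicit rpo_alpha=0.0 still counts as "set", while the boolean knobs only trip the check when True. A minimal standalone sketch of the pattern (illustrative, not the library's code):

```python
def check_dpo_only(
    training_method: str,
    rpo_alpha: float | None,
    dpo_reference_free: bool,
) -> None:
    # Float knobs are tested against None, so rpo_alpha=0.0 is rejected for
    # non-DPO runs; bool knobs only trigger the guard when True.
    if rpo_alpha is not None and training_method != "dpo":
        raise ValueError("rpo_alpha is only supported for DPO training")
    if dpo_reference_free and training_method != "dpo":
        raise ValueError("dpo_reference_free=True is only supported for DPO training")

check_dpo_only("dpo", rpo_alpha=0.0, dpo_reference_free=False)   # passes
# check_dpo_only("sft", rpo_alpha=0.0, dpo_reference_free=False) # would raise
```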
@@ -204,7 +216,13 @@
     if training_method == "sft":
         training_method_cls = TrainingMethodSFT(train_on_inputs=train_on_inputs)
     elif training_method == "dpo":
-        training_method_cls = TrainingMethodDPO(dpo_beta=dpo_beta)
+        training_method_cls = TrainingMethodDPO(
+            dpo_beta=dpo_beta,
+            dpo_normalize_logratios_by_length=dpo_normalize_logratios_by_length,
+            dpo_reference_free=dpo_reference_free,
+            rpo_alpha=rpo_alpha,
+            simpo_gamma=simpo_gamma,
+        )
 
     finetune_request = FinetuneRequest(
         model=model,
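
TrainingMethodDPO is a plain data model (defined in src/together/types/finetune.py, shown below), so the payload this branch builds can be inspected directly. A quick sketch, assuming the class is a pydantic v2 model with model_dump available:

```python
from together.types.finetune import TrainingMethodDPO

method = TrainingMethodDPO(
    dpo_beta=0.1,
    dpo_normalize_logratios_by_length=True,
    dpo_reference_free=False,
    rpo_alpha=0.5,
    simpo_gamma=0.3,
)
# method="dpo" is fixed by the Literal default; the rest mirror the new fields.
print(method.model_dump())
```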
@@ -302,6 +320,10 @@ def create(
         train_on_inputs: bool | Literal["auto"] | None = None,
         training_method: str = "sft",
         dpo_beta: float | None = None,
+        dpo_normalize_logratios_by_length: bool = False,
+        dpo_reference_free: bool = False,
+        rpo_alpha: float | None = None,
+        simpo_gamma: float | None = None,
         from_checkpoint: str | None = None,
     ) -> FinetuneResponse:
         """
@@ -353,6 +375,10 @@ def create(
             training_method (str, optional): Training method. Defaults to "sft".
                 Supported methods: "sft", "dpo".
             dpo_beta (float, optional): DPO beta parameter. Defaults to None.
+            dpo_normalize_logratios_by_length (bool): Whether to normalize logratios by sample length. Defaults to False.
+            dpo_reference_free (bool): Whether to skip reference logits usage. Defaults to False.
+            rpo_alpha (float, optional): RPO alpha parameter of DPO training to include NLL in the loss. Defaults to None.
+            simpo_gamma (float, optional): SimPO gamma parameter. Defaults to None.
             from_checkpoint (str, optional): The checkpoint identifier to continue training from a previous fine-tuning job.
                 The format: {$JOB_ID/$OUTPUT_MODEL_NAME}:{$STEP}.
                 The step value is optional, without it the final checkpoint will be used.
@@ -405,6 +431,10 @@ def create(
            train_on_inputs=train_on_inputs,
            training_method=training_method,
            dpo_beta=dpo_beta,
+           dpo_normalize_logratios_by_length=dpo_normalize_logratios_by_length,
+           dpo_reference_free=dpo_reference_free,
+           rpo_alpha=rpo_alpha,
+           simpo_gamma=simpo_gamma,
            from_checkpoint=from_checkpoint,
        )
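
At the SDK level, the sync client now forwards the new knobs end to end. A hedged usage sketch (the file ID and model name are placeholders; the training file is expected to be an uploaded preference dataset, per the existing DPO flow):

```python
from together import Together

client = Together()  # reads TOGETHER_API_KEY from the environment

job = client.fine_tuning.create(
    training_file="file-abc123",
    model="meta-llama/Meta-Llama-3.1-8B-Instruct-Reference",
    training_method="dpo",
    dpo_beta=0.1,
    dpo_normalize_logratios_by_length=True,
    rpo_alpha=0.5,
    simpo_gamma=0.3,
)
print(job.id)
```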

@@ -714,6 +744,10 @@ async def create(
         train_on_inputs: bool | Literal["auto"] | None = None,
         training_method: str = "sft",
         dpo_beta: float | None = None,
+        dpo_normalize_logratios_by_length: bool = False,
+        dpo_reference_free: bool = False,
+        rpo_alpha: float | None = None,
+        simpo_gamma: float | None = None,
         from_checkpoint: str | None = None,
     ) -> FinetuneResponse:
         """
@@ -765,6 +799,10 @@ async def create(
             training_method (str, optional): Training method. Defaults to "sft".
                 Supported methods: "sft", "dpo".
             dpo_beta (float, optional): DPO beta parameter. Defaults to None.
+            dpo_normalize_logratios_by_length (bool): Whether to normalize logratios by sample length. Defaults to False.
+            dpo_reference_free (bool): Whether to skip reference logits usage. Defaults to False.
+            rpo_alpha (float, optional): RPO alpha parameter of DPO training to include NLL in the loss. Defaults to None.
+            simpo_gamma (float, optional): SimPO gamma parameter. Defaults to None.
             from_checkpoint (str, optional): The checkpoint identifier to continue training from a previous fine-tuning job.
                 The format: {$JOB_ID/$OUTPUT_MODEL_NAME}:{$STEP}.
                 The step value is optional, without it the final checkpoint will be used.
@@ -817,6 +855,10 @@ async def create(
            train_on_inputs=train_on_inputs,
            training_method=training_method,
            dpo_beta=dpo_beta,
+           dpo_normalize_logratios_by_length=dpo_normalize_logratios_by_length,
+           dpo_reference_free=dpo_reference_free,
+           rpo_alpha=rpo_alpha,
+           simpo_gamma=simpo_gamma,
            from_checkpoint=from_checkpoint,
        )

src/together/types/finetune.py (4 additions, 0 deletions)
@@ -159,6 +159,10 @@ class TrainingMethodDPO(TrainingMethod):
 
     method: Literal["dpo"] = "dpo"
     dpo_beta: float | None = None
+    dpo_normalize_logratios_by_length: bool = False
+    dpo_reference_free: bool = False
+    rpo_alpha: float | None = None
+    simpo_gamma: float | None = None
 
 
 class FinetuneRequest(BaseModel):
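
The training-side semantics are not part of this commit, but the four fields map onto well-known DPO variants: length-normalized log-ratios, the reference-free objective popularized by SimPO, SimPO's target reward margin gamma, and RPO's alpha-weighted NLL term. A rough, illustrative sketch of how such a loss is conventionally assembled in the literature (an assumption, not Together's implementation; the function name is hypothetical):

```python
import math

def sigmoid(x: float) -> float:
    return 1.0 / (1.0 + math.exp(-x))

def dpo_style_loss(
    logp_chosen: float,       # policy log-prob of the chosen completion (token sum)
    logp_rejected: float,     # policy log-prob of the rejected completion
    ref_logp_chosen: float,   # reference-model log-probs
    ref_logp_rejected: float,
    len_chosen: int,
    len_rejected: int,
    dpo_beta: float = 0.1,
    dpo_normalize_logratios_by_length: bool = False,
    dpo_reference_free: bool = False,
    rpo_alpha: float = 0.0,
    simpo_gamma: float = 0.0,
) -> float:
    # Reference-free mode (as in SimPO) drops the reference terms entirely.
    if dpo_reference_free:
        lr_chosen, lr_rejected = logp_chosen, logp_rejected
    else:
        lr_chosen = logp_chosen - ref_logp_chosen
        lr_rejected = logp_rejected - ref_logp_rejected
    # Optional length normalization of the log-ratios.
    if dpo_normalize_logratios_by_length:
        lr_chosen /= len_chosen
        lr_rejected /= len_rejected
    # simpo_gamma acts as a target reward margin; gamma=0 recovers plain DPO.
    margin = dpo_beta * (lr_chosen - lr_rejected) - simpo_gamma
    loss = -math.log(sigmoid(margin))
    # rpo_alpha weights an extra NLL term on the chosen completion (RPO).
    return loss + rpo_alpha * (-logp_chosen)
```

With all four extras left at their defaults this reduces to the standard DPO objective, which matches the defaults of the new fields above.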
