@@ -80,6 +80,7 @@ def create_finetune_request(
     train_on_inputs: bool | Literal["auto"] = "auto",
     training_method: str = "sft",
     dpo_beta: float | None = None,
+    rpo_alpha: float | None = None,
     from_checkpoint: str | None = None,
 ) -> FinetuneRequest:
     if model is not None and from_checkpoint is not None:
@@ -193,7 +194,7 @@ def create_finetune_request(
 
     training_method_cls: TrainingMethodSFT | TrainingMethodDPO = TrainingMethodSFT()
     if training_method == "dpo":
-        training_method_cls = TrainingMethodDPO(dpo_beta=dpo_beta)
+        training_method_cls = TrainingMethodDPO(dpo_beta=dpo_beta, rpo_alpha=rpo_alpha)
 
     finetune_request = FinetuneRequest(
         model=model,
@@ -322,6 +323,7 @@ def create(
         train_on_inputs: bool | Literal["auto"] = "auto",
         training_method: str = "sft",
         dpo_beta: float | None = None,
+        rpo_alpha: float | None = None,
         from_checkpoint: str | None = None,
     ) -> FinetuneResponse:
         """
@@ -373,6 +375,7 @@ def create(
             training_method (str, optional): Training method. Defaults to "sft".
                 Supported methods: "sft", "dpo".
             dpo_beta (float, optional): DPO beta parameter. Defaults to None.
+            rpo_alpha (float, optional): RPO alpha to control the weight of NLL loss component for chosen responses. Defaults to None.
             from_checkpoint (str, optional): The checkpoint identifier to continue training from a previous fine-tuning job.
                 The format: {$JOB_ID/$OUTPUT_MODEL_NAME}:{$STEP}.
                 The step value is optional, without it the final checkpoint will be used.
@@ -425,6 +428,7 @@ def create(
             train_on_inputs=train_on_inputs,
             training_method=training_method,
             dpo_beta=dpo_beta,
+            rpo_alpha=rpo_alpha,
             from_checkpoint=from_checkpoint,
         )
 
@@ -710,6 +714,7 @@ async def create(
         train_on_inputs: bool | Literal["auto"] = "auto",
         training_method: str = "sft",
         dpo_beta: float | None = None,
+        rpo_alpha: float | None = None,
         from_checkpoint: str | None = None,
     ) -> FinetuneResponse:
         """
@@ -761,6 +766,7 @@ async def create(
             training_method (str, optional): Training method. Defaults to "sft".
                 Supported methods: "sft", "dpo".
             dpo_beta (float, optional): DPO beta parameter. Defaults to None.
+            rpo_alpha (float, optional): RPO alpha to control the weight of NLL loss component for chosen responses. Defaults to None.
             from_checkpoint (str, optional): The checkpoint identifier to continue training from a previous fine-tuning job.
                 The format: {$JOB_ID/$OUTPUT_MODEL_NAME}:{$STEP}.
                 The step value is optional, without it the final checkpoint will be used.
@@ -813,6 +819,7 @@ async def create(
             train_on_inputs=train_on_inputs,
             training_method=training_method,
             dpo_beta=dpo_beta,
+            rpo_alpha=rpo_alpha,
             from_checkpoint=from_checkpoint,
         )
 
0 commit comments