Skip to content

Commit 9cc4cd0

Browse files
authored
Add sequence confidence to pretranslations (#279)
1 parent b9219fc commit 9cc4cd0

9 files changed

Lines changed: 95 additions & 36 deletions

machine/jobs/nmt_engine_build_job.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ def _batch_inference(
115115
check_canceled()
116116
for i, result in enumerate(engine.translate_batch(seg_batch)):
117117
pretranslations[current_inference_step + i]["translation"] = result.translation
118+
pretranslations[current_inference_step + i]["sequenceConfidence"] = result.sequence_confidence
118119
current_inference_step += len(seg_batch)
119120
phase_progress(ProgressStatus.from_step(current_inference_step, inference_step_count))
120121

machine/jobs/translation_file_service.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ class PretranslationInfo(TypedDict):
2020
sourceTokens: List[str] # noqa: N815
2121
translationTokens: List[str] # noqa: N815
2222
alignment: str
23+
sequenceConfidence: float # noqa: N815
2324

2425

2526
class TranslationFileService:
@@ -98,6 +99,7 @@ def generator() -> Generator[PretranslationInfo, None, None]:
9899
sourceTokens=list(),
99100
translationTokens=list(),
100101
alignment="",
102+
sequenceConfidence=0,
101103
)
102104

103105
return ContextManagedGenerator(generator())

machine/translation/huggingface/hugging_face_nmt_engine.py

Lines changed: 54 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,8 @@ def _try_translate_n_batch(
164164
builder = TranslationResultBuilder(input_tokens)
165165
for token, score in zip(output["translation_tokens"], output["token_scores"]):
166166
builder.append_token(token, TranslationSources.NMT, exp(score))
167+
if output["sequence_score"] is not None:
168+
builder.set_sequence_confidence(exp(output["sequence_score"]))
167169
word_pairs: Optional[Collection[Union[AlignedWordPair, Tuple[int, int]]]] = None
168170
if output.get("token_attentions") is not None:
169171
src_indices = torch.argmax(output["token_attentions"], dim=1).tolist()
@@ -257,36 +259,56 @@ def _forward(self, model_inputs, **generate_kwargs):
257259
output_ids = output.sequences
258260
beam_indices = output.beam_indices
259261
scores = output.scores
262+
assert scores is not None and beam_indices is not None
263+
sequences_scores = output.sequences_scores
260264
attentions = output.cross_attentions
261265
elif isinstance(output, GreedySearchEncoderDecoderOutput):
262266
output_ids = output.sequences
263-
beam_indices = torch.zeros_like(output_ids)
267+
beam_indices = None
264268
assert output.scores is not None
265-
scores = tuple(torch.nn.functional.log_softmax(logits, dim=-1) for logits in output.scores)
269+
scores = output.scores
270+
sequences_scores = None
266271
attentions = output.cross_attentions
267272
else:
268273
raise RuntimeError("Cannot postprocess the output of the model.")
269274

270-
assert beam_indices is not None and scores is not None
271-
out_b = output_ids.shape[0]
275+
transition_scores = cast(
276+
torch.Tensor,
277+
self.model.compute_transition_scores(
278+
output_ids, # type: ignore
279+
scores, # type: ignore
280+
beam_indices, # type: ignore
281+
normalize_logits=True,
282+
),
283+
)
284+
285+
if beam_indices is None:
286+
beam_indices = torch.zeros_like(output_ids)
287+
288+
out_b, seq_len = output_ids.shape
272289
num_beams = scores[0].shape[0] // in_b
273290
n_sequences = out_b // in_b
291+
292+
ts_len = transition_scores.shape[1]
293+
if ts_len == seq_len:
294+
token_logprobs = transition_scores
295+
elif ts_len == seq_len - 1:
296+
token_logprobs = torch.cat(
297+
[
298+
torch.zeros(out_b, 1, device=transition_scores.device, dtype=transition_scores.dtype),
299+
transition_scores,
300+
],
301+
dim=1,
302+
)
303+
else:
304+
raise RuntimeError(
305+
f"Unexpected transition_scores length {ts_len} for sequences length {seq_len}. "
306+
"Cannot align token scores robustly."
307+
)
308+
274309
start_index = 0
275310
if self.model.config.decoder_start_token_id is not None:
276311
start_index = 1
277-
indices = torch.stack(
278-
(
279-
torch.arange(output_ids.shape[1] - start_index, device=output_ids.device).expand(in_b, n_sequences, -1),
280-
torch.reshape(beam_indices[:, start_index:] % num_beams, (in_b, n_sequences, -1)),
281-
torch.reshape(output_ids[:, start_index:], (in_b, n_sequences, -1)),
282-
),
283-
dim=3,
284-
)
285-
scores = torch.stack(scores, dim=0).reshape(len(scores), in_b, num_beams, -1).transpose(0, 1)
286-
scores = torch_gather_nd(scores, indices, 1)
287-
if self.model.config.decoder_start_token_id is not None:
288-
scores = torch.cat((torch.zeros(scores.shape[0], scores.shape[1], 1, device=scores.device), scores), dim=2)
289-
290312
if generate_kwargs["output_attentions"] is True:
291313
assert attentions is not None
292314
num_heads = attentions[0][0].shape[1]
@@ -320,13 +342,15 @@ def _forward(self, model_inputs, **generate_kwargs):
320342
),
321343
dim=2,
322344
)
345+
output_ids = output_ids.reshape(in_b, n_sequences, seq_len)
346+
token_logprobs = token_logprobs.reshape(in_b, n_sequences, seq_len)
323347

324-
output_ids = output_ids.reshape(in_b, n_sequences, *output_ids.shape[1:])
325348
return {
326349
"input_ids": model_inputs["input_ids"],
327350
"input_tokens": input_tokens,
328351
"output_ids": output_ids,
329-
"scores": scores,
352+
"scores": token_logprobs,
353+
"sequences_scores": sequences_scores,
330354
"attentions": attentions,
331355
}
332356

@@ -346,24 +370,17 @@ def postprocess(self, model_outputs, clean_up_tokenization_spaces=False):
346370
records = []
347371

348372
has_attentions = model_outputs.get("attentions") is not None and model_outputs["attentions"][0] is not None
349-
if has_attentions:
350-
zipped = zip(
351-
model_outputs["output_ids"][0],
352-
model_outputs["scores"][0],
353-
model_outputs["attentions"][0],
354-
)
355-
else:
356-
zipped = zip(
357-
model_outputs["output_ids"][0],
358-
model_outputs["scores"][0],
359-
)
360-
373+
has_sequence_scores = model_outputs["sequences_scores"] is not None
374+
zipped = zip(
375+
model_outputs["output_ids"][0],
376+
model_outputs["scores"][0],
377+
model_outputs["sequences_scores"] if has_sequence_scores else iter(lambda: None, 1),
378+
model_outputs["attentions"][0] if has_attentions else iter(lambda: None, 1),
379+
)
361380
for item in zipped:
362-
if has_attentions:
363-
output_ids, scores, attentions = cast(Tuple[torch.Tensor, torch.Tensor, torch.Tensor], item)
364-
else:
365-
output_ids, scores = cast(Tuple[torch.Tensor, torch.Tensor], item)
366-
attentions = None
381+
output_ids, scores, sequence_score, attentions = cast(
382+
Tuple[torch.Tensor, torch.Tensor, Optional[float], Optional[torch.Tensor]], item
383+
)
367384

368385
output_tokens: List[str] = []
369386
output_indices: List[int] = []
@@ -379,6 +396,7 @@ def postprocess(self, model_outputs, clean_up_tokenization_spaces=False):
379396
"input_tokens": input_tokens,
380397
"translation_tokens": output_tokens,
381398
"token_scores": scores,
399+
"sequence_score": sequence_score,
382400
"translation_text": self.tokenizer.decode(
383401
output_ids,
384402
skip_special_tokens=True,

machine/translation/translation_result.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ def __init__(
1212
source_tokens: Iterable[str],
1313
target_tokens: Iterable[str],
1414
confidences: Iterable[float],
15+
sequence_confidence: float,
1516
sources: Iterable[TranslationSources],
1617
alignment: WordAlignmentMatrix,
1718
phrases: Iterable[Phrase],
@@ -20,6 +21,7 @@ def __init__(
2021
self._source_tokens = list(source_tokens)
2122
self._target_tokens = list(target_tokens)
2223
self._confidences = list(confidences)
24+
self._sequence_confidence = sequence_confidence
2325
self._sources = list(sources)
2426
self._alignment = alignment
2527
self._phrases = list(phrases)
@@ -49,6 +51,10 @@ def target_tokens(self) -> Sequence[str]:
4951
def confidences(self) -> Sequence[float]:
5052
return self._confidences
5153

54+
@property
55+
def sequence_confidence(self) -> float:
56+
return self._sequence_confidence
57+
5258
@property
5359
def sources(self) -> Sequence[TranslationSources]:
5460
return self._sources

machine/translation/translation_result_builder.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ def __init__(
2828
self._confidences: List[float] = []
2929
self._sources: List[TranslationSources] = []
3030
self._phrases: List[PhraseInfo] = []
31+
self._sequence_confidence: float = -1.0
3132

3233
@property
3334
def source_tokens(self) -> Sequence[str]:
@@ -49,6 +50,10 @@ def sources(self) -> Sequence[TranslationSources]:
4950
def phrases(self) -> Sequence[PhraseInfo]:
5051
return self._phrases
5152

53+
@property
54+
def sequence_confidence(self) -> float:
55+
return self._sequence_confidence
56+
5257
def append_token(self, token: str, source: TranslationSources, confidence: float) -> None:
5358
self._target_tokens.append(token)
5459
self._sources.append(source)
@@ -60,6 +65,9 @@ def mark_phrase(self, source_segment_range: Range[int], alignment: WordAlignment
6065
def set_confidence(self, index: int, confidence: float) -> None:
6166
self._confidences[index] = confidence
6267

68+
def set_sequence_confidence(self, sequence_confidence: float) -> None:
69+
self._sequence_confidence = sequence_confidence
70+
6371
def correct_prefix(
6472
self,
6573
word_ops: Iterable[EditOperation],
@@ -165,6 +173,7 @@ def to_result(self, translation: Optional[str] = None) -> TranslationResult:
165173
self._source_tokens,
166174
self._target_tokens,
167175
self._confidences,
176+
self._sequence_confidence,
168177
sources,
169178
alignment,
170179
phrases,

machine/translation/truecaser.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ def truecase_translation_result(
2929
result.source_tokens,
3030
target_tokens,
3131
result.confidences,
32+
result.sequence_confidence,
3233
result.sources,
3334
result.alignment,
3435
result.phrases,

tests/jobs/test_nmt_engine_build_job.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,10 +50,12 @@ def test_run(decoy: Decoy) -> None:
5050
]
5151
assert pretranslations[0]["translationTokens"] == ["Please", ",", "I", "have", "booked", "a", "room", "."]
5252
assert len(pretranslations[0]["alignment"]) > 0
53+
assert pretranslations[0]["sequenceConfidence"] == 0.5
5354
else:
5455
assert pretranslations[0]["sourceTokens"] == []
5556
assert pretranslations[0]["translationTokens"] == []
5657
assert len(pretranslations[0]["alignment"]) == 0
58+
assert pretranslations[0]["sequenceConfidence"] == 0.5
5759
decoy.verify(env.translation_file_service.save_model(Path("model.tar.gz"), "models/save-model.tar.gz"), times=1)
5860

5961

@@ -86,6 +88,7 @@ def __init__(self, decoy: Decoy) -> None:
8688
source_tokens="Por favor , tengo reservada una habitación .".split(),
8789
target_tokens="Please , I have booked a room .".split(),
8890
confidences=[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
91+
sequence_confidence=0.5,
8992
sources=[
9093
TranslationSources.NMT,
9194
TranslationSources.NMT,
@@ -135,6 +138,7 @@ def __init__(self, decoy: Decoy) -> None:
135138
sourceTokens=[],
136139
translationTokens=[],
137140
alignment="",
141+
sequenceConfidence=0.5,
138142
)
139143
]
140144
)

tests/jobs/test_smt_engine_build_job.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ def __init__(self, decoy: Decoy) -> None:
6565
source_tokens="Por favor , tengo reservada una habitación .".split(),
6666
target_tokens="Please , I have booked a room .".split(),
6767
confidences=[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
68+
sequence_confidence=0.5,
6869
sources=[
6970
TranslationSources.SMT,
7071
TranslationSources.SMT,
@@ -140,6 +141,7 @@ def __init__(self, decoy: Decoy) -> None:
140141
sourceTokens=[],
141142
translationTokens=[],
142143
alignment="",
144+
sequenceConfidence=0.5,
143145
)
144146
]
145147
)

tests/translation/huggingface/test_hugging_face_nmt_engine.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,12 @@
55

66
skip("skipping Hugging Face tests on MacOS", allow_module_level=True)
77

8+
from math import exp, log
9+
810
from pytest import approx, mark, raises
911

1012
from machine.translation.huggingface import HuggingFaceNmtEngine
13+
from machine.translation.translation_result import TranslationResult
1114

1215

1316
@mark.parametrize("output_attentions", [True, False])
@@ -26,16 +29,23 @@ def test_translate_n_batch_beam(output_attentions: bool) -> None:
2629
)
2730
assert results[0][0].translation == "skaberskaber Dollar Dollar ፤ ፤ gerekir gerekir"
2831
assert results[0][0].confidences[0] == approx(1.08e-05, 0.01)
32+
assert results[0][0].sequence_confidence == approx(_get_sequence_confidence(results[0][0]), 0.01)
2933
assert str(results[0][0].alignment) == ("2-0 2-1 2-2 2-3 4-4 4-5 4-6 4-7" if output_attentions else "")
34+
3035
assert results[0][1].translation == "skaberskaber Dollar Dollar ፤ ፤ ፤ gerekir"
3136
assert results[0][1].confidences[0] == approx(1.08e-05, 0.01)
37+
assert results[0][1].sequence_confidence == approx(_get_sequence_confidence(results[0][1]), 0.01)
3238
assert str(results[0][1].alignment) == ("2-0 2-1 2-2 2-3 4-4 4-5 4-6 4-7" if output_attentions else "")
39+
3340
assert results[1][0].translation == "skaberskaber Dollar Dollar ፤ ፤ gerekir gerekir"
3441
assert results[1][0].confidences[0] == approx(1.08e-05, 0.01)
3542
assert str(results[1][0].alignment) == ("0-1 0-2 0-7 1-0 3-3 3-4 3-5 3-6" if output_attentions else "")
43+
assert results[1][0].sequence_confidence == approx(_get_sequence_confidence(results[1][0]), 0.01)
44+
3645
assert results[1][1].translation == "skaberskaber Dollar Dollar ፤ ፤ ፤ gerekir"
3746
assert results[1][1].confidences[0] == approx(1.08e-05, 0.01)
3847
assert str(results[1][1].alignment) == ("0-1 0-2 0-7 1-0 3-3 3-4 3-5 3-6" if output_attentions else "")
48+
assert results[1][1].sequence_confidence == approx(_get_sequence_confidence(results[1][1]), 0.01)
3949

4050

4151
@mark.parametrize("output_attentions", [True, False])
@@ -46,10 +56,16 @@ def test_translate_greedy(output_attentions: bool) -> None:
4656
result = engine.translate("This is a test string")
4757
assert result.translation == "skaberskaber Dollar Dollar Dollar ፤ gerekir gerekir"
4858
assert result.confidences[0] == approx(1.08e-05, 0.01)
59+
assert result.sequence_confidence == -1.0
4960
assert str(result.alignment) == ("2-0 2-1 2-2 2-3 4-4 4-5 4-6 4-7" if output_attentions else "")
5061

5162

5263
@mark.parametrize("output_attentions", [True, False])
5364
def test_construct_invalid_lang(output_attentions: bool) -> None:
5465
with raises(ValueError):
5566
HuggingFaceNmtEngine("stas/tiny-m2m_100", src_lang="qaa", tgt_lang="es", output_attentions=output_attentions)
67+
68+
69+
def _get_sequence_confidence(result: TranslationResult) -> float:
70+
# Inject a 0 score for the BOS token
71+
return exp(sum([log(c) for c in result.confidences] + [0]) / (len(result.confidences) + 1))

0 commit comments

Comments
 (0)