Skip to content

Commit cd06553

Browse files
authored
Add Quality Estimation support (#397)
1 parent f7ae987 commit cd06553

17 files changed

Lines changed: 751 additions & 0 deletions
Lines changed: 344 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,344 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Linq;
4+
using SIL.Machine.Corpora;
5+
6+
namespace SIL.Machine.QualityEstimation
7+
{
8+
/// <summary>
9+
/// Provides chrF3 quality estimation for pre-translations.
10+
/// </summary>
11+
public class ChrF3QualityEstimator
12+
{
13+
private readonly double _intercept;
14+
private readonly double _slope;
15+
16+
/// <summary>
17+
/// Initializes a new instance of the ChrF3QualityEstimator class with the specified slope and intercept values.
18+
/// </summary>
19+
/// <param name="slope">The slope value used in the quality estimation calculation.</param>
20+
/// <param name="intercept">The intercept value used in the quality estimation calculation.</param>
21+
public ChrF3QualityEstimator(double slope, double intercept)
{
    // Store both coefficients in a single deconstructing assignment;
    // they are later handed to every score that this estimator creates.
    (_slope, _intercept) = (slope, intercept);
}
26+
27+
/// <summary>
28+
/// The threshold values used to calculate the usability label for every book or text.
29+
/// </summary>
30+
public Thresholds BookThresholds { get; set; } = new Thresholds(greenThreshold: 0.745, yellowThreshold: 0.62);
31+
32+
/// <summary>
33+
/// The threshold values used to calculate the usability label for every chapter.
34+
/// </summary>
35+
public Thresholds ChapterThresholds { get; set; } =
36+
new Thresholds(greenThreshold: 0.745, yellowThreshold: 0.62);
37+
38+
/// <summary>
39+
/// The threshold values used to calculate the usability label for every segment.
40+
/// </summary>
41+
public Thresholds SegmentThresholds { get; set; } =
42+
new Thresholds(greenThreshold: 0.745, yellowThreshold: 0.62);
43+
44+
/// <summary>
45+
/// The parameters of the usable class (mean, variance, and count), used to calculate the usable probability.
46+
/// </summary>
47+
public UsabilityParameters Usable { get; set; } = UsabilityParameters.Usable;
48+
49+
/// <summary>
50+
/// The parameters of the unusable class (mean, variance, and count), used to calculate the usable probability.
51+
/// </summary>
52+
public UsabilityParameters Unusable { get; set; } = UsabilityParameters.Unusable;
53+
54+
/// <summary>
55+
/// Estimate the quality of the pre-translations from text files.
56+
/// </summary>
57+
/// <param name="confidences">The confidence values.</param>
58+
/// <returns>The usability scores for every segment in the texts, and for the texts.</returns>
59+
public (List<TextSegmentUsability> usabilitySegments, List<TextUsability> usabilityTexts) EstimateQuality(
    IEnumerable<(MultiKeyRef key, double confidence)> confidences
)
{
    // Fail fast with a descriptive exception instead of a NullReferenceException
    // raised later, when the confidences are enumerated.
    if (confidences is null)
        throw new ArgumentNullException(nameof(confidences));

    // Project every confidence to a chrF3 score, then derive the usabilities from those projections
    (List<TextSegmentScore> segmentScores, TextScores textScores) = ProjectChrF3(confidences);
    return ComputeSegmentUsability(segmentScores, textScores);
}
66+
67+
/// <summary>
68+
/// Estimate the quality of the pre-translations from USFM files.
69+
/// </summary>
70+
/// <param name="confidences">The confidence values.</param>
71+
/// <returns>The usability scores for every verse segment, chapter, and book.</returns>
72+
public (
    List<ScriptureSegmentUsability> usabilitySegments,
    List<ScriptureChapterUsability> usabilityChapters,
    List<ScriptureBookUsability> usabilityBooks
) EstimateQuality(IEnumerable<(ScriptureRef key, double confidence)> confidences)
{
    // Fail fast with a descriptive exception instead of a NullReferenceException
    // raised later, when the confidences are enumerated.
    if (confidences is null)
        throw new ArgumentNullException(nameof(confidences));

    // Project every confidence to a chrF3 score, then derive the usabilities from those projections
    (
        List<ScriptureSegmentScore> segmentScores,
        ScriptureChapterScores chapterScores,
        ScriptureBookScores bookScores
    ) = ProjectChrF3(confidences);
    return ComputeSegmentUsability(segmentScores, chapterScores, bookScores);
}
85+
86+
/// <summary>
87+
/// Calculates the geometric mean for a collection of values.
88+
/// </summary>
89+
/// <param name="values">The values to compute the geometric mean of.</param>
90+
/// <returns>The geometric mean.</returns>
91+
private static double GeometricMean(IList<double> values)
{
    // A missing or empty collection yields 0
    if (values is null || values.Count == 0)
        return 0;

    double logSum = 0;
    foreach (double value in values)
    {
        // The geometric mean requires positive values; any other value yields 0
        if (value <= 0)
            return 0;

        logSum += Math.Log(value);
    }

    // The exponential of the mean of the natural logarithms is the geometric mean
    return Math.Exp(logSum / values.Count);
}
101+
102+
/// <summary>
/// Calculates the probability that a segment with the specified projected chrF3 score is usable.
/// </summary>
/// <param name="chrF3">The projected chrF3 score.</param>
/// <returns>The usable weight divided by the sum of the usable and unusable weights.</returns>
private double CalculateUsableProbability(double chrF3)
{
    // Each weight is exp(-(chrF3 - mean)^2 / (2 * variance)) scaled by the count of its class
    double usableExponent = -Math.Pow(chrF3 - Usable.Mean, 2) / (2 * Usable.Variance);
    double unusableExponent = -Math.Pow(chrF3 - Unusable.Mean, 2) / (2 * Unusable.Variance);
    double usableWeight = Math.Exp(usableExponent) * Usable.Count;
    double unusableWeight = Math.Exp(unusableExponent) * Unusable.Count;

    double totalWeight = usableWeight + unusableWeight;
    if (totalWeight != 0)
        return usableWeight / totalWeight;

    // Both weights are zero, typically because both exponentials underflowed for a score
    // far from either mean. Dividing would give 0 / 0 = NaN, so evaluate the same ratio
    // in log space: usable / (usable + unusable) = 1 / (1 + exp(ln(unusable) - ln(usable))).
    // If the counts themselves are zero the ratio is genuinely undefined and this is still NaN.
    double logRatio =
        unusableExponent + Math.Log(Unusable.Count) - (usableExponent + Math.Log(Usable.Count));
    return 1 / (1 + Math.Exp(logRatio));
}
109+
110+
/// <summary>
/// Computes the usability of every book that has a score.
/// </summary>
/// <param name="bookScores">The book scores, including the usabilities recorded for the segments of each book.</param>
/// <returns>The usability of each scored book.</returns>
private List<ScriptureBookUsability> ComputeBookUsability(ScriptureBookScores bookScores)
{
    var results = new List<ScriptureBookUsability>();
    foreach (string book in bookScores.Scores.Keys)
    {
        Score bookScore = bookScores.GetScore(book);
        if (bookScore is null)
            continue;

        // The usability of a book is the mean of the usabilities of its segments
        double meanUsability = bookScores.GetSegmentUsabilities(book).Average();
        var bookUsability = new ScriptureBookUsability(
            book,
            label: BookThresholds.ReturnLabel(meanUsability),
            usability: meanUsability,
            projectedChrF3: bookScore.ProjectedChrF3
        );
        results.Add(bookUsability);
    }

    return results;
}
133+
134+
/// <summary>
/// Computes the usability of every chapter that has a score.
/// </summary>
/// <param name="chapterScores">
/// The chapter scores, including the usabilities recorded for the segments of each chapter.
/// </param>
/// <returns>The usability of each scored chapter.</returns>
private List<ScriptureChapterUsability> ComputeChapterUsability(ScriptureChapterScores chapterScores)
{
    var results = new List<ScriptureChapterUsability>();
    foreach (KeyValuePair<string, Dictionary<int, Score>> bookEntry in chapterScores.Scores)
    {
        foreach (int chapter in bookEntry.Value.Keys)
        {
            Score chapterScore = chapterScores.GetScore(bookEntry.Key, chapter);
            if (chapterScore is null)
                continue;

            // The usability of a chapter is the mean of the usabilities of its segments
            double meanUsability = chapterScores.GetSegmentUsabilities(bookEntry.Key, chapter).Average();
            var chapterUsability = new ScriptureChapterUsability(
                bookEntry.Key,
                chapter,
                label: ChapterThresholds.ReturnLabel(meanUsability),
                usability: meanUsability,
                projectedChrF3: chapterScore.ProjectedChrF3
            );
            results.Add(chapterUsability);
        }
    }

    return results;
}
162+
163+
/// <summary>
/// Computes the usability of every segment, and from those the usability of every chapter and book.
/// </summary>
/// <param name="segmentScores">The scores of the segments.</param>
/// <param name="chapterScores">The chapter scores; the usability of each segment is recorded here.</param>
/// <param name="bookScores">The book scores; the usability of each segment is recorded here.</param>
/// <returns>The usabilities of the segments, the chapters, and the books.</returns>
private (
    List<ScriptureSegmentUsability>,
    List<ScriptureChapterUsability>,
    List<ScriptureBookUsability>
) ComputeSegmentUsability(
    List<ScriptureSegmentScore> segmentScores,
    ScriptureChapterScores chapterScores,
    ScriptureBookScores bookScores
)
{
    var results = new List<ScriptureSegmentUsability>();
    foreach (ScriptureSegmentScore segmentScore in segmentScores)
    {
        var scriptureRef = segmentScore.ScriptureRef;
        double usability = CalculateUsableProbability(segmentScore.ProjectedChrF3);

        // Record the segment usability so that the chapter and book averages can be computed below
        chapterScores.AppendSegmentUsability(scriptureRef.Book, scriptureRef.ChapterNum, usability);
        bookScores.AppendSegmentUsability(scriptureRef.Book, usability);

        var segmentUsability = new ScriptureSegmentUsability(
            scriptureRef: scriptureRef,
            label: SegmentThresholds.ReturnLabel(usability),
            usability: usability,
            projectedChrF3: segmentScore.ProjectedChrF3
        );
        results.Add(segmentUsability);
    }

    // The chapter and book usabilities depend on the segment usabilities recorded above
    List<ScriptureChapterUsability> usabilityChapters = ComputeChapterUsability(chapterScores);
    List<ScriptureBookUsability> usabilityBooks = ComputeBookUsability(bookScores);
    return (results, usabilityChapters, usabilityBooks);
}
195+
196+
/// <summary>
/// Computes the usability of every segment, and from those the usability of every text.
/// </summary>
/// <param name="segmentScores">The scores of the segments.</param>
/// <param name="textScores">The text scores; the usability of each segment is recorded here.</param>
/// <returns>The usabilities of the segments and of the texts.</returns>
private (List<TextSegmentUsability>, List<TextUsability>) ComputeSegmentUsability(
    List<TextSegmentScore> segmentScores,
    TextScores textScores
)
{
    var results = new List<TextSegmentUsability>();
    foreach (TextSegmentScore segmentScore in segmentScores)
    {
        double usability = CalculateUsableProbability(segmentScore.ProjectedChrF3);

        // Record the segment usability so that the text averages can be computed below
        textScores.AppendSegmentUsability(segmentScore.TextId, usability);

        var segmentUsability = new TextSegmentUsability(
            segmentRef: segmentScore.SegmentRef,
            label: SegmentThresholds.ReturnLabel(usability),
            usability: usability,
            projectedChrF3: segmentScore.ProjectedChrF3
        );
        results.Add(segmentUsability);
    }

    // The text usabilities depend on the segment usabilities recorded above
    List<TextUsability> usabilityTexts = ComputeTextUsability(textScores);
    return (results, usabilityTexts);
}
218+
219+
/// <summary>
/// Computes the usability of every text that has a score.
/// </summary>
/// <param name="textScores">The text scores, including the usabilities recorded for the segments of each text.</param>
/// <returns>The usability of each scored text.</returns>
private List<TextUsability> ComputeTextUsability(TextScores textScores)
{
    var results = new List<TextUsability>();
    foreach (string textId in textScores.Scores.Keys)
    {
        Score textScore = textScores.GetScore(textId);
        if (textScore is null)
            continue;

        // The usability of a text is the mean of the usabilities of its segments.
        // Texts are labelled with the book thresholds.
        double meanUsability = textScores.GetSegmentUsabilities(textId).Average();
        var textUsability = new TextUsability(
            textId,
            label: BookThresholds.ReturnLabel(meanUsability),
            usability: meanUsability,
            projectedChrF3: textScore.ProjectedChrF3
        );
        results.Add(textUsability);
    }

    return results;
}
242+
243+
/// <summary>
/// Creates a score for every segment, and a score for every text from the confidences of its segments.
/// </summary>
/// <param name="confidences">The confidence of each segment, paired with its segment reference.</param>
/// <returns>The scores of the segments and of the texts.</returns>
private (List<TextSegmentScore> segmentScores, TextScores textScores) ProjectChrF3(
    IEnumerable<(MultiKeyRef, double)> confidences
)
{
    var segmentScores = new List<TextSegmentScore>();
    var confidencesByTextId = new Dictionary<string, List<double>>();
    foreach ((MultiKeyRef segmentRef, double confidence) in confidences)
    {
        segmentScores.Add(new TextSegmentScore(_slope, confidence, _intercept, segmentRef));

        // Group the confidence under the id of the text that the segment belongs to
        string textId = segmentRef.TextId;
        if (confidencesByTextId.TryGetValue(textId, out List<double> existing))
            existing.Add(confidence);
        else
            confidencesByTextId.Add(textId, new List<double> { confidence });
    }

    // The confidence of a text is the geometric mean of the confidences of its segments
    var textScores = new TextScores();
    foreach (KeyValuePair<string, List<double>> entry in confidencesByTextId)
    {
        var textScore = new Score(_slope, confidence: GeometricMean(entry.Value), _intercept);
        textScores.AddScore(entry.Key, textScore);
    }

    return (segmentScores, textScores);
}
276+
277+
/// <summary>
/// Creates a score for every segment, and a score for every chapter and book from the confidences of
/// their segments.
/// </summary>
/// <param name="confidences">The confidence of each segment, paired with its scripture reference.</param>
/// <returns>The scores of the segments, the chapters, and the books.</returns>
private (
    List<ScriptureSegmentScore> segmentScores,
    ScriptureChapterScores chapterScores,
    ScriptureBookScores bookScores
) ProjectChrF3(IEnumerable<(ScriptureRef, double)> confidences)
{
    var segmentScores = new List<ScriptureSegmentScore>();
    var confidencesByBookAndChapter = new Dictionary<(string Book, int Chapter), List<double>>();
    var confidencesByBook = new Dictionary<string, List<double>>();
    foreach ((ScriptureRef scriptureRef, double confidence) in confidences)
    {
        segmentScores.Add(new ScriptureSegmentScore(_slope, confidence, _intercept, scriptureRef));

        string book = scriptureRef.Book;
        (string Book, int Chapter) chapterKey = (book, scriptureRef.ChapterNum);

        // Record the confidence by book and chapter
        if (confidencesByBookAndChapter.TryGetValue(chapterKey, out List<double> chapterConfidences))
            chapterConfidences.Add(confidence);
        else
            confidencesByBookAndChapter.Add(chapterKey, new List<double> { confidence });

        // Record the confidence by book
        if (confidencesByBook.TryGetValue(book, out List<double> bookConfidences))
            bookConfidences.Add(confidence);
        else
            confidencesByBook.Add(book, new List<double> { confidence });
    }

    // The confidence of a chapter is the geometric mean of the confidences of its segments
    var chapterScores = new ScriptureChapterScores();
    foreach (KeyValuePair<(string Book, int Chapter), List<double>> entry in confidencesByBookAndChapter)
    {
        var chapterScore = new Score(_slope, confidence: GeometricMean(entry.Value), _intercept);
        chapterScores.AddScore(entry.Key.Book, entry.Key.Chapter, chapterScore);
    }

    // The confidence of a book is the geometric mean of the confidences of its segments
    var bookScores = new ScriptureBookScores();
    foreach (KeyValuePair<string, List<double>> entry in confidencesByBook)
    {
        var bookScore = new Score(_slope, confidence: GeometricMean(entry.Value), _intercept);
        bookScores.AddScore(entry.Key, bookScore);
    }

    return (segmentScores, chapterScores, bookScores);
}
343+
}
344+
}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
namespace SIL.Machine.QualityEstimation
2+
{
3+
/// <summary>
/// A confidence value together with the chrF3 score projected from it.
/// </summary>
internal class Score
{
    /// <summary>
    /// Initializes a new instance of the Score class, projecting the chrF3 score as
    /// <c>slope * confidence + intercept</c>.
    /// </summary>
    /// <param name="slope">The slope of the linear projection.</param>
    /// <param name="confidence">The confidence value.</param>
    /// <param name="intercept">The intercept of the linear projection.</param>
    public Score(double slope, double confidence, double intercept)
    {
        double projected = slope * confidence + intercept;
        ProjectedChrF3 = projected;
        Confidence = confidence;
    }

    /// <summary>
    /// The chrF3 score projected from the confidence.
    /// </summary>
    public double ProjectedChrF3 { get; }

    /// <summary>
    /// The confidence that the score was created with.
    /// </summary>
    public double Confidence { get; }
}
15+
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
using System.Collections.Generic;
2+
3+
namespace SIL.Machine.QualityEstimation
4+
{
5+
/// <summary>
/// Holds the score of each book, together with the usabilities recorded for the segments of each book.
/// </summary>
internal class ScriptureBookScores
{
    /// <summary>
    /// The scores, keyed by book.
    /// </summary>
    public readonly Dictionary<string, Score> Scores = new Dictionary<string, Score>();

    // The segment usabilities, keyed by book
    private readonly Dictionary<string, List<double>> _segmentUsabilities = new Dictionary<string, List<double>>();

    /// <summary>
    /// Sets the score of a book, replacing any score already stored for it.
    /// </summary>
    public void AddScore(string book, Score score)
    {
        Scores[book] = score;
    }

    /// <summary>
    /// Gets the score of a book, or null if the book has no score.
    /// </summary>
    public Score GetScore(string book)
    {
        if (Scores.TryGetValue(book, out Score score))
            return score;

        return null;
    }

    /// <summary>
    /// Records the usability of one segment of a book.
    /// </summary>
    public void AppendSegmentUsability(string book, double usability)
    {
        if (_segmentUsabilities.TryGetValue(book, out List<double> usabilities))
            usabilities.Add(usability);
        else
            _segmentUsabilities.Add(book, new List<double> { usability });
    }

    /// <summary>
    /// Gets a copy of the segment usabilities recorded for a book; the copy is empty if none were recorded.
    /// </summary>
    public List<double> GetSegmentUsabilities(string book)
    {
        if (!_segmentUsabilities.TryGetValue(book, out List<double> usabilities))
            return new List<double>();

        return new List<double>(usabilities);
    }
}
29+
}

0 commit comments

Comments
 (0)