-
-
Notifications
You must be signed in to change notification settings - Fork 17
Expand file tree
/
Copy pathParallelTextCorpus.cs
More file actions
79 lines (73 loc) · 3.45 KB
/
ParallelTextCorpus.cs
File metadata and controls
79 lines (73 loc) · 3.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
using System;
using System.Collections.Generic;
using System.Linq;
namespace SIL.Machine.Corpora
{
/// <summary>
/// A parallel text corpus made of a source text corpus, a target text corpus and an alignment corpus.
/// Rows are produced by an internal <see cref="NParallelTextCorpus"/> built over the source and target
/// corpora; each produced row is then paired with the alignment row that has the same reference, if any.
/// </summary>
public class ParallelTextCorpus : ParallelTextCorpusBase
{
    private readonly NParallelTextCorpus _nParallelTextCorpus;

    /// <summary>
    /// Initializes a new instance of the <see cref="ParallelTextCorpus"/> class.
    /// </summary>
    /// <param name="sourceCorpus">The source text corpus.</param>
    /// <param name="targetCorpus">The target text corpus.</param>
    /// <param name="alignmentCorpus">
    /// The alignment corpus. When <c>null</c>, a new <see cref="DictionaryAlignmentCorpus"/> is used.
    /// </param>
    /// <param name="rowRefComparer">
    /// The comparer used to match alignment rows to text rows by reference. When <c>null</c>, a new
    /// <see cref="NParallelTextCorpus.DefaultRowRefComparer"/> is used.
    /// </param>
    public ParallelTextCorpus(
        ITextCorpus sourceCorpus,
        ITextCorpus targetCorpus,
        IAlignmentCorpus alignmentCorpus = null,
        IComparer<object> rowRefComparer = null
    )
    {
        SourceCorpus = sourceCorpus;
        TargetCorpus = targetCorpus;
        AlignmentCorpus = alignmentCorpus ?? new DictionaryAlignmentCorpus();
        RowRefComparer = rowRefComparer ?? new NParallelTextCorpus.DefaultRowRefComparer();
        // NOTE(review): RowRefComparer is not handed to the inner corpus, so a custom comparer only
        // affects the alignment matching in GetRows - confirm this is intended.
        _nParallelTextCorpus = new NParallelTextCorpus(new List<ITextCorpus> { SourceCorpus, TargetCorpus });
    }

    /// <summary>Gets a value indicating whether the source corpus is tokenized.</summary>
    public override bool IsSourceTokenized => SourceCorpus.IsTokenized;

    /// <summary>Gets a value indicating whether the target corpus is tokenized.</summary>
    public override bool IsTargetTokenized => TargetCorpus.IsTokenized;

    /// <summary>
    /// Gets or sets the flag that is passed to the inner corpus as the first element of its
    /// <c>AllRows</c> array (the source corpus position) each time <see cref="GetRows"/> is enumerated.
    /// </summary>
    public bool AllSourceRows { get; set; }

    /// <summary>
    /// Gets or sets the flag that is passed to the inner corpus as the second element of its
    /// <c>AllRows</c> array (the target corpus position) each time <see cref="GetRows"/> is enumerated.
    /// </summary>
    public bool AllTargetRows { get; set; }

    /// <summary>Gets the source text corpus.</summary>
    public ITextCorpus SourceCorpus { get; }

    /// <summary>Gets the target text corpus.</summary>
    public ITextCorpus TargetCorpus { get; }

    /// <summary>Gets the alignment corpus. Never <c>null</c>.</summary>
    public IAlignmentCorpus AlignmentCorpus { get; }

    /// <summary>Gets the comparer used to match alignment rows to text rows. Never <c>null</c>.</summary>
    public IComparer<object> RowRefComparer { get; }

    /// <summary>
    /// Lazily enumerates the parallel rows for the specified texts.
    /// </summary>
    /// <param name="textIds">The text ids, passed through unchanged to the inner corpus.</param>
    /// <returns>
    /// One <see cref="ParallelTextRow"/> per row produced by the inner corpus. <c>AlignedWordPairs</c> is
    /// set only when every segment of the row is non-empty and an alignment row with an equal reference
    /// exists; otherwise it is <c>null</c>.
    /// </returns>
    /// <exception cref="CorpusAlignmentException">
    /// Thrown during enumeration when an <see cref="ArgumentException"/> is raised while advancing the
    /// alignment corpus or comparing references.
    /// </exception>
    public override IEnumerable<ParallelTextRow> GetRows(IEnumerable<string> textIds)
    {
        using (IEnumerator<AlignmentRow> alignmentEnumerator = AlignmentCorpus.GetEnumerator())
        {
            _nParallelTextCorpus.AllRows = new bool[] { AllSourceRows, AllTargetRows };
            bool isScripture = SourceCorpus.IsScripture() && TargetCorpus.IsScripture();

            // True while alignmentEnumerator.Current is a row that has been read but neither matched nor
            // discarded yet, i.e. it is ahead of the text rows seen so far and must be kept for later rows.
            bool hasPendingAlignment = false;
            bool alignmentExhausted = false;

            foreach (var nRow in _nParallelTextCorpus.GetRows(textIds))
            {
                // < 0: alignment row is behind the text row, > 0: ahead of it (or exhausted), 0: match.
                int compareAlignmentCorpus = -1;
                if (!alignmentExhausted && nRow.NSegments.All(s => s.Count > 0))
                {
                    do
                    {
                        try
                        {
                            if (!hasPendingAlignment)
                            {
                                hasPendingAlignment = alignmentEnumerator.MoveNext();
                                alignmentExhausted = !hasPendingAlignment;
                            }

                            // The alignment ref is the first argument so that a negative result means
                            // "alignment is behind this row", which is the case the loop advances on.
                            compareAlignmentCorpus = hasPendingAlignment
                                ? RowRefComparer.Compare(alignmentEnumerator.Current.Ref, nRow.Ref)
                                : 1;
                        }
                        catch (ArgumentException e)
                        {
                            throw new CorpusAlignmentException(nRow.NRefs.Select(r => r.ToString()).ToArray(), e);
                        }

                        if (compareAlignmentCorpus <= 0)
                        {
                            // Behind (discard) or equal (consumed by this row): read a fresh row next time.
                            // A row that is ahead stays pending so that it is not skipped.
                            hasPendingAlignment = false;
                        }
                    } while (compareAlignmentCorpus < 0);
                }

                // For Scripture corpora, a side without refs falls back to the row's own ref.
                yield return new ParallelTextRow(
                    nRow.TextId,
                    nRow.NRefs[0].Count > 0 || !isScripture ? nRow.NRefs[0] : new object[] { nRow.Ref },
                    nRow.NRefs[1].Count > 0 || !isScripture ? nRow.NRefs[1] : new object[] { nRow.Ref },
                    nRow.ContentType
                )
                {
                    SourceFlags = nRow.NFlags[0],
                    TargetFlags = nRow.NFlags[1],
                    SourceSegment = nRow.NSegments[0],
                    TargetSegment = nRow.NSegments[1],
                    // Current is still the matched row here: MoveNext has not been called since the match.
                    AlignedWordPairs =
                        compareAlignmentCorpus == 0 ? alignmentEnumerator.Current.AlignedWordPairs.ToArray() : null,
                };
            }
        }
    }
}
}