Skip to content

Commit d93dacd

Browse files
benjaminking and Ben King authored
Change quotation denormalizer to only use target corpus quote convention (#234)
* Change quotation denormalizer to only use target corpus quote convention
* Fix module imports in tests
---------
Co-authored-by: Ben King <benjaminking@sil.org>
1 parent 852ea41 commit d93dacd

24 files changed

Lines changed: 173 additions & 162 deletions

machine/corpora/__init__.py

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
from .dbl_bundle_text_corpus import DblBundleTextCorpus
88
from .dictionary_alignment_corpus import DictionaryAlignmentCorpus
99
from .dictionary_text_corpus import DictionaryTextCorpus
10-
from .fallback_quotation_mark_resolver import FallbackQuotationMarkResolver
11-
from .file_paratext_project_quote_convention_detector import FileParatextProjectQuoteConventionDetector
1210
from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
1311
from .file_paratext_project_text_updater import FileParatextProjectTextUpdater
1412
from .flatten import flatten
@@ -26,13 +24,6 @@
2624
from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
2725
from .paratext_text_corpus import ParatextTextCorpus
2826
from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler
29-
from .quotation_mark_denormalization_first_pass import QuotationMarkDenormalizationFirstPass
30-
from .quotation_mark_denormalization_usfm_update_block_handler import QuotationMarkDenormalizationUsfmUpdateBlockHandler
31-
from .quotation_mark_update_first_pass import QuotationMarkUpdateFirstPass
32-
from .quotation_mark_update_resolution_settings import QuotationMarkUpdateResolutionSettings
33-
from .quotation_mark_update_settings import QuotationMarkUpdateSettings
34-
from .quotation_mark_update_strategy import QuotationMarkUpdateStrategy
35-
from .quote_convention_changing_usfm_update_block_handler import QuoteConventionChangingUsfmUpdateBlockHandler
3627
from .scripture_element import ScriptureElement
3728
from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef
3829
from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType
@@ -85,7 +76,6 @@
8576
from .usx_file_text_corpus import UsxFileTextCorpus
8677
from .usx_memory_text import UsxMemoryText
8778
from .usx_zip_text import UsxZipText
88-
from .zip_paratext_project_quote_convention_detector import ZipParatextProjectQuoteConventionDetector
8979
from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
9080
from .zip_paratext_project_settings_parser_base import ZipParatextProjectSettingsParserBase
9181
from .zip_paratext_project_terms_parser import ZipParatextProjectTermsParser
@@ -96,7 +86,6 @@
9686
"AlignmentCollection",
9787
"AlignmentCorpus",
9888
"AlignmentRow",
99-
"FallbackQuotationMarkResolver",
10089
"batch",
10190
"Corpus",
10291
"create_versification_ref_corpus",
@@ -106,7 +95,6 @@
10695
"EMPTY_SCRIPTURE_REF",
10796
"escape_spaces",
10897
"extract_scripture_corpus",
109-
"FileParatextProjectQuoteConventionDetector",
11098
"FileParatextProjectSettingsParser",
11199
"FileParatextProjectTextUpdater",
112100
"flatten",
@@ -133,13 +121,6 @@
133121
"PlaceMarkersAlignmentInfo",
134122
"PlaceMarkersUsfmUpdateBlockHandler",
135123
"parse_usfm",
136-
"QuoteConventionChangingUsfmUpdateBlockHandler",
137-
"QuotationMarkUpdateResolutionSettings",
138-
"QuotationMarkUpdateStrategy",
139-
"QuotationMarkUpdateFirstPass",
140-
"QuotationMarkDenormalizationFirstPass",
141-
"QuotationMarkDenormalizationUsfmUpdateBlockHandler",
142-
"QuotationMarkUpdateSettings",
143124
"RtlReferenceOrder",
144125
"ScriptureElement",
145126
"ScriptureRef",
@@ -189,7 +170,6 @@
189170
"UsxFileTextCorpus",
190171
"UsxMemoryText",
191172
"UsxZipText",
192-
"ZipParatextProjectQuoteConventionDetector",
193173
"ZipParatextProjectSettingsParser",
194174
"ZipParatextProjectSettingsParserBase",
195175
"ZipParatextProjectTermsParser",

machine/corpora/quotation_mark_denormalization_first_pass.py

Lines changed: 0 additions & 9 deletions
This file was deleted.

machine/punctuation_analysis/__init__.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
QuoteContinuerState,
77
QuoteContinuerStyle,
88
)
9+
from .fallback_quotation_mark_resolver import FallbackQuotationMarkResolver
10+
from .file_paratext_project_quote_convention_detector import FileParatextProjectQuoteConventionDetector
11+
from .paratext_project_quote_convention_detector import ParatextProjectQuoteConventionDetector
912
from .preliminary_quotation_mark_analyzer import (
1013
ApostropheProportionStatistics,
1114
PreliminaryApostropheAnalyzer,
@@ -14,6 +17,8 @@
1417
QuotationMarkSequences,
1518
QuotationMarkWordPositions,
1619
)
20+
from .quotation_mark_denormalization_first_pass import QuotationMarkDenormalizationFirstPass
21+
from .quotation_mark_denormalization_usfm_update_block_handler import QuotationMarkDenormalizationUsfmUpdateBlockHandler
1722
from .quotation_mark_direction import QuotationMarkDirection
1823
from .quotation_mark_finder import QuotationMarkFinder
1924
from .quotation_mark_metadata import QuotationMarkMetadata
@@ -22,7 +27,12 @@
2227
from .quotation_mark_resolver import QuotationMarkResolver
2328
from .quotation_mark_string_match import QuotationMarkStringMatch
2429
from .quotation_mark_tabulator import QuotationMarkCounts, QuotationMarkTabulator
30+
from .quotation_mark_update_first_pass import QuotationMarkUpdateFirstPass
31+
from .quotation_mark_update_resolution_settings import QuotationMarkUpdateResolutionSettings
32+
from .quotation_mark_update_settings import QuotationMarkUpdateSettings
33+
from .quotation_mark_update_strategy import QuotationMarkUpdateStrategy
2534
from .quote_convention import QuoteConvention, SingleLevelQuoteConvention
35+
from .quote_convention_changing_usfm_update_block_handler import QuoteConventionChangingUsfmUpdateBlockHandler
2636
from .quote_convention_detection_resolution_settings import QuoteConventionDetectionResolutionSettings
2737
from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector
2838
from .quote_convention_set import QuoteConventionSet
@@ -31,27 +41,38 @@
3141
from .usfm_marker_type import UsfmMarkerType
3242
from .usfm_structure_extractor import UsfmStructureExtractor
3343
from .verse import Verse
44+
from .zip_paratext_project_quote_convention_detector import ZipParatextProjectQuoteConventionDetector
3445

3546
__all__ = [
3647
"ApostropheProportionStatistics",
3748
"Chapter",
3849
"DepthBasedQuotationMarkResolver",
50+
"FallbackQuotationMarkResolver",
51+
"FileParatextProjectQuoteConventionDetector",
52+
"ParatextProjectQuoteConventionDetector",
3953
"PreliminaryApostropheAnalyzer",
4054
"PreliminaryQuotationMarkAnalyzer",
4155
"SingleLevelQuoteConvention",
4256
"QuoteContinuerState",
4357
"QuoteContinuerStyle",
4458
"QuotationMarkCategorizer",
4559
"QuotationMarkCounts",
60+
"QuotationMarkDenormalizationFirstPass",
61+
"QuotationMarkDenormalizationUsfmUpdateBlockHandler",
4662
"QuotationMarkDirection",
4763
"QuotationMarkGrouper",
4864
"QuotationMarkMetadata",
4965
"QuotationMarkResolverState",
5066
"QuotationMarkSequences",
5167
"QuotationMarkStringMatch",
68+
"QuotationMarkUpdateFirstPass",
69+
"QuotationMarkUpdateResolutionSettings",
70+
"QuotationMarkUpdateSettings",
71+
"QuotationMarkUpdateStrategy",
5272
"QuotationMarkWordPositions",
5373
"QuoteConvention",
5474
"QuoteConventionAnalysis",
75+
"QuoteConventionChangingUsfmUpdateBlockHandler",
5576
"QuoteConventionDetectionResolutionSettings",
5677
"QuotationMarkFinder",
5778
"QuotationMarkResolutionIssue",
@@ -65,4 +86,5 @@
6586
"UsfmMarkerType",
6687
"UsfmStructureExtractor",
6788
"Verse",
89+
"ZipParatextProjectQuoteConventionDetector",
6890
]

machine/corpora/fallback_quotation_mark_resolver.py renamed to machine/punctuation_analysis/fallback_quotation_mark_resolver.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
from typing import Generator, Optional, Set
22

3-
from ..punctuation_analysis.quotation_mark_direction import QuotationMarkDirection
4-
from ..punctuation_analysis.quotation_mark_metadata import QuotationMarkMetadata
5-
from ..punctuation_analysis.quotation_mark_resolution_issue import QuotationMarkResolutionIssue
6-
from ..punctuation_analysis.quotation_mark_resolution_settings import QuotationMarkResolutionSettings
7-
from ..punctuation_analysis.quotation_mark_resolver import QuotationMarkResolver
8-
from ..punctuation_analysis.quotation_mark_string_match import QuotationMarkStringMatch
3+
from .quotation_mark_direction import QuotationMarkDirection
4+
from .quotation_mark_metadata import QuotationMarkMetadata
5+
from .quotation_mark_resolution_issue import QuotationMarkResolutionIssue
6+
from .quotation_mark_resolution_settings import QuotationMarkResolutionSettings
7+
from .quotation_mark_resolver import QuotationMarkResolver
8+
from .quotation_mark_string_match import QuotationMarkStringMatch
99

1010

1111
class FallbackQuotationMarkResolver(QuotationMarkResolver):

machine/corpora/file_paratext_project_quote_convention_detector.py renamed to machine/punctuation_analysis/file_paratext_project_quote_convention_detector.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
from pathlib import Path
22
from typing import BinaryIO
33

4+
from ..corpora.file_paratext_project_settings_parser import FileParatextProjectSettingsParser
45
from ..utils.typeshed import StrPath
5-
from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
66
from .paratext_project_quote_convention_detector import ParatextProjectQuoteConventionDetector
77

88

machine/corpora/paratext_project_quote_convention_detector.py renamed to machine/punctuation_analysis/paratext_project_quote_convention_detector.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
from abc import ABC, abstractmethod
22
from typing import BinaryIO, Optional, Union
33

4-
from ..punctuation_analysis.quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector
4+
from ..corpora.paratext_project_settings import ParatextProjectSettings
5+
from ..corpora.paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
6+
from ..corpora.usfm_parser import parse_usfm
57
from ..utils.typeshed import StrPath
6-
from .paratext_project_settings import ParatextProjectSettings
7-
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
8-
from .usfm_parser import parse_usfm
8+
from .quote_convention_detector import QuoteConventionAnalysis, QuoteConventionDetector
99

1010

1111
class ParatextProjectQuoteConventionDetector(ABC):

machine/punctuation_analysis/quotation_mark_denormalization_first_pass.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from .quotation_mark_update_first_pass import QuotationMarkUpdateFirstPass
2+
from .quote_convention import QuoteConvention
3+
4+
5+
# This is a convenience class so that users don't have to know to pass in two quote conventions,
6+
# with the first being the normalized version of the second.
7+
class QuotationMarkDenormalizationFirstPass(QuotationMarkUpdateFirstPass):
8+
9+
def __init__(self, target_quote_convention: QuoteConvention):
10+
super().__init__(target_quote_convention.normalize(), target_quote_convention)

machine/corpora/quotation_mark_denormalization_usfm_update_block_handler.py renamed to machine/punctuation_analysis/quotation_mark_denormalization_usfm_update_block_handler.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
1-
from ..punctuation_analysis.quote_convention import QuoteConvention
21
from .quotation_mark_update_settings import QuotationMarkUpdateSettings
2+
from .quote_convention import QuoteConvention
33
from .quote_convention_changing_usfm_update_block_handler import QuoteConventionChangingUsfmUpdateBlockHandler
44

55

6-
# This is a convenience class so that users don't have to know to normalize the source quote convention
6+
# This is a convenience class so that users don't have to know to pass in two quote conventions,
7+
# with the first being the normalized version of the second.
78
class QuotationMarkDenormalizationUsfmUpdateBlockHandler(QuoteConventionChangingUsfmUpdateBlockHandler):
89

910
def __init__(
1011
self,
11-
source_quote_convention: QuoteConvention,
1212
target_quote_convention: QuoteConvention,
1313
settings: QuotationMarkUpdateSettings = QuotationMarkUpdateSettings(),
1414
):
15-
super().__init__(source_quote_convention.normalize(), target_quote_convention, settings)
15+
super().__init__(target_quote_convention.normalize(), target_quote_convention, settings)

machine/corpora/quotation_mark_update_first_pass.py renamed to machine/punctuation_analysis/quotation_mark_update_first_pass.py

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,56 +1,56 @@
11
from typing import Dict, List, Set
22

3-
from ..punctuation_analysis.chapter import Chapter
4-
from ..punctuation_analysis.depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver
5-
from ..punctuation_analysis.quotation_mark_finder import QuotationMarkFinder
6-
from ..punctuation_analysis.quotation_mark_resolution_issue import QuotationMarkResolutionIssue
7-
from ..punctuation_analysis.quotation_mark_resolver import QuotationMarkResolver
8-
from ..punctuation_analysis.quotation_mark_string_match import QuotationMarkStringMatch
9-
from ..punctuation_analysis.quote_convention import QuoteConvention
10-
from ..punctuation_analysis.quote_convention_set import QuoteConventionSet
11-
from ..punctuation_analysis.usfm_structure_extractor import UsfmStructureExtractor
3+
from .chapter import Chapter
4+
from .depth_based_quotation_mark_resolver import DepthBasedQuotationMarkResolver
5+
from .quotation_mark_finder import QuotationMarkFinder
6+
from .quotation_mark_resolution_issue import QuotationMarkResolutionIssue
7+
from .quotation_mark_resolver import QuotationMarkResolver
8+
from .quotation_mark_string_match import QuotationMarkStringMatch
129
from .quotation_mark_update_resolution_settings import QuotationMarkUpdateResolutionSettings
1310
from .quotation_mark_update_strategy import QuotationMarkUpdateStrategy
11+
from .quote_convention import QuoteConvention
12+
from .quote_convention_set import QuoteConventionSet
13+
from .usfm_structure_extractor import UsfmStructureExtractor
1414

1515

1616
# Determines the best strategy to take for each chapter
1717
class QuotationMarkUpdateFirstPass(UsfmStructureExtractor):
1818

19-
def __init__(self, source_quote_convention: QuoteConvention, target_quote_convention: QuoteConvention):
19+
def __init__(self, old_quote_convention: QuoteConvention, new_quote_convention: QuoteConvention):
2020
super().__init__()
2121
self._quotation_mark_finder: QuotationMarkFinder = QuotationMarkFinder(
22-
QuoteConventionSet([source_quote_convention])
22+
QuoteConventionSet([old_quote_convention])
2323
)
2424
self._quotation_mark_resolver: QuotationMarkResolver = DepthBasedQuotationMarkResolver(
25-
QuotationMarkUpdateResolutionSettings(source_quote_convention)
25+
QuotationMarkUpdateResolutionSettings(old_quote_convention)
2626
)
2727
self._will_fallback_mode_work: bool = self._check_whether_fallback_mode_will_work(
28-
source_quote_convention, target_quote_convention
28+
old_quote_convention, new_quote_convention
2929
)
3030

3131
def _check_whether_fallback_mode_will_work(
32-
self, source_quote_convention: QuoteConvention, target_quote_convention: QuoteConvention
32+
self, old_quote_convention: QuoteConvention, new_quote_convention: QuoteConvention
3333
) -> bool:
34-
opening_target_marks_by_source_marks: Dict[str, str] = {}
35-
closing_target_marks_by_source_marks: Dict[str, str] = {}
36-
for depth in range(1, min(source_quote_convention.num_levels, target_quote_convention.num_levels) + 1):
37-
source_opening_quotation_mark = source_quote_convention.get_opening_quotation_mark_at_depth(depth)
38-
target_opening_quotation_mark = target_quote_convention.get_opening_quotation_mark_at_depth(depth)
34+
new_opening_marks_by_old_marks: Dict[str, str] = {}
35+
new_closing_marks_by_old_marks: Dict[str, str] = {}
36+
for depth in range(1, min(old_quote_convention.num_levels, new_quote_convention.num_levels) + 1):
37+
old_opening_quotation_mark = old_quote_convention.get_opening_quotation_mark_at_depth(depth)
38+
new_opening_quotation_mark = new_quote_convention.get_opening_quotation_mark_at_depth(depth)
3939
if (
40-
source_opening_quotation_mark in opening_target_marks_by_source_marks
41-
and opening_target_marks_by_source_marks[source_opening_quotation_mark] != target_opening_quotation_mark
40+
old_opening_quotation_mark in new_opening_marks_by_old_marks
41+
and new_opening_marks_by_old_marks[old_opening_quotation_mark] != new_opening_quotation_mark
4242
):
4343
return False
44-
opening_target_marks_by_source_marks[source_opening_quotation_mark] = target_opening_quotation_mark
44+
new_opening_marks_by_old_marks[old_opening_quotation_mark] = new_opening_quotation_mark
4545

46-
source_closing_quotation_mark = source_quote_convention.get_closing_quotation_mark_at_depth(depth)
47-
target_closing_quotation_mark = target_quote_convention.get_closing_quotation_mark_at_depth(depth)
46+
old_closing_quotation_mark = old_quote_convention.get_closing_quotation_mark_at_depth(depth)
47+
new_closing_quotation_mark = new_quote_convention.get_closing_quotation_mark_at_depth(depth)
4848
if (
49-
source_closing_quotation_mark in closing_target_marks_by_source_marks
50-
and closing_target_marks_by_source_marks[source_closing_quotation_mark] != target_closing_quotation_mark
49+
old_closing_quotation_mark in new_closing_marks_by_old_marks
50+
and new_closing_marks_by_old_marks[old_closing_quotation_mark] != new_closing_quotation_mark
5151
):
5252
return False
53-
closing_target_marks_by_source_marks[source_closing_quotation_mark] = target_closing_quotation_mark
53+
new_closing_marks_by_old_marks[old_closing_quotation_mark] = new_closing_quotation_mark
5454

5555
return True
5656

machine/corpora/quotation_mark_update_resolution_settings.py renamed to machine/punctuation_analysis/quotation_mark_update_resolution_settings.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,17 @@
22

33
import regex
44

5-
from ..punctuation_analysis.quotation_mark_direction import QuotationMarkDirection
6-
from ..punctuation_analysis.quotation_mark_resolution_settings import QuotationMarkResolutionSettings
7-
from ..punctuation_analysis.quotation_mark_string_match import QuotationMarkStringMatch
8-
from ..punctuation_analysis.quote_convention import QuoteConvention
9-
from ..punctuation_analysis.quote_convention_set import QuoteConventionSet
5+
from .quotation_mark_direction import QuotationMarkDirection
6+
from .quotation_mark_resolution_settings import QuotationMarkResolutionSettings
7+
from .quotation_mark_string_match import QuotationMarkStringMatch
8+
from .quote_convention import QuoteConvention
9+
from .quote_convention_set import QuoteConventionSet
1010

1111

1212
class QuotationMarkUpdateResolutionSettings(QuotationMarkResolutionSettings):
13-
def __init__(self, source_quote_convention: QuoteConvention):
14-
self._source_quote_convention = source_quote_convention
15-
self._quote_convention_singleton_set = QuoteConventionSet([self._source_quote_convention])
13+
def __init__(self, old_quote_convention: QuoteConvention):
14+
self._old_quote_convention = old_quote_convention
15+
self._quote_convention_singleton_set = QuoteConventionSet([self._old_quote_convention])
1616

1717
def is_valid_opening_quotation_mark(self, quotation_mark_match: QuotationMarkStringMatch) -> bool:
1818
return quotation_mark_match.is_valid_opening_quotation_mark(self._quote_convention_singleton_set)
@@ -36,9 +36,9 @@ def should_rely_on_paragraph_markers(self):
3636
return False
3737

3838
def get_possible_depths(self, quotation_mark: str, direction: QuotationMarkDirection) -> Set[int]:
39-
return self._source_quote_convention.get_possible_depths(quotation_mark, direction)
39+
return self._old_quote_convention.get_possible_depths(quotation_mark, direction)
4040

4141
def metadata_matches_quotation_mark(
4242
self, quotation_mark: str, depth: int, direction: QuotationMarkDirection
4343
) -> bool:
44-
return self._source_quote_convention.get_expected_quotation_mark(depth, direction) == quotation_mark
44+
return self._old_quote_convention.get_expected_quotation_mark(depth, direction) == quotation_mark

0 commit comments

Comments
 (0)