Skip to content

Commit decc088

Browse files
committed
reset parenthesis/quotation levels at the beginning of a line
1 parent 0c2c51a commit decc088

3 files changed

Lines changed: 33 additions & 9 deletions

File tree

tests/test_ssplit/016.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
{
2+
"text": "このライブラリは Python で書かれています(笑\n安心してください(笑",
3+
"sentences": [
4+
"このライブラリは Python で書かれています(笑",
5+
"安心してください(笑"
6+
]
7+
}

tests/test_ssplit/017.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
{
2+
"text": "なんと (((;゚Д゚)))))))\nびっくりしました.\n",
3+
"sentences": [
4+
"なんと (((;゚Д゚)))))))",
5+
"びっくりしました."
6+
]
7+
}

textformatting/ssplit.py

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -108,24 +108,34 @@ def _merge_parenthesis(sentence_candidates):
108108
quotation_level = 0
109109

110110
merged_sentences = []
111-
_sentence = ''
112-
for sentence_candidate in sentence_candidates:
111+
_sentence_candidate = ''
112+
while sentence_candidates:
113+
sentence_candidate = sentence_candidates.pop(0)
114+
113115
parenthesis_level += sentence_candidate.count('(') + sentence_candidate.count('(')
114116
parenthesis_level -= sentence_candidate.count(')') + sentence_candidate.count(')')
115117

116118
quotation_level += sentence_candidate.count('「') + sentence_candidate.count('“')
117119
quotation_level -= sentence_candidate.count('」') + sentence_candidate.count('”')
118120

119121
if parenthesis_level == 0 and quotation_level == 0:
120-
if _sentence:
121-
sentence_candidate = _sentence + sentence_candidate
122-
_sentence = ''
122+
sentence_candidate = _sentence_candidate + sentence_candidate
123123
merged_sentences.append(sentence_candidate)
124+
_sentence_candidate = ''
124125
else:
125-
_sentence += sentence_candidate
126-
127-
if _sentence:
128-
merged_sentences.append(_sentence)
126+
if '\n' in sentence_candidate:
127+
sentence_candidate, rest = sentence_candidate.split('\n', maxsplit=1)
128+
sentence_candidate = _sentence_candidate + sentence_candidate
129+
merged_sentences.append(sentence_candidate)
130+
_sentence_candidate = ''
131+
sentence_candidates.insert(0, rest)
132+
parenthesis_level = 0
133+
quotation_level = 0
134+
else:
135+
_sentence_candidate += sentence_candidate
136+
137+
if _sentence_candidate:
138+
merged_sentences.append(_sentence_candidate)
129139
return merged_sentences
130140

131141

0 commit comments

Comments
 (0)