Skip to content

Commit

Permalink
Improve sentence boundary detection
Browse files Browse the repository at this point in the history
  • Loading branch information
jnordberg committed May 27, 2022
1 parent 3f7386d commit b294f02
Showing 1 changed file with 17 additions and 2 deletions.
19 changes: 17 additions & 2 deletions tortoise/utils/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,18 @@ def commit():
commit()
# check for sentence boundaries
elif not in_quote and (c in '!?\n' or (c == '.' and next_c in '\n ')):
# seek forward if we have consecutive boundary markers but still within the max length
while len(current) < max_length and next_c in '!?.':
c, next_c = seek(1)
current += c
split_pos.append(pos)
if len(current) >= desired_length:
commit()
rv.append(current)

# clean up
# clean up, remove lines with only whitespace or punctuation
rv = [s.strip() for s in rv]
rv = [s for s in rv if len(s) > 0]
rv = [s for s in rv if len(s) > 0 and not re.match(r'^[\s\.,;:!?]*$', s)]

return rv

Expand All @@ -81,4 +85,15 @@ def test_split_and_recombine_text(self):
'inthemiddlebutinotinthislongword.',
'"Don\'t split my quote... please"'])

# Test for input containing runs of consecutive sentence-boundary punctuation
# ("!!!!!!", "?!?!?!" and a long run of "."), split with desired_length=30 and
# max_length=50.
def test_split_and_recombine_text_2(self):
# Each sentence in the input ends in repeated terminators, not a single mark.
text = """
When you are really angry sometimes you use consecutive exclamation marks!!!!!! Is this a good thing to do?!?!?!
I don't know but we should handle this situation..........................
"""
# Expected: every run of "!" / "?!" stays attached to the sentence it ends, the
# trailing run of periods leaves only a single "." on the last chunk (no
# punctuation-only chunk is returned), and the first sentence comes back as
# two chunks.
self.assertEqual(split_and_recombine_text(text, desired_length=30, max_length=50),
['When you are really angry sometimes you use',
'consecutive exclamation marks!!!!!!',
'Is this a good thing to do?!?!?!',
'I don\'t know but we should handle this situation.'])

unittest.main()

0 comments on commit b294f02

Please sign in to comment.