utils.py
from nltk.tokenize import word_tokenize


def strip_punct(word):
    """Take a word; return it with leading and trailing punctuation removed."""
    for i in range(len(word)):
        if word[i].isalnum():
            break
    else:
        # no alphanumeric character at all (or empty string): nothing to keep
        return ''
    for j in range(len(word) - 1, -1, -1):
        if word[j].isalnum():
            break
    return word[i:j + 1]
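
# Quick sanity check (hypothetical inputs, not part of the original file):
# strip_punct('"word."') -> 'word'; strip_punct('...') -> ''.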


def copy_punct(word, distractor):
    """Copy the leading/trailing punctuation and the capitalization pattern
    of word onto distractor, and return the result."""
    for i in range(len(word)):
        if word[i].isalnum():
            break
    else:
        # word is pure punctuation (or empty): nothing sensible to copy
        return word
    prefix = word[:i]
    for j in range(len(word) - 1, -1, -1):
        if word[j].isalnum():
            break
    suffix = word[j + 1:]
    word = word[i:j + 1]
    if len(word) > 1 and word.isupper():
        distractor = distractor.upper()  # fully capitalized
    elif len(word) > 1 and word[0].isupper():
        distractor = distractor[:1].upper() + distractor[1:]  # first letter capitalized
    else:
        distractor = distractor.lower()  # all lowercase
    return prefix + distractor + suffix
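
# Quick sanity check (hypothetical inputs): copy_punct('"Hello,"', 'goodbye')
# copies the quotes, the comma, and the title case, giving '"Goodbye,"'.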


def tokenize(word):
    """Tokenize with nltk's word_tokenize, but peel off leading and trailing
    punctuation by hand first, because word_tokenize does not split off those
    initial single quotes."""
    tokens = []
    end_tokens = []
    for i in range(len(word)):
        if word[i].isalnum():
            break
        tokens.append(word[i])
    else:
        # word is pure punctuation (or empty): every character is its own token
        return tokens
    for j in range(len(word) - 1, -1, -1):
        if word[j].isalnum():
            break
        end_tokens.append(word[j])
    end_tokens.reverse()
    tokens.extend(word_tokenize(word[i:j + 1]))
    tokens.extend(end_tokens)
    return tokens
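
# Quick sanity check (hypothetical input; the inner split assumes NLTK's
# default TreebankWordTokenizer behavior): tokenize("'don't.'") peels off the
# outer quotes and the period, yielding ["'", 'do', "n't", '.', "'"].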