import re


class normalize(object):

    expansion_file_dir = ''  # path to a gazetteer file listing short forms together with their expansions
    short_form_dict = {}

    # Constructor
    def __init__(self):
        self.short_form_dict = self.get_short_forms()

    # read the gazetteer file: each line holds a short form and its expansion separated by '-'
    def get_short_forms(self):
        exp = {}
        with open(self.expansion_file_dir, encoding='utf8') as text:
            for line in text:
                line = line.strip()
                if not line:  # skip blank lines
                    continue
                expanded = line.split("-")
                exp[expanded[0].strip()] = expanded[1].strip().replace(" ", '_')
        return exp

    # method that expands a short form to its full form
    def expand_short_form(self, input_short_word):
        if input_short_word in self.short_form_dict:
            return self.short_form_dict[input_short_word]
        return input_short_word

    # method to normalize character-level mismatches such as ጸሀይ and ፀሐይ
    def normalize_char_level_missmatch(self, input_token):
        # ordered (pattern, replacement) pairs, applied sequentially so the
        # labialized rules operate on the output of the earlier character merges
        substitutions = [
            ('[ሃኅኃሐሓኻ]', 'ሀ'), ('[ሑኁዅ]', 'ሁ'), ('[ኂሒኺ]', 'ሂ'), ('[ኌሔዄ]', 'ሄ'),
            ('[ሕኅ]', 'ህ'), ('[ኆሖኾ]', 'ሆ'),
            ('[ሠ]', 'ሰ'), ('[ሡ]', 'ሱ'), ('[ሢ]', 'ሲ'), ('[ሣ]', 'ሳ'),
            ('[ሤ]', 'ሴ'), ('[ሥ]', 'ስ'), ('[ሦ]', 'ሶ'),
            ('[ዓኣዐ]', 'አ'), ('[ዑ]', 'ኡ'), ('[ዒ]', 'ኢ'), ('[ዔ]', 'ኤ'), ('[ዕ]', 'እ'), ('[ዖ]', 'ኦ'),
            ('[ጸ]', 'ፀ'), ('[ጹ]', 'ፁ'), ('[ጺ]', 'ፂ'), ('[ጻ]', 'ፃ'), ('[ጼ]', 'ፄ'), ('[ጽ]', 'ፅ'), ('[ጾ]', 'ፆ'),
            # normalizing words with labialized Amharic characters such as በልቱዋል or በልቱአል to በልቷል
            ('(ሉ[ዋአ])', 'ሏ'), ('(ሙ[ዋአ])', 'ሟ'), ('(ቱ[ዋአ])', 'ቷ'), ('(ሩ[ዋአ])', 'ሯ'),
            ('(ሱ[ዋአ])', 'ሷ'), ('(ሹ[ዋአ])', 'ሿ'), ('(ቁ[ዋአ])', 'ቋ'), ('(ቡ[ዋአ])', 'ቧ'),
            ('(ቹ[ዋአ])', 'ቿ'), ('(ሁ[ዋአ])', 'ኋ'), ('(ኑ[ዋአ])', 'ኗ'), ('(ኙ[ዋአ])', 'ኟ'),
            ('(ኩ[ዋአ])', 'ኳ'), ('(ዙ[ዋአ])', 'ዟ'), ('(ጉ[ዋአ])', 'ጓ'), ('(ደ[ዋአ])', 'ዷ'),
            ('(ጡ[ዋአ])', 'ጧ'), ('(ጩ[ዋአ])', 'ጯ'), ('(ጹ[ዋአ])', 'ጿ'), ('(ፉ[ዋአ])', 'ፏ'),
            ('[ቊ]', 'ቁ'),  # ቁ can also be written as ቊ
            ('[ኵ]', 'ኩ'),  # ኩ can also be written as ኵ
        ]
        rep = input_token
        for pattern, replacement in substitutions:
            rep = re.sub(pattern, replacement, rep)
        return rep

    # replace any special character or punctuation mark with the empty string
    def remove_punc_and_special_chars(self, sentence_input):
        # punctuation marks in Amharic: ፡ ። ፤ ; ፦ ፧ ፨ ፠ ፣
        normalized_text = re.sub(
            '[\!\@\#\$\%\^\«\»\&\*\(\)\…\[\]\{\}\;\“\”\›\’\‘\"\'\:\,\.\‹\/\<\>\?\\\\|\`\´\~\-\=\+\፡\።\፤\;\፦\፥\፧\፨\፠\፣]',
            '', sentence_input)
        return normalized_text

    # remove ASCII letters, Arabic digits and Geez numerals
    def remove_ascii_and_numbers(self, text_input):
        rm_num_and_ascii = re.sub('[A-Za-z0-9]', '', text_input)
        return re.sub('[\u1369-\u137C]+', '', rm_num_and_ascii)
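A brief usage sketch follows. The gazetteer path short_forms.txt and its sample line ት/ቤት - ትምህርት ቤት are illustrative assumptions; point expansion_file_dir at your own file.

# hypothetical gazetteer path; each line: "<short form> - <expansion>"
normalize.expansion_file_dir = 'short_forms.txt'
norm = normalize()

print(norm.expand_short_form('ት/ቤት'))                       # -> ትምህርት_ቤት (spaces in the expansion become '_')
print(norm.normalize_char_level_missmatch('ጸሐይ'))            # -> ፀሀይ (same output for the variant ጸሀይ)
print(norm.remove_punc_and_special_chars('ሰላም! እንዴት ነህ?'))   # -> ሰላም እንዴት ነህ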
from nltk import BigramCollocationFinder
import nltk.collocations
import io
import re
import os


class DirConfig(object):
    BASE_DIR = '../'
    DATA_DIR = BASE_DIR + 'Dataset/'
    MODEL_DIR = 'Models/'
    EMBED_DIR = MODEL_DIR + 'Embedding/'
    PREPROCESSED_DIR = DATA_DIR + 'normalized/'
    # language-resource file names used by preprocess_text below;
    # the exact names are assumptions, adjust them to your own resource files
    CHARS_DIR = 'char_normalization.txt'
    MULTI_DIR = 'bigrams.txt'
    ABRV_DIR = 'short_forms.txt'
    PUNCT_DIR = 'punctuations.txt'


class normalize(object):

    def tokenize(self, corpus):
        print('Tokenization ...')
        all_tokens = []
        for sentence in corpus:
            tokens = re.compile('[\s+]+').split(sentence)
            all_tokens.extend(tokens)
        return all_tokens

    # read the gazetteer file: each line holds a short form and its expansion separated by '-'
    def get_short_forms(self, _file_dir):
        exp = {}
        with open(_file_dir, encoding='utf8') as text:
            for line in text:
                line = line.strip()
                if not line:  # skip blank lines
                    continue
                expanded = line.split("-")
                exp[expanded[0].strip()] = expanded[1].strip().replace(" ", '_')
        return exp

    def collocation_finder(self, tokens, bigram_dir):
        bigram_measures = nltk.collocations.BigramAssocMeasures()
        # search for bigrams within the corpus
        finder = BigramCollocationFinder.from_words(tokens)
        # keep only bigrams that appear 3 or more times
        finder.apply_freq_filter(3)
        frequent_bigrams = finder.nbest(bigram_measures.chi_sq, 5)  # best bigrams by chi-square score
        print(frequent_bigrams)
        PhraseWriter = io.open(bigram_dir, "w", encoding="utf8")
        for bigram in frequent_bigrams:
            PhraseWriter.write(bigram[0] + ' ' + bigram[1] + "\n")
        PhraseWriter.close()

    def normalize_multi_words(self, tokenized_sentence, bigram_dir, corpus):
        bigram = set()
        sent_with_bigrams = []
        index = 0
        if not os.path.exists(bigram_dir):
            # build the bigram file first, then process the sentence with it
            self.collocation_finder(self.tokenize(corpus), bigram_dir)
            return self.normalize_multi_words(tokenized_sentence, bigram_dir, corpus)
        with open(bigram_dir, encoding='utf8') as text:
            for line in text:
                line = line.strip()
                if not line:  # skip blank lines
                    continue
                bigram.add(line)
        if len(tokenized_sentence) == 1:
            sent_with_bigrams = tokenized_sentence
        else:
            while index <= len(tokenized_sentence) - 2:
                mword = tokenized_sentence[index] + ' ' + tokenized_sentence[index + 1]
                if mword in bigram:
                    # merge the two tokens of a known collocation into one token
                    sent_with_bigrams.append(tokenized_sentence[index] + '' + tokenized_sentence[index + 1])
                    index += 2
                else:
                    sent_with_bigrams.append(tokenized_sentence[index])
                    index += 1
            if index == len(tokenized_sentence) - 1:
                # keep the trailing token that was not part of a bigram
                sent_with_bigrams.append(tokenized_sentence[index])
        return sent_with_bigrams

    # method that expands a short form using the abbreviation file
    def expand_short_form(self, input_short_word, _file_dir):
        if not os.path.exists(_file_dir):
            return input_short_word
        short_form_dict = self.get_short_forms(_file_dir)
        if input_short_word in short_form_dict:
            return short_form_dict[input_short_word]
        return input_short_word

    # method to normalize character-level mismatches such as ጸሀይ and ፀሐይ
    def normalize_char_level_missmatch(self, input_token, lang_resource):
        if not os.path.exists(lang_resource):
            return input_token
        rep = input_token
        with open(lang_resource, encoding='utf8') as text:
            for line in text:
                line = line.strip()
                if not line:  # skip blank lines
                    continue
                chars = line.split()
                chars_from = chars[0]
                chars_to = chars[1]
                rep = re.sub('[' + chars_from + ']', chars_to, rep)
        return rep

    # replace any special character or punctuation mark with the empty string
    def remove_punc_and_special_chars(self, sentence_input, lang_resource):
        # punctuation marks in Amharic: ፡ ። ፤ ; ፦ ፧ ፨ ፠ ፣
        if not os.path.exists(lang_resource):
            return sentence_input
        with open(lang_resource, encoding='utf8') as text:
            chars = text.read()
        sp_chars = chars.split(' ')
        punct = set(sp_chars)
        normalized_text = sentence_input
        for p in punct:
            normalized_text = re.sub('[\\' + p + ']', '', normalized_text)
        return normalized_text

    def preprocess_text(self, text_input, model_dir, corpus):
        normalized_text = []
        CHARS_DIR = model_dir + DirConfig.CHARS_DIR
        MULTI_DIR = model_dir + DirConfig.MULTI_DIR
        ABRV_DIR = model_dir + DirConfig.ABRV_DIR
        PUNCT_DIR = model_dir + DirConfig.PUNCT_DIR
        print('Preprocessing ' + str(len(text_input)) + ' sentences ....')
        for sentence in text_input:
            tokens = re.compile('[\s+]+').split(sentence)
            normalized_token = []
            # merge frequent collocations into single tokens before the per-token steps
            multi_words = self.normalize_multi_words(tokens, MULTI_DIR, corpus)
            for token in multi_words:
                short_rem = self.expand_short_form(token, ABRV_DIR)
                char_normalized = self.normalize_char_level_missmatch(short_rem, CHARS_DIR)
                punct_rem = self.remove_punc_and_special_chars(char_normalized, PUNCT_DIR)
                normalized_token.append(punct_rem)
            normalized_text.append(normalized_token)
        return normalized_text
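A hedged usage sketch of the file-driven pipeline; it assumes the resource files named in DirConfig exist under Models/ (they are not part of this snippet, and each step simply returns its input unchanged if the corresponding file is missing):

corpus = ['ጸሀይ ወጣች ።', 'ዛሬ ት/ቤት አልሄድኩም ።']   # tiny toy corpus
norm = normalize()
cleaned = norm.preprocess_text(corpus, DirConfig.MODEL_DIR, corpus)
print(cleaned)   # list of token lists, one per input sentence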
This code snippet expands decimal numbers into their spoken-text representation. It also automatically converts Arabic numerals to their Geez (Ethiopic) numeral form.
import re


def arabic2geez(arabicNumber):
    ETHIOPIC_ONE = 0x1369
    ETHIOPIC_TEN = 0x1372
    ETHIOPIC_HUNDRED = 0x137B
    ETHIOPIC_TEN_THOUSAND = 0x137C
    arabicNumber = str(arabicNumber)
    n = len(arabicNumber) - 1  # index of the last digit
    if n % 2 == 0:
        # pad to an even number of digits so the number splits cleanly into pairs
        arabicNumber = "0" + arabicNumber
        n += 1
    arabicBigrams = [arabicNumber[i:i + 2] for i in range(0, n, 2)]  # split into digit pairs
    reversedArabic = arabicBigrams[::-1]  # process pairs from least to most significant
    geez = []
    for index, pair in enumerate(reversedArabic):
        curr_geez = ''
        artens = pair[0]  # Arabic tens digit
        arones = pair[1]  # Arabic ones digit
        amtens = ''
        amones = ''
        if artens != '0':
            amtens = str(chr((int(artens) + (ETHIOPIC_TEN - 1))))  # Geez tens [፲, ፳, ፴, ...]
        else:
            if arones == '0':  # skip "00" pairs
                continue
        if arones != '0':
            amones = str(chr((int(arones) + (ETHIOPIC_ONE - 1))))  # Geez ones [፩, ፪, ፫, ...]
        if index > 0:
            if index % 2 != 0:  # odd index: hundreds position
                curr_geez = amtens + amones + str(chr(ETHIOPIC_HUNDRED))  # append ፻
            else:  # even index: ten-thousands position
                curr_geez = amtens + amones + str(chr(ETHIOPIC_TEN_THOUSAND))  # append ፼
        else:  # last pair (right-most part of the number)
            curr_geez = amtens + amones
        geez.append(curr_geez)
    geez = ''.join(geez[::-1])
    if geez.startswith('፩፻') or geez.startswith('፩፼'):
        geez = geez[1:]  # drop the redundant leading ፩ before ፻ or ፼
    if len(arabicNumber) >= 7:
        end_zeros = ''.join(re.findall('([0]+)$', arabicNumber)[0:])
        i = int(len(end_zeros) / 3)
        if len(end_zeros) >= (3 * i):
            if i >= 3:
                i -= 1
            for _ in range(i - 1):
                geez += '፼'
    return geez


def getExpandedNumber(number):
    if '.' not in str(number):
        return arabic2geez(number)
    num, decimal = str(number).split('.')
    if decimal.startswith('0'):
        decimal = decimal[1:]
        dot = ' ነጥብ ዜሮ '  # "point zero"
    else:
        dot = ' ነጥብ '  # "point"
    return arabic2geez(num) + dot + arabic2geez(decimal)
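A quick sanity check; under the logic above these calls should print the following Geez forms:

print(arabic2geez(1996))         # ፲፱፻፺፮
print(arabic2geez(2013))         # ፳፻፲፫
print(getExpandedNumber(3.14))   # ፫ ነጥብ ፲፬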