Skip to content

Commit

Permalink
qallme importer (MozillaItalia#55)
Browse files Browse the repository at this point in the history
  • Loading branch information
dag7dev authored Apr 13, 2020
1 parent f065520 commit 130f56e
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 4 deletions.
79 changes: 79 additions & 0 deletions MITADS/qallme_importer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import os
import xml.etree.ElementTree as ET
from utils import sanitize, line_rules, download

# QALLME importer script: downloads the QALLME Italian benchmark archive,
# extracts every <question>/<text> sentence from its XML file, normalizes each
# sentence with the project sanitizer and writes one sentence per line to
# output/qallme.txt.

# managing sanitizer
sanitizer = sanitize.Sanitization()

# Replacement pairs [character, replacement] handed to the sanitizer: each of
# these markup/bracket characters is mapped to the empty string.
mapping_normalization = [
    [u'*', u''],
    [u'/', u''],
    [u'#', u''],
    [u'{', u''],
    [u'}', u''],
    [u'(', u''],
    [u')', u''],
    [u'[', u''],
    [u']', u''],
]

# managing parse directory name
parsedir = "parsing" + os.path.sep

# managing output pathname + output filename
outdir = "output" + os.path.sep
filename = "qallme.txt"

# Resolve the output location now: the os.chdir() call below changes the
# working directory, so a relative path opened later would point elsewhere.
output_path = os.path.abspath(outdir + filename)

print("Qallme Importer")
print("===============")

# downloading resource
download_link = 'http://qallme.fbk.eu/archive/QB_IT_V1.0_TranscriptionsReferences.zip'

downloader = download.Download()
downloader = downloader.if_not_exist(download_link)

# extracting files
downloader.zip_decompress(parsedir)

# going to the right directory (name of the folder inside the zip package)
os.chdir(parsedir + "QB_IT_V1.0_Translations")


### XML ###
qallmef = ET.parse("QallmebenchmarkIT_v1.0_final-translation.xml")

text_elements = qallmef.findall("question/text")
len_sentences = len(text_elements)

print("Now parsing " + str(len_sentences) + " sentences... ")

# We are looking for sentences, not xml elements: take the text of each
# element, skip empty ones (text is None) and keep the *sanitized* result.
# The previous loop assigned the sanitizer's return value to its loop variable
# only, so the normalized text was thrown away and raw text was written out.
# NOTE(review): assumes maybe_normalize() returns the normalized string, as
# the original assignment implied - confirm against utils/sanitize.py.
sentences = [
    sanitizer.maybe_normalize(element.text, mapping_normalization)
    for element in text_elements
    if element.text is not None
]

print("Now writing to " + outdir + filename + "... ")

# writing to output file; the context manager guarantees the file is flushed
# and closed, and the file is only created once parsing has succeeded.
with open(output_path, "w", encoding='utf-8') as output_file:
    for line in sentences:
        output_file.write(line)
        output_file.write("\n")

print("Import from QALLME completed!")
3 changes: 3 additions & 0 deletions MITADS/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
gutenberg
Unidecode
fake-useragent
roman
BeautifulSoup4

4 changes: 2 additions & 2 deletions MITADS/utils/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def bz2_decompress(self, file=''):
file = self.file

extract_to = self.file.replace('.bz2','')
if not os.path.isfile(extract_to):
if os.path.isfile(extract_to):
print('Decompressing to ' + extract_to)
with open(extract_to, 'wb') as new_file, bz2.BZ2File(self.file, 'rb') as file:
for data in iter(lambda : file.read(100 * 1024), b''):
Expand All @@ -60,7 +60,7 @@ def zip_decompress(self, extract_to="", file=""):
if file == '':
file = self.file

if not os.path.isdir(extract_to):
if os.path.isdir(extract_to):
print('Decompressing to ' + extract_to)
with zipfile.ZipFile(self.file, "r") as zip_ref:
zip_ref.extractall(extract_to)
Expand Down
6 changes: 4 additions & 2 deletions MITADS/utils/sanitize.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,5 +127,7 @@ def clean_single_line(self, value):

if(value.isupper()):
value = value.lower()

return value

value = value.strip() # clean line from whitespace at the beginning / at the end

return value

0 comments on commit 130f56e

Please sign in to comment.