preprocessing script updates
mattboggess committed Jun 9, 2020
1 parent 95799b3 commit 3d3851e
Showing 9 changed files with 343 additions and 306 deletions.
24 changes: 24 additions & 0 deletions preprocessing/collect_terms.py
@@ -1,9 +1,30 @@
# Collects all terms from all sources into a single standardized dataframe and spacy preprocesses
# the terms.
#
# Author: Matthew Boggess
# Version: 5/28/20
#
# Data Source: Text files of terms output by preprocess_textbooks.py (textbook key terms) and by
# preprocess_kb_bio101_terms.py (KB terms), plus manually curated, hand-labelled term lists copied
# over for select sections.
#
# Description:
# For each term source, extracts the terms and spacy preprocesses them. Also extracts concept
# mappings for the KB terms and event/entity labels where provided. Exports a single dataframe
# with markers for the source of each term and the academic domain it was derived from.

#===================================================================================
# Libraries

import os
import pandas as pd
import re
from tqdm import tqdm
import spacy

#===================================================================================
# Parameters

terms_dir = "../data/preprocessed/terms"

rel_terms_exclude_file = f"{terms_dir}/kb_bio101_relations_exclude.txt"
@@ -26,10 +47,13 @@
'University_Physics_Volume_3': 'physics'
}

#===================================================================================

if __name__ == '__main__':

nlp = spacy.load('en_core_web_sm')

# relations/other terms that should be excluded from our final list
exclude_terms = []
with open(rel_terms_exclude_file, 'r') as fid:
rel_terms = fid.readlines()
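A minimal sketch of the per-source standardization that the collect_terms.py header describes (spacy-preprocess each term, then tag it with its source and academic domain). The helper name, columns, and sample inputs below are illustrative assumptions, not the script's actual code:

import pandas as pd
import spacy

nlp = spacy.load('en_core_web_sm')

def standardize_terms(terms, source, domain):
    # Spacy-preprocess raw term strings and tag them with their source/domain.
    rows = []
    for term in terms:
        doc = nlp(term.strip())
        rows.append({
            'term': term.strip(),
            'lemma': ' '.join(tok.lemma_.lower() for tok in doc),
            'source': source,
            'domain': domain,
        })
    return pd.DataFrame(rows)

# Example usage (hypothetical terms):
# standardize_terms(['mitochondrion', 'cell membrane'], 'Biology_2e', 'biology')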
47 changes: 20 additions & 27 deletions preprocessing/generate_term_pairs.py
@@ -1,3 +1,16 @@
# Enumerates all (sentence, term pair) combinations for a set of term-tagged sentences, to be
# used for relation extraction.
#
# Author: Matthew Boggess
# Version: 5/28/20
#
# Data Source: Tagged sentences dataframes output from tag_sentences.py.
#
# Description:
# For each provided set of tagged sentences: enumerates all term pairs from the set of tagged
# terms in each sentence, creating a row in a new dataframe for each pair.

#===================================================================================
# Libraries

import spacy
@@ -8,34 +21,25 @@
import pandas as pd
import json

#===================================================================================
# Parameters

input_data_dir = '../data/preprocessed/tagged_sentences'
output_data_dir = '../data/preprocessed/term_pair_sentences'

textbooks = [
'Life_Biology',
'Biology_2e'
#'Anatomy_and_Physiology',
#'Astronomy',
#'Chemistry_2e',
#'Microbiology',
#'Psychology',
#'University_Physics_Volume_1',
#'University_Physics_Volume_2',
#'University_Physics_Volume_3',
]

# invalid parts of speech that shouldn't be tagged
invalid_pos = ['JJ', 'JJR', 'JJS', 'MD', 'RB', 'RBR', 'RBS', 'RP', 'VB', 'VBD', 'VBG', 'VBN', 'VBZ',
'VBP', 'WRB']

one_direction = True
#===================================================================================

if __name__ == '__main__':

nlp = spacy.load('en_core_web_sm')

for i, textbook in enumerate(textbooks):
data = pd.read_pickle(f"{input_data_dir}/{textbook}_tagged_sentences.pkl")
data = pd.read_pickle(f"{input_data_dir}/{textbook}_relation_extraction_tagged_sentences.pkl")

new_df = []
for k, row in tqdm(list(data.iterrows())):
@@ -46,10 +50,6 @@
for j in range(i + 1, len(found_terms)):
term_pair = (found_terms[i], found_terms[j])

if found_terms_info[term_pair[0]]['pos'][0][-1] in invalid_pos or \
found_terms_info[term_pair[1]]['pos'][0][-1] in invalid_pos:
continue

indices = get_closest_match(
found_terms_info[term_pair[0]]['indices'],
found_terms_info[term_pair[1]]['indices']
@@ -65,14 +65,7 @@
new_row['term1_location'] = indices[0]
new_row['term2_location'] = indices[1]
new_df.append(new_row)

#if not one_direction:
# term_pair_reverse = (found_terms[j], found_terms[i])
# indices_reverse = get_closest_match(
# found_terms_info[term_pair_reverse[0]]['indices'],
# found_terms_info[term_pair_reverse[1]]['indices']
# )
# found_term_pairs.append((term_pair_reverse, indices_reverse))

data = pd.DataFrame(new_df).reset_index()
data = data.drop(['level_0', 'index'], axis=1)
data = data.drop(['index', 'tags', 'term_info'], axis=1)
data.to_pickle(f"{output_data_dir}/{textbook}_term_pairs.pkl")
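The nested i/j loop in the diff above enumerates every unordered pair of tagged terms in a sentence. An equivalent self-contained sketch using itertools, with an invented input for illustration:

from itertools import combinations

def enumerate_term_pairs(found_terms):
    # Yield one row per unordered pair of terms tagged in a single sentence.
    for term1, term2 in combinations(found_terms, 2):
        yield {'term1': term1, 'term2': term2}

# Example usage (hypothetical tagged terms):
# list(enumerate_term_pairs(['chromosome', 'dna', 'histone']))
# -> [{'term1': 'chromosome', 'term2': 'dna'},
#     {'term1': 'chromosome', 'term2': 'histone'},
#     {'term1': 'dna', 'term2': 'histone'}]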
128 changes: 77 additions & 51 deletions preprocessing/parse_life_bio_sentences.py
@@ -1,14 +1,34 @@
# Converts Life Bio Raw HTML into sentences in pandas dataframe
# Converts Life Biology Raw HTML into individual clean sentences in csv file
#
# Author: Matthew Boggess
# Version: 5/28/20

# Data Source: HTML versions of Life Biology chapters shared by Dr. Chaudhri

# Description:
# For each provided chapter/section, this script finds all valid full sentences for that
# section, cleans up encoding errors, and removes the HTML markup. It assembles everything
# into a single csv file for the entire book.

#===================================================================================
# Libraries

import os
import re
import pandas as pd
from nltk import sent_tokenize

#===================================================================================
# Parameters

data_dir = '../data/raw_data/life_bio/life_bio_html'

# regex for valid sentences
sentence_regex = '<[pl]i*[ >].*?>.*?</[pl]i*>'
# regex for key terms
term_regex = '<span class="bolded-keyword">.*?</span>'

# classes that invalidate the sentence regex
exclude_classes = [
'fig-title',
'h2a',
@@ -30,7 +50,11 @@
'division-title'
]

#===================================================================================
# Helper Functions

def clean_sent(sent):
"""Removes HTML markup and fixes several encoding errors."""
sent = re.sub('<span class="sidebar-division-title".*?>.*?</span>', '', sent)
sent = re.sub('<.*?>', '', sent)
sent = sent.replace('*', '')
Expand All @@ -54,54 +78,56 @@ def clean_sent(sent):
sent = sent.strip()
return sent

chs = sorted([ch for ch in os.listdir(data_dir) if ch.startswith('chapter')])
data = {
'chapter': [],
'section': [],
'section_name': [],
'sentence_number': [],
'sentence': []
}
#===================================================================================

if __name__ == '__main__':

chs = sorted([ch for ch in os.listdir(data_dir) if ch.startswith('chapter')])
data = {
'chapter': [],
'section': [],
'section_name': [],
'sentence_number': [],
'sentence': []
}

for ch in chs:
ch_num = re.match('chapter(\d+)', ch).group(1)
sections = sorted([sect for sect in os.listdir(f"{data_dir}/{ch}") if sect.startswith('chapter')])
for sect in sections:
with open(f"{data_dir}/{ch}/{sect}", 'r') as fid:
text = fid.read()
text = text.replace('\n', ' ')

section = re.match('chapter\d+-*(.*).html', sect).group(1)
if not section:
section = 0
if section == 'summary':
section = len(sections) - 1
section_name = clean_sent(re.findall('<h1 class="concept-title">(.*?)</h1>', text)[0])

for ch in chs:
ch_num = re.match('chapter(\d+)', ch).group(1)
sections = sorted([sect for sect in os.listdir(f"{data_dir}/{ch}") if sect.startswith('chapter')])
for sect in sections:
with open(f"{data_dir}/{ch}/{sect}", 'r') as fid:
text = fid.read()
text = text.replace('\n', ' ')

section = re.match('chapter\d+-*(.*).html', sect).group(1)
if not section:
section = 0
if section == 'summary':
section = len(sections) - 1
section_name = clean_sent(re.findall('<h1 class="concept-title">(.*?)</h1>', text)[0])

sents = re.findall(sentence_regex, text)
terms = re.findall(term_regex, text)
new_sents = []
for sent in sents:
first_class = re.match('.*?class="(.*?)".*', sent)
if first_class:
first_class = first_class.group(1)
else:
first_class = ''
if first_class in exclude_classes:
if ch_num == '39':
print(sent)
continue
new_sents += sent_tokenize(clean_sent(sent))
sents = new_sents
sents = [s for s in sents if s != 'You should be able to:' and \
s != "Apply what you've learned" and \
len(s.split()) > 3 and \
not s.startswith('End of') and \
not s.startswith('Review')]
data['sentence'] += sents
data['sentence_number'] += list(range(1, len(sents) + 1))
data['chapter'] += [ch_num] * len(sents)
data['section'] += [section] * len(sents)
data['section_name'] += [section_name] * len(sents)
data = pd.DataFrame(data)
data.to_csv('../data/preprocessed/parsed_books/sentences_Life_Biology_parsed.csv', index=False)
sents = re.findall(sentence_regex, text)
terms = re.findall(term_regex, text)
new_sents = []
for sent in sents:
first_class = re.match('.*?class="(.*?)".*', sent)
if first_class:
first_class = first_class.group(1)
else:
first_class = ''
if first_class in exclude_classes:
continue
new_sents += sent_tokenize(clean_sent(sent))
sents = new_sents
sents = [s for s in sents if s != 'You should be able to:' and \
s != "Apply what you've learned" and \
len(s.split()) > 3 and \
not s.startswith('End of') and \
not s.startswith('Review')]
data['sentence'] += sents
data['sentence_number'] += list(range(1, len(sents) + 1))
data['chapter'] += [ch_num] * len(sents)
data['section'] += [section] * len(sents)
data['section_name'] += [section_name] * len(sents)
data = pd.DataFrame(data)
data.to_csv('../data/preprocessed/parsed_books/sentences_Life_Biology_parsed.csv', index=False)
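The parsing loop above reduces to three steps: match paragraph/list-item elements with the sentence regex, strip the remaining markup, and sentence-tokenize the result. A compact sketch of that pipeline; the sample HTML is invented and the helper is not part of the script:

import re
from nltk import sent_tokenize  # requires nltk's punkt tokenizer data

sentence_regex = '<[pl]i*[ >].*?>.*?</[pl]i*>'

def html_to_sentences(html):
    # Match <p>/<li>-style elements, drop the tags, and split into sentences.
    html = html.replace('\n', ' ')
    sentences = []
    for fragment in re.findall(sentence_regex, html):
        text = re.sub('<.*?>', '', fragment).strip()
        sentences.extend(sent_tokenize(text))
    return sentences

# Example usage (invented HTML):
# html_to_sentences('<p class="body">Cells divide. Mitosis produces two nuclei.</p>')
# -> ['Cells divide.', 'Mitosis produces two nuclei.']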
29 changes: 20 additions & 9 deletions preprocessing/preprocess_hand_labelled_terms.py
@@ -1,21 +1,32 @@
# Spacy preprocesses hand-labelled term lists for select textbook sections
#
# Author: Matthew Boggess
# Version: 5/28/20

# Data Source: Manually hand-labelled term lists for select textbook sections/chapters

# Description:
# For each provided textbook section, reads in its hand-labelled term list, spacy preprocesses
# the terms, and writes the results to the preprocessed terms directory.

#===================================================================================
# Libraries

import spacy
import pandas as pd
from io import StringIO
import os
from tqdm import tqdm

# text representations of concepts that are too general and thus problematic for text matching
exclude_terms = ['object', 'aggregate', 'group', 'thing', 'region', 'center', 'response',
'series', 'unit', 'result', 'normal', 'divide', 'whole', 'someone', 'somebody',
'feature', 'class']
#===================================================================================
# Parameters

# data directories
exclude_term_file
raw_data_dir = "../data/raw_data/hand_labelled"
output_dir = "../data/preprocessed/terms"

## Important Enumerations

# textbook sections/chapters to be processed
# textbook sections/chapters to be processed that have hand-labelled lists of terms
textbook_sections = [
'openstax_bio2e_section10-2_hand_labelled',
'openstax_bio2e_section10-4_hand_labelled',
@@ -25,7 +36,7 @@

#===================================================================================

if __name__ == "__main__":
if __name__ == '__main__':

for i, section in enumerate(textbook_sections):

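Based on the parameters visible in this diff, preprocess_hand_labelled_terms.py reads a hand-labelled term list per textbook section and spacy-preprocesses the terms. A hypothetical sketch of that flow; the file format, exclusion check, and output columns are assumptions made for illustration:

import pandas as pd
import spacy

nlp = spacy.load('en_core_web_sm')

# Overly general concepts that are problematic for text matching (illustrative subset).
exclude_terms = ['object', 'group', 'thing', 'region']

def preprocess_term_file(path, section):
    # Assumes one term per line; skips blanks and excluded general concepts.
    with open(path, 'r') as fid:
        terms = [line.strip() for line in fid if line.strip()]
    rows = []
    for term in terms:
        if term.lower() in exclude_terms:
            continue
        doc = nlp(term)
        rows.append({'section': section,
                     'term': term,
                     'lemma': ' '.join(tok.lemma_ for tok in doc)})
    return pd.DataFrame(rows)

# Example usage (hypothetical path):
# preprocess_term_file('../data/raw_data/hand_labelled/openstax_bio2e_section10-2_hand_labelled.txt',
#                      'openstax_bio2e_section10-2')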