preprocessing script updates
mattboggess committed Jun 9, 2020
1 parent 95799b3 commit 3d3851e
Showing 9 changed files with 343 additions and 306 deletions.
24 changes: 24 additions & 0 deletions preprocessing/collect_terms.py
@@ -1,9 +1,30 @@
# Collects all terms from all sources into a single standardized dataframe and spacy preprocesses
# the terms.
#
# Author: Matthew Boggess
# Version: 5/28/20
#
# Data Source: Text files of terms output by preprocess_textbooks.py (textbook key terms) and by
# preprocess_kb_bio101_terms.py (KB terms), plus manually curated, hand-labelled term lists copied
# over for select sections.
#
# Description:
# For each term source, extracts the terms and spacy preprocesses them. Also extracts concept
# mappings for the KB terms and event/entity labels where provided. Exports a single dataframe
# with markers for the source of each term and the academic domain it was derived from.

#===================================================================================
# Libraries

import os
import pandas as pd
import re
from tqdm import tqdm
import spacy

#===================================================================================
# Parameters

terms_dir = "../data/preprocessed/terms"

rel_terms_exclude_file = f"{terms_dir}/kb_bio101_relations_exclude.txt"
@@ -26,10 +47,13 @@
'University_Physics_Volume_3': 'physics'
}

#===================================================================================

if __name__ == '__main__':

nlp = spacy.load('en_core_web_sm')

# relations/other terms that should be excluded from our final list
exclude_terms = []
with open(rel_terms_exclude_file, 'r') as fid:
rel_terms = fid.readlines()
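A minimal sketch of the per-source standardization that the collect_terms.py header describes (spacy-preprocess each term, then tag it with its source and academic domain). The helper name, columns, and sample inputs below are illustrative assumptions, not the script's actual code:

import pandas as pd
import spacy

nlp = spacy.load('en_core_web_sm')

def standardize_terms(terms, source, domain):
    # Spacy-preprocess raw term strings and tag them with their source/domain.
    rows = []
    for term in terms:
        doc = nlp(term.strip())
        rows.append({
            'term': term.strip(),
            'lemma': ' '.join(tok.lemma_.lower() for tok in doc),
            'source': source,
            'domain': domain,
        })
    return pd.DataFrame(rows)

# Example usage (hypothetical terms):
# standardize_terms(['mitochondrion', 'cell membrane'], 'Biology_2e', 'biology')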
47 changes: 20 additions & 27 deletions preprocessing/generate_term_pairs.py
@@ -1,3 +1,16 @@
# Enumerates all (sentence, term pair) combinations for a set of term-tagged sentences, to be
# used for relation extraction.
#
# Author: Matthew Boggess
# Version: 5/28/20
#
# Data Source: Tagged sentences dataframes output from tag_sentences.py.
#
# Description:
# For each provided set of tagged sentences: enumerates all term pairs from the set of tagged
# terms in each sentence, creating a row in a new dataframe for each pair.

#===================================================================================
# Libraries

import spacy
@@ -8,34 +21,25 @@
import pandas as pd
import json

#===================================================================================
# Parameters

input_data_dir = '../data/preprocessed/tagged_sentences'
output_data_dir = '../data/preprocessed/term_pair_sentences'

textbooks = [
'Life_Biology',
'Biology_2e'
#'Anatomy_and_Physiology',
#'Astronomy',
#'Chemistry_2e',
#'Microbiology',
#'Psychology',
#'University_Physics_Volume_1',
#'University_Physics_Volume_2',
#'University_Physics_Volume_3',
]

# invalid parts of speech that shouldn't be tagged
invalid_pos = ['JJ', 'JJR', 'JJS', 'MD', 'RB', 'RBR', 'RBS', 'RP', 'VB', 'VBD', 'VBG', 'VBN', 'VBZ',
'VBP', 'WRB']

one_direction = True
#===================================================================================

if __name__ == '__main__':

nlp = spacy.load('en_core_web_sm')

for i, textbook in enumerate(textbooks):
data = pd.read_pickle(f"{input_data_dir}/{textbook}_tagged_sentences.pkl")
data = pd.read_pickle(f"{input_data_dir}/{textbook}_relation_extraction_tagged_sentences.pkl")

new_df = []
for k, row in tqdm(list(data.iterrows())):
@@ -46,10 +50,6 @@
for j in range(i + 1, len(found_terms)):
term_pair = (found_terms[i], found_terms[j])

if found_terms_info[term_pair[0]]['pos'][0][-1] in invalid_pos or \
found_terms_info[term_pair[1]]['pos'][0][-1] in invalid_pos:
continue

indices = get_closest_match(
found_terms_info[term_pair[0]]['indices'],
found_terms_info[term_pair[1]]['indices']
@@ -65,14 +65,7 @@
new_row['term1_location'] = indices[0]
new_row['term2_location'] = indices[1]
new_df.append(new_row)

#if not one_direction:
# term_pair_reverse = (found_terms[j], found_terms[i])
# indices_reverse = get_closest_match(
# found_terms_info[term_pair_reverse[0]]['indices'],
# found_terms_info[term_pair_reverse[1]]['indices']
# )
# found_term_pairs.append((term_pair_reverse, indices_reverse))

data = pd.DataFrame(new_df).reset_index()
data = data.drop(['level_0', 'index'], axis=1)
data = data.drop(['index', 'tags', 'term_info'], axis=1)
data.to_pickle(f"{output_data_dir}/{textbook}_term_pairs.pkl")
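The nested i/j loop in the diff above enumerates every unordered pair of tagged terms in a sentence. An equivalent self-contained sketch using itertools, with an invented input for illustration:

from itertools import combinations

def enumerate_term_pairs(found_terms):
    # Yield one row per unordered pair of terms tagged in a single sentence.
    for term1, term2 in combinations(found_terms, 2):
        yield {'term1': term1, 'term2': term2}

# Example usage (hypothetical tagged terms):
# list(enumerate_term_pairs(['chromosome', 'dna', 'histone']))
# -> [{'term1': 'chromosome', 'term2': 'dna'},
#     {'term1': 'chromosome', 'term2': 'histone'},
#     {'term1': 'dna', 'term2': 'histone'}]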
128 changes: 77 additions & 51 deletions preprocessing/parse_life_bio_sentences.py
@@ -1,14 +1,34 @@
# Converts Life Bio Raw HTML into sentences in pandas dataframe
# Converts Life Biology Raw HTML into individual clean sentences in csv file
#
# Author: Matthew Boggess
# Version: 5/28/20

# Data Source: HTML versions of Life Biology chapters shared by Dr. Chaudhri

# Description:
# For each provided chapter/section, this script finds all valid full sentences for that
# section, cleans up encoding errors, and removes the HTML markup. It assembles everything
# into a single csv file for the entire book.

#===================================================================================
# Libraries

import os
import re
import pandas as pd
from nltk import sent_tokenize

#===================================================================================
# Parameters

data_dir = '../data/raw_data/life_bio/life_bio_html'

# regex for valid sentences
sentence_regex = '<[pl]i*[ >].*?>.*?</[pl]i*>'
# regex for key terms
term_regex = '<span class="bolded-keyword">.*?</span>'

# classes that invalidate the sentence regex
exclude_classes = [
'fig-title',
'h2a',
@@ -30,7 +50,11 @@
'division-title'
]

#===================================================================================
# Helper Functions

def clean_sent(sent):
"""Removes HTML markup and fixes several encoding errors."""
sent = re.sub('<span class="sidebar-division-title".*?>.*?</span>', '', sent)
sent = re.sub('<.*?>', '', sent)
sent = sent.replace('*', '')
Expand All @@ -54,54 +78,56 @@ def clean_sent(sent):
sent = sent.strip()
return sent

chs = sorted([ch for ch in os.listdir(data_dir) if ch.startswith('chapter')])
data = {
'chapter': [],
'section': [],
'section_name': [],
'sentence_number': [],
'sentence': []
}
#===================================================================================

if __name__ == '__main__':

chs = sorted([ch for ch in os.listdir(data_dir) if ch.startswith('chapter')])
data = {
'chapter': [],
'section': [],
'section_name': [],
'sentence_number': [],
'sentence': []
}

for ch in chs:
ch_num = re.match('chapter(\d+)', ch).group(1)
sections = sorted([sect for sect in os.listdir(f"{data_dir}/{ch}") if sect.startswith('chapter')])
for sect in sections:
with open(f"{data_dir}/{ch}/{sect}", 'r') as fid:
text = fid.read()
text = text.replace('\n', ' ')

section = re.match('chapter\d+-*(.*).html', sect).group(1)
if not section:
section = 0
if section == 'summary':
section = len(sections) - 1
section_name = clean_sent(re.findall('<h1 class="concept-title">(.*?)</h1>', text)[0])

for ch in chs:
ch_num = re.match('chapter(\d+)', ch).group(1)
sections = sorted([sect for sect in os.listdir(f"{data_dir}/{ch}") if sect.startswith('chapter')])
for sect in sections:
with open(f"{data_dir}/{ch}/{sect}", 'r') as fid:
text = fid.read()
text = text.replace('\n', ' ')

section = re.match('chapter\d+-*(.*).html', sect).group(1)
if not section:
section = 0
if section == 'summary':
section = len(sections) - 1
section_name = clean_sent(re.findall('<h1 class="concept-title">(.*?)</h1>', text)[0])

sents = re.findall(sentence_regex, text)
terms = re.findall(term_regex, text)
new_sents = []
for sent in sents:
first_class = re.match('.*?class="(.*?)".*', sent)
if first_class:
first_class = first_class.group(1)
else:
first_class = ''
if first_class in exclude_classes:
if ch_num == '39':
print(sent)
continue
new_sents += sent_tokenize(clean_sent(sent))
sents = new_sents
sents = [s for s in sents if s != 'You should be able to:' and \
s != "Apply what you've learned" and \
len(s.split()) > 3 and \
not s.startswith('End of') and \
not s.startswith('Review')]
data['sentence'] += sents
data['sentence_number'] += list(range(1, len(sents) + 1))
data['chapter'] += [ch_num] * len(sents)
data['section'] += [section] * len(sents)
data['section_name'] += [section_name] * len(sents)
data = pd.DataFrame(data)
data.to_csv('../data/preprocessed/parsed_books/sentences_Life_Biology_parsed.csv', index=False)
sents = re.findall(sentence_regex, text)
terms = re.findall(term_regex, text)
new_sents = []
for sent in sents:
first_class = re.match('.*?class="(.*?)".*', sent)
if first_class:
first_class = first_class.group(1)
else:
first_class = ''
if first_class in exclude_classes:
continue
new_sents += sent_tokenize(clean_sent(sent))
sents = new_sents
sents = [s for s in sents if s != 'You should be able to:' and \
s != "Apply what you've learned" and \
len(s.split()) > 3 and \
not s.startswith('End of') and \
not s.startswith('Review')]
data['sentence'] += sents
data['sentence_number'] += list(range(1, len(sents) + 1))
data['chapter'] += [ch_num] * len(sents)
data['section'] += [section] * len(sents)
data['section_name'] += [section_name] * len(sents)
data = pd.DataFrame(data)
data.to_csv('../data/preprocessed/parsed_books/sentences_Life_Biology_parsed.csv', index=False)
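The parsing loop above reduces to three steps: match paragraph/list-item elements with the sentence regex, strip the remaining markup, and sentence-tokenize the result. A compact sketch of that pipeline; the sample HTML is invented and the helper is not part of the script:

import re
from nltk import sent_tokenize  # requires nltk's punkt tokenizer data

sentence_regex = '<[pl]i*[ >].*?>.*?</[pl]i*>'

def html_to_sentences(html):
    # Match <p>/<li>-style elements, drop the tags, and split into sentences.
    html = html.replace('\n', ' ')
    sentences = []
    for fragment in re.findall(sentence_regex, html):
        text = re.sub('<.*?>', '', fragment).strip()
        sentences.extend(sent_tokenize(text))
    return sentences

# Example usage (invented HTML):
# html_to_sentences('<p class="body">Cells divide. Mitosis produces two nuclei.</p>')
# -> ['Cells divide.', 'Mitosis produces two nuclei.']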
29 changes: 20 additions & 9 deletions preprocessing/preprocess_hand_labelled_terms.py
@@ -1,21 +1,32 @@
# Spacy preprocesses hand-labelled term lists for select textbook sections
#
# Author: Matthew Boggess
# Version: 5/28/20

# Data Source: Manually hand-labelled term lists for select textbook sections/chapters

# Description:
# For each provided textbook section, reads in its hand-labelled term list, spacy preprocesses
# the terms, and writes the results to the preprocessed terms directory.

#===================================================================================
# Libraries

import spacy
import pandas as pd
from io import StringIO
import os
from tqdm import tqdm

# text representations of concepts that are too general and thus problematic for text matching
exclude_terms = ['object', 'aggregate', 'group', 'thing', 'region', 'center', 'response',
'series', 'unit', 'result', 'normal', 'divide', 'whole', 'someone', 'somebody',
'feature', 'class']
#===================================================================================
# Parameters

# data directories
exclude_term_file
raw_data_dir = "../data/raw_data/hand_labelled"
output_dir = "../data/preprocessed/terms"

## Important Enumerations

# textbook sections/chapters to be processed
# textbook sections/chapters to be processed that have hand-labelled lists of terms
textbook_sections = [
'openstax_bio2e_section10-2_hand_labelled',
'openstax_bio2e_section10-4_hand_labelled',
@@ -25,7 +36,7 @@

#===================================================================================

if __name__ == "__main__":
if __name__ == '__main__':

for i, section in enumerate(textbook_sections):

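Based on the parameters visible in this diff, preprocess_hand_labelled_terms.py reads a hand-labelled term list per textbook section and spacy-preprocesses the terms. A hypothetical sketch of that flow; the file format, exclusion check, and output columns are assumptions made for illustration:

import pandas as pd
import spacy

nlp = spacy.load('en_core_web_sm')

# Overly general concepts that are problematic for text matching (illustrative subset).
exclude_terms = ['object', 'group', 'thing', 'region']

def preprocess_term_file(path, section):
    # Assumes one term per line; skips blanks and excluded general concepts.
    with open(path, 'r') as fid:
        terms = [line.strip() for line in fid if line.strip()]
    rows = []
    for term in terms:
        if term.lower() in exclude_terms:
            continue
        doc = nlp(term)
        rows.append({'section': section,
                     'term': term,
                     'lemma': ' '.join(tok.lemma_ for tok in doc)})
    return pd.DataFrame(rows)

# Example usage (hypothetical path):
# preprocess_term_file('../data/raw_data/hand_labelled/openstax_bio2e_section10-2_hand_labelled.txt',
#                      'openstax_bio2e_section10-2')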