Merge pull request #30 from vaquierm/marine/vocabDates
added regex for years - not tested yet
hmarine authored Oct 12, 2019
2 parents 8120ad8 + 024d45d commit e82ee8e
Showing 4 changed files with 67 additions and 18 deletions.
7 changes: 0 additions & 7 deletions src/config.py
@@ -7,13 +7,6 @@
# Path to which scripts will dump data
results_dir_path: str = "../results"

# Vocab token for youtubelink
token_youtube_link = "youtubelink"
# Vocab token for internetlink
token_internet_link = "internetlink"
# Vocab token for emoticonFunny
token_emoticon_funny = "emoticonFunny"

# These are all the different dictionary names ("LEMMA", "STEM")
vocabularies_to_run = ["LEMMA", "STEM"]

8 changes: 4 additions & 4 deletions src/create_vocabularies.py
Expand Up @@ -4,6 +4,7 @@
from src.config import vocabularies_to_run, raw_data_dir_path, processed_dir_path
from src.utils.utils import load_raw_test_data, load_raw_training_data
from src.data_processing.vocabulary import create_vocab
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS

from src.utils.utils import save_cleaned_raw_data

@@ -31,14 +32,13 @@ def create_vocabularies():
print("\tCreating vocabulary: " + vocabulary)

# Create the vocabs
comments_train_clean, comments_test_clean = create_vocab(comments_train, comments_test, vocabulary)
comments_train_clean, comments_test_clean, additional_features_train, additional_features_test = create_vocab(comments_train, comments_test, vocabulary)

# Save both cleaned comment lists to csv files
clean_raw_train_path = os.path.join(processed_dir_path, vocabulary + "_train_clean.csv")
save_cleaned_raw_data(clean_raw_train_path, train_raw_data_path, comments_train_clean)
save_cleaned_raw_data(clean_raw_train_path, train_raw_data_path, comments_train_clean, additional_features_train)
clean_raw_test_path = os.path.join(processed_dir_path, vocabulary + "_test_clean.csv")
save_cleaned_raw_data(clean_raw_test_path, test_raw_data_path, comments_test_clean)

save_cleaned_raw_data(clean_raw_test_path, test_raw_data_path, comments_test_clean, additional_features_test)

if __name__ == '__main__':
create_vocabularies()
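For orientation, here is a hedged sketch of how the extended call chain could be exercised end to end. The comment strings and file paths are invented placeholders, and running it assumes the repository modules are importable and the referenced raw CSV files exist.

from src.data_processing.vocabulary import create_vocab
from src.utils.utils import save_cleaned_raw_data

# Placeholder inputs, not data from the repository.
comments_train = ["Check https://www.youtube.com/watch?v=abc, posted in 1999 :)"]
comments_test = ["Everyone has predictions for 2020"]

# create_vocab now returns two extra dicts of per-comment features,
# e.g. {"comment_length": [...], "average_word_length": [...]}.
train_clean, test_clean, feats_train, feats_test = create_vocab(
    comments_train, comments_test, "LEMMA")

# save_cleaned_raw_data accepts the feature dict and writes each entry as a column.
save_cleaned_raw_data("LEMMA_train_clean.csv", "train_raw.csv", train_clean, feats_train)
save_cleaned_raw_data("LEMMA_test_clean.csv", "test_raw.csv", test_clean, feats_test)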
62 changes: 56 additions & 6 deletions src/data_processing/vocabulary.py
@@ -4,11 +4,17 @@
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from config import token_youtube_link, token_emoticon_funny, token_internet_link

# Lemmatization was compared using diff libraries https://www.machinelearningplus.com/nlp/lemmatization-examples-python/

from nltk.tokenize import TweetTokenizer, RegexpTokenizer

# Vocab token for youtubelink
token_youtube_link = "youtubelink"
# Vocab token for internetlink
token_internet_link = "internetlink"
# Vocab token for emoticonFunny
token_emoticon_funny = "emoticonFunny"
# Vocab token for year
token_year1900 = "year1900"
token_year2000 = "year2000"

def create_vocab(comments_train: list, comments_test: list, vocab_type: str):
"""
@@ -20,10 +26,23 @@ def create_vocab(comments_train: list, comments_test: list, vocab_type: str):
:return: The processed list of training comments, The processed list of test comments
"""

additional_features_train = {}
additional_features_test = {}

# Preprocess the dataset by applying custom replacers
print("\t\ttrain: Get comment length and average word length in the comment")
comments_length_train, average_word_length_train = lenghtOfComments(comments_train)
additional_features_train.update({'comment_length': comments_length_train})
additional_features_train.update({'average_word_length': average_word_length_train})

print("\t\tApplying custom replacers")
comments_train = replace_all_for_strong_vocab(comments_train)

print("\t\tTest: Get comment length and average word length in the comment")
comments_length_test, average_word_length_test = lenghtOfComments(comments_test)
additional_features_test.update({'comment_length': comments_length_test})
additional_features_test.update({'average_word_length': average_word_length_test})

# Get the root of each words
if vocab_type == "LEMMA":
print("\t\tLemmatizing training set")
@@ -38,7 +57,7 @@ def create_vocab(comments_train: list, comments_test: list, vocab_type: str):
else:
raise Exception("The type of vocabulary " + vocab_type + " is not known")

return comments_train, comments_test
return comments_train, comments_test, additional_features_train, additional_features_test


def lemmatize_comments(comments):
@@ -110,11 +129,32 @@ def reduce_lengthening(text):
return pattern.sub(r"\1\1", text)


def lenghtOfComments(comments):
tokenizer = RegexpTokenizer(r'\w+')
lengthOfComments = []
averageLengthOfWords = []
for i in range(len(comments)):
tokened_sentence = tokenizer.tokenize(comments[i])
lengthOfComments.append(len(tokened_sentence))

sum = 0
for k in range(len(tokened_sentence)):
lengthOfWord = len(tokened_sentence[k])
sum += lengthOfWord
average = round(sum/len(tokened_sentence), 0)
averageLengthOfWords.append(average)

return lengthOfComments, averageLengthOfWords


def replace_all_for_strong_vocab(comments):
for i in range(len(comments)):
comments[i] = replace_youtube_links(comments[i])
comments[i] = replace_url(comments[i])
comments[i] = replace_smiley(comments[i])
comments[i] = replace_years1900(comments[i])
comments[i] = replace_years2000(comments[i])

return comments


@@ -144,3 +184,13 @@ def replace_smiley(comment):
comment = sentence_untokenized

return comment


def replace_years1900(comment):
regex = (r'(19)\d\d')
return re.sub(regex, token_year1900, comment)


def replace_years2000(comment):
regex = (r'(20)\d\d')
return re.sub(regex, token_year2000, comment)
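To make the behaviour of the new year replacers concrete, below is a standalone sketch using only the standard library; the sample comment is made up, and, as the commit message notes, the pattern itself is still untested.

import re

token_year1900 = "year1900"
token_year2000 = "year2000"

def replace_years1900(comment):
    # Replace any 19xx digit sequence with the shared vocab token.
    return re.sub(r'(19)\d\d', token_year1900, comment)

def replace_years2000(comment):
    # Replace any 20xx digit sequence with the shared vocab token.
    return re.sub(r'(20)\d\d', token_year2000, comment)

sample = "The 1994 paper was revisited in 2019."
print(replace_years2000(replace_years1900(sample)))
# -> The year1900 paper was revisited in year2000.
# Note: without word boundaries the pattern also fires inside longer numbers
# (e.g. "31999" becomes "3year1900"), which may or may not be intended.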
8 changes: 7 additions & 1 deletion src/utils/utils.py
@@ -114,7 +114,7 @@ def load_raw_test_data(file_path: str):
return np.array(ids).reshape((len(ids), 1)), list(df['comments'])


def save_cleaned_raw_data(file_path: str, og_file_path: str, comments:list):
def save_cleaned_raw_data(file_path: str, og_file_path: str, comments:list, additional_features: dict = {}):
"""
Saves the clean raw data after lemmatization
:param file_path: The file path to save the new clean raw data
@@ -128,6 +128,12 @@ def save_cleaned_raw_data(file_path: str, og_file_path: str, comments:list):

df.loc[:, 'comments'] = pd.Series(comments)

listOfAdditionalFeatures = list(additional_features.items())

for i in range(len(additional_features.keys())):
feature = listOfAdditionalFeatures[i]
df.loc[:, feature[0]] = pd.Series(feature[1])

df.to_csv(file_path, mode='w', index=False)
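The feature-column loop above walks list(additional_features.items()) by index; iterating over the dict directly gives the same result. A minimal standalone sketch of that behaviour, with an invented DataFrame and feature values:

import pandas as pd

# Hypothetical cleaned comments and extra features, mirroring the shapes
# produced by create_vocab.
df = pd.DataFrame({"comments": ["youtubelink great video", "made in year2000"]})
additional_features = {"comment_length": [3, 3], "average_word_length": [7.0, 5.0]}

# Each dict entry becomes one new column next to the cleaned comments.
for name, values in additional_features.items():
    df.loc[:, name] = pd.Series(values)

df.to_csv("clean_with_features.csv", mode="w", index=False)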

