Merge pull request #30 from vaquierm/marine/vocabDates
added regex for years - not tested yet
hmarine authored Oct 12, 2019
2 parents 8120ad8 + 024d45d commit e82ee8e
Showing 4 changed files with 67 additions and 18 deletions.
7 changes: 0 additions & 7 deletions src/config.py
@@ -7,13 +7,6 @@
# Path to which scripts will dump data
results_dir_path: str = "../results"

# Vocab token for youtubelink
token_youtube_link = "youtubelink"
# Vocab token for internetlink
token_internet_link = "internetlink"
# Vocab token for emoticonFunny
token_emoticon_funny = "emoticonFunny"

# These are all the different dictionary names ("LEMMA", "STEM")
vocabularies_to_run = ["LEMMA", "STEM"]

8 changes: 4 additions & 4 deletions src/create_vocabularies.py
Expand Up @@ -4,6 +4,7 @@
from src.config import vocabularies_to_run, raw_data_dir_path, processed_dir_path
from src.utils.utils import load_raw_test_data, load_raw_training_data
from src.data_processing.vocabulary import create_vocab
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS

from src.utils.utils import save_cleaned_raw_data

@@ -31,14 +32,13 @@ def create_vocabularies():
print("\tCreating vocabulary: " + vocabulary)

# Create the vocabs
comments_train_clean, comments_test_clean = create_vocab(comments_train, comments_test, vocabulary)
comments_train_clean, comments_test_clean, additional_features_train, additional_features_test = create_vocab(comments_train, comments_test, vocabulary)

# Save both cleaned comment lists to csv files
clean_raw_train_path = os.path.join(processed_dir_path, vocabulary + "_train_clean.csv")
save_cleaned_raw_data(clean_raw_train_path, train_raw_data_path, comments_train_clean)
save_cleaned_raw_data(clean_raw_train_path, train_raw_data_path, comments_train_clean, additional_features_train)
clean_raw_test_path = os.path.join(processed_dir_path, vocabulary + "_test_clean.csv")
save_cleaned_raw_data(clean_raw_test_path, test_raw_data_path, comments_test_clean)

save_cleaned_raw_data(clean_raw_test_path, test_raw_data_path, comments_test_clean, additional_features_test)

if __name__ == '__main__':
create_vocabularies()
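For orientation, here is a hedged sketch of how the extended call chain could be exercised end to end. The comment strings and file paths are invented placeholders, and running it assumes the repository modules are importable and the referenced raw CSV files exist.

from src.data_processing.vocabulary import create_vocab
from src.utils.utils import save_cleaned_raw_data

# Placeholder inputs, not data from the repository.
comments_train = ["Check https://www.youtube.com/watch?v=abc, posted in 1999 :)"]
comments_test = ["Everyone has predictions for 2020"]

# create_vocab now returns two extra dicts of per-comment features,
# e.g. {"comment_length": [...], "average_word_length": [...]}.
train_clean, test_clean, feats_train, feats_test = create_vocab(
    comments_train, comments_test, "LEMMA")

# save_cleaned_raw_data accepts the feature dict and writes each entry as a column.
save_cleaned_raw_data("LEMMA_train_clean.csv", "train_raw.csv", train_clean, feats_train)
save_cleaned_raw_data("LEMMA_test_clean.csv", "test_raw.csv", test_clean, feats_test)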
62 changes: 56 additions & 6 deletions src/data_processing/vocabulary.py
@@ -4,11 +4,17 @@
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from config import token_youtube_link, token_emoticon_funny, token_internet_link

# Lemmatization was compared using diff libraries https://www.machinelearningplus.com/nlp/lemmatization-examples-python/

from nltk.tokenize import TweetTokenizer, RegexpTokenizer

# Vocab token for youtubelink
token_youtube_link = "youtubelink"
# Vocab token for internetlink
token_internet_link = "internetlink"
# Vocab token for emoticonFunny
token_emoticon_funny = "emoticonFunny"
# Vocab token for year
token_year1900 = "year1900"
token_year2000 = "year2000"

def create_vocab(comments_train: list, comments_test: list, vocab_type: str):
"""
@@ -20,10 +26,23 @@ def create_vocab(comments_train: list, comments_test: list, vocab_type: str):
:return: The processed list of training comments, The processed list of test comments
"""

additional_features_train = {}
additional_features_test = {}

# Preprocess the dataset by applying custom replacers
print("\t\ttrain: Get comment length and average word length in the comment")
comments_length_train, average_word_length_train = lenghtOfComments(comments_train)
additional_features_train.update({'comment_length': comments_length_train})
additional_features_train.update({'average_word_length': average_word_length_train})

print("\t\tApplying custom replacers")
comments_train = replace_all_for_strong_vocab(comments_train)

print("\t\tTest: Get comment length and average word length in the comment")
comments_length_test, average_word_length_test = lenghtOfComments(comments_test)
additional_features_test.update({'comment_length': comments_length_test})
additional_features_test.update({'average_word_length': average_word_length_test})

# Get the root of each words
if vocab_type == "LEMMA":
print("\t\tLemmatizing training set")
@@ -38,7 +57,7 @@ def create_vocab(comments_train: list, comments_test: list, vocab_type: str):
else:
raise Exception("The type of vocabulary " + vocab_type + " is not known")

return comments_train, comments_test
return comments_train, comments_test, additional_features_train, additional_features_test


def lemmatize_comments(comments):
@@ -110,11 +129,32 @@ def reduce_lengthening(text):
return pattern.sub(r"\1\1", text)


def lenghtOfComments(comments):
tokenizer = RegexpTokenizer(r'\w+')
lengthOfComments = []
averageLengthOfWords = []
for i in range(len(comments)):
tokened_sentence = tokenizer.tokenize(comments[i])
lengthOfComments.append(len(tokened_sentence))

sum = 0
for k in range(len(tokened_sentence)):
lengthOfWord = len(tokened_sentence[k])
sum += lengthOfWord
average = round(sum/len(tokened_sentence), 0)
averageLengthOfWords.append(average)

return lengthOfComments, averageLengthOfWords


def replace_all_for_strong_vocab(comments):
for i in range(len(comments)):
comments[i] = replace_youtube_links(comments[i])
comments[i] = replace_url(comments[i])
comments[i] = replace_smiley(comments[i])
comments[i] = replace_years1900(comments[i])
comments[i] = replace_years2000(comments[i])

return comments


@@ -144,3 +184,13 @@ def replace_smiley(comment):
comment = sentence_untokenized

return comment


def replace_years1900(comment):
regex = (r'(19)\d\d')
return re.sub(regex, token_year1900, comment)


def replace_years2000(comment):
regex = (r'(20)\d\d')
return re.sub(regex, token_year2000, comment)
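To make the behaviour of the new year replacers concrete, below is a standalone sketch using only the standard library; the sample comment is made up, and, as the commit message notes, the pattern itself is still untested.

import re

token_year1900 = "year1900"
token_year2000 = "year2000"

def replace_years1900(comment):
    # Replace any 19xx digit sequence with the shared vocab token.
    return re.sub(r'(19)\d\d', token_year1900, comment)

def replace_years2000(comment):
    # Replace any 20xx digit sequence with the shared vocab token.
    return re.sub(r'(20)\d\d', token_year2000, comment)

sample = "The 1994 paper was revisited in 2019."
print(replace_years2000(replace_years1900(sample)))
# -> The year1900 paper was revisited in year2000.
# Note: without word boundaries the pattern also fires inside longer numbers
# (e.g. "31999" becomes "3year1900"), which may or may not be intended.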
8 changes: 7 additions & 1 deletion src/utils/utils.py
@@ -114,7 +114,7 @@ def load_raw_test_data(file_path: str):
return np.array(ids).reshape((len(ids), 1)), list(df['comments'])


def save_cleaned_raw_data(file_path: str, og_file_path: str, comments:list):
def save_cleaned_raw_data(file_path: str, og_file_path: str, comments:list, additional_features: dict = {}):
"""
Saves the clean raw data after lemmatization
:param file_path: The file path to save the new clean raw data
@@ -128,6 +128,12 @@ def save_cleaned_raw_data(file_path: str, og_file_path: str, comments:list):

df.loc[:, 'comments'] = pd.Series(comments)

listOfAdditionalFeatures = list(additional_features.items())

for i in range(len(additional_features.keys())):
feature = listOfAdditionalFeatures[i]
df.loc[:, feature[0]] = pd.Series(feature[1])

df.to_csv(file_path, mode='w', index=False)
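The feature-column loop above walks list(additional_features.items()) by index; iterating over the dict directly gives the same result. A minimal standalone sketch of that behaviour, with an invented DataFrame and feature values:

import pandas as pd

# Hypothetical cleaned comments and extra features, mirroring the shapes
# produced by create_vocab.
df = pd.DataFrame({"comments": ["youtubelink great video", "made in year2000"]})
additional_features = {"comment_length": [3, 3], "average_word_length": [7.0, 5.0]}

# Each dict entry becomes one new column next to the cleaned comments.
for name, values in additional_features.items():
    df.loc[:, name] = pd.Series(values)

df.to_csv("clean_with_features.csv", mode="w", index=False)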

