Skip to content

Commit

Permalink
added new R files containing data analysis functions
Browse files Browse the repository at this point in the history
  • Loading branch information
sheikhshafayat committed May 26, 2022
1 parent 6bf012f commit 552132b
Show file tree
Hide file tree
Showing 5 changed files with 187 additions and 36 deletions.
58 changes: 58 additions & 0 deletions DataAnalysis/Q3/BigramWordCloud.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
library(pacman) #my package manager

#load necessary packages
p_load(ggplot2)
p_load(dplyr)
p_load(reshape2)
p_load(gridExtra)
p_load(stringr)
p_load(tidytext)
p_load(tidyr)
p_load(wordcloud)
p_load(tm)

#### Loading all the data
# All four CSVs live in the same q3 directory; build the prefix once so a
# new machine only needs one edit (the original repeated the absolute path
# four times).
data_dir <- "/Volumes/GoogleDrive/My Drive/Spring 2022/Data Science Methodology/UkraineConflictOnTwitter/SentimentAnalysis/data/q3"
foxnews  <- read.csv(file.path(data_dir, "fox_news_Final_with_sentiment.csv"))
nytimes  <- read.csv(file.path(data_dir, "new_york_times_Final_with_sentiment.csv"))
foxtitle <- read.csv(file.path(data_dir, "FoxNews_Sheikh_with_sentiment.csv"))
nytitle  <- read.csv(file.path(data_dir, "NYT_Sheikh_with_sentiment.csv"))

# Draw a bigram (two-word) word cloud from a data frame of tweets.
#
# foxnews: a data frame with a `text` column (parameter name kept for
#   backward compatibility; any of the loaded datasets can be passed).
# Side effect: renders the cloud on the active graphics device.
# Returns the result of wordcloud() (invisible/NULL by convention).
#
# Fixes vs. original: the anti_join(stop_words) on the raw bigram tokens
# was a no-op (stop_words holds single words, bigram strings never match)
# and only emitted a spurious join message — the real stop-word filtering
# happens below, after the bigram is split. Also ungroups before
# converting, and seeds the RNG for a reproducible layout, matching
# make_cloud() in this commit.
bigram_wc <- function(foxnews){
  # Tokenize text into bigrams, then split "w1 w2" into two columns so
  # each half can be filtered independently.
  bg_fox <- foxnews %>%
    unnest_tokens(word, text, token = "ngrams", n = 2) %>%
    separate(word, c("word1", "word2"), sep = " ")

  # Drop bigrams containing stop words or corpus-dominating noise tokens.
  avoid_list <- c("russia", "ukraine", "user", "http", "fox", "york")
  filter_bg_fox <- bg_fox %>%
    filter(!word1 %in% stop_words$word) %>%
    filter(!word2 %in% stop_words$word) %>%
    filter(!word1 %in% avoid_list) %>%
    filter(!word2 %in% avoid_list)

  # Count each distinct bigram, most frequent first; ungroup so no
  # grouping metadata leaks into the plain data frame.
  count_bg <- filter_bg_fox %>%
    group_by(word1, word2) %>%
    tally(sort = TRUE) %>%
    ungroup() %>%
    as.data.frame()

  # Re-join the two halves for display in the cloud.
  count_bg$bigram <- paste(count_bg$word1, count_bg$word2, sep = " ")

  set.seed(1234)  # reproducible layout (consistent with make_cloud)
  wc <- wordcloud(words = count_bg$bigram, freq = count_bg$n,
                  min.freq = 1, max.words = 200,
                  random.order = FALSE, rot.per = 0.35,
                  colors = brewer.pal(8, "Dark2"))
  return(wc)
}

# Per-sentiment subsets of each outlet's tweets.
fox_pos <- foxnews %>% filter(label == "Positive")
fox_neg <- foxnews %>% filter(label == "Negative")
fox_neu <- foxnews %>% filter(label == "Neutral")

nyt_pos <- nytimes %>% filter(label == "Positive")
nyt_neg <- nytimes %>% filter(label == "Negative")
nyt_neu <- nytimes %>% filter(label == "Neutral")

# Draw the bigram clouds. The original tail was copy-pasted from
# Cooccurance.R: it called coocc_func(), which is not defined in this
# file, and type(), which is not an R function — this file's entry point
# is bigram_wc().
bigram_wc(foxnews)
bigram_wc(nytimes)
48 changes: 48 additions & 0 deletions DataAnalysis/Q3/Cooccurance.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
library(pacman) #my package manager

#load necessary packages
p_load(ggplot2)
p_load(dplyr)
p_load(reshape2)
p_load(gridExtra)
p_load(stringr)
p_load(tidytext)
p_load(tidyr)

#### Loading all the data
# Shared q3 data directory; defined once so the absolute path is not
# repeated in every read.csv call.
data_dir <- "/Volumes/GoogleDrive/My Drive/Spring 2022/Data Science Methodology/UkraineConflictOnTwitter/SentimentAnalysis/data/q3"
foxnews  <- read.csv(file.path(data_dir, "fox_news_Final_with_sentiment.csv"))
nytimes  <- read.csv(file.path(data_dir, "new_york_times_Final_with_sentiment.csv"))
foxtitle <- read.csv(file.path(data_dir, "FoxNews_Sheikh_with_sentiment.csv"))
nytitle  <- read.csv(file.path(data_dir, "NYT_Sheikh_with_sentiment.csv"))

# Count co-occurring word pairs (bigrams) in a tweet data frame.
#
# foxnews: a data frame with a `text` column (parameter name kept for
#   backward compatibility; any of the loaded datasets can be passed).
# Returns a data frame with columns word1, word2, n sorted by n
#   descending.
#
# Fixes vs. original: the anti_join(stop_words) on the raw bigram tokens
# was a no-op (stop_words holds single words, so a "w1 w2" string never
# matches) — the effective filtering happens after separate(). The result
# is now ungrouped so callers don't receive a grouped tibble.
coocc_func <- function(foxnews){
  # Tokenize into bigrams and split into one column per word.
  bg_fox <- foxnews %>%
    unnest_tokens(word, text, token = "ngrams", n = 2) %>%
    separate(word, c("word1", "word2"), sep = " ")

  # Remove stop words and corpus-dominating noise tokens from either slot.
  avoid_list <- c("russia", "ukraine", "user", "http")
  filter_bg_fox <- bg_fox %>%
    filter(!word1 %in% stop_words$word) %>%
    filter(!word2 %in% stop_words$word) %>%
    filter(!word1 %in% avoid_list) %>%
    filter(!word2 %in% avoid_list)

  # Frequency of each distinct pair, most common first.
  count_bg <- filter_bg_fox %>%
    group_by(word1, word2) %>%
    tally(sort = TRUE) %>%
    ungroup()
  return(count_bg)
}
# Per-sentiment subsets of each outlet's tweets.
fox_pos <- foxnews %>% filter(label == "Positive")
fox_neg <- foxnews %>% filter(label == "Negative")
fox_neu <- foxnews %>% filter(label == "Neutral")

nyt_pos <- nytimes %>% filter(label == "Positive")
nyt_neg <- nytimes %>% filter(label == "Negative")
nyt_neu <- nytimes %>% filter(label == "Neutral")

# Inspect the result type of the bigram counts. The original used
# type(), which does not exist in R (a Python-ism); class() is the R
# equivalent.
class(coocc_func(foxnews))
df <- coocc_func(nytimes)
df$bigram <- paste(df$word1, df$word2, sep = " ")
head(df)
45 changes: 45 additions & 0 deletions DataAnalysis/Q3/MakeWordCloud.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Q1 wordcloud
library(pacman)
p_load(wordcloud)
p_load(tm)
p_load(dplyr)
p_load(ggplot2)

# Source files -- change the data directory here. Defined once so the
# absolute path is not repeated for every CSV.
data_dir <- "/Volumes/GoogleDrive/My Drive/Spring 2022/Data Science Methodology/UkraineConflictOnTwitter/SentimentAnalysis/data/q3"
foxnews  <- read.csv(file.path(data_dir, "fox_news_Final_with_sentiment.csv"))
nytimes  <- read.csv(file.path(data_dir, "new_york_times_Final_with_sentiment.csv"))
foxtitle <- read.csv(file.path(data_dir, "FoxNews_Sheikh_with_sentiment.csv"))
nytitle  <- read.csv(file.path(data_dir, "NYT_Sheikh_with_sentiment.csv"))

# Draw a unigram word cloud for one sentiment class of a dataset.
#
# dataset:   a data frame with `text` and `label` columns.
# sentiment: the label value to keep ("Positive", "Negative" or "Neutral").
# Side effect: renders the cloud on the active graphics device.
# Returns the result of wordcloud().
make_cloud <- function(dataset, sentiment){
  # BUG FIX: the original filtered the global `fox_cloud` (undefined in
  # this file) instead of the `dataset` argument, so both parameters were
  # effectively ignored.
  positive <- dataset[dataset$label == sentiment, ]

  # Remove non-ASCII content (emojis, curly quotes) before tokenizing.
  positive$text <- stringi::stri_trans_general(positive$text, "latin-ascii")
  positive$text <- gsub("[^\x01-\x7F]", "", positive$text)

  # Build a tm corpus and normalize: strip numbers/punctuation/extra
  # whitespace, lowercase, and drop English stop words.
  docs <- Corpus(VectorSource(positive$text))
  docs <- docs %>%
    tm_map(removeNumbers) %>%
    tm_map(removePunctuation) %>%
    tm_map(stripWhitespace)
  docs <- tm_map(docs, content_transformer(tolower))
  docs <- tm_map(docs, removeWords, stopwords("english"))
  # Remove topic words that would otherwise dominate every cloud.
  docs <- tm_map(docs, removeWords, c("russia", "ukraine", "user", "http"))

  # Term frequencies, most common first.
  dtm <- TermDocumentMatrix(docs)
  matrix <- as.matrix(dtm)
  words <- sort(rowSums(matrix), decreasing = TRUE)
  df <- data.frame(word = names(words), freq = words)

  # Fixed seed so the cloud layout is reproducible across runs.
  set.seed(1234)
  wc <- wordcloud(words = df$word, freq = df$freq, min.freq = 1,
                  max.words = 200, random.order = FALSE, rot.per = 0.35,
                  colors = brewer.pal(8, "Dark2"))
  return(wc)

}

make_cloud(foxnews, "Neutral")
36 changes: 36 additions & 0 deletions DataAnalysis/Q3/Q3RAnalysisSheikh.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ p_load(dplyr)
p_load(reshape2)
p_load(gridExtra)
p_load(stringr)
p_load(tidytext)
p_load(tidyr)
#reading data
foxnews <- read.csv("/Volumes/GoogleDrive/My Drive/Spring 2022/Data Science Methodology/UkraineConflictOnTwitter/SentimentAnalysis/data/q3/fox_news_Final_with_sentiment.csv")
nytimes <- read.csv("/Volumes/GoogleDrive/My Drive/Spring 2022/Data Science Methodology/UkraineConflictOnTwitter/SentimentAnalysis/data/q3/new_york_times_Final_with_sentiment.csv")
Expand Down Expand Up @@ -130,4 +132,38 @@ ggplot(allSent, aes(x=Sentiment, y=Percentage, fill=Source)) +



############################################
# Co-occurrence score

# Count co-occurring word pairs (bigrams) in a tweet data frame.
#
# foxnews: a data frame with a `text` column (parameter name kept for
#   backward compatibility; any of the loaded datasets can be passed).
# Returns a data frame with columns word1, word2, n sorted by n
#   descending.
#
# Fixes vs. original: the anti_join(stop_words) on the raw bigram tokens
# was a no-op (stop_words holds single words, so a "w1 w2" string never
# matches) — the effective filtering happens after separate(). The result
# is now ungrouped so callers don't receive a grouped tibble.
coocc_func <- function(foxnews){
  # Tokenize into bigrams and split into one column per word.
  bg_fox <- foxnews %>%
    unnest_tokens(word, text, token = "ngrams", n = 2) %>%
    separate(word, c("word1", "word2"), sep = " ")

  # Remove stop words and corpus-dominating noise tokens from either slot.
  avoid_list <- c("russia", "ukraine", "user", "http")
  filter_bg_fox <- bg_fox %>%
    filter(!word1 %in% stop_words$word) %>%
    filter(!word2 %in% stop_words$word) %>%
    filter(!word1 %in% avoid_list) %>%
    filter(!word2 %in% avoid_list)

  # Frequency of each distinct pair, most common first.
  count_bg <- filter_bg_fox %>%
    group_by(word1, word2) %>%
    tally(sort = TRUE) %>%
    ungroup()
  return(count_bg)
}
# Split each outlet's tweets by sentiment label so the subsets can be
# analysed separately later.
fox_pos <- filter(foxnews, label == "Positive")
fox_neg <- filter(foxnews, label == "Negative")
fox_neu <- filter(foxnews, label == "Neutral")

nyt_pos <- filter(nytimes, label == "Positive")
nyt_neg <- filter(nytimes, label == "Negative")
nyt_neu <- filter(nytimes, label == "Neutral")

# Top co-occurring word pairs for each outlet (results auto-print).
coocc_func(foxnews)
coocc_func(nytimes)



36 changes: 0 additions & 36 deletions DataAnalysis/Q3/Q3WordCloudPosFoxNews.R

This file was deleted.

0 comments on commit 552132b

Please sign in to comment.