Did some more analysis and polished some code

Bensas · May 27, 2022 · cfd2c15 · cfd2c15
1 parent 0901245
commit cfd2c15
Show file tree

Hide file tree

Showing 9 changed files with 315 additions and 1,275 deletions.
diff --git a/DataAnalysis/Q3/BigramWordCloud.R b/DataAnalysis/Q3/BigramWordCloud.R
@@ -52,7 +52,4 @@ nyt_pos <- nytimes %>% filter(label=="Positive")
 nyt_neg <- nytimes %>% filter(label=="Negative")
 nyt_neu <- nytimes %>% filter(label=="Neutral")
 
-type(coocc_func(foxnews))
-df <- coocc_func(nytimes)
-df$bigram <- paste(df$word1, df$word2, sep=" ")
-head(df)
+filter_by_date <- 
diff --git a/DataAnalysis/Q3/Cooccurance.R b/DataAnalysis/Q3/Cooccurance.R
@@ -14,6 +14,15 @@ foxnews <- read.csv("/Volumes/GoogleDrive/My Drive/Spring 2022/Data Science Meth
 nytimes <- read.csv("/Volumes/GoogleDrive/My Drive/Spring 2022/Data Science Methodology/UkraineConflictOnTwitter/SentimentAnalysis/data/q3/new_york_times_Final_with_sentiment.csv")
 foxtitle <- read.csv("/Volumes/GoogleDrive/My Drive/Spring 2022/Data Science Methodology/UkraineConflictOnTwitter/SentimentAnalysis/data/q3/FoxNews_Sheikh_with_sentiment.csv")
 nytitle <- read.csv("/Volumes/GoogleDrive/My Drive/Spring 2022/Data Science Methodology/UkraineConflictOnTwitter/SentimentAnalysis/data/q3/NYT_Sheikh_with_sentiment.csv")
+####pre-processing#####
+nytimes <- nytimes %>% select(Date, text, label, score)
+nytimes$Date <- sub(" .*", "", nytimes$Date) %>% as.Date(format="%Y-%m-%d", tz="UTC")
+nytimes$text <- tolower(nytimes$text)
+
+foxnews <- foxnews %>% select(Date, text, label, score)
+foxnews$Date <- sub(" .*", "", foxnews$Date) %>% as.Date(format="%Y-%m-%d", tz="UTC")
+foxnews$text <- tolower(foxnews$text)
+##########
 
 coocc_func <- function(foxnews){
   fox_unn <- foxnews %>% unnest_tokens(word, text, token = "ngrams",
@@ -42,6 +51,8 @@ nyt_pos <- nytimes %>% filter(label=="Positive")
 nyt_neg <- nytimes %>% filter(label=="Negative")
 nyt_neu <- nytimes %>% filter(label=="Neutral")
 
+int_point <- foxnews %>% filter(Date > "2022-01-05")
+int_point <- int_point %>% filter(Date < "2022-01-20")
 type(coocc_func(foxnews))
 df <- coocc_func(nytimes)
 df$bigram <- paste(df$word1, df$word2, sep=" ")

diff --git a/DataAnalysis/Q3/FoxNewsTitleSentimentByWeek.png b/DataAnalysis/Q3/FoxNewsTitleSentimentByWeek.png
diff --git a/DataAnalysis/Q3/MakeWordCloud.R b/DataAnalysis/Q3/MakeWordCloud.R
@@ -12,7 +12,7 @@ foxtitle <- read.csv("/Volumes/GoogleDrive/My Drive/Spring 2022/Data Science Met
 nytitle <- read.csv("/Volumes/GoogleDrive/My Drive/Spring 2022/Data Science Methodology/UkraineConflictOnTwitter/SentimentAnalysis/data/q3/NYT_Sheikh_with_sentiment.csv")
 
 make_cloud <- function(dataset, sentiment){
-  positive <- fox_cloud[fox_cloud$label == sentiment,] 
+  positive <- dataset[dataset$label == sentiment,] 
 
   # remove non-ascii words
   positive$text <- stringi::stri_trans_general(positive$text, "latin-ascii")
@@ -43,3 +43,4 @@ make_cloud <- function(dataset, sentiment){
 }
 
 make_cloud(foxnews, "Neutral")
+
diff --git a/DataAnalysis/Q3/NYTimesTitleSentiment.png b/DataAnalysis/Q3/NYTimesTitleSentiment.png
diff --git a/DataAnalysis/Q3/Q3LinePlot.R b/DataAnalysis/Q3/Q3LinePlot.R
@@ -0,0 +1,57 @@
+library(ggplot2)
+library(dplyr)
+
+# Input CSV files -- change the file paths here; only foxtitle is read, the other datasets are commented out
+#q1 <-  read.csv('/Volumes/GoogleDrive/My Drive/Spring 2022/Data Science Methodology/UkraineConflictOnTwitter/SentimentAnalysis/data/q1/all_tweets_emotions_with_sentiment.csv')
+#foxnews <- read.csv("/Volumes/GoogleDrive/My Drive/Spring 2022/Data Science Methodology/UkraineConflictOnTwitter/SentimentAnalysis/data/q3/fox_news_Final_with_sentiment.csv")
+#nytimes <- read.csv("/Volumes/GoogleDrive/My Drive/Spring 2022/Data Science Methodology/UkraineConflictOnTwitter/SentimentAnalysis/data/q3/new_york_times_Final_with_sentiment.csv")
+foxtitle <- read.csv("/Volumes/GoogleDrive/My Drive/Spring 2022/Data Science Methodology/UkraineConflictOnTwitter/SentimentAnalysis/data/q3/FoxNews_Sheikh_with_sentiment.csv")
+#nytitle <- read.csv("/Volumes/GoogleDrive/My Drive/Spring 2022/Data Science Methodology/UkraineConflictOnTwitter/SentimentAnalysis/data/q3/NYT_Sheikh_with_sentiment.csv")
+q1 <- foxtitle
+
+
+## sentiment line plot (drawn with geom_line below; this is not a stacked bar plot) ##
+
+# drop everything from the first space onward, then parse the remaining string as a Date in %Y-%m-%d form
+q1$Date <- sub(" .*", "", q1$Date)
+q1$Date <- as.Date(q1$Date, format="%Y-%m-%d", tz="UTC")
+
+# create YearMonth column: the first 7 characters of the Date, e.g. "2022-01"
+q1$YearMonth <- substr(q1$Date, 1,7)
+
+# create Week column: integer index of the 1-week bin each Date falls into (labels = FALSE), then sort rows by Date
+q1 <- q1 %>% 
+  mutate(Week = cut.Date(q1$Date, breaks = "1 week", labels = FALSE)) %>% 
+  arrange(q1$Date)
+
+# share of each label within a week: freq is a proportion rounded to 3 decimals (not a percentage); relies on summarise() dropping the last grouping level so sum(cnt) is the weekly total
+sentiment_by_week <- q1 %>%
+  group_by(Week, label) %>%
+  summarise(cnt = n()) %>%
+  mutate(freq = round(cnt / sum(cnt), 3)) %>% 
+  arrange(Week)
+
+# share of each label within a month: same computation as above, grouped by YearMonth instead of Week
+sentiment_by_month <- q1 %>%
+  group_by(YearMonth, label) %>%
+  summarise(cnt = n()) %>%
+  mutate(freq = round(cnt / sum(cnt), 3)) %>% 
+  arrange(YearMonth)
+
+# graph (by week): one line per label -- NOTE(review): the title says "NY Times" but q1 is loaded from the FoxNews CSV above, and the x-axis date labels are hard-coded; confirm both
+ggplot(sentiment_by_week, aes(fill=label, y=freq, x=Week, col=label)) + 
+  geom_line(lwd=1.5) +
+  theme_minimal() + 
+  theme(panel.background = element_blank()) +
+  ggtitle("NY Times Average Emotions of Tweets by Week") +
+  labs(x='Week', y='Frequency') +
+  theme(plot.title = element_text(hjust = 0.5, size=15, face='bold', margin = margin(t = 10, r = 0 , b = 10, l = 0))) +
+  theme(axis.title.x = element_text(face='bold', size=10, margin = margin(t = 10, b = 10, r = 0, l = 0))) +
+  theme(axis.title.y = element_text(face='bold', size=10, margin = margin(t = 0, b = 0, r = 10, l = 10))) +
+  theme(axis.text.x = element_text(angle=30, size=10)) +
+  theme(axis.text.y = element_text(size=10)) +
+  theme(legend.title = element_text(face='bold', size=10)) +
+  theme(legend.text = element_text(size=8)) +
+  scale_x_continuous(breaks=c(1,11,16,23), labels=c("Dec 24th", "Feb 24th", "April 7th", "May 24th"))
+#scale_color_manual('label', values=c('#d9534f', '#f0ad4e', '#5cb85c', '#5cb86c', '#5cb87c', '#5cb88c', '#5cb89c'))
+
diff --git a/DataAnalysis/Q3/Q3RAnalysisSheikh.R b/DataAnalysis/Q3/Q3RAnalysisSheikh.R
@@ -1,3 +1,5 @@
+#This file is a mess. I only used this to make the graphs
+
 #Research Question 3
 library(pacman) #my package manager
 
@@ -87,6 +89,7 @@ ggplot(sentiment_by_week_combined, aes(fill=source, y=freq, x=Week)) +
 
 
 ####Checking the overall sentiment of the news
+
 foxTotal <- table(foxnews$label)/length(foxnews$label) * 100
 nyTotal <- table(nytimes$label)/length(nytimes$label) * 100
 
@@ -107,9 +110,6 @@ ggplot(nyTotal, aes(x="", y=Percentage, fill=Sentiment)) +
   theme(panel.background = element_blank())
 
 
-gfox1
-gnyt1
-
 #barplot
 gfox2 <- ggplot(foxTotal, aes(x=Sentiment, y=Percentage, fill=Sentiment)) +
   geom_bar(stat="identity", width = 1, color="white") +
@@ -128,6 +128,8 @@ ggplot(allSent, aes(x=Sentiment, y=Percentage, fill=Source)) +
   ggtitle("Media Outlet Tweet and Reply Sentiment Comparison") +
   scale_fill_manual(values=c("#fc4949", "#1a94eb"))
 
+###############################################################################
+
 
 
 

diff --git a/DataAnalysis/Q3/Weekly Ukraine Related Tweets.png b/DataAnalysis/Q3/Weekly Ukraine Related Tweets.png