Added Q2 before/after barplots and updated scripts

Bensas · May 26, 2022 · 36a7e97 · 36a7e97
1 parent 8c53d11
commit 36a7e97
Show file tree

Hide file tree

Showing 8 changed files with 52 additions and 11 deletions.
diff --git a/DataAnalysis/Q2/before_after_barplot_1_week_nato.png b/DataAnalysis/Q2/before_after_barplot_1_week_nato.png
diff --git a/DataAnalysis/Q2/before_after_barplot_1_week_putin.png b/DataAnalysis/Q2/before_after_barplot_1_week_putin.png
diff --git a/DataAnalysis/Q2/before_after_barplot_1_week_zelensky.png b/DataAnalysis/Q2/before_after_barplot_1_week_zelensky.png
diff --git a/DataAnalysis/Q2/before_after_barplot_2_weeks_nato.png b/DataAnalysis/Q2/before_after_barplot_2_weeks_nato.png
diff --git a/DataAnalysis/Q2/before_after_barplot_2_weeks_putin.png b/DataAnalysis/Q2/before_after_barplot_2_weeks_putin.png
diff --git a/DataAnalysis/Q2/before_after_barplot_2_weeks_zelensky.png b/DataAnalysis/Q2/before_after_barplot_2_weeks_zelensky.png
diff --git a/DataAnalysis/Q2/q2_before_after_russian_tweets.R b/DataAnalysis/Q2/q2_before_after_russian_tweets.R
@@ -12,7 +12,7 @@ split_before_after_date <- function(data, date) {
   data$Date <- sub(" .*", "", data$Date)
   data$Date <- as.Date(data$Date, format="%Y-%m-%d", tz="UTC")
 
-  data <- subset(data, Date >= as.Date('2022-02-25') & Date <= as.Date('2022-03-10'))
+  data <- subset(data, Date >= as.Date('2022-02-18') & Date <= as.Date('2022-03-17'))
   data
 
   # Tweets before and after twitter ban
@@ -29,40 +29,61 @@ conduct_chisq_before_after <- function(data) {
   after_ban = data.frame(result[2])
 
   # calculate frequency of label
-  info_matrix = create_freq_matrix_2_groups(before_ban, after_ban, "Before Twitter Ban", "After Twitter Ban")
+  info_matrix = create_count_matrix_2_groups(before_ban, after_ban, "Before Twitter Ban", "After Twitter Ban")
   print(info_matrix)
 
   chisq.test(info_matrix)
 }
 
+#' Plot sentiment shares before and after the Twitter ban as stacked bars.
+#'
+#' Splits `data` at `ban_date` with split_before_after_date(), turns both
+#' halves into per-label relative frequencies with
+#' create_freq_matrix_2_groups(), prints that matrix and draws it with
+#' barplot().
+#'
+#' @param data Data frame of tweets; passed unchanged to
+#'   split_before_after_date().
+#' @param plot_title Main title of the plot.
+#' @param ban_date Date separating the "before" and "after" groups.
+#'   Defaults to 2022-03-04.
+#' @return The value of barplot() (the bar midpoints), invisibly.
+plot_barplot_before_after <- function(data, plot_title,
+                                      ban_date = as.Date("2022-03-04")) {
+  result <- split_before_after_date(data, ban_date)
+  before_ban <- data.frame(result[1])
+  after_ban <- data.frame(result[2])
+
+  # Relative frequency of each sentiment label within each period
+  info_matrix <- create_freq_matrix_2_groups(before_ban, after_ban, "Before Twitter Ban", "After Twitter Ban")
+  print(info_matrix)
+
+  # One colour per matrix row (negative, neutral, positive). The legend is
+  # built from the row names so the colours can be read off the plot; the
+  # x-axis is widened so the legend does not cover the bars.
+  barplot(info_matrix,
+          main = plot_title,
+          col = c('#d9534f', '#f0ad4e', '#5cb85c'),
+          legend.text = TRUE,
+          args.legend = list(x = "topright", bty = "n"),
+          xlim = c(0, ncol(info_matrix) + 2))
+}
+
 # Russian
 
 # Input CSV files -- adjust the file paths below to match your local checkout
 q2_zelensky_russian <-  read.csv('UkraineConflictOnTwitter/SentimentAnalysis/data/q2/zelensky_russian_with_sentiment.csv')
 conduct_chisq_before_after(q2_zelensky_russian)
-# p-value = 0.4434 (insignificant)
+plot_barplot_before_after(q2_zelensky_russian, "Sentiment of russian tweets containing \"Zelensky\"")
+# chi-squared p-value, 1 week before/after the ban = 0.4434 (not significant)
+# chi-squared p-value, 2 weeks before/after the ban = 0.0744 (not significant)
+
 
 q2_putin_russian <-  read.csv('UkraineConflictOnTwitter/SentimentAnalysis/data/q2/putin_russian_with_sentiment.csv')
 conduct_chisq_before_after(q2_putin_russian)
-# p-value = 0.4486 (insignificant)
+plot_barplot_before_after(q2_putin_russian, "Sentiment of russian tweets containing \"Putin\"")
+# chi-squared p-value, 1 week before/after the ban = 0.4486 (not significant)
+# chi-squared p-value, 2 weeks before/after the ban = 0.1378 (not significant)
 
 q2_nato_russian <-  read.csv('UkraineConflictOnTwitter/SentimentAnalysis/data/q2/nato_russian_with_sentiment.csv')
 conduct_chisq_before_after(q2_nato_russian)
-# p-value = 0.05422 (insignificant)
+plot_barplot_before_after(q2_nato_russian, "Sentiment of russian tweets containing \"NATO\"")
+# chi-squared p-value, 1 week before/after the ban = 0.05422 (not significant)
+# chi-squared p-value, 2 weeks before/after the ban = 0.00012 (significant)
 
 # English -- NOTE(review): only 1-week p-values are recorded for the datasets below
 
 q2_zelensky_english <-  read.csv('UkraineConflictOnTwitter/SentimentAnalysis/data/q2/zelensky_english_with_sentiment.csv')
 conduct_chisq_before_after(q2_zelensky_english)
-# p-value = 7.735e-14 (significant)
+plot_barplot_before_after(q2_zelensky_english, "Sentiment of english tweets containing \"Zelensky\"")
+# chi-squared p-value, 1 week before/after the ban = 7.735e-14 (significant)
 
 q2_putin_english <-  read.csv('UkraineConflictOnTwitter/SentimentAnalysis/data/q2/putin_english_with_sentiment.csv')
 conduct_chisq_before_after(q2_putin_english)
-# p-value = p-value = 0.1959 (insignificant)
+plot_barplot_before_after(q2_putin_english, "Sentiment of english tweets containing \"Putin\"")
+# chi-squared p-value, 1 week before/after the ban = 0.1959 (not significant)
 
 q2_nato_english <-  read.csv('UkraineConflictOnTwitter/SentimentAnalysis/data/q2/nato_english_with_sentiment.csv')
 conduct_chisq_before_after(q2_nato_english)
-# p-value = 0.1518 (insignificant)
+plot_barplot_before_after(q2_nato_english, "Sentiment of english tweets containing \"NATO\"")
+# chi-squared p-value, 1 week before/after the ban = 0.1518 (not significant)
 
 
 
diff --git a/DataAnalysis/utils.R b/DataAnalysis/utils.R
@@ -4,14 +4,34 @@ library(ggplot2)
 library(dplyr)
 
 
-create_freq_matrix_2_groups <- function(group_1, group_2, title_1, title_2) {
+#' Build a 3 x 2 matrix of raw tweet counts per sentiment label.
+#'
+#' Rows are the three sentiment labels, columns are the two groups. The
+#' result holds counts (not proportions).
+#'
+#' @param group_1,group_2 Data frames with a `label` column.
+#' @param title_1,title_2 Column names for the two groups.
+#' @return An integer matrix with rows "Negative tweets", "Neutral Tweets",
+#'   "Positive tweets" and columns `title_1`, `title_2`.
+create_count_matrix_2_groups <- function(group_1, group_2, title_1, title_2) {
   sentiment_group_1 <- group_1 %>%
     group_by(label) %>%
     summarise(freq = n())
-  
+
   sentiment_group_2 <- group_2 %>%
     group_by(label) %>%
     summarise(freq = n())
+
+  # Align both summaries on the labels seen in either group. Picking counts
+  # by position would shift the rows (or produce NA) whenever one group has
+  # no tweets for some label.
+  # NOTE(review): assumes the sorted labels come out in the order
+  # negative, neutral, positive -- confirm against the data.
+  label_levels <- sort(
+    union(as.character(sentiment_group_1$label),
+          as.character(sentiment_group_2$label)),
+    method = "radix"
+  )
+  if (length(label_levels) != 3) {
+    stop("Expected exactly 3 sentiment labels, found ", length(label_levels),
+         call. = FALSE)
+  }
+
+  # Counts for one group in `label_levels` order; absent labels count as 0
+  counts_for <- function(summary_tbl) {
+    counts <- summary_tbl$freq[match(label_levels, as.character(summary_tbl$label))]
+    counts[is.na(counts)] <- 0L
+    counts
+  }
+
+  info_matrix <- cbind(counts_for(sentiment_group_1), counts_for(sentiment_group_2))
+  dimnames(info_matrix) <- list(c("Negative tweets", "Neutral Tweets", "Positive tweets"), c(title_1, title_2))
+  info_matrix
+}
+
+create_freq_matrix_2_groups <- function(group_1, group_2, title_1, title_2) {
+  sentiment_group_1 <- group_1 %>%
+    group_by(label) %>%
+    summarise(cnt = n()) %>%
+    mutate(freq = round(cnt / sum(cnt), 3))
+
+  sentiment_group_2 <- group_2 %>%
+    group_by(label) %>%
+    summarise(cnt = n()) %>%
+    mutate(freq = round(cnt / sum(cnt), 3))
 
   negative_freq = c(sentiment_group_1$freq[1], sentiment_group_2$freq[1])
   neutral_freq = c(sentiment_group_1$freq[2], sentiment_group_2$freq[2])
@@ -20,4 +40,4 @@ create_freq_matrix_2_groups <- function(group_1, group_2, title_1, title_2) {
   info_matrix = matrix(matrix_data, nrow=3, ncol=2, 
                        dimnames= list(c("Negative tweets", "Neutral Tweets", "Positive tweets"), c(title_1, title_2)))
   return(info_matrix)
-}
+}