Skip to content

Commit

Permalink
added new R files containing data analysis functions
Browse files Browse the repository at this point in the history
  • Loading branch information
sheikhshafayat committed May 26, 2022
1 parent 6bf012f commit 552132b
Show file tree
Hide file tree
Showing 5 changed files with 187 additions and 36 deletions.
58 changes: 58 additions & 0 deletions DataAnalysis/Q3/BigramWordCloud.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
library(pacman) #my package manager

#load necessary packages
p_load(ggplot2)
p_load(dplyr)
p_load(reshape2)
p_load(gridExtra)
p_load(stringr)
p_load(tidytext)
p_load(tidyr)
p_load(wordcloud)
p_load(tm)

#### Loading all the data
# All four CSVs live in the same q3 directory; build the prefix once so a
# new machine only needs one edit (the original repeated the absolute path
# four times).
data_dir <- "/Volumes/GoogleDrive/My Drive/Spring 2022/Data Science Methodology/UkraineConflictOnTwitter/SentimentAnalysis/data/q3"
foxnews  <- read.csv(file.path(data_dir, "fox_news_Final_with_sentiment.csv"))
nytimes  <- read.csv(file.path(data_dir, "new_york_times_Final_with_sentiment.csv"))
foxtitle <- read.csv(file.path(data_dir, "FoxNews_Sheikh_with_sentiment.csv"))
nytitle  <- read.csv(file.path(data_dir, "NYT_Sheikh_with_sentiment.csv"))

# Draw a bigram (two-word) word cloud from a data frame of tweets.
#
# foxnews: a data frame with a `text` column (parameter name kept for
#   backward compatibility; any of the loaded datasets can be passed).
# Side effect: renders the cloud on the active graphics device.
# Returns the result of wordcloud() (invisible/NULL by convention).
#
# Fixes vs. original: the anti_join(stop_words) on the raw bigram tokens
# was a no-op (stop_words holds single words, bigram strings never match)
# and only emitted a spurious join message — the real stop-word filtering
# happens below, after the bigram is split. Also ungroups before
# converting, and seeds the RNG for a reproducible layout, matching
# make_cloud() in this commit.
bigram_wc <- function(foxnews){
  # Tokenize text into bigrams, then split "w1 w2" into two columns so
  # each half can be filtered independently.
  bg_fox <- foxnews %>%
    unnest_tokens(word, text, token = "ngrams", n = 2) %>%
    separate(word, c("word1", "word2"), sep = " ")

  # Drop bigrams containing stop words or corpus-dominating noise tokens.
  avoid_list <- c("russia", "ukraine", "user", "http", "fox", "york")
  filter_bg_fox <- bg_fox %>%
    filter(!word1 %in% stop_words$word) %>%
    filter(!word2 %in% stop_words$word) %>%
    filter(!word1 %in% avoid_list) %>%
    filter(!word2 %in% avoid_list)

  # Count each distinct bigram, most frequent first; ungroup so no
  # grouping metadata leaks into the plain data frame.
  count_bg <- filter_bg_fox %>%
    group_by(word1, word2) %>%
    tally(sort = TRUE) %>%
    ungroup() %>%
    as.data.frame()

  # Re-join the two halves for display in the cloud.
  count_bg$bigram <- paste(count_bg$word1, count_bg$word2, sep = " ")

  set.seed(1234)  # reproducible layout (consistent with make_cloud)
  wc <- wordcloud(words = count_bg$bigram, freq = count_bg$n,
                  min.freq = 1, max.words = 200,
                  random.order = FALSE, rot.per = 0.35,
                  colors = brewer.pal(8, "Dark2"))
  return(wc)
}

# Per-sentiment subsets of each outlet's tweets.
fox_pos <- foxnews %>% filter(label == "Positive")
fox_neg <- foxnews %>% filter(label == "Negative")
fox_neu <- foxnews %>% filter(label == "Neutral")

nyt_pos <- nytimes %>% filter(label == "Positive")
nyt_neg <- nytimes %>% filter(label == "Negative")
nyt_neu <- nytimes %>% filter(label == "Neutral")

# Draw the bigram clouds. The original tail was copy-pasted from
# Cooccurance.R: it called coocc_func(), which is not defined in this
# file, and type(), which is not an R function — this file's entry point
# is bigram_wc().
bigram_wc(foxnews)
bigram_wc(nytimes)
48 changes: 48 additions & 0 deletions DataAnalysis/Q3/Cooccurance.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
library(pacman) #my package manager

#load necessary packages
p_load(ggplot2)
p_load(dplyr)
p_load(reshape2)
p_load(gridExtra)
p_load(stringr)
p_load(tidytext)
p_load(tidyr)

#### Loading all the data
# Shared q3 data directory; defined once so the absolute path is not
# repeated in every read.csv call.
data_dir <- "/Volumes/GoogleDrive/My Drive/Spring 2022/Data Science Methodology/UkraineConflictOnTwitter/SentimentAnalysis/data/q3"
foxnews  <- read.csv(file.path(data_dir, "fox_news_Final_with_sentiment.csv"))
nytimes  <- read.csv(file.path(data_dir, "new_york_times_Final_with_sentiment.csv"))
foxtitle <- read.csv(file.path(data_dir, "FoxNews_Sheikh_with_sentiment.csv"))
nytitle  <- read.csv(file.path(data_dir, "NYT_Sheikh_with_sentiment.csv"))

# Count co-occurring word pairs (bigrams) in a tweet data frame.
#
# foxnews: a data frame with a `text` column (parameter name kept for
#   backward compatibility; any of the loaded datasets can be passed).
# Returns a data frame with columns word1, word2, n sorted by n
#   descending.
#
# Fixes vs. original: the anti_join(stop_words) on the raw bigram tokens
# was a no-op (stop_words holds single words, so a "w1 w2" string never
# matches) — the effective filtering happens after separate(). The result
# is now ungrouped so callers don't receive a grouped tibble.
coocc_func <- function(foxnews){
  # Tokenize into bigrams and split into one column per word.
  bg_fox <- foxnews %>%
    unnest_tokens(word, text, token = "ngrams", n = 2) %>%
    separate(word, c("word1", "word2"), sep = " ")

  # Remove stop words and corpus-dominating noise tokens from either slot.
  avoid_list <- c("russia", "ukraine", "user", "http")
  filter_bg_fox <- bg_fox %>%
    filter(!word1 %in% stop_words$word) %>%
    filter(!word2 %in% stop_words$word) %>%
    filter(!word1 %in% avoid_list) %>%
    filter(!word2 %in% avoid_list)

  # Frequency of each distinct pair, most common first.
  count_bg <- filter_bg_fox %>%
    group_by(word1, word2) %>%
    tally(sort = TRUE) %>%
    ungroup()
  return(count_bg)
}
# Per-sentiment subsets of each outlet's tweets.
fox_pos <- foxnews %>% filter(label == "Positive")
fox_neg <- foxnews %>% filter(label == "Negative")
fox_neu <- foxnews %>% filter(label == "Neutral")

nyt_pos <- nytimes %>% filter(label == "Positive")
nyt_neg <- nytimes %>% filter(label == "Negative")
nyt_neu <- nytimes %>% filter(label == "Neutral")

# Inspect the result type of the bigram counts. The original used
# type(), which does not exist in R (a Python-ism); class() is the R
# equivalent.
class(coocc_func(foxnews))
df <- coocc_func(nytimes)
df$bigram <- paste(df$word1, df$word2, sep = " ")
head(df)
45 changes: 45 additions & 0 deletions DataAnalysis/Q3/MakeWordCloud.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Q1 wordcloud
library(pacman)
p_load(wordcloud)
p_load(tm)
p_load(dplyr)
p_load(ggplot2)

# Source files -- change the data directory here. Defined once so the
# absolute path is not repeated for every CSV.
data_dir <- "/Volumes/GoogleDrive/My Drive/Spring 2022/Data Science Methodology/UkraineConflictOnTwitter/SentimentAnalysis/data/q3"
foxnews  <- read.csv(file.path(data_dir, "fox_news_Final_with_sentiment.csv"))
nytimes  <- read.csv(file.path(data_dir, "new_york_times_Final_with_sentiment.csv"))
foxtitle <- read.csv(file.path(data_dir, "FoxNews_Sheikh_with_sentiment.csv"))
nytitle  <- read.csv(file.path(data_dir, "NYT_Sheikh_with_sentiment.csv"))

# Draw a unigram word cloud for one sentiment class of a dataset.
#
# dataset:   a data frame with `text` and `label` columns.
# sentiment: the label value to keep ("Positive", "Negative" or "Neutral").
# Side effect: renders the cloud on the active graphics device.
# Returns the result of wordcloud().
make_cloud <- function(dataset, sentiment){
  # BUG FIX: the original filtered the global `fox_cloud` (undefined in
  # this file) instead of the `dataset` argument, so both parameters were
  # effectively ignored.
  positive <- dataset[dataset$label == sentiment, ]

  # Remove non-ASCII content (emojis, curly quotes) before tokenizing.
  positive$text <- stringi::stri_trans_general(positive$text, "latin-ascii")
  positive$text <- gsub("[^\x01-\x7F]", "", positive$text)

  # Build a tm corpus and normalize: strip numbers/punctuation/extra
  # whitespace, lowercase, and drop English stop words.
  docs <- Corpus(VectorSource(positive$text))
  docs <- docs %>%
    tm_map(removeNumbers) %>%
    tm_map(removePunctuation) %>%
    tm_map(stripWhitespace)
  docs <- tm_map(docs, content_transformer(tolower))
  docs <- tm_map(docs, removeWords, stopwords("english"))
  # Remove topic words that would otherwise dominate every cloud.
  docs <- tm_map(docs, removeWords, c("russia", "ukraine", "user", "http"))

  # Term frequencies, most common first.
  dtm <- TermDocumentMatrix(docs)
  matrix <- as.matrix(dtm)
  words <- sort(rowSums(matrix), decreasing = TRUE)
  df <- data.frame(word = names(words), freq = words)

  # Fixed seed so the cloud layout is reproducible across runs.
  set.seed(1234)
  wc <- wordcloud(words = df$word, freq = df$freq, min.freq = 1,
                  max.words = 200, random.order = FALSE, rot.per = 0.35,
                  colors = brewer.pal(8, "Dark2"))
  return(wc)

}

make_cloud(foxnews, "Neutral")
36 changes: 36 additions & 0 deletions DataAnalysis/Q3/Q3RAnalysisSheikh.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ p_load(dplyr)
p_load(reshape2)
p_load(gridExtra)
p_load(stringr)
p_load(tidytext)
p_load(tidyr)
#reading data
foxnews <- read.csv("/Volumes/GoogleDrive/My Drive/Spring 2022/Data Science Methodology/UkraineConflictOnTwitter/SentimentAnalysis/data/q3/fox_news_Final_with_sentiment.csv")
nytimes <- read.csv("/Volumes/GoogleDrive/My Drive/Spring 2022/Data Science Methodology/UkraineConflictOnTwitter/SentimentAnalysis/data/q3/new_york_times_Final_with_sentiment.csv")
Expand Down Expand Up @@ -130,4 +132,38 @@ ggplot(allSent, aes(x=Sentiment, y=Percentage, fill=Source)) +



############################################
# Co-occurrence score

# Count co-occurring word pairs (bigrams) in a tweet data frame.
#
# foxnews: a data frame with a `text` column (parameter name kept for
#   backward compatibility; any of the loaded datasets can be passed).
# Returns a data frame with columns word1, word2, n sorted by n
#   descending.
#
# Fixes vs. original: the anti_join(stop_words) on the raw bigram tokens
# was a no-op (stop_words holds single words, so a "w1 w2" string never
# matches) — the effective filtering happens after separate(). The result
# is now ungrouped so callers don't receive a grouped tibble.
coocc_func <- function(foxnews){
  # Tokenize into bigrams and split into one column per word.
  bg_fox <- foxnews %>%
    unnest_tokens(word, text, token = "ngrams", n = 2) %>%
    separate(word, c("word1", "word2"), sep = " ")

  # Remove stop words and corpus-dominating noise tokens from either slot.
  avoid_list <- c("russia", "ukraine", "user", "http")
  filter_bg_fox <- bg_fox %>%
    filter(!word1 %in% stop_words$word) %>%
    filter(!word2 %in% stop_words$word) %>%
    filter(!word1 %in% avoid_list) %>%
    filter(!word2 %in% avoid_list)

  # Frequency of each distinct pair, most common first.
  count_bg <- filter_bg_fox %>%
    group_by(word1, word2) %>%
    tally(sort = TRUE) %>%
    ungroup()
  return(count_bg)
}
# Split each outlet's tweets by sentiment label so the subsets can be
# analysed separately later.
fox_pos <- filter(foxnews, label == "Positive")
fox_neg <- filter(foxnews, label == "Negative")
fox_neu <- filter(foxnews, label == "Neutral")

nyt_pos <- filter(nytimes, label == "Positive")
nyt_neg <- filter(nytimes, label == "Negative")
nyt_neu <- filter(nytimes, label == "Neutral")

# Top co-occurring word pairs for each outlet (results auto-print).
coocc_func(foxnews)
coocc_func(nytimes)



36 changes: 0 additions & 36 deletions DataAnalysis/Q3/Q3WordCloudPosFoxNews.R

This file was deleted.

0 comments on commit 552132b

Please sign in to comment.