Tweet War
Trump VS Biden
US Election 2020 Tweets War: Can a US election be determined by tweets? The 2020 US election took place on 3rd November 2020, and its impact on the world will no doubt be large, irrespective of which candidate is elected!
The tweets were collected using the Twitter API statuses_lookup endpoint and snscrape for keywords, with the original intention of updating the dataset daily so that the timeframe would eventually cover 15.10.2020 to 04.11.2020.
Needed libraries
library(tidyverse)
library(dplyr)
library(ggplot2)
library(wordcloud)
library(tm)
library(tidytext)
library(SnowballC)
library(R.utils)
library(stringr)
library(stopwords)
library(textdata)
Import the datasets
l2keep <- 100000  # keep only the last 100,000 tweets from each file
# read the header row separately, then skip to the tail of each file
column_namesDT <- as.vector(t(read.csv("hashtag_donaldtrump.csv", header=FALSE, colClasses='character', nrows=1)))
nLDT <- countLines("hashtag_donaldtrump.csv")
dfDT <- read.csv("hashtag_donaldtrump.csv", header=FALSE, col.names=column_namesDT, skip=nLDT-l2keep)
column_namesJB <- as.vector(t(read.csv("hashtag_joebiden.csv", header=FALSE, colClasses='character', nrows=1)))
nLJB <- countLines("hashtag_joebiden.csv")
dfJB <- read.csv("hashtag_joebiden.csv", header=FALSE, col.names=column_namesJB, skip=nLJB-l2keep)
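Before cleaning, it is worth a quick sanity check that both samples imported with the expected shape (a minimal check, assuming the two Kaggle CSVs are in the working directory):
# confirm row counts and column layout of the two samples
dim(dfDT)
dim(dfJB)
glimpse(dfDT)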
Clean the Data: Donald Trump Tweets
######### Turn text to tokens
tokensDT <- dfDT %>%
unnest_tokens(output = word, input = tweet)
###########Show common words
tokensDT %>%
count(word,
sort = TRUE)
######## Remove stop words
sw = get_stopwords()
cleaned_tokensDT <- tokensDT %>%
filter(!word %in% sw$word)
######### Remove numbers
numsDT <- cleaned_tokensDT %>%
filter(str_detect(word, "^[0-9]")) %>%
select(word) %>% unique()
cleaned_tokensDT <- cleaned_tokensDT %>%
filter(!word %in% numsDT$word)
###Remove rare words
rareDT <- cleaned_tokensDT %>%
count(word) %>%
filter(n<150) %>%
select(word) %>% unique()
cleaned_tokensDT <- cleaned_tokensDT %>%
filter(!word %in% rareDT$word)
cleaned_tokensDT = cleaned_tokensDT %>%
filter(!word %in% c("https", "t.co","ðÿ","à","ø"))
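As a quick check that the cleaning steps behaved as intended, the most frequent tokens that survive the filters can be listed (a small sketch using the word column created by unnest_tokens above):
# top 10 remaining tokens after stop-word, number and rare-word removal
cleaned_tokensDT %>%
count(word, sort = TRUE) %>%
head(10)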
Plot Tokens
# This histogram shows how many distinct words occur at each frequency in the Donald Trump tweets
#########Plot tokens
cleaned_tokensDT %>%
count(word, sort = T) %>%
rename(word_freq = n) %>%
ggplot(aes(x=word_freq)) +
geom_histogram(aes(y=..count..), color="black", fill="blue", alpha=0.3) +
scale_x_continuous(breaks=c(0:5,10,100,500,10e3), trans="log1p", expand=c(0,0)) +
scale_y_continuous(breaks=c(0,100,1000,5e3,10e3,5e4,10e4,4e4), expand=c(0,0)) +
theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
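The `stat_bin()` message is only a reminder that the default of 30 bins was used; passing an explicit `binwidth` silences it (the value below is an illustrative choice on the log1p-transformed axis, not a tuned one):
# same histogram with an explicit binwidth instead of the default 30 bins
cleaned_tokensDT %>%
count(word, sort = T) %>%
rename(word_freq = n) %>%
ggplot(aes(x=word_freq)) +
geom_histogram(binwidth = 0.25, color="black", fill="blue", alpha=0.3) +
scale_x_continuous(breaks=c(0:5,10,100,500,10e3), trans="log1p", expand=c(0,0)) +
theme_bw()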
Word Cloud for Donald Trump
library(wordcloud)
# define a nice color palette
pal <- brewer.pal(8,"Dark2")
# plot the 150 most common words
cleaned_tokensDT %>%
count(word) %>%
with(wordcloud(word, n, random.order = FALSE, max.words = 150, colors=pal))
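The word cloud layout has a random element (word placement and rotation), so the figure can change between knits; setting a seed first makes it reproducible (the seed value is arbitrary):
# fix the random layout so the cloud looks the same on every run
set.seed(1234)
cleaned_tokensDT %>%
count(word) %>%
with(wordcloud(word, n, random.order = FALSE, max.words = 150, colors=pal))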
Clean the Data: Joe Biden Tweets
# remove URL fragments from the tweet text before tokenising
dfJB <- dfJB %>%
mutate(tweet = str_remove_all(tweet, "https"))
######### Turn text to tokens
tokensJB <- dfJB %>%
unnest_tokens(output = word, input = tweet)
###########Show common words
tokensJB %>%
count(word,
sort = TRUE)
######## Remove stop words
sw = get_stopwords()
cleaned_tokensJB <- tokensJB %>%
filter(!word %in% sw$word)
######### Remove numbers
nums <- cleaned_tokensJB %>%
filter(str_detect(word, "^[0-9]")) %>%
select(word) %>% unique()
cleaned_tokensJB <- cleaned_tokensJB %>%
filter(!word %in% nums$word)
###Remove rare words
rareJB <- cleaned_tokensJB %>%
count(word) %>%
filter(n<150) %>%   # same rarity cutoff as the Trump sample, since both files are sampled to 100,000 rows
select(word) %>% unique()
cleaned_tokensJB <- cleaned_tokensJB %>%
filter(!word %in% rareJB$word)
length(unique(cleaned_tokensJB$word))
cleaned_tokensJB = cleaned_tokensJB %>%
filter(!word %in% c("https", "t.co","ðÿ","à","ø"))
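With both token sets now cleaned in the same way, the remaining vocabulary sizes can be compared directly:
# size of the cleaned vocabulary for each candidate
length(unique(cleaned_tokensDT$word))
length(unique(cleaned_tokensJB$word))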
Plot Tokens
# This histogram shows how many distinct words occur at each frequency in the Joe Biden tweets
cleaned_tokensJB %>%
count(word, sort = T) %>%
rename(word_freq = n) %>%
ggplot(aes(x=word_freq)) +
geom_histogram(aes(y=..count..), color="black", fill="blue", alpha=0.3) +
scale_x_continuous(breaks=c(0:5,10,100,500,10e3), trans="log1p", expand=c(0,0)) +
scale_y_continuous(breaks=c(0,100,1000,5e3,10e3,5e4,10e4,4e4), expand=c(0,0)) +
theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Word Cloud for Joe Biden
library(wordcloud)
# define a nice color palette
pal <- brewer.pal(8,"Dark2")
# plot the 150 most common words
cleaned_tokensJB %>%
count(word) %>%
with(wordcloud(word, n, random.order = FALSE, max.words = 150, colors=pal))
Joe Biden Sentiment Analysis
############ Sentiment Analysis
get_sentiments("nrc")
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,891 more rows
get_sentiments("bing")
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,776 more rows
get_sentiments("afinn")
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ... with 2,467 more rows
sent_reviewsJB = cleaned_tokensJB %>%
left_join(get_sentiments("nrc")) %>%
rename(nrc = sentiment) %>%
left_join(get_sentiments("bing")) %>%
rename(bing = sentiment) %>%
left_join(get_sentiments("afinn")) %>%
rename(afinn = value)
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
######## Most common positive and negative words
bing_word_countsJB <- sent_reviewsJB %>%
filter(!is.na(bing)) %>%
count(word, bing, sort = TRUE)
bing_word_countsJB %>%
filter(n > 1000) %>%
mutate(n = ifelse(bing == "negative", -n, n)) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = bing)) +
geom_col() +
coord_flip() +
labs(y = "Contribution to sentiment")
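Besides the positive/negative bing split, the nrc column joined above also carries emotion categories; a quick tally (ignoring tokens with no NRC match) gives a rough emotional profile of the Biden tweets:
# count NRC emotion/sentiment categories across the Biden tokens
sent_reviewsJB %>%
filter(!is.na(nrc)) %>%
count(nrc, sort = TRUE)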
Donald Trump Sentiment Analysis
sent_reviewsDT = cleaned_tokensDT %>%
left_join(get_sentiments("nrc")) %>%
rename(nrc = sentiment) %>%
left_join(get_sentiments("bing")) %>%
rename(bing = sentiment) %>%
left_join(get_sentiments("afinn")) %>%
rename(afinn = value)
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
######## Most common positive and negative words
bing_word_countsDT <- sent_reviewsDT %>%
filter(!is.na(bing)) %>%
count(word, bing, sort = TRUE)
bing_word_countsDT %>%
filter(n > 1000) %>%
mutate(n = ifelse(bing == "negative", -n, n)) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = bing)) +
geom_col() +
coord_flip() +
labs(y = "Contribution to sentiment")
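As a rough closing comparison (a simple summary, not a formal test), the afinn scores joined above can be averaged to contrast the overall tone of the two tweet samples:
# mean AFINN score per candidate, dropping tokens with no AFINN match
bind_rows(
sent_reviewsDT %>% filter(!is.na(afinn)) %>% summarise(candidate = "Trump", mean_afinn = mean(afinn)),
sent_reviewsJB %>% filter(!is.na(afinn)) %>% summarise(candidate = "Biden", mean_afinn = mean(afinn))
)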