# Text and sentiment analysis of parliamentary speeches ("discurso").
#
# Steps: load the speeches from Excel, plot speech counts, tokenize the text,
# draw a word cloud, score the words that follow "china"/"chinese" with AFINN,
# and relate NRC emotions to period (id_ano) and ideology with chord diagrams.

# Packages ----

required_packages <- c(
  "dplyr", "ggplot2", "gridExtra", "tidytext", "wordcloud2", "readxl",
  "openxlsx", "textdata", "tidyr", "knitr", "kableExtra", "widyr", "igraph",
  "ggraph", "ggrepel", "memery", "magick", "circlize"
)

# Install only what is missing instead of reinstalling everything on each run.
is_installed <- vapply(
  required_packages, requireNamespace, logical(1),
  quietly = TRUE
)
if (any(!is_installed)) {
  install.packages(required_packages[!is_installed])
}

# Attach order is kept as in the original script so that function masking
# between packages does not change.
library(dplyr)
library(ggplot2)
library(gridExtra)
library(tidytext)
library(wordcloud2)
library(readxl)
library(openxlsx)
library(textdata)
library(tidyr)
library(knitr)
library(kableExtra)
library(widyr)
library(igraph)
library(ggraph)
library(ggrepel)
library(memery)
library(magick)
library(circlize)

# Data ----

# NOTE(review): absolute, machine-specific path; adjust before running on
# another machine.
discurso <- read_excel(
  "C:/Users/Paulo/Documents/Documents/Estatistica I/Análise de Texto/kenya.xls",
  sheet = 1
)

speeches <- discurso$texto
speeches_right <- discurso$texto[discurso$ideologia == 'Direita']
speeches_left <- discurso$texto[discurso$ideologia == 'Centro-esquerda']

names(discurso)
glimpse(discurso)
dim(discurso)

# Replace every character that is not an ASCII letter, digit or space with a
# space. `x` is a character vector; the result has the same length.
removeSpecialChars <- function(x) gsub("[^a-zA-Z0-9 ]", " ", x)

# tolower() and gsub() are vectorized, so no sapply() is needed. The result is
# an unnamed character vector.
speeches <- removeSpecialChars(tolower(speeches))

summary(discurso)

# Plot helpers ----

my_colors <- c("#E69F00", "#56B4E9", "#009E73", "#CC79A7", "#D55E00")

# Minimal theme: centred title, no x-axis text, no ticks, no grid, no legend.
theme_discurso <- function() {
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.position = "none"
  )
}

# Bar chart of the `number_of_texto` column of `counts`.
#   counts   data frame holding `number_of_texto` and the columns named below
#   x_var    name (string) of the column mapped to the x axis
#   title    plot title
#   fill_var optional name (string) of the column mapped to the bar fill
# Returns the ggplot object.
plot_speech_counts <- function(counts, x_var, title, fill_var = NULL) {
  p <- ggplot(counts, aes(x = .data[[x_var]], y = number_of_texto)) +
    geom_col() +
    theme(
      plot.title = element_text(hjust = 0.5),
      legend.title = element_blank(),
      panel.grid.minor = element_blank()
    ) +
    labs(x = NULL, y = "Nº Discursos") +
    ggtitle(title)
  if (!is.null(fill_var)) {
    p <- p + aes(fill = .data[[fill_var]])
  }
  p
}

# Speech counts ----

# Number of speeches per year.
speeches_over_time <- discurso %>%
  count(ano_discurso, name = "number_of_texto")
plot_speech_counts(speeches_over_time, "ano_discurso", "Contagem Discursos")

# Number of speeches per period flag (per the title: 0 = before 2012,
# 1 = after 2012).
speeches_over_time <- discurso %>%
  count(id_ano, name = "number_of_texto")
plot_speech_counts(speeches_over_time, "id_ano", "0 = antes 2012; 1 = pós 2012")

# Same counts, split by ideology.
speeches_over_time_ideology <- discurso %>%
  count(id_ano, ideologia, name = "number_of_texto")
plot_speech_counts(
  speeches_over_time_ideology, "id_ano", "0 = antes 2012; 1 = pós 2012",
  fill_var = "ideologia"
)

# Same counts, split by province.
speeches_over_time_province <- discurso %>%
  count(id_ano, provincia, name = "number_of_texto")
plot_speech_counts(
  speeches_over_time_province, "id_ano", "0 = antes 2012; 1 = pós 2012",
  fill_var = "provincia"
)

# Tokenization ----

# Words excluded from the analysis in addition to the standard stop words.
# NOTE(review): "thank you" is two words and can never match a one-word token.
undesirable_words <- c(
  "deputy", "speaker", "mr", "mister", "ministry", "minister", "committee",
  "senator", "senators", "members", "member", "temporary", "laughter",
  "debates", "debate", "thank you", "house", "parliament", "senate", "editor",
  "cent", "court", "hansard", "time", "motion", "support", "bill", "report",
  "county", "country"
)

# Peek at a random sample of the stop-word list.
sample(stop_words$word, 1000)

# One row per word. distinct() drops exact duplicate rows, so a word repeated
# within the same speech row is kept once.
discurso_tidy <- discurso %>%
  unnest_tokens(word, texto) %>%
  anti_join(stop_words, by = "word") %>%
  distinct() %>%
  filter(!word %in% undesirable_words) %>%
  filter(nchar(word) > 3)

class(discurso_tidy)
dim(discurso_tidy)

# Word cloud ----

discurso_words_counts <- discurso_tidy %>%
  count(word, sort = TRUE)

# head() avoids the NA rows that `[1:300, ]` creates when there are fewer
# than 300 distinct words.
wordcloud2(head(discurso_words_counts, 300), size = .5)

# Sentiment lexicons ----

# Preview each lexicon (textdata may prompt for a download on first use).
# get_sentiments() takes one lexicon name per call; the original extra call
# that passed all four names at once was invalid and has been removed.
get_sentiments("afinn")
get_sentiments("bing")
get_sentiments("nrc")
get_sentiments("loughran")

# Bigrams ----

# Bigrams are built from the raw text. Tokenizing the one-word rows of
# `discurso_tidy` only works if tidytext collapses rows first, which recent
# versions no longer do by default.
discurso_bigrams <- discurso %>%
  unnest_tokens(bigram, texto, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram))

discurso_bigrams %>%
  count(bigram, sort = TRUE)

bigrams_separated <- discurso_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  filter(!word1 %in% undesirable_words) %>%
  filter(!word2 %in% undesirable_words)

AFINN <- get_sentiments("afinn")

# Count the lexicon-scored words that directly follow `lead_word`.
# Returns columns word2, value, n, sorted by descending n.
sentiment_after <- function(bigrams, lead_word, lexicon) {
  bigrams %>%
    filter(word1 == lead_word) %>%
    inner_join(lexicon, by = c(word2 = "word")) %>%
    count(word2, value, sort = TRUE)
}

# Bar chart of the (at most) 50 largest contributions |n * value| among the
# words following `lead_word`, coloured by the sign of the contribution.
plot_sentiment_after <- function(word_scores, lead_word) {
  word_scores %>%
    mutate(contribution = n * value) %>%
    arrange(desc(abs(contribution))) %>%
    head(50) %>%
    ggplot(aes(word2, contribution, fill = contribution > 0)) +
    geom_col(show.legend = FALSE) +
    xlab(paste0("Words preceded by \"", lead_word, "\"")) +
    ylab("Sentiment value * number of occurrences") +
    coord_flip()
}

china_words <- sentiment_after(bigrams_filtered, "china", AFINN)
plot_sentiment_after(china_words, "china")

chinese_words <- sentiment_after(bigrams_filtered, "chinese", AFINN)
plot_sentiment_after(chinese_words, "chinese")

# Word-level sentiment ----

discurso_bing <- discurso_tidy %>%
  inner_join(get_sentiments("bing"), by = "word")

discurso_nrc <- discurso_tidy %>%
  inner_join(get_sentiments("nrc"), by = "word")

# NRC emotions only (positive/negative polarity removed).
discurso_nrc_sub <- discurso_nrc %>%
  filter(!sentiment %in% c("positive", "negative"))

nrc_plot <- discurso_nrc %>%
  count(sentiment, name = "word_count") %>%
  mutate(sentiment = reorder(sentiment, word_count)) %>%
  ggplot(aes(sentiment, word_count, fill = -word_count)) +
  geom_col() +
  guides(fill = "none") + # Turn off the legend
  theme_discurso() +
  labs(x = NULL, y = "Word Count") +
  # Hard-coded axis limit.
  # NOTE(review): bars above 15000 are dropped by the scale limits.
  scale_y_continuous(limits = c(0, 15000)) +
  ggtitle("Speeches NRC Sentiment") +
  coord_flip()
print(nrc_plot)

# Chord diagrams ----

# Count NRC emotion words per level of `group_var` (column name as a string).
# Rows where the group is missing, or is the literal string "NA", are dropped,
# as are the positive/negative polarity rows.
# Returns columns <group_var>, sentiment, sentiment_sum.
mood_by_group <- function(nrc_words, group_var) {
  nrc_words %>%
    filter(
      !is.na(.data[[group_var]]),
      .data[[group_var]] != "NA",
      !sentiment %in% c("positive", "negative")
    ) %>%
    count(across(all_of(group_var)), sentiment, name = "sentiment_sum")
}

# Draw a chord diagram linking the first column of `mood` (groups) to the
# second (sentiments). `gap` is the spacing between sectors of the same kind;
# a wider gap of 15 separates the groups from the sentiments.
draw_mood_chord <- function(mood, sector_colors, gap, main_title) {
  n_groups <- length(unique(mood[[1]]))
  n_sentiments <- length(unique(mood[[2]]))
  circos.clear()
  circos.par(
    gap.after = c(rep(gap, n_groups - 1), 15, rep(gap, n_sentiments - 1), 15)
  )
  chordDiagram(mood, grid.col = sector_colors, transparency = .2)
  title(main_title)
}

# All emotion sectors are drawn in grey; only the groups get a colour.
emotion_colors <- c(
  "anger" = "grey", "anticipation" = "grey", "disgust" = "grey",
  "fear" = "grey", "joy" = "grey", "sadness" = "grey", "surprise" = "grey",
  "trust" = "grey"
)

# Mood by period.
grid.col <- c("0" = my_colors[1], "1" = my_colors[2], emotion_colors)
decade_mood <- mood_by_group(discurso_nrc, "id_ano")
draw_mood_chord(
  decade_mood, grid.col,
  gap = 5,
  main_title = "Relationship Between Mood and 2012 - (0) before, (1) after"
)

# Mood by ideology.
grid.col <- c(
  "Direita" = my_colors[1], "Centro-esquerda" = my_colors[2],
  "Centro" = my_colors[3], emotion_colors
)
decade_mood <- mood_by_group(discurso_nrc, "ideologia")
draw_mood_chord(
  decade_mood, grid.col,
  gap = 9,
  main_title = "Relação entre Ideologia e Sentimento do Discurso"
)