# ---- Dependencies ----
# Packages this script relies on, listed in the order they are attached below.
required_packages <- c(
  "dplyr", "ggplot2", "gridExtra", "tidytext", "wordcloud2", "readxl",
  "openxlsx", "textdata", "tidyr", "knitr", "kableExtra", "widyr", "igraph",
  "ggraph", "ggrepel"
)

# Install only what is missing: an unconditional install.packages() downloads
# every package again on each run and requires network access.
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

library(dplyr)
library(ggplot2)
library(gridExtra)
library(tidytext)
library(wordcloud2)
library(readxl)
library(openxlsx)
library(textdata)
library(tidyr)
library(knitr)
library(kableExtra)
library(widyr)
library(igraph)
library(ggraph)
library(ggrepel)

# ---- Load and inspect the data ----
# Read the first sheet of the speeches workbook.
# NOTE(review): absolute, machine-specific path -- adjust it (or switch to a
# project-relative path) before running on another computer.
discurso <- read_excel("C:/Users/Paulo/Documents/Documents/Estatistica I/Análise de Texto/kenya.xls", sheet = 1)

# Raw speech texts: all of them, then split by the `ideologia` column.
speeches <- discurso$texto

# `%in%` is used instead of `==` so rows with a missing `ideologia` are simply
# left out; indexing with `==` would turn each of them into an NA element.
speeches_right <- discurso$texto[discurso$ideologia %in% "Direita"]
speeches_left <- discurso$texto[discurso$ideologia %in% "Centro-esquerda"]

# Quick structural inspection: column names, column types, dimensions.
names(discurso)
glimpse(discurso)
dim(discurso)

# tolower() is vectorised; calling it directly (rather than through sapply())
# avoids naming every element with its own full speech text.
speeches <- tolower(speeches)

# Replace every character not matched by `[a-zA-Z0-9 ]` with one space.
#   x: character vector (or anything gsub() can coerce to character).
# Returns a character vector of the same length as `x`.
removeSpecialChars <- function(x) {
  gsub(pattern = "[^a-zA-Z0-9 ]", replacement = " ", x = x)
}
# removeSpecialChars() wraps gsub(), which is already vectorised, so it is
# applied to the whole vector in one call instead of element by element.
speeches <- removeSpecialChars(speeches)

# Per-column summary of the raw data.
summary(discurso)

# Colour palette; its first entries colour the chord-diagram sectors.
my_colors <- c("#E69F00", "#56B4E9", "#009E73", "#CC79A7", "#D55E00")


# Shared ggplot2 theme tweaks: centred title, no x-axis text, no axis ticks,
# no grid lines and no legend.
# Returns a theme object to be added to a plot with `+`.
theme_discurso <- function() {
  hidden <- element_blank()
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = hidden,
    axis.ticks = hidden,
    panel.grid.major = hidden,
    panel.grid.minor = hidden,
    legend.position = "none"
  )
}

# ---- Speech counts ----

# Draw a bar chart of pre-computed speech counts.
#   counts:  data frame holding the columns referenced by `mapping`.
#   mapping: aes() mapping for the bars (x, y and optionally fill).
#   title:   plot title.
# Returns the ggplot object, which is displayed when auto-printed at top level.
plot_speech_counts <- function(counts, mapping, title) {
  ggplot(counts) +
    geom_col(mapping) + # same as geom_bar(stat = "identity")
    theme(
      plot.title = element_text(hjust = 0.5),
      legend.title = element_blank(),
      panel.grid.minor = element_blank()
    ) +
    labs(x = NULL, y = "Nº Discursos") +
    ggtitle(title)
}

# Title shared by every plot that uses the `id_ano` indicator on the x axis.
period_title <- "0 = antes 2012; 1 = pós 2012"

# count() replaces group_by() + summarise(n()) and always returns an ungrouped
# result, so no grouping is left behind on the summary tables.

# Speeches per year.
speeches_over_time <- discurso %>%
  count(ano_discurso, name = "number_of_texto")

plot_speech_counts(
  speeches_over_time,
  aes(x = ano_discurso, y = number_of_texto),
  "Contagem Discursos"
)

# Speeches per period (`id_ano`); reuses the `speeches_over_time` name.
speeches_over_time <- discurso %>%
  count(id_ano, name = "number_of_texto")

plot_speech_counts(
  speeches_over_time,
  aes(x = id_ano, y = number_of_texto),
  period_title
)

# Speeches per period, split by ideology.
speeches_over_time_ideology <- discurso %>%
  count(id_ano, ideologia, name = "number_of_texto")

plot_speech_counts(
  speeches_over_time_ideology,
  aes(x = id_ano, y = number_of_texto, fill = ideologia),
  period_title
)

# Speeches per period, split by province.
speeches_over_time_province <- discurso %>%
  count(id_ano, provincia, name = "number_of_texto")

plot_speech_counts(
  speeches_over_time_province,
  aes(x = id_ano, y = number_of_texto, fill = provincia),
  period_title
)

# Extra words removed from the tokens in addition to the standard stop words.
# NOTE(review): the tokens are single words, so the two-word entry "thank you"
# presumably never matches anything -- confirm whether "thank" was intended.
undesirable_words <- c(
  "deputy", "speaker", "mr", "mister", "ministry", "minister", "committee",
  "senator", "senators", "members", "member", "temporary", "laughter",
  "debates", "debate", "thank you", "house", "parliament", "senate", "editor",
  "cent", "court", "hansard", "time", "motion", "support", "bill", "report",
  "county", "country"
)

# Peek at a random sample of 1000 entries from tidytext's stop-word list.
stop_words$word %>%
  sample(1000) %>%
  head(1000)

# ---- Tokenise the speeches ----
# One row per word, keeping every other column of `discurso`.
discurso_tidy <- discurso %>%
  unnest_tokens(word, texto) %>%
  # Explicit key: do not rely on the join guessing the shared column(s).
  anti_join(stop_words, by = "word") %>%
  # Drops fully repeated rows, so a word repeated with identical values in all
  # other columns is kept only once.
  distinct() %>%
  filter(!word %in% undesirable_words, nchar(word) > 3)

class(discurso_tidy)

dim(discurso_tidy)

# Word frequencies, most frequent first.
discurso_words_counts <- discurso_tidy %>%
  count(word, sort = TRUE)

# head() returns at most 300 rows; indexing with [1:300, ] would pad the table
# with NA rows whenever fewer than 300 distinct words are available.
wordcloud2(head(discurso_words_counts, 300), size = .5)


# ---- Sentiment lexicons ----
# textdata, which supplies some of these lexicons, is installed and attached
# with the other dependencies at the top of the script, so it is not
# installed again here.

get_sentiments("afinn")

get_sentiments("bing")

get_sentiments("nrc")

get_sentiments("loughran")

# get_sentiments() resolves its argument with match.arg(), which accepts a
# single lexicon name; passing a vector of names stops with an error. Fetch
# the lexicons one at a time instead.
lapply(c("afinn", "bing", "loughran", "nrc"), get_sentiments)

library(tidyr)

# ---- Bigrams ----
# Bigrams are built from the original speech text. `discurso_tidy` holds one
# already filtered, de-duplicated word per row, so splitting its `word` column
# into 2-grams cannot give pairs of words that were adjacent in a speech.
discurso_bigrams <- discurso %>%
  unnest_tokens(bigram, texto, token = "ngrams", n = 2) %>%
  # Texts with fewer than two words produce an NA bigram; drop those rows.
  filter(!is.na(bigram))

# Most frequent bigrams.
discurso_bigrams %>%
  count(bigram, sort = TRUE)

# Split each bigram into its two words.
bigrams_separated <- discurso_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Keep only bigrams in which neither word is a stop word or an undesirable
# word.
bigrams_filtered <- bigrams_separated %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !word1 %in% undesirable_words,
    !word2 %in% undesirable_words
  )

# ---- Sentiment of words following "china" / "chinese" ----
AFINN <- get_sentiments("afinn")

# Count the lexicon-scored words that directly follow `preceding` in a bigram.
#   bigrams:   data frame with `word1` and `word2` columns.
#   preceding: the first word of the bigrams to keep.
#   lexicon:   data frame with `word` and `value` columns.
# Returns one row per (word2, value) with its count `n`, most frequent first.
score_following_words <- function(bigrams, preceding, lexicon) {
  bigrams %>%
    filter(word1 %in% preceding) %>%
    inner_join(lexicon, by = c(word2 = "word")) %>%
    count(word2, value, sort = TRUE)
}

# Bar chart of the 50 largest contributions (count * sentiment value).
#   scored_words: output of score_following_words().
#   preceding:    word shown in the axis label.
# Returns the ggplot object, which is displayed when auto-printed at top level.
plot_following_words <- function(scored_words, preceding) {
  scored_words %>%
    mutate(contribution = n * value) %>%
    arrange(desc(abs(contribution))) %>%
    head(50) %>%
    # Order the bars by contribution; without this the words are drawn in
    # alphabetical order and the sort above only affects which 50 are kept.
    mutate(word2 = reorder(word2, contribution)) %>%
    ggplot(aes(word2, contribution, fill = contribution > 0)) +
    geom_col(show.legend = FALSE) +
    xlab(paste0("Words preceded by \"", preceding, "\"")) +
    ylab("Sentiment value * number of occurrences") +
    coord_flip()
}

china_words <- score_following_words(bigrams_filtered, "china", AFINN)
plot_following_words(china_words, "china")

chinese_words <- score_following_words(bigrams_filtered, "chinese", AFINN)
plot_following_words(chinese_words, "chinese")

# ---- Attach sentiment labels to the tokens ----
# The join key is spelled out so the joins do not depend on whichever columns
# the two tables happen to share.
discurso_bing <- discurso_tidy %>%
  inner_join(get_sentiments("bing"), by = "word")

discurso_nrc <- discurso_tidy %>%
  inner_join(get_sentiments("nrc"), by = "word")

# NRC rows other than the generic positive/negative labels; derived from
# `discurso_nrc` instead of repeating the same join.
discurso_nrc_sub <- discurso_nrc %>%
  filter(!sentiment %in% c("positive", "negative"))

# Install memery and magick only when they are not available yet.
for (pkg in c("memery", "magick")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(memery)
library(magick)

# ---- Word count per NRC sentiment ----
nrc_plot <- discurso_nrc %>%
  count(sentiment, name = "word_count") %>%
  mutate(sentiment = reorder(sentiment, word_count)) %>%
  ggplot(aes(sentiment, word_count, fill = -word_count)) +
  geom_col() +
  # "none" replaces the deprecated `guides(fill = FALSE)` form.
  guides(fill = "none") +
  theme_discurso() +
  labs(x = NULL, y = "Word Count") +
  # Make the axis span at least 0..15000 without clipping: a hard scale limit
  # would drop any bar whose count is above 15000.
  expand_limits(y = c(0, 15000)) +
  ggtitle("Speeches NRC Sentiment") +
  coord_flip()
plot(nrc_plot)

# Install circlize only when it is not available yet.
if (!requireNamespace("circlize", quietly = TRUE)) {
  install.packages("circlize")
}
library(circlize)

# ---- Chord diagram: period (id_ano) vs. NRC sentiment ----
# Sector colours: the two periods are highlighted, sentiments stay grey.
grid.col <- c(
  "0" = my_colors[1], "1" = my_colors[2],
  "anger" = "grey", "anticipation" = "grey", "disgust" = "grey",
  "fear" = "grey", "joy" = "grey", "sadness" = "grey", "surprise" = "grey",
  "trust" = "grey"
)

# Word count per (period, sentiment). Column order matters: chordDiagram()
# links the first two columns and uses the third as the link width.
decade_mood <- discurso_nrc %>%
  filter(
    # Real missing values are tested with is.na(); the comparison with the
    # text "NA" is kept for cells that contain that literal string.
    !is.na(id_ano), id_ano != "NA",
    !sentiment %in% c("positive", "negative")
  ) %>%
  count(id_ano, sentiment, name = "sentiment_sum")

circos.clear()

# Gap of 5 after each sector and 15 after the last sector of each group.
n_periods <- length(unique(decade_mood$id_ano))
n_sentiments <- length(unique(decade_mood$sentiment))
circos.par(gap.after = c(rep(5, n_periods - 1), 15,
                         rep(5, n_sentiments - 1), 15))
chordDiagram(decade_mood, grid.col = grid.col, transparency = .2)
title("Relationship Between Mood and 2012 - (0) before, (1) after")


# ---- Chord diagram: ideology vs. NRC sentiment ----
# Sector colours: ideologies are highlighted, sentiments stay grey.
# (The "anger" entry needs its opening quote; without it the file does not
# parse.)
grid.col <- c(
  "Direita" = my_colors[1], "Centro-esquerda" = my_colors[2],
  "Centro" = my_colors[3],
  "anger" = "grey", "anticipation" = "grey", "disgust" = "grey",
  "fear" = "grey", "joy" = "grey", "sadness" = "grey", "surprise" = "grey",
  "trust" = "grey"
)

# Word count per (ideology, sentiment). Column order matters: chordDiagram()
# links the first two columns and uses the third as the link width.
decade_mood <- discurso_nrc %>%
  filter(
    # Real missing values are tested with is.na(); the comparison with the
    # text "NA" is kept for cells that contain that literal string.
    !is.na(ideologia), ideologia != "NA",
    !sentiment %in% c("positive", "negative")
  ) %>%
  count(ideologia, sentiment, name = "sentiment_sum")

circos.clear()

# Gap of 9 after each sector and 15 after the last sector of each group.
n_ideologies <- length(unique(decade_mood$ideologia))
n_sentiments <- length(unique(decade_mood$sentiment))
circos.par(gap.after = c(rep(9, n_ideologies - 1), 15,
                         rep(9, n_sentiments - 1), 15))
chordDiagram(decade_mood, grid.col = grid.col, transparency = .2)
title("Relação entre Ideologia e Sentimento do Discurso")