# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# CNA WORKSHOP 2023
# Introduction to R
#
# May 2023
# R version 4.3.0
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#### Introduction ####

# - R is a computer language.
# - R can be used for statistics, graphics, calculations, programming...
# - There are many online resources.
# - R website: www.r-project.org
# - package repository CRAN

# - RStudio
# - panels
# - scripts, console

#### 1. Start with R #####

# Comments in the script start with # (on Mac: "alt+3").
# Everything after # in the line is ignored by R.
# If you use several #'s before and after the comment, RStudio recognizes it as
# a section title to which you can navigate.

##### mytitle ######

# In the RStudio script, send code to the Console with "ctrl + Enter" (Windows) 
# or "cmd + Enter" (Mac).
4 + 4

# Clear your workspace before starting to work.
# rm(list = ls()) removes ALL objects that currently exist in the workspace.

rm(list = ls())

### Set your working directory: the folder where your data and everything you
# produce will be stored.
# Display the current working directory.

getwd() 

# Set your working directory with setwd("path/to/folder").
# Set the working directory in which you saved the sample data for today!
# With RStudio, go to Session -> Set Working Directory -> To Source File Location
# (this sets the wd to the same folder where the R script is).
# In the script, simply copy-paste the location from the address bar of your
# Windows folder. Remember to replace \ by / .
# NOTE: the path below is only an example. Replace it with a folder that exists
# on your own computer, otherwise setwd() stops with an error.

setwd("C:/Users/lso055/Documents/PhD Work/Prague")

getwd()

# During the CNA training, we set the working directory to the location of the R
# script. This line uses the rstudioapi package to ask for the path of the
# script that is currently open, and dirname() keeps only its folder.
# NOTE(review): presumably this only works when the script is run inside
# RStudio with rstudioapi installed -- confirm for your setup.
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
getwd()

# Display the content of the current working directory:
dir() 

### Help!
# The ? command followed by a function name opens the help page of that
# function, with a description of its arguments and examples.

?hist()

# The ?? command searches the help system for a keyword and gives you a list of
# possible help pages (useful in case you don't know the name of the command).

??histogram

# Put the search term in quotes "" if it consists of several words.

??"descriptive statistics"

# Very good online documentation: help yourself with Google! Good links/forums: 
# Quick R (http://www.statmethods.net/) and www.stackoverflow.com

### Packages
# Packages contain useful R code that others have written. There are packages for
# everything.
# For example, the package cna is used for performing CNA with R.
# Some packages come with R and are always available (e.g. "base"); others must
# be installed by the user.
# install.packages("name"): install a package

# Running the following line of code would install all the packages needed for
# this CNA training:
# install.packages(c("cna", "frscore", "cnaOpt", "dplyr", "QCA", "stringr",
#     "LogicReg", "psych"), dependencies = TRUE)

# But we don't want to re-install any packages that are already installed.
# To avoid that, we run the code below instead. This first checks if the package
# is already installed.
# If the package is not already installed, the package is installed.

# How each of the chunks below works:
# - require("pkg") tries to load the package and returns TRUE if that worked
#   and FALSE if the package could not be loaded (e.g. it is not installed).
# - The ! in front turns this around ("not"), so the condition is TRUE only
#   when the package could NOT be loaded.
# - Only in that case the line between {} runs and installs the package.
# Note that install.packages() only installs a package; it does not load it.

if (!require("cna")){
  install.packages("cna")
}

if (!require("frscore")){
  install.packages("frscore")
}

if (!require("cnaOpt")){
  install.packages("cnaOpt")
}

if (!require("dplyr")){
  install.packages("dplyr")
}

if (!require("QCA")){
  install.packages("QCA")
}

if (!require("stringr")){
  install.packages("stringr")
}

if (!require("LogicReg")){
  install.packages("LogicReg")
}

if (!require("psych")){
  install.packages("psych")
}


# You only need to install a package once. But every time you want to use it, 
# you have to load it:
# library(): load an installed package (without quotes!)
# We also load the base package (which comes with R, so we did not need to 
# install it).

# Load every package with its own library() call, one call per line.
# (No semicolon is needed at the end of a line in R.)
library(cna)
library(base)
library(frscore)
library(cnaOpt)
library(dplyr)
library(QCA)
library(stringr)
library(LogicReg)
library(psych)

# search() lists everything that is currently attached, so you can check
# whether the required packages are loaded.

search()

# More info on cna: the package help index and the package vignette.
help(package = "cna")
vignette("cna")


##### 2. Objects, Workspace and Operators #####

# R works with objects. Objects can be everything. For example, datasets are 
# objects, variables are objects.
# Data are contained in objects of different size and format (object classes),
# functions use the content of an object and produce results.
# In RStudio, Objects are stored in the "workspace", you can see them in the 
# environment (upper right).

# To generate an object, we first give it a name and then use a backward arrow <- 
# to define its content.
# Example: generate an object that contains the number 50.

a <- 50 

# Display the object in the console by typing its name.

a

# c() is a function which combines its arguments (c stands for "concatenate").
# This helps you to put separate elements into a single object. 

b <- c(10, 50, 60) 
b

# 10:50 means "all whole numbers from 10 to 50"; 100 and 150 are added at the end.
d <- c(10:50, 100, 150)
d

# Save the objects a and b from the workspace in a file called "ab.RData".
# The file is written to the working directory; dir() shows that it is there.

save(a, b, file="ab.RData")
dir()

# Remove an object from the workspace.
# The second call of b below gives an error on purpose: after rm(b), the
# object b no longer exists.
b
rm(b) 
b

# Delete ("clean") the whole workspace.
# Calling a afterwards gives an error on purpose: all objects are gone.
rm(list=ls()) 
a

### Operators
# Arithmetic operators: +, -, *, /, ^. 
# Logical operators: &, |, ==, !=, <, >, <=, >=. 
# Functions: sqrt(), exp(), log(), min(), max(), mean()

# You can use R much like a calculator:

((5 + 5) / 2) * 3

# Save the result in an object! If not, it only appears in the Console.

a <- ((5 + 5) / 2) * 3
a

x <- c(1, 5, 20)

# Adding a single number to a vector adds it to every element of the vector.
a + x

d <- a + x
d

# Ask R a question: TRUE or FALSE?

a

a == 10

a != 10

!(a == 10)

# A comparison with a vector is answered separately for every element.
x < 5

# Chained comparisons do not work in R. The following line is not valid R
# code, so it is kept as a comment; otherwise the script as a whole could not
# be read (parsed) by R. To see the error message, type it into the Console:
#   1 < a < 20
# Write this instead (two comparisons combined with "and"):
1 < a & a < 20

TRUE & TRUE

TRUE & FALSE

TRUE | FALSE


# Exercises:

# Check if a is bigger than 10 AND smaller than or equal to 20. 


# Create an object called p that is a vector containing 1, 3, 5, 7 and 11.


# Remove a and p.



##### 3. Working with data #####

### Open and save datasets #######
# Before you start working with a new dataset, you want to clear the workspace
# to avoid that the software "gets confused".

rm(list=ls()) 

# For performing a CNA analysis, we usually work with .csv files. Make sure that
# the data file is stored in your current working directory.
# dir() lists the files in the working directory, so you can check the file name.
# To load the dataset, we read the .csv file and store its content in an object:
dir()
cs_data <- read.csv("Crisp_set_data.csv")

# For .csv files created with Excel in countries that use a comma as decimal
# point and a semicolon as field separator, you should add the following
# arguments to the read.csv() function: sep = ";",  dec = ","
# (read.csv2() uses exactly these settings by default.)

# If this does not work, it may be that your file is formatted differently
# (e.g. US version). Try the option: 
# read.csv("cs_data.csv", row.names=1, header = TRUE, sep = ",",  dec = ".") instead.


# If you want to give your dataset another name without replacing the other 
# dataset, just assign it to a new object. Both objects then exist side by side;
# here we display the copy and remove it again.
crisp_set_data <- cs_data
crisp_set_data
rm(crisp_set_data)

# Save (export) your dataset as a .csv file in the working directory.

write.csv2(cs_data, "cs_data.csv")

# You could also save it under any different name. With data in US format, use
# write.csv(cs_data, "cs_data.csv").
write.csv(cs_data, "A dataset containing crisp-set data.csv")

# write.csv() writes "." for the decimal point and "," for the separator.
# write.csv2() writes "," for the decimal point and ";" for the separator.

# This is a simple trick to reformat a csv file if you can't read it properly 
# in Excel, too!

######### Inspect data######
# The View() command makes you see your dataset as if you opened it normally.

View(cs_data)

# Attention: you cannot make any changes in the dataset using the cursor!

# Check how many variables you have in your dataset
# (for a data frame, length() counts the columns):

length(cs_data)

# Look up the names of the variables:

names(cs_data)

# You can use rownames() to get the names of the cases in your dataset:

rownames(cs_data)

# head() gives you a first impression of what your dataset looks like
# (it shows the first 6 rows).

head(cs_data)

# Access the elements or subsets of a data frame: $, [rows, columns], subset().
# The $ sign means something like "and therein", for example "variable A in cs_data":

cs_data$A

# You can use square brackets to specify which part of the dataset you want to see. 
# Before the comma, you specify the row(s) (cases), and after the comma,
# the column(s) (variables). If unspecified, all rows / columns are displayed. 
# You can list several elements you want to see using c(). 
# Look up several variables at once (all rows, columns A and B):

cs_data[, c("A", "B")]

# Now, we want to see the values of variables A and B for only the 5th, 6th, 
# 7th and 8th row. Here the rows are selected by their row NAMES, which is why
# the numbers are written in quotes.

cs_data[c("5", "6", "7", "8"), c("A", "B")]

# We can get the same values by specifying row and column numbers (positions)
# instead of row and column names, like this
# (assuming A and B are the first two columns of the dataset):
cs_data[5:8, c(1, 2)]

# Another way of doing this: look up the value of the 5th to 8th case for the 
# variable A.

cs_data$A[5:8]

# Make a data frame containing only the observations in which A and B are
# both equal to 1. The first line only displays the result; the second line
# stores it in a new object, which we display and then remove again.

subset(cs_data, A == 1 & B == 1)
AB_data <-  subset(cs_data, A == 1 & B == 1)
AB_data
rm(AB_data)

# Get descriptive statistics for your dataset.
# NOTE(review): describe() is not part of base R; presumably it comes from the
# psych package loaded earlier -- confirm.

describe(cs_data)

# Or compute descriptive statistics separately: mean(), median(), sd()

mean(cs_data$C)
sd(cs_data$C)


################### Data management ##############

##### Recoding variables

# You can attribute values to cases. 
# For example, you want B to have a missing value (NA) for the third row:

# Look at the current value first. "3" in quotes selects the row by its name.
cs_data["3","B"]

# The case has a value of 0.

# Overwrite that single value with NA, then check the cell and the whole column.
cs_data["3","B"] <- NA
cs_data["3","B"]
cs_data[,"B"]

# Set this back to the original value.

cs_data["3","B"] <- 0

# Usually, you will create a new variable to recode another variable. Otherwise 
# you "lose" the values of the old variable.
# We use $ to ensure that the new variable (object) is tied into the dataset.
# Here, we create B_new, consisting of the values of B.

cs_data$B_new <- cs_data$B

# Display both columns side by side to compare them.
cs_data[,c("B", "B_new")]


# Sometimes, we want to recode variables using logical rules.
# First B_new is emptied (all values NA). Then every case where B is 0 gets the
# text "no" and every case where B is 1 gets the text "yes". Cases that match
# neither rule keep the value NA.

cs_data$B_new <- NA
cs_data$B_new[cs_data$B == 0] <- "no"
cs_data$B_new[cs_data$B == 1] <- "yes"

cs_data[,c("B", "B_new")]


# You can also perform operations with variables to create a new variable. 
# For example, we create a new variable "AB" which occurs (is equal to 1) when
# both "A" and "B" occur, and is equal to 0 otherwise.
# The & operator on its own answers with TRUE / FALSE. as.integer() turns
# TRUE into 1 and FALSE into 0, so that AB is stored as a number.
cs_data$AB <- as.integer(cs_data$A & cs_data$B)

cs_data$AB


# It is also possible to add, subtract, multiply or divide variables, or to
# apply functions such as exp() and sqrt() to them (+, -, *, /, exp(), sqrt()).

# Renaming variables: here we rename "E" into "octopus".
# names(cs_data) == "E" finds the position of the column called "E"; the name
# at that position is then replaced by "octopus".

names(cs_data)[names(cs_data)=="E"] <- "octopus"
View(cs_data)

# Alternatively, to rename a variable, you can simply create a new variable that
# equals the old variable, and then remove the old variable
# (assigning NULL to a column deletes it from the dataset): 

cs_data$common_octopus <- cs_data$octopus
cs_data$octopus <- NULL 
names(cs_data)

# If you want to keep the changes we just made, save the data again.
# Here we save it under the name "exercise".

write.csv2(cs_data, "exercise.csv") 

# Check that the new file appears in the working directory.
dir()


## Also, save your changes to the script before closing it.
##### Have a nice evening!


################### Appendix: functions and operators ##############

# Here is an overview of (most of) the functions that we have seen in the
# tutorial.

# Each of the following lines opens the help page of one function used in this
# tutorial. Run only the line you are interested in.
?rm()
?getwd()
?setwd()
?dir()
?library()
?c()
?save()
?read.csv()
?write.csv2()
?write.csv()
?View()
?length
?names()
?rownames()
?head()
?subset()
?describe()
?mean()
?sd()

## Here are some operators that we have seen:

# plus: +
# minus: -
# division: /
# multiplication: *
# and: &
# or: |
# not: !
# is equal to: ==
# is not equal to: !=
# is less than: <
# is greater than: >
# is less than or equal to: <=
# is greater than or equal to: >=


############## Extra: if, else, for ############################

# As an extra, an introduction to if-else statements and to for loops in R is
# included below.
# Please only have a look at this if you are interested and want to learn more 
# about R programming. It is not needed (or even helpful) for following the CNA 
# training.

# The example in this script shows how to install multiple R packages with one
# chunk of code. If you would like to follow tutorials with different examples,
# you can use the following links:
# https://www.datamentor.io/r-programming/if-else-statement/
# https://www.datamentor.io/r-programming/for-loop/

##### if-else

# if statements have the following form:
# if (condition){
#   statement
# }
# If the condition is true, the statement between {} will be executed by R.
# If the condition is not true, the statement between {} will not be executed
# by R. (The statement can consist of multiple lines of code.)

# The following chunk of code checks if "cna" is already installed. 
# If "cna" is already installed, the condition between () is not true and the
# lines between {} are NOT executed.
# If "cna" is not installed yet, the condition between () is true and the lines
# between {} ARE executed.

# require("cna") returns TRUE if the package could be loaded and FALSE if not;
# the ! in front means "not".
# Note: the message is printed right after install.packages() has run; the code
# does not check whether the installation actually succeeded.
if (!require("cna")){
  install.packages("cna")
  print("cna has now been installed.")
}


# if-else statements have the following form:
# if (condition){
#   statement1
# } else {
#   statement2
# }

# if-else statements are like if statements, but they add that R should
# do something else if (and only if!) the condition between () is not true.

# The following chunk of code does the same as the previous one, but it adds:
# If the condition is not true (so if "cna" is already installed) R executes the
# line of code between {} AFTER ELSE. That line of code between {} after else is
# NOT executed if the condition IS true.


if (!require("cna")){
  install.packages("cna")
  print("cna has now been installed.")
} else {
  print("cna was already installed.")
}

##### for loop

# A for loop has the following form:
# for (value in sequence){
#   statement
#}

# It executes the statement between {} for every value in the sequence.
# In the following chunk of code, the list of package names is the sequence.
# The for loop goes through the list of package names and prints each name.

# The names of all packages used in this training, stored as text (character
# strings) in one vector.
package_names <- c(
  "cna", "frscore", "cnaOpt", "dplyr", "QCA", "stringr", "LogicReg", "psych"
)


for (name in package_names) {
  print(name)
}

# The following for loop prints, for each name in package_names, whether the
# package has not yet been installed. All the packages in the list HAVE been 
# installed, so the answer should be FALSE for each element in the sequence.
# character.only = TRUE tells require() that "name" is an object which contains
# the package name as text, and not the name of the package itself.
for (name in package_names) {
  print(!require(name, character.only = TRUE))
}

# Finally, we can use a for loop which checks for each package in the sequence
# whether it has not been installed yet. If the package has not been installed
# yet, R installs the package and prints "     has now been installed.",
# if the package has already been installed, R just prints
# "     was already installed."

for (name in package_names) {
  if (!require(name, character.only = TRUE)) {
    # install.packages() always expects the package name as text, so it is
    # simply given the object "name". character.only is an argument of
    # require() and library() only, not of install.packages().
    install.packages(name)
    print(name)
    print("     has now been installed.")
  } else {
    print(name)
    print("     was already installed.")
  }
}

# This shows how repetitive tasks can be automated in R.
# If you don't need to see which packages had already been installed, you can
# run the following chunk of code instead. You only need 6 lines of code to install
# as many packages as you want!

package_names <- c("cna", "frscore", "cnaOpt", "dplyr", "QCA", "stringr", "LogicReg", "psych")
for (name in package_names) {
  if (!require(name, character.only = TRUE)) {
    install.packages(name)  # takes the name as text; character.only is only for require()/library()
  }
}