# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# CNA WORKSHOP 2023
# Introduction to R
#
# May 2023
# R version 4.3.0
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#### Introduction ####

# - R is a computer language.
# - R can be used for statistics, graphics, calculations, programming...
# - There are many online resources.
# - R website: www.r-project.org
# - Package repository: CRAN
# - RStudio
#   - panels
#   - scripts, console

#### 1. Start with R #####

# A comment in a script starts with # (on Mac: "alt+3").
# R ignores everything that follows # on the same line.
# If you put several #'s before and after a comment, RStudio treats it as a
# section title that you can navigate to.

##### mytitle ######

# In an RStudio script, send code to the Console with "ctrl + Enter" (Windows)
# or "cmd + Enter" (Mac).
4 + 4

# Clear your workspace before you start working.
rm(list = ls())

### Set your working directory: the folder where your data and everything you
# produce will be stored.

# Display the current working directory.
getwd()

# Set your working directory with setwd("").
# Choose the directory in which you saved the sample data for today!
# In RStudio: Session -> Set Working Directory -> To Source File Location
# (this sets the wd to the folder that contains the R script).
# In the script, simply copy-paste the location from the header of your Windows
# folder. Remember to replace \ by / .
# NOTE: the path below is one specific machine's folder; replace it with your own.
setwd("C:/Users/lso055/Documents/PhD Work/Prague")
getwd()

# During the CNA training, we set the working directory to the location of the
# R script.
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
getwd()

# Display the content of the current working directory:
dir()

### Help!

# ?command shows the documentation and examples for a specific command.
?hist()

# ??keyword lists possible help sources for a keyword (useful when you do not
# know the name of the command).
??histogram

# Use "" for several words.
??"descriptive statistics"

# There is very good online documentation: help yourself with Google!
# Good links/forums: Quick R (http://www.statmethods.net/) and
# www.stackoverflow.com

### Packages

# Packages contain useful R code that others have written. There are packages
# for everything.
# For example, the package cna is used for performing CNA with R.
# Some packages are installed permanently ("base"); others must be installed by
# the user.

# install.packages(""): install a package.
# Running the following line of code would install all the packages needed for
# this CNA training:
# install.packages(c("cna", "frscore", "cnaOpt", "dplyr", "QCA", "stringr",
#                    "LogicReg", "psych"), dependencies = TRUE)
# But we don't want to re-install packages that are already installed.
# To avoid that, we run the code below instead. For each package it first
# checks whether the package is already installed, and installs it only if it
# is not.
if (!require("cna")) {
  install.packages("cna")
}

if (!require("frscore")) {
  install.packages("frscore")
}

if (!require("cnaOpt")) {
  install.packages("cnaOpt")
}

if (!require("dplyr")) {
  install.packages("dplyr")
}

if (!require("QCA")) {
  install.packages("QCA")
}

if (!require("stringr")) {
  install.packages("stringr")
}

if (!require("LogicReg")) {
  install.packages("LogicReg")
}

if (!require("psych")) {
  install.packages("psych")
}

# You only need to install a package once. But every time you want to use it,
# you have to load it:
# library(): load an installed package (without quotes!)
# We also load the base package (which comes with R, so we did not need to
# install it).
library(cna)
library(base)
library(frscore)
library(cnaOpt)
library(dplyr)
library(QCA)
library(stringr)
library(LogicReg)
library(psych)

# Check whether the required packages are loaded.
search()

# More info on cna:
help(package = "cna")
vignette("cna")

##### 2. Objects, Workspace and Operators #####

# R works with objects. Objects can be everything.
# For example, datasets are objects, variables are objects.
# Data are contained in objects of different size and format (object classes);
# functions use the content of an object and produce results.
# In RStudio, objects are stored in the "workspace"; you can see them in the
# Environment panel (upper right).

# To generate an object, we first give it a name and then use a backward arrow
# <- to define its content.
# Example: generate an object that contains the number 50.
a <- 50
# Display the object in the console.
a

# c() is a function which combines its arguments (c stands for "concatenate").
# It lets you put separate elements into a single object.
b <- c(10, 50, 60)
b
d <- c(10:50, 100, 150)
d

# Save objects from the workspace in a file.
save(a, b, file = "ab.RData")
dir()

# Remove objects from the workspace.
b
rm(b)
# b no longer exists, so the next line produces an error (on purpose).
b

# Delete ("clean") the whole workspace.
rm(list = ls())
# a no longer exists either, so the next line also produces an error.
a

### Operators

# Arithmetic operators: +, -, *, /, ^.
# Logical operators: &, |, ==, !=, <, >, <=, >=.
# Functions: sqrt(), exp(), log(), min(), max(), mean()

# You can use R much like a calculator:
((5 + 5) / 2) * 3
# Save the result in an object! If not, it only appears in the Console.
a <- ((5 + 5) / 2) * 3
a
x <- c(1, 5, 20)
a + x
d <- a + x
d

# Ask R a question: true or false?
a
a == 10
a != 10
!(a == 10)
x < 5
# This does not work (R rejects a chained comparison as a syntax error):
#   1 < a < 20
# Write this instead:
1 < a & a < 20
TRUE & TRUE
TRUE & FALSE
TRUE | FALSE

# Exercises:
# Check if a is bigger than 10 AND smaller than or equal to 20.
# Create an object called p that is the list of 1, 3, 5, 7 and 11.
# Remove a and p.

##### 3. Working with data #####

### Open and save datasets #######

# Before you start working with a new dataset, you want to clear the workspace
# so that the software does not "get confused".
rm(list = ls())

# For performing a CNA analysis, we usually work with .csv files. Make sure that
# the data is stored in the working directory you work in.
# To load the dataset, we build an object that reads the .csv file:
dir()
cs_data <- read.csv(file = "Crisp_set_data.csv")
# For Excel files from countries that use a comma as decimal point and a
# semicolon as field separator, add the following arguments to the read.csv()
# function: sep = ";", dec = ","
# If this does not work, your Excel file may be formatted differently
# (e.g. US version). Try this instead:
# read.csv("cs_data.csv", row.names=1, header = TRUE, sep = ",", dec = ".")

# If you want to give your dataset another name without replacing the other
# dataset, just type:
crisp_set_data <- cs_data
crisp_set_data
rm(crisp_set_data)

# Save (export) your dataset.
write.csv2(cs_data, file = "cs_data.csv")
# You could also save it under any different name. With data in US format, use
# write.csv(cs_data, "cs_data.csv").
write.csv(cs_data, file = "A dataset containing crisp-set data.csv")
# write.csv() writes "." for the decimal point and "," for the separator.
# write.csv2() writes "," for the decimal point and ";" for the separator.
# This is also a simple trick to reformat a csv file that you cannot read
# properly in Excel!

######### Inspect data######

# The View() command shows your dataset as if you had opened it normally.
View(cs_data)
# Attention: you cannot make any changes in the dataset using the cursor!

# Check how many variables you have in your dataset:
length(cs_data)

# Look up the names of the variables:
names(cs_data)

# You can use rownames() to get the names of the cases in your dataset:
rownames(cs_data)

# head() gives you a first impression of what your dataset looks like
# (first 6 rows).
head(cs_data)

# Access the elements of subsets of a data frame: $, [rows,columns], subset.
# The $ sign means something like "and therein", for example
# "variable A in cs_data":
cs_data$A

# You can use square brackets to specify which part of the dataset you want to
# see.
# Before the comma, you specify the row(s) (cases), either by number or by
# name; after the comma, the column(s) (variables). If one of them is left
# unspecified, all rows / columns are displayed.
# You can list several elements you want to see using c().
# Look up several variables at once:
cs_data[, c("A", "B")]

# Now, we want to see the values of variables A and B for only the 5th, 6th,
# 7th and 8th row.
cs_data[c("5", "6", "7", "8"), c("A", "B")]

# We can get the same values by specifying row and column numbers instead of
# row and column names, like this:
cs_data[5:8, c(1, 2)]

# Another way of doing this: look up the value of the 5th to 8th case for the
# variable A.
cs_data$A[5:8]

# Make a data frame containing only the observations in which A and B are both
# equal to 1.
subset(cs_data, A == 1 & B == 1)
AB_data <- subset(cs_data, A == 1 & B == 1)
AB_data
rm(AB_data)

# Get descriptive statistics for your dataset
# (describe() is provided by the psych package):
describe(cs_data)

# Or compute descriptive statistics separately: mean(), median(), sd()
mean(cs_data$C)
sd(cs_data$C)

################### Data management ##############

##### Recoding variables

# You can attribute values to cases.
# For example, you want B to have a missing value (NA) for the third row:
cs_data["3", "B"]
# The case has a value of 0.
cs_data["3", "B"] <- NA
cs_data["3", "B"]
cs_data[, "B"]
# Set this back to the original value.
cs_data["3", "B"] <- 0

# Usually, you will create a new variable to recode another variable. Otherwise
# you "lose" the values of the old variable.
# We use $ to ensure that the new variable (object) is tied into the dataset.
# Here, we create B_new, consisting of the values of B.
cs_data$B_new <- cs_data$B
cs_data[, c("B", "B_new")]

# Sometimes, we want to recode variables using logical rules.
# Start from an all-missing column, then fill in a label for each value of B.
cs_data$B_new <- NA
cs_data$B_new[cs_data$B == 0] <- "no"
cs_data$B_new[cs_data$B == 1] <- "yes"
cs_data[, c("B", "B_new")]

# You can also perform operations with variables to create a new variable.
# For example, we create a new variable "AB" which occurs (is equal to 1) when
# both "A" and "B" occur.
# The & operator returns TRUE/FALSE, so we convert the result with as.integer()
# to get the values 1/0, like the other variables in the dataset.
cs_data$AB <- as.integer(cs_data$A & cs_data$B)
cs_data$AB
# It is also possible to add, subtract, multiply, exponentiate or take the
# square root (+, -, *, exp(), sqrt()).

# Renaming variables: here we rename "E" into "octopus".
names(cs_data)[names(cs_data) == "E"] <- "octopus"
View(cs_data)

# Alternatively, to rename a variable, you can simply create a new variable that
# equals the old variable, and then remove the old variable:
cs_data$common_octopus <- cs_data$octopus
cs_data$octopus <- NULL
names(cs_data)

# If you want to keep the changes we just made, save the data again.
# Here we save it under the name "exercise".
write.csv2(cs_data, "exercise.csv")
dir()

## Also, save your changes to the script before closing it.

##### Have a nice evening!

################### Appendix: functions and operators ##############

# Here is an overview of (most of) the functions that we have seen in the
# tutorial.
?rm()
?getwd()
?setwd()
?dir()
?library()
?c()
?save()
?read.csv()
?write.csv2()
?write.csv()
?View()
?length
?names()
?rownames()
?head()
?subset()
?describe()
?mean()
?sd()

## Here are some operators that we have seen:
# plus: +
# minus: -
# division: /
# multiplication: *
# and: &
# or: |
# not: !
# is equal to: ==
# is not equal to: !=
# is less than: <
# is greater than: >
# is less than or equal to: <=
# is greater than or equal to: >=

############## Extra: if, else, for ############################

# As an extra, an introduction to if-else statements and to for loops in R is
# included below.
# Please only have a look at this if you are interested and want to learn more
# about R programming. It is not needed (or even helpful) for following the CNA
# training.
# The example in this script shows how to install multiple R packages with one
# chunk of code. If you would like to follow tutorials with different examples,
# you can use the following links:
# https://www.datamentor.io/r-programming/if-else-statement/
# https://www.datamentor.io/r-programming/for-loop/

##### if-else

# if statements have the following form:
# if (condition){
#   statement
# }
# If the condition is true, R executes the statement between {}.
# If the condition is not true, R does not execute the statement between {}.
# (The statement can consist of multiple lines of code.)

# The following chunk of code checks if "cna" is already installed.
# If "cna" is already installed, the condition between () is not true and the
# lines between {} are NOT executed.
# If "cna" is not installed yet, the condition between () is true and the lines
# between {} ARE executed.
if (!require("cna")) {
  install.packages("cna")
  print("cna has now been installed.")
}

# if-else statements have the following form:
# if (condition){
#   statement1
# } else {
#   statement2
# }
# if-else statements are like if statements, but they add that R should do
# something else if (and only if!) the condition between () is not true.

# The following chunk of code does the same as the previous one, but it adds:
# if the condition is not true (so if "cna" is already installed), R executes
# the line of code between {} AFTER ELSE. That line of code between {} after
# else is NOT executed if the condition IS true.
if (!require("cna")) {
  install.packages("cna")
  print("cna has now been installed.")
} else {
  print("cna was already installed.")
}

##### for loop

# A for loop has the following form:
# for (value in sequence){
#   statement
# }
# It executes the statement between {} for every value in the sequence.

# In the following chunk of code, the list of package names is the sequence.
# The for loop goes through the list of package names and prints each name.
package_names <- c("cna", "frscore", "cnaOpt", "dplyr", "QCA", "stringr",
                   "LogicReg")
for (name in package_names) {
  print(name)
}

# The following for loop prints, for each name in package_names, whether the
# package has not yet been installed. All the packages in the list HAVE been
# installed, so the answer should be FALSE for each element in the sequence.
# character.only = TRUE tells require() that `name` holds the package name as a
# character string.
for (name in package_names) {
  print(!require(name, character.only = TRUE))
}

# Finally, we can use a for loop which checks for each package in the sequence
# whether it has not been installed yet. If the package has not been installed
# yet, R installs the package and prints " has now been installed.";
# if the package has already been installed, R just prints
# " was already installed."
for (name in package_names) {
  if (!require(name, character.only = TRUE)) {
    # install.packages() takes the package name as a character string directly;
    # character.only is an argument of require()/library(), not of
    # install.packages(), so it is not passed here.
    install.packages(name)
    print(name)
    print(" has now been installed.")
  } else {
    print(name)
    print(" was already installed.")
  }
}

# This shows how repetitive tasks can be automated in R.
# If you don't need to see which packages had already been installed, you can
# run the following chunk of code instead. You only need a few lines of code to
# install as many packages as you want!
package_names <- c("cna", "frscore", "cnaOpt", "dplyr", "QCA", "stringr",
                   "LogicReg")
for (name in package_names) {
  if (!require(name, character.only = TRUE)) {
    install.packages(name)
  }
}