This page is intended as a quick reference when helping students with their data handling and analysis.
Included on this page are short examples of everything we teach students in the LifeSavR introductory/revision course.
Basic R statements
# assign the current year to a variable
<- 2021
current_year
# print the variable we created
# (it should display below the chunk, or in the console pane)
current_year
## [1] 2021
# do a calculation to create a new variable from an existing one
<- current_year + 1
next_year
next_year
## [1] 2022
# multiply two numbers
2 * 221
## [1] 442
Loading packages
These are the packages used in this cheatsheet (and the lifesavR course:
library(tidyverse)
## Warning: replacing previous import 'vctrs::data_frame' by
## 'tibble::data_frame' when loading 'dplyr'
## ── Attaching packages ───────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.3.5 ✔ purrr 0.3.4
## ✔ tibble 3.0.1 ✔ dplyr 1.0.0
## ✔ tidyr 1.1.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.5.0
## ── Conflicts ──────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(psydata)
## Loading datasets for teaching R to psychology students.
##
## Datasets loaded:
## amongus, bae2018, bmi, development, earnings, earnings2, egger2019, employees, fuel, funimagery, grass, happy, heroes_meta, heroes_personal, holmgren2018, im2018, kidiq, kuhberger2014, mentalh, messy_exp, painmusic, rmipx1, shootings, studyhabits, wii_rt, wii_rt_tidy
##
##
## Type help(datasetname) for more information about the data.
library(BayesFactor)
## Loading required package: coda
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## ************
## Welcome to BayesFactor 0.9.12-4.2. If you have questions, please contact Richard Morey (richarddmorey@gmail.com).
##
## Type BFManual() to open the manual.
## ************
library(corrr)
Looking at data
Show only the first 6 rows of the fuel
data
head(fuel)
## mpg cyl engine_size power weight gear automatic
## 1 21.0 6 2620 110 1188 4 TRUE
## 2 21.0 6 2620 110 1304 4 TRUE
## 3 22.8 4 1770 93 1052 4 TRUE
## 4 21.4 6 4230 110 1458 3 FALSE
## 5 18.7 8 5900 175 1560 3 FALSE
## 6 18.1 6 3690 105 1569 3 FALSE
Shows a list of columns in the development
dataset plus
the first few datapoints (as many as will fit)
glimpse(development)
## Rows: 1,704
## Columns: 6
## $ country <fct> Afghanistan, Afghanistan, Afghanistan, Afgh…
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, A…
## $ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1…
## $ life_expectancy <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.…
## $ population <int> 8425333, 9240934, 10267083, 11537966, 13079…
## $ gdp_per_capita <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739…
Filtering datasets
The filter()
function selects rows from a dataset:
# filter on a categorical variable
# == means 'match exactly'
%>%
development filter(country == "Kenya")
# this doesn't match any rows because the data spells Kenya with a capital 'K'
%>%
development filter(country == "kenya")
# filtering on a numeric variable
%>% filter(year > 2000)
development
# combining multiple filters makes things more specific
%>%
development filter(country == "Kenya") %>%
filter(year > 2000)
# assign filtered data to a new variable for re-use later on
# if you don't store the data, it is displayed but then lost
<- development %>%
development.europe filter(continent == "Europe") %>%
filter(year > 1980)
# show the stored subset
development.europe
# make a plot with the subset
%>%
development.europe ggplot(aes(gdp_per_capita, life_expectancy)) +
geom_point()
Frequencies
# this gives us the total number of rows
%>% count() earnings
## n
## 1 4483
# numbers of men and women
%>%
earnings count(gender)
## gender n
## 1 female 2130
## 2 male 2353
# We can do arithmetic on these numbers
# calculate percentage
2353 / 4483 * 100
## [1] 52.48717
# counts for multiple subgroups
%>%
earnings count(job, gender)
## job gender n
## 1 charity female 210
## 2 charity male 116
## 3 nopay female 2
## 4 nopay male 1
## 5 private female 1507
## 6 private male 1940
## 7 public female 411
## 8 public male 296
# making a frwquency plot
%>%
earnings ggplot(aes(gender)) +
stat_count()
Summarise
# use function mean() to summarise mpg column
# the result is a data.frame with a single column named 'mean_mpg'
%>%
fuel summarise(mean_mpg = mean(mpg))
# if you omit the column name, the summarised column is named after the summary function
# this produces column names which can be awkward to process later in a pipeline
# for example:
%>%
fuel summarise(mean(mpg))
# median mpg (rather than mean)
%>%
fuel summarise(median_mpg = median(mpg))
# standard deviation
%>%
fuel summarise(sd_mpg = sd(mpg))
# summarise two columns at once
# the mean and sd functions are separated with a comma
# the resulting data frame is stored in a variable called fuel_summary
<- fuel %>%
fuel_summary summarise(m = mean(mpg), sd = sd(mpg))
# see the stored summary
fuel_summary
# remember, this summary is still a new dataset; you could do more processing on it in extra steps if needed
# combining filter() and summarise()
%>%
development filter(continent=="Asia") %>%
summarise(Average_life_expectancy = mean(life_expectancy))
Group by
# calculate mean weight loss in each group in a laborious way
%>%
funimagery filter(intervention=="MI") %>%
summarise(mean(weight_lost_end_trt))
%>%
funimagery filter(intervention=="FIT") %>%
summarise(mean(weight_lost_end_trt))
# use group_by to split the data and summarise each group
%>%
funimagery group_by(intervention) %>%
summarise(mean(weight_lost_end_trt))
# example of grouping by two columns at once
%>%
funimagery group_by(gender, intervention) %>%
summarise(mean(weight_lost_end_trt))
## # A tibble: 4 x 3
## # Groups: gender [2]
## gender intervention `mean(weight_lost_end_trt)`
## <chr> <fct> <dbl>
## 1 f MI -1.2
## 2 f FIT -5.56
## 3 m MI -1.37
## 4 m FIT -3.84
Calculate the mean and SD in one go
%>%
funimagery group_by(intervention) %>%
summarise(
mean(weight_lost_end_trt),
sd(weight_lost_end_trt)
)
## # A tibble: 2 x 3
## intervention `mean(weight_lost_end_trt)` `sd(weight_lost_end_trt)`
## <fct> <dbl> <dbl>
## 1 MI -1.25 2.09
## 2 FIT -5.09 4.18
Give your new summary columns a name (this is good practice)
%>%
funimagery group_by(intervention) %>%
summarise(
mean_weight_lost_end_trt = mean(weight_lost_end_trt),
sd_weight_lost_end_trt = sd(weight_lost_end_trt)
)
## # A tibble: 2 x 3
## intervention mean_weight_lost_end_trt sd_weight_lost_end_trt
## <fct> <dbl> <dbl>
## 1 MI -1.25 2.09
## 2 FIT -5.09 4.18
Boxplot
Boxplots show the interquartile range (IQR) as the height of the box.
The IQR is the range which includes 50% of the data points
%>%
funimagery ggplot(aes(intervention, weight_lost_end_trt)) +
geom_boxplot() +
scale_y_continuous(n.breaks = 10) + # this extra code just adds more marks on the y-axis
labs(x = "Intervention", y="Weight lost (end treatment)")
Scatter plot
%>%
fuel ggplot(aes(x=weight, y=mpg)) + # selects the columns to use
geom_point() # adds the points to the plot
Correlation (and Bayes Factor)
Load the corrr
package (see above):
# select weight at baseline and follow-up in the funimagery study
# and correlate them
%>%
funimagery select(kg1, kg2, kg3) %>%
correlate()
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## # A tibble: 3 x 4
## term kg1 kg2 kg3
## <chr> <dbl> <dbl> <dbl>
## 1 kg1 NA 0.969 0.934
## 2 kg2 0.969 NA 0.969
## 3 kg3 0.934 0.969 NA
# use `with()` to tell `correlationBF` which dataset to use
with(funimagery, correlationBF(kg1, kg3))
## Bayes factor analysis
## --------------
## [1] Alt., r=0.333 : 8.261128e+45 ±0%
##
## Against denominator:
## Null, rho = 0
## ---
## Bayes factor type: BFcorrelation, Jeffreys-beta*
Remember to ignore the part which reads r=0.333
. The
large number after that is the Bayes Factor in favour of the
correlation.
Loading CSV data
Load CSV data from a local file (in the same directory as your
.Rmd
file… remember to upload it to the server):
<- read_csv("shootings.csv") shootings
Load CSV data from a URL (over the web):
<- read_csv("https://raw.githubusercontent.com/benwhalley/psydata/main/data-raw/shootings.csv") shootings
## Parsed with column specification:
## cols(
## id = col_double(),
## name = col_character(),
## date = col_date(format = ""),
## manner_of_death = col_character(),
## armed = col_character(),
## age = col_double(),
## gender = col_character(),
## race = col_character(),
## city = col_character(),
## state = col_character(),
## signs_of_mental_illness = col_logical(),
## threat_level = col_character(),
## flee = col_character(),
## body_camera = col_logical(),
## arms_category = col_character()
## )
Regression
library(psydata)
Simple linear model (regression) with one predictor:
lm(mpg ~ weight, data=fuel)
##
## Call:
## lm(formula = mpg ~ weight, data = fuel)
##
## Coefficients:
## (Intercept) weight
## 37.28714 -0.01178
Calculate Bayes Factor for single predictor:
library(BayesFactor)
lmBF(mpg ~ weight, data=fuel)
## Bayes factor analysis
## --------------
## [1] weight : 45726075 ±0.01%
##
## Against denominator:
## Intercept only
## ---
## Bayes factor type: BFlinearModel, JZS
Multiple regression (use +
to add predictors:
lm(mpg ~ weight + cyl, data=fuel)
##
## Call:
## lm(formula = mpg ~ weight + cyl, data = fuel)
##
## Coefficients:
## (Intercept) weight cyl
## 39.687037 -0.007037 -1.507643
Bayes Factor to test addition of a variable to an existing model:
<- lmBF(mpg ~ weight, data=fuel)
A <- lmBF(mpg ~ weight + cyl, data=fuel)
B
# gives evidence _for_ B against A
/ A B
## Bayes factor analysis
## --------------
## [1] weight + cyl : 22.7057 ±0.01%
##
## Against denominator:
## mpg ~ weight
## ---
## Bayes factor type: BFlinearModel, JZS
Calculate R2:
library(broom)
lm(mpg ~ weight + cyl, data=fuel) %>% glance()
## # A tibble: 1 x 11
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
## 1 0.830 0.819 2.57 70.9 6.81e-12 3 -74.0 156.
## # … with 3 more variables: BIC <dbl>, deviance <dbl>,
## # df.residual <int>
Plot histogram of residuals from a regression model:
lm(mpg ~ weight + cyl, data=fuel) %>%
augment() %>%
ggplot(aes(.resid)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Fitted-vs-residual diagnostic plot:
lm(mpg ~ weight + cyl, data=fuel) %>%
augment() %>%
ggplot(aes(.fitted, .resid)) +
geom_point() +
geom_smooth(size=.25) +
geom_smooth(method=lm, se=F, size=.25, color="black", linetype="dashed") +
theme_minimal()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'