This page is intended as a quick reference when helping students with their data handling and analysis.

Included on this page are short examples of everything we teach students in the LifeSavR introductory/revision course.

Basic R statements

# assign the current year to a variable
current_year <- 2021

# print the variable we created
# (it should display below the chunk, or in the console pane)
current_year
## [1] 2021
# do a calculation to create a new variable from an existing one
next_year <- current_year + 1

next_year
## [1] 2022
# multiply two numbers
2 * 221
## [1] 442

Loading packages

These are the packages used in this cheatsheet (and the lifesavR course:

library(tidyverse)
## Warning: replacing previous import 'vctrs::data_frame' by
## 'tibble::data_frame' when loading 'dplyr'
## ── Attaching packages ───────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
## ✔ tibble  3.0.1     ✔ dplyr   1.0.0
## ✔ tidyr   1.1.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.5.0
## ── Conflicts ──────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(psydata)
## Loading datasets for teaching R to psychology students.
## 
## Datasets loaded:
## amongus, bae2018, bmi, development, earnings, earnings2, egger2019, employees, fuel, funimagery, grass, happy, heroes_meta, heroes_personal, holmgren2018, im2018, kidiq, kuhberger2014, mentalh, messy_exp, painmusic, rmipx1, shootings, studyhabits, wii_rt, wii_rt_tidy
## 
## 
## Type help(datasetname) for more information about the data.
library(BayesFactor)
## Loading required package: coda
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## ************
## Welcome to BayesFactor 0.9.12-4.2. If you have questions, please contact Richard Morey (richarddmorey@gmail.com).
## 
## Type BFManual() to open the manual.
## ************
library(corrr)

Looking at data

Show only the first 6 rows of the fuel data

head(fuel)
##    mpg cyl engine_size power weight gear automatic
## 1 21.0   6        2620   110   1188    4      TRUE
## 2 21.0   6        2620   110   1304    4      TRUE
## 3 22.8   4        1770    93   1052    4      TRUE
## 4 21.4   6        4230   110   1458    3     FALSE
## 5 18.7   8        5900   175   1560    3     FALSE
## 6 18.1   6        3690   105   1569    3     FALSE

Shows a list of columns in the development dataset plus the first few datapoints (as many as will fit)

glimpse(development)
## Rows: 1,704
## Columns: 6
## $ country         <fct> Afghanistan, Afghanistan, Afghanistan, Afgh…
## $ continent       <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, A…
## $ year            <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1…
## $ life_expectancy <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.…
## $ population      <int> 8425333, 9240934, 10267083, 11537966, 13079…
## $ gdp_per_capita  <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739…

Filtering datasets

The filter() function selects rows from a dataset:

# filter on a categorical variable
# == means 'match exactly'
development %>% 
  filter(country == "Kenya")

# this doesn't match any rows because the data spells Kenya with a capital 'K'
development %>% 
  filter(country == "kenya")

# filtering on a numeric variable
development %>% filter(year > 2000)


# combining multiple filters makes things more specific
development %>%
  filter(country == "Kenya") %>%
  filter(year > 2000)

# assign filtered data to a new variable for re-use later on
# if you don't store the data, it is displayed but then lost
development.europe <- development %>% 
  filter(continent == "Europe") %>% 
  filter(year > 1980)

# show the stored subset
development.europe

# make a plot with the subset
development.europe %>% 
    ggplot(aes(gdp_per_capita, life_expectancy)) + 
  geom_point()

Frequencies

# this gives us the total number of rows
earnings %>% count()
##      n
## 1 4483
# numbers of men and women
earnings %>%
  count(gender)
##   gender    n
## 1 female 2130
## 2   male 2353
# We can do arithmetic on these numbers
# calculate percentage
2353 / 4483 * 100
## [1] 52.48717
# counts for multiple subgroups
earnings %>%
  count(job, gender)
##       job gender    n
## 1 charity female  210
## 2 charity   male  116
## 3   nopay female    2
## 4   nopay   male    1
## 5 private female 1507
## 6 private   male 1940
## 7  public female  411
## 8  public   male  296
# making a frwquency plot
earnings %>%
  ggplot(aes(gender)) +
  stat_count()

Summarise

# use function mean() to summarise mpg column
# the result is a data.frame with a single column named 'mean_mpg'
fuel %>%
  summarise(mean_mpg = mean(mpg))

# if you omit the column name, the summarised column is named after the summary function
# this produces column names which can be awkward to process later in a pipeline
# for example:
fuel %>%
  summarise(mean(mpg))

# median mpg (rather than mean)
fuel %>%
  summarise(median_mpg = median(mpg))

# standard deviation
fuel %>%
  summarise(sd_mpg = sd(mpg))

# summarise two columns at once 
# the mean and sd functions are separated with a comma
# the resulting data frame is stored in a variable called fuel_summary
fuel_summary <- fuel %>%
  summarise(m = mean(mpg), sd = sd(mpg))

# see the stored summary
fuel_summary

# remember, this summary is still a new dataset; you could do more processing on it in extra steps if needed

# combining filter() and summarise()
development %>% 
  filter(continent=="Asia") %>% 
  summarise(Average_life_expectancy = mean(life_expectancy))

Group by

# calculate mean weight loss in each group in a laborious way
funimagery %>%
  filter(intervention=="MI") %>%
  summarise(mean(weight_lost_end_trt))

funimagery %>%
  filter(intervention=="FIT") %>%
  summarise(mean(weight_lost_end_trt))


# use group_by to split the data and summarise each group
funimagery %>%
  group_by(intervention) %>%
  summarise(mean(weight_lost_end_trt))
# example of grouping by two columns at once
funimagery %>%
  group_by(gender, intervention) %>%
  summarise(mean(weight_lost_end_trt))
## # A tibble: 4 x 3
## # Groups:   gender [2]
##   gender intervention `mean(weight_lost_end_trt)`
##   <chr>  <fct>                              <dbl>
## 1 f      MI                                 -1.2 
## 2 f      FIT                                -5.56
## 3 m      MI                                 -1.37
## 4 m      FIT                                -3.84

Calculate the mean and SD in one go

funimagery %>%
  group_by(intervention) %>%
  summarise(
    mean(weight_lost_end_trt),
    sd(weight_lost_end_trt)
  )
## # A tibble: 2 x 3
##   intervention `mean(weight_lost_end_trt)` `sd(weight_lost_end_trt)`
##   <fct>                              <dbl>                     <dbl>
## 1 MI                                 -1.25                      2.09
## 2 FIT                                -5.09                      4.18

Give your new summary columns a name (this is good practice)

funimagery %>%
  group_by(intervention) %>%
  summarise(
    mean_weight_lost_end_trt = mean(weight_lost_end_trt),
    sd_weight_lost_end_trt = sd(weight_lost_end_trt)
  )
## # A tibble: 2 x 3
##   intervention mean_weight_lost_end_trt sd_weight_lost_end_trt
##   <fct>                           <dbl>                  <dbl>
## 1 MI                              -1.25                   2.09
## 2 FIT                             -5.09                   4.18

Boxplot

Boxplots show the interquartile range (IQR) as the height of the box.

The IQR is the range which includes 50% of the data points

funimagery %>%
  ggplot(aes(intervention, weight_lost_end_trt)) +
  geom_boxplot() +
  scale_y_continuous(n.breaks = 10) + # this extra code just adds more marks on the y-axis
  labs(x = "Intervention", y="Weight lost (end treatment)")

Scatter plot

fuel %>%
  ggplot(aes(x=weight, y=mpg)) + # selects the columns to use
    geom_point()                 # adds the points to the plot

Correlation (and Bayes Factor)

Load the corrr package (see above):

# select weight at baseline and follow-up in the funimagery study
# and correlate them
funimagery %>%
  select(kg1, kg2, kg3) %>%
  correlate()
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## # A tibble: 3 x 4
##   term     kg1    kg2    kg3
##   <chr>  <dbl>  <dbl>  <dbl>
## 1 kg1   NA      0.969  0.934
## 2 kg2    0.969 NA      0.969
## 3 kg3    0.934  0.969 NA
# use `with()` to tell `correlationBF` which dataset to use
with(funimagery,  correlationBF(kg1, kg3))
## Bayes factor analysis
## --------------
## [1] Alt., r=0.333 : 8.261128e+45 ±0%
## 
## Against denominator:
##   Null, rho = 0 
## ---
## Bayes factor type: BFcorrelation, Jeffreys-beta*

Remember to ignore the part which reads r=0.333. The large number after that is the Bayes Factor in favour of the correlation.

Loading CSV data

Load CSV data from a local file (in the same directory as your .Rmd file… remember to upload it to the server):

shootings <- read_csv("shootings.csv")

Load CSV data from a URL (over the web):

shootings <- read_csv("https://raw.githubusercontent.com/benwhalley/psydata/main/data-raw/shootings.csv")
## Parsed with column specification:
## cols(
##   id = col_double(),
##   name = col_character(),
##   date = col_date(format = ""),
##   manner_of_death = col_character(),
##   armed = col_character(),
##   age = col_double(),
##   gender = col_character(),
##   race = col_character(),
##   city = col_character(),
##   state = col_character(),
##   signs_of_mental_illness = col_logical(),
##   threat_level = col_character(),
##   flee = col_character(),
##   body_camera = col_logical(),
##   arms_category = col_character()
## )

Regression

library(psydata)

Simple linear model (regression) with one predictor:

lm(mpg ~ weight, data=fuel)
## 
## Call:
## lm(formula = mpg ~ weight, data = fuel)
## 
## Coefficients:
## (Intercept)       weight  
##    37.28714     -0.01178

Calculate Bayes Factor for single predictor:

library(BayesFactor)
lmBF(mpg ~ weight, data=fuel)
## Bayes factor analysis
## --------------
## [1] weight : 45726075 ±0.01%
## 
## Against denominator:
##   Intercept only 
## ---
## Bayes factor type: BFlinearModel, JZS

Multiple regression (use + to add predictors:

lm(mpg ~ weight + cyl, data=fuel)
## 
## Call:
## lm(formula = mpg ~ weight + cyl, data = fuel)
## 
## Coefficients:
## (Intercept)       weight          cyl  
##   39.687037    -0.007037    -1.507643

Bayes Factor to test addition of a variable to an existing model:

A <- lmBF(mpg ~ weight, data=fuel)
B <- lmBF(mpg ~ weight + cyl, data=fuel)

# gives evidence _for_ B against A
B / A
## Bayes factor analysis
## --------------
## [1] weight + cyl : 22.7057 ±0.01%
## 
## Against denominator:
##   mpg ~ weight 
## ---
## Bayes factor type: BFlinearModel, JZS

Calculate R2:

library(broom)
lm(mpg ~ weight + cyl, data=fuel) %>%  glance()
## # A tibble: 1 x 11
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <int>  <dbl> <dbl>
## 1     0.830         0.819  2.57      70.9 6.81e-12     3  -74.0  156.
## # … with 3 more variables: BIC <dbl>, deviance <dbl>,
## #   df.residual <int>

Plot histogram of residuals from a regression model:

lm(mpg ~ weight + cyl, data=fuel) %>% 
  augment() %>% 
  ggplot(aes(.resid)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Fitted-vs-residual diagnostic plot:

lm(mpg ~ weight + cyl, data=fuel) %>% 
  augment() %>% 
  ggplot(aes(.fitted, .resid)) + 
  geom_point() + 
  geom_smooth(size=.25) + 
  geom_smooth(method=lm, se=F, size=.25, color="black", linetype="dashed") + 
  theme_minimal()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'