## HEADER #####
## What: Simple anova in R
## Using Ch 14 in Yarrr A Pirates Guide to R
## Sections 14.1 - 14.3
## https://bookdown.org/ndphillips/YaRrr/anova.html
## When: HARUG! 2019.12.11
## Who: Last edited by Ed
####

## CONTENTS ####
## 00 Setup
## 01 Explore your data
## 02 Graph your data (matching experimental design)
## 03 Test assumptions
## 04 Evaluate hypotheses (primary, .secondary)


## 00 Setup ####
# install.packages("yarrr", dep=T)
library(yarrr)

# Ed HA PC
setwd("D:/Dropbox/WORK/__Harper Adams stuff/_HARUG HA R Users Group/2019.12.11 HARUG R motivator")

## 01 Explore your data ####
# data: "Getting poop off of poopdeck"
data(poopdeck)
help(poopdeck)
head(poopdeck)

# look at POSSIBLE questions table slide

## 02 Graph your data (matching experimental design) ####
# We will look at cleaning *time* as a function of *cleaner* used
# our formula: time ~ cleaner

# The traditional way to graph 1-way ANOVA is the boxplot
boxplot(formula = time ~ cleaner, data = poopdeck)


## 03 Test assumptions ####
# basic assumptions of ANOVA
# 03.a Gaussian (a/k/a "normal") distribution of residuals
# 03.b Homoscedacity (==variance amongst groups)

# 03.a Gaussian (a/k/a "normal") distribution of residuals ####

# look at help() for function
help(aov)

# go ahead and calculate ANOVA in order to test residual distribution
myAnova <- aov(formula = time ~ cleaner, data = poopdeck)

# get residuals from ANOVA model object "myAnova"
?residuals
(myResid <- residuals(myAnova))

# Always graph it! 
hist(myResid) # looks pretty Gaussian, good sample size, not bad
qqnorm(myResid)
qqline(myResid)

# Test it formally
help(shapiro.test)
shapiro.test(myResid) # if p < 0.05 IT IS different to Gaussian.  Is it?

# 03.b Homoscedasticity (== homogeneous variance amongst groups) ####
?bartlett.test
bartlett.test(formula = sqrt(time) ~ cleaner, data = poopdeck) # is different

# Since the variances are not equal, I would have a closer look

# make the boxplot
par(mar=c(5.1, 5.1, 3, 4)) # ?par (made right margin a bit bigger)
boxplot(formula = time ~ cleaner, data = poopdeck, cex = 0,
        xlab = "Cleaner used", ylab = "Cleaning time (min)",
        cex.lab = 1.5)

# select the actual data
a <- which(poopdeck$cleaner == "a")
b <- which(poopdeck$cleaner == "b")
c <- which(poopdeck$cleaner == "c")

# draw on the actual data
points(x = jitter(rep(1, length(a)), 5), y = poopdeck$tim[a], 
       pch = 16, cex = .5, col = "blue")
points(x = jitter(rep(2, length(a)), 2.8), y = poopdeck$tim[b], 
       pch = 16, cex = .5, col = "blue")
points(x = jitter(rep(3, length(a)), 1.8), y = poopdeck$tim[c], 
       pch = 16, cex = .5, col = "blue")

# (the most obvious) options here are to decide 
# the variances are "equal enough"
# or else perform a non-parametric test
?kruskal.test

## 04 Evaluate hypotheses (primary, .secondary) ####
# 04.a The means are not equal overall
# 04.b The specific mean levels are different (post-hoc test)

# 04.a The means are not equal overall ####
# this is the fundamental ANOVA result

?anova # make an ANOVA table
anova(myAnova)
summary(myAnova) # alternative
myAnova # different info
names(myAnova) # lots in the object
class(myAnova) 

# 04.b The specific mean levels are different (post-hoc test) ####
?TukeyHSD

TukeyHSD(myAnova) # interpret