ANOVA_Notes

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

#ANOVA (Analysis of variance) with cuckoos data set

An analysis of variance (ANOVA) is a statistical test used to determine whether a continuous and a categorical variable differ. The categorical variable should have three or more groups, or levels.

library(DAAG)
data("cuckoos")

Research question: Does egg length differ between the six species?

Null hypothesis:

\[ H_0: \mu_{\text{hedgesparrow}} = \mu_{\text{meadowpipit}} = \mu_{\text{piedwagtail}} = \mu_{\text{robin}} = \mu_{\text{treepipit}} = \mu_{\text{wren}} \]

Alternative hypothesis:

\[ H_a: \text{At least one mean is different } \mu_{\text{species}} \neq \mu_{\text{another species}} \]

##Visualizing the data

#box plot
ggplot(data = cuckoos, aes(x = species, y = length))+
  geom_boxplot()+
  theme_bw()+
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

##ANOVA test assumptions

ggplot(data = cuckoos,aes(x = length))+
  geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

cuckoos %>% 
  group_by(species) %>% 
  summarise(var(length))
# A tibble: 6 × 2
  species       `var(length)`
  <fct>                 <dbl>
1 hedge.sparrow         1.10 
2 meadow.pipit          0.846
3 pied.wagtail          1.15 
4 robin                 0.465
5 tree.pipit            0.775
6 wren                  0.569

##Running the anova

model1 <- aov(length ~ species, data = cuckoos)
model2 <- lm(length ~ species, data = cuckoos)
tukey <- TukeyHSD(model1)#make sure your object is called "tukey" for the code below. 
#Extracting significant pairs
tukey_df <- as.data.frame(tukey$species)
names(tukey_df)[names(tukey_df) == "p adj"] <- "p.adj"
sig_pairs <- subset(tukey_df,p.adj < 0.05)

# Create list of comparisons from significant pairs
comparisons <- strsplit(row.names(sig_pairs), "-")
#Creating the plot
library(ggsignif)
ggplot(cuckoos, aes(x = species, y = length)) +
  geom_boxplot() +
  geom_signif(comparisons = comparisons,
  annotations = ifelse(sig_pairs$p.adj < 0.001,   "***",                               ifelse(sig_pairs$p.adj < 0.01, "**",               ifelse(sig_pairs$p.adj < 0.05, "*", ""))),
y_position = seq(25, 35, length.out = nrow(sig_pairs)), #changing location of the bars
tip_length = 0.02) +
theme_bw()+
theme(axis.text.x = element_text(angle = 30, 
    hjus = 1, size = 18),
    axis.text.y = element_text(size = 18),          
    axis.title.x = element_text(size = 15),         
    axis.title.y = element_text(size = 20)) +
  labs(x = "Host species", y = "Egg length (mm)")

#ANOVA with the penguins data set

library(palmerpenguins)
data("penguins")
View(penguins)

Research question: does flipper length differ between species?

##Visualizing the data

ggplot(data = penguins, aes(x = species, y = flipper_length_mm))+
  geom_boxplot()
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_boxplot()`).

##Checking assumptions

ggplot(data = penguins, aes(x = flipper_length_mm))+
  geom_histogram()+
  facet_wrap(~species)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_bin()`).

penguins %>% 
  group_by(species) %>% 
  summarise(var(flipper_length_mm,  na.rm = TRUE))
# A tibble: 3 × 2
  species   `var(flipper_length_mm, na.rm = TRUE)`
  <fct>                                      <dbl>
1 Adelie                                      42.8
2 Chinstrap                                   50.9
3 Gentoo                                      42.1

##Running the ANOVA

#using aov
mod1 <- aov(flipper_length_mm ~ species, data = penguins)
summary(mod1)#getting output of the model
             Df Sum Sq Mean Sq F value Pr(>F)    
species       2  52473   26237   594.8 <2e-16 ***
Residuals   339  14953      44                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
2 observations deleted due to missingness
tukey <- TukeyHSD(mod1)
tukey
  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = flipper_length_mm ~ species, data = penguins)

$species
                      diff       lwr       upr p adj
Chinstrap-Adelie  5.869887  3.586583  8.153191     0
Gentoo-Adelie    27.233349 25.334376 29.132323     0
Gentoo-Chinstrap 21.363462 19.000841 23.726084     0
#using lm
mod2 <- lm(flipper_length_mm ~ species, data = penguins)
summary(mod2)

Call:
lm(formula = flipper_length_mm ~ species, data = penguins)

Residuals:
     Min       1Q   Median       3Q      Max 
-17.9536  -4.8235   0.0464   4.8130  20.0464 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      189.9536     0.5405 351.454  < 2e-16 ***
speciesChinstrap   5.8699     0.9699   6.052 3.79e-09 ***
speciesGentoo     27.2333     0.8067  33.760  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.642 on 339 degrees of freedom
  (2 observations deleted due to missingness)
Multiple R-squared:  0.7782,    Adjusted R-squared:  0.7769 
F-statistic: 594.8 on 2 and 339 DF,  p-value: < 2.2e-16