1. Think about an ongoing study in your lab (or a paper you have read in a different class), and decide on a pattern that you might expect in your experiment if a specific hypothesis were true. Using the dataset from last week I found on DRYAD from Allen et. al, I am predicting to see the O2 output per kilogram per minute to be higher in those dolphins in a steady state rather than those at rest.
2. To start simply, assume that the data in each of your treatment groups follow a normal distribution. Specify the sample sizes, means, and variances for each group that would be reasonable if your hypothesis were true. You may need to consult some previous literature and/or an expert in the field to come up with these numbers.
library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation
## Warning: package 'MASS' was built under R version 4.1.2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# reading in the same data from last week
z <- read.csv("Fig_3.csv",header=TRUE,sep=",")
str(z)
## 'data.frame': 93 obs. of 6 variables:
## $ TrialID : chr "Trial17_08" "Trial17_11" "Trial17_12" "Trial17_16" ...
## $ Animal_ID : chr "9ON6" "83H1" "01L5" "9ON6" ...
## $ State : chr "SteadyState" "SteadyState" "SteadyState" "SteadyState" ...
## $ ODBA_g : num 0.309 0.327 0.155 0.218 0.194 ...
## $ O2_ml_per_kg_min: num 13.93 13.64 5.57 13.69 7.85 ...
## $ W_per_kg : num 4.67 4.57 1.87 4.59 2.63 ...
z<-na.omit(z)
z$myVar <-z$O2_ml_per_kg_min
summary(z)
## TrialID Animal_ID State ODBA_g
## Length:93 Length:93 Length:93 Min. :0.01074
## Class :character Class :character Class :character 1st Qu.:0.01641
## Mode :character Mode :character Mode :character Median :0.02953
## Mean :0.12768
## 3rd Qu.:0.22644
## Max. :0.52505
## O2_ml_per_kg_min W_per_kg myVar
## Min. : 0.3877 Min. :0.1299 Min. : 0.3877
## 1st Qu.: 2.3434 1st Qu.:0.7850 1st Qu.: 2.3434
## Median : 4.3945 Median :1.4722 Median : 4.3945
## Mean : 5.6766 Mean :1.9017 Mean : 5.6766
## 3rd Qu.: 7.8524 3rd Qu.:2.6305 3rd Qu.: 7.8524
## Max. :16.4197 Max. :5.5006 Max. :16.4197
# Using work from last time where we created a simulation with the same attributes
normPars <- fitdistr(z$myVar,"normal")
print(normPars)
## mean sd
## 5.6766070 4.3391717
## (0.4499511) (0.3181635)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 5.68 4.34
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.45 0.318
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 0.202 0 0 0.101
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 93
## $ loglik : num -268
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # Found the mean
## mean
## 5.676607
# Finding Sample Sizes for whole set
length(z$myVar)
## [1] 93
# Finding Variance for whole set
var(z$myVar)
## [1] 19.03307
sd(z$myVar)
## [1] 4.36269
############ Now need them for each treatment group tho #############
steady <-z[z$State=="SteadyState", ]
summary(steady)
## TrialID Animal_ID State ODBA_g
## Length:35 Length:35 Length:35 Min. :0.1488
## Class :character Class :character Class :character 1st Qu.:0.2176
## Mode :character Mode :character Mode :character Median :0.2930
## Mean :0.2992
## 3rd Qu.:0.3711
## Max. :0.5251
## O2_ml_per_kg_min W_per_kg myVar
## Min. : 5.504 Min. :1.844 Min. : 5.504
## 1st Qu.: 7.381 1st Qu.:2.473 1st Qu.: 7.381
## Median : 9.842 Median :3.297 Median : 9.842
## Mean :10.352 Mean :3.468 Mean :10.352
## 3rd Qu.:13.730 3rd Qu.:4.600 3rd Qu.:13.730
## Max. :16.420 Max. :5.501 Max. :16.420
# sd(steady)
rest <-z[z$State=="Rest", ]
summary(rest)
## TrialID Animal_ID State ODBA_g
## Length:58 Length:58 Length:58 Min. :0.01074
## Class :character Class :character Class :character 1st Qu.:0.01531
## Mode :character Mode :character Mode :character Median :0.01751
## Mean :0.02416
## 3rd Qu.:0.02719
## Max. :0.09835
## O2_ml_per_kg_min W_per_kg myVar
## Min. :0.3877 Min. :0.1299 Min. :0.3877
## 1st Qu.:1.6889 1st Qu.:0.5658 1st Qu.:1.6889
## Median :2.4710 Median :0.8278 Median :2.4710
## Mean :2.8551 Mean :0.9565 Mean :2.8551
## 3rd Qu.:3.9119 3rd Qu.:1.3105 3rd Qu.:3.9119
## Max. :7.4855 Max. :2.5076 Max. :7.4855
3. Using the methods we have covered in class, write code to create a random data set that has these attributes. Organize these data into a data frame with the appropriate structure.
# New Dataset
MySet <- rnorm(n=3000,mean=5.676607)
MySet <- data.frame(1:3000,MySet)
names(MySet) <- list("ID","myVar")
MySet <- MySet[MySet$myVar>0,]
str(MySet)
## 'data.frame': 3000 obs. of 2 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ myVar: num 6.87 4.98 5.23 6.32 6.49 ...
summary(MySet$myVar)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.164 5.065 5.697 5.720 6.394 9.860
4. Now write code to analyze the data (probably as an ANOVA or regression analysis, but possibly as a logistic regression or contingency table analysis. Write code to generate a useful graph of the data. I’m going to use an ANOVA since I’m testing only one variable against its treatment groups.
# ANOVA Data Frame Construction
nGroup <- 2 # number of treatment groups
nName <- c("SteadyState","Rest") # names of groups
nSize <- c(35,58) # number of observations in each group
nMean <- c(9.842,2.8551 ) # mean of each group
nSD <- c(4.36269,4.36269) # standard deviation of each group
ID <- 1:(sum(nSize)) # id vector for each row
resVar <- c(rnorm(n=nSize[1],mean=nMean[1],sd=nSD[1]),
(rnorm(n=nSize[2],mean=nMean[2],sd=nSD[2])))
TGroup <- rep(nName,nSize)
ANOdata <- data.frame(TGroup, resVar)
str(ANOdata)
## 'data.frame': 93 obs. of 2 variables:
## $ TGroup: chr "SteadyState" "SteadyState" "SteadyState" "SteadyState" ...
## $ resVar: num 8.59 14.32 6.96 11.56 3.85 ...
# Basic ANOVA
ANOmodel <- aov(resVar~TGroup,data=ANOdata)
print(ANOmodel)
## Call:
## aov(formula = resVar ~ TGroup, data = ANOdata)
##
## Terms:
## TGroup Residuals
## Sum of Squares 840.9586 1959.7089
## Deg. of Freedom 1 91
##
## Residual standard error: 4.64061
## Estimated effects may be unbalanced
print(summary(ANOmodel))
## Df Sum Sq Mean Sq F value Pr(>F)
## TGroup 1 841 841.0 39.05 1.31e-08 ***
## Residuals 91 1960 21.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
z <- summary(ANOmodel)
str(z)
## List of 1
## $ :Classes 'anova' and 'data.frame': 2 obs. of 5 variables:
## ..$ Df : num [1:2] 1 91
## ..$ Sum Sq : num [1:2] 841 1960
## ..$ Mean Sq: num [1:2] 841 21.5
## ..$ F value: num [1:2] 39.1 NA
## ..$ Pr(>F) : num [1:2] 1.31e-08 NA
## - attr(*, "class")= chr [1:2] "summary.aov" "listof"
aggregate(resVar~TGroup,data=ANOdata,FUN=mean)
## TGroup resVar
## 1 Rest 3.347907
## 2 SteadyState 9.554893
unlist(z)
## Df1 Df2 Sum Sq1 Sum Sq2 Mean Sq1 Mean Sq2
## 1.000000e+00 9.100000e+01 8.409586e+02 1.959709e+03 8.409586e+02 2.153526e+01
## F value1 F value2 Pr(>F)1 Pr(>F)2
## 3.905031e+01 NA 1.307675e-08 NA
unlist(z)[7]
## F value1
## 39.05031
ANOsum <- list(Fval=unlist(z)[7],probF=unlist(z)[9])
ANOsum
## $Fval
## F value1
## 39.05031
##
## $probF
## Pr(>F)1
## 1.307675e-08
# Plot of ANOVA Data
ANOPlot <- ggplot(data=ANOdata,aes(x=TGroup,y=resVar,fill=TGroup)) +
geom_boxplot()
print(ANOPlot)
# ggsave(filename="Plot2.pdf",plot=ANOPlot,device="pdf")
6. Now begin adjusting the means of the different groups. Given the sample sizes you have chosen, how small can the differences between the groups be (the “effect size”) for you to still detect a significant pattern (p < 0.05)? When I changed the means to 7 and 4, the p-value was still significant meaning the null hypothesis can still be rejected. I again adjusted the means closer, to 6 and 5, and the p-value became insignificant. Small adjustments showed that even a 0.1 difference in effect size (1.3 difference vs. 1.2) changes the significance of my data.
# ANOVA Data Frame for the new means
nGroup <- 2 # number of treatment groups
nName <- c("SteadyState","Rest") # names of groups
nSize <- c(35,58) # number of observations in each group
nMean <- c(6.3,5 ) # CHANGED THESE MEANS
nSD <- c(4.36269,4.36269) # standard deviation of each group
ID <- 1:(sum(nSize)) # id vector for each row
resVar <- c(rnorm(n=nSize[1],mean=nMean[1],sd=nSD[1]),
(rnorm(n=nSize[2],mean=nMean[2],sd=nSD[2])))
TGroup <- rep(nName,nSize)
ANOdata <- data.frame(TGroup, resVar)
str(ANOdata)
## 'data.frame': 93 obs. of 2 variables:
## $ TGroup: chr "SteadyState" "SteadyState" "SteadyState" "SteadyState" ...
## $ resVar: num 8.4214 4.6799 4.4667 -0.4596 -0.0657 ...
# Basic ANOVA
ANOmodel <- aov(resVar~TGroup,data=ANOdata)
print(ANOmodel)
## Call:
## aov(formula = resVar ~ TGroup, data = ANOdata)
##
## Terms:
## TGroup Residuals
## Sum of Squares 64.9143 1718.5007
## Deg. of Freedom 1 91
##
## Residual standard error: 4.345644
## Estimated effects may be unbalanced
print(summary(ANOmodel))
## Df Sum Sq Mean Sq F value Pr(>F)
## TGroup 1 64.9 64.91 3.437 0.067 .
## Residuals 91 1718.5 18.88
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
z <- summary(ANOmodel)
str(z)
## List of 1
## $ :Classes 'anova' and 'data.frame': 2 obs. of 5 variables:
## ..$ Df : num [1:2] 1 91
## ..$ Sum Sq : num [1:2] 64.9 1718.5
## ..$ Mean Sq: num [1:2] 64.9 18.9
## ..$ F value: num [1:2] 3.44 NA
## ..$ Pr(>F) : num [1:2] 0.067 NA
## - attr(*, "class")= chr [1:2] "summary.aov" "listof"
aggregate(resVar~TGroup,data=ANOdata,FUN=mean)
## TGroup resVar
## 1 Rest 5.020625
## 2 SteadyState 6.745127
unlist(z)
## Df1 Df2 Sum Sq1 Sum Sq2 Mean Sq1 Mean Sq2
## 1.000000e+00 9.100000e+01 6.491431e+01 1.718501e+03 6.491431e+01 1.888462e+01
## F value1 F value2 Pr(>F)1 Pr(>F)2
## 3.437416e+00 NA 6.697461e-02 NA
unlist(z)[7]
## F value1
## 3.437416
ANOsum <- list(Fval=unlist(z)[7],probF=unlist(z)[9])
ANOsum
## $Fval
## F value1
## 3.437416
##
## $probF
## Pr(>F)1
## 0.06697461
# Plot of ANOVA Data
ANOPlot <- ggplot(data=ANOdata,aes(x=TGroup,y=resVar,fill=TGroup)) +
geom_boxplot()
print(ANOPlot)
# ggsave(filename="Plot2.pdf",plot=ANOPlot,device="pdf")
7. Alternatively, for the effect sizes you originally hypothesized, what is the minimum sample size you would need in order to detect a statistically significant effect? Again, run the model a few times with the same parameter set to get a feeling for the effect of random variation in the data. The original sample size was 3000, after reducing it to 300 the relationship was still significant, and remained so when reducing it to 100. I could reduce the sample size to as low as 10 and the data would still remain significant. It is interesting to adjust sample size as dolphins are extremely difficult animals to collect data on, and in a study like this one where they are working with captive dolphins there are only so many animals to sample. Bringing the sample size way down for my model would be a much more accurate look at what the actual data entailed.
# Adjusting the Sample Size
MySet <- rnorm(n=10,mean=5.676607)
MySet <- data.frame(1:10,MySet)
names(MySet) <- list("ID","myVar")
MySet <- MySet[MySet$myVar>0,]
str(MySet)
## 'data.frame': 10 obs. of 2 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10
## $ myVar: num 6.7 5.97 6.14 5.86 6.71 ...
summary(MySet$myVar)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.172 5.889 6.252 6.201 6.704 6.760
# Running ANOVA
nGroup <- 2 # number of treatment groups
nName <- c("SteadyState","Rest") # names of groups
nSize <- c(35,58) # number of observations in each group
nMean <- c(9.842,2.8551 ) # mean of each group
nSD <- c(4.36269,4.36269) # standard deviation of each group
ID <- 1:(sum(nSize)) # id vector for each row
resVar <- c(rnorm(n=nSize[1],mean=nMean[1],sd=nSD[1]),
(rnorm(n=nSize[2],mean=nMean[2],sd=nSD[2])))
TGroup <- rep(nName,nSize)
ANOdata <- data.frame(TGroup, resVar)
str(ANOdata)
## 'data.frame': 93 obs. of 2 variables:
## $ TGroup: chr "SteadyState" "SteadyState" "SteadyState" "SteadyState" ...
## $ resVar: num 9.9 9.95 10.03 6.36 17.16 ...
# Basic ANOVA
ANOmodel <- aov(resVar~TGroup,data=ANOdata)
print(ANOmodel)
## Call:
## aov(formula = resVar ~ TGroup, data = ANOdata)
##
## Terms:
## TGroup Residuals
## Sum of Squares 974.9954 1713.5777
## Deg. of Freedom 1 91
##
## Residual standard error: 4.339415
## Estimated effects may be unbalanced
print(summary(ANOmodel))
## Df Sum Sq Mean Sq F value Pr(>F)
## TGroup 1 975 975.0 51.78 1.71e-10 ***
## Residuals 91 1714 18.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
z <- summary(ANOmodel)
str(z)
## List of 1
## $ :Classes 'anova' and 'data.frame': 2 obs. of 5 variables:
## ..$ Df : num [1:2] 1 91
## ..$ Sum Sq : num [1:2] 975 1714
## ..$ Mean Sq: num [1:2] 975 18.8
## ..$ F value: num [1:2] 51.8 NA
## ..$ Pr(>F) : num [1:2] 1.71e-10 NA
## - attr(*, "class")= chr [1:2] "summary.aov" "listof"
aggregate(resVar~TGroup,data=ANOdata,FUN=mean)
## TGroup resVar
## 1 Rest 3.623908
## 2 SteadyState 10.307266
unlist(z)
## Df1 Df2 Sum Sq1 Sum Sq2 Mean Sq1 Mean Sq2
## 1.000000e+00 9.100000e+01 9.749954e+02 1.713578e+03 9.749954e+02 1.883052e+01
## F value1 F value2 Pr(>F)1 Pr(>F)2
## 5.177739e+01 NA 1.710036e-10 NA
unlist(z)[7]
## F value1
## 51.77739
ANOsum <- list(Fval=unlist(z)[7],probF=unlist(z)[9])
ANOsum
## $Fval
## F value1
## 51.77739
##
## $probF
## Pr(>F)1
## 1.710036e-10
# Plot of ANOVA Data
ANOPlot <- ggplot(data=ANOdata,aes(x=TGroup,y=resVar,fill=TGroup)) +
geom_boxplot()
print(ANOPlot)
# ggsave(filename="Plot2.pdf",plot=ANOPlot,device="pdf")