1. Think about an ongoing study in your lab (or a paper you have read in a different class), and decide on a pattern that you might expect in your experiment if a specific hypothesis were true. Using the dataset from last week I found on DRYAD from Allen et. al, I am predicting to see the O2 output per kilogram per minute to be higher in those dolphins in a steady state rather than those at rest.

2. To start simply, assume that the data in each of your treatment groups follow a normal distribution. Specify the sample sizes, means, and variances for each group that would be reasonable if your hypothesis were true. You may need to consult some previous literature and/or an expert in the field to come up with these numbers.

library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation
## Warning: package 'MASS' was built under R version 4.1.2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# reading in the same data from last week
z <- read.csv("Fig_3.csv",header=TRUE,sep=",")
str(z)
## 'data.frame':    93 obs. of  6 variables:
##  $ TrialID         : chr  "Trial17_08" "Trial17_11" "Trial17_12" "Trial17_16" ...
##  $ Animal_ID       : chr  "9ON6" "83H1" "01L5" "9ON6" ...
##  $ State           : chr  "SteadyState" "SteadyState" "SteadyState" "SteadyState" ...
##  $ ODBA_g          : num  0.309 0.327 0.155 0.218 0.194 ...
##  $ O2_ml_per_kg_min: num  13.93 13.64 5.57 13.69 7.85 ...
##  $ W_per_kg        : num  4.67 4.57 1.87 4.59 2.63 ...
z<-na.omit(z)
z$myVar <-z$O2_ml_per_kg_min
summary(z)
##    TrialID           Animal_ID            State               ODBA_g       
##  Length:93          Length:93          Length:93          Min.   :0.01074  
##  Class :character   Class :character   Class :character   1st Qu.:0.01641  
##  Mode  :character   Mode  :character   Mode  :character   Median :0.02953  
##                                                           Mean   :0.12768  
##                                                           3rd Qu.:0.22644  
##                                                           Max.   :0.52505  
##  O2_ml_per_kg_min     W_per_kg          myVar        
##  Min.   : 0.3877   Min.   :0.1299   Min.   : 0.3877  
##  1st Qu.: 2.3434   1st Qu.:0.7850   1st Qu.: 2.3434  
##  Median : 4.3945   Median :1.4722   Median : 4.3945  
##  Mean   : 5.6766   Mean   :1.9017   Mean   : 5.6766  
##  3rd Qu.: 7.8524   3rd Qu.:2.6305   3rd Qu.: 7.8524  
##  Max.   :16.4197   Max.   :5.5006   Max.   :16.4197
# Using work from last time where we created a simulation with the same attributes
normPars <- fitdistr(z$myVar,"normal")
print(normPars)
##      mean         sd    
##   5.6766070   4.3391717 
##  (0.4499511) (0.3181635)
str(normPars)
## List of 5
##  $ estimate: Named num [1:2] 5.68 4.34
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.45 0.318
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.202 0 0 0.101
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 93
##  $ loglik  : num -268
##  - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # Found the mean
##     mean 
## 5.676607
# Finding Sample Sizes for whole set
length(z$myVar)
## [1] 93
# Finding Variance for whole set
var(z$myVar)
## [1] 19.03307
sd(z$myVar)
## [1] 4.36269
############ Now need them for each treatment group tho #############
steady <-z[z$State=="SteadyState", ]
summary(steady)
##    TrialID           Animal_ID            State               ODBA_g      
##  Length:35          Length:35          Length:35          Min.   :0.1488  
##  Class :character   Class :character   Class :character   1st Qu.:0.2176  
##  Mode  :character   Mode  :character   Mode  :character   Median :0.2930  
##                                                           Mean   :0.2992  
##                                                           3rd Qu.:0.3711  
##                                                           Max.   :0.5251  
##  O2_ml_per_kg_min    W_per_kg         myVar       
##  Min.   : 5.504   Min.   :1.844   Min.   : 5.504  
##  1st Qu.: 7.381   1st Qu.:2.473   1st Qu.: 7.381  
##  Median : 9.842   Median :3.297   Median : 9.842  
##  Mean   :10.352   Mean   :3.468   Mean   :10.352  
##  3rd Qu.:13.730   3rd Qu.:4.600   3rd Qu.:13.730  
##  Max.   :16.420   Max.   :5.501   Max.   :16.420
# sd(steady)

rest <-z[z$State=="Rest", ]
summary(rest)
##    TrialID           Animal_ID            State               ODBA_g       
##  Length:58          Length:58          Length:58          Min.   :0.01074  
##  Class :character   Class :character   Class :character   1st Qu.:0.01531  
##  Mode  :character   Mode  :character   Mode  :character   Median :0.01751  
##                                                           Mean   :0.02416  
##                                                           3rd Qu.:0.02719  
##                                                           Max.   :0.09835  
##  O2_ml_per_kg_min    W_per_kg          myVar       
##  Min.   :0.3877   Min.   :0.1299   Min.   :0.3877  
##  1st Qu.:1.6889   1st Qu.:0.5658   1st Qu.:1.6889  
##  Median :2.4710   Median :0.8278   Median :2.4710  
##  Mean   :2.8551   Mean   :0.9565   Mean   :2.8551  
##  3rd Qu.:3.9119   3rd Qu.:1.3105   3rd Qu.:3.9119  
##  Max.   :7.4855   Max.   :2.5076   Max.   :7.4855

3. Using the methods we have covered in class, write code to create a random data set that has these attributes. Organize these data into a data frame with the appropriate structure.

# New Dataset
MySet <- rnorm(n=3000,mean=5.676607)
MySet <- data.frame(1:3000,MySet)
names(MySet) <- list("ID","myVar")
MySet <- MySet[MySet$myVar>0,]
str(MySet)
## 'data.frame':    3000 obs. of  2 variables:
##  $ ID   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ myVar: num  6.87 4.98 5.23 6.32 6.49 ...
summary(MySet$myVar)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.164   5.065   5.697   5.720   6.394   9.860

4. Now write code to analyze the data (probably as an ANOVA or regression analysis, but possibly as a logistic regression or contingency table analysis. Write code to generate a useful graph of the data. I’m going to use an ANOVA since I’m testing only one variable against its treatment groups.

# ANOVA Data Frame Construction
nGroup <- 2 # number of treatment groups
nName <- c("SteadyState","Rest") # names of groups
nSize <- c(35,58) # number of observations in each group
nMean <- c(9.842,2.8551 ) # mean of each group
nSD <- c(4.36269,4.36269) # standard deviation of each group

ID <- 1:(sum(nSize)) # id vector for each row
resVar <- c(rnorm(n=nSize[1],mean=nMean[1],sd=nSD[1]),
            (rnorm(n=nSize[2],mean=nMean[2],sd=nSD[2])))
TGroup <- rep(nName,nSize)
ANOdata <- data.frame(TGroup, resVar)
str(ANOdata)
## 'data.frame':    93 obs. of  2 variables:
##  $ TGroup: chr  "SteadyState" "SteadyState" "SteadyState" "SteadyState" ...
##  $ resVar: num  8.59 14.32 6.96 11.56 3.85 ...
# Basic ANOVA
ANOmodel <- aov(resVar~TGroup,data=ANOdata)
print(ANOmodel)
## Call:
##    aov(formula = resVar ~ TGroup, data = ANOdata)
## 
## Terms:
##                    TGroup Residuals
## Sum of Squares   840.9586 1959.7089
## Deg. of Freedom         1        91
## 
## Residual standard error: 4.64061
## Estimated effects may be unbalanced
print(summary(ANOmodel))
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## TGroup       1    841   841.0   39.05 1.31e-08 ***
## Residuals   91   1960    21.5                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
z <- summary(ANOmodel)
str(z)
## List of 1
##  $ :Classes 'anova' and 'data.frame':    2 obs. of  5 variables:
##   ..$ Df     : num [1:2] 1 91
##   ..$ Sum Sq : num [1:2] 841 1960
##   ..$ Mean Sq: num [1:2] 841 21.5
##   ..$ F value: num [1:2] 39.1 NA
##   ..$ Pr(>F) : num [1:2] 1.31e-08 NA
##  - attr(*, "class")= chr [1:2] "summary.aov" "listof"
aggregate(resVar~TGroup,data=ANOdata,FUN=mean)
##        TGroup   resVar
## 1        Rest 3.347907
## 2 SteadyState 9.554893
unlist(z)
##          Df1          Df2      Sum Sq1      Sum Sq2     Mean Sq1     Mean Sq2 
## 1.000000e+00 9.100000e+01 8.409586e+02 1.959709e+03 8.409586e+02 2.153526e+01 
##     F value1     F value2      Pr(>F)1      Pr(>F)2 
## 3.905031e+01           NA 1.307675e-08           NA
unlist(z)[7]
## F value1 
## 39.05031
ANOsum <- list(Fval=unlist(z)[7],probF=unlist(z)[9])
ANOsum
## $Fval
## F value1 
## 39.05031 
## 
## $probF
##      Pr(>F)1 
## 1.307675e-08
# Plot of ANOVA Data
ANOPlot <- ggplot(data=ANOdata,aes(x=TGroup,y=resVar,fill=TGroup)) +
           geom_boxplot()
print(ANOPlot)

# ggsave(filename="Plot2.pdf",plot=ANOPlot,device="pdf")
  1. Try running your analysis multiple times to get a feeling for how variable the results are with the same parameters, but different sets of random numbers.

6. Now begin adjusting the means of the different groups. Given the sample sizes you have chosen, how small can the differences between the groups be (the “effect size”) for you to still detect a significant pattern (p < 0.05)? When I changed the means to 7 and 4, the p-value was still significant meaning the null hypothesis can still be rejected. I again adjusted the means closer, to 6 and 5, and the p-value became insignificant. Small adjustments showed that even a 0.1 difference in effect size (1.3 difference vs. 1.2) changes the significance of my data.

# ANOVA Data Frame for the new means
nGroup <- 2 # number of treatment groups
nName <- c("SteadyState","Rest") # names of groups
nSize <- c(35,58) # number of observations in each group
nMean <- c(6.3,5 ) # CHANGED THESE MEANS
nSD <- c(4.36269,4.36269) # standard deviation of each group

ID <- 1:(sum(nSize)) # id vector for each row
resVar <- c(rnorm(n=nSize[1],mean=nMean[1],sd=nSD[1]),
            (rnorm(n=nSize[2],mean=nMean[2],sd=nSD[2])))
TGroup <- rep(nName,nSize)
ANOdata <- data.frame(TGroup, resVar)
str(ANOdata)
## 'data.frame':    93 obs. of  2 variables:
##  $ TGroup: chr  "SteadyState" "SteadyState" "SteadyState" "SteadyState" ...
##  $ resVar: num  8.4214 4.6799 4.4667 -0.4596 -0.0657 ...
# Basic ANOVA
ANOmodel <- aov(resVar~TGroup,data=ANOdata)
print(ANOmodel)
## Call:
##    aov(formula = resVar ~ TGroup, data = ANOdata)
## 
## Terms:
##                    TGroup Residuals
## Sum of Squares    64.9143 1718.5007
## Deg. of Freedom         1        91
## 
## Residual standard error: 4.345644
## Estimated effects may be unbalanced
print(summary(ANOmodel))
##             Df Sum Sq Mean Sq F value Pr(>F)  
## TGroup       1   64.9   64.91   3.437  0.067 .
## Residuals   91 1718.5   18.88                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
z <- summary(ANOmodel)
str(z)
## List of 1
##  $ :Classes 'anova' and 'data.frame':    2 obs. of  5 variables:
##   ..$ Df     : num [1:2] 1 91
##   ..$ Sum Sq : num [1:2] 64.9 1718.5
##   ..$ Mean Sq: num [1:2] 64.9 18.9
##   ..$ F value: num [1:2] 3.44 NA
##   ..$ Pr(>F) : num [1:2] 0.067 NA
##  - attr(*, "class")= chr [1:2] "summary.aov" "listof"
aggregate(resVar~TGroup,data=ANOdata,FUN=mean)
##        TGroup   resVar
## 1        Rest 5.020625
## 2 SteadyState 6.745127
unlist(z)
##          Df1          Df2      Sum Sq1      Sum Sq2     Mean Sq1     Mean Sq2 
## 1.000000e+00 9.100000e+01 6.491431e+01 1.718501e+03 6.491431e+01 1.888462e+01 
##     F value1     F value2      Pr(>F)1      Pr(>F)2 
## 3.437416e+00           NA 6.697461e-02           NA
unlist(z)[7]
## F value1 
## 3.437416
ANOsum <- list(Fval=unlist(z)[7],probF=unlist(z)[9])
ANOsum
## $Fval
## F value1 
## 3.437416 
## 
## $probF
##    Pr(>F)1 
## 0.06697461
# Plot of ANOVA Data
ANOPlot <- ggplot(data=ANOdata,aes(x=TGroup,y=resVar,fill=TGroup)) +
           geom_boxplot()
print(ANOPlot)

# ggsave(filename="Plot2.pdf",plot=ANOPlot,device="pdf")

7. Alternatively, for the effect sizes you originally hypothesized, what is the minimum sample size you would need in order to detect a statistically significant effect? Again, run the model a few times with the same parameter set to get a feeling for the effect of random variation in the data. The original sample size was 3000, after reducing it to 300 the relationship was still significant, and remained so when reducing it to 100. I could reduce the sample size to as low as 10 and the data would still remain significant. It is interesting to adjust sample size as dolphins are extremely difficult animals to collect data on, and in a study like this one where they are working with captive dolphins there are only so many animals to sample. Bringing the sample size way down for my model would be a much more accurate look at what the actual data entailed.

# Adjusting the Sample Size 
MySet <- rnorm(n=10,mean=5.676607)
MySet <- data.frame(1:10,MySet)
names(MySet) <- list("ID","myVar")
MySet <- MySet[MySet$myVar>0,]
str(MySet)
## 'data.frame':    10 obs. of  2 variables:
##  $ ID   : int  1 2 3 4 5 6 7 8 9 10
##  $ myVar: num  6.7 5.97 6.14 5.86 6.71 ...
summary(MySet$myVar)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   5.172   5.889   6.252   6.201   6.704   6.760
# Running ANOVA
nGroup <- 2 # number of treatment groups
nName <- c("SteadyState","Rest") # names of groups
nSize <- c(35,58) # number of observations in each group
nMean <- c(9.842,2.8551 ) # mean of each group
nSD <- c(4.36269,4.36269) # standard deviation of each group

ID <- 1:(sum(nSize)) # id vector for each row
resVar <- c(rnorm(n=nSize[1],mean=nMean[1],sd=nSD[1]),
            (rnorm(n=nSize[2],mean=nMean[2],sd=nSD[2])))
TGroup <- rep(nName,nSize)
ANOdata <- data.frame(TGroup, resVar)
str(ANOdata)
## 'data.frame':    93 obs. of  2 variables:
##  $ TGroup: chr  "SteadyState" "SteadyState" "SteadyState" "SteadyState" ...
##  $ resVar: num  9.9 9.95 10.03 6.36 17.16 ...
# Basic ANOVA
ANOmodel <- aov(resVar~TGroup,data=ANOdata)
print(ANOmodel)
## Call:
##    aov(formula = resVar ~ TGroup, data = ANOdata)
## 
## Terms:
##                    TGroup Residuals
## Sum of Squares   974.9954 1713.5777
## Deg. of Freedom         1        91
## 
## Residual standard error: 4.339415
## Estimated effects may be unbalanced
print(summary(ANOmodel))
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## TGroup       1    975   975.0   51.78 1.71e-10 ***
## Residuals   91   1714    18.8                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
z <- summary(ANOmodel)
str(z)
## List of 1
##  $ :Classes 'anova' and 'data.frame':    2 obs. of  5 variables:
##   ..$ Df     : num [1:2] 1 91
##   ..$ Sum Sq : num [1:2] 975 1714
##   ..$ Mean Sq: num [1:2] 975 18.8
##   ..$ F value: num [1:2] 51.8 NA
##   ..$ Pr(>F) : num [1:2] 1.71e-10 NA
##  - attr(*, "class")= chr [1:2] "summary.aov" "listof"
aggregate(resVar~TGroup,data=ANOdata,FUN=mean)
##        TGroup    resVar
## 1        Rest  3.623908
## 2 SteadyState 10.307266
unlist(z)
##          Df1          Df2      Sum Sq1      Sum Sq2     Mean Sq1     Mean Sq2 
## 1.000000e+00 9.100000e+01 9.749954e+02 1.713578e+03 9.749954e+02 1.883052e+01 
##     F value1     F value2      Pr(>F)1      Pr(>F)2 
## 5.177739e+01           NA 1.710036e-10           NA
unlist(z)[7]
## F value1 
## 51.77739
ANOsum <- list(Fval=unlist(z)[7],probF=unlist(z)[9])
ANOsum
## $Fval
## F value1 
## 51.77739 
## 
## $probF
##      Pr(>F)1 
## 1.710036e-10
# Plot of ANOVA Data
ANOPlot <- ggplot(data=ANOdata,aes(x=TGroup,y=resVar,fill=TGroup)) +
           geom_boxplot()
print(ANOPlot)

# ggsave(filename="Plot2.pdf",plot=ANOPlot,device="pdf")