Homework 6: Simulating and Fitting Data Distributions

Fake Class Data

library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation
## Warning: package 'MASS' was built under R version 4.1.2
# quick and dirty, a truncated normal distribution to work on the solution set

# -------------- Read in Data Vector --------------
z <- rnorm(n=3000,mean=0.2)
z <- data.frame(1:3000,z)
names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
str(z)
## 'data.frame':    1756 obs. of  2 variables:
##  $ ID   : int  2 3 5 6 7 8 9 10 12 13 ...
##  $ myVar: num  1.191 0.545 0.497 0.715 0.449 ...
summary(z$myVar)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000472 0.382666 0.783620 0.905691 1.299918 4.002000
# ----------------- Plot Histogram of Data ------------
p1 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
  geom_histogram(color="grey60",fill="coral3",size=0.2) 
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## â„ą Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p1)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## â„ą Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# -------------- Add empirical density curve ----------
p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# ------------ Max Likelihood Parameters for Normal -----
normPars <- fitdistr(z$myVar,"normal")
print(normPars)
##       mean          sd    
##   0.90569139   0.65854109 
##  (0.01571523) (0.01111234)
str(normPars)
## List of 5
##  $ estimate: Named num [1:2] 0.906 0.659
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.0157 0.0111
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.000247 0 0 0.000123
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 1756
##  $ loglik  : num -1758
##  - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
##      mean 
## 0.9056914
# ----------- NORMAL Prob Density ------------------
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(z$myVar),len=length(z$myVar))

stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$myVar), args = list(mean = meanML, sd = sdML))
p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# -----------EXPONENTIAL Prob Density ---------------
# models continuous variables
expoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$myVar), args = list(rate=rateML))
p1 + stat + stat2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# --------- UNIFORM Prob Density ------------------
# assumes every outcome has same probability of occuring
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# ----------- GAMMA Prob Density ---------------
# Sum of expo distrubutions
# time series modeling
gammaPars <- fitdistr(z$myVar,"gamma")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# ----------- BETA Prob Density ---------------------
# generalization of uniform distribution
# bounded and continuous, no specific shape
# shape changes based on parameters
# used in bassian analysis, as a way to have an assumption
pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="skyblue2",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (`geom_bar()`).

Dolphin data from DRYAD (Allen et. al)

“Dynamic body acceleration as a proxy to predict the cost of locomotion in bottlenose dolphins”

# This study looked at the energetic cost of locomotion of bottlenose dolphins
# variable I chose to focus on was "ml of O2 per kg dolphin per minute"
z <- read.csv("Fig_3.csv",header=TRUE,sep=",")
str(z)
## 'data.frame':    93 obs. of  6 variables:
##  $ TrialID         : chr  "Trial17_08" "Trial17_11" "Trial17_12" "Trial17_16" ...
##  $ Animal_ID       : chr  "9ON6" "83H1" "01L5" "9ON6" ...
##  $ State           : chr  "SteadyState" "SteadyState" "SteadyState" "SteadyState" ...
##  $ ODBA_g          : num  0.309 0.327 0.155 0.218 0.194 ...
##  $ O2_ml_per_kg_min: num  13.93 13.64 5.57 13.69 7.85 ...
##  $ W_per_kg        : num  4.67 4.57 1.87 4.59 2.63 ...
z<-na.omit(z)
z$myVar <-z$O2_ml_per_kg_min
summary(z)
##    TrialID           Animal_ID            State               ODBA_g       
##  Length:93          Length:93          Length:93          Min.   :0.01074  
##  Class :character   Class :character   Class :character   1st Qu.:0.01641  
##  Mode  :character   Mode  :character   Mode  :character   Median :0.02953  
##                                                           Mean   :0.12768  
##                                                           3rd Qu.:0.22644  
##                                                           Max.   :0.52505  
##  O2_ml_per_kg_min     W_per_kg          myVar        
##  Min.   : 0.3877   Min.   :0.1299   Min.   : 0.3877  
##  1st Qu.: 2.3434   1st Qu.:0.7850   1st Qu.: 2.3434  
##  Median : 4.3945   Median :1.4722   Median : 4.3945  
##  Mean   : 5.6766   Mean   :1.9017   Mean   : 5.6766  
##  3rd Qu.: 7.8524   3rd Qu.:2.6305   3rd Qu.: 7.8524  
##  Max.   :16.4197   Max.   :5.5006   Max.   :16.4197
# Histogram of Data
p1 <- ggplot(data=z, aes(x=myVar, y=after_stat(density))) +
  geom_histogram(color="grey60",fill="royalblue3",size=0.2) 
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Gamma Probability Density (Fits Pretty Well)
gammaPars <- fitdistr(z$myVar,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat <- stat_function(aes(x = myVar, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
p1 + stat 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Normal Probability Density (DOES NOT MATCH WELL!)
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(z$myVar),len=length(z$myVar))

stat2 <- stat_function(aes(x = myVar, y = ..y..), fun = dnorm, colour="orchid", n = length(z$myVar), args = list(mean = meanML, sd = sdML))
p1 + stat + stat2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Simulation of Dolphin Data

# ------------------ Creating a New Simulation --------------------------
# Max Likelihood Parameters
normPars <- fitdistr(z$myVar,"normal")
print(normPars)
##      mean         sd    
##   5.6766070   4.3391717 
##  (0.4499511) (0.3181635)
str(normPars)
## List of 5
##  $ estimate: Named num [1:2] 5.68 4.34
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.45 0.318
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.202 0 0 0.101
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 93
##  $ loglik  : num -268
##  - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
##     mean 
## 5.676607
# Reading in New Vector
simul <- rnorm(n=3000,mean=5.676607)
simul <- data.frame(1:3000,simul)
names(simul) <- list("ID","myVar")
simul <- simul[simul$myVar>0,]
str(simul)
## 'data.frame':    3000 obs. of  2 variables:
##  $ ID   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ myVar: num  4.85 6.11 4.93 4.69 6.04 ...
summary(simul$myVar)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.123   4.996   5.672   5.681   6.362   8.939
# Histogram of Simulation
p2 <- ggplot(data=simul, aes(x=myVar, y=after_stat(density))) +
  geom_histogram(color="grey60",fill="darkslategrey",size=0.2) 
print(p2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Gamma Probability Density of Simulation
gammaPars <- fitdistr(z$myVar,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat <- stat_function(aes(x = myVar, y = ..y..), fun = dgamma, colour="brown", n = length(simul$myVar), args = list(shape=shapeML, rate=rateML))
p2 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Final Products!

# Fresh Plot of the Original Dolphin Data
print(p1 + stat)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Fresh Simulated Plot 
print(p2 + stat)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

How do the two histogram profiles compare? Do you think the model is doing a good job of simulating realistic data that match your original measurements? Why or why not?

Both histograms are similar in scale (by design) however the simulation is much closer to a what a normal distribution curve on the graph, especially because the mean of the original data is close to 5. However, both of their gamma probability density lines had similar shapes, what makes me believe that the simulation still did a good job of simulating realistic data, though the actual data was a bit more extreme, with more outliers than the simulation.