Homework

Part 1

library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation

# quick and dirty, a truncated normal distribution to work on the solution set
z <- rnorm(n=3000,mean=0.2)
z <- data.frame(1:3000,z)
names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
str(z)

## 'data.frame':    1745 obs. of  2 variables:
##  $ ID   : int  3 4 6 8 10 11 12 13 15 16 ...
##  $ myVar: num  0.119 1.133 0.266 0.326 3.511 ...

summary(z$myVar)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.001513 0.370689 0.769985 0.887691 1.242554 4.168616

# Plot histogram of data
p1 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Add imperical curve density
p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Get maximum likelihood parameters for normal
normPars <- fitdistr(z$myVar,"normal")
print(normPars)

##       mean          sd    
##   0.88769131   0.65826902 
##  (0.01575817) (0.01114271)

str(normPars)

## List of 5
##  $ estimate: Named num [1:2] 0.888 0.658
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.0158 0.0111
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.000248 0 0 0.000124
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 1745
##  $ loglik  : num -1746
##  - attr(*, "class")= chr "fitdistr"

normPars$estimate["mean"] # note structure of getting a named attribute

##      mean 
## 0.8876913

# Plot normal probability density
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(z$myVar),len=length(z$myVar))

stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$myVar), args = list(mean = meanML, sd = sdML))
p1 + stat

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Plot exponential probability density
expoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$myVar), args = list(rate=rateML))
p1 + stat + stat2

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Plot uniform probability density
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
p1 + stat + stat2 + stat3

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Plot gamma probability density
gammaPars <- fitdistr(z$myVar,"gamma")

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Plot beta probability density
pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 2 rows containing missing values (geom_bar).

Part 2

# quick and dirty, a truncated normal distribution to work on the solution set
z <- read.table('CleanData/Azevedo_Egg_size_&_ovariole_number_cleaned.csv',header = TRUE, sep = ',')
str(z)

## 'data.frame':    20 obs. of  9 variables:
##  $ Latitude                        : num  -16.9 -16.9 -20.2 -20.2 -23.1 ...
##  $ Longitude                       : num  145 145 148 148 151 ...
##  $ Year_start                      : int  1993 1993 1993 1993 1993 1993 1993 1993 1993 1993 ...
##  $ Year_end                        : int  1993 1993 1993 1993 1993 1993 1993 1993 1993 1993 ...
##  $ Mean_egg_size                   : num  0.0098 0.0105 0.0102 0.0103 0.0101 0.0107 0.0101 0.0107 0.0098 0.0099 ...
##  $ Confidence_limit_egg_size       : num  2e-04 2e-04 3e-04 1e-04 2e-04 3e-04 2e-04 2e-04 3e-04 1e-04 ...
##  $ Mean_ovariole_number            : num  19.1 20.2 20.1 20.7 19.5 ...
##  $ Confidence_limit_ovariole_number: num  1.01 1.16 1.52 1.42 1.49 ...
##  $ X                               : logi  NA NA NA NA NA NA ...

summary(z)

##     Latitude        Longitude       Year_start      Year_end   
##  Min.   :-42.88   Min.   :145.1   Min.   :1993   Min.   :1993  
##  1st Qu.:-35.82   1st Qu.:145.7   1st Qu.:1993   1st Qu.:1993  
##  Median :-29.14   Median :149.5   Median :1993   Median :1993  
##  Mean   :-29.21   Mean   :149.3   Mean   :1993   Mean   :1993  
##  3rd Qu.:-23.13   3rd Qu.:152.7   3rd Qu.:1993   3rd Qu.:1993  
##  Max.   :-16.88   Max.   :153.2   Max.   :1993   Max.   :1993  
##  Mean_egg_size     Confidence_limit_egg_size Mean_ovariole_number
##  Min.   :0.00980   Min.   :0.000100          Min.   :18.49       
##  1st Qu.:0.01010   1st Qu.:0.000200          1st Qu.:19.32       
##  Median :0.01045   Median :0.000200          Median :20.13       
##  Mean   :0.01045   Mean   :0.000205          Mean   :20.14       
##  3rd Qu.:0.01070   3rd Qu.:0.000200          3rd Qu.:20.61       
##  Max.   :0.01130   Max.   :0.000300          Max.   :23.35       
##  Confidence_limit_ovariole_number    X          
##  Min.   :0.7180                   Mode:logical  
##  1st Qu.:0.9405                   NA's:20       
##  Median :1.1300                                 
##  Mean   :1.1679                                 
##  3rd Qu.:1.4392                                 
##  Max.   :1.6880

# Plot histogram of data
p1 <- ggplot(data=z, aes(x=Mean_ovariole_number, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Add imperical curve density
p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Get maximum likelihood parameters for normal
normPars <- fitdistr(z$Mean_ovariole_number,"normal")
print(normPars)

##       mean          sd    
##   20.1404000    1.1116680 
##  ( 0.2485765) ( 0.1757701)

str(normPars)

## List of 5
##  $ estimate: Named num [1:2] 20.14 1.11
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.249 0.176
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.0618 0 0 0.0309
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 20
##  $ loglik  : num -30.5
##  - attr(*, "class")= chr "fitdistr"

normPars$estimate["mean"] # note structure of getting a named attribute

##    mean 
## 20.1404

# Plot normal probability density
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(z$Mean_ovariole_number),len=length(z$Mean_ovariole_number))

stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$Mean_ovariole_number), args = list(mean = meanML, sd = sdML))
p1 + stat

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Plot exponential probability density
expoPars <- fitdistr(z$Mean_ovariole_number,"exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$Mean_ovariole_number), args = list(rate=rateML))
p1 + stat + stat2

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Plot uniform probability density
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))

## Warning in min(z$myVar): no non-missing arguments to min; returning Inf

## Warning in max(z$myVar): no non-missing arguments to max; returning -Inf

p1 + stat + stat2 + stat3

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Plot gamma probability density
gammaPars <- fitdistr(z$Mean_ovariole_number,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$Mean_ovariole_number), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Plot beta probability density
pSpecial <- ggplot(data=z, aes(x=Mean_ovariole_number/(max(Mean_ovariole_number + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=z$Mean_ovariole_number/max(z$Mean_ovariole_number + 0.1),start=list(shape1=1,shape2=2),"beta")

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$Mean_ovariole_number), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 2 rows containing missing values (geom_bar).

Part 3 and 4

# quick and dirty, a truncated normal distribution to work on the solution set
z <- read.table('CleanData/Azevedo_Egg_size_&_ovariole_number_cleaned.csv',header = TRUE, sep = ',')
str(z)

## 'data.frame':    20 obs. of  9 variables:
##  $ Latitude                        : num  -16.9 -16.9 -20.2 -20.2 -23.1 ...
##  $ Longitude                       : num  145 145 148 148 151 ...
##  $ Year_start                      : int  1993 1993 1993 1993 1993 1993 1993 1993 1993 1993 ...
##  $ Year_end                        : int  1993 1993 1993 1993 1993 1993 1993 1993 1993 1993 ...
##  $ Mean_egg_size                   : num  0.0098 0.0105 0.0102 0.0103 0.0101 0.0107 0.0101 0.0107 0.0098 0.0099 ...
##  $ Confidence_limit_egg_size       : num  2e-04 2e-04 3e-04 1e-04 2e-04 3e-04 2e-04 2e-04 3e-04 1e-04 ...
##  $ Mean_ovariole_number            : num  19.1 20.2 20.1 20.7 19.5 ...
##  $ Confidence_limit_ovariole_number: num  1.01 1.16 1.52 1.42 1.49 ...
##  $ X                               : logi  NA NA NA NA NA NA ...

summary(z)

##     Latitude        Longitude       Year_start      Year_end   
##  Min.   :-42.88   Min.   :145.1   Min.   :1993   Min.   :1993  
##  1st Qu.:-35.82   1st Qu.:145.7   1st Qu.:1993   1st Qu.:1993  
##  Median :-29.14   Median :149.5   Median :1993   Median :1993  
##  Mean   :-29.21   Mean   :149.3   Mean   :1993   Mean   :1993  
##  3rd Qu.:-23.13   3rd Qu.:152.7   3rd Qu.:1993   3rd Qu.:1993  
##  Max.   :-16.88   Max.   :153.2   Max.   :1993   Max.   :1993  
##  Mean_egg_size     Confidence_limit_egg_size Mean_ovariole_number
##  Min.   :0.00980   Min.   :0.000100          Min.   :18.49       
##  1st Qu.:0.01010   1st Qu.:0.000200          1st Qu.:19.32       
##  Median :0.01045   Median :0.000200          Median :20.13       
##  Mean   :0.01045   Mean   :0.000205          Mean   :20.14       
##  3rd Qu.:0.01070   3rd Qu.:0.000200          3rd Qu.:20.61       
##  Max.   :0.01130   Max.   :0.000300          Max.   :23.35       
##  Confidence_limit_ovariole_number    X          
##  Min.   :0.7180                   Mode:logical  
##  1st Qu.:0.9405                   NA's:20       
##  Median :1.1300                                 
##  Mean   :1.1679                                 
##  3rd Qu.:1.4392                                 
##  Max.   :1.6880

# Plot gamma probability density
gammaPars <- fitdistr(z$Mean_ovariole_number,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

# rnorm distribution
rnormdist <- rnorm(500, shapeML, rateML)
#into data frame
dframe <- data.frame(rnormdist)

p2 <- ggplot(data = dframe, aes(x = rnormdist, y = ..density..)) +
  geom_histogram(color = 'grey60', fill = 'cornsilk', size = 0.2)
print(p2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Gamma Curve

#gamma curve
xval <- seq(0,max(dframe),len=length(dframe))
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(dframe), args = list(shape=shapeML, rate=rateML))
p3 <- p1 + stat4
print(p3)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?

Origional Curve

#histogram for origional data and curve
pSpecial <- ggplot(data=z, aes(x=Mean_ovariole_number/(max(Mean_ovariole_number + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")
print(pSpecial)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 2 rows containing missing values (geom_bar).

Histogram Comparison

The two histogram profiles look very similar. I think the gamma model does a good job of representing the distribution. They look almost exactly identical except for an extra bin that is at the end in the gamma distribution curve. Other than that the curves are very similar.