library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation
# quick and dirty, a truncated normal distribution to work on the solution set
z <- rnorm(n=3000,mean=0.2)
z <- data.frame(1:3000,z)
names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
str(z)
## 'data.frame': 1745 obs. of 2 variables:
## $ ID : int 3 4 6 8 10 11 12 13 15 16 ...
## $ myVar: num 0.119 1.133 0.266 0.326 3.511 ...
summary(z$myVar)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.001513 0.370689 0.769985 0.887691 1.242554 4.168616
# Plot histogram of data
p1 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Add imperical curve density
p1 <- p1 + geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Get maximum likelihood parameters for normal
normPars <- fitdistr(z$myVar,"normal")
print(normPars)
## mean sd
## 0.88769131 0.65826902
## (0.01575817) (0.01114271)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 0.888 0.658
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.0158 0.0111
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 0.000248 0 0 0.000124
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 1745
## $ loglik : num -1746
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
## mean
## 0.8876913
# Plot normal probability density
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0,max(z$myVar),len=length(z$myVar))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$myVar), args = list(mean = meanML, sd = sdML))
p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Plot exponential probability density
expoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$myVar), args = list(rate=rateML))
p1 + stat + stat2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Plot uniform probability density
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Plot gamma probability density
gammaPars <- fitdistr(z$myVar,"gamma")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Plot beta probability density
pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
# quick and dirty, a truncated normal distribution to work on the solution set
z <- read.table('CleanData/Azevedo_Egg_size_&_ovariole_number_cleaned.csv',header = TRUE, sep = ',')
str(z)
## 'data.frame': 20 obs. of 9 variables:
## $ Latitude : num -16.9 -16.9 -20.2 -20.2 -23.1 ...
## $ Longitude : num 145 145 148 148 151 ...
## $ Year_start : int 1993 1993 1993 1993 1993 1993 1993 1993 1993 1993 ...
## $ Year_end : int 1993 1993 1993 1993 1993 1993 1993 1993 1993 1993 ...
## $ Mean_egg_size : num 0.0098 0.0105 0.0102 0.0103 0.0101 0.0107 0.0101 0.0107 0.0098 0.0099 ...
## $ Confidence_limit_egg_size : num 2e-04 2e-04 3e-04 1e-04 2e-04 3e-04 2e-04 2e-04 3e-04 1e-04 ...
## $ Mean_ovariole_number : num 19.1 20.2 20.1 20.7 19.5 ...
## $ Confidence_limit_ovariole_number: num 1.01 1.16 1.52 1.42 1.49 ...
## $ X : logi NA NA NA NA NA NA ...
summary(z)
## Latitude Longitude Year_start Year_end
## Min. :-42.88 Min. :145.1 Min. :1993 Min. :1993
## 1st Qu.:-35.82 1st Qu.:145.7 1st Qu.:1993 1st Qu.:1993
## Median :-29.14 Median :149.5 Median :1993 Median :1993
## Mean :-29.21 Mean :149.3 Mean :1993 Mean :1993
## 3rd Qu.:-23.13 3rd Qu.:152.7 3rd Qu.:1993 3rd Qu.:1993
## Max. :-16.88 Max. :153.2 Max. :1993 Max. :1993
## Mean_egg_size Confidence_limit_egg_size Mean_ovariole_number
## Min. :0.00980 Min. :0.000100 Min. :18.49
## 1st Qu.:0.01010 1st Qu.:0.000200 1st Qu.:19.32
## Median :0.01045 Median :0.000200 Median :20.13
## Mean :0.01045 Mean :0.000205 Mean :20.14
## 3rd Qu.:0.01070 3rd Qu.:0.000200 3rd Qu.:20.61
## Max. :0.01130 Max. :0.000300 Max. :23.35
## Confidence_limit_ovariole_number X
## Min. :0.7180 Mode:logical
## 1st Qu.:0.9405 NA's:20
## Median :1.1300
## Mean :1.1679
## 3rd Qu.:1.4392
## Max. :1.6880
# Plot histogram of data
p1 <- ggplot(data=z, aes(x=Mean_ovariole_number, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Add imperical curve density
p1 <- p1 + geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Get maximum likelihood parameters for normal
normPars <- fitdistr(z$Mean_ovariole_number,"normal")
print(normPars)
## mean sd
## 20.1404000 1.1116680
## ( 0.2485765) ( 0.1757701)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 20.14 1.11
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.249 0.176
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 0.0618 0 0 0.0309
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 20
## $ loglik : num -30.5
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
## mean
## 20.1404
# Plot normal probability density
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0,max(z$Mean_ovariole_number),len=length(z$Mean_ovariole_number))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$Mean_ovariole_number), args = list(mean = meanML, sd = sdML))
p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Plot exponential probability density
expoPars <- fitdistr(z$Mean_ovariole_number,"exponential")
rateML <- expoPars$estimate["rate"]
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$Mean_ovariole_number), args = list(rate=rateML))
p1 + stat + stat2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Plot uniform probability density
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
## Warning in min(z$myVar): no non-missing arguments to min; returning Inf
## Warning in max(z$myVar): no non-missing arguments to max; returning -Inf
p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Plot gamma probability density
gammaPars <- fitdistr(z$Mean_ovariole_number,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$Mean_ovariole_number), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Plot beta probability density
pSpecial <- ggplot(data=z, aes(x=Mean_ovariole_number/(max(Mean_ovariole_number + 0.1)), y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
betaPars <- fitdistr(x=z$Mean_ovariole_number/max(z$Mean_ovariole_number + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$Mean_ovariole_number), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
# quick and dirty, a truncated normal distribution to work on the solution set
z <- read.table('CleanData/Azevedo_Egg_size_&_ovariole_number_cleaned.csv',header = TRUE, sep = ',')
str(z)
## 'data.frame': 20 obs. of 9 variables:
## $ Latitude : num -16.9 -16.9 -20.2 -20.2 -23.1 ...
## $ Longitude : num 145 145 148 148 151 ...
## $ Year_start : int 1993 1993 1993 1993 1993 1993 1993 1993 1993 1993 ...
## $ Year_end : int 1993 1993 1993 1993 1993 1993 1993 1993 1993 1993 ...
## $ Mean_egg_size : num 0.0098 0.0105 0.0102 0.0103 0.0101 0.0107 0.0101 0.0107 0.0098 0.0099 ...
## $ Confidence_limit_egg_size : num 2e-04 2e-04 3e-04 1e-04 2e-04 3e-04 2e-04 2e-04 3e-04 1e-04 ...
## $ Mean_ovariole_number : num 19.1 20.2 20.1 20.7 19.5 ...
## $ Confidence_limit_ovariole_number: num 1.01 1.16 1.52 1.42 1.49 ...
## $ X : logi NA NA NA NA NA NA ...
summary(z)
## Latitude Longitude Year_start Year_end
## Min. :-42.88 Min. :145.1 Min. :1993 Min. :1993
## 1st Qu.:-35.82 1st Qu.:145.7 1st Qu.:1993 1st Qu.:1993
## Median :-29.14 Median :149.5 Median :1993 Median :1993
## Mean :-29.21 Mean :149.3 Mean :1993 Mean :1993
## 3rd Qu.:-23.13 3rd Qu.:152.7 3rd Qu.:1993 3rd Qu.:1993
## Max. :-16.88 Max. :153.2 Max. :1993 Max. :1993
## Mean_egg_size Confidence_limit_egg_size Mean_ovariole_number
## Min. :0.00980 Min. :0.000100 Min. :18.49
## 1st Qu.:0.01010 1st Qu.:0.000200 1st Qu.:19.32
## Median :0.01045 Median :0.000200 Median :20.13
## Mean :0.01045 Mean :0.000205 Mean :20.14
## 3rd Qu.:0.01070 3rd Qu.:0.000200 3rd Qu.:20.61
## Max. :0.01130 Max. :0.000300 Max. :23.35
## Confidence_limit_ovariole_number X
## Min. :0.7180 Mode:logical
## 1st Qu.:0.9405 NA's:20
## Median :1.1300
## Mean :1.1679
## 3rd Qu.:1.4392
## Max. :1.6880
# Plot gamma probability density
gammaPars <- fitdistr(z$Mean_ovariole_number,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
# rnorm distribution
rnormdist <- rnorm(500, shapeML, rateML)
#into data frame
dframe <- data.frame(rnormdist)
p2 <- ggplot(data = dframe, aes(x = rnormdist, y = ..density..)) +
geom_histogram(color = 'grey60', fill = 'cornsilk', size = 0.2)
print(p2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#gamma curve
xval <- seq(0,max(dframe),len=length(dframe))
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(dframe), args = list(shape=shapeML, rate=rateML))
p3 <- p1 + stat4
print(p3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
#histogram for origional data and curve
pSpecial <- ggplot(data=z, aes(x=Mean_ovariole_number/(max(Mean_ovariole_number + 0.1)), y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
print(pSpecial)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
The two histogram profiles look very similar. I think the gamma model does a good job of representing the distribution. They look almost exactly identical except for an extra bin that is at the end in the gamma distribution curve. Other than that the curves are very similar.