#open libraries
library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation
# quick and dirty, a truncated normal distribution to work on the solution set
#read in data vector
z <- rnorm(n=3000,mean=0.2)
z <- data.frame(1:3000,z)
names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
str(z)
## 'data.frame': 1733 obs. of 2 variables:
## $ ID : int 1 4 5 6 7 8 9 10 11 12 ...
## $ myVar: num 1.196 0.558 0.332 0.188 0.173 ...
summary(z$myVar)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.001457 0.347645 0.737156 0.868051 1.263470 3.907087
#histogram
p1 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p1)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#empirical density curve
p1 <- p1 + geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#maximum likelihood parameters
normPars <- fitdistr(z$myVar,"normal")
print(normPars)
## mean sd
## 0.86805094 0.65575801
## (0.01575231) (0.01113857)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 0.868 0.656
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.0158 0.0111
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 0.000248 0 0 0.000124
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 1733
## $ loglik : num -1728
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
## mean
## 0.8680509
#normal
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0,max(z$myVar),len=length(z$myVar))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$myVar), args = list(mean = meanML, sd = sdML))
p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#exponential
expoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$myVar), args = list(rate=rateML))
p1 + stat + stat2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#uniform
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#gamma
gammaPars <- fitdistr(z$myVar,"gamma")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#beta
pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
z <- read.csv("kbsampledata.csv", header=TRUE,sep=",")
names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
str(z)
## 'data.frame': 71 obs. of 2 variables:
## $ ID : int 210 170 189 194 171 181 192 175 174 163 ...
## $ myVar: num 78.4 48.2 69.8 69 43.5 ...
summary(z)
## ID myVar
## Min. :145.0 Min. : 30.30
## 1st Qu.:166.0 1st Qu.: 45.47
## Median :175.0 Median : 52.28
## Mean :176.6 Mean : 55.33
## 3rd Qu.:185.5 3rd Qu.: 65.37
## Max. :233.0 Max. :141.66
#histogram
p1 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#empirical density curve
p1 <- p1 + geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#maximum likelihood parameters
normPars <- fitdistr(z$myVar,"normal")
print(normPars)
## mean sd
## 55.326901 16.212602
## ( 1.924082) ( 1.360531)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 55.3 16.2
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 1.92 1.36
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 3.7 0 0 1.85
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 71
## $ loglik : num -299
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
## mean
## 55.3269
#normal
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0,max(z$myVar),len=length(z$myVar))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$myVar), args = list(mean = meanML, sd = sdML))
p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#exponential
expoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$myVar), args = list(rate=rateML))
p1 + stat + stat2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#uniform
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#gamma
gammaPars <- fitdistr(z$myVar,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#beta
pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
Gamma!
Here are my maximum likelihood parameters:
mean <- 55.326901
sd <- 16.212602
shape <- (mean^2)/sd
scale <- sd/mean
To simulate a new dataset:
z <- rgamma(n=71,shape=shape, scale=scale)
z <- data.frame(1:71,z)
names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
str(z)
## 'data.frame': 71 obs. of 2 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ myVar: num 56.8 59.1 53.9 60 60.8 ...
summary(z$myVar)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 47.44 51.48 54.40 54.80 58.33 64.78
Histogram:
p1 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
With probability density curve:
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shape, rate=(1/scale)))
p1 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Original histogram:
z <- read.csv("kbsampledata.csv", header=TRUE,sep=",")
names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
str(z)
## 'data.frame': 71 obs. of 2 variables:
## $ ID : int 210 170 189 194 171 181 192 175 174 163 ...
## $ myVar: num 78.4 48.2 69.8 69 43.5 ...
summary(z)
## ID myVar
## Min. :145.0 Min. : 30.30
## 1st Qu.:166.0 1st Qu.: 45.47
## Median :175.0 Median : 52.28
## Mean :176.6 Mean : 55.33
## 3rd Qu.:185.5 3rd Qu.: 65.37
## Max. :233.0 Max. :141.66
#histogram
p1 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
With probability density curve:
gammaPars <- fitdistr(z$myVar,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
p1 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The simulation performed alright with this data set. The only thing that stuck out to me was the fact that with my original data, the gamma distribution didn’t follow the highest peaks of the histogram. Instead, it seemed to follow a smaller value (probably the mean).