# STATISTICAL INFERENCE

library(faraway)

# MLE tossing a coin #####
# (inspired by Adams (2021) section 5.4.2)
set.seed(9403)
coinFlips <- rbinom(500,size=1,prob=0.2)

#if the coin was fair, how likely these 97 heads would be? 
dbinom(97,size=500,prob=0.5)

#the true value of p is 0.2
choose(500, 97) * 0.2^97 * 0.8^403
dbinom(97,size=500,prob=0.2)
# but we don't know it, we only know our sample

#define log-likelihood
logLikelihood <- function(p){
  return(-dbinom(97,size=500,prob=p, log=TRUE))
}

optimize(f = logLikelihood, interval = c(0,1))
#we get 0.1939823

pVector <- seq(0,1,by=0.01)
plot(pVector, -logLikelihood(pVector), type="l",
     xlab="p",ylab="log likelihood")
abline(v=0.1939823)
abline(v=0.2, col="red")


# MLE motivation - Challenger Crash investigation #####
library(faraway)
library(ggplot2)
library(MASS)

#load the data
data(orings, package="faraway")

#convert into units we understand
orings$tempC <- (orings$temp - 32) * (5 / 9)

#plot the data
ggplot(
  data = orings,
  aes(x = tempC, y = damage)
) +
  geom_point(size = 5) +
  labs(
    x = "Temperature (°C)",
    y = "# Failed orings",
    title = "Challenger crash investigation",
    subtitle = ""
  )


#try Linear Probability Model
ggplot(
  data = orings,
  aes(x = tempC, y = damage)
) +
  geom_point(size = 5) +
  geom_smooth(
    method = "lm",
    formula = y ~ x,
    se = T
  )+
  labs(
    x = "Temperature (°C)",
    y = " # Failed oring",
    title = "Challenger crash investigation",
    subtitle = " "
  )


#we may try quadratic fit
ggplot(
  data = orings,
  aes(x = tempC, y = damage)
) +
  geom_point(size = 5) +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x,2) 
  )+
  labs(
    x = "Temperature (°C)",
    y = " # Failed oring",
    title = "Challenger crash investigation",
    subtitle = " "
  )


# If the model is too complex it may lead to non-sensical predictions
# (overfitted model)
ggplot(
  data = orings,
  aes(x = tempC, y = damage)
) +
  geom_point(size = 5) +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x,7)
  )+
  labs(
    x = "Temperature (°C)",
    y = " # Failed oring",
    title = "Challenger crash investigation",
    subtitle = ""
  )

# GLM
ggplot(
  data = orings,
  aes(x = tempC, y = damage)
) +
  geom_point(size = 5,aes(x = tempC, y = damage/6)) +
  geom_smooth(
    method = "glm",
    formula = cbind(y,6-y) ~ x, 
    se = T, method.args = list(family = "binomial"),fullrange = F
  )+
  labs(
    x = "Temperature (°C)",
    y = "Prob. of failure",
    title = "Challenger crash investigation",
    subtitle = " "
  )


#now visualize the temperature during the Challenger launch
ggplot(
  data = orings,
  aes(x = tempC, y = damage)
) +
  geom_point(size = 5,aes(x = tempC, y = damage/6)) +
  geom_smooth(
    method = "glm",
    formula = cbind(y,6-y) ~ x, 
    se = T, method.args = list(family = "binomial"),fullrange = TRUE
  )+
  scale_x_continuous(limit = c(-2, NA))+
  labs(
    x = "Temperature (°C)",
    y = "Prob. of failure",
    title = "Challenger crash investigation",
    subtitle = " "
  )+
  geom_vline(aes(xintercept = -0.5),
             linetype = "dashed") 


plot(damage/6 ~ tempC, orings, xlim=c(-5,30), ylim = c(0,1), xlab="Temperature (°C)",
     ylab="Prob. of failure", main = "Challenger crash investigation")

lmod <- lm(damage/6 ~ tempC, orings)
abline(lmod)


logitmod <- glm(cbind(damage,6-damage) ~ tempC, family=binomial, orings)
sumlogit <- summary(logitmod)
plot(damage/6 ~ tempC, orings, xlim=c(-5,30), ylim = c(0,1), xlab="Temperature (°C)",
     ylab="Prob. of failure", main = "Challenger crash investigation")

x <- seq(-5,30,1)

lines(x,ilogit(2.5-0.2*x))
lines(x,ilogit(-0.5-0.2*x))
lines(x,ilogit(1.5-0.5*x))
lines(x,ilogit(1.5-0.1*x))

lines(x,ilogit(1.6-0.8*x))
lines(x,ilogit(-0.1-7*x))
lines(x,ilogit(6.5-0.8*x))
lines(x,ilogit(7.5-0.3*x))

lines(x,ilogit(-3.9+0.8*x))
lines(x,ilogit(-10+0.7*x))
lines(x,ilogit(-5.5+0.2*x))
lines(x,ilogit(3.5+0.5*x))

#Among these lines, which one will we pick?


# we pick the one that is most compatible with the data (in a likelihood sense)
lines(x,ilogit(sumlogit$coeff[1,1]+sumlogit$coeff[2,1]*x),col="red",lwd=4)

# we may try a different link function - probit instead of logit
probitmod <- glm(cbind(damage,6-damage) ~ tempC, family=binomial(link=probit), orings)
sumprobit <- summary(probitmod)
lines(x,ilogit(sumprobit$coeff[1,1]+sumprobit$coeff[2,1]*x),col="blue",lwd=4)

#prediction
ilogit(sumlogit$coeff[1,1]+sumlogit$coeff[2,1]*(-0.5))
pnorm(sumprobit$coeff[1,1]+sumprobit$coeff[2,1]*(-0.5))


# MLE Exponential distribution #####
set.seed(849)

lambda <- 6
n <- 100
X <- round(rexp(n,rate=1/lambda),2)
#check parametrization
?rexp 

#define likelihood function
likFunction <- function(lambda){
  prod(dexp(X,rate=1/lambda))
  #or -log(lambda) - X/lambda
}

#define log-likelihood function
logLikFunction <- function(lambda){
  sum(log(dexp(X,rate=1/lambda)))
  #or sum(dexp(X,rate=1/lambda,log=TRUE)))
  #or -n*log(lambda) - n*mean(X)/lambda
}


#look at the values at some grid
lambdaVec <- seq(4,8,1/10)

# evaluate the likelihood
likValues <- sapply(lambdaVec,likFunction)

# evaluate the log-likelihood
logLikValues  <- sapply(lambdaVec,logLikFunction)

#plot it
plot(lambdaVec,likValues)
plot(lambdaVec,logLikValues)

#look at the maximizer (on the grid of values of lambda)
lambdaVec[which.max(likValues)]

#or use "optimize" function to find the maximum
optimize(f = logLikFunction, interval = c(0,10), maximum=TRUE)

#or define the 1st derivative of the likelihood function S_n 
scoreFunction <- function(lambda){
  -n/lambda + n*mean(X)/(lambda^2)
}

#we may check it against the numerical derivative
# (to see if we haven't made any mistake)
numScoreFunction <- function(lambda, d=0.00001){
  (logLikFunction(lambda+d)-logLikFunction(lambda-d))/(2*d)
}

numScoreValues <- sapply(lambdaVec,numScoreFunction)
scoreValues    <- sapply(lambdaVec,scoreFunction)

max(abs(scoreValues - numScoreValues))
#max differences are small, it seems we have not miscoded it

plot(lambdaVec, scoreValues)
lines(lambdaVec, numScoreValues, col="red")
abline(h=0)

# Find a solution to S_n(x) = 0
rootFind <- uniroot(scoreFunction, interval = c(1,10))

# we get the same number as from the optimization
(lambdaMLE <- rootFind$root)

#Look at the Likelihood Hessian
likelihoodHessian <- function(lambda){
  -n/(lambda^2) + 2*n*mean(X)/(lambda^3)
}

likelihoodHessian(lambdaMLE)
# this is a positive number 
# and hence the log-lik. function is concave at the optimum
# (it is MINUS the second derivative)

likHessianValues <- sapply(lambdaVec,likelihoodHessian)
plot(lambdaVec, likHessianValues,ylim= c(-1,max(likHessianValues)))
abline(h=0)
# it appears GLOBALLY concave
# (it indeed is, as we know the function form)


#plot the estimated density and several values
plot(density(X),xlim=c(0,20))
xValues <- seq(0,20,0.1)
lines(xValues,dexp(xValues,1/4),col="red")
lines(xValues,dexp(xValues,1/6),col="blue")
lines(xValues,dexp(xValues,1/8),col="green")


# Bootstrap - rolling dices #####
throws = c(1,1,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,5,5,5,6,6,6,6)
length(throws)
mean(throws)
freq <- c(2,1,8,7,3,4)
numbers <- c(1,2,3,4,5,6)
t(numbers)%*%freq/sum(freq)

hist(throws, breaks=0.5+0:6)


#data sample is all we have
set.seed(35)

#"fake" dataset
throws2 = sample(throws, 25, replace=TRUE) 
hist(throws2, breaks=0.5+0:6, border = "red")

#another "fake" dataset
throws3 = sample(throws, 25, replace=TRUE) 
hist(throws3, breaks=0.5+0:6, border = "green")

#yet another "fake" dataset
throws4 = sample(throws, 25, replace=TRUE) 
hist(throws4, breaks=0.5+0:6, border = "blue")

#let's do it 5000 times
nb = 5000 ; bet = NULL ; n = length(throws);
ptm <- proc.time() # have a look at the time it takes
for (i in 1:nb){
  unifnum = sample(c(1:n),n,replace = T)  # pick random indices
  bet[i] = mean(throws[unifnum])
  #bet[i] = sample(throws, 25, replace=TRUE) # or use this formulation
}
proc.time() - ptm #how long did it take?


hist(bet, breaks=0.5+0:5)
#histogram not very helpful

#look at density estimate
plot(density(bet))
plot(density(bet,bw=.03,kernel="gaussian"))

#how often would we observe number 3?
mean(bet<3)

mean(bet<3.5)

#or use boot library (by Brian Ripley)
library(boot)
samplemean <- function(x, d) {
  return(mean(x[d]))
}
bootbet = boot(data=throws, statistic=samplemean, R=5000) 
plot(bootbet)

# Bootstrap - linear regression #####
set.seed(8322)
n <- 20
x <- rexp(n)
e <- rexp(n)
y <- 2 + 3*x + e

lmod <- lm(y ~ x)
# we observe intercept = 2.041 and slope = 2.806

#run many bootstrap samples
nB <- 10000
resultsB <- numeric(nB)

for (b in 1:nB){
  id <- sample(1:n,replace=TRUE)
  lmodb <- lm(y[id] ~ x[id])
  resultsB[b] <- lmodb$coeff[2]
}

# plot the bootstrapped betas
plot(density(resultsB))

#95% non-parametric bootstrap CI
quantile(resultsB, probs=c(0.025,0.975))

#parametric bootstrap
# assume normality, use bootstrap only to estimate the standard errors
se <- sd(resultsB)
c(2.806 - 1.96*se,2.806 + 1.96*se)


# *Bootstrap - non-parametric estimator #####
#Stampes p227 Introduction to the Bootstrap, Efron, Tibshirani
#dataset is in the bootstrap library
library(bootstrap)
data(stamp)
summary(stamp)

library(stats)
kde1 <- density(stamp$Thickness,bw=.003,kernel="gaussian")
kde2 <- density(stamp$Thickness,bw=.008,kernel="gaussian")
kde3 <- density(stamp$Thickness,bw=.001,kernel="gaussian")
kde4 <- density(stamp$Thickness,bw=0.006713547,kernel="gaussian")
plot(kde4$x,kde4$y,type="line")

plot(kde3$x,kde3$y,type='l',col="blue",main="Bandwidth = ",ylab="Density",xlab="Thickness")

#"reproduce" graph on p.228
par(mfrow=c(2,2))
hist(stamp$Thickness,breaks=0.06+0.005*0:15, main = "Histogram",xlab="Thickness")
plot(kde1$x,kde1$y,type='l',main="Bandwidth = 0.003",ylab="Density",xlab="Thickness")
plot(kde2$x,kde2$y,type='l',col="red",main="Bandwidth = 0.008",ylab="Density",xlab="Thickness")
plot(kde3$x,kde3$y,type='l',col="blue",main="Bandwidth = 0.001",ylab="Density",xlab="Thickness")

0.006713547

#how to calculate number of modes of a functions?
#stackexchange found this, after modification it works
#http://stackoverflow.com/questions/16255622/peak-of-the-kernel-density-estimation
n.modes <- function(x,bww) {  
  den <- density(x, bw=bww, kernel=c("gaussian"))
  den.s <- smooth.spline(den$x, den$y, all.knots=TRUE, spar=0.8)
  s.0 <- predict(den.s, den.s$x, deriv=0)
  s.1 <- predict(den.s, den.s$x, deriv=1)
  s.derv <- data.frame(s0=s.0$y, s1=s.1$y)
  nmodes <- length(rle(den.sign <- sign(s.derv$s1))$values)/2
  if ((nmodes > 10) == TRUE) { nmodes <- 10 }
  if (is.na(nmodes) == TRUE) { nmodes <- 0 } 
  return( nmodes )
}

myBisection <- function(x){
  a = 0.0001
  b = 1
  nSteps = 20
  for (i in 1:nSteps){
    ab = (a+b)/2
    if (n.modes(x,ab) > 1){
      a <- (a+b)/2
    } else {
      b <- (a+b)/2
    }
  }
  return((a+b)/2)
}

hath1 = myBisection(stamp$Thickness)
# 0.006713547
var(stamp$Thickness)
# 0.0002239209

smallestWindow <- function(d,i){
  y = d[i]
  #KS density smoothing artificially increased variance
  #this sampling will ensure that the variance of the estimate, so we rescale it
  #so that it has the same variance as our sample
  x = mean(y) + (1+0.0068^2/0.0002239209)^(-0.5)*(y-mean(y)+0.0068*rnorm(485))
  h = myBisection(x)
  return(h)
}


bootbet = boot(data=stamp$Thickness, statistic=smallestWindow, R=5000) 
summary(bootbet)
#plot(bootbet)
mean(bootbet$t > hath1)
#0.0004