# ==============================================================================
# --------------------------------- SEMINAR 6 ----------------------------------
# ==============================================================================

# ----------------------------------- TASK 1 -----------------------------------

load("toss_a_coin.RData")
n_obs <- 200  # number of observations

# ...................................... A .....................................
# Decide if the data sample could possibly come from the binomial distribution.
# Explain your answer (for example using the histogram of the data).

hist(data, freq = FALSE, col = "red")
box()
# it looks like binomial data (revise the previous seminar tasks)

# ...................................... B .....................................
# Use the maximum likelihood method to estimate the parameter p of the binomial
# distribution Bi(n = 100, p) for these data.

# create a function that computes the negative log-likelihood of the binomial
# distribution:
negloglik.binom <- function(par, x, n_trials) {
  return(- sum(log(dbinom(x, size = n_trials, prob = par))))
}
# dbinom computes the densities (probabilities) for each element of x,
# log computes the logarithm of each value of the resulting vector,
# sum adds the vector up, and the minus sign flips the sign

# now we need to find the minimum of this function: its minimizer is the
# maximizer of the original log-likelihood function, i.e. our estimate of the
# parameter par

# univariate optimization:
op <- optimize(negloglik.binom, interval = c(0, 1), x = data, n_trials = 100)

# the results are stored in the list "op":
op$minimum    # estimate of the parameter p (the point of the minimum)
op$objective  # value of the negative log-likelihood at the minimum

# now compute the negative log-likelihood over a grid of parameter values:
parameter <- seq(0.01, 0.99, len = 200)  # x-axis of parameter values
neg_log_like <- rep(0, times = 200)      # prepare an empty vector of length 200
for (i in 1:200) {
  neg_log_like[i] <- negloglik.binom(parameter[i], x = data, n_trials = 100)
}
# for each value on the x-axis we compute the value of the negative
# log-likelihood function

# plot the negative log-likelihood function together with the numerical estimate:
plot(parameter, neg_log_like, type = "l", main = "Negative log likelihood")
points(op$minimum, op$objective, col = "red", pch = 16)
abline(v = op$minimum, col = "red")

# ...................................... C .....................................
# Plot the histogram of your data together with the density of the theoretical
# binomial distribution with the estimated parameter p.

xx <- 0:100  # x-axis
density.b <- dbinom(xx, 100, op$minimum)
hist(data, xlim = c(0, 100), freq = FALSE, col = "red")
points(xx, density.b, type = "o", pch = 19)
box()
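# as a small cross-check (not part of the original task): for the binomial
# distribution Bi(n_trials, p), the maximum likelihood estimator also has a
# closed form, p-hat = sample mean / number of trials, so the numerical
# result from optimize() can be verified directly:
(p_hat <- mean(data) / 100)
op$minimum  # should agree with p_hat up to the optimizer's tolerance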
# ----------------------------------- TASK 2 -----------------------------------

data <- read.csv("Computers.csv")

# ...................................... A .....................................
# Assume the price of 2 MB RAM computers follows a normal distribution. Use the
# maximum likelihood method to estimate the parameters mu and sigma of the
# normal distribution N(mu, sigma). Plot the histogram of your data together
# with the theoretical normal density.

# create a vector of price values for computers with RAM size 2:
price2 <- data[data$ram == 2, "price"]
n <- length(price2)

# estimate the parameters mu and sigma using the theoretical (closed-form)
# maximum likelihood estimators:
(mu <- mean(price2))
(sigma <- sqrt(sum((price2 - mu)^2) / n))

# plot the histogram of price2 together with the density of the normal
# distribution with the estimated parameters mu and sigma:
xx <- seq(700, 2500, len = 100)
density.n <- dnorm(xx, mu, sigma)
hist(price2, xlim = c(700, 2500), freq = FALSE, ylim = c(0, 0.0015), col = "green")
lines(xx, density.n, lwd = 3)
box()
# the choice of the theoretical model (normal distribution) looks appropriate

# ...................................... B .....................................
# Assume the price of 4 MB RAM computers follows a normal distribution. Use the
# maximum likelihood method to estimate the parameters mu and sigma of the
# normal distribution N(mu, sigma). Plot the histogram of your data together
# with the theoretical normal density. Based on the histogram, do you think the
# assumption was correct?

# create a vector of price values for computers with RAM size 4:
price4 <- data[data$ram == 4, "price"]
n <- length(price4)

# estimate the parameters mu and sigma using the theoretical (closed-form)
# maximum likelihood estimators:
(mu <- mean(price4))
(sigma <- sqrt(sum((price4 - mu)^2) / n))

# plot the histogram of price4 together with the density of the normal
# distribution with the estimated parameters mu and sigma:
xx <- seq(700, 2500, len = 100)
density.n <- dnorm(xx, mu, sigma)
hist(price4, xlim = c(700, 2500), freq = FALSE, ylim = c(0, 0.0015), col = "green")
lines(xx, density.n, lwd = 3)
box()
# the data could be normal, but the fit is not great;
# let's try computing the logarithm of the data and see what happens (subtask (C))

# the QQ-plot shows that the distribution differs from the normal one;
# in particular, the high (upper-tail) quantiles are much larger in the sample
# than they should be if a normal distribution were adequate
# (the theory behind the QQ-plot will be covered in one of the lectures ahead)
qqnorm(price4, pch = 19)
qqline(price4, col = "blue")

# ...................................... C .....................................
# Consider the logarithm of the price of 4 MB RAM computers instead of the
# values themselves and do the same as in subtask (B).

logprice4 <- log(price4)
n <- length(logprice4)

# estimate the parameters mu and sigma using the theoretical (closed-form)
# maximum likelihood estimators:
(mu <- mean(logprice4))
(sigma <- sqrt(sum((logprice4 - mu)^2) / n))

# plot the histogram of logprice4 together with the density of the normal
# distribution with the estimated parameters mu and sigma:
xx <- seq(7, 8.2, len = 100)
density.n <- dnorm(xx, mu, sigma)
hist(logprice4, freq = FALSE, col = "green")
lines(xx, density.n, lwd = 3)
box()
# the fit is better in the right tail

qqnorm(logprice4, pch = 16)
qqline(logprice4, col = "blue")
# the logarithm of price4 seems to be normal (the fit is better than for price4
# itself), so we have improved our assumed model
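# a small extension (not part of the original task): if log(price4) is normally
# distributed with parameters mu and sigma, then price4 itself follows a
# lognormal distribution with the same parameters, so the improved fit can also
# be checked back on the original price scale using dlnorm (mu and sigma here
# are the log-scale estimates from subtask (C)):
xx <- seq(700, 2500, len = 200)
hist(price4, xlim = c(700, 2500), freq = FALSE, ylim = c(0, 0.0015), col = "green")
lines(xx, dlnorm(xx, meanlog = mu, sdlog = sigma), lwd = 3)
box()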