# Title: Intro to R and statistical background - part II
# Environment: RStudio
# Goal: Learn basic data types, functions and data structures
# Course: AI in Finance
# Date: 23.09.2024
# Author: Martina Halousková, Štefan Lyócsa (small contribution)
# License: GPL>=3


# First, create a project and clean up your environment.
# Either click on the broom icon top right in the environment tab, or use this command:
# rm(list=ls())
# This will clean everything from the workspace - all your progress. Use it wisely!


##############################################################
###########  SOME MORE FUN STUFF WITH DATAFRAMES #############  
##############################################################

# Installing and loading packages.
# install.packages("datasets")
library("datasets")

# Bring the built-in iris data into the workspace and keep a working copy
# under the shorter name DT.
data(list = "iris")
DT = iris
# Calling data() without arguments lists the datasets that are available
# in the currently attached packages.
data()

# Time to get acquainted with the dataset.
# Sepals - the modified leaves that form the outer whorl of a flower.
# Petals - the thin coloured parts which together form the flower.
# The sepal and petal measurements are what you use to identify the type of a flower.
DT                  # the whole data frame
head(DT, n = 6L)    # first six rows
tail(DT, n = 6L)    # last six rows
dim(DT)             # number of rows and columns
colnames(DT)        # variable names
class(DT)           # class of the object

# Are there any missing values?
# is.na() returns a logical matrix laid out like DT (TRUE = missing).
is.na(DT)
# How many missing values are there in total?
sum(is.na(DT))
# If there are any, where are they? arr.ind = TRUE reports the row and column
# of every missing value instead of a single position index that is hard to read.
which(is.na(DT), arr.ind = TRUE)
# A simple way of dealing with missing values is deleting rows that include missing values.
# complete.cases() is TRUE for rows without any NA. The result is only printed here;
# assign it to an object if you want to keep the cleaned data.
DT[complete.cases(DT), ]

# What are the sums of rows and columns?
# The first call fails on purpose. try() still shows the error message, but
# lets the script continue when it is run in one go (e.g. with source()).
try(colSums(DT)) # returns an error
# The previous code returned an error. Do you know why?
# Leaving out the fifth column makes both calls work.
colSums(DT[, -5])
rowSums(DT[, -5])

# Descriptive statistics for every column of the dataset.
summary(object = DT)

# What are the 3 main species?
unique(x = DT$Species)

# How many flowers belong to each species?
table(DT$Species)

# The same summary as a picture: one box per column (fifth column left out).
boxplot(x = DT[, -5])


#### Plots ####

# Lets visualize the dataset with a simple function plot().
plot(DT)  # or pairs(DT)

# Are the variables correlated?
# The first call fails on purpose. try() still shows the error message, but
# lets the script continue when it is run in one go (e.g. with source()).
try(cor(DT))   # returns error, why?
cor(DT[, -5])

# Zoom in on a single pair of variables with a scatterplot.
plot(x = DT$Sepal.Length, y = DT$Petal.Length)

# Run a formal correlation test for this pair.
cor.test(x = DT$Sepal.Length, y = DT$Petal.Length)

# Fit a linear model: petal length explained by sepal length.
model = lm(formula = Petal.Length ~ Sepal.Length, data = DT)
summary(object = model)

# Redraw the scatterplot and overlay the regression line of the estimated model.
plot(x = DT$Sepal.Length, y = DT$Petal.Length)
abline(reg = model)



# Take a closer look at Petal.Length. Does anything look suspicious?
plot(x = DT$Petal.Length)
barplot(height = DT$Petal.Length)
hist(x = DT$Petal.Length)
table(DT$Petal.Length)
table(DT$Species, DT$Petal.Length)
# Same index plot again, now with small filled dots coloured by species.
plot(x = DT$Petal.Length, pch = 19, cex = 0.5, col = DT$Species)
# There seems to be a relationship between petal length and 'species'.
boxplot(Petal.Length ~ Species, data = DT)

# The same boxplot with a title, axis labels and one colour per species.
boxplot(
  Petal.Length ~ Species,
  data = DT,
  col = c("red", "green", "blue"),
  main = "Petal Length in 3 species of iris",
  xlab = "Species of flowers",
  ylab = "Petal Length"
)

# Split Petal.Length into one vector per species.
setosa = DT$Petal.Length[DT$Species == "setosa"]
versicolor = DT$Petal.Length[DT$Species == "versicolor"]
virginica = DT$Petal.Length[DT$Species == "virginica"]

# Compare their histograms.
# All three calls use the same xlim, so the x-axes are directly comparable.
# par(mfrow = c(1, 3)) tells R to create a figure with 1 row and 3 columns.
# The call returns the previous setting, which is kept so it can be restored.
old_par = par(mfrow = c(1, 3))
hist(setosa, xlim = c(1, 7))
hist(versicolor, xlim = c(1, 7))
hist(virginica, xlim = c(1, 7))
# Go back to the layout that was active before.
par(old_par)

# Is the difference between the two species statistically significant?
t.test(x = setosa, y = versicolor)
# A side-quest to statistics. If you have two (random or representative) samples measuring the same event (e.g. height of students in two independent classes)
# and you would like to find out, whether it is reasonable to assume, that the expected
# values are same or different, you could resort to statistical hypothesis testing.
# There are many tests and the basic one is the famous t-test.
# Here, the null hypothesis is:
# H0: The expected values are the same.
# The alternative hypothesis is:
# H1: The expected values are different.
# The test uses the data and calculates the 'test statistic'. If the test statistic has a large enough
# magnitude, the null hypothesis is rejected and the alternative is accepted.
# If the test statistic has a smaller magnitude, you cannot reject the null hypothesis (with a sample of data, we never accept the null hypothesis - the difference between 'not rejecting' and 'accepting' might seem subtle, but it is not).
# Here the test statistic is -39.493 - is that a large magnitude?
# It depends on the pre-defined significance level. We set that to be 0.05 in most cases in economics/finance (social sciences in general).
# The p-value can be used to make the decision (instead of the test statistic). If the p-value is below the significance level of 0.05, we reject the null hypothesis.
# This is the case here as well.
# We thus conclude that expected values are statistically significantly different from each other.
# What is this significance level? It is the probability of committing a type I error, i.e. the error of falsely rejecting the null hypothesis. It can happen because the data are just
# a sample and we might be unlucky enough to have a sample which suggests that the null should be rejected.


##############################################################
####################  SOME Nicer graphs ######################  
##############################################################


#### Lattice graphics ####

# Note: plots are created with a single function call (xyplot etc.)
# Most useful for conditioning types of plots
# Note: I personally use mostly basic plots and I play with them a lot. However, I am in a minority here
# and from my observation, most of the people around me (students/businesses) use packages (ggplot2 is likely the most popular).
# Install lattice only when it is missing, so that re-running the script
# does not download and reinstall the package every time.
if (!requireNamespace("lattice", quietly = TRUE)) {
  install.packages("lattice")
}
library("lattice")

# This package has a nicer histogram.
histogram(setosa)


# But it is perhaps most used for the xyplot graph.
# Passing the data frame through the `data` argument keeps the formulas short
# and labels the axes with the plain column names instead of "DT$...".
# Scatterplot of sepal length (y-axis) against petal length (x-axis)
xyplot(Sepal.Length ~ Petal.Length, data = DT)
# Scatterplot of petal length against species
xyplot(Petal.Length ~ Species, data = DT)
# You can split the first scatterplot into one panel per species
xyplot(Sepal.Length ~ Petal.Length | Species, data = DT)


#### The GGally package ####

# Produces a summary plot of all variables.
# Note: columns = 1:5 passes the four numeric measurements and the Species
# factor to ggpairs(). Use columns = 1:4 if you only want the numeric variables.
# Install GGally only when it is missing, so that re-running the script
# does not download and reinstall the package every time.
if (!requireNamespace("GGally", quietly = TRUE)) {
  install.packages("GGally")
}
library(GGally)
ggpairs(iris, columns = 1:5, title = "IRIS")


#### ggplot2 ####


# ggplot2 is a large data visualization package.
# It will let you create and customize just about any graphic or graph.
# We will start with something simple, but feel free to explore ggplot2 on your own:
# https://ggplot2.tidyverse.org/.

# Load the package (it is installed first only when it is missing).
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)

# Similar to the lattice package, ggplot2 can also draw nicer scatterplots.
# qplot() is deprecated since ggplot2 3.4.0, so the plot is written with
# ggplot() + geom_point() instead.
ggplot(data = iris, aes(x = Petal.Length, y = Petal.Width)) +
  geom_point()


# Lets build up a nice scatterplot step by step.
# Step 1: map the two variables to the axes and draw one dot per flower.
# geom_point() is the layer that makes this a scatterplot.
g1 = ggplot(data = DT, aes(x = Sepal.Length, y = Petal.Length)) +
  geom_point()
g1

# Step 2: condition the color of the dots on the variable Species,
# and change the shape and size of the dots.
g1 = ggplot(data = DT, aes(x = Sepal.Length, y = Petal.Length, color = Species)) +
  geom_point(shape = 17, size = 1.2)
g1

# Add a smoothed line fitted with a linear model and try a different theme.
# theme_minimal() is only applied to the plot printed here; g2 itself keeps
# the default theme.
g2 = g1 + geom_smooth(method = "lm")
g2 + theme_minimal()

# Lets add axis lines to the graph, make them blue and dashed.
# Make the description text of the axes bigger and bold.
# And increase the size of the numbers on the axes.
# Line thickness is set with `linewidth`; the `size` argument of
# element_line() is deprecated since ggplot2 3.4.0.
g3 = g2 + theme(axis.line = element_line(colour = "blue", linewidth = .1, linetype = "dashed"),
                axis.text = element_text(size = 12),
                axis.title = element_text(size = 12, face = "bold"))

# Lets add a title to the graph, change the axis description text
# and drop the title of the legend.
g4 = g3 +
  ggtitle("Our first ggplot graph with a title") +
  labs(y = "Petal Length (cm)") + labs(x = "Sepal Length (cm)") +
  theme(legend.title = element_blank())

g4

# And now everything all at once:
# points coloured by species, a linear-model smooth, styled axes, a title,
# new axis labels and no legend title.
# Line thickness is set with `linewidth`; the `size` argument of
# element_line() is deprecated since ggplot2 3.4.0.
g5 = ggplot(data = DT, aes(x = Sepal.Length, y = Petal.Length, color = Species)) +
  geom_point(shape = 17, size = 1.2) +
  geom_smooth(method = "lm") +
  theme(axis.line = element_line(colour = "blue", linewidth = .1, linetype = "dashed"),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 12, face = "bold")) +
  ggtitle("Our first ggplot graph with a title") +
  labs(y = "Petal Length (cm)") + labs(x = "Sepal Length (cm)") +
  theme(legend.title = element_blank())
g5

##############################################################
####################  SOME MORE problems #####################  
##############################################################


# Try going through these on your own
# 1. Change the rownames of dataset DT.


# 2. How many flowers of the species "setosa" have the Sepal wider than 3.5?


# 3. Calculate the median and standard deviation of all 4 numeric variables in the dataset.
#    Assign these results into vectors. Store these two vectors in a list.


# 4. Is there a relationship between the Petal length and Petal width? 
#    Fit a linear model of Petal length modeled by Petal width 
#    Create a scatter plot and add a regression line



# 5. Add a new column with log transformation of variable Sepal length
#    Make a plot of this new variable. Connect the dots using a type="l" argument
#    Change the axis labels and plot title to explain your graph
#    Change the color of the line to green
#    Assign this plot to an object and then save this object as an RData file 
#    Save this plot as an image


# 6. Convert the first 4 columns of DT into a matrix. Save this matrix as a csv file. 

# 7. Find out, which rows of DT correspond to Sepal width equal to 2.2.
#    Replace all values in these rows with empty values (NAs)
#    Use the function na.omit() or the function complete.cases() to remove these rows