# Title: Intro to R and RStudio 2
# Goal: Learn more about data frames, plots, models and statistical testing
# Course: AI in Finance
# Date: 23.9.2023
# Author: Martina Halousková
# License: GPL>=3

# First, create a project and clean up your environment.
# Either click on the broom icon top right in the environment tab,
# or use this command:
# (Note: this only removes objects; loaded packages stay attached.)
rm(list = ls())

##############################################################
########### SOME MORE FUN STUFF WITH DATAFRAMES #############
##############################################################

# Installing and loading packages.
# install.packages("datasets")
library("datasets")

# Load a built-in dataset from library datasets
data(iris)
data()
DT <- iris

# Let's explore this dataset
DT
head(DT)
tail(DT)
dim(DT)
names(DT)
class(DT)

# Are there any missing values?
is.na(DT)

# How many and if any, where is the missing value?
sum(is.na(DT))
which(is.na(DT))

# A simple way of dealing with missing values is deleting rows that include
# missing values.
DT[complete.cases(DT), ]

# What are the sums of rows and columns?
# The call is wrapped in try() so the error message is still shown, but the
# script keeps running when it is sourced as a whole.
try(colSums(DT)) # returns an error

# The previous code returned an error. Do you know why?
# (Column 5, Species, is a factor, so it is dropped before summing.)
colSums(DT[, -5])
rowSums(DT[, -5])

# Summarize this dataset.
summary(DT)

# What are the 3 main species?
unique(DT$Species)

# How many flowers are in each species?
table(DT$Species)

# Summary in a graph = a boxplot!
boxplot(DT[, -5])

#### Plots ####
# Let's visualize the dataset with a simple function plot().
plot(DT)
# or
pairs(DT)

# Are the variables correlated?
try(cor(DT)) # returns error, why?
cor(DT[, -5])

# Let's focus on a scatterplot of one pair.
plot(DT$Sepal.Length, DT$Petal.Length)

# Can we test the correlation of this pair?
cor.test(DT$Sepal.Length, DT$Petal.Length)

# How about a linear model?
model <- lm(Petal.Length ~ Sepal.Length, data = DT)
summary(model)

# Let's return to that scatterplot and overlay a regression line from the
# estimated model.
plot(DT$Sepal.Length, DT$Petal.Length)
abline(model)

# Let's look at Petal.Length.
# Notice anything suspicious?
plot(DT$Petal.Length)
barplot(DT$Petal.Length)
hist(DT$Petal.Length)
table(DT$Petal.Length)
table(DT$Species, DT$Petal.Length)
plot(DT$Petal.Length, col = DT$Species)
boxplot(Petal.Length ~ Species, data = DT)
boxplot(
  Petal.Length ~ Species,
  data = DT,
  main = "Petal Length in 3 species of iris",
  xlab = "Species of flowers",
  ylab = "Petal Length",
  col = c("red", "green", "blue")
)

# Let's subset Petal.Length into 3 different vectors
# (selecting the column by name instead of by its position, 3)
setosa <- DT$Petal.Length[DT$Species == "setosa"]
versicolor <- DT$Petal.Length[DT$Species == "versicolor"]
virginica <- DT$Petal.Length[DT$Species == "virginica"]

# Compare their histograms
# par() returns the previous settings, so they can be restored afterwards.
old_par <- par(mfrow = c(1, 3))
hist(setosa)
hist(versicolor)
hist(virginica)
par(old_par)

# Are these differences significant?
t.test(setosa, versicolor)

##############################################################
#################### SOME Nicer graphs ######################
##############################################################

#### Lattice graphics ####
# Note: plots are created with single function call (xyplot etc.)
# Most useful for conditioning types of plots
# Install the package only if it is not available yet.
if (!requireNamespace("lattice", quietly = TRUE)) {
  install.packages("lattice")
}
library("lattice")

# This package has a nicer histogram.
histogram(setosa)

# But it is perhaps most used for the xyplot graph.
# Scatterplot of sepal length and petal length
xyplot(Sepal.Length ~ Petal.Length, data = DT)

# Scatterplot of petal length and species
xyplot(Petal.Length ~ Species, data = DT)

# You can split the first scatterplot based on the variable species
xyplot(Sepal.Length ~ Petal.Length | Species, data = DT)

#### The GGally package ####
# Produces a summary plot of all variables.
# Note: always make sure to select only numeric variables
# (ggpairs() can also handle the factor column Species, which is why all
# 5 columns are plotted here.)
# The package name must be quoted in install.packages().
if (!requireNamespace("GGally", quietly = TRUE)) {
  install.packages("GGally")
}
library(GGally)
ggpairs(iris, columns = 1:5, title = "IRIS")

#### ggplot2 ####
# ggplot is a large data visualization package.
# It will let you create and customize just about any graphic or graph.
# We will start with something simple, but feel free to explore ggplot on
# your own:
# https://ggplot2.tidyverse.org/.
# Load the package (install it only if it is not available yet).
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)

# Similar to the lattice package, ggplot also has a function for nicer
# scatterplots.
# Note: qplot() is deprecated since ggplot2 3.4.0 and prints a warning there;
# the ggplot() calls below are the recommended way to build plots.
qplot(Petal.Length, Petal.Width, data = iris)

# Let's build up a nice scatterplot step by step.
# geom_point() draws one dot per flower, i.e. a scatterplot.
g1 <- ggplot(data = DT, aes(x = Sepal.Length, y = Petal.Length)) +
  geom_point()
g1

# Condition the color of the dots based on the variable Species,
# change the shape and size of the dots.
g1 <- ggplot(
  data = DT,
  aes(x = Sepal.Length, y = Petal.Length, color = Species)
) +
  geom_point(shape = 17, size = 1.2)
g1

# Smooth out the graph and add a different theme.
g2 <- g1 + geom_smooth(method = "lm")
g2 + theme_minimal()

# Let's add a trim around the whole graph, make it blue and dashed.
# Make the description text of axis bigger and bold.
# And increase the size of numbers on the axis.
# (Line thickness is set with `linewidth`; `size` is deprecated for lines
# since ggplot2 3.4.0.)
g3 <- g2 +
  theme(
    axis.line = element_line(
      colour = "blue",
      linewidth = .1,
      linetype = "dashed"
    ),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 12, face = "bold")
  )
g3

# Let's add a title to the graph and change the axis description text.
g4 <- g3 +
  ggtitle("Our first ggplot graph with a title") +
  labs(x = "Sepal Length (cm)", y = "Petal Length (cm)") +
  theme(legend.title = element_blank())
g4

# And now everything all at once:
g5 <- ggplot(
  data = DT,
  aes(x = Sepal.Length, y = Petal.Length, color = Species)
) +
  geom_point(shape = 17, size = 1.2) +
  geom_smooth(method = "lm") +
  theme(
    axis.line = element_line(
      colour = "blue",
      linewidth = .1,
      linetype = "dashed"
    ),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 12, face = "bold")
  ) +
  ggtitle("Our first ggplot graph with a title") +
  labs(x = "Sepal Length (cm)", y = "Petal Length (cm)") +
  theme(legend.title = element_blank())
g5

##############################################################
#################### SOME MORE problems #####################
##############################################################

# Try going through these on your own
# 1. Change the rownames of dataset DT.
# 2.
#    How many flowers of the species "setosa" have the Sepal wider than 3.5?
# 3. Calculate the median and standard deviation of all 4 numeric variables in the dataset.
#    Assign these results into vectors. Store these two vectors in a list.
# 4. Is there a relationship between the Petal length and Petal width?
#    Fit a linear model of Petal length modeled by Petal width
#    Create a scatter plot and add a regression line
# 5. Add a new column with log transformation of variable Sepal length
#    Make a plot of this new variable. Connect the dots using a type="l" argument
#    Change the axis labels and plot title to explain your graph
#    Change the color of the line to green
#    Assign this plot to an object and then save this object as an RData file
#    Save this plot as an image
# 6. Convert the first 4 columns of DT into a matrix. Save this matrix as a csv file.
# 7. Find out, which rows of DT correspond to Sepal width equal to 2.2.
#    Replace all values in these rows with empty values (NAs)
#    use function na.omit() or function complete.cases() to remove these rows