# Exploratory analysis of teacher salaries and life satisfaction (base R).
# Reads "teachers_generated.csv" (path relative to the working directory),
# inspects the table, then summarises and plots salary by country.

# Upload the data
teachers <- read.csv("teachers_generated.csv")

# Inspect the data structure
str(teachers)
nrow(teachers)
ncol(teachers)
colnames(teachers)

# Change the column names.
# Names are assigned by position, so fail early if the file does not have
# exactly the four columns the rest of the script relies on.
stopifnot(ncol(teachers) == 4L)
colnames(teachers) <- c("country", "salary", "occupation", "life_satisfy")
colnames(teachers)

# Check unique values
unique(teachers$occupation)
unique(teachers$country)

# Indexing: [row, column]
teachers[2, 3]
teachers[2, "country"]
teachers[, 2] # all rows, column 2
teachers[3, ] # only the third row, all columns

# Check distributions (one histogram of salary per country)
hist(teachers$salary[which(teachers$country == "France")])
hist(teachers$salary[which(teachers$country == "Germany")])
hist(teachers$salary[which(teachers$country == "Italy")])

# Calculate medians for each country
median(teachers$salary[which(teachers$country == "France")])
median(teachers$salary[which(teachers$country == "Germany")])
median(teachers$salary[which(teachers$country == "Italy")])

# The same, but using aggregate(var1 ~ var2, data = , FUN = ).
# There are different ways to do the same thing; this one gives a more
# compact script.
aggregate(salary ~ country, data = teachers, FUN = median)

# Interquartile range (IQR) of salary per country
aggregate(salary ~ country, data = teachers, FUN = IQR)

# Build a boxplot (y ~ x)
boxplot(teachers$salary ~ teachers$country)

# Giving titles
boxplot(
  teachers$salary ~ teachers$country,
  main = "Salary difference among teachers",
  xlab = "Countries",
  ylab = "Salary [euro/month]"
)

# Build a scatter plot - plot(y ~ x)
plot(teachers$life_satisfy ~ teachers$salary)

# Change points color.
# The help page for plot() is only opened in an interactive session so that
# batch runs (e.g. Rscript) are not flooded with help text.
if (interactive()) help("plot")
plot(
  teachers$life_satisfy ~ teachers$salary,
  col = "hotpink2",
  bg = "hotpink2",
  pch = 21
)

# Add titles.
# Scatter plot of life satisfaction against salary, with titles.
# Expects a data frame `teachers` with columns country, salary and
# life_satisfy to already exist in the session.
plot(
  teachers$life_satisfy ~ teachers$salary,
  col = "hotpink2",
  bg = "hotpink2",
  pch = 21,
  main = "Dependence of satisfaction from salary",
  xlab = "Salary [euro/month]",
  ylab = "Satisfaction"
)

# Changing dots colors depending on the country.
# stringsAsFactors = FALSE keeps the colour names as character strings on
# every R version; as factors they would reach `col=` as integer codes.
table_col <- data.frame(
  "country" = c("France", "Germany", "Italy"),
  "color" = c("red", "blue", "green"),
  stringsAsFactors = FALSE
)
# Look up each row's colour by its country (NA for countries not in the table).
color <- table_col$color[match(teachers$country, table_col$country)]
plot(
  teachers$life_satisfy ~ teachers$salary,
  col = color,
  bg = color,
  pch = 21,
  main = "Dependence of satisfaction from salary",
  xlab = "Salary [euro/month]",
  ylab = "Satisfaction"
)

# About ggplot.
# Install ggplot2 only when it is missing, instead of re-installing it on
# every run of the script.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library("ggplot2")
# Help -> Cheat sheets -> Data Visualization with ggplot2

# Build the same scatter plot using ggplot, adding the geometric layer
# "geom_point".
ggplot(data = teachers, aes(x = salary, y = life_satisfy)) +
  geom_point()

# Changing the dots color depending on the country.
ggplot(data = teachers, aes(x = salary, y = life_satisfy)) +
  geom_point(aes(color = country))

# Color depending on the country + size of the dots depending on the salary.
ggplot(data = teachers, aes(x = salary, y = life_satisfy)) +
  geom_point(aes(color = country, size = salary))

# Add titles with a layer "labs".
ggplot(data = teachers, aes(x = salary, y = life_satisfy)) +
  geom_point(aes(color = country, size = salary)) +
  labs(
    x = "Salary [euro/month]",
    y = "Life satisfaction",
    title = "Dependence of life satisfaction from salary"
  )

# Build a box plot.
ggplot(data = teachers, aes(x = country, y = salary)) +
  geom_boxplot()

# Add colors and titles.
ggplot(data = teachers, aes(x = country, y = salary)) +
  geom_boxplot(aes(color = country)) +
  labs(
    x = "Country",
    y = "Salary [euro/month]",
    title = "Salary difference among teachers"
  )

# Take the legend away.
ggplot(data = teachers, aes(x = country, y = salary)) +
  geom_boxplot(aes(color = country), show.legend = FALSE) +
  labs(
    x = "Country",
    y = "Salary [euro/month]",
    title = "Salary difference among teachers"
  )