# Packages ----------------------------------------------------------------

# install.packages("tidyverse")
library(tidyverse)
# install.packages("tidytext")
library(tidytext)
# install.packages("rsample")
library(rsample)
# install.packages("glmnet")
library(glmnet)
# install.packages("yardstick")
library(yardstick)
# install.packages("broom")
library(broom)  # needed below for tidy() on the glmnet fit

# Data preparation --------------------------------------------------------

imdb <- read_csv("C:/Users/Bootcha/Desktop/skola/TM/2019/3/IMDB_Dataset.csv")

# Lowercase the reviews, strip the HTML line-break tags and double quotes
# the dataset contains, and keep the first 10,000 reviews.
imdb <- imdb %>%
  mutate(
    id = row_number(),
    review = str_to_lower(review),
    review = str_replace_all(review, "<br />", ""),
    review = str_replace_all(review, "\"", "")
  ) %>%
  head(10000)
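# Sanity check (a sketch, not part of the original pipeline): the logistic
# model below assumes both sentiment classes are present, so confirm the
# class balance of the 10,000 retained reviews.
imdb %>% count(sentiment)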
", ""), review = str_replace_all(review, "\"", "")) %>% head(10000) # Data exploration -------------------------------------------------------- tidy_imdb <- imdb %>% unnest_tokens(word, review) %>% group_by(word) %>% filter(n() > 10) %>% ungroup() tidy_imdb tidy_imdb %>% count(sentiment, word, sort=TRUE) %>% anti_join(get_stopwords()) %>% group_by(sentiment) %>% top_n(20) %>% ungroup() %>% ggplot(aes(x=reorder_within(word, n, sentiment), y=n, fill=sentiment)) + geom_col(show.legend=TRUE) + scale_x_reordered() + coord_flip() + facet_wrap(~sentiment, scales="free") + scale_y_continuous(expand=c(0, 0)) + labs( x=NULL, y="Word count", title="Most frequent words after removing stop words" ) # Model ------------------------------------------------------------------- imdb_split <- imdb %>% select(id) %>% initial_split() train_data <- training(imdb_split) test_data <- testing(imdb_split) sparse_words <- tidy_imdb %>% count(id, word) %>% inner_join(train_data) %>% cast_sparse(id, word, n) class(sparse_words) dim(sparse_words) word_rownames <- as.integer(rownames(sparse_words)) imdb_joined <- data_frame(id = word_rownames) %>% left_join(imdb %>% select(id, sentiment)) is_positive <- imdb_joined$sentiment == "positive" model <- cv.glmnet(sparse_words, is_positive, family="binomial", keep=TRUE) # model <- cv.glmnet(sparse_words, is_positive, family="binomial", keep=TRUE, type.measure="class") plot(model) plot(model$glmnet.fit, label = TRUE) coefs <- model$glmnet.fit %>% tidy() %>% filter(lambda == model$lambda.1se) coefs %>% group_by(estimate > 0) %>% top_n(10, abs(estimate)) %>% ungroup() %>% ggplot(aes(fct_reorder(term, estimate), estimate, fill=estimate > 0)) + geom_col(alpha=0.8, show.legend=FALSE) + coord_flip() + labs( x = NULL, title = "Coefficients that increase/decrease probability the most") intercept <- coefs %>% filter(term=="(Intercept)") %>% pull(estimate) classifications <- tidy_imdb %>% inner_join(test_data) %>% inner_join(coefs, by=c("word" = "term")) %>% group_by(id) %>% summarize(score=sum(estimate)) %>% mutate(probability=plogis(intercept+score)) classifications comment_classes <- classifications %>% left_join(imdb %>% select(sentiment, id), by="id") %>% mutate(sentiment=as.factor(sentiment)) comment_classes %>% roc_curve(sentiment, probability) %>% ggplot(aes(x=1-specificity, y=sensitivity)) + geom_line(color="midnightblue", size=1.5) + geom_abline(lty=2, alpha=0.5, color="gray50", size=1.2) + labs( title = "ROC curve for text classification using regularized regression" ) comment_classes %>% roc_auc(sentiment, probability) comment_classes %>% mutate( prediction = case_when( probability > 0.5 ~ "positive", TRUE ~ "negative"), prediction = as.factor(prediction)) %>% conf_mat(sentiment, prediction) comment_classes %>% filter(probability > .8, sentiment == "negative") %>% sample_n(10) %>% inner_join(imdb %>% select(id, review)) %>% select(probability, review) comment_classes %>% filter(probability < .3, sentiment == "positive") %>% sample_n(10) %>% inner_join(imdb %>% select(id, review)) %>% select(probability, review)