library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5 ✔ purrr 0.3.4
## ✔ tibble 3.1.6 ✔ dplyr 1.0.7
## ✔ tidyr 1.1.4 ✔ stringr 1.4.0
## ✔ readr 2.1.1 ✔ forcats 0.5.1
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(tidymodels)
## Registered S3 method overwritten by 'tune':
## method from
## required_pkgs.model_spec parsnip
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels 0.1.4 ──
## ✔ broom 0.7.10 ✔ rsample 0.1.1
## ✔ dials 0.0.10 ✔ tune 0.1.6
## ✔ infer 1.0.0 ✔ workflows 0.2.4
## ✔ modeldata 0.1.1 ✔ workflowsets 0.1.0
## ✔ parsnip 0.1.7 ✔ yardstick 0.0.9
## ✔ recipes 0.1.17
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(tidytext)
library(textrecipes)
library(stacks)
library(here)
## here() starts at /Volumes/GoogleDrive/My Drive/Google Drive sync/data/github_analysis
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
library(doParallel)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: iterators
## Loading required package: parallel
registerDoParallel(cores = 3)
df <- read_csv(here("data", "clean.csv")) %>%
mutate(
description = ifelse(is.na(description), "MISSING", description)
)
## Rows: 31431 Columns: 24
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): name, full_name, owner, description, homepage, language, license, ...
## dbl (7): id, size, forks_count, open_issues_count, forks, open_issues, stars
## lgl (6): has_issues, has_projects, has_downloads, has_wiki, has_pages, arch...
## dttm (3): created_at, updated_at, pushed_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
We will try to build a model that predicts a repository’s language from its description using tf-idf vectors. We will compare three models: k-nearest neighbors, a LASSO multinomial regression, and gradient-boosted trees (xgboost).
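As a reminder of what these features look like, here is a minimal sketch of tf-idf on two made-up descriptions (the toy data are purely illustrative and not part of the pipeline):
# Toy illustration only: tokenize two fake descriptions and turn them into
# tf-idf columns with textrecipes.
toy <- tibble(
language = c("r", "python"),
description = c("tidy data analysis in r", "deep learning library for python")
)
recipe(language ~ description, data = toy) %>%
step_tokenize(description) %>%
step_tfidf(description) %>%
prep() %>%
bake(new_data = NULL)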
set.seed(54321)
split <- initial_split(df, strata = "language", prop = 0.8)
train <- training(split)
test <- testing(split)
folds <- vfold_cv(train, v = 5)
# save indices for python analysis
split %>%
tidy() %>%
mutate(Row = Row - 1) %>%
write_csv(here("data", "indices.csv"))
mset <- metric_set(mn_log_loss, accuracy)
control <- control_grid(
save_workflow = TRUE,
save_pred = TRUE,
verbose = TRUE
)
Here is a master recipe where we strip non-ASCII characters, collapse whitespace, add a (normalized) word count, drop descriptions with fewer than two words, tokenize, remove standard stopwords plus the language names themselves, keep only the most frequent tokens, and compute tf-idf features.
master_rec <- recipe(language ~ description, data = df) %>%
step_mutate(
description = str_replace_all(description, "[^\\x00-\\x7F]", " "),
description = str_replace_all(description, "\\s+", " "),
nwords = str_count(description, "\\w+")
) %>%
step_filter(nwords >= 2, skip = TRUE) %>%
step_normalize(nwords) %>%
step_tokenize(description) %>%
step_stopwords(description) %>%
step_stopwords(description, custom_stopword_source = c("r", "java", "javascript", "js", "python", "c")) %>%
step_tokenfilter(description, max_tokens = tune()) %>%
step_tfidf(description)
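Because max_tokens is marked for tuning, this recipe can’t be prepped as-is; to sanity-check what it produces, one could finalize it with a placeholder value first (a sketch, with max_tokens = 100 chosen arbitrarily; output not shown):
# Hypothetical sanity check: plug in an arbitrary max_tokens, prep on the
# training data, and peek at the resulting nwords and tf-idf columns.
master_rec %>%
finalize_recipe(tibble(max_tokens = 100)) %>%
prep(training = train) %>%
bake(new_data = NULL) %>%
glimpse()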
knn_model <- nearest_neighbor(
mode = "classification",
neighbors = tune(),
weight_func = "cos"
) %>%
set_engine("kknn")
knn_workflow <- workflow() %>%
add_recipe(master_rec) %>%
add_model(knn_model)
We will tune the number of neighbors and the maximum number of tokens.
knn_path <- here("outputs", "knn_res.rds")
override <- FALSE
if (!file.exists(knn_path) || override) {
knn_res <- knn_workflow %>%
tune_grid(folds,
metrics = mset,
control = control,
grid = crossing(
neighbors = floor(seq(50, 250, length.out = 5)),
max_tokens = floor(seq(50, 350, length.out = 5))
)
)
saveRDS(knn_res, knn_path)
}
knn_res <- readRDS(knn_path)
knn_res %>%
collect_metrics() %>%
filter(.metric == "accuracy") %>%
arrange(-mean)
## # A tibble: 25 × 8
## neighbors max_tokens .metric .estimator mean n std_err .config
## <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 250 350 accuracy multiclass 0.476 5 0.00546 Preprocessor5_M…
## 2 250 275 accuracy multiclass 0.474 5 0.00419 Preprocessor4_M…
## 3 150 350 accuracy multiclass 0.473 5 0.00598 Preprocessor5_M…
## 4 200 350 accuracy multiclass 0.471 5 0.00577 Preprocessor5_M…
## 5 200 275 accuracy multiclass 0.469 5 0.00469 Preprocessor4_M…
## 6 150 275 accuracy multiclass 0.468 5 0.00451 Preprocessor4_M…
## 7 100 350 accuracy multiclass 0.466 5 0.00408 Preprocessor5_M…
## 8 250 200 accuracy multiclass 0.465 5 0.00344 Preprocessor3_M…
## 9 100 275 accuracy multiclass 0.465 5 0.00227 Preprocessor4_M…
## 10 250 125 accuracy multiclass 0.464 5 0.00439 Preprocessor2_M…
## # … with 15 more rows
autoplot(knn_res)
Now evaluate on the test set.
final_model <- knn_workflow %>%
finalize_workflow(select_best(knn_res, metric = "accuracy")) %>%
fit(train)
preds <- final_model %>%
augment(test)
preds %>%
with(accuracy_vec(as.factor(language), .pred_class))
## [1] 0.4685115
An accuracy of ~0.47 isn’t bad for a classifier with 5 classes.
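For context, random guessing over 5 classes gives 0.2, and a fairer floor is the share of the most common language, which we can check directly (a quick sketch; output not shown):
# Naive baseline: the accuracy of always predicting the most common language
# is the largest class proportion in the test set.
test %>%
count(language, sort = TRUE) %>%
mutate(prop = n / sum(n))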
Let’s look at the accuracy for each language.
preds %>%
mutate(acc = .pred_class == language) %>%
group_by(language) %>%
summarize(mean(acc))
## # A tibble: 5 × 2
## language `mean(acc)`
## <chr> <dbl>
## 1 c 0.568
## 2 java 0.596
## 3 js 0.366
## 4 python 0.360
## 5 r 0.409
Python performs the worst. That makes sense given that Python is the second-best language at everything, so its repository descriptions span every domain.
lin_rec <- master_rec %>%
step_normalize(all_numeric_predictors())
lin_model <- multinom_reg(
mode = "classification",
penalty = tune()
) %>%
set_engine("glmnet")
lin_workflow <- workflow() %>%
add_recipe(lin_rec) %>%
add_model(lin_model)
We will tune over the penalty on the coefficients and the number of tokens. We can afford more tokens than with knn because the LASSO handles sparse data well.
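Concretely, the penalty being tuned is the $\lambda$ in the L1-penalized multinomial objective that glmnet minimizes, written loosely here (glmnet’s full formulation also has an elastic-net mixing parameter, which defaults to a pure LASSO):

$$\min_{\beta} \; -\frac{1}{n}\sum_{i=1}^{n} \log \Pr(y_i \mid x_i, \beta) \;+\; \lambda \sum_{j} \lVert \beta_j \rVert_1$$

where $\beta_j$ collects the coefficients for token $j$ across the five classes. The L1 penalty drives most of those coefficients to exactly zero, which is why the LASSO copes well with many sparse tf-idf columns.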
lin_path <- here("outputs", "lin_res.rds")
override <- FALSE
if (!file.exists(lin_path) || override) {
lin_res <- lin_workflow %>%
tune_grid(folds,
metrics = mset,
control = control,
grid = crossing(
penalty = 10^seq(-7, -1, 0.2),
max_tokens = floor(seq(200, 1000, length.out = 5))
)
)
saveRDS(lin_res, lin_path)
}
lin_res <- readRDS(lin_path)
lin_res %>%
collect_metrics() %>%
filter(.metric == "accuracy") %>%
arrange(-mean)
## # A tibble: 155 × 8
## penalty max_tokens .metric .estimator mean n std_err .config
## <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.00251 1000 accuracy multiclass 0.601 5 0.00148 Preprocessor5_Mo…
## 2 0.00158 1000 accuracy multiclass 0.600 5 0.00199 Preprocessor5_Mo…
## 3 0.00398 1000 accuracy multiclass 0.599 5 0.00197 Preprocessor5_Mo…
## 4 0.001 1000 accuracy multiclass 0.596 5 0.00130 Preprocessor5_Mo…
## 5 0.00158 800 accuracy multiclass 0.595 5 0.00207 Preprocessor4_Mo…
## 6 0.00251 800 accuracy multiclass 0.595 5 0.00168 Preprocessor4_Mo…
## 7 0.001 800 accuracy multiclass 0.593 5 0.00278 Preprocessor4_Mo…
## 8 0.00398 800 accuracy multiclass 0.592 5 0.00153 Preprocessor4_Mo…
## 9 0.000631 1000 accuracy multiclass 0.591 5 0.00152 Preprocessor5_Mo…
## 10 0.000631 800 accuracy multiclass 0.590 5 0.00308 Preprocessor4_Mo…
## # … with 145 more rows
autoplot(lin_res)
Now evaluate on the test set.
final_model <- lin_workflow %>%
finalize_workflow(select_best(lin_res, metric = "accuracy"))
fitted_model <- final_model %>%
fit(train)
preds <- fitted_model %>%
augment(test)
preds %>%
with(accuracy_vec(as.factor(language), .pred_class))
## [1] 0.5927163
A big improvement over knn. The unreasonable effectiveness of LASSO.
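It would also be informative to see which tokens the LASSO actually keeps. One way (a sketch, not run here, assuming broom’s tidy() method for glmnet covers the multinomial path) is to tidy the underlying glmnet fit and look at the largest non-zero coefficients near the selected penalty:
# Hypothetical: tidy() on the glmnet object returns coefficients along the
# regularization path; keep the lambda closest to the selected penalty and
# sort by coefficient magnitude.
best_penalty <- select_best(lin_res, metric = "accuracy")$penalty
fitted_model %>%
extract_fit_engine() %>%
tidy() %>%
filter(term != "(Intercept)", estimate != 0) %>%
filter(abs(lambda - best_penalty) == min(abs(lambda - best_penalty))) %>%
arrange(-abs(estimate)) %>%
head(10)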
Let’s look at how LASSO performs by language.
preds %>%
mutate(acc = .pred_class == language) %>%
group_by(language) %>%
summarize(mean(acc))
## # A tibble: 5 × 2
## language `mean(acc)`
## <chr> <dbl>
## 1 c 0.717
## 2 java 0.552
## 3 js 0.587
## 4 python 0.495
## 5 r 0.584
Improved on every language except java, with marked gains on the worst performers (python and js).
xgb_model <- boost_tree(
mode = "classification",
learn_rate = tune(),
trees = tune(),
mtry = tune()
) %>%
set_engine("xgboost")
xgb_workflow <- workflow() %>%
add_recipe(master_rec) %>%
add_model(xgb_model)
We will tune the number of trees, the learning rate, the number of columns sampled at each node, and the maximum number of tokens.
xgb_path <- here("outputs", "xgb_res.rds")
override <- FALSE
if (!file.exists(xgb_path) || override) {
xgb_res <- xgb_workflow %>%
tune_grid(folds,
metrics = mset,
control = control,
grid = crossing(
trees = c(200, 400, 600, 800, 1000),
learn_rate = 10^seq(-3, -1, length.out = 5),
mtry = c(10, 30, 100),
max_tokens = floor(seq(250, 1000, length.out = 4))
)
)
saveRDS(xgb_res, xgb_path)
}
xgb_res <- readRDS(xgb_path)
xgb_res %>%
collect_metrics() %>%
filter(.metric == "accuracy") %>%
arrange(-mean)
## # A tibble: 300 × 10
## mtry trees learn_rate max_tokens .metric .estimator mean n std_err
## <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl>
## 1 100 800 0.1 1000 accuracy multiclass 0.610 5 0.00392
## 2 10 800 0.1 1000 accuracy multiclass 0.609 5 0.00454
## 3 10 1000 0.1 1000 accuracy multiclass 0.609 5 0.00382
## 4 30 800 0.1 1000 accuracy multiclass 0.608 5 0.00398
## 5 100 600 0.1 1000 accuracy multiclass 0.608 5 0.00419
## 6 10 600 0.1 1000 accuracy multiclass 0.608 5 0.00466
## 7 100 1000 0.1 1000 accuracy multiclass 0.608 5 0.00321
## 8 30 600 0.1 1000 accuracy multiclass 0.608 5 0.00474
## 9 30 1000 0.1 1000 accuracy multiclass 0.607 5 0.00338
## 10 10 400 0.1 1000 accuracy multiclass 0.604 5 0.00499
## # … with 290 more rows, and 1 more variable: .config <chr>
The autoplot is jumbled, so we plot the results manually.
xgb_res %>%
collect_metrics() %>%
filter(.metric == "accuracy") %>%
mutate(learn_rate = as.factor(round(learn_rate, digits = 3))) %>%
ggplot(aes(x = trees, y = mean, color = learn_rate)) +
geom_line() +
geom_point() +
facet_grid(rows = vars(max_tokens), cols = vars(mtry))
It looks like increasing max_tokens could help a little more. Increasing mtry seems to improve only the models with small learning rates, and probably won’t give a large improvement overall.
Now evaluate on the test set.
final_model <- xgb_workflow %>%
finalize_workflow(select_best(xgb_res, metric = "accuracy"))
fitted_model <- final_model %>%
fit(train)
## [11:40:34] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
preds <- fitted_model %>%
augment(test)
preds %>%
with(accuracy_vec(as.factor(language), .pred_class))
## [1] 0.6005089
This is an incremental improvement over LASSO, and it is still well short of the transformer model we will use in the end, so we will not optimize the parameters further.
How is it doing by class?
preds %>%
mutate(acc = .pred_class == language) %>%
group_by(language) %>%
summarize(mean(acc))
## # A tibble: 5 × 2
## language `mean(acc)`
## <chr> <dbl>
## 1 c 0.762
## 2 java 0.547
## 3 js 0.570
## 4 python 0.495
## 5 r 0.59
Pretty similar to LASSO.
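To see which languages get confused with which, a confusion matrix is the natural next step (a quick sketch; output not shown):
# Cross-tabulate the true language against the predicted language.
preds %>%
mutate(language = as.factor(language)) %>%
conf_mat(truth = language, estimate = .pred_class)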
Let’s look at the learning curves. We will subsample the training folds at increasing proportions, refit the model on each subsample, and evaluate on both the subsample and the held-out fold.
n <- 4
p <- (1:n) / n
grid <- expand.grid(1:nrow(folds), p) %>%
as_tibble() %>%
rename(fold = Var1, p = Var2)
grid_path <- here("outputs", "grid.rds")
override <- FALSE
if (!file.exists(grid_path) || override) {
err_tot <- foreach(i = 1:nrow(grid)) %dopar% {
train_df <- folds$splits[[grid$fold[i]]] %>%
analysis() %>%
slice_sample(prop = grid$p[i])
fitted <- final_model %>%
fit(train_df)
error_train <- fitted %>%
augment(train_df) %>%
with(accuracy_vec(as.factor(language), .pred_class))
error_val <- fitted %>%
augment(assessment(folds$splits[[grid$fold[i]]])) %>%
with(accuracy_vec(as.factor(language), .pred_class))
c(error_train, error_val)
}
res <- grid %>%
mutate(
train = map_dbl(err_tot, ~ .[1]),
val = map_dbl(err_tot, ~ .[2])
)
saveRDS(res, grid_path)
}
res <- readRDS(grid_path)
res %>%
group_by(p) %>%
summarize(
train = mean(train),
val = mean(val)
) %>%
pivot_longer(train:val, names_to = "type", values_to = "score") %>%
ggplot(aes(x = p)) +
geom_line(aes(y = score, color = type)) +
geom_point(aes(y = score, color = type)) +
ylim(0, 1)
The curves look like they have flattened out, so gathering more data would probably give only marginal improvements. There is still a decent gap between the training and validation accuracy, suggesting some overfitting.
Finally, let’s look at feature importance.
fitted_model %>%
extract_fit_engine() %>%
vip(n = 25)
These highly important terms look quite predictive of the language (android, react, pytorch, etc.), and the word-count feature also seems decently helpful.
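If a table is more convenient than the plot, vip can also return the same importance scores as a tibble (sketch; output not shown):
# Same importance scores as a tibble rather than a plot.
fitted_model %>%
extract_fit_engine() %>%
vi() %>%
head(10)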
All three models are pretty different, so let’s see if combining them in a stacked ensemble improves performance. To keep the stack from being too slow, we first keep only the best candidate from each model’s tuning results.
knn_cand <- knn_res %>%
filter_parameters(parameters = select_best(knn_res, metric = "accuracy"))
lin_cand <- lin_res %>%
filter_parameters(parameters = select_best(lin_res, metric = "accuracy"))
xgb_cand <- xgb_res %>%
filter_parameters(parameters = select_best(xgb_res, metric = "accuracy"))
Now build the stack: blend_predictions() fits a regularized meta-learner on the candidates’ out-of-fold predictions to choose stacking weights, and fit_members() refits the retained members on the full training set.
registerDoParallel(cores = 1)
lang_st <- stacks() %>%
add_candidates(lin_cand) %>%
add_candidates(knn_cand) %>%
add_candidates(xgb_cand)
lang_model <- lang_st %>%
blend_predictions() %>%
fit_members()
## [12:15:14] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Finally, test on holdout.
test %>%
bind_cols(predict(lang_model, test)) %>%
with(accuracy_vec(as.factor(language), .pred_class))
## [1] 0.6025763
Womp womp. Another barely incremental improvement over xgboost alone.
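If we wanted to know how much each member actually contributes to the blend, stacks can plot the stacking weights (a hypothetical follow-up, not run here):
# Inspect the stacking coefficients assigned to each member model.
autoplot(lang_model, type = "weights")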