library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
## ✔ tibble  3.1.6     ✔ dplyr   1.0.7
## ✔ tidyr   1.1.4     ✔ stringr 1.4.0
## ✔ readr   2.1.1     ✔ forcats 0.5.1
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(tidymodels)
## Registered S3 method overwritten by 'tune':
##   method                   from   
##   required_pkgs.model_spec parsnip
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels 0.1.4 ──
## ✔ broom        0.7.10     ✔ rsample      0.1.1 
## ✔ dials        0.0.10     ✔ tune         0.1.6 
## ✔ infer        1.0.0      ✔ workflows    0.2.4 
## ✔ modeldata    0.1.1      ✔ workflowsets 0.1.0 
## ✔ parsnip      0.1.7      ✔ yardstick    0.0.9 
## ✔ recipes      0.1.17
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(tidytext)
library(textrecipes)
library(stacks)
library(here)
## here() starts at /Volumes/GoogleDrive/My Drive/Google Drive sync/data/github_analysis
library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
library(doParallel)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loading required package: iterators
## Loading required package: parallel
registerDoParallel(cores = 3)
df <- read_csv(here("data", "clean.csv")) %>%
  mutate(
    description = ifelse(is.na(description), "MISSING", description)
  )
## Rows: 31431 Columns: 24
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): name, full_name, owner, description, homepage, language, license, ...
## dbl  (7): id, size, forks_count, open_issues_count, forks, open_issues, stars
## lgl  (6): has_issues, has_projects, has_downloads, has_wiki, has_pages, arch...
## dttm (3): created_at, updated_at, pushed_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Modeling

We will try to construct a model that predicts a repository’s language from its description using tf-idf vectors. We will use three models: k-nearest neighbors, LASSO multinomial regression, and XGBoost.

set.seed(54321)

split <- initial_split(df, strata = "language", prop = 0.8)
train <- training(split)
test <- testing(split)
folds <- vfold_cv(train, v = 5)

# save indices for python analysis
split %>%
  tidy() %>%
  mutate(Row = Row - 1) %>%
  write_csv(here("data", "indices.csv"))

mset <- metric_set(mn_log_loss, accuracy)
control <- control_grid(
  save_workflow = TRUE,
  save_pred = TRUE,
  verbose = TRUE
)

Here is a master recipe where we

  1. strip non-ASCII characters and excess whitespace
  2. filter out repos with too few words
  3. normalize the word count
  4. tokenize description and filter out stopwords (including language names)
  5. keep only the most frequent tokens (max_tokens, tuned) and add tf-idf

master_rec <- recipe(language ~ description, data = df) %>%
  step_mutate(
    description = str_replace_all(description, "[^\\x00-\\x7F]", " "),
    description = str_replace_all(description, "\\s+", " "),
    nwords = str_count(description, "\\w+")
  ) %>%
  step_filter(nwords >= 2, skip = TRUE) %>%
  step_normalize(nwords) %>%
  step_tokenize(description) %>%
  step_stopwords(description) %>%
  step_stopwords(description, custom_stopword_source = c("r", "java", "javascript", "js", "python", "c")) %>%
  step_tokenfilter(description, max_tokens = tune()) %>%
  step_tfidf(description)

KNN model and workflow

knn_model <- nearest_neighbor(
  mode = "classification",
  neighbors = tune(),
  weight_func = "cos"
) %>%
  set_engine("kknn")

knn_workflow <- workflow() %>%
  add_recipe(master_rec) %>%
  add_model(knn_model)

We will tune the number of neighbors and the maximum number of tokens.

knn_path <- here("outputs", "knn_res.rds")
override <- FALSE

if (!file.exists(knn_path) || override) {
  knn_res <- knn_workflow %>%
    tune_grid(folds,
      metrics = mset,
      control = control,
      grid = crossing(
        neighbors = floor(seq(50, 250, length.out = 5)),
        max_tokens = floor(seq(50, 350, length.out = 5))
      )
    )

  saveRDS(knn_res, knn_path)
}
knn_res <- readRDS(knn_path)

knn_res %>%
  collect_metrics() %>%
  filter(.metric == "accuracy") %>%
  arrange(-mean)
## # A tibble: 25 × 8
##    neighbors max_tokens .metric  .estimator  mean     n std_err .config         
##        <dbl>      <dbl> <chr>    <chr>      <dbl> <int>   <dbl> <chr>           
##  1       250        350 accuracy multiclass 0.476     5 0.00546 Preprocessor5_M…
##  2       250        275 accuracy multiclass 0.474     5 0.00419 Preprocessor4_M…
##  3       150        350 accuracy multiclass 0.473     5 0.00598 Preprocessor5_M…
##  4       200        350 accuracy multiclass 0.471     5 0.00577 Preprocessor5_M…
##  5       200        275 accuracy multiclass 0.469     5 0.00469 Preprocessor4_M…
##  6       150        275 accuracy multiclass 0.468     5 0.00451 Preprocessor4_M…
##  7       100        350 accuracy multiclass 0.466     5 0.00408 Preprocessor5_M…
##  8       250        200 accuracy multiclass 0.465     5 0.00344 Preprocessor3_M…
##  9       100        275 accuracy multiclass 0.465     5 0.00227 Preprocessor4_M…
## 10       250        125 accuracy multiclass 0.464     5 0.00439 Preprocessor2_M…
## # … with 15 more rows
autoplot(knn_res)

  • Best: 0.476
  • neighbors: 250
  • max_tokens: 350

Now evaluate on the test set.

final_model <- knn_workflow %>%
  finalize_workflow(select_best(knn_res, metric = "accuracy")) %>%
  fit(train)

preds <- final_model %>%
  augment(test)

preds %>%
  with(accuracy_vec(as.factor(language), .pred_class))
## [1] 0.4685115

Accuracy of ~0.47 isn’t bad for a classifier with 5 classes.
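
As a quick sanity check (a sketch, not part of the original analysis), we can compare this against a majority-class baseline built from the training set:

# Sketch: accuracy of always predicting the most common language in `train`
majority <- train %>%
  count(language, sort = TRUE) %>%
  slice(1) %>%
  pull(language)

test %>%
  mutate(
    truth = factor(language),
    baseline = factor(majority, levels = levels(truth))
  ) %>%
  with(accuracy_vec(truth, baseline))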

Let’s look at the accuracy for each language.

preds %>%
  mutate(acc = .pred_class == language) %>%
  group_by(language) %>%
  summarize(mean(acc))
## # A tibble: 5 × 2
##   language `mean(acc)`
##   <chr>          <dbl>
## 1 c              0.568
## 2 java           0.596
## 3 js             0.366
## 4 python         0.360
## 5 r              0.409

Python is the worst performing. That makes sense given that Python is the second-best language for everything, so its repo descriptions span every domain.

LASSO model and workflow

lin_rec <- master_rec %>%
  step_normalize(all_numeric_predictors())

lin_model <- multinom_reg(
  mode = "classification",
  penalty = tune()
) %>%
  set_engine("glmnet")

lin_workflow <- workflow() %>%
  add_recipe(lin_rec) %>%
  add_model(lin_model)

We will tune over the regularization penalty and the number of tokens. We can use more tokens than for KNN because LASSO handles sparse data well.

lin_path <- here("outputs", "lin_res.rds")
override <- FALSE

if (!file.exists(lin_path) || override) {
  lin_res <- lin_workflow %>%
    tune_grid(folds,
      metrics = mset,
      control = control,
      grid = crossing(
        penalty = 10^seq(-7, -1, 0.2),
        max_tokens = floor(seq(200, 1000, length.out = 5))
      )
    )

  saveRDS(lin_res, lin_path)
}
lin_res <- readRDS(lin_path)

lin_res %>%
  collect_metrics() %>%
  filter(.metric == "accuracy") %>%
  arrange(-mean)
## # A tibble: 155 × 8
##     penalty max_tokens .metric  .estimator  mean     n std_err .config          
##       <dbl>      <dbl> <chr>    <chr>      <dbl> <int>   <dbl> <chr>            
##  1 0.00251        1000 accuracy multiclass 0.601     5 0.00148 Preprocessor5_Mo…
##  2 0.00158        1000 accuracy multiclass 0.600     5 0.00199 Preprocessor5_Mo…
##  3 0.00398        1000 accuracy multiclass 0.599     5 0.00197 Preprocessor5_Mo…
##  4 0.001          1000 accuracy multiclass 0.596     5 0.00130 Preprocessor5_Mo…
##  5 0.00158         800 accuracy multiclass 0.595     5 0.00207 Preprocessor4_Mo…
##  6 0.00251         800 accuracy multiclass 0.595     5 0.00168 Preprocessor4_Mo…
##  7 0.001           800 accuracy multiclass 0.593     5 0.00278 Preprocessor4_Mo…
##  8 0.00398         800 accuracy multiclass 0.592     5 0.00153 Preprocessor4_Mo…
##  9 0.000631       1000 accuracy multiclass 0.591     5 0.00152 Preprocessor5_Mo…
## 10 0.000631        800 accuracy multiclass 0.590     5 0.00308 Preprocessor4_Mo…
## # … with 145 more rows
autoplot(lin_res)

  • best: 0.601
  • penalty: ~0.0025
  • max_tokens: 1000

Now evaluate on the test set.

final_model <- lin_workflow %>%
  finalize_workflow(select_best(lin_res, metric = "accuracy"))

fitted_model <- final_model %>%
  fit(train)

preds <- fitted_model %>%
  augment(test)

preds %>%
  with(accuracy_vec(as.factor(language), .pred_class))
## [1] 0.5927163

A big improvement over KNN. The unreasonable effectiveness of LASSO.
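
To see which tokens actually drive those predictions, we could peek at the largest LASSO coefficients at the selected penalty. This is a sketch rather than part of the tuned pipeline; it assumes the fitted_model object from above and that glmnet names its coefficient list by the outcome classes:

# Sketch: terms with the largest (most positive) coefficients for the "python"
# class at the selected penalty; assumes `fitted_model` is the finalized glmnet
# workflow fit above and that the coefficient list is named by class
best_pen <- select_best(lin_res, metric = "accuracy")$penalty

coefs <- fitted_model %>%
  extract_fit_engine() %>%
  coef(s = best_pen)   # one sparse coefficient matrix per class

py_coefs <- as.matrix(coefs[["python"]])

tibble(term = rownames(py_coefs), estimate = py_coefs[, 1]) %>%
  filter(term != "(Intercept)", estimate != 0) %>%
  slice_max(estimate, n = 10)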

Let’s look at how LASSO performs by language.

preds %>%
  mutate(acc = .pred_class == language) %>%
  group_by(language) %>%
  summarize(mean(acc))
## # A tibble: 5 × 2
##   language `mean(acc)`
##   <chr>          <dbl>
## 1 c              0.717
## 2 java           0.552
## 3 js             0.587
## 4 python         0.495
## 5 r              0.584

Improved across the board, and markedly improved on the worst-performing languages (Python and JS).
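
For a fuller picture of which languages get mixed up with which, we could also print a confusion matrix (a quick sketch using yardstick’s conf_mat() on the preds tibble above):

# Sketch: confusion matrix of the LASSO test-set predictions
preds %>%
  mutate(language = factor(language, levels = levels(.pred_class))) %>%
  conf_mat(truth = language, estimate = .pred_class)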

XGBoost model and workflow

xgb_model <- boost_tree(
  mode = "classification",
  learn_rate = tune(),
  trees = tune(),
  mtry = tune()
) %>%
  set_engine("xgboost")

xgb_workflow <- workflow() %>%
  add_recipe(master_rec) %>%
  add_model(xgb_model)

We will tune the number of trees, the learning rate, and the number of columns sampled at each node, along with the maximum number of tokens in the recipe.

xgb_path <- here("outputs", "xgb_res.rds")
override <- FALSE

if (!file.exists(xgb_path) || override) {
  xgb_res <- xgb_workflow %>%
    tune_grid(folds,
      metrics = mset,
      control = control,
      grid = crossing(
        trees = c(200, 400, 600, 800, 1000),
        learn_rate = 10^seq(-3, -1, length.out = 5),
        mtry = c(10, 30, 100),
        max_tokens = floor(seq(250, 1000, length.out = 4))
      )
    )

  saveRDS(xgb_res, xgb_path)
}
xgb_res <- readRDS(xgb_path)

xgb_res %>%
  collect_metrics() %>%
  filter(.metric == "accuracy") %>%
  arrange(-mean)
## # A tibble: 300 × 10
##     mtry trees learn_rate max_tokens .metric  .estimator  mean     n std_err
##    <dbl> <dbl>      <dbl>      <dbl> <chr>    <chr>      <dbl> <int>   <dbl>
##  1   100   800        0.1       1000 accuracy multiclass 0.610     5 0.00392
##  2    10   800        0.1       1000 accuracy multiclass 0.609     5 0.00454
##  3    10  1000        0.1       1000 accuracy multiclass 0.609     5 0.00382
##  4    30   800        0.1       1000 accuracy multiclass 0.608     5 0.00398
##  5   100   600        0.1       1000 accuracy multiclass 0.608     5 0.00419
##  6    10   600        0.1       1000 accuracy multiclass 0.608     5 0.00466
##  7   100  1000        0.1       1000 accuracy multiclass 0.608     5 0.00321
##  8    30   600        0.1       1000 accuracy multiclass 0.608     5 0.00474
##  9    30  1000        0.1       1000 accuracy multiclass 0.607     5 0.00338
## 10    10   400        0.1       1000 accuracy multiclass 0.604     5 0.00499
## # … with 290 more rows, and 1 more variable: .config <chr>

  • best: 0.610
  • learn_rate: 0.1
  • mtry: 100
  • trees: 800
  • max_tokens: 1000

The autoplot is jumbled, so we plot it manually.

xgb_res %>%
  collect_metrics() %>%
  filter(.metric == "accuracy") %>%
  mutate(learn_rate = as.factor(round(learn_rate, digits = 3))) %>%
  ggplot(aes(x = trees, y = mean, color = learn_rate)) +
  geom_line() +
  geom_point() +
  facet_grid(rows = vars(max_tokens), cols = vars(mtry))

It looks like increasing max_tokens could help a little more. Increasing mtry seems to improve only the models with small learning rates, and probably won’t give a large improvement overall.

Now evaluate on the test set.

final_model <- xgb_workflow %>%
  finalize_workflow(select_best(xgb_res, metric = "accuracy"))

fitted_model <- final_model %>%
  fit(train)
## [11:40:34] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
preds <- fitted_model %>%
  augment(test)

preds %>%
  with(accuracy_vec(as.factor(language), .pred_class))
## [1] 0.6005089

This is an incremental improvement over LASSO, and it is still well short of the transformer model we will use in the end, so we will not optimize the parameters further.

How is it doing by class?

preds %>%
  mutate(acc = .pred_class == language) %>%
  group_by(language) %>%
  summarize(mean(acc))
## # A tibble: 5 × 2
##   language `mean(acc)`
##   <chr>          <dbl>
## 1 c              0.762
## 2 java           0.547
## 3 js             0.570
## 4 python         0.495
## 5 r              0.59

Pretty similar to LASSO.

Let’s look at the learning curves. We will fit the model on subsamples of increasing size and evaluate it on each fold’s held-out set.

n <- 4
p <- (1:n) / n

# every combination of fold index and subsample proportion
grid <- expand.grid(1:nrow(folds), p) %>%
  as_tibble() %>%
  rename(fold = Var1, p = Var2)

grid_path <- here("outputs", "grid.rds")
override <- FALSE

if (!file.exists(grid_path) || override) {
  acc_tot <- foreach(i = 1:nrow(grid)) %dopar% {
    # subsample the fold's analysis set at the given proportion
    train_df <- folds$splits[[grid$fold[i]]] %>%
      analysis() %>%
      slice_sample(prop = grid$p[i])

    fitted <- final_model %>%
      fit(train_df)

    # accuracy on the (subsampled) training data
    acc_train <- fitted %>%
      augment(train_df) %>%
      with(accuracy_vec(as.factor(language), .pred_class))

    # accuracy on the fold's assessment data
    acc_val <- fitted %>%
      augment(assessment(folds$splits[[grid$fold[i]]])) %>%
      with(accuracy_vec(as.factor(language), .pred_class))

    c(acc_train, acc_val)
  }

  res <- grid %>%
    mutate(
      train = map_dbl(acc_tot, ~ .[1]),
      val = map_dbl(acc_tot, ~ .[2])
    )

  saveRDS(res, grid_path)
}
res <- readRDS(grid_path)

res %>%
  group_by(p) %>%
  summarize(
    train = mean(train),
    val = mean(val)
  ) %>%
  pivot_longer(train:val, names_to = "type", values_to = "score") %>%
  ggplot(aes(x = p)) +
  geom_line(aes(y = score, color = type)) +
  geom_point(aes(y = score, color = type)) +
  ylim(0, 1)

The curves look like they’ve plateaued nicely, so gathering more data would probably only bring marginal improvements. There is still a decent gap between the training and validation curves, suggesting overfitting.
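
To put a rough number on that gap, we can average the train/validation difference at each subsample proportion (a quick sketch on the res tibble above):

# Sketch: mean train/validation accuracy gap at each subsample proportion
res %>%
  group_by(p) %>%
  summarize(gap = mean(train) - mean(val))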

Finally, let’s look at feature importance.

fitted_model %>%
  extract_fit_engine() %>%
  vip(n = 25)

The plot lists the most important terms. These look pretty predictive of the language (android, react, pytorch, etc.), and the word count (nwords) does seem decently helpful.
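
If we want the exact scores rather than the plot, vip::vi() returns the same ranking as a table (a quick sketch on the fitted xgboost workflow above):

# Sketch: tabular variable importance for the xgboost fit (top 10 terms)
fitted_model %>%
  extract_fit_engine() %>%
  vi() %>%
  slice_max(Importance, n = 10)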

Let’s get stacking

All the models are pretty different, so let’s see if combining them might help us improve our model.

We first keep just the best configuration of each model so that blending isn’t too slow.

knn_cand <- knn_res %>%
  filter_parameters(parameters = select_best(knn_res, metric = "accuracy"))
lin_cand <- lin_res %>%
  filter_parameters(parameters = select_best(lin_res, metric = "accuracy"))
xgb_cand <- xgb_res %>%
  filter_parameters(parameters = select_best(xgb_res, metric = "accuracy"))

Now build the stack, blend the predictions, and fit the members.

registerDoParallel(cores = 1)

lang_st <- stacks() %>%
  add_candidates(lin_cand) %>%
  add_candidates(knn_cand) %>%
  add_candidates(xgb_cand)

lang_model <- lang_st %>%
  blend_predictions() %>%
  fit_members()
## [12:15:14] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

Finally, test on the holdout set.

test %>%
  bind_cols(predict(lang_model, .)) %>%
  with(accuracy_vec(as.factor(language), .pred_class))
## [1] 0.6025763

Womp womp. Another barely incremental improvement.
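
For reference, here are the test-set accuracies from this post side by side (numbers copied from the evaluations above; this is just a recap):

# Test-set accuracies collected from the evaluations earlier in this post
tribble(
  ~model,     ~test_accuracy,
  "knn",      0.469,
  "lasso",    0.593,
  "xgboost",  0.601,
  "stack",    0.603
)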