diff --git a/.gitignore b/.gitignore index 083bae16..fa5d6213 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ inst/doc .DS_Store logs derby.log +catboost_info diff --git a/DESCRIPTION b/DESCRIPTION index 0866f8c3..05ca0eb5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -77,5 +77,6 @@ Suggests: sparklyr (>= 0.8.0), tinytest, varImp, - xgboost + xgboost, + catboost RoxygenNote: 7.1.0 diff --git a/NAMESPACE b/NAMESPACE index ad138732..2dbd4b6c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,7 @@ S3method(vi,Learner) S3method(vi,WrappedModel) S3method(vi,default) S3method(vi,model_fit) +S3method(vi,workflow) S3method(vi_firm,default) S3method(vi_model,C5.0) S3method(vi_model,H2OBinomialModel) @@ -13,6 +14,7 @@ S3method(vi_model,H2ORegressionModel) S3method(vi_model,Learner) S3method(vi_model,RandomForest) S3method(vi_model,WrappedModel) +S3method(vi_model,catboost.Model) S3method(vi_model,cforest) S3method(vi_model,constparty) S3method(vi_model,cubist) @@ -39,11 +41,13 @@ S3method(vi_model,randomForest) S3method(vi_model,ranger) S3method(vi_model,rpart) S3method(vi_model,train) +S3method(vi_model,workflow) S3method(vi_model,xgb.Booster) S3method(vi_permute,default) S3method(vi_shap,default) S3method(vip,default) S3method(vip,model_fit) +S3method(vip,workflow) export("%>%") export("%T>%") export(add_sparklines) diff --git a/R/get_feature_names.R b/R/get_feature_names.R index b1bb21cf..25f96865 100644 --- a/R/get_feature_names.R +++ b/R/get_feature_names.R @@ -282,3 +282,14 @@ get_feature_names.xgb.Booster <- function(object, ...) { object$feature_names } } + +# Package: catboost ------------------------------------------------------------- + +#' @keywords internal +get_feature_names.catboost.Model <- function(object, ...) { + if (is.null(rownames(fit$feature_importances))) { + get_feature_names.default(object) + } else { + rownames(fit$feature_importances) + } +} diff --git a/R/vi.R b/R/vi.R index 555cf7b9..89fa64bf 100644 --- a/R/vi.R +++ b/R/vi.R @@ -188,6 +188,12 @@ vi.model_fit <- function(object, ...) { # package: parsnip vi(object$fit, ...) } +#' @rdname vi +#' +#' @export +vi.workflow <- function(object, ...) { # package: workflows + vi(object$fit$fit$fit, ...) +} #' @rdname vi #' diff --git a/R/vi_model.R b/R/vi_model.R index 4d317b97..4dc865b3 100644 --- a/R/vi_model.R +++ b/R/vi_model.R @@ -238,6 +238,10 @@ #' #' }} #' +#' \item{\code{\link[catboost]{catboost}}}{See \code{\link[catboost]{catboost.get_feature_importance}} or visit +#' \url{https://catboost.ai/docs/concepts/r-reference_catboost-get_feature_importance.html} +#' for details.} +#' #' } #' #' @note Inspired by the \code{\link[caret]{varImp}} function. @@ -724,6 +728,14 @@ vi_model.model_fit <- function(object, ...) { # package: parsnip vi_model(object$fit, ...) } +# Package: parsnip ------------------------------------------------------------- + +#' @rdname vi_model +#' +#' @export +vi_model.workflow <- function(object, ...) { # package: workflows + vi_model(object$fit$fit$fit, ...) +} # Package: party --------------------------------------------------------------- @@ -1335,3 +1347,36 @@ vi_model.xgb.Booster <- function(object, type = c("gain", "cover", "frequency"), tib } + +# Package: catboost ------------------------------------------------------------- + +#' @rdname vi_model +#' +#' @export +vi_model.catboost.Model <- function(object, type = c("FeatureImportance", "PredictionValuesChange", "LossFunctionChange", "Interaction"), ...) { + + # Determine which type of variable importance to compute + type <- match.arg(type) + + # Construct model-specific variable importance scores + imp <- catboost::catboost.get_feature_importance(model = object, type = type, ...) + var_names <- get_feature_names.catboost.Model(object) + + if(type %in% c("LossFunctionChange", "FeatureImportance", "PredictionValuesChange")) { + tib <- tibble::enframe(imp[,1], name = "Variable", value = "Importance") + } else if(type == "Interaction") { + tib <- tibble::as_tibble(imp) + tib <- setNames(tib, c("Variable1", "Variable2", "Importance")) + tib$Variable1 <- var_names[tib$Variable1 + 1] + tib$Variable2 <- var_names[tib$Variable2 + 1] + } + + # Add variable importance type attribute + attr(tib, which = "type") <- type + + # Add "vi" class + class(tib) <- c("vi", class(tib)) + + # Return results + tib +} diff --git a/R/vi_shap.R b/R/vi_shap.R index 48562bdb..18d5b160 100644 --- a/R/vi_shap.R +++ b/R/vi_shap.R @@ -85,3 +85,19 @@ vi_shap.default <- function(object, feature_names = NULL, train = NULL, ...) { tib } + +#' @rdname vi_shap +#' +#' @export +vi_shap.catboost.Model <- function(object, feature_names = NULL, train = NULL, ...) { + # Try to extract feature names if not supplied + if (is.null(feature_names)) { + feature_names <- get_feature_names(object) + } + + # catboost do not give access to the training data directly from the model object. + if (is.null(train)) { + stop("Please provide a `catboost.Pool` object to the train argument. See `catboost::catboost.load_pool()`.") + } + +} diff --git a/R/vip.R b/R/vip.R index 5cc13523..888c5e0a 100644 --- a/R/vip.R +++ b/R/vip.R @@ -308,7 +308,6 @@ vip.default <- function( } - #' @rdname vip #' #' @export @@ -316,3 +315,9 @@ vip.model_fit <- function(object, ...) { vip(object$fit, ...) } +#' @rdname vip +#' +#' @export +vip.workflow <- function(object, ...) { + vip(object$fit$fit$fit, ...) +} diff --git a/inst/tinytest/test_pkg_catboost.R b/inst/tinytest/test_pkg_catboost.R new file mode 100644 index 00000000..cf455aad --- /dev/null +++ b/inst/tinytest/test_pkg_catboost.R @@ -0,0 +1,60 @@ +# Exits +if (!requireNamespace("catboost", quietly = TRUE)) { + exit_file("Package catboost missing") +} + +# # Load required packages +# suppressMessages({ +# library(catboost) +# }) + +# Generate Friedman benchmark data +friedman1 <- gen_friedman(seed = 101) + +# Fit model(s) +set.seed(101) +fit <- catboost::catboost.train( + learn_pool = catboost::catboost.load_pool(friedman1[,-1], friedman1[,1, drop = TRUE]), + params = list(logging_level = "Silent", iterations = 10) +) + +# Compute VI scores +vis_FeatureImportance_default <- vi_model(fit) +vis_FeatureImportance <- vi_model(fit, type = "FeatureImportance") +vis_PredictionValuesChange <- vi_model(fit, type = "PredictionValuesChange") +vis_LossFunctionChange <- vi_model(fit, type = "LossFunctionChange", pool = catboost::catboost.load_pool(friedman1[,-1], friedman1[,1, drop = TRUE])) +vis_Interaction <- vi_model(fit, type = "Interaction") + +# Expectations for `vi_model()` +expect_identical() +expect_identical() +expect_identical() + +# Expectations for `get_training_data()` +expect_error(vip:::get_training_data.default(fit)) + +# Expectations for `get_feature_names()` +expect_identical( + current = vip:::get_feature_names.catboost.Model(fit), + target = paste0("x", 1L:10L) +) + +# Call `vip::vip()` directly +p <- vip(fit, method = "model", include_type = TRUE) + +# Expect `p` to be a `"gg" "ggplot"` object +expect_identical( + current = class(p), + target = c("gg", "ggplot") +) + +# Display VIPs side by side +grid.arrange( + vip(vis_FeatureImportance_default, include_type = TRUE), + vip(vis_FeatureImportance, include_type = TRUE), + vip(vis_PredictionValuesChange, include_type = TRUE), + vip(vis_LossFunctionChange, include_type = TRUE), + # vip(vis_Interaction, include_type = TRUE), + p, + nrow = 1 +) diff --git a/inst/tinytest/test_pkg_workflows.R b/inst/tinytest/test_pkg_workflows.R new file mode 100644 index 00000000..94108b61 --- /dev/null +++ b/inst/tinytest/test_pkg_workflows.R @@ -0,0 +1,59 @@ +# Exits +if (!requireNamespace("workflows", quietly = TRUE)) { + exit_file("Package workflows missing") +} + +# Load required packages +suppressMessages({ + library(workflows) +}) + +# Generate Friedman benchmark data +friedman1 <- gen_friedman(seed = 101) + +# Fit a linear model +lin <- parsnip::linear_reg() %>% + parsnip::set_engine("lm") + +wf <- workflows::workflow() %>% + workflows::add_model(lin) %>% + workflows::add_formula(y ~ .) + +lin_fit <- wf %>% + parsnip::fit(data = friedman1) + +# Compute model-based VI scores +vis <- vi(lin_fit, scale = TRUE) + +# Expect `vi()` and `vi_model()` to both work +expect_identical( + current = vi(lin_fit, sort = FALSE), + target = vi_model(lin_fit) +) + +# Check class +expect_identical(class(vis), target = c("vi", "tbl_df", "tbl", "data.frame")) + +# Check dimensions (should be one row for each feature) +expect_identical(ncol(friedman1) - 1L, target = nrow(vis)) + +# Display VIP +vip(vis, geom = "point") + +# Try permutation importance +set.seed(953) # for reproducibility +p <- vip( + object = lin_fit, + method = "permute", + train = friedman1, + target = "y", + pred_wrapper = predict, + metric = "rmse", + nsim = 30, + geom = "violin", + jitter = TRUE, + all_permutation = TRUE, + mapping = ggplot2::aes(color = Variable) +) +expect_true(inherits(p, what = "ggplot")) +p # display VIP diff --git a/man/vi.Rd b/man/vi.Rd index 0fe265f9..47320242 100644 --- a/man/vi.Rd +++ b/man/vi.Rd @@ -4,6 +4,7 @@ \alias{vi} \alias{vi.default} \alias{vi.model_fit} +\alias{vi.workflow} \alias{vi.WrappedModel} \alias{vi.Learner} \title{Variable importance} @@ -27,6 +28,8 @@ vi(object, ...) \method{vi}{model_fit}(object, ...) +\method{vi}{workflow}(object, ...) + \method{vi}{WrappedModel}(object, ...) \method{vi}{Learner}(object, ...) diff --git a/man/vi_model.Rd b/man/vi_model.Rd index eb298bdf..d56a8065 100644 --- a/man/vi_model.Rd +++ b/man/vi_model.Rd @@ -18,6 +18,7 @@ \alias{vi_model.nn} \alias{vi_model.nnet} \alias{vi_model.model_fit} +\alias{vi_model.workflow} \alias{vi_model.RandomForest} \alias{vi_model.constparty} \alias{vi_model.cforest} @@ -36,6 +37,7 @@ \alias{vi_model.ml_model_random_forest_classification} \alias{vi_model.lm} \alias{vi_model.xgb.Booster} +\alias{vi_model.catboost.Model} \title{Model-specific variable importance} \usage{ vi_model(object, ...) @@ -72,6 +74,8 @@ vi_model(object, ...) \method{vi_model}{model_fit}(object, ...) +\method{vi_model}{workflow}(object, ...) + \method{vi_model}{RandomForest}(object, type = c("accuracy", "auc"), ...) \method{vi_model}{constparty}(object, ...) @@ -107,6 +111,13 @@ vi_model(object, ...) \method{vi_model}{lm}(object, type = c("stat", "raw"), ...) \method{vi_model}{xgb.Booster}(object, type = c("gain", "cover", "frequency"), ...) + +\method{vi_model}{catboost.Model}( + object, + type = c("FeatureImportance", "PredictionValuesChange", "LossFunctionChange", + "Interaction", "ShapValues"), + ... +) } \arguments{ \item{object}{A fitted model object (e.g., a \code{"randomForest"} object).} @@ -121,7 +132,9 @@ argument applies to.} A tidy data frame (i.e., a \code{"tibble"} object) with two columns: \code{Variable} and \code{Importance}. For \code{"lm"/"glm"}-like object, an additional column, called \code{Sign}, is also included which includes the -sign (i.e., POS/NEG) of the original coefficient. +sign (i.e., POS/NEG) of the original coefficient. For \code{"catboost.Model"} +object, type \code{Interaction} returns two Variable columns ("Variable1", +"Variable2"). } \description{ Compute model-specific variable importance scores for the predictors in a @@ -348,6 +361,10 @@ obtain three different types of variable importance: }} +\item{\code{\link[catboost]{catboost}}}{See \code{\link[catboost]{catboost.get_feature_importance}} or visit +\url{https://catboost.ai/docs/concepts/r-reference_catboost-get_feature_importance.html} +for details.} + } } \note{ diff --git a/man/vip.Rd b/man/vip.Rd index 3d78b426..c260adbd 100644 --- a/man/vip.Rd +++ b/man/vip.Rd @@ -4,6 +4,7 @@ \alias{vip} \alias{vip.default} \alias{vip.model_fit} +\alias{vip.workflow} \title{Variable importance plots} \usage{ vip(object, ...) @@ -29,6 +30,8 @@ vip(object, ...) ) \method{vip}{model_fit}(object, ...) + +\method{vip}{workflow}(object, ...) } \arguments{ \item{object}{A fitted model object (e.g., a \code{"randomForest"} object) or