Creating Create-New-Columns Steps
This template is for steps that create new columns from existing ones.
Overview
Create-new-columns steps: - Generate new columns based on existing columns - Can optionally keep or remove original columns - Assign roles to new columns - Examples: step_dummy, step_pca, step_interact, step_poly
Canonical implementations in recipes: - Encoding: R/dummy.R (one-hot encoding), R/bin2factor.R - Dimension reduction: R/pca.R, R/ica.R, R/kpca.R - Feature engineering: R/interact.R, R/poly.R, R/bs.R (basis splines) - Date features: R/date.R, R/holiday.R
Test patterns: - Encoding tests: tests/testthat/test-dummy.R - Dimension reduction: tests/testthat/test-pca.R
Source Development: When contributing to recipes itself, use internal helpers directly:
recipes_eval_select(),remove_original_cols(),check_name(), etc. Norecipes::prefix needed. See Best Practices (Source).
Key Differences from Modify-in-Place
roledefault is"predictor"(notNA)- Includes
keep_original_colsparameter - Calls
remove_original_cols()inbake() - May need to implement
.recipes_estimate_sparsity()for sparse support tidy()returns column names created, not just input columns
Complete Template
This template follows the same pattern as R/dummy.R and R/interact.R in the recipes repository.
#' Title for your step that creates new columns
#'
#' `step_yournewcols()` creates a *specification* of a recipe step that will
#' create new columns based on [description].
#'
#' @inheritParams step_center
#' @param ... One or more selector functions to choose variables for this step.
#' See [recipes::selections()] for more details.
#' @param role For model terms created by this step, what analysis role should
#' they be assigned? By default, the new columns created by this step will
#' be used as predictors in a model.
#' @param [your_params] Description of step-specific parameters.
#' @param columns A character vector of column names that will be populated
#' (eventually) by the [terms] argument. This is `NULL` until computed by
#' [prep()].
#' @param [learned_info] Description of information learned during prep().
#' This is `NULL` until computed by [prep()].
#' @param keep_original_cols A logical to keep the original variables in the
#' output. Defaults to `FALSE`.
#'
#' @return An updated version of `recipe` with the new step added to the
#' sequence of any existing operations.
#'
#' @family [your step category] steps
#' @export
#'
#' @details
#'
#' [Detailed explanation of the step, including formulas and computational
#' details.]
#'
#' When you [`tidy()`][recipes::tidy.recipe()] this step, a tibble is returned with
#' columns `terms`, `columns`, and `id`:
#'
#' \describe{
#' \item{terms}{character, the selectors or variables selected}
#' \item{columns}{character, names of the new columns created}
#' \item{id}{character, id of this step}
#' }
#'
#' # Case weights
#'
#' This step performs an unsupervised operation that can utilize case weights.
#' As a result, case weights are used with frequency weights as well as
#' importance weights. For more information, see the documentation in
#' [recipes::case_weights] and the examples on `tidymodels.org`.
#'
#' @examplesIf rlang::is_installed("modeldata")
#' data(biomass, package = "modeldata")
#'
#' rec <- recipe(HHV ~ ., data = biomass)
#'
#' # Create new columns
#' step_trans <- rec |>
#' step_yournewcols(carbon, hydrogen)
#'
#' step_trained <- prep(step_trans, training = biomass)
#'
#' # See new columns created
#' transformed <- bake(step_trained, biomass)
#' names(transformed)
#'
#' # Keep original columns
#' step_keep <- rec |>
#' step_yournewcols(carbon, hydrogen, keep_original_cols = TRUE)
#'
#' step_keep_trained <- prep(step_keep, training = biomass)
#' transformed_keep <- bake(step_keep_trained, biomass)
#' names(transformed_keep)
#'
#' tidy(step_trans, number = 1)
#' tidy(step_trained, number = 1)
step_yournewcols <- function(
recipe,
...,
role = "predictor",
trained = FALSE,
your_param = default_value,
columns = NULL,
learned_info = NULL,
keep_original_cols = FALSE,
skip = FALSE,
id = recipes::rand_id("yournewcols")
) {
recipes::add_step(
recipe,
step_yournewcols_new(
terms = rlang::enquos(...),
trained = trained,
role = role,
your_param = your_param,
columns = NULL,
learned_info = learned_info,
keep_original_cols = keep_original_cols,
skip = skip,
id = id,
case_weights = NULL
)
)
}
step_yournewcols_new <- function(terms, role, trained, your_param, columns,
learned_info, keep_original_cols, skip, id,
case_weights) {
recipes::step(
subclass = "yournewcols",
terms = terms,
role = role,
trained = trained,
your_param = your_param,
columns = columns,
learned_info = learned_info,
keep_original_cols = keep_original_cols,
skip = skip,
id = id,
case_weights = case_weights
)
}
#' @export
prep.step_yournewcols <- function(x, training, info = NULL, ...) {
col_names <- recipes::recipes_eval_select(x$terms, training, info)
recipes::check_type(training[, col_names], types = c("double", "integer"))
wts <- recipes::get_case_weights(info, training)
were_weights_used <- recipes::are_weights_used(wts, unsupervised = TRUE)
if (isFALSE(were_weights_used)) {
wts <- NULL
}
# Compute information needed to create new columns
# Example: compute coefficients, levels, loadings, etc.
learned_info <- compute_your_transformation(
training[, col_names],
wts,
x$your_param
)
step_yournewcols_new(
terms = x$terms,
role = x$role,
trained = TRUE,
your_param = x$your_param,
columns = col_names,
learned_info = learned_info,
keep_original_cols = x$keep_original_cols,
skip = x$skip,
id = x$id,
case_weights = were_weights_used
)
}
#' @export
bake.step_yournewcols <- function(object, new_data, ...) {
col_names <- object$columns
recipes::check_new_data(col_names, object, new_data)
# Create new columns based on learned information
# Example: create interaction terms, dummy variables, etc.
new_cols <- create_new_columns(
new_data[, col_names],
object$learned_info,
object$your_param
)
# Add new columns to data
new_data <- vctrs::vec_cbind(new_data, new_cols)
# Optionally remove original columns
new_data <- recipes::remove_original_cols(new_data, object, col_names)
new_data
}
#' @export
print.step_yournewcols <- function(x, width = max(20, options()$width - 30), ...) {
title <- "Creating new columns from "
recipes::print_step(
x$columns,
x$terms,
x$trained,
title,
width,
case_weights = x$case_weights
)
invisible(x)
}
#' @rdname tidy.recipe
#' @export
tidy.step_yournewcols <- function(x, ...) {
if (recipes::is_trained(x)) {
# Return information about created columns
res <- tibble::tibble(
terms = rep(x$columns, times = lengths_of_new_cols),
columns = names_of_new_columns_created
)
} else {
term_names <- recipes::sel2char(x$terms)
res <- tibble::tibble(
terms = term_names,
columns = rlang::na_chr
)
}
res$id <- x$id
res
}
# Optional: Implement sparsity estimation if your step can create sparse columns
.recipes_estimate_sparsity.step_yournewcols <- function(x, data, ...) {
# Estimate how many columns will be created and their sparsity
# Return a list with n_cols (integer) and sparsity (numeric between 0 and 1)
col_names <- recipes::recipes_eval_select(x$terms, data, recipes::get_recipe_info(data))
# Example: dummy coding creates n-1 columns with estimated sparsity
n_new_cols <- estimate_number_of_columns(data[, col_names])
est_sparsity <- estimate_sparsity(data[, col_names])
list(
n_cols = n_new_cols,
sparsity = est_sparsity
)
}Additional Notes for Create-New-Columns Steps
Column naming
- Use descriptive, consistent naming conventions
- Consider providing a
namingparameter for custom naming functions - Validate that new names don’t conflict with existing columns using
recipes::check_name()
Example naming pattern:
# For dummy variables
new_col_names <- paste0(original_col, "_", levels)
# For polynomial features
new_col_names <- paste0(original_col, "_poly_", 1:degree)
# For interactions
new_col_names <- paste(col1, col2, sep = "_x_")Sparsity support
- If your step creates sparse columns (like dummy variables), implement
.recipes_estimate_sparsity() - This allows workflows to optimize sparse data handling
- Return a list with
n_cols(integer) andsparsity(numeric 0-1)
Example:
.recipes_estimate_sparsity.step_dummy <- function(x, data, ...) {
col_names <- recipes_eval_select(x$terms, data, get_recipe_info(data))
# Count total levels across all factor columns
n_new_cols <- sum(sapply(data[col_names], function(x) nlevels(x) - 1))
# Estimate sparsity: for dummy variables, typically high
est_sparsity <- 0.8 # 80% zeros for typical categorical data
list(n_cols = n_new_cols, sparsity = est_sparsity)
}keep_original_cols
- Always use
remove_original_cols()inbake()to handle this parameter - Don’t implement the logic yourself - the helper does it correctly
# Good: use the helper
new_data <- recipes::remove_original_cols(new_data, object, col_names)
# Bad: manual implementation (error-prone)
if (!object$keep_original_cols) {
new_data <- new_data[, !names(new_data) %in% col_names]
}The helper correctly handles: - Preserving column order - Handling role assignments - Edge cases with column names
Common Patterns
Dummy variables (one-hot encoding)
prep:
# Store factor levels for each column
levels_list <- lapply(training[col_names], levels)
bake:
# Create dummy variables
for (col in col_names) {
dummies <- model.matrix(~ . - 1, data = data.frame(x = new_data[[col]]))
colnames(dummies) <- paste0(col, "_", levels_list[[col]])
new_data <- vctrs::vec_cbind(new_data, as.data.frame(dummies))
}Polynomial features
prep:
# Store degree parameter (no learning needed)
bake:
for (col in col_names) {
for (deg in 2:degree) {
new_col_name <- paste0(col, "_poly_", deg)
new_data[[new_col_name]] <- new_data[[col]]^deg
}
}Interaction terms
prep:
# Store which pairs to create interactions for
bake:
for (i in 1:(length(col_names)-1)) {
for (j in (i+1):length(col_names)) {
new_col_name <- paste(col_names[i], col_names[j], sep = "_x_")
new_data[[new_col_name]] <- new_data[[col_names[i]]] * new_data[[col_names[j]]]
}
}Testing
See package-extension-requirements.md#testing-requirements for comprehensive testing guide.
Key tests for create-new-columns steps
test_that("new columns are created", {
rec <- recipe(mpg ~ ., data = mtcars) |>
step_yournewcols(cyl, gear)
prepped <- prep(rec, training = mtcars)
results <- bake(prepped, mtcars)
# Verify new columns exist
expect_true("cyl_new" %in% names(results))
expect_true("gear_new" %in% names(results))
# Verify original columns removed by default
expect_false("cyl" %in% names(results))
expect_false("gear" %in% names(results))
})
test_that("keep_original_cols works", {
rec <- recipe(mpg ~ ., data = mtcars) |>
step_yournewcols(cyl, keep_original_cols = TRUE)
prepped <- prep(rec, training = mtcars)
results <- bake(prepped, mtcars)
# Verify both original and new columns exist
expect_true("cyl" %in% names(results))
expect_true("cyl_new" %in% names(results))
})
test_that("tidy returns correct information", {
rec <- recipe(mpg ~ ., data = mtcars) |>
step_yournewcols(cyl, gear)
# Untrained
untrained_tidy <- tidy(rec, 1)
expect_equal(untrained_tidy$columns, rep(NA_character_, 2))
# Trained
prepped <- prep(rec, training = mtcars)
trained_tidy <- tidy(prepped, 1)
expect_true(all(c("cyl_new", "gear_new") %in% trained_tidy$columns))
})Next Steps
- Understand architecture: step-architecture.md
- Modify in place: modify-in-place-steps.md
- Row operations: row-operation-steps.md
- Add optional methods: optional-methods.md
- Learn helper functions: helper-functions.md
- Document your step: package-roxygen-documentation.md
- Write tests: package-extension-requirements.md#testing-requirements