## Perform calibration sampling based on spectral PCA or random split ---------

split_data_q <- function(spec_chem,
                         split_method,
                         evaluation_method = "test_set",
                         ratio_val,
                         ken_sto_pc = 2,
                         print = TRUE,
                         invert = FALSE,
                         env = parent.frame()) {

  MIR <- model <- type <- PC1 <- PC2 <- NULL

  # Evaluate the invert argument in the parent function (fit_pls)
  invert <- eval(invert, envir = parent.frame())
  # Evaluate the evaluation_method argument in the parent function (fit_pls)
  evaluation_method <- eval(evaluation_method, envir = parent.frame())

  # Slice based on sample_id if the spectral data are in a tibble
  if (tibble::is_tibble(spec_chem)) {
    spec_chem <- spec_chem %>%
      dplyr::group_by(!!rlang::sym("sample_id")) %>%
      dplyr::slice(1L)
  }

  if (evaluation_method == "test_set") {
    # (previously pc = 0.99)
    ken_sto_pc <- eval(ken_sto_pc, envir = parent.frame())
    if (invert == FALSE) {
      ## Select calibration set by the Kennard-Stone algorithm
      # Check if tibble; if yes, bind the list of data.tables into one
      # data table holding the spectral data
      if (tibble::is_tibble(spec_chem)) {
        spc_pre <- as.matrix(data.table::rbindlist(spec_chem$spc_pre))
        # k = number of samples to select
        # ken_sto_pc = if provided, the number of principal components
        # (see ?prospectr::kenStone)
        sel <- prospectr::kenStone(X = spc_pre,
          k = round((1 - ratio_val) * nrow(spec_chem)),
          pc = substitute(ken_sto_pc))
      } else {
        sel <- prospectr::kenStone(X = spec_chem$MIR,
          k = round((1 - ratio_val) * nrow(spec_chem)),
          pc = substitute(ken_sto_pc))
      }
      # Split spectral data into calibration and validation sets using
      # the results of Kennard-Stone calibration sampling;
      # select by row index of calibration samples
      val_set <- spec_chem[- sel$model, ]
      cal_set <- spec_chem[sel$model, ]
      # Optionally split up calibration (training) and validation (test)
      # sets randomly; uses the modelr package
      # !!! Important note: the option to split up the calibration and
      # validation sets randomly is still experimental and a modification for
      # the PC space projection is not yet implemented for graphical output.
      # p_pc ggplot2 output needs to be updated for split_method = "random"
      if (split_method == "random") {
        # Split data into test and training sets using the modelr package
        df_split <- modelr::crossv_mc(spec_chem, n = 1, test = ratio_val)
        # Select train element of df_split and convert back into a tibble;
        # assign to calibration set
        cal_set <- tibble::as_tibble(df_split[1, ][["train"]][[1]])
        # Select test element of df_split and convert back into a tibble;
        # assign to validation set
        val_set <- tibble::as_tibble(df_split[1, ][["test"]][[1]])
      }
      sel_df_cal <- data.frame(sel$pc[sel$model, 1:2])
      sel_df_val <- data.frame(sel$pc[- sel$model, 1:2])
    } else {
      if (tibble::is_tibble(spec_chem)) {
        spc_pre <- as.matrix(data.table::rbindlist(spec_chem$spc_pre))
        sel <- prospectr::kenStone(X = spc_pre,
          k = round(ratio_val * nrow(spec_chem)),
          pc = substitute(ken_sto_pc))
      } else {
        ## Select validation set by the Kennard-Stone algorithm
        sel <- prospectr::kenStone(X = spec_chem$MIR,
          k = round(ratio_val * nrow(spec_chem)),
          pc = substitute(ken_sto_pc))
      }
      sel_df_cal <- data.frame(sel$pc[- sel$model, 1:2])
      sel_df_val <- data.frame(sel$pc[sel$model, 1:2])
      # Split spectral data into calibration and validation sets using
      # the results of Kennard-Stone calibration sampling;
      # select by row index of calibration samples
      val_set <- spec_chem[sel$model, ]
      cal_set <- spec_chem[- sel$model, ]
    }

    # Add a type column to the calibration and validation score data frames
    # for plotting
    sel_df_cal$type <- as.factor(rep("calibration", nrow(sel_df_cal)))
    sel_df_val$type <- as.factor(rep("validation", nrow(sel_df_val)))
    sel_df <- rbind(sel_df_cal, sel_df_val)

    # Compute ratio needed to make the figure square
    ratio <- with(sel_df, diff(range(PC1)) / diff(range(PC2)))

    # Graph showing the selected calibration and validation samples
    # for the first two principal components (PC)
    p_pc <- ggplot2::ggplot(data = sel_df) +
      ggplot2::geom_point(
        ggplot2::aes(x = PC1, y = PC2, shape = type), size = 4) +
      ggplot2::coord_fixed(ratio = 1) +
      ggplot2::scale_shape_manual(values = c(1, 19)) +
      ggplot2::scale_colour_manual(values = c("black", "red")) +
      ggplot2::theme_bw() +
      ggplot2::theme(legend.title = ggplot2::element_blank())

    # Return outputs as a list
    list_out <- list(
      calibration = cal_set,
      validation = val_set,
      p_pc = p_pc
    )
    list_out
    # Check number of observations (rows) for calibration set
    # nrow(cal_set)
  } else {
    cal_set <- spec_chem
    list(
      calibration = cal_set,
      validation = NULL,
      p_pc = NULL
    )
  }
}
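## Illustrative sketch (not run): how the Kennard-Stone split used above works
## on its own, calling prospectr::kenStone() directly on a simulated spectral
## matrix. The object names `spc_sim` and `ratio_val_demo` are hypothetical.
if (FALSE) {
  set.seed(42)
  # 50 "spectra" with 200 spectral variables
  spc_sim <- matrix(rnorm(50 * 200), nrow = 50)
  ratio_val_demo <- 1/3
  # Select calibration samples in the PCA score space (first 2 components),
  # as split_data_q() does when invert = FALSE
  sel <- prospectr::kenStone(
    X = spc_sim,
    k = round((1 - ratio_val_demo) * nrow(spc_sim)),
    pc = 2
  )
  cal <- spc_sim[sel$model, ]   # calibration (training) rows
  val <- spc_sim[-sel$model, ]  # validation (test) rows
}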
# trainControl generating helper function
control_train_q <- function(x, response, resampling_seed, env = parent.frame()) {
  calibration <- NULL
  # x is the list of calibration and validation samples.
  # Set up a cross-validation scheme: create 10 folds that are kept for the
  # different modeling approaches to allow comparison (randomly break the
  # data into 10 partitions; note that k equal to the total number of samples
  # corresponds to leave-one-out).
  # Use the substitute function for non-standard evaluation of the response
  # argument (looks at a function argument and sees the code used to compute
  # the value); see chapter 13.1 "Capturing expressions" in Advanced R
  # (Hadley Wickham), p. 270
  response <- eval(response, x$calibration, env)
  # Set seed for creating resampling indices
  set.seed(eval(resampling_seed, env))
  idx <- caret::createFolds(y = response, k = 10, returnTrain = TRUE)
  # Inject the index into the trainControl object
  caret::trainControl(method = "cv", index = idx,
    savePredictions = TRUE, selectionFunction = "oneSE")
}

## Adapt model tuning to leave-one-out cross-validation ========================

## trainControl generating helper function
control_train_loocv_q <- function(x, response, env = parent.frame()) {
  calibration <- NULL
  # r: response
  response <- eval(response, x$calibration, env)
  # Set up leave-one-out cross-validation
  caret::trainControl(method = "LOOCV",
    savePredictions = TRUE, selectionFunction = "oneSE")
}

## Adapt model tuning to repeated k-fold cross-validation ======================

## trainControl generating helper function
control_train_rcv_q <- function(x, response, resampling_seed,
                                env = parent.frame()) {
  calibration <- NULL
  # r: response
  response <- eval(response, x$calibration, env)
  # Set seed for creating resampling indices
  set.seed(eval(resampling_seed, env))
  # Set up 5 times repeated 10-fold cross-validation
  idx <- caret::createMultiFolds(y = response, k = 10, times = 5)
  # Inject the index into the trainControl object
  caret::trainControl(method = "repeatedcv", index = idx,
    savePredictions = TRUE, selectionFunction = "oneSE")
}

## Fitting models without parameter tuning =====================================
# See section 5.9: https://topepo.github.io/caret/model-training-and-tuning.html

## trainControl generating helper function
control_train_none_q <- function(x, response, resampling_seed,
                                 env = parent.frame()) {
  calibration <- NULL
  # r: response
  response <- eval(response, x$calibration, env)
  # Set seed for creating resampling indices
  set.seed(eval(resampling_seed, env))
  # Set the trainControl method to "none" so that caret::train will only fit
  # one model to the entire training set; a fixed number of PLS components
  # is used instead
  idx <- caret::createFolds(y = response, k = 10, returnTrain = TRUE)
  # Inject the index into the trainControl object
  caret::trainControl(method = "none", index = idx,
    savePredictions = TRUE, selectionFunction = "oneSE")
}

## Standard evaluation version of the trainControl helper function
control_train <- function(x, response, resampling_seed = 123,
                          env = parent.frame()) {
  control_train_q(x, substitute(response), substitute(resampling_seed), env)
}
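## Illustrative sketch (not run): the helpers above pre-generate resampling
## indices so that several models can be tuned on identical folds and stay
## comparable. A stand-alone equivalent with hypothetical objects `x_demo`
## and `y_demo` (requires the pls and randomForest packages if executed):
if (FALSE) {
  set.seed(123)
  y_demo <- rnorm(50)
  x_demo <- data.frame(matrix(rnorm(50 * 10), nrow = 50))
  # Folds are created once ...
  idx <- caret::createFolds(y = y_demo, k = 10, returnTrain = TRUE)
  tr_control <- caret::trainControl(method = "cv", index = idx,
    savePredictions = TRUE, selectionFunction = "oneSE")
  # ... and reused for different model types, e.g. PLS and random forest
  fit_a <- caret::train(x = x_demo, y = y_demo, method = "pls",
    trControl = tr_control)
  fit_b <- caret::train(x = x_demo, y = y_demo, method = "rf",
    trControl = tr_control)
}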
# Fit a PLS regression model using the caret package ---------------------------

## Train a PLS regression model
train_pls_q <- function(x, evaluation_method = "test_resampling", response,
                        tr_control, env = parent.frame(),
                        pls_ncomp_max = 20, ncomp_fixed = 5,
                        center, scale, tuning_method = "resampling") {
  # Fit a partial least squares regression (pls) model;
  # center and scale the spectra (you can try without)
  calibration <- MIR <- NULL
  r <- eval(response, x$calibration, env)
  # ? Is it really necessary to evaluate this in the parent frame?
  pls_ncomp_max <- eval(pls_ncomp_max, envir = parent.frame())
  # Evaluate the fixed number of PLS regression components
  # from the ncomp_fixed object in the parent frame (fit_pls function)
  ncomp_fixed <- eval(ncomp_fixed, envir = parent.frame())
  # Test whether the spectral object has the class "tibble"
  if (!tibble::is_tibble(x$calibration)) {
    stop("spec_chem needs to be of class tibble")
  }
  spc_pre <- data.table::rbindlist(x$calibration$spc_pre)
  if (scale == TRUE && center == TRUE) {
    if (tuning_method == "resampling") {
      # Fit model with parameter tuning
      pls_model <- caret::train(x = spc_pre, y = r,
        method = "pls",
        tuneLength = pls_ncomp_max,
        trControl = tr_control,
        preProcess = c("center", "scale"))
    } else if (tuning_method == "none") {
      # Fit model without parameter tuning
      pls_model <- caret::train(x = spc_pre, y = r,
        method = "pls",
        trControl = tr_control,
        preProcess = c("center", "scale"),
        tuneGrid = data.frame(ncomp = ncomp_fixed))
    }
  } else {
    # No centering and scaling!
    pls_model <- caret::train(x = spc_pre, y = r,
      method = "pls",
      tuneLength = pls_ncomp_max,
      trControl = tr_control)
  }
  # Return the fitted caret model
  pls_model
}

## Standard evaluation version for training a PLS regression model
train_pls <- function(x, response, evaluation_method = "resampling",
                      tr_control, env = parent.frame()) {
  train_pls_q(x = x,
    evaluation_method = substitute(evaluation_method),
    response = substitute(response),
    tr_control = tr_control,
    env = env)
}

# Fit a random forest model using the caret package ----------------------------

## Train a random forest model
train_rf_q <- function(x, validation = TRUE, evaluation_method = "resampling",
                       response, tr_control, ntree_max = 500,
                       env = parent.frame()) {
  # Fit a random forest (rf) model;
  # center and scale the spectra (you can try without)
  calibration <- MIR <- NULL
  response <- eval(response, x$calibration, env)
  ntree_max <- eval(ntree_max, envir = parent.frame())
  if (tibble::is_tibble(x$calibration)) {
    spc_pre <- data.table::rbindlist(x$calibration$spc_pre)
    rf_model <- caret::train(x = spc_pre, y = response,
      method = "rf",
      ntree = ntree_max,
      trControl = tr_control,
      preProcess = c("center", "scale"))
  } else {
    rf_model <- caret::train(x = x$calibration$MIR, y = response,
      method = "rf",
      ntree = ntree_max,
      trControl = tr_control,
      preProcess = c("center", "scale"))
  }
  rf_model
}
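## Illustrative sketch (not run): what train_pls_q() does at its core, on
## simulated data. `tuneLength` evaluates ncomp = 1, ..., pls_ncomp_max via
## resampling, whereas a one-row `tuneGrid` fits a single model with a fixed
## number of components (as used with tuning_method = "none"). All object
## names here are hypothetical.
if (FALSE) {
  set.seed(1)
  spc_demo <- data.frame(matrix(rnorm(60 * 100), nrow = 60))
  y_demo <- rnorm(60)
  tr_control <- caret::trainControl(method = "cv", number = 10,
    savePredictions = TRUE, selectionFunction = "oneSE")
  # Tune the number of PLS components (1 to 10) by 10-fold cross-validation
  pls_tuned <- caret::train(x = spc_demo, y = y_demo, method = "pls",
    tuneLength = 10, trControl = tr_control,
    preProcess = c("center", "scale"))
  # Fit one model with a fixed number of components, no tuning
  pls_fixed <- caret::train(x = spc_demo, y = y_demo, method = "pls",
    tuneGrid = data.frame(ncomp = 5),
    trControl = caret::trainControl(method = "none"),
    preProcess = c("center", "scale"))
}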
# Evaluate model performance (validation and cross-validation) -----------------

## Helper function to transform repeated k-fold cross-validation hold-out
## predictions
transform_cvpredictions <- function(cal_index, predobs_cv) {
  predobs_cv <- dplyr::full_join(cal_index, predobs_cv, by = "rowIndex") %>%
    dplyr::group_by(!!rlang::sym("sample_id")) %>%
    # Average observed and predicted values
    dplyr::mutate("obs" = mean(!!rlang::sym("obs")),
      "pred_sd" = sd(!!rlang::sym("pred"))) %>%
    # Add 95% confidence interval for mean hold-out predictions from
    # repeated k-fold cross-validation
    dplyr::mutate_at(.vars = dplyr::vars(!!rlang::sym("pred")),
      .funs = dplyr::funs("pred_sem_ci" = sem_ci)) %>%
    # Add mean hold-out predictions from repeated k-fold cross-validation
    dplyr::mutate("pred" = mean(!!rlang::sym("pred"))) %>%
    # Slice data set to only have one row per sample_id
    dplyr::slice(1L)
}

## Evaluate PLS performance
evaluate_model_q <- function(x, model, response, evaluation_method,
                             tuning_method, resampling_method,
                             print = TRUE, env = parent.frame()) {
  # Set global variables to NULL to avoid R CMD check notes
  MIR <- object <- dataType <- obs <- pred_sem_ci <- pred <- NULL
  ncomp <- finalModel <- rmse <- r2 <- rpd <- n <- calibration <- NULL

  # Collect fitted object into a list
  list_models <- list("final_model" = model)
  # Evaluate the evaluation_method argument in the parent frame
  evaluation_method <- eval(evaluation_method, envir = parent.frame())
  # Evaluate the tuning_method argument in the parent frame
  tuning_method <- eval(tuning_method, envir = parent.frame())
  # Evaluate the resampling_method argument in the parent frame
  resampling_method <- eval(resampling_method, envir = parent.frame())

  # Extract best tuning parameters and associated cv predictions
  if (evaluation_method == "test_set") {
    # Calculate training (calibration) and test (validation) data
    # predictions based on the pls model fitted with calibration data
    r <- eval(response, x$validation, env)
    if (!tibble::is_tibble(x$validation)) {
      stop("Spectra and reference data need to be provided as a tibble (class `tbl_df`, `tbl`, `data.frame`)")
    }
    spc_pre <- data.table::rbindlist(x$validation$spc_pre)
    predobs <- caret::extractPrediction(list_models,
      testX = spc_pre, testY = r)
    # Append sample_id column to the predobs data frame;
    # extract sample_id from the calibration and validation sets
    predobs$sample_id <- c(
      x$calibration$sample_id,
      x$validation$sample_id)
    # Create new data frame column
    predobs$object <- predobs$model
    # Replace levels "Training" and "Test" in the dataType column
    # by "Calibration" and "Validation" (rename levels of factor)
    predobs$dataType <- plyr::revalue(predobs$dataType,
      c("Test" = "Validation", "Training" = "Calibration"))
    # Change the order of the factor levels:
    # Calibration as first level (show Calibration in the ggplot graph
    # on the left panel)
    predobs$dataType <- factor(predobs$dataType,
      levels = c("Calibration", "Validation"))
    # Calculate model performance indices by model and dataType;
    # uses the plyr package and the function summary_df
    stats <- plyr::ddply(predobs, c("model", "dataType"),
      function(x) summary_df(x, "obs", "pred"))
    # Check whether method = "none" is selected in train();
    # this is the case when the ncomp_fixed argument in fit_pls() is
    # evaluated.
    # Checking for the existence of an element in the train function output
    # list can be dangerous and doesn't work in all cases when using
    # e.g. is.element('pred', x) or is.null(x$pred);
    # problems can occur e.g. if a list element contains a NULL element; see
    # http://stackoverflow.com/questions/7719741/how-to-test-if-list-element-exists
  } else if (evaluation_method == "resampling" &&
             tuning_method == "resampling") {
    # Good discussion on which cross-validation results are returned by caret:
    # http://stats.stackexchange.com/questions/219154/how-does-cross-validation-in-train-caret-precisely-work
    # Alternative solution for one model: conformal::GetCVPreds(model);
    # see https://github.com/cran/conformal/blob/master/R/misc.R
    # Extract the cv hold-out predictions associated with the best tuning
    # parameters
    predobs_cv <- plyr::ldply(list_models,
      function(x) dplyr::semi_join(x$pred, x$bestTune, by = "ncomp"),
      .id = "model")
    # Extract auto-prediction
    predobs <- caret::extractPrediction(list_models)
    # Replace level "Training" in the dataType column
    # by "Calibration" (rename levels of factor)
    predobs$dataType <- plyr::revalue(predobs$dataType,
      c("Training" = "Calibration"))
    # Append sample_id column to the predobs data frame;
    # extract sample_id from the calibration set
    predobs$sample_id <- x$calibration$sample_id
    # Create rowIndex for the calibration tibble
    x$calibration$rowIndex <- 1:nrow(x$calibration)
    # Generate a sample_id column for the rowIndex of the pred list element of
    # the train object; select only rowIndex and sample_id of the calibration
    # tibble
    vars_indexing <- c("rowIndex", "sample_id")
    cal_index <- dplyr::select(x$calibration, !!!rlang::syms(vars_indexing))

    # Transform cross-validation hold-out predictions --------------------------
    predobs_cv <- transform_cvpredictions(cal_index = cal_index,
      predobs_cv = predobs_cv)
    predobs_cv$object <- predobs_cv$model
    predobs_cv$model <- factor(predobs_cv$model)
    predobs_cv$dataType <- factor("Cross-validation")
    vars_keep <- c("obs", "pred", "pred_sd", "pred_sem_ci",
      "model", "dataType", "object")
    predobs_cv <- dplyr::select(predobs_cv,
      !!!rlang::syms(vars_keep))
    # Add column pred_sd to the predobs data frame (assign NA) so that
    # column pred_sd is retained after a possible dplyr::bind_rows
    predobs$pred_sd <- NA
    # Doesn't work because some columns are turned into numeric;
    # the resulting data frame has only two rows
    # pb_2018-11-09: the model evaluation graph should only show
    # cross-validation results when `evaluation_method` == "resampling" &&
    # `tuning_method` == "resampling"
    predobs <- predobs_cv
    # predobs <- dplyr::bind_rows(predobs, predobs_cv)
    # Calculate model performance indices by model and dataType;
    # uses the plyr package and the function summary_df
    stats <- suppressWarnings(plyr::ddply(predobs, c("model", "dataType"),
      function(x) summary_df(x, "obs", "pred")))
  }

  # Add number of components to stats; taken from the finalModel list item
  # of the train() function output (caret package)
  stats$ncomp <- rep(model$finalModel$ncomp, nrow(stats))
  # !!! Experimental: return stats
  # return(stats)

  # Add range of observed values for calibration and validation;
  # get the range from the predicted vs. observed data frame
  # stored in object predobs
  obs_cal <- subset(predobs, dataType == "Calibration")$obs
  # Get name of predicted variable; see p. 261 of Advanced R (Hadley Wickham)
  response_name <- deparse(response)

  if (evaluation_method == "test_set") {
    # Assign validation set observations to a separate vector
    obs_val <- subset(predobs, dataType == "Validation")$obs
    # before: deparse(substitute(variable))
    df_range <- data.frame(
      response = rep(response_name, 2),
      dataType = c("Calibration", "Validation"),
      min_obs = c(range(obs_cal)[1], range(obs_val)[1]),
      median_obs = c(median(obs_cal), median(obs_val)),
      max_obs = c(range(obs_cal)[2], range(obs_val)[2]),
      mean_obs = c(mean(obs_cal), mean(obs_val)),
      CV = c(sd(obs_cal) / mean(obs_cal) * 100,
        sd(obs_val) / mean(obs_val) * 100)
    )
  } else if (evaluation_method == "resampling" &&
             tuning_method == "resampling") {
    # Assign cross-validation observations to a separate vector
    obs_val <- subset(predobs, dataType == "Cross-validation")$obs
    df_range <- data.frame(
      response = rep(response_name, 2),
      dataType = factor(c("Calibration", "Cross-validation")),
      min_obs = c(range(obs_cal)[1], range(obs_val)[1]),
      median_obs = c(median(obs_cal), median(obs_val)),
      max_obs = c(range(obs_cal)[2], range(obs_val)[2]),
      mean_obs = c(mean(obs_cal), mean(obs_val)),
      CV = c(sd(obs_cal) / mean(obs_cal) * 100,
        sd(obs_val) / mean(obs_val) * 100)
    )
  }

  # Join stats with the range data frame (df_range)
  stats <- suppressWarnings(dplyr::inner_join(stats, df_range,
    by = "dataType"))

  annotation <- plyr::mutate(stats,
    rmse = as.character(as.expression(paste0("RMSE == ", round(rmse, 2)))),
    r2 = as.character(as.expression(paste0("italic(R)^2 == ", round(r2, 2)))),
    rpd = as.character(as.expression(paste("RPD == ", round(rpd, 2)))),
    n = as.character(as.expression(paste0("italic(n) == ", n))),
    ncomp = as.character(as.expression(paste0("ncomp == ", ncomp)))
  )

  # Plot predicted vs. observed values and model performance indices;
  # add the number of samples per panel to facet_grid using a
  # labelling function
  # Update labeller API:
  # https://github.com/hadley/ggplot2/commit/ef33dc7
  # http://sahirbhatnagar.com/facet_wrap_labels
  # Prepare lookup character vector
  make_label <- function(x, evaluation_method = "test_set",
                         resampling_method = "kfold_cv") {
    dataType <- n <- NULL
    if (evaluation_method == "test_set") {
      c(`Calibration` = paste0("Calibration", "~(",
          x[x$dataType == "Calibration", ]$n, ")"),
        `Validation` = paste0("Validation", "~(",
          x[x$dataType == "Validation", ]$n, ")"))
    } else if (evaluation_method == "resampling" &&
               resampling_method == "rep_kfold_cv") {
      c(`Calibration` = paste0("Calibration", "~(",
          x[x$dataType == "Calibration", ]$n, ")"),
        `Cross-validation` = paste0("5%*%repeated~10*-fold~CV", "~(",
          x[x$dataType == "Cross-validation", ]$n, ")"))
    } else {
      c(`Calibration` = paste0("Calibration", "~(",
          x[x$dataType == "Calibration", ]$n, ")"),
        `Cross-validation` = paste0("10*-fold~CV", "~(",
          x[x$dataType == "Cross-validation", ]$n, ")"))
    }
  }

  if (evaluation_method == "test_set") {
    label_validation <- make_label(x = annotation,
      evaluation_method = "test_set")
  } else if (evaluation_method == "resampling" &&
             resampling_method == "rep_kfold_cv") {
    label_validation <- make_label(x = annotation,
      evaluation_method = "resampling",
      resampling_method = "rep_kfold_cv")
  } else {
    label_validation <- make_label(x = annotation,
      evaluation_method = "resampling")
  }

  # Rename labels on the fly with a lookup character vector
  to_string <- ggplot2::as_labeller(x = label_validation,
    ggplot2::label_parsed)

  # Create model evaluation plot -----------------------------------------------
  ## ggplot graph for model comparison (arranged later in panels)
  x_label <- paste0("Observed ", as.character(response_name))
  y_label <- paste0("Predicted ", as.character(response_name))

  ## Create x and y minimum and maximum for the plotting range; use either
  ## observed or predicted data, depending on where the minimum and maximum
  ## values occur
  xy_min <- if (min(predobs$obs) < min(predobs$pred)) {
    predobs$obs
  } else {
    predobs$pred
  }
  xy_max <- if (max(predobs$obs) > max(predobs$pred)) {
    predobs$obs
  } else {
    predobs$pred
  }
  xy_range <- ifelse(diff(range(xy_min)) > diff(range(xy_max)),
    diff(range(xy_min)), diff(range(xy_max)))

  if (model$method == "pls") {
    p_model <- ggplot2::ggplot(data = predobs) +
      ggplot2::geom_point(ggplot2::aes(x = obs, y = pred),
        shape = 1, size = 2, alpha = 1/2, data = predobs) +
      ggplot2::geom_text(data = annotation,
        ggplot2::aes(x = Inf, y = -Inf, label = ncomp),
        size = 5, hjust = 1.15, vjust = -4.5, parse = TRUE) +
      # additional label
      ggplot2::geom_text(data = annotation,
        ggplot2::aes(x = Inf, y = -Inf, label = r2),
        size = 5, hjust = 1.15, vjust = -3, parse = TRUE) +
      ggplot2::geom_text(data = annotation,
        ggplot2::aes(x = Inf, y = -Inf, label = rmse),
        size = 5, hjust = 1.12, vjust = -2.5, parse = TRUE) +
      ggplot2::geom_text(data = annotation,
        ggplot2::aes(x = Inf, y = -Inf, label = rpd),
        size = 5, hjust = 1.15, vjust = -1.25, parse = TRUE) +
      ggplot2::facet_grid(~ dataType,
        labeller = ggplot2::as_labeller(to_string)) +
      ggplot2::geom_abline(col = "red") +
      ggplot2::labs(x = x_label, y = y_label) +
      ggplot2::xlim(c(min(xy_min) - 0.05 * xy_range,
        max(xy_max) + 0.05 * xy_range)) +
      ggplot2::ylim(c(min(xy_min) - 0.05 * xy_range,
        max(xy_max) + 0.05 * xy_range)) +
      ggplot2::coord_fixed() +
      ggplot2::theme_bw() +
      ggplot2::theme(strip.background = ggplot2::element_rect(fill = "white"))
    if (evaluation_method == "resampling") {
      p_model <- p_model +
        ggplot2::geom_errorbar(
          ggplot2::aes(x = obs, ymin = pred - pred_sem_ci,
            ymax = pred + pred_sem_ci),
          width = 0, data = predobs, inherit.aes = FALSE)
    }
  } else {
    p_model <- ggplot2::ggplot(data = predobs) +
      ggplot2::geom_point(ggplot2::aes(x = obs, y = pred),
        shape = 1, size = 2, alpha = 1/2) +
      # without ncomp label
      ggplot2::geom_text(data = annotation,
        ggplot2::aes(x = Inf, y = -Inf, label = r2),
        size = 5, hjust = 1.15, vjust = -3, parse = TRUE) +
      ggplot2::geom_text(data = annotation,
        ggplot2::aes(x = Inf, y = -Inf, label = rmse),
        size = 5, hjust = 1.12, vjust = -2.5, parse = TRUE) +
      ggplot2::geom_text(data = annotation,
        ggplot2::aes(x = Inf, y = -Inf, label = rpd),
        size = 5, hjust = 1.15, vjust = -1.25, parse = TRUE) +
      ggplot2::facet_grid(~ dataType,
        labeller = ggplot2::as_labeller(to_string)) +
      ggplot2::geom_abline(col = "red") +
      ggplot2::labs(x = x_label, y = y_label) +
      ggplot2::xlim(c(min(xy_min) - 0.05 * xy_range,
        max(xy_max) + 0.05 * xy_range)) +
      ggplot2::ylim(c(min(xy_min) - 0.05 * xy_range,
        max(xy_max) + 0.05 * xy_range)) +
      ggplot2::coord_fixed() +
      ggplot2::theme_bw()
  }

  if (print == TRUE) {
    print(p_model)
  }

  list(stats = stats, p_model = p_model, predobs = predobs)
}
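## Illustrative sketch (not run): the "resampling" evaluation branch above
## boils down to pulling the cross-validation hold-out predictions stored by
## caret (savePredictions = TRUE), keeping only the rows that belong to the
## finally selected tuning parameters, and summarising them per observation.
## The object `pls_tuned` is hypothetical (see the tuning sketch further up);
## with plain (non-repeated) k-fold CV, pred_sd is NA because each row is
## held out only once.
if (FALSE) {
  # Hold-out predictions for the selected number of components only
  holdout <- dplyr::semi_join(pls_tuned$pred, pls_tuned$bestTune,
    by = "ncomp")
  # Average hold-out predictions per held-out row (rowIndex)
  holdout_mean <- dplyr::summarise(
    dplyr::group_by(holdout, rowIndex),
    obs = mean(obs),
    pred_sd = sd(pred),
    pred = mean(pred)
  )
}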
## PLS regression modeling in one function =====================================

#' @name fit_pls
#' @title Calibration sampling, model tuning, and PLS regression
#' @description Perform calibration sampling and use the selected
#' calibration set for model tuning
#' @param spec_chem Tibble that contains spectra, metadata and chemical
#' reference values as list-columns. The tibble supplied to \code{spec_chem}
#' can be generated by the \code{join_chem_spc()} function.
#' @param response Response variable as symbol or name
#' (without quotes, no character string). The provided response symbol needs
#' to be a column name in the \code{spec_chem} tibble.
#' @param variable Deprecated and replaced by \code{response}
#' @param center Logical whether to perform mean centering of each spectrum
#' column (e.g. wavenumber or wavelength) after common spectrum preprocessing.
#' Default is \code{center = TRUE}
#' @param scale Logical whether to perform standard deviation scaling
#' of each spectrum column (e.g. wavenumber or wavelength) after common
#' spectrum preprocessing. Default is \code{scale = TRUE}
#' @param evaluation_method Character string stating the evaluation method.
#' Either \code{"test_set"} (default) or \code{"resampling"}. \code{"test_set"}
#' will split the data into a calibration (training) and validation (test)
#' set, and evaluate the final model by predicting on the validation set.
#' If \code{"resampling"}, the finally selected model will be evaluated based
#' on the cross-validation hold-out predictions.
#' @param validation Deprecated and replaced by \code{evaluation_method}.
#' Default is \code{TRUE}.
#' @param split_method Method for splitting the data into an independent test
#' set. Default is \code{"ken_stone"}, which will select samples for
#' calibration based on the Kennard-Stone sampling algorithm applied to the
#' preprocessed spectra. The proportion of validation samples in the total
#' number of samples can be specified in the argument \code{ratio_val}.
#' \code{split_method = "random"} will create a single random split.
#' @param ratio_val Ratio of the number of validation (test) samples to the
#' total number of samples (calibration (training) and validation (test)).
#' @param ken_sto_pc Number of principal components used for calculating the
#' Mahalanobis distance on the PCA scores within the Kennard-Stone algorithm.
#' Default is \code{ken_sto_pc = 2}, which will use the first two PCA
#' components.
#' @param pc Deprecated; renamed argument is \code{ken_sto_pc}.
#' @param invert Logical. If \code{TRUE} (default), the Kennard-Stone
#' algorithm selects the validation samples; if \code{FALSE}, it selects the
#' calibration samples.
#' @param tuning_method Character specifying the tuning method. The tuning
#' method affects how caret selects a final tuning value set from a list of
#' candidate values. Possible values are \code{"resampling"}, which will use a
#' specified resampling method such as repeated k-fold cross-validation (see
#' argument \code{resampling_method}) and the generated performance profile
#' based on the hold-out predictions to decide on the final tuning values
#' that lead to optimal model performance. The value \code{"none"} will force
#' caret to compute a final model for a predefined candidate PLS tuning
#' parameter, the number of PLS components. In this case, the value
#' supplied by \code{ncomp_fixed} is used to set model complexity at
#' a fixed number of components.
#' @param resampling_method Character specifying the resampling method.
#' Currently, \code{"kfold_cv"} (default, performs 10-fold cross-validation),
#' \code{"rep_kfold_cv"} (performs 5-times repeated 10-fold cross-validation),
#' \code{"loocv"} (performs leave-one-out cross-validation), and \code{"none"}
#' (no resampling) are supported.
#' @param resampling_seed Random seed (integer) that will be used for
#' generating resampling indices, which will be supplied to
#' \code{caret::trainControl}. This makes sure that modeling results are
#' constant when re-fitting. Default is \code{resampling_seed = 123}.
#' @param cv Deprecated. Use \code{resampling_method} instead.
#' @param pls_ncomp_max Maximum number of PLS components that are evaluated
#' by \code{caret::train}. caret will aggregate a performance profile using
#' resampling for an integer sequence from 1 to \code{pls_ncomp_max}.
#' @param ncomp_fixed Integer with the fixed number of PLS components. Will
#' only be used when \code{tuning_method = "none"} and
#' \code{resampling_method = "none"}.
#' @param print Logical whether model evaluation graphs shall be printed.
#' @param env Environment where the function is evaluated. Default is
#' \code{parent.frame()}.
#' @export
# Note: check non-standard evaluation, argument passing...
fit_pls <- function(spec_chem,
                    response,
                    variable = NULL, # variable deprecated, will not work
                    center = TRUE,
                    scale = TRUE, # center and scale all predictors (wavenumbers)
                    evaluation_method = "test_set",
                    validation = TRUE, # validation deprecated
                    split_method = "ken_stone",
                    ratio_val = 1/3, # only used if evaluation_method = "test_set"
                    ken_sto_pc = 2,
                    pc, # only if split_method = "ken_stone"; number of components
                        # used for calculating the Mahalanobis distance on the
                        # PCA scores; pc is deprecated
                    invert = TRUE, # only if split_method = "ken_stone"
                    tuning_method = "resampling",
                    resampling_method = "kfold_cv",
                    cv = NULL, # cv deprecated
                    resampling_seed = 123, # seed for creating resampling indices
                    pls_ncomp_max = 20, # maximum number of PLS components in tuning
                    ncomp_fixed = 5, # only fit and evaluate one model, if tuning_method = "none"
                    print = TRUE, # print model summary and evaluation graphs
                    env = parent.frame()) {

  calibration <- NULL

  # Warning messages and reassignment for deprecated arguments -----------------

  # Deprecate argument variable; use the more specific term response for the
  # variable to be predicted by spectral modeling
  if (!is.null(variable)) {
    stop("argument variable has been replaced by response for simplerspec_0.1.0")
  }
  # 20170602: revise argument name and values of validation;
  # replace validation = TRUE or FALSE with
  # new argument evaluation_method = "test_set" or "resampling"
  if (!missing(validation)) {
    warning("argument validation is deprecated; please use evaluation_method instead.",
      call. = FALSE)
    evaluation_method <- validation
  }
  # Deprecate argument pc; use the more consistent and verbose argument
  # ken_sto_pc
  if (!missing(pc)) {
    warning("argument pc is deprecated; please use ken_sto_pc instead.",
      call. = FALSE)
    ken_sto_pc <- pc
  }
  # Deprecate argument cv; use the more consistent and verbose argument
  # resampling_method
  if (!missing(cv)) {
    warning("argument cv is deprecated; please use resampling_method instead.",
      call. = FALSE)
    resampling_method <- cv
  }
  # Change values for the resampling_method argument
  if (resampling_method == "LOOCV") {
    warning("value 'LOOCV' (leave-one-out cross-validation) for argument resampling_method is deprecated; please use value 'loocv' instead.")
    resampling_method <- "loocv"
  }
  if (resampling_method == "repeatedcv") {
    warning("value 'repeatedcv' (repeated k-fold cross-validation) for argument resampling_method is deprecated; please use value 'rep_kfold_cv' instead.")
    resampling_method <- "rep_kfold_cv"
  }

  # Perform calibration sampling -----------------------------------------------
  list_sampled <- split_data_q(spec_chem, split_method,
    ratio_val = ratio_val,
    ken_sto_pc = substitute(ken_sto_pc),
    evaluation_method = substitute(evaluation_method),
    invert = substitute(invert))

  # Check on method for cross-validation to be used in caret model tuning ------
  if (resampling_method == "loocv") {
    # leave-one-out cross-validation
    tr_control <- control_train_loocv_q(x = list_sampled,
      response = substitute(response), env = env)
  } else if (resampling_method == "rep_kfold_cv") {
    # repeated k-fold cross-validation
    tr_control <- control_train_rcv_q(x = list_sampled,
      response = substitute(response),
      resampling_seed = substitute(resampling_seed), env = env)
  } else if (resampling_method == "none") {
    # no resampling; calls caret::train(..., method = "none");
    # fixed number of PLS components; the tuning_method argument also has
    # to be set to "none"
    tr_control <- control_train_none_q(x = list_sampled,
      response = substitute(response),
      resampling_seed = substitute(resampling_seed), env = env)
  } else if (resampling_method == "kfold_cv") {
    # k-fold cross-validation
    tr_control <- control_train_q(x = list_sampled,
      response = substitute(response),
      resampling_seed = substitute(resampling_seed), env = env)
  }

  # Fit a pls calibration model; pls object is output from caret::train() ------
  if (tuning_method == "resampling") {
    pls <- train_pls_q(x = list_sampled,
      evaluation_method = "test_set",
      response = substitute(response),
      tr_control = tr_control,
      center = center, scale = scale,
      pls_ncomp_max = substitute(pls_ncomp_max),
      env = env)
  } else if (tuning_method == "none") {
    pls <- train_pls_q(x = list_sampled,
      evaluation_method = "test_set",
      response = substitute(response),
      tr_control = tr_control,
      center = center, scale = scale,
      tuning_method = "none",
      ncomp_fixed = substitute(ncomp_fixed),
      env = env)
  }

  # Evaluate model accuracy (predicted vs. observed) ---------------------------
  stats <- evaluate_model_q(x = list_sampled, model = pls,
    response = substitute(response),
    evaluation_method = substitute(evaluation_method),
    tuning_method = substitute(tuning_method),
    resampling_method = substitute(resampling_method),
    env = parent.frame())

  list(data = list_sampled,
    p_pc = list_sampled$p_pc,
    model = pls,
    stats = stats$stats,
    p_model = stats$p_model,
    predobs = stats$predobs)
}

## Old function name of `fit_pls`: `pls_ken_stone`
#' @rdname fit_pls
#' @export
pls_ken_stone <- fit_pls
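## Illustrative usage sketch (not run): fitting a PLS calibration for a
## response column in a spectra tibble. The object `spec_chem` and the
## response column `C` (e.g. total carbon) are hypothetical; `spec_chem` is
## assumed to contain `sample_id`, a `spc_pre` list-column of preprocessed
## spectra, and the response, e.g. as returned by join_chem_spc().
if (FALSE) {
  pls_C <- fit_pls(
    spec_chem = spec_chem,
    response = C,
    evaluation_method = "test_set", # Kennard-Stone test set split
    ratio_val = 1/3,                # 1/3 of samples held out
    pls_ncomp_max = 20
  )
  pls_C$stats   # performance summary per data type
  pls_C$p_model # predicted vs. observed ggplot
  pls_C$p_pc    # PCA projection of calibration and validation samples
}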
## Random forest modeling in one function ======================================

#' @title Calibration sampling, and random forest model tuning and evaluation
#' @description Perform calibration sampling and use the selected
#' calibration set for model tuning
#' @param spec_chem Tibble that contains spectra, metadata and chemical
#' reference values as list-columns. The tibble supplied to \code{spec_chem}
#' can be generated by the \code{join_chem_spc()} function.
#' @param response Response variable as symbol or name
#' (without quotes, no character string). The provided response symbol needs
#' to be a column name in the \code{spec_chem} tibble.
#' @param variable Deprecated and replaced by \code{response}
#' @param evaluation_method Character string stating the evaluation method.
#' Either \code{"test_set"} (default) or \code{"resampling"}. \code{"test_set"}
#' will split the data into a calibration (training) and validation (test)
#' set, and evaluate the final model by predicting on the validation set.
#' If \code{"resampling"}, the finally selected model will be evaluated based
#' on the cross-validation hold-out predictions.
#' @param validation Deprecated and replaced by \code{evaluation_method}.
#' Default is \code{TRUE}.
#' @param split_method Method for splitting the data into an independent test
#' set. Default is \code{"ken_stone"}, which will select samples for
#' calibration based on the Kennard-Stone sampling algorithm applied to the
#' preprocessed spectra. The proportion of validation samples in the total
#' number of samples can be specified in the argument \code{ratio_val}.
#' \code{split_method = "random"} will create a single random split.
#' @param ratio_val Ratio of the number of validation (test) samples to the
#' total number of samples (calibration (training) and validation (test)).
#' @param ken_sto_pc Number of principal components used for calculating the
#' Mahalanobis distance on the PCA scores within the Kennard-Stone algorithm.
#' Default is \code{ken_sto_pc = 2}, which will use the first two PCA
#' components.
#' @param pc Deprecated; renamed argument is \code{ken_sto_pc}.
#' @param invert Logical. If \code{TRUE} (default), the Kennard-Stone
#' algorithm selects the validation samples; if \code{FALSE}, it selects the
#' calibration samples.
#' @param tuning_method Character specifying the tuning method. The tuning
#' method affects how caret selects a final tuning value set from a list of
#' candidate values. Possible values are \code{"resampling"}, which will use a
#' specified resampling method and the generated performance profile based on
#' the hold-out predictions to decide on the final tuning values that lead to
#' optimal model performance, and \code{"none"}, which will force caret to
#' compute a final model for a predefined candidate tuning parameter set.
#' @param resampling_seed Random seed (integer) that will be used for
#' generating resampling indices, which will be supplied to
#' \code{caret::trainControl}. This makes sure that modeling results are
#' constant when re-fitting. Default is \code{resampling_seed = 123}.
#' @param cv Deprecated. Use \code{resampling_method} instead.
#' @param ntree_max Number of trees grown in the random forest model (passed
#' as \code{ntree} to \code{caret::train}). Default is
#' \code{ntree_max = 500}.
#' @param print Logical whether model evaluation graphs shall be printed.
#' @param env Environment where the function is evaluated. Default is
#' \code{parent.frame()}.
#' @export
# Note: check non-standard evaluation, argument passing...
fit_rf <- function(spec_chem,
                   response,
                   variable = NULL, # variable deprecated, will not work
                   evaluation_method = "test_set",
                   validation = NULL, # validation deprecated
                   split_method = "ken_stone",
                   ratio_val,
                   ken_sto_pc = 2,
                   pc = NULL,
                   invert = TRUE, # only if split_method = "ken_stone"
                   tuning_method = "resampling",
                   resampling_seed = 123,
                   cv = NULL, # cv deprecated
                   ntree_max = 500,
                   print = TRUE,
                   env = parent.frame()) {

  calibration <- NULL

  # Warning messages and reassignment for deprecated arguments -----------------

  # Replace validation = TRUE or FALSE with
  # new argument evaluation_method = "test_set" or "resampling"
  if (!missing(validation)) {
    warning("argument validation is deprecated; please use evaluation_method instead.",
      call. = FALSE)
    evaluation_method <- validation
  }
  if (!missing(variable)) {
    warning("argument variable is deprecated; please use response instead.",
      call. = FALSE)
    response <- variable
  }
  # Deprecate argument pc; use the more consistent and verbose argument
  # ken_sto_pc
  if (!missing(pc)) {
    warning("argument pc is deprecated; please use ken_sto_pc instead.",
      call. = FALSE)
    ken_sto_pc <- pc
  }

  # Calibration sampling --------------------------------------------------------
  list_sampled <- split_data_q(spec_chem, split_method,
    ratio_val = ratio_val,
    ken_sto_pc = substitute(ken_sto_pc),
    evaluation_method = substitute(evaluation_method),
    invert = substitute(invert))

  # Control parameters for caret::train ----------------------------------------
  tr_control <- control_train_q(x = list_sampled,
    response = substitute(response),
    resampling_seed = substitute(resampling_seed), env = env)

  # Train random forest model (model tuning) -----------------------------------
  rf <- train_rf_q(x = list_sampled,
    evaluation_method = "test_set",
    response = substitute(response),
    tr_control = tr_control,
    env = env,
    ntree_max = substitute(ntree_max))

  # Evaluate the finally chosen random forest model ----------------------------
  stats <- evaluate_model_q(x = list_sampled, model = rf,
    response = substitute(response),
    evaluation_method = substitute(evaluation_method),
    tuning_method = substitute(tuning_method),
    # fit_rf() always uses 10-fold cross-validation via control_train_q()
    resampling_method = "kfold_cv",
    env = parent.frame())

  # Return list with results ----------------------------------------------------
  list(data = list_sampled,
    p_pc = list_sampled$p_pc,
    rf_model = rf,
    stats = stats$stats,
    p_model = stats$p_model)
}
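## Illustrative usage sketch (not run): random forest analogue of the PLS
## example above. `spec_chem` and the response column `C` are hypothetical,
## with the same assumed structure (sample_id, spc_pre list-column, response).
if (FALSE) {
  rf_C <- fit_rf(
    spec_chem = spec_chem,
    response = C,
    evaluation_method = "test_set",
    ratio_val = 1/3,
    ntree_max = 500
  )
  rf_C$stats   # performance summary
  rf_C$p_model # predicted vs. observed ggplot
}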