You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
466 lines
17 KiB
466 lines
17 KiB
# Perform calibration sampling based on spectral PCA ------------
|
|
#' @title Split
|
|
#' @description Perform calibration sampling based on
|
|
#' the Kennard-Stones algorithm.
|
|
#' @param spec_chem data.frame that contains chemical
|
|
#' and IR spectroscopy data
|
|
#' @param ratio_val Ratio of number of validation and all samples.
|
|
#' @param pc Number of principal components (numeric)
|
|
#' @param print logical expression weather calibration
|
|
#' @param validation Logical expression weather
|
|
#' calibration sampling is performed
|
|
#' (\code{TRUE} or \code{FALSE}).
|
|
#' @usage ken_stone(spec_chem, ratio_val, pc, print = TRUE,
|
|
#' validation = TRUE)
|
|
#' @export
|
|
ken_stone <- function(spec_chem, ratio_val, pc,
|
|
print = TRUE, validation = TRUE) {
|
|
MIR <- model <- type <- PC1 <- PC2 <- NULL
|
|
# Now with a real dataset
|
|
# k = number of samples to select
|
|
# pc = if provided, the number of principal components
|
|
# (see ?kenStone)
|
|
if(validation == TRUE) {
|
|
# pc = 0.99 before !!!
|
|
sel <- prospectr::kenStone(X = spec_chem$MIR,
|
|
k = round(ratio_val * nrow(spec_chem)), pc = 2)
|
|
sel$model # The row index of calibration samples
|
|
# plot(sel$pc[, 1:2], xlab = 'PC1', ylab = 'PC2')
|
|
# Points selected for calibration
|
|
# points(sel$pc[sel$model, 1:2], pch = 19, col = 2)
|
|
|
|
# Plot samples selected for calibration in ggplot
|
|
sel_df_cal <- data.frame(sel$pc[- sel$model,1:2])
|
|
sel_df_cal$type <- as.factor(
|
|
rep("calibration", nrow(sel_df_cal))
|
|
)
|
|
sel_df_val <- data.frame(sel$pc[sel$model, 1:2])
|
|
sel_df_val$type <- as.factor(
|
|
rep("validation", nrow(sel_df_val)))
|
|
sel_df <- rbind(sel_df_cal, sel_df_val)
|
|
# Compute ratio needed to make the figure square
|
|
ratio <- with(sel_df, diff(range(PC1))/diff(range(PC2)))
|
|
p_pc <- ggplot2::ggplot(data = sel_df) +
|
|
ggplot2::geom_point(
|
|
ggplot2::aes(x = PC1, y = PC2, shape = type), size = 4) +
|
|
ggplot2::coord_fixed(ratio = 1) +
|
|
ggplot2::scale_shape_manual(values=c(1, 19)) +
|
|
ggplot2::scale_colour_manual(values=c("black", "red")) +
|
|
ggplot2::theme_bw()
|
|
# ggplot2::theme.user +
|
|
ggplot2::theme(legend.title = ggplot2::element_blank())
|
|
# print(p_pc)
|
|
|
|
# Split MIR data into calibration and validation set using
|
|
# the results of Kennard-Stone Calibration Sampling
|
|
# Selct by row index of calibration samples
|
|
val_set <- spec_chem[sel$model, ]
|
|
# Check number of observations (rows) for validation set
|
|
nrow(val_set)
|
|
cal_set <- spec_chem[- sel$model, ]
|
|
list_out <- list(
|
|
calibration = cal_set,
|
|
validation = val_set,
|
|
p_pc = p_pc
|
|
)
|
|
list_out
|
|
# Check number of observations (rows) for calibration set
|
|
# nrow(cal_set)
|
|
} else {
|
|
cal_set <- spec_chem
|
|
list(calibration = cal_set)
|
|
}
|
|
}
|
|
#' @title Perform model tuning
|
|
#' @description Uses function from caret to to model tuning
|
|
#' for PLS regression.
|
|
#' @param x list from calibration sampling
|
|
#' @param variable response variable for PLS regression, supplied
|
|
#' as character expression
|
|
#' @param validation Logical expression weather an independent
|
|
#' validation is performed.
|
|
#' @param env Environment where function is evaluated
|
|
#' @export
|
|
tune_model_q <- function(x, variable,
|
|
env = parent.frame(), validation = TRUE) {
|
|
calibration <- NULL
|
|
# List of calibration and validation samples
|
|
# set up a cross-validation scheme
|
|
# create 10 folds that we will keep for the different
|
|
# modeling approaches to allow comparison
|
|
# randomly break the data into 10 partitions
|
|
# note that k is the total number of samples for leave-one-out
|
|
# use substitute function to make non-standard evaluation
|
|
# of variable argument (looks at a function as argument,
|
|
# sees code used to compute value;
|
|
# see chapter 13.1 Capturing expressions
|
|
# in Advanced R (Hadley Wickham)
|
|
# !! p. 270
|
|
r <- eval(variable, x$calibration, env)
|
|
idx <- caret::createFolds(y = r, k = 10, returnTrain = T) # update ***
|
|
idx
|
|
# inject the index in the trainControl object
|
|
tr_control <- caret::trainControl(method = "cv", index = idx,
|
|
savePredictions = T)
|
|
if (validation == TRUE) {
|
|
tr_control
|
|
} else {
|
|
tr_control
|
|
}
|
|
}
|
|
|
|
#' @title Perform model tuning
|
|
#' @description Uses function from caret to to model tuning
|
|
#' for PLS regression.
|
|
#' @param x list from calibration sampling
|
|
#' @param variable response variable for PLS regression, supplied
|
|
#' as character expression
|
|
#' @param validation Logical expression weather an independent
|
|
#' validation is performed.
|
|
#' @param env Environment where function is evaluated
|
|
#' @export
|
|
tune_model <- function(x, variable,
|
|
env = parent.frame(), validation = TRUE) {
|
|
tune_model_q(x, substitute(variable), env)
|
|
}
|
|
|
|
# Fit a PLS regression model using the caret package ------------
|
|
|
|
#' @title Fit a PLS regression model
|
|
#' (quoted version of the function)
|
|
#' @description Uses the caret package to perform PLS modeling.
|
|
#' Spectra are centered and scaled prior to modeling.
|
|
#' @param x List that contains calibration
|
|
#' set, validation set, and model tuning options
|
|
#' @param validation Logical expression weather independent
|
|
#' validation is performed
|
|
#' @param variable Response variable to be modeled
|
|
#' @param tr_control Object that defines controlling parameters
|
|
#' of the desired internal validation framework
|
|
#' @param env Environment where function is evaluated
|
|
#' @export
|
|
fit_pls_q <- function(x, validation = TRUE,
|
|
variable, tr_control, env = parent.frame()) {
|
|
# Fit a partial least square regression (pls) model
|
|
# center and scale MIR (you can try without)
|
|
calibration <- MIR <- NULL
|
|
v <- eval(variable, x$calibration, env)
|
|
if (validation == TRUE) {
|
|
pls_model <- caret::train(x = x$calibration$MIR, y = v,
|
|
method = "pls",
|
|
tuneLength = 20,
|
|
trControl = tr_control,
|
|
preProcess = c("center", "scale")
|
|
)
|
|
} else {
|
|
pls_model <- caret::train(x = x$calibration$MIR, y = v,
|
|
method = "pls",
|
|
tuneLength = 20,
|
|
trControl = tr_control,
|
|
preProcess = c("center", "scale")
|
|
)
|
|
}
|
|
# Collect fitted object into a list
|
|
# fitList_cal <- list(pls = fit_pls)
|
|
# fitList_cal
|
|
pls_model
|
|
}
|
|
|
|
#' @title Fit a PLS regression model
|
|
#' @description Uses the caret package to perform PLS modeling.
|
|
#' Spectra are centered and scaled prior to modeling.
|
|
#' @param x List that contains calibration
|
|
#' set, validation set, and model tuning options
|
|
#' @param validation Logical expression weather independent
|
|
#' validation is performed
|
|
#' @param variable Response variable to be modeled
|
|
#' @param env Environment where function is evaluated
|
|
#' @export
|
|
fit_pls <- function(x, validation = TRUE,
|
|
variable, env = parent.frame()) {
|
|
fit_pls_q(x = x, validation = TRUE,
|
|
variable = substitute(variable), env
|
|
)
|
|
}
|
|
|
|
# Evaluate PLS performance (validation and cross-validation) ----
|
|
|
|
#' @title Evaluate PLS performance
|
|
#' @description Calculate model performance indices based
|
|
#' on observed and predicted values of validation and calibration
|
|
#' set, and internal cross-validation
|
|
#' @param x List that contains calibration and validation data
|
|
#' frame with combined spectral and chemical data
|
|
#' @param pls_model List with PLS regression model output from
|
|
#' the caret package
|
|
#' @param variable Response variable (e.g. chemical property) to be
|
|
#' modelled (needs to be non-quoted expression). \code{variable}
|
|
#' needs to be a column name in the \code{validation} data.frame
|
|
#' (element of \code{x})
|
|
#' @param validation Logical expression if independent validation
|
|
#' is performed (split data set into calibration set and
|
|
#' validation set)
|
|
#' @param print Print observed vs. predicted for calibration
|
|
#' and validation. Default is \code{TRUE}.
|
|
#' @param env Specifiy the environment in which the function is
|
|
#' called. Default argument of \code{env} is
|
|
#' \code{parent.frame()}
|
|
#' @export
|
|
evaluate_pls_q <- function(x, pls_model, variable,
|
|
validation = TRUE, print = TRUE, env = parent.frame()) {
|
|
# Set global variables to NULL to avoid R CMD check notes
|
|
MIR <- object <- model <- dataType <- obs <- pred <- NULL
|
|
ncomp <- finalModel <- rmsd <- r2 <- r2 <- rpd <- n <- NULL
|
|
rmse <- calibration <- NULL
|
|
# Collect fitted object into a list
|
|
list_models <- list(pls = pls_model)
|
|
# Extract best tuning parameters and associated cv predictions
|
|
if(validation == TRUE) {
|
|
predobs_cal <- plyr::ldply(list_models,
|
|
function(x) plyr::match_df(x$pred, x$bestTune),
|
|
.id = "model"
|
|
)
|
|
# Calculate training (calibration) and test (validation) data
|
|
# predictions based on pls model with calibration data
|
|
v <- eval(variable, x$validation, env)
|
|
predobs_val <- caret::extractPrediction(list_models,
|
|
testX = x$validation$MIR, testY = v) # update ***
|
|
# Create new data frame column <object>
|
|
predobs_val$object <- predobs_val$model
|
|
|
|
# Replace levels "Training" and "Test" in dataType column
|
|
# by "Calibration" and "Validation" (rename levels of factor)
|
|
predobs_val$dataType <- plyr::revalue(predobs_val$dataType,
|
|
c("Test" = "Validation", "Training" = "Calibration")
|
|
)
|
|
# Change the order of rows in the data frame
|
|
# Calibration as first level (show Calibration in ggplot graph
|
|
# on left panel)
|
|
predobs_val$dataType <- factor(predobs_val$dataType,
|
|
levels = c("Calibration", "Validation"))
|
|
# Calculate model performance indexes by model and dataType
|
|
# uses package plyr and function summary.df of SPECmisc.R
|
|
stats <- plyr::ddply(predobs_val, c("model", "dataType"),
|
|
function(x) summary_df(x, "obs", "pred")
|
|
)
|
|
|
|
} else {
|
|
# Extract best tuning parameters and associated cv predictions
|
|
predobs_cv <- plyr::ldply(list_models,
|
|
function(x) plyr::match_df(x$pred, x$bestTune),
|
|
.id = "model"
|
|
)
|
|
# Extract auto-prediction
|
|
predobs <- caret::extractPrediction(list_models)
|
|
predobs_cv$object <- predobs_cv$model
|
|
predobs_cv$dataType <- "Cross-validation"
|
|
predobs_cv <- dplyr::select(
|
|
predobs_cv, obs, pred, model, dataType, object
|
|
)
|
|
predobs_val <- rbind(predobs, predobs_cv)
|
|
stats <- plyr::ddply(predobs_val, c("model", "dataType"),
|
|
function(x) summary_df(x, "obs", "pred")
|
|
)
|
|
}
|
|
|
|
# Add number of components to stats; from finalModel list item
|
|
# from train() function output (function from caret package)
|
|
stats$ncomp <- rep(pls_model$finalModel$ncomp, nrow(stats))
|
|
# Add range of observed values for validation and calibraton
|
|
# get range from predicted vs. observed data frame
|
|
# stored in object predobs
|
|
obs_cal <- subset(predobs_val, dataType == "Calibration")$obs
|
|
obs_val <- subset(predobs_val, dataType == "Validation")$obs
|
|
# Get name of predicted variable; see p. 261 of book
|
|
# "Advanced R" (Hadley Wickham)
|
|
variable_name <- deparse(variable)
|
|
# before: deparse(substitute(variable))
|
|
df_range <- data.frame(
|
|
variable = rep(variable_name, 2),
|
|
dataType = c("Calibration", "Validation"),
|
|
min_obs = c(range(obs_cal)[1], range(obs_val)[1]),
|
|
median_obs = c(median(obs_cal), median(obs_val)),
|
|
max_obs = c(range(obs_cal)[2], range(obs_val)[2]),
|
|
mean_obs = c(mean(obs_cal), mean(obs_val)),
|
|
CV = c(sd(obs_cal) / mean(obs_cal) * 100,
|
|
sd(obs_val) / mean(obs_val) * 100)
|
|
)
|
|
|
|
# Join stats with range data frame (df_range)
|
|
stats <- plyr::join(stats, df_range, type = "inner")
|
|
annotation <- plyr::mutate(stats,
|
|
rmse = as.character(as.expression(paste0("RMSE == ",
|
|
round(rmsd, 2)))),
|
|
r2 = as.character(as.expression(paste0("italic(R)^2 == ",
|
|
round(r2, 2)))),
|
|
rpd = as.character(as.expression(paste("RPD == ",
|
|
round(rpd, 2)))),
|
|
n = as.character(as.expression(paste0("italic(n) == ", n))),
|
|
ncomp = as.character(as.expression(paste0("ncomp = ",
|
|
ncomp)))
|
|
)
|
|
|
|
# Plot predicted vs. observed values and model indexes
|
|
# update label, xlim, and ylim ***
|
|
# Add label number of samples to facet_grid using a
|
|
# labeling function
|
|
# ! Update labeller API:
|
|
# https://github.com/hadley/ggplot2/commit/ef33dc7
|
|
# http://sahirbhatnagar.com/facet_wrap_labels
|
|
|
|
# Prepare lookup character vector
|
|
make_label <- function(x, validation = TRUE) {
|
|
dataType <- n <- NULL
|
|
if (validation == TRUE) {
|
|
c(`Calibration` = paste0("Calibration", "~(",
|
|
x[x$dataType == "Calibration", ]$n, ")"
|
|
),
|
|
`Validation` = paste0("Validation", "~(",
|
|
x[x$dataType == "Validation", ]$n, ")"
|
|
)
|
|
)
|
|
} else{
|
|
c(`Calibration` = paste0("Calibration", "~(",
|
|
x[x$dataType == "Calibration", ]$n, ")"
|
|
),
|
|
`Cross-Validation` = paste0("Cross-Validation", "~(",
|
|
x[x$dataType == "Cross-Validation", ]$n, ")"
|
|
)
|
|
)
|
|
}
|
|
}
|
|
if (validation == TRUE) {
|
|
label_validation <- make_label(x = annotation,
|
|
validation = TRUE
|
|
)
|
|
} else {
|
|
label_validation <- make_label(x = annotation,
|
|
validation = FALSE
|
|
)
|
|
}
|
|
|
|
# Rename labels on the fly with a lookup character vector
|
|
to_string <- ggplot2::as_labeller(
|
|
x = label_validation, ggplot2::label_parsed
|
|
)
|
|
|
|
# -------------------------------------------------------------
|
|
|
|
|
|
# http://docs.ggplot2.org/0.9.3.1/label_parsed.html
|
|
# some other info: https://coderclub.b.uib.no/tag/plotmath/
|
|
# !!! now depreciated in ggplot2 >= 2.0.0
|
|
# dataType_labeller <- function(variable, value){
|
|
# new <- paste0(dataType_names[value], "~(", annotation$n, ")")
|
|
# plyr::llply(as.character(new), function(x) parse(text = x))
|
|
# }
|
|
p_pred_obs <- ggplot2::ggplot(data = predobs_val) +
|
|
ggplot2::geom_point(ggplot2::aes(x = obs, y = pred),
|
|
shape = 1, size = 4) +
|
|
ggplot2::geom_text(data = annotation,
|
|
ggplot2::aes(x = -Inf, y = Inf, label = r2), size = 7,
|
|
hjust = -0.1, vjust = 1.5, parse = TRUE) +
|
|
ggplot2::geom_text(data = annotation,
|
|
ggplot2::aes(x = -Inf, y = Inf, label = rmse), size = 7,
|
|
hjust = -0.075, vjust = 4.25, parse = TRUE) +
|
|
ggplot2::geom_text(data = annotation,
|
|
ggplot2::aes(x = -Inf, y = Inf, label = rpd), size = 7,
|
|
hjust = -0.1, vjust = 6.5, parse = TRUE) +
|
|
ggplot2::facet_grid(~ dataType,
|
|
labeller =ggplot2::as_labeller(to_string)) +
|
|
# ggplot2::facet_grid(~ dataType,
|
|
# labeller = dataType_labeller) +
|
|
ggplot2::theme_bw() +
|
|
ggplot2::geom_abline(col = "red") +
|
|
ggplot2::labs(x = "Observed", y = "Predicted") +
|
|
ggplot2::xlim(c(min(predobs_val$obs) -
|
|
0.05 * diff(range(predobs_val$obs)),
|
|
max(predobs_val$obs) +
|
|
0.05 * diff(range(predobs_val$obs)))) +
|
|
ggplot2::ylim(c(min(predobs_val$obs) -
|
|
0.05 * diff(range(predobs_val$obs)),
|
|
max(predobs_val$obs) +
|
|
0.05 * diff(range(predobs_val$obs)))) # +
|
|
# theme.user
|
|
|
|
## ggplot graph for model comparison
|
|
## (arranged later in panels)
|
|
x_label <- paste0("Observed ",
|
|
as.character(variable_name))
|
|
y_label <- paste0("Predicted ",
|
|
as.character(variable_name))
|
|
p_model <- ggplot2::ggplot(data = predobs_val) +
|
|
ggplot2::geom_point(ggplot2::aes(x = obs, y = pred),
|
|
shape = 1, size = 2, alpha = 1/2) +
|
|
ggplot2::geom_text(data = annotation,
|
|
ggplot2::aes(x = Inf, y = -Inf, label = r2), size = 3,
|
|
hjust = 1.15, vjust = -3, parse = TRUE) +
|
|
ggplot2::geom_text(data = annotation,
|
|
ggplot2::aes(x = Inf, y = -Inf, label = rmse), size = 3,
|
|
hjust = 1.12, vjust = -2.5, parse = TRUE) +
|
|
ggplot2::geom_text(data = annotation,
|
|
ggplot2::aes(x = Inf, y = -Inf, label = rpd), size = 3,
|
|
hjust = 1.15, vjust = -1.25, parse = TRUE) +
|
|
ggplot2::facet_grid(~ dataType,
|
|
labeller = ggplot2::as_labeller(to_string)) +
|
|
# ggplot2::facet_grid(~ dataType,
|
|
# labeller = dataType_labeller) +
|
|
ggplot2::theme_bw() +
|
|
ggplot2::geom_abline(col = "red") +
|
|
ggplot2::labs(x = x_label, y = y_label) +
|
|
ggplot2::xlim(c(min(predobs_val$obs) -
|
|
0.05 * diff(range(predobs_val$obs)),
|
|
max(predobs_val$obs) +
|
|
0.05 * diff(range(predobs_val$obs)))) +
|
|
ggplot2::ylim(c(min(predobs_val$obs) -
|
|
0.05 * diff(range(predobs_val$obs)),
|
|
max(predobs_val$obs) +
|
|
0.05 * diff(range(predobs_val$obs)))) +
|
|
ggplot2::coord_fixed()
|
|
if(print == TRUE) {
|
|
print(p_model)
|
|
}
|
|
|
|
list(stats = stats, p_model = p_model)
|
|
}
|
|
|
|
|
|
## PLS regression modeling in one function ======================
|
|
|
|
#' @title Calibration sampling, model tuning, and PLS regression
|
|
#' @description Perform calibration sampling and use selected
|
|
#' calibration set for model tuning
|
|
#' @param spec_chem data.frame that contains IR spectroscopy
|
|
#' and chemical data
|
|
#' @param k Number of validation samples
|
|
#' @param pc Number of Principal Components used for Calibration
|
|
#' sampling (Kennard-Stones algorithm)
|
|
#' @param ratio_val Ratio of number of validation and all samples.
|
|
#' @param print Logical expression weather graphs shall be printed
|
|
#' @param validation Logical expression weather independent
|
|
#' validation is performed
|
|
#' @param variable Response variable (without quotes)
|
|
#' @param env Environment where function is evaluated
|
|
#' @export
|
|
# Note: check non standard evaluation, argument passing...
|
|
pls_ken_stone <- function(spec_chem, ratio_val, pc,
|
|
print = TRUE, validation = TRUE, variable,
|
|
env = parent.frame()) {
|
|
calibration <- 0
|
|
# Calibration sampling
|
|
list_sampled <- ken_stone(
|
|
spec_chem, ratio_val = ratio_val, pc = 2, validation = TRUE
|
|
)
|
|
tr_control <- tune_model_q(list_sampled,
|
|
substitute(variable), env
|
|
)
|
|
pls <- fit_pls_q(x = list_sampled, validation = TRUE,
|
|
variable = substitute(variable), tr_control = tr_control, env
|
|
)
|
|
stats <- evaluate_pls_q(x = list_sampled, pls_model = pls,
|
|
variable = substitute(variable), env = parent.frame()
|
|
)
|
|
list(data = list_sampled, p_pc = list_sampled$p_pc,
|
|
pls_model = pls, stats = stats$stats, p_model = stats$p_model)
|
|
}
|
|
|
|
|