# Streamlining spectral data processing and modeling for spectroscopy applications
# Perform calibration sampling based on spectral PCA ------------
#' @title Split data into calibration and validation sets
#' @description Perform calibration sampling based on
#' the Kennard-Stone algorithm.
#' @param spec_chem data.frame that contains chemical
#' and IR spectroscopy data
#' @param ratio_val Ratio of the number of validation samples to all
#' samples
#' @param pc Number of principal components used for the
#' Kennard-Stone sampling (numeric)
#' @param print Logical expression whether the PCA scores plot of the
#' Kennard-Stone selection is printed
#' @param validation Logical expression whether calibration sampling
#' is performed, i.e. whether the data are split into calibration and
#' validation sets (\code{TRUE} or \code{FALSE}).
#' @usage ken_stone(spec_chem, ratio_val, pc, print = TRUE,
#' validation = TRUE)
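#' @examples
#' # Illustrative sketch on synthetic data: the spectra must be stored in a
#' # matrix column named `MIR`; the response name `TC` is only an example.
#' \dontrun{
#' set.seed(123)
#' spec_chem <- data.frame(TC = runif(60, 1, 10))
#' spec_chem$MIR <- matrix(rnorm(60 * 100), nrow = 60)
#' split <- ken_stone(spec_chem, ratio_val = 1/3, pc = 2)
#' nrow(split$calibration)  # about 2/3 of the samples
#' nrow(split$validation)   # about 1/3 of the samples
#' split$p_pc               # PCA scores of the Kennard-Stone selection
#' }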
#' @export
ken_stone <- function(spec_chem, ratio_val, pc,
                      print = TRUE, validation = TRUE) {
  MIR <- model <- type <- PC1 <- PC2 <- NULL
  # k = number of samples to select
  # pc = if provided, the number of principal components
  # (see ?kenStone)
  if (validation == TRUE) {
    sel <- prospectr::kenStone(X = spec_chem$MIR,
      k = round(ratio_val * nrow(spec_chem)), pc = pc)
    # sel$model contains the row indices of the samples selected by
    # the Kennard-Stone algorithm; these are used as validation set
    # Plot samples selected for calibration and validation in ggplot
    sel_df_cal <- data.frame(sel$pc[- sel$model, 1:2])
    sel_df_cal$type <- as.factor(
      rep("calibration", nrow(sel_df_cal))
    )
    sel_df_val <- data.frame(sel$pc[sel$model, 1:2])
    sel_df_val$type <- as.factor(
      rep("validation", nrow(sel_df_val))
    )
    sel_df <- rbind(sel_df_cal, sel_df_val)
    p_pc <- ggplot2::ggplot(data = sel_df) +
      ggplot2::geom_point(
        ggplot2::aes(x = PC1, y = PC2, shape = type), size = 4) +
      ggplot2::coord_fixed(ratio = 1) +
      ggplot2::scale_shape_manual(values = c(1, 19)) +
      ggplot2::scale_colour_manual(values = c("black", "red")) +
      ggplot2::theme_bw() +
      ggplot2::theme(legend.title = ggplot2::element_blank())
    if (print == TRUE) {
      print(p_pc)
    }
    # Split spectral and chemical data into calibration and validation
    # set using the results of the Kennard-Stone sampling:
    # rows selected by Kennard-Stone form the validation set
    val_set <- spec_chem[sel$model, ]
    cal_set <- spec_chem[- sel$model, ]
    list_out <- list(
      calibration = cal_set,
      validation = val_set,
      p_pc = p_pc
    )
    list_out
  } else {
    cal_set <- spec_chem
    list(calibration = cal_set)
  }
}
#' @title Perform model tuning (quoted version of the function)
#' @description Uses functions from the caret package to set up model
#' tuning (cross-validation folds and a \code{trainControl} object)
#' for PLS regression.
#' @param x List from calibration sampling
#' @param variable Response variable for PLS regression, supplied
#' as a quoted expression (e.g. from \code{substitute()})
#' @param validation Logical expression whether an independent
#' validation is performed.
#' @param env Environment where function is evaluated
#' @export
tune_model_q <- function(x, variable,
                         env = parent.frame(), validation = TRUE) {
  calibration <- NULL
  # x is the list of calibration and validation samples
  # Set up a cross-validation scheme: create 10 folds that are kept
  # for the different modeling approaches to allow comparison
  # (setting k to the total number of samples gives leave-one-out)
  # variable is a quoted expression that is evaluated within the
  # calibration data (non-standard evaluation; see chapter
  # "Capturing expressions" in Advanced R by Hadley Wickham)
  r <- eval(variable, x$calibration, env)
  idx <- caret::createFolds(y = r, k = 10, returnTrain = TRUE)
  # Inject the fold indices into the trainControl object
  tr_control <- caret::trainControl(method = "cv", index = idx,
    savePredictions = TRUE)
  # Note: the validation argument currently does not alter the
  # cross-validation setup
  tr_control
}
#' @title Perform model tuning
#' @description Uses functions from the caret package to set up model
#' tuning (cross-validation folds and a \code{trainControl} object)
#' for PLS regression.
#' @param x List from calibration sampling
#' @param variable Response variable for PLS regression, supplied
#' as an unquoted name
#' @param validation Logical expression whether an independent
#' validation is performed.
#' @param env Environment where function is evaluated
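#' @examples
#' # Sketch (assumes `split` is the list returned by ken_stone() and that
#' # the calibration set has a numeric response column `TC`):
#' \dontrun{
#' tr_control <- tune_model(split, TC)
#' tr_control$method  # "cv": 10 predefined folds reused across models
#' }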
#' @export
tune_model <- function(x, variable,
                       env = parent.frame(), validation = TRUE) {
  tune_model_q(x, substitute(variable), env, validation = validation)
}
# Fit a PLS regression model using the caret package ------------
#' @title Fit a PLS regression model
#' (quoted version of the function)
#' @description Uses the caret package to perform PLS modeling.
#' Spectra are centered and scaled prior to modeling.
#' @param x List that contains calibration
#' set, validation set, and model tuning options
#' @param validation Logical expression whether independent
#' validation is performed
#' @param variable Response variable to be modeled
#' @param tr_control Object that defines controlling parameters
#' of the desired internal validation framework
#' @param env Environment where function is evaluated
#' @export
fit_pls_q <- function(x, validation = TRUE,
                      variable, tr_control, env = parent.frame()) {
  # Fit a partial least squares regression (PLS) model;
  # MIR spectra are centered and scaled prior to modeling
  calibration <- MIR <- NULL
  v <- eval(variable, x$calibration, env)
  # Note: the model fit is identical whether or not an independent
  # validation set has been kept aside
  pls_model <- caret::train(x = x$calibration$MIR, y = v,
    method = "pls",
    tuneLength = 20,
    trControl = tr_control,
    preProcess = c("center", "scale")
  )
  pls_model
}
#' @title Fit a PLS regression model
#' @description Uses the caret package to perform PLS modeling.
#' Spectra are centered and scaled prior to modeling.
#' @param x List that contains calibration
#' set, validation set, and model tuning options
#' @param validation Logical expression whether independent
#' validation is performed
#' @param variable Response variable to be modeled
#' @param tr_control Object that defines controlling parameters
#' of the desired internal validation framework
#' @param env Environment where function is evaluated
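#' @examples
#' # Sketch continuing from ken_stone() and tune_model(); `split`,
#' # `tr_control` and the response `TC` are illustrative assumptions:
#' \dontrun{
#' pls <- fit_pls(x = split, variable = TC, tr_control = tr_control)
#' pls$bestTune  # number of PLS components chosen by cross-validation
#' }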
#' @export
fit_pls <- function(x, validation = TRUE,
                    variable, tr_control, env = parent.frame()) {
  fit_pls_q(x = x, validation = validation,
    variable = substitute(variable), tr_control = tr_control, env = env
  )
}
# Evaluate PLS performance (validation and cross-validation) ----
#' @title Evaluate PLS performance
#' @description Calculate model performance indices based
#' on observed and predicted values of validation and calibration
#' set, and internal cross-validation
#' @param x List that contains calibration and validation data
#' frame with combined spectral and chemical data
#' @param pls_model List with PLS regression model output from
#' the caret package
#' @param variable Response variable (e.g. chemical property) to be
#' modelled, supplied as a quoted expression (e.g. via
#' \code{substitute()} or \code{quote()}). \code{variable}
#' needs to be a column name in the \code{validation} data.frame
#' (element of \code{x})
#' @param validation Logical expression if independent validation
#' is performed (split data set into calibration set and
#' validation set)
#' @param print Print observed vs. predicted for calibration
#' and validation. Default is \code{TRUE}.
#' @param env Specify the environment in which the function is
#' called. Default argument of \code{env} is
#' \code{parent.frame()}
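#' @examples
#' # Sketch (assumes `split` from ken_stone() and `pls` from fit_pls();
#' # the response `TC` is an illustrative column name). Note that this
#' # quoted version expects an unevaluated expression:
#' \dontrun{
#' res <- evaluate_pls_q(x = split, pls_model = pls,
#'   variable = quote(TC), validation = TRUE)
#' res$stats    # RMSE, R^2, RPD and observed ranges per data set
#' res$p_model  # predicted vs. observed plot
#' }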
#' @export
evaluate_pls_q <- function(x, pls_model, variable,
                           validation = TRUE, print = TRUE, env = parent.frame()) {
  # Set global variables to NULL to avoid R CMD check notes
  MIR <- object <- model <- dataType <- obs <- pred <- NULL
  ncomp <- finalModel <- rmsd <- r2 <- rpd <- n <- NULL
  rmse <- calibration <- NULL
  # Collect fitted object into a list
  list_models <- list(pls = pls_model)
  # Extract best tuning parameters and associated cv predictions
  if (validation == TRUE) {
    predobs_cal <- plyr::ldply(list_models,
      function(x) plyr::match_df(x$pred, x$bestTune),
      .id = "model"
    )
    # Calculate training (calibration) and test (validation) data
    # predictions based on the pls model fitted on the calibration data
    v <- eval(variable, x$validation, env)
    predobs_val <- caret::extractPrediction(list_models,
      testX = x$validation$MIR, testY = v)
    # Create new data frame column <object>
    predobs_val$object <- predobs_val$model
    # Replace levels "Training" and "Test" in dataType column
    # by "Calibration" and "Validation" (rename levels of factor)
    predobs_val$dataType <- plyr::revalue(predobs_val$dataType,
      c("Test" = "Validation", "Training" = "Calibration")
    )
    # Change the order of rows in the data frame:
    # Calibration as first level (show Calibration in ggplot graph
    # on left panel)
    predobs_val$dataType <- factor(predobs_val$dataType,
      levels = c("Calibration", "Validation"))
    # Calculate model performance indices by model and dataType
    # (uses package plyr and function summary_df of SPECmisc.R)
    stats <- plyr::ddply(predobs_val, c("model", "dataType"),
      function(x) summary_df(x, "obs", "pred")
    )
  } else {
    # Extract best tuning parameters and associated cv predictions
    predobs_cv <- plyr::ldply(list_models,
      function(x) plyr::match_df(x$pred, x$bestTune),
      .id = "model"
    )
    # Extract auto-prediction
    predobs <- caret::extractPrediction(list_models)
    predobs_cv$object <- predobs_cv$model
    predobs_cv$dataType <- "Cross-validation"
    predobs_cv <- dplyr::select(
      predobs_cv, obs, pred, model, dataType, object
    )
    predobs_val <- rbind(predobs, predobs_cv)
    stats <- plyr::ddply(predobs_val, c("model", "dataType"),
      function(x) summary_df(x, "obs", "pred")
    )
  }
  # Add number of components to stats; from finalModel list item
  # from train() function output (function from caret package)
  stats$ncomp <- rep(pls_model$finalModel$ncomp, nrow(stats))
  # Add range of observed values for validation and calibration;
  # get the range from the predicted vs. observed data frame
  # stored in object predobs_val
  obs_cal <- subset(predobs_val, dataType == "Calibration")$obs
  obs_val <- subset(predobs_val, dataType == "Validation")$obs
  # Get name of predicted variable; see p. 261 of the book
  # "Advanced R" (Hadley Wickham)
  variable_name <- deparse(variable)
  df_range <- data.frame(
    variable = rep(variable_name, 2),
    dataType = c("Calibration", "Validation"),
    min_obs = c(range(obs_cal)[1], range(obs_val)[1]),
    median_obs = c(median(obs_cal), median(obs_val)),
    max_obs = c(range(obs_cal)[2], range(obs_val)[2]),
    mean_obs = c(mean(obs_cal), mean(obs_val)),
    CV = c(sd(obs_cal) / mean(obs_cal) * 100,
      sd(obs_val) / mean(obs_val) * 100)
  )
  # Join stats with range data frame (df_range)
  stats <- plyr::join(stats, df_range, type = "inner")
  # Build plotmath annotation strings for the graphs
  annotation <- plyr::mutate(stats,
    rmse = as.character(as.expression(paste0("RMSE == ",
      round(rmsd, 2)))),
    r2 = as.character(as.expression(paste0("italic(R)^2 == ",
      round(r2, 2)))),
    rpd = as.character(as.expression(paste("RPD == ",
      round(rpd, 2)))),
    n = as.character(as.expression(paste0("italic(n) == ", n))),
    ncomp = as.character(as.expression(paste0("ncomp = ",
      ncomp)))
  )
  # Plot predicted vs. observed values and model indices;
  # add the number of samples per panel to facet_grid using a
  # labelling function
  # Labeller API update:
  # https://github.com/hadley/ggplot2/commit/ef33dc7
  # http://sahirbhatnagar.com/facet_wrap_labels
  # Prepare lookup character vector
  make_label <- function(x, validation = TRUE) {
    dataType <- n <- NULL
    if (validation == TRUE) {
      c(`Calibration` = paste0("Calibration", "~(",
          x[x$dataType == "Calibration", ]$n, ")"
        ),
        `Validation` = paste0("Validation", "~(",
          x[x$dataType == "Validation", ]$n, ")"
        )
      )
    } else {
      c(`Calibration` = paste0("Calibration", "~(",
          x[x$dataType == "Calibration", ]$n, ")"
        ),
        `Cross-validation` = paste0("Cross-validation", "~(",
          x[x$dataType == "Cross-validation", ]$n, ")"
        )
      )
    }
  }
  if (validation == TRUE) {
    label_validation <- make_label(x = annotation,
      validation = TRUE
    )
  } else {
    label_validation <- make_label(x = annotation,
      validation = FALSE
    )
  }
  # Rename labels on the fly with a lookup character vector
  # (facet labels are parsed as plotmath expressions)
  # http://docs.ggplot2.org/0.9.3.1/label_parsed.html
  # some other info: https://coderclub.b.uib.no/tag/plotmath/
  to_string <- ggplot2::as_labeller(
    x = label_validation, ggplot2::label_parsed
  )
  p_pred_obs <- ggplot2::ggplot(data = predobs_val) +
    ggplot2::geom_point(ggplot2::aes(x = obs, y = pred),
      shape = 1, size = 4) +
    ggplot2::geom_text(data = annotation,
      ggplot2::aes(x = -Inf, y = Inf, label = r2), size = 7,
      hjust = -0.1, vjust = 1.5, parse = TRUE) +
    ggplot2::geom_text(data = annotation,
      ggplot2::aes(x = -Inf, y = Inf, label = rmse), size = 7,
      hjust = -0.075, vjust = 4.25, parse = TRUE) +
    ggplot2::geom_text(data = annotation,
      ggplot2::aes(x = -Inf, y = Inf, label = rpd), size = 7,
      hjust = -0.1, vjust = 6.5, parse = TRUE) +
    ggplot2::facet_grid(~ dataType,
      labeller = ggplot2::as_labeller(to_string)) +
    ggplot2::theme_bw() +
    ggplot2::geom_abline(col = "red") +
    ggplot2::labs(x = "Observed", y = "Predicted") +
    ggplot2::xlim(c(min(predobs_val$obs) -
      0.05 * diff(range(predobs_val$obs)),
      max(predobs_val$obs) +
      0.05 * diff(range(predobs_val$obs)))) +
    ggplot2::ylim(c(min(predobs_val$obs) -
      0.05 * diff(range(predobs_val$obs)),
      max(predobs_val$obs) +
      0.05 * diff(range(predobs_val$obs))))
  ## ggplot graph for model comparison
  ## (arranged later in panels)
  x_label <- paste0("Observed ",
    as.character(variable_name))
  y_label <- paste0("Predicted ",
    as.character(variable_name))
  p_model <- ggplot2::ggplot(data = predobs_val) +
    ggplot2::geom_point(ggplot2::aes(x = obs, y = pred),
      shape = 1, size = 2, alpha = 1/2) +
    ggplot2::geom_text(data = annotation,
      ggplot2::aes(x = Inf, y = -Inf, label = r2), size = 3,
      hjust = 1.15, vjust = -3, parse = TRUE) +
    ggplot2::geom_text(data = annotation,
      ggplot2::aes(x = Inf, y = -Inf, label = rmse), size = 3,
      hjust = 1.12, vjust = -2.5, parse = TRUE) +
    ggplot2::geom_text(data = annotation,
      ggplot2::aes(x = Inf, y = -Inf, label = rpd), size = 3,
      hjust = 1.15, vjust = -1.25, parse = TRUE) +
    ggplot2::facet_grid(~ dataType,
      labeller = ggplot2::as_labeller(to_string)) +
    ggplot2::theme_bw() +
    ggplot2::geom_abline(col = "red") +
    ggplot2::labs(x = x_label, y = y_label) +
    ggplot2::xlim(c(min(predobs_val$obs) -
      0.05 * diff(range(predobs_val$obs)),
      max(predobs_val$obs) +
      0.05 * diff(range(predobs_val$obs)))) +
    ggplot2::ylim(c(min(predobs_val$obs) -
      0.05 * diff(range(predobs_val$obs)),
      max(predobs_val$obs) +
      0.05 * diff(range(predobs_val$obs)))) +
    ggplot2::coord_fixed()
  if (print == TRUE) {
    print(p_model)
  }
  list(stats = stats, p_model = p_model)
}
## PLS regression modeling in one function ======================
#' @title Calibration sampling, model tuning, and PLS regression
#' @description Perform calibration sampling, use the selected
#' calibration set for model tuning and PLS regression, and evaluate
#' the fitted model
#' @param spec_chem data.frame that contains IR spectroscopy
#' and chemical data
#' @param pc Number of principal components used for calibration
#' sampling (Kennard-Stone algorithm)
#' @param ratio_val Ratio of the number of validation samples to all
#' samples
#' @param print Logical expression whether graphs are printed
#' @param validation Logical expression whether independent
#' validation is performed
#' @param variable Response variable (without quotes)
#' @param env Environment where function is evaluated
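#' @examples
#' # End-to-end sketch on synthetic data; the response `TC` is an
#' # illustrative name and the spectra must be stored in a matrix
#' # column named `MIR`:
#' \dontrun{
#' set.seed(123)
#' spec_chem <- data.frame(TC = runif(60, 1, 10))
#' m <- matrix(rnorm(60 * 100), nrow = 60,
#'   dimnames = list(NULL, paste0("wn", 1:100)))
#' spec_chem$MIR <- m
#' out <- pls_ken_stone(spec_chem, ratio_val = 1/3, pc = 2, variable = TC)
#' out$stats    # performance summary (calibration and validation)
#' out$p_model  # predicted vs. observed plot
#' }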
#' @export
# Note: check non standard evaluation, argument passing...
pls_ken_stone <- function(spec_chem, ratio_val, pc,
                          print = TRUE, validation = TRUE, variable,
                          env = parent.frame()) {
  calibration <- NULL
  # Calibration sampling (Kennard-Stone)
  list_sampled <- ken_stone(spec_chem,
    ratio_val = ratio_val, pc = pc,
    print = print, validation = validation
  )
  # Model tuning: cross-validation folds and trainControl object
  tr_control <- tune_model_q(list_sampled,
    substitute(variable), env
  )
  # Fit the PLS regression model on the calibration set
  pls <- fit_pls_q(x = list_sampled, validation = validation,
    variable = substitute(variable), tr_control = tr_control, env
  )
  # Evaluate calibration and validation performance
  stats <- evaluate_pls_q(x = list_sampled, pls_model = pls,
    variable = substitute(variable), validation = validation,
    print = print, env = env
  )
  list(data = list_sampled, p_pc = list_sampled$p_pc,
    pls_model = pls, stats = stats$stats, p_model = stats$p_model)
}