Streamlining spectral data processing and modeling for spectroscopy applications
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

271 lines
12 KiB

### VIP.R: Implementation of VIP (variable importance in projection)(*) for the
### `pls' package.
### $Id: VIP.R,v 1.2 2007/07/30 09:17:36 bhm Exp $
### Copyright: 2006,2007 Bjoern-Helge Mevik
### This program is free software; you can redistribute it and/or modify
### it under the terms of the GNU General Public License version 2 as
### published by the Free Software Foundation.
###
### This program is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
### GNU General Public License for more details.
### A copy of the GPL text is available here:
### http://www.gnu.org/licenses/gpl-2.0.txt
### Contact info:
### Boejrn-Helge Mevik
### bhx6@mevik.net
### Roedtvetvien 20
### N-0955 Oslo
### Norway
### (*) As described in Chong, Il-Gyo & Jun, Chi-Hyuck, 2005, Performance of
### some variable selection methods when multicollinearity is present,
### Chemometrics and Intelligent Laboratory Systems 78, 103--112.
## VIP returns all VIP values for all variables and all number of components,
## as a ncomp x nvars matrix.
VIP <- function(object) {
# pb: added to avoid `R CMD check` note
method <- Yloadings <- scores <- loading.weights <- NULL
if (object$method != "oscorespls")
stop("Only implemented for orthogonal scores algorithm. Refit with 'method = \"oscorespls\"'")
if (nrow(object$Yloadings) > 1)
stop("Only implemented for single-response models")
SS <- c(object$Yloadings)^2 * colSums(object$scores^2)
Wnorm2 <- colSums(object$loading.weights^2)
SSW <- sweep(object$loading.weights^2, 2, SS / Wnorm2, "*")
sqrt(nrow(SSW) * apply(SSW, 1, cumsum) / cumsum(SS))
}
## VIPjh returns the VIP of variable j with h components
VIPjh <- function(object, j, h) {
# pb: added to avoid `R CMD check` note
method <- Yloadings <- scores <- loading.weights <- NULL
if (object$method != "oscorespls")
stop("Only implemented for orthogonal scores algorithm. Refit with 'method = \"oscorespls\"'")
if (nrow(object$Yloadings) > 1)
stop("Only implemented for single-response models")
b <- c(object$Yloadings)[1:h]
T <- object$scores[,1:h, drop = FALSE]
SS <- b^2 * colSums(T^2)
W <- object$loading.weights[,1:h, drop = FALSE]
Wnorm2 <- colSums(W^2)
sqrt(nrow(W) * sum(SS * W[j,]^2 / Wnorm2) / sum(SS))
}
#' @title Extract VIPs (variable importance in the projection) for a PLS
#' regression model output returned from model fitting with
#' \code{simplerspec::fit_pls()}
#' @description VIPs are extracted based on the \code{finalModel} sublist
#' in the \code{caret::train} output contained in the \code{model} element
#' of the \code{simplerspec::fit_pls()} model output list. The VIPs for
#' derived number of PLS components in the \code{finalModel} are computed.
#' @param mout Model output list returned from \code{simplerspec::fit_pls()}.
#' @usage extract_pls_vip(mout)
#' @return A tibble data frame with columns \code{wavenumber} and correponding
#' VIP values in the column \code{vip} for the finally chosen PLS regression
#' model at the final number of PLS components.
#' @export
extract_pls_vip <- function(mout) {
# Compute VIP for all wavenumbers and select only VIPs with ncomp in final
# model
final_model <- mout$model$finalModel
vip <- VIP(object = final_model)[final_model$ncomp, ]
# Collect wavenumbers from preprocessed spectra
wn <- as.numeric(colnames(mout$data$calibration$spc_pre[[1]]))
# Create a data frame with wavenumbers and VIP scores
tibble::tibble(wavenumber = wn, vip = vip)
}
#' @title Create a data frame containing start and end positions (wavenumbers)
#' where variable importance in projection (VIP) is > 1
#' @description Given a data frame with VIP outputs (wavenumber and vip
#' columns), start and end values denoting spectral regions where VIP > 1
#' are returned as data frame. The functions can be used as helper
#' function for plotting VIP.
#' @param df_vip Data frame containing \code{wavenumber} and \code{vip} columns
#' (numeric)
#' @return Data.frame containing vectors \code{start} (numeric; wavenumber),
#' \code{end} (numeric; wavenumber) and group (integer; values are
#' \code{1:length(start))}.
#' @export
create_vip_rects <- function(df_vip) {
# Avoid `R CMD check NOTE`: `no visible binding for global variable`
VIP <- wavenumber <- tail <- NULL
# Highlight region data
# https://stackoverflow.com/questions/32543176/highlight-areas-within-certain-x-range-in-ggplot2
v <- rep(0, nrow(df_vip))
v[df_vip$vip > 1] <- 1
# Get the start and end points for highlighted regions
inds <- diff(c(0, v))
start <- df_vip$wavenumber[inds == 1]
end <- df_vip$wavenumber[inds == -1]
if (length(start) > length(end)) {
end <- c(end, tail(df_vip$wavenumber, 1))
}
# Create data frame for rectangle layer (geom_rects)
data.frame(start = start, end = end, group = seq_along(start))
}
#' @title Plot stacked ggplot2 graphs with the Variable Importance for the
#' Projection (VIP) scores, mean replicate spectra (absorbance) per sample_id,
#'and the preprocessed spectra.
#' @description Plot stacked ggplot2 graphs of VIP for the final
#' PLS regression model output of the calibration (training) data set for the
#' final number of components, raw (replicate mean) spectra, and preprocessed
#' spectra. Regions with VIP > 1 are highlighted across the stacked graphs
#' in beige colour rectangles. VIP calculation is implemented as described in
#' Chong, I.-G., and Jun, C.-H. (2005). Performance of some variable selection
#' methods when multicollinearity is present. Chemometrics and Intelligent
#' Laboratory Systems, 78(1--2), 103--112. https://doi.org/10.1016/j.chemolab.2004.12.011
#' @param mout Model output list that is returned from
#' \code{simplerspec::fit_pls()}. This object contains a nested list with
#' the \code{caret::train()} object (class \code{train}), based on which
#' VIPs at finally selected number of PLS components are computed.
#' @param y1 Character vector of list-column name in
#' \code{mout$data$calibration}, where spectra for bottom graph are extracted.
#' Default is \code{"spc_mean"}, which plots the mean calibration spectra after
#' resampling.
#' @param y2 Character string of list-column name in
#' \code{mout$data$calibration}, where spectra for bottom graph are extracted.
#' Default is \code{"spc_pre"}, which plots the preprocessed calibration
#' spectra after resampling.
#' @param by Character string that is used to assign spectra to the same group
#' and therefore ensures that all spectra are plotted with the same colour.
#' Default is \code{"sample_id"}
#' @param xlab Character string of X axis title for shared x axis of stacked
#' graphs. Default is \code{expression(paste("Wavenumber [", cm^-1, "]"))}
#' @param ylab1 Y axis title of bottom spectrum. Default is \code{"Absorbance"}.
#' @param ylab2 Y axis title of bottom spectrum. Default is
#' \code{"Preprocessed Abs."}.
#' @param alpha Double between 0 and 1 that defines transparency of spectra
#' lines in returned graph (ggplot plot object).
#' @usage plot_pls_vip(mout, y1 = "spc_mean", y2 = "spc_pre",
#' by = "sample_id",
#' xlab = expression(paste("Wavenumber [", cm^-1, "]")),
#' ylab1 = "Absorbance", ylab2 = "Preprocessed Abs.",
#' alpha = 0.2)
#' @export
plot_pls_vip <- function(mout, y1 = "spc_mean", y2 = "spc_pre",
by = "sample_id",
xlab = expression(paste("Wavenumber [", cm^-1, "]")),
ylab1 = "Absorbance", ylab2 = "Preprocessed Abs.",
alpha = 0.2) {
# Avoid `R CMD check` NOTE: no visible binding for global variable `...`
variable <- wavenumber <- value <- group <- vip <- NULL
# Extract spectra tibble for calibration
spc_tbl <- mout$data$calibration
# Gather spectra in one data.table
ifelse(y1 == "spc",
{dt1 <- data.table::data.table(do.call(rbind, spc_tbl[, y1][[y1]]))},
{dt1 <- data.table::rbindlist(spc_tbl[, y1][[y1]])}
)
ifelse(y2 == "spc",
{dt2 <- data.table::data.table(do.call(rbind, spc_tbl[, y2][[y2]]))},
{dt2 <- data.table::rbindlist(spc_tbl[, y2][[y2]])}
)
# Extract ID variable and append it to the data.table
id <- spc_tbl[, by][[by]]
dt1[, id := id]
dt2[, id := id]
# Convert data table to long form for ggplot2 plotting
dt1_long <- data.table::melt(
dt1, measure = names(dt1)[!names(dt1) %in% c("id")]
)
dt1_long[, variable := as.numeric(as.character(variable))]
data.table::setnames(dt1_long, old = "variable", new = "wavenumber")
dt2_long <- data.table::melt(
dt2, measure = names(dt2)[!names(dt2) %in% c("id")]
)
dt2_long[, variable := as.numeric(as.character(variable))]
data.table::setnames(dt2_long, old = "variable", new = "wavenumber")
# Plot spectra and VIP scores
brk <- pretty(as.numeric(names(dt1)[!names(dt1) %in% c("id")]),
n = 10)
maxmin <- function(x) {c(max(x), min(x))}
x_lim <- maxmin(as.numeric(names(dt1)[!names(dt1) %in% c("id")]))
# Extract VIP (Variable Importance in Projection) scores for the given
# model
df_vip <- extract_pls_vip(mout)
# Determine highlighted regions above VIP = 1
rects <- create_vip_rects(df_vip)
# Plot for resampled and mean replicate spectra
p_spc <- ggplot2::ggplot(dt1_long, ggplot2::aes(x = wavenumber, y = value)) +
ggplot2::geom_rect(data = rects, inherit.aes = FALSE,
ggplot2::aes(xmin = start, xmax = end, ymin = min(dt1_long$value),
ymax = max(dt1_long$value), group = group), color = "transparent",
fill = "orange", alpha = 0.3) +
ggplot2::geom_line(data = dt1_long, inherit.aes = FALSE,
ggplot2::aes(x = wavenumber, y = value, group = id),
alpha = alpha, size = 0.2) +
ggplot2::scale_x_reverse(limits = x_lim, breaks = brk) +
ggplot2::labs(x = xlab, y = ylab1) +
ggplot2::theme_bw() +
ggplot2::theme(plot.margin = ggplot2::unit(c(1, 5, -30, 6),
units = "points"), axis.text.x = ggplot2::element_blank())
p_spc_pre <- ggplot2::ggplot(dt2_long, ggplot2::aes(wavenumber, value)) +
ggplot2::geom_rect(data = rects, inherit.aes = FALSE,
ggplot2::aes(xmin = start, xmax = end, ymin = min(dt2_long$value),
ymax = max(dt2_long$value), group = group), color = "transparent",
fill = "orange", alpha = 0.3) +
ggplot2::geom_line(ggplot2::aes(group = id), alpha = alpha, size = 0.2) +
ggplot2::labs(x = xlab, y = ylab2) +
ggplot2::theme_bw() +
ggplot2::theme(plot.margin = ggplot2::unit(c(0, 5, 1, 1),
units = "points")) +
ggplot2::scale_x_reverse(limits = x_lim, breaks = brk) +
ggplot2::theme(plot.margin = ggplot2::unit(c(1, 5, -30, 6),
units = "points"),
axis.title.y = ggplot2::element_text(vjust = 0.25),
axis.text.x = ggplot2::element_blank())
# Plot for VIP
# Extract PLS model response variable and number of components
response <- as.character(unique(mout$stats$response))
ncomp <- mout$model$finalModel$ncomp
p_vip <- ggplot2::ggplot(data = df_vip,
ggplot2::aes(x = wavenumber, y = vip)) +
ggplot2::geom_rect(data = rects, inherit.aes = FALSE,
ggplot2::aes(xmin = start, xmax = end, ymin = min(df_vip$vip),
ymax = max(df_vip$vip), group = group), color = "transparent",
fill = "orange", alpha = 0.3) +
ggplot2::geom_hline(yintercept = 1, colour = "red") +
ggplot2::geom_line() +
ggplot2::xlab(xlab) +
ggplot2::ylab(
bquote(paste(VIP[PLSR], " (", .(response),
", ", .(ncomp), " comps)"))) +
ggplot2::scale_x_reverse(limits = x_lim, breaks = brk) +
ggplot2::theme_bw() +
ggplot2::theme(plot.margin = ggplot2::unit(c(0, 5, 1, 1),
units = "points"), axis.title.y = ggplot2::element_text(vjust = 0.25))
# Arrange plots in two panels without any margins in between
# Hints from
# https://stackoverflow.com/questions/42567045/alignment-of-two-plots-using-grid-arrange
cowplot::plot_grid(
p_spc, p_spc_pre, p_vip, rel_heights = c(0.4, 0.3, 0.3),
ncol = 1, align = "v")
}