You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
271 lines
12 KiB
271 lines
12 KiB
### VIP.R: Implementation of VIP (variable importance in projection)(*) for the
|
|
### `pls' package.
|
|
### $Id: VIP.R,v 1.2 2007/07/30 09:17:36 bhm Exp $
|
|
|
|
### Copyright: 2006,2007 Bjoern-Helge Mevik
|
|
### This program is free software; you can redistribute it and/or modify
|
|
### it under the terms of the GNU General Public License version 2 as
|
|
### published by the Free Software Foundation.
|
|
###
|
|
### This program is distributed in the hope that it will be useful,
|
|
### but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
### GNU General Public License for more details.
|
|
|
|
### A copy of the GPL text is available here:
|
|
### http://www.gnu.org/licenses/gpl-2.0.txt
|
|
|
|
### Contact info:
|
|
### Boejrn-Helge Mevik
|
|
### bhx6@mevik.net
|
|
### Roedtvetvien 20
|
|
### N-0955 Oslo
|
|
### Norway
|
|
|
|
### (*) As described in Chong, Il-Gyo & Jun, Chi-Hyuck, 2005, Performance of
|
|
### some variable selection methods when multicollinearity is present,
|
|
### Chemometrics and Intelligent Laboratory Systems 78, 103--112.
|
|
|
|
## VIP returns all VIP values for all variables and all number of components,
|
|
## as a ncomp x nvars matrix.
|
|
VIP <- function(object) {
|
|
# pb: added to avoid `R CMD check` note
|
|
method <- Yloadings <- scores <- loading.weights <- NULL
|
|
if (object$method != "oscorespls")
|
|
stop("Only implemented for orthogonal scores algorithm. Refit with 'method = \"oscorespls\"'")
|
|
if (nrow(object$Yloadings) > 1)
|
|
stop("Only implemented for single-response models")
|
|
|
|
SS <- c(object$Yloadings)^2 * colSums(object$scores^2)
|
|
Wnorm2 <- colSums(object$loading.weights^2)
|
|
SSW <- sweep(object$loading.weights^2, 2, SS / Wnorm2, "*")
|
|
sqrt(nrow(SSW) * apply(SSW, 1, cumsum) / cumsum(SS))
|
|
}
|
|
|
|
|
|
## VIPjh returns the VIP of variable j with h components
|
|
VIPjh <- function(object, j, h) {
|
|
# pb: added to avoid `R CMD check` note
|
|
method <- Yloadings <- scores <- loading.weights <- NULL
|
|
if (object$method != "oscorespls")
|
|
stop("Only implemented for orthogonal scores algorithm. Refit with 'method = \"oscorespls\"'")
|
|
if (nrow(object$Yloadings) > 1)
|
|
stop("Only implemented for single-response models")
|
|
|
|
b <- c(object$Yloadings)[1:h]
|
|
T <- object$scores[,1:h, drop = FALSE]
|
|
SS <- b^2 * colSums(T^2)
|
|
W <- object$loading.weights[,1:h, drop = FALSE]
|
|
Wnorm2 <- colSums(W^2)
|
|
sqrt(nrow(W) * sum(SS * W[j,]^2 / Wnorm2) / sum(SS))
|
|
}
|
|
|
|
#' @title Extract VIPs (variable importance in the projection) for a PLS
|
|
#' regression model output returned from model fitting with
|
|
#' \code{simplerspec::fit_pls()}
|
|
#' @description VIPs are extracted based on the \code{finalModel} sublist
|
|
#' in the \code{caret::train} output contained in the \code{model} element
|
|
#' of the \code{simplerspec::fit_pls()} model output list. The VIPs for
|
|
#' derived number of PLS components in the \code{finalModel} are computed.
|
|
#' @param mout Model output list returned from \code{simplerspec::fit_pls()}.
|
|
#' @usage extract_pls_vip(mout)
|
|
#' @return A tibble data frame with columns \code{wavenumber} and correponding
|
|
#' VIP values in the column \code{vip} for the finally chosen PLS regression
|
|
#' model at the final number of PLS components.
|
|
#' @export
|
|
extract_pls_vip <- function(mout) {
|
|
# Compute VIP for all wavenumbers and select only VIPs with ncomp in final
|
|
# model
|
|
final_model <- mout$model$finalModel
|
|
vip <- VIP(object = final_model)[final_model$ncomp, ]
|
|
# Collect wavenumbers from preprocessed spectra
|
|
wn <- as.numeric(colnames(mout$data$calibration$spc_pre[[1]]))
|
|
# Create a data frame with wavenumbers and VIP scores
|
|
tibble::tibble(wavenumber = wn, vip = vip)
|
|
}
|
|
|
|
#' @title Create a data frame containing start and end positions (wavenumbers)
|
|
#' where variable importance in projection (VIP) is > 1
|
|
#' @description Given a data frame with VIP outputs (wavenumber and vip
|
|
#' columns), start and end values denoting spectral regions where VIP > 1
|
|
#' are returned as data frame. The functions can be used as helper
|
|
#' function for plotting VIP.
|
|
#' @param df_vip Data frame containing \code{wavenumber} and \code{vip} columns
|
|
#' (numeric)
|
|
#' @return Data.frame containing vectors \code{start} (numeric; wavenumber),
|
|
#' \code{end} (numeric; wavenumber) and group (integer; values are
|
|
#' \code{1:length(start))}.
|
|
#' @export
|
|
create_vip_rects <- function(df_vip) {
|
|
|
|
# Avoid `R CMD check NOTE`: `no visible binding for global variable`
|
|
VIP <- wavenumber <- tail <- NULL
|
|
|
|
# Highlight region data
|
|
# https://stackoverflow.com/questions/32543176/highlight-areas-within-certain-x-range-in-ggplot2
|
|
v <- rep(0, nrow(df_vip))
|
|
v[df_vip$vip > 1] <- 1
|
|
# Get the start and end points for highlighted regions
|
|
inds <- diff(c(0, v))
|
|
start <- df_vip$wavenumber[inds == 1]
|
|
end <- df_vip$wavenumber[inds == -1]
|
|
if (length(start) > length(end)) {
|
|
end <- c(end, tail(df_vip$wavenumber, 1))
|
|
}
|
|
# Create data frame for rectangle layer (geom_rects)
|
|
data.frame(start = start, end = end, group = seq_along(start))
|
|
}
|
|
|
|
#' @title Plot stacked ggplot2 graphs with the Variable Importance for the
|
|
#' Projection (VIP) scores, mean replicate spectra (absorbance) per sample_id,
|
|
#'and the preprocessed spectra.
|
|
#' @description Plot stacked ggplot2 graphs of VIP for the final
|
|
#' PLS regression model output of the calibration (training) data set for the
|
|
#' final number of components, raw (replicate mean) spectra, and preprocessed
|
|
#' spectra. Regions with VIP > 1 are highlighted across the stacked graphs
|
|
#' in beige colour rectangles. VIP calculation is implemented as described in
|
|
#' Chong, I.-G., and Jun, C.-H. (2005). Performance of some variable selection
|
|
#' methods when multicollinearity is present. Chemometrics and Intelligent
|
|
#' Laboratory Systems, 78(1--2), 103--112. https://doi.org/10.1016/j.chemolab.2004.12.011
|
|
#' @param mout Model output list that is returned from
|
|
#' \code{simplerspec::fit_pls()}. This object contains a nested list with
|
|
#' the \code{caret::train()} object (class \code{train}), based on which
|
|
#' VIPs at finally selected number of PLS components are computed.
|
|
#' @param y1 Character vector of list-column name in
|
|
#' \code{mout$data$calibration}, where spectra for bottom graph are extracted.
|
|
#' Default is \code{"spc_mean"}, which plots the mean calibration spectra after
|
|
#' resampling.
|
|
#' @param y2 Character string of list-column name in
|
|
#' \code{mout$data$calibration}, where spectra for bottom graph are extracted.
|
|
#' Default is \code{"spc_pre"}, which plots the preprocessed calibration
|
|
#' spectra after resampling.
|
|
#' @param by Character string that is used to assign spectra to the same group
|
|
#' and therefore ensures that all spectra are plotted with the same colour.
|
|
#' Default is \code{"sample_id"}
|
|
#' @param xlab Character string of X axis title for shared x axis of stacked
|
|
#' graphs. Default is \code{expression(paste("Wavenumber [", cm^-1, "]"))}
|
|
#' @param ylab1 Y axis title of bottom spectrum. Default is \code{"Absorbance"}.
|
|
#' @param ylab2 Y axis title of bottom spectrum. Default is
|
|
#' \code{"Preprocessed Abs."}.
|
|
#' @param alpha Double between 0 and 1 that defines transparency of spectra
|
|
#' lines in returned graph (ggplot plot object).
|
|
#' @usage plot_pls_vip(mout, y1 = "spc_mean", y2 = "spc_pre",
|
|
#' by = "sample_id",
|
|
#' xlab = expression(paste("Wavenumber [", cm^-1, "]")),
|
|
#' ylab1 = "Absorbance", ylab2 = "Preprocessed Abs.",
|
|
#' alpha = 0.2)
|
|
#' @export
|
|
plot_pls_vip <- function(mout, y1 = "spc_mean", y2 = "spc_pre",
|
|
by = "sample_id",
|
|
xlab = expression(paste("Wavenumber [", cm^-1, "]")),
|
|
ylab1 = "Absorbance", ylab2 = "Preprocessed Abs.",
|
|
alpha = 0.2) {
|
|
|
|
# Avoid `R CMD check` NOTE: no visible binding for global variable `...`
|
|
variable <- wavenumber <- value <- group <- vip <- NULL
|
|
|
|
# Extract spectra tibble for calibration
|
|
spc_tbl <- mout$data$calibration
|
|
# Gather spectra in one data.table
|
|
|
|
ifelse(y1 == "spc",
|
|
{dt1 <- data.table::data.table(do.call(rbind, spc_tbl[, y1][[y1]]))},
|
|
{dt1 <- data.table::rbindlist(spc_tbl[, y1][[y1]])}
|
|
)
|
|
|
|
ifelse(y2 == "spc",
|
|
{dt2 <- data.table::data.table(do.call(rbind, spc_tbl[, y2][[y2]]))},
|
|
{dt2 <- data.table::rbindlist(spc_tbl[, y2][[y2]])}
|
|
)
|
|
|
|
# Extract ID variable and append it to the data.table
|
|
id <- spc_tbl[, by][[by]]
|
|
dt1[, id := id]
|
|
dt2[, id := id]
|
|
|
|
# Convert data table to long form for ggplot2 plotting
|
|
dt1_long <- data.table::melt(
|
|
dt1, measure = names(dt1)[!names(dt1) %in% c("id")]
|
|
)
|
|
dt1_long[, variable := as.numeric(as.character(variable))]
|
|
data.table::setnames(dt1_long, old = "variable", new = "wavenumber")
|
|
|
|
dt2_long <- data.table::melt(
|
|
dt2, measure = names(dt2)[!names(dt2) %in% c("id")]
|
|
)
|
|
dt2_long[, variable := as.numeric(as.character(variable))]
|
|
data.table::setnames(dt2_long, old = "variable", new = "wavenumber")
|
|
|
|
# Plot spectra and VIP scores
|
|
brk <- pretty(as.numeric(names(dt1)[!names(dt1) %in% c("id")]),
|
|
n = 10)
|
|
maxmin <- function(x) {c(max(x), min(x))}
|
|
x_lim <- maxmin(as.numeric(names(dt1)[!names(dt1) %in% c("id")]))
|
|
|
|
# Extract VIP (Variable Importance in Projection) scores for the given
|
|
# model
|
|
df_vip <- extract_pls_vip(mout)
|
|
|
|
# Determine highlighted regions above VIP = 1
|
|
rects <- create_vip_rects(df_vip)
|
|
|
|
# Plot for resampled and mean replicate spectra
|
|
p_spc <- ggplot2::ggplot(dt1_long, ggplot2::aes(x = wavenumber, y = value)) +
|
|
ggplot2::geom_rect(data = rects, inherit.aes = FALSE,
|
|
ggplot2::aes(xmin = start, xmax = end, ymin = min(dt1_long$value),
|
|
ymax = max(dt1_long$value), group = group), color = "transparent",
|
|
fill = "orange", alpha = 0.3) +
|
|
ggplot2::geom_line(data = dt1_long, inherit.aes = FALSE,
|
|
ggplot2::aes(x = wavenumber, y = value, group = id),
|
|
alpha = alpha, size = 0.2) +
|
|
ggplot2::scale_x_reverse(limits = x_lim, breaks = brk) +
|
|
ggplot2::labs(x = xlab, y = ylab1) +
|
|
ggplot2::theme_bw() +
|
|
ggplot2::theme(plot.margin = ggplot2::unit(c(1, 5, -30, 6),
|
|
units = "points"), axis.text.x = ggplot2::element_blank())
|
|
|
|
p_spc_pre <- ggplot2::ggplot(dt2_long, ggplot2::aes(wavenumber, value)) +
|
|
ggplot2::geom_rect(data = rects, inherit.aes = FALSE,
|
|
ggplot2::aes(xmin = start, xmax = end, ymin = min(dt2_long$value),
|
|
ymax = max(dt2_long$value), group = group), color = "transparent",
|
|
fill = "orange", alpha = 0.3) +
|
|
ggplot2::geom_line(ggplot2::aes(group = id), alpha = alpha, size = 0.2) +
|
|
ggplot2::labs(x = xlab, y = ylab2) +
|
|
ggplot2::theme_bw() +
|
|
ggplot2::theme(plot.margin = ggplot2::unit(c(0, 5, 1, 1),
|
|
units = "points")) +
|
|
ggplot2::scale_x_reverse(limits = x_lim, breaks = brk) +
|
|
ggplot2::theme(plot.margin = ggplot2::unit(c(1, 5, -30, 6),
|
|
units = "points"),
|
|
axis.title.y = ggplot2::element_text(vjust = 0.25),
|
|
axis.text.x = ggplot2::element_blank())
|
|
|
|
# Plot for VIP
|
|
# Extract PLS model response variable and number of components
|
|
response <- as.character(unique(mout$stats$response))
|
|
ncomp <- mout$model$finalModel$ncomp
|
|
p_vip <- ggplot2::ggplot(data = df_vip,
|
|
ggplot2::aes(x = wavenumber, y = vip)) +
|
|
ggplot2::geom_rect(data = rects, inherit.aes = FALSE,
|
|
ggplot2::aes(xmin = start, xmax = end, ymin = min(df_vip$vip),
|
|
ymax = max(df_vip$vip), group = group), color = "transparent",
|
|
fill = "orange", alpha = 0.3) +
|
|
ggplot2::geom_hline(yintercept = 1, colour = "red") +
|
|
ggplot2::geom_line() +
|
|
ggplot2::xlab(xlab) +
|
|
ggplot2::ylab(
|
|
bquote(paste(VIP[PLSR], " (", .(response),
|
|
", ", .(ncomp), " comps)"))) +
|
|
ggplot2::scale_x_reverse(limits = x_lim, breaks = brk) +
|
|
ggplot2::theme_bw() +
|
|
ggplot2::theme(plot.margin = ggplot2::unit(c(0, 5, 1, 1),
|
|
units = "points"), axis.title.y = ggplot2::element_text(vjust = 0.25))
|
|
|
|
# Arrange plots in two panels without any margins in between
|
|
# Hints from
|
|
# https://stackoverflow.com/questions/42567045/alignment-of-two-plots-using-grid-arrange
|
|
cowplot::plot_grid(
|
|
p_spc, p_spc_pre, p_vip, rel_heights = c(0.4, 0.3, 0.3),
|
|
ncol = 1, align = "v")
|
|
}
|
|
|
|
|