Streamlining spectral data processing and modeling for spectroscopy applications
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

162 lines
6.1 KiB

#' @title Remove outlier spectra
#' @description Remove outlier spectra based on the
#' \code{pcout()} function of the \code{mvoutlier} package.
#' @usage remove_outliers(list_spectra, remove = TRUE)
#' @param list_spectra List that contains averaged
#' spectral information
#' in list element \code{MIR_mean} (data.frame) and metadata in
#' \code{data_meta} (data.frame).
#' @param remove logical expression (\code{TRUE} or \code{FALSE})
#' that specifies weather spectra shall be removed.
#' If \code{rm = FALSE}, there will be no outlier removal
#' @return Returns list \code{spectra_out} that contains:
#' \itemize{
#' \item \code{MIR_mean}: Outlier removed MIR spectra as
#' data.frame object. If \code{remove = FALSE},
#' the function will
#' return almost identical list identical to \code{list_spectra},
#' except that the first \code{indices} column of the spectral
#' data frame \code{MIR_mean} is removed
#' (This is done for both options
#' \code{remove = TRUE} and \code{remove = FALSE}).
#' \item \code{data_meta}: metadata data.frame, identical
#' as in the \code{list_spectra} input list.
#' \item \code{plot_out}: (optional) ggplot2 graph
#' that shows all spectra (absorbance on x-axis and wavenumber
#' on y-axis) with outlier marked, if
#' \code{remove = TRUE}.
#' }
#' @details This is an optional function if one wants to remove
#' outliers.
#' @export
remove_outliers <- function(list_spectra, remove = TRUE) {
# Outlier detection
# Use the mvoutlier package and pcout function to identify
# multivariate outliers
wfinal01 <- ID <- NULL
if (remove == TRUE) {
# Remove the 'indices' column
list_spectra$MIR_mean <- list_spectra$MIR_mean[, -1]
out <- mvoutlier::pcout(list_spectra$MIR_mean, makeplot = T,
outbound = 0.05) # parameters should be adapted
# Plot outlying spectra
plot_out <- plotMIR(
list_spectra$MIR_mean[
order(out$wfinal01, decreasing = T), ],
col = as.factor(out$wfinal01[order(out$wfinal01,
decreasing = T)])) +
ggplot2::scale_colour_brewer("outlier", palette = "Set1")
out_id <- as.character(
list_spectra$data_meta$ID[!as.logical(out$wfinal01)]
)
# Remove outliers
MIR_mean <- list_spectra$MIR_mean[
! list_spectra$data_meta$ID %in% out_id, ]
# rep ID and country name
data_meta <- list_spectra$data_meta[
! list_spectra$data_meta$ID %in% out_id, ]
spectra_out <- list(MIR_mean = MIR_mean,
data_meta = data_meta,
plot_out = plot_out)
} else {
# Remove the 'indices' column
list_spectra$MIR_mean <- list_spectra$MIR_mean[, -1]
spectra_out <- list(MIR_mean = list_spectra$MIR_mean,
data_meta = list_spectra$data_meta)
}
spectra_out
}
## plotMIR function of Antoine Stevens; don't export this
## function to the NAMESPACE
plotMIR <- function(spc, group = NULL, col = NULL,
linetype = NULL, wr = NULL, brk = NULL,
ylab = "Absorbance", xlab = "Wavenumber /cm-1",
by = NULL, by.wrap = T, ...){
# Function to plot spectra, based on the ggplot2 package
# spc = spectral matrix, with colnames = wavelengths
# group = grouping variable, usually the id's of the sample
# wr = wavelength range to plot
# brk = breaks of the x-axis
# by = factor variable for which the mean and sd of
# each level will be computed and plotted (optional)
# Requires packages ggplot2; data.table; reshape2
# Workaround to pass R CMD check:
# http://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when
# Setting the variables to NULL first
variable <- value <- colour <- NULL
spc <- as.data.frame(spc)
if (!is.null(wr))
spc <- spc[, as.numeric(colnames(spc)) >= min(wr) &
as.numeric(colnames(spc)) <= max(wr)]
if (is.null(brk))
brk <- pretty(as.numeric(colnames(spc)), n = 10)
if (!is.null(by)) {
spc$by <- by
spc <- data.table::data.table(spc, check.names = F)
mean.spc <- reshape2::melt(
spc[, lapply(data.table::.SD, mean), by = by],
id.vars = "by"
)
sd.spc <- reshape2::melt(
spc[, lapply(data.table::.SD, sd), by = by],
id.vars = "by"
)
mean.spc$min <- mean.spc$value - sd.spc$value
mean.spc$max <- mean.spc$value + sd.spc$value
mean.spc$variable <- as.numeric(
as.character(mean.spc$variable)
)
if (by.wrap) {
p <- ggplot2::ggplot(data = mean.spc) +
ggplot2::geom_ribbon(
ggplot2::aes(x = variable, ymin = min, ymax = max),
fill = "grey", col = "black", size = 0.15) +
ggplot2::theme_bw()
p <- p + ggplot2::geom_line(
ggplot2::aes(x = variable, y = value),
size = 0.25) +
ggplot2::facet_wrap(~ by) +
ggplot2::labs(x = xlab, y = ylab) +
ggplot2::scale_x_reverse(breaks = brk)
} else {
p <- ggplot2::ggplot(data = mean.spc,
ggplot2::aes(x = variable, y = value, group = by, col = by)) +
ggplot2::geom_line(size = 0.25) +
ggplot2::labs(x = xlab, y = ylab) +
ggplot2::scale_x_reverse(breaks = brk) +
ggplot2::theme_bw()
}
return(p)
} else {
if (is.null(group))
group <- as.character(1:nrow(spc))
spc$group <- group
spc$colour <- col
spc$linetype <- linetype
id.var <- colnames(spc)[
grep("group|colour|linetype",colnames(spc))]
tmp <- reshape2::melt(spc, id.var = id.var)
tmp$variable <- as.numeric(as.character(tmp$variable))
p <- ggplot2::ggplot(tmp,
ggplot2::aes(variable, value, group = group)) +
ggplot2::labs(x = xlab, y = ylab) +
ggplot2::theme_bw() +
ggplot2::scale_x_reverse(breaks = brk)
if (is.null(col) & is.null(linetype))
p <- p + ggplot2::geom_line(
ggplot2::aes(colour = group))
else if (!is.null(col) & is.null(linetype))
p <- p + ggplot2::geom_line(
ggplot2::aes(colour = colour))
else if (is.null(col) & !is.null(linetype))
p <- p + ggplot2::geom_line(
ggplot2::aes(colour = group,
linetype = linetype))
else p <- p + ggplot2::geom_line(
ggplot2::aes(colour = colour,
linetype = linetype))
return(p)
}
}