@ -0,0 +1,2 @@ | |||
^.*\.Rproj$ | |||
^\.Rproj\.user$ |
@ -0,0 +1,4 @@ | |||
.Rproj.user | |||
.Rhistory | |||
.RData | |||
.Ruserdata |
@ -0,0 +1,27 @@ | |||
Package: simplerspec | |||
Type: Package | |||
Title: Soil and plant spectroscopic model building and prediction | |||
Depends: R (>= 3.2) | |||
Imports: | |||
ggplot2 (>= 2.0.0), | |||
plyr, | |||
data.table, | |||
reshape2, | |||
mvoutlier, | |||
hexView, | |||
Rcpp, | |||
hyperSpec, | |||
prospectr, | |||
dplyr, | |||
caret, | |||
tidyr | |||
Version: 0.1.0 | |||
Authors@R: person("Philipp", "Baumann", | |||
email = "baumanph@student.ethz.ch", role = c("aut", "cre")) | |||
Description: Functions that cover | |||
reading of spectral data, outlier removal, | |||
spectral preprocessing, calibration sampling, PLS regression | |||
using caret, and model diagnostic statistics and plots. | |||
License: GPL-2 | |||
LazyData: TRUE | |||
RoxygenNote: 5.0.1 |
@ -0,0 +1,23 @@ | |||
# Generated by roxygen2: do not edit by hand | |||
export(average_spectra) | |||
export(do_pretreatment) | |||
export(evaluate_pls_q) | |||
export(fit_pls) | |||
export(fit_pls_q) | |||
export(join_chem_spec) | |||
export(ken_stone) | |||
export(pls_ken_stone) | |||
export(predict_from_spectra) | |||
export(readOPUS) | |||
export(readOPUS_bin) | |||
export(readOPUS_text) | |||
export(read_spectra) | |||
export(remove_outliers) | |||
export(resample_spectra) | |||
export(summary_df) | |||
export(tune_model) | |||
export(tune_model_q) | |||
import(Rcpp) | |||
import(data.table) | |||
import(hyperSpec) |
@ -0,0 +1,85 @@ | |||
# Helper function written by Antoine Stevens that | |||
# is used for averaging replication scans of one sample | |||
#' @import data.table | |||
#' @import Rcpp | |||
by_spc <- function(spc, indices, fun = mean){ | |||
# Fast summary of spectral data | |||
# spc = spectral matrix | |||
# indices = factor variable used to summarize data | |||
# fun = summary function | |||
# Avoid NOTE "no visible binding for global variable '.SD'" | |||
# when checking package by devtools::check() | |||
# .SD <- NULL | |||
spc <- data.table::data.table(indices, spc, check.names = F) | |||
if(is.null(ncol(indices))){ | |||
x <- 1 | |||
} else { | |||
x <- ncol(indices) | |||
} | |||
as.data.frame(spc[, lapply(.SD, fun), | |||
by = eval(names(spc)[1:x])]) | |||
} | |||
#' @title Calculate mean of spectra | |||
#' @description Calculate the mean of each spectral repetitions | |||
#' (absorbance average per wavenumber) | |||
#' @param in_spectra List that contains spectral data in the | |||
#' element \code{MIR} (data.frame) and sample metadata in the | |||
#' list element \code{data_rep} (data.frame). | |||
#' The data.frame \code{data_meta} | |||
#' contains the sample ID stored in the \code{ID} | |||
#' vector (originally from spectral file names), | |||
#' country abbreviation stored in \code{contry} (2 letters), | |||
#' and the vector \code{site} (2 letters) that is the country | |||
#' abbreviation. | |||
#' @return \code{out_spectra}: List that contains: | |||
#' \itemize{ | |||
#' \item \code{data_meta}: metadata of sample (data.frame) | |||
#' that is | |||
#' taken from the element \code{rep} of the input list argument | |||
#' \code{in_spectra} | |||
#' \item \code{MIR_mean}: average spectra from replicates of | |||
#' sample ID | |||
#' (data.frame) | |||
#' \item \code{MIR_sd}: standard deviation of spectra calculated | |||
#' from replicates of sample ID (data.frame) | |||
#' \item \code{cvar} coefficient of variance over all | |||
#' wavenumbers of spectra | |||
#' calculated from replicates of sample ID (vector) | |||
#' } | |||
#' @export | |||
average_spectra <- function(in_spectra) { | |||
# Compute mean per sample with by_spc, | |||
# provided by Antoine | |||
# spc = spectral data | |||
# indices = character vector(s) or factor(s) to group the rows | |||
# by fun = summary function | |||
# Also compute the standard deviation (SD) | |||
# of the three measurements | |||
# Identify samples in which the spectrum has SD higher than 1.5 | |||
# and need to be re-scanned - ? | |||
MIR <- NULL | |||
data_rep <- NULL | |||
ID <- NULL | |||
MIR_mean <- by_spc(spc = in_spectra$MIR[, ], | |||
indices = in_spectra$data_rep$ID[], fun = mean) | |||
MIR_sd <- by_spc(spc = in_spectra$MIR[, ], | |||
indices = in_spectra$data_rep$ID[], fun = sd) | |||
# MIR_sd[order(rowMeans(MIR_sd[, -1])), 1] | |||
# rowMeans(MIR_sd[, -1])[order(rowMeans(MIR_sd[, -1]))] / | |||
# rowMeans(MIR_mean[, -1])[order(rowMeans(MIR_sd[, -1]))] | |||
# compute the coefficient of variation | |||
cvar <- rowMeans(MIR_sd[, -1])/rowMeans(MIR_mean[, -1]) | |||
# add metadata for each sample; take strings of rownames | |||
# from spectra; first two characters of the sample_ID code | |||
# is country, character pos. 4 and 5 is site abbreviation | |||
data_meta <- data.frame(ID = MIR_mean[, 1]) | |||
data_meta <- cbind(data_meta, | |||
country = substring(data_meta$ID, first = 1, last = 2), | |||
site = substring(data_meta$ID, first = 4, last = 5) | |||
) | |||
out_spectra <- list(data_meta = data_meta, | |||
MIR_mean = MIR_mean, | |||
MIR_sd = MIR_sd, | |||
cvar = cvar) | |||
return(out_spectra) | |||
} |
@ -0,0 +1,34 @@ | |||
## Join chemical and spectral data ============================== | |||
#' @title Join chemical and spectral data frames | |||
#' @description Combines spectral data (data.frame) and chemical | |||
#' data (data.frame). | |||
#' @param dat_chem data.frame that contains chemical values of | |||
#' the sample | |||
#' @param dat_spec List that contains spectral data | |||
#' @return List: xxx | |||
#' @param by character of column name that defines sample_ID | |||
#' @export | |||
join_chem_spec <- function( | |||
dat_chem, dat_spec, by = "sample_ID") { | |||
# Alternative when "no visible binding for global variable": | |||
data_meta <- MIR <- MIR0 <- ori <- MIR_mean <- NULL | |||
# http://stackoverflow.com/questions/23475309/in-r-is-it-possible-to-suppress-note-no-visible-binding-for-global-variable | |||
# Replace sample_ID by ID | |||
if(!is.data.frame(dat_chem)) { | |||
stop(dat_chem, "needs to be a data.frame", call. = FALSE) | |||
} else { | |||
colnames(dat_chem)[colnames(dat_chem) == by] <- "ID" | |||
dat_chem$ID <- as.factor(dat_chem$ID) | |||
# Select only chemical data that have no outlier spectra | |||
dat_chem <- dat_chem[dat_spec$data_meta$ID, ] | |||
ID <- as.factor(dat_spec$data_meta$ID) | |||
# Join ref analyses | |||
MIRdata <- data.frame(ID = ID) | |||
MIRdata$MIR <- dat_spec$MIR0 | |||
MIRdata$ori <- dat_spec$MIR_mean | |||
# Joining by ID, type = "inner" | |||
MIRdata_chem <- plyr::join(dat_chem, MIRdata, type = "inner") | |||
# before dplyr::inner_join(dat_chem, MIRdata) | |||
MIRdata_chem | |||
} | |||
} |
@ -0,0 +1,71 @@ | |||
## Function 1: Read spectra in text form ======================== | |||
#' @title Read an OPUS text file and extract metadata | |||
#' @description | |||
#' Read single text file acquired with | |||
#' an Bruker Vertex FTIR Instrument | |||
#' (as exported from OPUS software) and extract sample metadata | |||
#' provided in the filename | |||
#' @usage | |||
#' read_spectra(path) | |||
#' @param path character of the directory | |||
#' where the spectral text files are stored | |||
#' @return | |||
#' List that contains the following elements: | |||
#' \itemize{ | |||
#' \item \code{MIR}: data.frame that contains all the spectra. | |||
#' The columns of \code{MIR} contain absorbance values at | |||
#' different wavenumber in the MIR range. The wavenumbers | |||
#' rounded to 0.1 are given as column names. The original file | |||
#' names are stored as row names. One line in the data frame | |||
#' \code{MIR} contains one replicate scan of a sample. | |||
#' \item \code{data_rep}: data.frame that constists of sample | |||
#' metadata that was extracted from the file name of | |||
#' individual spectral files. The first vector \code{ID} | |||
#' contains the spectral file name without the repetition number | |||
#' supplied as \code{.<number>} in the file name. | |||
#' Letters 1 to 2 of the spectral | |||
#' file name are used for the country abbreviation, stored | |||
#' as in the \code{} vector \code{data_rep} . Letters | |||
#' 4 to 5 of the file name are used for the landscape (site) | |||
#' abbreviation. | |||
#' } | |||
#' @note: This function is derived from a re-factored and | |||
#' simplified version of the \code{read.opus} function from the | |||
#' \sQuote{soil.spec} package for reading OPUS VERTEX files | |||
#' The function should also work for other OPUS files (eg alpha), | |||
#' see \code{read.opus}. The function readOPUS() was | |||
#' written by Antoine Stevens. | |||
#' @export | |||
read_spectra <- function(path){ | |||
# Needs various utilities for spectral processing that Antoine | |||
# put on github | |||
ID <- NULL | |||
# Load the MIR data exported from OPUS to txt files | |||
# List files in the directory | |||
lf <- list.files(path, full.names = TRUE) | |||
# Read files into R with readOPUS() | |||
# (comes from the github file) | |||
MIR <- readOPUS(lf, in_format = "txt") | |||
# Wavenumber, from 3996.4 to 599.8 cm-1 | |||
colnames(MIR) <- round(as.numeric(colnames(MIR)), 1) | |||
# Remove the txt extension | |||
rownames(MIR) <- sub("\\.txt", "", row.names(MIR)) | |||
# Prepare a dataset with ID, | |||
# Extract country with substring | |||
# and repetition | |||
# (ID = character before the dot; rep = number after the dot) | |||
data_rep <- data.frame(ID = sub("(.+)\\.[[:digit:]]+$", "\\1", | |||
row.names(MIR)), | |||
rep = as.numeric(sub(".+\\.([[:digit:]])+$", "\\1", | |||
row.names(MIR))) | |||
) | |||
data_rep <- cbind(data_rep, | |||
country = substring(data_rep$ID, first = 1, last = 2), | |||
site = substring(data_rep$ID, first = 4, last = 5) | |||
) | |||
list_spectra <- list( | |||
MIR = MIR, | |||
data_rep = data_rep | |||
) | |||
return(list_spectra) | |||
} |
@ -0,0 +1,310 @@ | |||
## Soil spectroscopy related functions that were compiled by | |||
## Antoine Stevens ============================================== | |||
#' @title Read an OPUS text file | |||
#' @description | |||
#' Read single text file acquired with | |||
#' an Bruker Vertex FTIR Instrument | |||
#' (as exported from OPUS software) | |||
#' @param file.name Character vector with path to files | |||
#' @usage readOPUS_text(file.name) | |||
#' @export | |||
readOPUS_text <- function(file.name){ | |||
if (file.exists(file.name)) { | |||
out <- read.csv(file.name, header=F, | |||
col.names = c("wavenumber", "absorbance") | |||
) | |||
return(out) | |||
} else { | |||
warning(paste("File", file.name, "does not exist")) | |||
} | |||
} | |||
#' @title Read an OPUS binary file | |||
#' @description | |||
#' Read single binary file acquired with an | |||
#' Bruker Vertex FTIR Instrument | |||
#' @param file.name Character vector with path to files | |||
#' @usage readOPUS_bin(file.name) | |||
#' @export | |||
readOPUS_bin <- function(file.name){ | |||
size <- fileRaw <- NULL | |||
if (file.exists(file.name)) { | |||
try( | |||
pa <- hexView::readRaw(file.name, offset = 0, | |||
nbytes = file.info(file.name)$size, human = "char", | |||
size = 1, endian = "little"), silent = TRUE) | |||
if (!class(.Last.value)[1] == "try-error") { | |||
pr <- pa$fileRaw | |||
# Get source of instrument | |||
ins <- grepRaw("INS", pr, all = TRUE) | |||
ins <- hexView::readRaw( | |||
file.name, offset = ins[length(ins)] + 7, | |||
nbytes = 3, human = "char", size = 1, endian = "little" | |||
) | |||
ins <- hexView::blockString(ins) | |||
# Get source of infrared to know if NIR or MIR | |||
src <- grepRaw("SRC", pr, all = TRUE) | |||
src <- hexView::readRaw( | |||
file.name, offset = src[length(src)] + 4, | |||
nbytes = 3, human = "char", size = 1, endian = "little" | |||
) | |||
src <- hexView::blockString(src) | |||
instr.range <- tolower(paste(ins, src, sep = "-")) | |||
# Get Beam Splitter | |||
bms <- grepRaw("BMS", pr, all = TRUE) | |||
bms <- hexView::readRaw( | |||
file.name, offset = bms[length(bms)] + 4, | |||
nbytes = 4, human = "char", size = 1, endian = "little" | |||
) | |||
bms <- hexView::blockString(bms) | |||
z <- grepRaw("ZFF", pr, all = TRUE)[1] + 5 | |||
re <- grepRaw("RES", pr, all = TRUE)[1] + 5 | |||
snm <- grepRaw("SNM", pr, all = TRUE)[1] + 7 | |||
lwn <- grepRaw("LWN", pr, all = TRUE)[1] + 7 | |||
fx <- grepRaw("FXV", pr, all = TRUE)[3] + 7 | |||
lx <- grepRaw("LXV", pr, all = TRUE)[3] + 7 | |||
npt0 <- grepRaw("NPT", pr, all = TRUE)[2] + 3 | |||
npt1 <- grepRaw("NPT", pr, all = TRUE)[3] + 7 | |||
mxy <- grepRaw("MXY", pr, all = TRUE)[1] + 7 | |||
mny <- grepRaw("MNY", pr, all = TRUE)[3] + 7 | |||
end <- grepRaw("END", pr, all = TRUE) + 11 | |||
dat <- grepRaw( "DAT", pr, all = TRUE)[1] + 7 | |||
tim <- grepRaw("TIM", pr, all = TRUE) + 11 | |||
# calculate end and start of each block | |||
offs <- end[5:10] | |||
byts <- diff(offs) | |||
ZFF <- hexView::readRaw(file.name, offset = z, nbytes = 4, | |||
human = "int", size = 2)[[5]][1] | |||
RES <- hexView::readRaw(file.name, offset = re, nbytes = 4, | |||
human = "int", size = 2)[[5]][1] | |||
snm.lab.material <- hexView::blockString( | |||
hexView::readRaw(file.name, offset = snm, nbytes = 22, | |||
human = "char", size = 1, endian = "little") | |||
) | |||
if (!nzchar(snm.lab.material)) { | |||
SSN <- "" | |||
Material <- "" | |||
warning("Product name not found inside OPUS file...") | |||
} | |||
else { | |||
if (!length(grep(snm.lab.material, pattern = ";")) == 0) { | |||
snm.lab.material <- as.vector( | |||
strsplit(snm.lab.material, ";") | |||
)[[1]] | |||
SSN <- paste0(snm.lab.material[2], snm.lab.material[1]) | |||
Material <- snm.lab.material[3] | |||
} else { | |||
if (!length(grep(snm.lab.material, pattern = "_")) == 0) { | |||
# Don't remove "_" from unique id SSN (@baumann) | |||
# SSN <- sub("_", "", snm.lab.material) | |||
SSN <- snm.lab.material | |||
Material <- "" | |||
} else { | |||
if (!length(snm.lab.material) == 0) { | |||
SSN <- snm.lab.material | |||
Material <- "" | |||
} | |||
} | |||
} | |||
} | |||
# Set three SSN first three characters to lower | |||
# Don't convert to lowercase | |||
# SSN <- paste0(tolower(substr(SSN, 1, 3)), | |||
# substr(SSN, 4, 20)) | |||
Scandate <- hexView::blockString( | |||
hexView::readRaw(file.name, offset = dat, | |||
nbytes = 10, human = "char", size = 1, | |||
endian = "little") | |||
) | |||
Scantime <- hexView::blockString( | |||
hexView::readRaw(file.name, | |||
offset = tim[2] - 4, nbytes = 8, human = "char", | |||
size = 1, endian = "little") | |||
) | |||
Scandate <- paste(Scandate, Scantime) | |||
LWN <- hexView::readRaw( | |||
file.name, offset = lwn, nbytes = 8, | |||
human = "real", size = 8)[[5]][1] | |||
# Combine the above parameters | |||
spectrum.meta <- c(SSN, Material, Scandate, ZFF, RES, LWN) | |||
# Get number of data points for each spectra data block | |||
NPT0 <- hexView::readRaw( | |||
file.name, offset = npt0, nbytes = 12, | |||
human = "int", size = 4)[[5]][2] | |||
NPT1 <- hexView::readRaw( | |||
file.name, offset = npt1, nbytes = 4, | |||
human = "int", size = 4)[[5]][1] | |||
# fxv: Frequency of first point | |||
fxv <- hexView::readRaw( | |||
file.name, offset = fx, nbytes = 16, | |||
human = "real", size = 8)[[5]][1] | |||
# lxv: Frequency of last point | |||
lxv <- hexView::readRaw( | |||
file.name, offset = lx, nbytes = 16, | |||
human = "real", size = 8)[[5]][1] | |||
# Read all through all the data blocks inside the OPUS file | |||
nbytes1 <- NPT0 * 4 # initial parameters | |||
nbytes.f <- NPT1 * 4 | |||
if (offs[1] < 2000) { | |||
offs.f <- offs[3] | |||
nbytes.f <- NPT1 * 4 | |||
wavenumbers <- rev(seq(lxv, fxv, (fxv - lxv)/(NPT1 - 1))) | |||
} | |||
else if (offs[1] > 20000) { | |||
offs.f <- offs[2] | |||
nbytes.f <- NPT1 * 4 | |||
wavenumbers <- rev(seq(lxv, fxv, (fxv - lxv)/(NPT1 - 1))) | |||
} else { # for vert-MIR | |||
offs.f <- 7188 | |||
nbytes.f <- NPT0 * 4 | |||
lxv <- hexView::readRaw( | |||
file.name, offset = 8768, nbytes = 16, | |||
human = "real", size = 8)[[5]][1] | |||
fxv <- hexView::readRaw( | |||
file.name, offset = 8752, nbytes = 16, | |||
human = "real", size = 8)[[5]][1] | |||
wavenumbers <- rev(seq(lxv, fxv, (fxv - lxv)/(NPT0 - 1))) | |||
} | |||
spectra <- hexView::readRaw(file.name, width = NULL, | |||
offset = offs.f - 4, nbytes = nbytes.f, human = "real", # needs to be -4 according to soil.spec function | |||
size = 4, endian = "little")[[5]] | |||
# File name | |||
file_name <- sub(".+/(.+)", "\\1", file.name) | |||
# Create date_time object | |||
date_time <- as.POSIXct(spectrum.meta[3], | |||
format = "%d/%m/%Y %H:%M:%S ") | |||
# Create unique_id using file_name and time | |||
ymd_id <- format(date_time, "%Y%m%d") | |||
unique_id <- paste0(file_name, "_", ymd_id) | |||
# Add sample_id: remove extension .0, .1 etc. from OPUS files | |||
sample_id <- sub("(.+)\\.[[:digit:]]+$", "\\1", file_name) | |||
# Extract repetition number (rep_no) from file name | |||
rep_no <- sub(".+\\.([[:digit:]])+$", "\\1", file.name) | |||
# Convert spectra to matrix and add dimnames (wavenumbers for columns | |||
# and unique_id for rows) | |||
spc_m <- matrix(spectra, ncol = length(spectra), byrow = FALSE) | |||
rownames(spc_m) <- unique_id | |||
colnames(spc_m) <- round(wavenumbers, 1) | |||
out <- list( | |||
metadata = data.frame( | |||
unique_id = unique_id, | |||
scan_id = file_name, # changed file_name to scan_id in output list | |||
sample_id = sample_id, | |||
rep_no = rep_no, | |||
date_time = date_time, | |||
sample_info = spectrum.meta[1], | |||
instrument_name = instr.range, | |||
resolution = spectrum.meta[5], | |||
bms = bms, | |||
lwn = spectrum.meta[6] | |||
), | |||
spc = spc_m, | |||
wavenumbers = wavenumbers | |||
) | |||
# names(out)[-c(1:9)] <- as.character(round(wavenumbers, 1)) | |||
return(out) | |||
} | |||
} else { | |||
warning(paste("File", file.name, "does not exist")) | |||
} | |||
} | |||
#' @title Read OPUS binary and ASCII files | |||
#' @description | |||
#' Read single or multiple binary and ASCII files acquired with | |||
#' an Bruker Vertex FTIR Instrument | |||
#' @usage | |||
#' readOPUS(fnames, in_format, out_format) | |||
#' @param fnames character \code{vector} of the name(s) | |||
#' (with absolute path) of the file(s) to read | |||
#' @param in_format format of the input file: \code{'binary'} or | |||
#' \code{'txt'} | |||
#' @param out_format format of the output: | |||
#' \code{'matrix'} (default) or \code{'list'} (see below) | |||
#' @return | |||
#' if \code{out_format} = \code{'matrix'}, absorbance values | |||
#' of the input file(s) in a single \code{matrix}. | |||
#' | |||
#' if \code{out_format} = \code{'list'}, a \code{list} of the | |||
#' input file(s) data consisting of a \code{list} with components: | |||
#' \itemize{ | |||
#' \item{\code{Name}}{ name of the file imported} | |||
#' \item{\code{datetime}}{ date and time of acquisition in | |||
#' \code{POSIXct} format (available only when | |||
#' \code{in_format} = 'binary')} | |||
#' \item{\code{metadata}}{ \code{list} with information | |||
#' on instrument configuration (available only when | |||
#' \code{in_format} = 'binary')} | |||
#' \item{\code{absorbance}}{ a numeric \code{vector} | |||
#' of absorbance values} | |||
#' \item{\code{wavenumbers}}{ numeric \code{vector} | |||
#' of the band positions} | |||
#' } | |||
#' @author Antoine Stevens and Andrew Sila (soil.spec package) | |||
#' @note | |||
#' This is essentially a re-factored and simplified version of | |||
#' the \code{read.opus} function from the | |||
#' \sQuote{soil.spec} package for reading OPUS VERTEX files | |||
#' The function should also work for other OPUS files (eg alpha), | |||
#' see \code{read.opus}. | |||
#' @export | |||
readOPUS<- function(fnames, in_format = c("binary", "txt"), | |||
out_format = c("matrix", "list")) { | |||
# hexView and plyr are required | |||
wavenumbers <- NULL | |||
absorbance <- NULL | |||
in_format <- match.arg(in_format) | |||
out_format <- match.arg(out_format) | |||
spc <- vector("list", length(fnames)) | |||
i <- 1 | |||
for (file.name in fnames) { | |||
if (in_format == "binary") { | |||
spc[[i]] <- readOPUS_bin(file.name) | |||
} else { | |||
spc[[i]] <- readOPUS_text(file.name) | |||
} | |||
i <- i + 1 | |||
} | |||
names(spc) <- sub(".+/(.+)(\\.txt)?$", "\\1", fnames) | |||
if (out_format == "matrix") { | |||
test <- sapply(spc, function(x) class(x) != "character") | |||
# warning( | |||
# paste0(paste(names(spc)[!test], collapse = ","), | |||
# " do not exist") | |||
# ) | |||
spc <- spc[test] | |||
if(in_format == "binary"){ | |||
spc <- do.call(plyr::rbind.fill, lapply(spc, function(x){ | |||
x <- t(data.frame( | |||
wav = x$wavenumbers, absorbance = x$absorbance)) | |||
colnames(x) <- x[1,] | |||
data.frame(x[2, , drop = F], check.names = F)})) | |||
} else { | |||
spc <- do.call(plyr::rbind.fill, lapply(spc, function(x) { | |||
x <- t(x) | |||
colnames(x) <- x[1,] | |||
data.frame(x[2, , drop = F], check.names = F)})) | |||
} | |||
rownames(spc) <- sub(".+/(.+)(\\.txt)?$", "\\1", fnames) | |||
} | |||
return(spc) | |||
} |
@ -0,0 +1,466 @@ | |||
# Perform calibration sampling based on spectral PCA ------------ | |||
#' @title Split | |||
#' @description Perform calibration sampling based on | |||
#' the Kennard-Stones algorithm. | |||
#' @param spec_chem data.frame that contains chemical | |||
#' and IR spectroscopy data | |||
#' @param ratio_val Ratio of number of validation and all samples. | |||
#' @param pc Number of principal components (numeric) | |||
#' @param print logical expression weather calibration | |||
#' @param validation Logical expression weather | |||
#' calibration sampling is performed | |||
#' (\code{TRUE} or \code{FALSE}). | |||
#' @usage ken_stone(spec_chem, ratio_val, pc, print = TRUE, | |||
#' validation = TRUE) | |||
#' @export | |||
ken_stone <- function(spec_chem, ratio_val, pc, | |||
print = TRUE, validation = TRUE) { | |||
MIR <- model <- type <- PC1 <- PC2 <- NULL | |||
# Now with a real dataset | |||
# k = number of samples to select | |||
# pc = if provided, the number of principal components | |||
# (see ?kenStone) | |||
if(validation == TRUE) { | |||
# pc = 0.99 before !!! | |||
sel <- prospectr::kenStone(X = spec_chem$MIR, | |||
k = round(ratio_val * nrow(spec_chem)), pc = 2) | |||
sel$model # The row index of calibration samples | |||
# plot(sel$pc[, 1:2], xlab = 'PC1', ylab = 'PC2') | |||
# Points selected for calibration | |||
# points(sel$pc[sel$model, 1:2], pch = 19, col = 2) | |||
# Plot samples selected for calibration in ggplot | |||
sel_df_cal <- data.frame(sel$pc[- sel$model,1:2]) | |||
sel_df_cal$type <- as.factor( | |||
rep("calibration", nrow(sel_df_cal)) | |||
) | |||
sel_df_val <- data.frame(sel$pc[sel$model, 1:2]) | |||
sel_df_val$type <- as.factor( | |||
rep("validation", nrow(sel_df_val))) | |||
sel_df <- rbind(sel_df_cal, sel_df_val) | |||
# Compute ratio needed to make the figure square | |||
ratio <- with(sel_df, diff(range(PC1))/diff(range(PC2))) | |||
p_pc <- ggplot2::ggplot(data = sel_df) + | |||
ggplot2::geom_point( | |||
ggplot2::aes(x = PC1, y = PC2, shape = type), size = 4) + | |||
ggplot2::coord_fixed(ratio = 1) + | |||
ggplot2::scale_shape_manual(values=c(1, 19)) + | |||
ggplot2::scale_colour_manual(values=c("black", "red")) + | |||
ggplot2::theme_bw() | |||
# ggplot2::theme.user + | |||
ggplot2::theme(legend.title = ggplot2::element_blank()) | |||
# print(p_pc) | |||
# Split MIR data into calibration and validation set using | |||
# the results of Kennard-Stone Calibration Sampling | |||
# Selct by row index of calibration samples | |||
val_set <- spec_chem[sel$model, ] | |||
# Check number of observations (rows) for validation set | |||
nrow(val_set) | |||
cal_set <- spec_chem[- sel$model, ] | |||
list_out <- list( | |||
calibration = cal_set, | |||
validation = val_set, | |||
p_pc = p_pc | |||
) | |||
list_out | |||
# Check number of observations (rows) for calibration set | |||
# nrow(cal_set) | |||
} else { | |||
cal_set <- spec_chem | |||
list(calibration = cal_set) | |||
} | |||
} | |||
#' @title Perform model tuning | |||
#' @description Uses function from caret to to model tuning | |||
#' for PLS regression. | |||
#' @param x list from calibration sampling | |||
#' @param variable response variable for PLS regression, supplied | |||
#' as character expression | |||
#' @param validation Logical expression weather an independent | |||
#' validation is performed. | |||
#' @param env Environment where function is evaluated | |||
#' @export | |||
tune_model_q <- function(x, variable, | |||
env = parent.frame(), validation = TRUE) { | |||
calibration <- NULL | |||
# List of calibration and validation samples | |||
# set up a cross-validation scheme | |||
# create 10 folds that we will keep for the different | |||
# modeling approaches to allow comparison | |||
# randomly break the data into 10 partitions | |||
# note that k is the total number of samples for leave-one-out | |||
# use substitute function to make non-standard evaluation | |||
# of variable argument (looks at a function as argument, | |||
# sees code used to compute value; | |||
# see chapter 13.1 Capturing expressions | |||
# in Advanced R (Hadley Wickham) | |||
# !! p. 270 | |||
r <- eval(variable, x$calibration, env) | |||
idx <- caret::createFolds(y = r, k = 10, returnTrain = T) # update *** | |||
idx | |||
# inject the index in the trainControl object | |||
tr_control <- caret::trainControl(method = "cv", index = idx, | |||
savePredictions = T) | |||
if (validation == TRUE) { | |||
tr_control | |||
} else { | |||
tr_control | |||
} | |||
} | |||
#' @title Perform model tuning | |||
#' @description Uses function from caret to to model tuning | |||
#' for PLS regression. | |||
#' @param x list from calibration sampling | |||
#' @param variable response variable for PLS regression, supplied | |||
#' as character expression | |||
#' @param validation Logical expression weather an independent | |||
#' validation is performed. | |||
#' @param env Environment where function is evaluated | |||
#' @export | |||
tune_model <- function(x, variable, | |||
env = parent.frame(), validation = TRUE) { | |||
tune_model_q(x, substitute(variable), env) | |||
} | |||
# Fit a PLS regression model using the caret package ------------ | |||
#' @title Fit a PLS regression model | |||
#' (quoted version of the function) | |||
#' @description Uses the caret package to perform PLS modeling. | |||
#' Spectra are centered and scaled prior to modeling. | |||
#' @param x List that contains calibration | |||
#' set, validation set, and model tuning options | |||
#' @param validation Logical expression weather independent | |||
#' validation is performed | |||
#' @param variable Response variable to be modeled | |||
#' @param tr_control Object that defines controlling parameters | |||
#' of the desired internal validation framework | |||
#' @param env Environment where function is evaluated | |||
#' @export | |||
fit_pls_q <- function(x, validation = TRUE, | |||
variable, tr_control, env = parent.frame()) { | |||
# Fit a partial least square regression (pls) model | |||
# center and scale MIR (you can try without) | |||
calibration <- MIR <- NULL | |||
v <- eval(variable, x$calibration, env) | |||
if (validation == TRUE) { | |||
pls_model <- caret::train(x = x$calibration$MIR, y = v, | |||
method = "pls", | |||
tuneLength = 20, | |||
trControl = tr_control, | |||
preProcess = c("center", "scale") | |||
) | |||
} else { | |||
pls_model <- caret::train(x = x$calibration$MIR, y = v, | |||
method = "pls", | |||
tuneLength = 20, | |||
trControl = tr_control, | |||
preProcess = c("center", "scale") | |||
) | |||
} | |||
# Collect fitted object into a list | |||
# fitList_cal <- list(pls = fit_pls) | |||
# fitList_cal | |||
pls_model | |||
} | |||
#' @title Fit a PLS regression model | |||
#' @description Uses the caret package to perform PLS modeling. | |||
#' Spectra are centered and scaled prior to modeling. | |||
#' @param x List that contains calibration | |||
#' set, validation set, and model tuning options | |||
#' @param validation Logical expression weather independent | |||
#' validation is performed | |||
#' @param variable Response variable to be modeled | |||
#' @param env Environment where function is evaluated | |||
#' @export | |||
fit_pls <- function(x, validation = TRUE, | |||
variable, env = parent.frame()) { | |||
fit_pls_q(x = x, validation = TRUE, | |||
variable = substitute(variable), env | |||
) | |||
} | |||
# Evaluate PLS performance (validation and cross-validation) ---- | |||
#' @title Evaluate PLS performance | |||
#' @description Calculate model performance indices based | |||
#' on observed and predicted values of validation and calibration | |||
#' set, and internal cross-validation | |||
#' @param x List that contains calibration and validation data | |||
#' frame with combined spectral and chemical data | |||
#' @param pls_model List with PLS regression model output from | |||
#' the caret package | |||
#' @param variable Response variable (e.g. chemical property) to be | |||
#' modelled (needs to be non-quoted expression). \code{variable} | |||
#' needs to be a column name in the \code{validation} data.frame | |||
#' (element of \code{x}) | |||
#' @param validation Logical expression if independent validation | |||
#' is performed (split data set into calibration set and | |||
#' validation set) | |||
#' @param print Print observed vs. predicted for calibration | |||
#' and validation. Default is \code{TRUE}. | |||
#' @param env Specifiy the environment in which the function is | |||
#' called. Default argument of \code{env} is | |||
#' \code{parent.frame()} | |||
#' @export | |||
evaluate_pls_q <- function(x, pls_model, variable, | |||
validation = TRUE, print = TRUE, env = parent.frame()) { | |||
# Set global variables to NULL to avoid R CMD check notes | |||
MIR <- object <- model <- dataType <- obs <- pred <- NULL | |||
ncomp <- finalModel <- rmsd <- r2 <- r2 <- rpd <- n <- NULL | |||
rmse <- calibration <- NULL | |||
# Collect fitted object into a list | |||
list_models <- list(pls = pls_model) | |||
# Extract best tuning parameters and associated cv predictions | |||
if(validation == TRUE) { | |||
predobs_cal <- plyr::ldply(list_models, | |||
function(x) plyr::match_df(x$pred, x$bestTune), | |||
.id = "model" | |||
) | |||
# Calculate training (calibration) and test (validation) data | |||
# predictions based on pls model with calibration data | |||
v <- eval(variable, x$validation, env) | |||
predobs_val <- caret::extractPrediction(list_models, | |||
testX = x$validation$MIR, testY = v) # update *** | |||
# Create new data frame column <object> | |||
predobs_val$object <- predobs_val$model | |||
# Replace levels "Training" and "Test" in dataType column | |||
# by "Calibration" and "Validation" (rename levels of factor) | |||
predobs_val$dataType <- plyr::revalue(predobs_val$dataType, | |||
c("Test" = "Validation", "Training" = "Calibration") | |||
) | |||
# Change the order of rows in the data frame | |||
# Calibration as first level (show Calibration in ggplot graph | |||
# on left panel) | |||
predobs_val$dataType <- factor(predobs_val$dataType, | |||
levels = c("Calibration", "Validation")) | |||
# Calculate model performance indexes by model and dataType | |||
# uses package plyr and function summary.df of SPECmisc.R | |||
stats <- plyr::ddply(predobs_val, c("model", "dataType"), | |||
function(x) summary_df(x, "obs", "pred") | |||
) | |||
} else { | |||
# Extract best tuning parameters and associated cv predictions | |||
predobs_cv <- plyr::ldply(list_models, | |||
function(x) plyr::match_df(x$pred, x$bestTune), | |||
.id = "model" | |||
) | |||
# Extract auto-prediction | |||
predobs <- caret::extractPrediction(list_models) | |||
predobs_cv$object <- predobs_cv$model | |||
predobs_cv$dataType <- "Cross-validation" | |||
predobs_cv <- dplyr::select( | |||
predobs_cv, obs, pred, model, dataType, object | |||
) | |||
predobs_val <- rbind(predobs, predobs_cv) | |||
stats <- plyr::ddply(predobs_val, c("model", "dataType"), | |||
function(x) summary_df(x, "obs", "pred") | |||
) | |||
} | |||
# Add number of components to stats; from finalModel list item | |||
# from train() function output (function from caret package) | |||
stats$ncomp <- rep(pls_model$finalModel$ncomp, nrow(stats)) | |||
# Add range of observed values for validation and calibraton | |||
# get range from predicted vs. observed data frame | |||
# stored in object predobs | |||
obs_cal <- subset(predobs_val, dataType == "Calibration")$obs | |||
obs_val <- subset(predobs_val, dataType == "Validation")$obs | |||
# Get name of predicted variable; see p. 261 of book | |||
# "Advanced R" (Hadley Wickham) | |||
variable_name <- deparse(variable) | |||
# before: deparse(substitute(variable)) | |||
df_range <- data.frame( | |||
variable = rep(variable_name, 2), | |||
dataType = c("Calibration", "Validation"), | |||
min_obs = c(range(obs_cal)[1], range(obs_val)[1]), | |||
median_obs = c(median(obs_cal), median(obs_val)), | |||
max_obs = c(range(obs_cal)[2], range(obs_val)[2]), | |||
mean_obs = c(mean(obs_cal), mean(obs_val)), | |||
CV = c(sd(obs_cal) / mean(obs_cal) * 100, | |||
sd(obs_val) / mean(obs_val) * 100) | |||
) | |||
# Join stats with range data frame (df_range) | |||
stats <- plyr::join(stats, df_range, type = "inner") | |||
annotation <- plyr::mutate(stats, | |||
rmse = as.character(as.expression(paste0("RMSE == ", | |||
round(rmsd, 2)))), | |||
r2 = as.character(as.expression(paste0("italic(R)^2 == ", | |||
round(r2, 2)))), | |||
rpd = as.character(as.expression(paste("RPD == ", | |||
round(rpd, 2)))), | |||
n = as.character(as.expression(paste0("italic(n) == ", n))), | |||
ncomp = as.character(as.expression(paste0("ncomp = ", | |||
ncomp))) | |||
) | |||
# Plot predicted vs. observed values and model indexes | |||
# update label, xlim, and ylim *** | |||
# Add label number of samples to facet_grid using a | |||
# labeling function | |||
# ! Update labeller API: | |||
# https://github.com/hadley/ggplot2/commit/ef33dc7 | |||
# http://sahirbhatnagar.com/facet_wrap_labels | |||
# Prepare lookup character vector | |||
make_label <- function(x, validation = TRUE) { | |||
dataType <- n <- NULL | |||
if (validation == TRUE) { | |||
c(`Calibration` = paste0("Calibration", "~(", | |||
x[x$dataType == "Calibration", ]$n, ")" | |||
), | |||
`Validation` = paste0("Validation", "~(", | |||
x[x$dataType == "Validation", ]$n, ")" | |||
) | |||
) | |||
} else{ | |||
c(`Calibration` = paste0("Calibration", "~(", | |||
x[x$dataType == "Calibration", ]$n, ")" | |||
), | |||
`Cross-Validation` = paste0("Cross-Validation", "~(", | |||
x[x$dataType == "Cross-Validation", ]$n, ")" | |||
) | |||
) | |||
} | |||
} | |||
if (validation == TRUE) { | |||
label_validation <- make_label(x = annotation, | |||
validation = TRUE | |||
) | |||
} else { | |||
label_validation <- make_label(x = annotation, | |||
validation = FALSE | |||
) | |||
} | |||
# Rename labels on the fly with a lookup character vector | |||
to_string <- ggplot2::as_labeller( | |||
x = label_validation, ggplot2::label_parsed | |||
) | |||
# ------------------------------------------------------------- | |||
# http://docs.ggplot2.org/0.9.3.1/label_parsed.html | |||
# some other info: https://coderclub.b.uib.no/tag/plotmath/ | |||
# !!! now depreciated in ggplot2 >= 2.0.0 | |||
# dataType_labeller <- function(variable, value){ | |||
# new <- paste0(dataType_names[value], "~(", annotation$n, ")") | |||
# plyr::llply(as.character(new), function(x) parse(text = x)) | |||
# } | |||
p_pred_obs <- ggplot2::ggplot(data = predobs_val) + | |||
ggplot2::geom_point(ggplot2::aes(x = obs, y = pred), | |||
shape = 1, size = 4) + | |||
ggplot2::geom_text(data = annotation, | |||
ggplot2::aes(x = -Inf, y = Inf, label = r2), size = 7, | |||
hjust = -0.1, vjust = 1.5, parse = TRUE) + | |||
ggplot2::geom_text(data = annotation, | |||
ggplot2::aes(x = -Inf, y = Inf, label = rmse), size = 7, | |||
hjust = -0.075, vjust = 4.25, parse = TRUE) + | |||
ggplot2::geom_text(data = annotation, | |||
ggplot2::aes(x = -Inf, y = Inf, label = rpd), size = 7, | |||
hjust = -0.1, vjust = 6.5, parse = TRUE) + | |||
ggplot2::facet_grid(~ dataType, | |||
labeller =ggplot2::as_labeller(to_string)) + | |||
# ggplot2::facet_grid(~ dataType, | |||
# labeller = dataType_labeller) + | |||
ggplot2::theme_bw() + | |||
ggplot2::geom_abline(col = "red") + | |||
ggplot2::labs(x = "Observed", y = "Predicted") + | |||
ggplot2::xlim(c(min(predobs_val$obs) - | |||
0.05 * diff(range(predobs_val$obs)), | |||
max(predobs_val$obs) + | |||
0.05 * diff(range(predobs_val$obs)))) + | |||
ggplot2::ylim(c(min(predobs_val$obs) - | |||
0.05 * diff(range(predobs_val$obs)), | |||
max(predobs_val$obs) + | |||
0.05 * diff(range(predobs_val$obs)))) # + | |||
# theme.user | |||
## ggplot graph for model comparison | |||
## (arranged later in panels) | |||
x_label <- paste0("Observed ", | |||
as.character(variable_name)) | |||
y_label <- paste0("Predicted ", | |||
as.character(variable_name)) | |||
p_model <- ggplot2::ggplot(data = predobs_val) + | |||
ggplot2::geom_point(ggplot2::aes(x = obs, y = pred), | |||
shape = 1, size = 2, alpha = 1/2) + | |||
ggplot2::geom_text(data = annotation, | |||
ggplot2::aes(x = Inf, y = -Inf, label = r2), size = 3, | |||
hjust = 1.15, vjust = -3, parse = TRUE) + | |||
ggplot2::geom_text(data = annotation, | |||
ggplot2::aes(x = Inf, y = -Inf, label = rmse), size = 3, | |||
hjust = 1.12, vjust = -2.5, parse = TRUE) + | |||
ggplot2::geom_text(data = annotation, | |||
ggplot2::aes(x = Inf, y = -Inf, label = rpd), size = 3, | |||
hjust = 1.15, vjust = -1.25, parse = TRUE) + | |||
ggplot2::facet_grid(~ dataType, | |||
labeller = ggplot2::as_labeller(to_string)) + | |||
# ggplot2::facet_grid(~ dataType, | |||
# labeller = dataType_labeller) + | |||
ggplot2::theme_bw() + | |||
ggplot2::geom_abline(col = "red") + | |||
ggplot2::labs(x = x_label, y = y_label) + | |||
ggplot2::xlim(c(min(predobs_val$obs) - | |||
0.05 * diff(range(predobs_val$obs)), | |||
max(predobs_val$obs) + | |||
0.05 * diff(range(predobs_val$obs)))) + | |||
ggplot2::ylim(c(min(predobs_val$obs) - | |||
0.05 * diff(range(predobs_val$obs)), | |||
max(predobs_val$obs) + | |||
0.05 * diff(range(predobs_val$obs)))) + | |||
ggplot2::coord_fixed() | |||
if(print == TRUE) { | |||
print(p_model) | |||
} | |||
list(stats = stats, p_model = p_model) | |||
} | |||
## PLS regression modeling in one function ====================== | |||
#' @title Calibration sampling, model tuning, and PLS regression | |||
#' @description Perform calibration sampling and use selected | |||
#' calibration set for model tuning | |||
#' @param spec_chem data.frame that contains IR spectroscopy | |||
#' and chemical data | |||
#' @param k Number of validation samples | |||
#' @param pc Number of Principal Components used for Calibration | |||
#' sampling (Kennard-Stones algorithm) | |||
#' @param ratio_val Ratio of number of validation and all samples. | |||
#' @param print Logical expression weather graphs shall be printed | |||
#' @param validation Logical expression weather independent | |||
#' validation is performed | |||
#' @param variable Response variable (without quotes) | |||
#' @param env Environment where function is evaluated | |||
#' @export | |||
# Note: check non standard evaluation, argument passing... | |||
pls_ken_stone <- function(spec_chem, ratio_val, pc, | |||
print = TRUE, validation = TRUE, variable, | |||
env = parent.frame()) { | |||
calibration <- 0 | |||
# Calibration sampling | |||
list_sampled <- ken_stone( | |||
spec_chem, ratio_val = ratio_val, pc = 2, validation = TRUE | |||
) | |||
tr_control <- tune_model_q(list_sampled, | |||
substitute(variable), env | |||
) | |||
pls <- fit_pls_q(x = list_sampled, validation = TRUE, | |||
variable = substitute(variable), tr_control = tr_control, env | |||
) | |||
stats <- evaluate_pls_q(x = list_sampled, pls_model = pls, | |||
variable = substitute(variable), env = parent.frame() | |||
) | |||
list(data = list_sampled, p_pc = list_sampled$p_pc, | |||
pls_model = pls, stats = stats$stats, p_model = stats$p_model) | |||
} | |||
@ -0,0 +1,42 @@ | |||
#' @title Predict soil properties of new spectra based on calibration models | |||
#' @description | |||
#' Function that uses pre-processed spectra, additional metadata of new | |||
#' samples, and caret model output for the different soil property models | |||
#' to create predicted values. | |||
#' @param model_list List that contains caret output objects | |||
#' of the different calibration models to predict (one model per soil property) | |||
#' @param spectra_list List that contains spectra and additional data | |||
#' after pre-processing (\code{do_pretreatment()}including metadata | |||
#' (\code{sample_ID}) | |||
#' @usage predict_from_spectra(model_list, spectra_list) | |||
#' @export | |||
predict_from_spectra <- function(model_list, spectra_list) { | |||
# Use extractPrediction function (caret) and supply model_list that contains | |||
# caret calibration outputs; use pre-processed spectra dataset (list | |||
# resulting from do_pretreatment()) | |||
predictions_caret <- caret::extractPrediction( | |||
models_prediction, | |||
unkX = soilspec_test$MIR0 | |||
) | |||
# Convert data.frame into long form; one sample should be represented by | |||
# one single row and the predicted values of soil properties should be | |||
# in the different columns | |||
# Use the tidyr::spread() function (from tidyr packge) | |||
# to gather columns into rows | |||
# Add sample_ID column to uniquely identify observations | |||
# Number of caret model objects used to predict | |||
n <- length(unique(predictions_caret$object)) | |||
# Add sample_ID from metadata of spectra to predicted values | |||
sample_ID <- spectra_list$data_meta$ID | |||
# Repeat meta_data for each of the additional model rows and add | |||
# ID column to long form data frame | |||
id <- rep(sample_ID, n) | |||
predictions_metadata <- cbind(predictions_caret, sample_ID = id) | |||
# Get data into wide form | |||
predictions_wide <- tidyr::spread( | |||
data = predictions_metadata, key = "object", value = "pred" | |||
) | |||
} |
@ -0,0 +1,51 @@ | |||
#' @title Preprocess spectra | |||
#' @description Use commonly used preprocessing algorithms on | |||
#' the spectra. | |||
#' @param list_spectra List that contains averaged spectra | |||
#' in the list element called \code{MIR_mean} | |||
#' @param select Character string that specifies the predefined | |||
#' pretreatment options. Possible arguments are: | |||
#' \code{select = "MIR0"} for Savitzky Golay smoothing filter | |||
#' without derivative, \code{select = "MIR1"} for Savitky Golay | |||
#' with first derivative, \code{select = "MIR2"} for Savitzky | |||
#' Golay with second derivative, \code{select = "MIR0_snv"} | |||
#' for Standard Normal Variate after Savitzky Golay without | |||
#' derivative, and \code{select = "MIRb"} for | |||
#' baseline correction. | |||
#' @usage do_pretreatment(list_spectra, select) | |||
#' @return list_spectra: List that contains preprocessed | |||
#' spectra in element \code{MIR0} | |||
#' @import hyperSpec | |||
#' @export | |||
do_pretreatment <- function(list_spectra, select) { | |||
MIR_mean <- NULL | |||
MIR_raw <- list_spectra$MIR_mean | |||
# Filter the data using the Savitzky and Golay smoothing filter | |||
# with a window size of 5 spectral variables and | |||
# a polynomial order of 3 (no differentiation) | |||
# p = polynomial order; plot variance vs polynomial order? | |||
# w = window size (must be odd) | |||
# m = m-th derivative of the polynomial coefficients | |||
# (0 = smoothing) | |||
MIR0 <- prospectr::savitzkyGolay(X = list_spectra$MIR_mean, | |||
m = 0, p = 3, w = 9) # smoothing and averaging | |||
MIR1 <- prospectr::savitzkyGolay(X = list_spectra$MIR_mean, | |||
m = 1, p = 3, w = 5) # first derivative *** | |||
MIR2 <- prospectr::savitzkyGolay(X = list_spectra$MIR_mean, | |||
m = 2, p = 3, w = 5) # second derivative *** | |||
# Calculate standard normal variate (SNV) after smoothing | |||
MIR0_snv <- prospectr::standardNormalVariate(MIR0) | |||
MIR1_snv <- prospectr::standardNormalVariate(MIR1) # added 2016-08-05 | |||
# Baseline correction | |||
# Compute baseline but first, create hyperSpec obj | |||
spc <- new("hyperSpec", spc = as.matrix(list_spectra$MIR_mean), | |||
wavelength = as.numeric(colnames(list_spectra$MIR_mean))) | |||
below <- hyperSpec::spc.fit.poly.below( | |||
fit.to = spc[, , 4000 ~ 900], | |||
apply.to = spc, npts.min = 20, poly.order = 2) | |||
spc_corr <- spc - below | |||
MIRb <- spc_corr[[]] | |||
pre <- select | |||
list_spectra$MIR0 <- get(pre) | |||
return(list_spectra) | |||
} |
@ -0,0 +1,162 @@ | |||
#' @title Remove outlier spectra | |||
#' @description Remove outlier spectra based on the | |||
#' \code{pcout()} function of the \code{mvoutlier} package. | |||
#' @usage remove_outliers(list_spectra, remove = TRUE) | |||
#' @param list_spectra List that contains averaged | |||
#' spectral information | |||
#' in list element \code{MIR_mean} (data.frame) and metadata in | |||
#' \code{data_meta} (data.frame). | |||
#' @param remove logical expression (\code{TRUE} or \code{FALSE}) | |||
#' that specifies weather spectra shall be removed. | |||
#' If \code{rm = FALSE}, there will be no outlier removal | |||
#' @return Returns list \code{spectra_out} that contains: | |||
#' \itemize{ | |||
#' \item \code{MIR_mean}: Outlier removed MIR spectra as | |||
#' data.frame object. If \code{remove = FALSE}, | |||
#' the function will | |||
#' return almost identical list identical to \code{list_spectra}, | |||
#' except that the first \code{indices} column of the spectral | |||
#' data frame \code{MIR_mean} is removed | |||
#' (This is done for both options | |||
#' \code{remove = TRUE} and \code{remove = FALSE}). | |||
#' \item \code{data_meta}: metadata data.frame, identical | |||
#' as in the \code{list_spectra} input list. | |||
#' \item \code{plot_out}: (optional) ggplot2 graph | |||
#' that shows all spectra (absorbance on x-axis and wavenumber | |||
#' on y-axis) with outlier marked, if | |||
#' \code{remove = TRUE}. | |||
#' } | |||
#' @details This is an optional function if one wants to remove | |||
#' outliers. | |||
#' @export | |||
remove_outliers <- function(list_spectra, remove = TRUE) { | |||
# Outlier detection | |||
# Use the mvoutlier package and pcout function to identify | |||
# multivariate outliers | |||
wfinal01 <- ID <- NULL | |||
if (remove == TRUE) { | |||
# Remove the 'indices' column | |||
list_spectra$MIR_mean <- list_spectra$MIR_mean[, -1] | |||
out <- mvoutlier::pcout(list_spectra$MIR_mean, makeplot = T, | |||
outbound = 0.05) # parameters should be adapted | |||
# Plot outlying spectra | |||
plot_out <- plotMIR( | |||
list_spectra$MIR_mean[ | |||
order(out$wfinal01, decreasing = T), ], | |||
col = as.factor(out$wfinal01[order(out$wfinal01, | |||
decreasing = T)])) + | |||
ggplot2::scale_colour_brewer("outlier", palette = "Set1") | |||
out_id <- as.character( | |||
list_spectra$data_meta$ID[!as.logical(out$wfinal01)] | |||
) | |||
# Remove outliers | |||
MIR_mean <- list_spectra$MIR_mean[ | |||
! list_spectra$data_meta$ID %in% out_id, ] | |||
# rep ID and country name | |||
data_meta <- list_spectra$data_meta[ | |||
! list_spectra$data_meta$ID %in% out_id, ] | |||
spectra_out <- list(MIR_mean = MIR_mean, | |||
data_meta = data_meta, | |||
plot_out = plot_out) | |||
} else { | |||
# Remove the 'indices' column | |||
list_spectra$MIR_mean <- list_spectra$MIR_mean[, -1] | |||
spectra_out <- list(MIR_mean = list_spectra$MIR_mean, | |||
data_meta = list_spectra$data_meta) | |||
} | |||
spectra_out | |||
} | |||
## plotMIR function of Antoine Stevens; don't export this | |||
## function to the NAMESPACE | |||
plotMIR <- function(spc, group = NULL, col = NULL, | |||
linetype = NULL, wr = NULL, brk = NULL, | |||
ylab = "Absorbance", xlab = "Wavenumber /cm-1", | |||
by = NULL, by.wrap = T, ...){ | |||
# Function to plot spectra, based on the ggplot2 package | |||
# spc = spectral matrix, with colnames = wavelengths | |||
# group = grouping variable, usually the id's of the sample | |||
# wr = wavelength range to plot | |||
# brk = breaks of the x-axis | |||
# by = factor variable for which the mean and sd of | |||
# each level will be computed and plotted (optional) | |||
# Requires packages ggplot2; data.table; reshape2 | |||
# Workaround to pass R CMD check: | |||
# http://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when | |||
# Setting the variables to NULL first | |||
variable <- value <- colour <- NULL | |||
spc <- as.data.frame(spc) | |||
if (!is.null(wr)) | |||
spc <- spc[, as.numeric(colnames(spc)) >= min(wr) & | |||
as.numeric(colnames(spc)) <= max(wr)] | |||
if (is.null(brk)) | |||
brk <- pretty(as.numeric(colnames(spc)), n = 10) | |||
if (!is.null(by)) { | |||
spc$by <- by | |||
spc <- data.table::data.table(spc, check.names = F) | |||
mean.spc <- reshape2::melt( | |||
spc[, lapply(data.table::.SD, mean), by = by], | |||
id.vars = "by" | |||
) | |||
sd.spc <- reshape2::melt( | |||
spc[, lapply(data.table::.SD, sd), by = by], | |||
id.vars = "by" | |||
) | |||
mean.spc$min <- mean.spc$value - sd.spc$value | |||
mean.spc$max <- mean.spc$value + sd.spc$value | |||
mean.spc$variable <- as.numeric( | |||
as.character(mean.spc$variable) | |||
) | |||
if (by.wrap) { | |||
p <- ggplot2::ggplot(data = mean.spc) + | |||
ggplot2::geom_ribbon( | |||
ggplot2::aes(x = variable, ymin = min, ymax = max), | |||
fill = "grey", col = "black", size = 0.15) + | |||
ggplot2::theme_bw() | |||
p <- p + ggplot2::geom_line( | |||
ggplot2::aes(x = variable, y = value), | |||
size = 0.25) + | |||
ggplot2::facet_wrap(~ by) + | |||
ggplot2::labs(x = xlab, y = ylab) + | |||
ggplot2::scale_x_reverse(breaks = brk) | |||
} else { | |||
p <- ggplot2::ggplot(data = mean.spc, | |||
ggplot2::aes(x = variable, y = value, group = by, col = by)) + | |||
ggplot2::geom_line(size = 0.25) + | |||
ggplot2::labs(x = xlab, y = ylab) + | |||
ggplot2::scale_x_reverse(breaks = brk) + | |||
ggplot2::theme_bw() | |||
} | |||
return(p) | |||
} else { | |||
if (is.null(group)) | |||
group <- as.character(1:nrow(spc)) | |||
spc$group <- group | |||
spc$colour <- col | |||
spc$linetype <- linetype | |||
id.var <- colnames(spc)[ | |||
grep("group|colour|linetype",colnames(spc))] | |||
tmp <- reshape2::melt(spc, id.var = id.var) | |||
tmp$variable <- as.numeric(as.character(tmp$variable)) | |||
p <- ggplot2::ggplot(tmp, | |||
ggplot2::aes(variable, value, group = group)) + | |||
ggplot2::labs(x = xlab, y = ylab) + | |||
ggplot2::theme_bw() + | |||
ggplot2::scale_x_reverse(breaks = brk) | |||
if (is.null(col) & is.null(linetype)) | |||
p <- p + ggplot2::geom_line( | |||
ggplot2::aes(colour = group)) | |||
else if (!is.null(col) & is.null(linetype)) | |||
p <- p + ggplot2::geom_line( | |||
ggplot2::aes(colour = colour)) | |||
else if (is.null(col) & !is.null(linetype)) | |||
p <- p + ggplot2::geom_line( | |||
ggplot2::aes(colour = group, | |||
linetype = linetype)) | |||
else p <- p + ggplot2::geom_line( | |||
ggplot2::aes(colour = colour, | |||
linetype = linetype)) | |||
return(p) | |||
} | |||
} | |||
@ -0,0 +1,19 @@ | |||
#' @title Resample spectra stored to new | |||
#' @description Calculates model statistics for predicted (y) | |||
#' vs. observed (y) values | |||
#' @param list_spectra List of spectra and metadata | |||
#' @param wn_lower Numerical value for lowest wavenumber in sampling interval | |||
#' @param wn_upper Numerical value for highest wavenumber in sampling interval | |||
#' @export | |||
resample_spectra <- function( | |||
list_spectra, wn_lower = 510, wn_upper = 3988, wn_interval = 2) | |||
{ | |||
# Create sequence of new wavenumbers | |||
wn_seq <- rev(seq(from = wn_lower, wn_upper, by = wn_interval)) | |||
list_spectra$MIR0 <- prospectr::resample( | |||
X = list_spectra$MIR_mean, # spectral matrix to resample | |||
wav = as.numeric(colnames(list_spectra$MIR_mean)), # old wavenumbers | |||
new.wav = wn_seq # new wavenumbers | |||
) | |||
return(list_spectra) | |||
} |
@ -0,0 +1,25 @@ | |||
#' @title Calculate model statistics | |||
#' @description Calculates model statistics for predicted (y) | |||
#' vs. observed (y) values | |||
#' @param df data.frame with predicted and observed data | |||
#' @param x column with observed values | |||
#' @param y column with predicted values | |||
#' @export | |||
summary_df <- function(df, x, y){ | |||
x <- df[, x] | |||
y <- df[, y] | |||
data.frame(rmse = sqrt(sum((x - y)^2, na.rm = T) / (length(x)-1)), | |||
rmsd = mean((x - y)^2)^.5, | |||
sdev = sd(x, na.rm = T), | |||
rpd = sd(x,na.rm = T) / | |||
sqrt(sum((x - y)^2, na.rm = T) / (length(x) - 1)), | |||
rpiq = (quantile(x, .75, na.rm = T) - quantile(x, .25, na.rm = T)) / | |||
sqrt(sum((x - y)^2, na.rm = T) / (length(x) - 1)), | |||
r2 = cor(x, y, use = "pairwise.complete.obs")^2, | |||
bias = mean(x, na.rm = T) - mean(y, na.rm = T), | |||
SB = (mean(x, na.rm = T) - mean(y, na.rm = T))^2, | |||
NU = var(x, na.rm = T) * (1 - lm(y ~ x)$coefficients[2])^2, | |||
LC = var(y, na.rm = T) * | |||
(1 - cor(x, y, use = "pairwise.complete.obs")^2), | |||
n = length(x)) | |||
} |
@ -0,0 +1,41 @@ | |||
% Generated by roxygen2: do not edit by hand | |||
% Please edit documentation in R/average-spectra.R | |||
\name{average_spectra} | |||
\alias{average_spectra} | |||
\title{Calculate mean of spectra} | |||
\usage{ | |||
average_spectra(in_spectra) | |||
} | |||
\arguments{ | |||
\item{in_spectra}{List that contains spectral data in the | |||
element \code{MIR} (data.frame) and sample metadata in the | |||
list element \code{data_rep} (data.frame). | |||
The data.frame \code{data_meta} | |||
contains the sample ID stored in the \code{ID} | |||
vector (originally from spectral file names), | |||
country abbreviation stored in \code{contry} (2 letters), | |||
and the vector \code{site} (2 letters) that is the country | |||
abbreviation.} | |||
} | |||
\value{ | |||
\code{out_spectra}: List that contains: | |||
\itemize{ | |||
\item \code{data_meta}: metadata of sample (data.frame) | |||
that is | |||
taken from the element \code{rep} of the input list argument | |||
\code{in_spectra} | |||
\item \code{MIR_mean}: average spectra from replicates of | |||
sample ID | |||
(data.frame) | |||
\item \code{MIR_sd}: standard deviation of spectra calculated | |||
from replicates of sample ID (data.frame) | |||
\item \code{cvar} coefficient of variance over all | |||
wavenumbers of spectra | |||
calculated from replicates of sample ID (vector) | |||
} | |||
} | |||
\description{ | |||
Calculate the mean of each spectral repetitions | |||
(absorbance average per wavenumber) | |||
} | |||
@ -0,0 +1,31 @@ | |||
% Generated by roxygen2: do not edit by hand | |||
% Please edit documentation in R/pretreat-spectra.R | |||
\name{do_pretreatment} | |||
\alias{do_pretreatment} | |||
\title{Preprocess spectra} | |||
\usage{ | |||
do_pretreatment(list_spectra, select) | |||
} | |||
\arguments{ | |||
\item{list_spectra}{List that contains averaged spectra | |||
in the list element called \code{MIR_mean}} | |||
\item{select}{Character string that specifies the predefined | |||
pretreatment options. Possible arguments are: | |||
\code{select = "MIR0"} for Savitzky Golay smoothing filter | |||
without derivative, \code{select = "MIR1"} for Savitky Golay | |||
with first derivative, \code{select = "MIR2"} for Savitzky | |||
Golay with second derivative, \code{select = "MIR0_snv"} | |||
for Standard Normal Variate after Savitzky Golay without | |||
derivative, and \code{select = "MIRb"} for | |||
baseline correction.} | |||
} | |||
\value{ | |||
list_spectra: List that contains preprocessed | |||
spectra in element \code{MIR0} | |||
} | |||
\description{ | |||
Use commonly used preprocessing algorithms on | |||
the spectra. | |||
} | |||
@ -0,0 +1,38 @@ | |||
% Generated by roxygen2: do not edit by hand | |||
% Please edit documentation in R/pls-modeling.R | |||
\name{evaluate_pls_q} | |||
\alias{evaluate_pls_q} | |||
\title{Evaluate PLS performance} | |||
\usage{ | |||
evaluate_pls_q(x, pls_model, variable, validation = TRUE, print = TRUE, | |||
env = parent.frame()) | |||
} | |||
\arguments{ | |||
\item{x}{List that contains calibration and validation data | |||
frame with combined spectral and chemical data} | |||
\item{pls_model}{List with PLS regression model output from | |||
the caret package} | |||
\item{variable}{Response variable (e.g. chemical property) to be | |||
modelled (needs to be non-quoted expression). \code{variable} | |||
needs to be a column name in the \code{validation} data.frame | |||
(element of \code{x})} | |||
\item{validation}{Logical expression if independent validation | |||
is performed (split data set into calibration set and | |||
validation set)} | |||
\item{print}{Print observed vs. predicted for calibration | |||
and validation. Default is \code{TRUE}.} | |||
\item{env}{Specifiy the environment in which the function is | |||
called. Default argument of \code{env} is | |||
\code{parent.frame()}} | |||
} | |||
\description{ | |||
Calculate model performance indices based | |||
on observed and predicted values of validation and calibration | |||
set, and internal cross-validation | |||
} | |||
@ -0,0 +1,24 @@ | |||
% Generated by roxygen2: do not edit by hand | |||
% Please edit documentation in R/pls-modeling.R | |||
\name{fit_pls} | |||
\alias{fit_pls} | |||
\title{Fit a PLS regression model} | |||
\usage{ | |||
fit_pls(x, validation = TRUE, variable, env = parent.frame()) | |||
} | |||
\arguments{ | |||
\item{x}{List that contains calibration | |||
set, validation set, and model tuning options} | |||
\item{validation}{Logical expression weather independent | |||
validation is performed} | |||
\item{variable}{Response variable to be modeled} | |||
\item{env}{Environment where function is evaluated} | |||
} | |||
\description{ | |||
Uses the caret package to perform PLS modeling. | |||
Spectra are centered and scaled prior to modeling. | |||
} | |||
@ -0,0 +1,28 @@ | |||
% Generated by roxygen2: do not edit by hand | |||
% Please edit documentation in R/pls-modeling.R | |||
\name{fit_pls_q} | |||
\alias{fit_pls_q} | |||
\title{Fit a PLS regression model | |||
(quoted version of the function)} | |||
\usage{ | |||
fit_pls_q(x, validation = TRUE, variable, tr_control, env = parent.frame()) | |||
} | |||
\arguments{ | |||
\item{x}{List that contains calibration | |||
set, validation set, and model tuning options} | |||
\item{validation}{Logical expression weather independent | |||
validation is performed} | |||
\item{variable}{Response variable to be modeled} | |||
\item{tr_control}{Object that defines controlling parameters | |||
of the desired internal validation framework} | |||
\item{env}{Environment where function is evaluated} | |||
} | |||
\description{ | |||
Uses the caret package to perform PLS modeling. | |||
Spectra are centered and scaled prior to modeling. | |||
} | |||
@ -0,0 +1,24 @@ | |||
% Generated by roxygen2: do not edit by hand | |||
% Please edit documentation in R/join-chem-spectra.R | |||
\name{join_chem_spec} | |||
\alias{join_chem_spec} | |||
\title{Join chemical and spectral data frames} | |||
\usage{ | |||
join_chem_spec(dat_chem, dat_spec, by = "sample_ID") | |||
} | |||
\arguments{ | |||
\item{dat_chem}{data.frame that contains chemical values of | |||
the sample} | |||
\item{dat_spec}{List that contains spectral data} | |||
\item{by}{character of column name that defines sample_ID} | |||
} | |||
\value{ | |||
List: xxx | |||
} | |||
\description{ | |||
Combines spectral data (data.frame) and chemical | |||
data (data.frame). | |||
} | |||
@ -0,0 +1,28 @@ | |||
% Generated by roxygen2: do not edit by hand | |||
% Please edit documentation in R/pls-modeling.R | |||
\name{ken_stone} | |||
\alias{ken_stone} | |||
\title{Split} | |||
\usage{ | |||
ken_stone(spec_chem, ratio_val, pc, print = TRUE, | |||
validation = TRUE) | |||
} | |||
\arguments{ | |||
\item{spec_chem}{data.frame that contains chemical | |||
and IR spectroscopy data} | |||
\item{ratio_val}{Ratio of number of validation and all samples.} | |||
\item{pc}{Number of principal components (numeric)} | |||
\item{print}{logical expression weather calibration} | |||
\item{validation}{Logical expression weather | |||
calibration sampling is performed | |||
(\code{TRUE} or \code{FALSE}).} | |||
} | |||
\description{ | |||
Perform calibration sampling based on | |||
the Kennard-Stones algorithm. | |||
} | |||
@ -0,0 +1,34 @@ | |||
% Generated by roxygen2: do not edit by hand | |||
% Please edit documentation in R/pls-modeling.R | |||
\name{pls_ken_stone} | |||
\alias{pls_ken_stone} | |||
\title{Calibration sampling, model tuning, and PLS regression} | |||
\usage{ | |||
pls_ken_stone(spec_chem, ratio_val, pc, print = TRUE, validation = TRUE, | |||
variable, env = parent.frame()) | |||
} | |||
\arguments{ | |||
\item{spec_chem}{data.frame that contains IR spectroscopy | |||
and chemical data} | |||
\item{ratio_val}{Ratio of number of validation and all samples.} | |||
\item{pc}{Number of Principal Components used for Calibration | |||
sampling (Kennard-Stones algorithm)} | |||
\item{print}{Logical expression weather graphs shall be printed} | |||
\item{validation}{Logical expression weather independent | |||
validation is performed} | |||
\item{variable}{Response variable (without quotes)} | |||
\item{env}{Environment where function is evaluated} | |||
\item{k}{Number of validation samples} | |||
} | |||
\description{ | |||
Perform calibration sampling and use selected | |||
calibration set for model tuning | |||
} | |||
@ -0,0 +1,22 @@ | |||
% Generated by roxygen2: do not edit by hand | |||
% Please edit documentation in R/predict-spectra.R | |||
\name{predict_from_spectra} | |||
\alias{predict_from_spectra} | |||
\title{Predict soil properties of new spectra based on calibration models} | |||
\usage{ | |||
predict_from_spectra(model_list, spectra_list) | |||
} | |||
\arguments{ | |||