Philipp Baumann
8 years ago
commit
7ddbe3ec0e
320 changed files with 491036 additions and 0 deletions
@ -0,0 +1,2 @@ |
|||||
|
^.*\.Rproj$ |
||||
|
^\.Rproj\.user$ |
@ -0,0 +1,4 @@ |
|||||
|
.Rproj.user |
||||
|
.Rhistory |
||||
|
.RData |
||||
|
.Ruserdata |
@ -0,0 +1,27 @@ |
|||||
|
Package: simplerspec |
||||
|
Type: Package |
||||
|
Title: Soil and plant spectroscopic model building and prediction |
||||
|
Depends: R (>= 3.2) |
||||
|
Imports: |
||||
|
ggplot2 (>= 2.0.0), |
||||
|
plyr, |
||||
|
data.table, |
||||
|
reshape2, |
||||
|
mvoutlier, |
||||
|
hexView, |
||||
|
Rcpp, |
||||
|
hyperSpec, |
||||
|
prospectr, |
||||
|
dplyr, |
||||
|
caret, |
||||
|
tidyr |
||||
|
Version: 0.1.0 |
||||
|
Authors@R: person("Philipp", "Baumann", |
||||
|
email = "baumanph@student.ethz.ch", role = c("aut", "cre")) |
||||
|
Description: Functions that cover |
||||
|
reading of spectral data, outlier removal, |
||||
|
spectral preprocessing, calibration sampling, PLS regression |
||||
|
using caret, and model diagnostic statistics and plots. |
||||
|
License: GPL-2 |
||||
|
LazyData: TRUE |
||||
|
RoxygenNote: 5.0.1 |
@ -0,0 +1,23 @@ |
|||||
|
# Generated by roxygen2: do not edit by hand |
||||
|
|
||||
|
export(average_spectra) |
||||
|
export(do_pretreatment) |
||||
|
export(evaluate_pls_q) |
||||
|
export(fit_pls) |
||||
|
export(fit_pls_q) |
||||
|
export(join_chem_spec) |
||||
|
export(ken_stone) |
||||
|
export(pls_ken_stone) |
||||
|
export(predict_from_spectra) |
||||
|
export(readOPUS) |
||||
|
export(readOPUS_bin) |
||||
|
export(readOPUS_text) |
||||
|
export(read_spectra) |
||||
|
export(remove_outliers) |
||||
|
export(resample_spectra) |
||||
|
export(summary_df) |
||||
|
export(tune_model) |
||||
|
export(tune_model_q) |
||||
|
import(Rcpp) |
||||
|
import(data.table) |
||||
|
import(hyperSpec) |
@ -0,0 +1,85 @@ |
|||||
|
# Helper function written by Antoine Stevens that |
||||
|
# is used for averaging replication scans of one sample |
||||
|
#' @import data.table |
||||
|
#' @import Rcpp |
||||
|
by_spc <- function(spc, indices, fun = mean){ |
||||
|
# Fast summary of spectral data |
||||
|
# spc = spectral matrix |
||||
|
# indices = factor variable used to summarize data |
||||
|
# fun = summary function |
||||
|
# Avoid NOTE "no visible binding for global variable '.SD'" |
||||
|
# when checking package by devtools::check() |
||||
|
# .SD <- NULL |
||||
|
spc <- data.table::data.table(indices, spc, check.names = F) |
||||
|
if(is.null(ncol(indices))){ |
||||
|
x <- 1 |
||||
|
} else { |
||||
|
x <- ncol(indices) |
||||
|
} |
||||
|
as.data.frame(spc[, lapply(.SD, fun), |
||||
|
by = eval(names(spc)[1:x])]) |
||||
|
} |
||||
|
#' @title Calculate mean of spectra |
||||
|
#' @description Calculate the mean of each spectral repetitions |
||||
|
#' (absorbance average per wavenumber) |
||||
|
#' @param in_spectra List that contains spectral data in the |
||||
|
#' element \code{MIR} (data.frame) and sample metadata in the |
||||
|
#' list element \code{data_rep} (data.frame). |
||||
|
#' The data.frame \code{data_meta} |
||||
|
#' contains the sample ID stored in the \code{ID} |
||||
|
#' vector (originally from spectral file names), |
||||
|
#' country abbreviation stored in \code{contry} (2 letters), |
||||
|
#' and the vector \code{site} (2 letters) that is the country |
||||
|
#' abbreviation. |
||||
|
#' @return \code{out_spectra}: List that contains: |
||||
|
#' \itemize{ |
||||
|
#' \item \code{data_meta}: metadata of sample (data.frame) |
||||
|
#' that is |
||||
|
#' taken from the element \code{rep} of the input list argument |
||||
|
#' \code{in_spectra} |
||||
|
#' \item \code{MIR_mean}: average spectra from replicates of |
||||
|
#' sample ID |
||||
|
#' (data.frame) |
||||
|
#' \item \code{MIR_sd}: standard deviation of spectra calculated |
||||
|
#' from replicates of sample ID (data.frame) |
||||
|
#' \item \code{cvar} coefficient of variance over all |
||||
|
#' wavenumbers of spectra |
||||
|
#' calculated from replicates of sample ID (vector) |
||||
|
#' } |
||||
|
#' @export |
||||
|
average_spectra <- function(in_spectra) { |
||||
|
# Compute mean per sample with by_spc, |
||||
|
# provided by Antoine |
||||
|
# spc = spectral data |
||||
|
# indices = character vector(s) or factor(s) to group the rows |
||||
|
# by fun = summary function |
||||
|
# Also compute the standard deviation (SD) |
||||
|
# of the three measurements |
||||
|
# Identify samples in which the spectrum has SD higher than 1.5 |
||||
|
# and need to be re-scanned - ? |
||||
|
MIR <- NULL |
||||
|
data_rep <- NULL |
||||
|
ID <- NULL |
||||
|
MIR_mean <- by_spc(spc = in_spectra$MIR[, ], |
||||
|
indices = in_spectra$data_rep$ID[], fun = mean) |
||||
|
MIR_sd <- by_spc(spc = in_spectra$MIR[, ], |
||||
|
indices = in_spectra$data_rep$ID[], fun = sd) |
||||
|
# MIR_sd[order(rowMeans(MIR_sd[, -1])), 1] |
||||
|
# rowMeans(MIR_sd[, -1])[order(rowMeans(MIR_sd[, -1]))] / |
||||
|
# rowMeans(MIR_mean[, -1])[order(rowMeans(MIR_sd[, -1]))] |
||||
|
# compute the coefficient of variation |
||||
|
cvar <- rowMeans(MIR_sd[, -1])/rowMeans(MIR_mean[, -1]) |
||||
|
# add metadata for each sample; take strings of rownames |
||||
|
# from spectra; first two characters of the sample_ID code |
||||
|
# is country, character pos. 4 and 5 is site abbreviation |
||||
|
data_meta <- data.frame(ID = MIR_mean[, 1]) |
||||
|
data_meta <- cbind(data_meta, |
||||
|
country = substring(data_meta$ID, first = 1, last = 2), |
||||
|
site = substring(data_meta$ID, first = 4, last = 5) |
||||
|
) |
||||
|
out_spectra <- list(data_meta = data_meta, |
||||
|
MIR_mean = MIR_mean, |
||||
|
MIR_sd = MIR_sd, |
||||
|
cvar = cvar) |
||||
|
return(out_spectra) |
||||
|
} |
@ -0,0 +1,34 @@ |
|||||
|
## Join chemical and spectral data ============================== |
||||
|
#' @title Join chemical and spectral data frames |
||||
|
#' @description Combines spectral data (data.frame) and chemical |
||||
|
#' data (data.frame). |
||||
|
#' @param dat_chem data.frame that contains chemical values of |
||||
|
#' the sample |
||||
|
#' @param dat_spec List that contains spectral data |
||||
|
#' @return List: xxx |
||||
|
#' @param by character of column name that defines sample_ID |
||||
|
#' @export |
||||
|
join_chem_spec <- function( |
||||
|
dat_chem, dat_spec, by = "sample_ID") { |
||||
|
# Alternative when "no visible binding for global variable": |
||||
|
data_meta <- MIR <- MIR0 <- ori <- MIR_mean <- NULL |
||||
|
# http://stackoverflow.com/questions/23475309/in-r-is-it-possible-to-suppress-note-no-visible-binding-for-global-variable |
||||
|
# Replace sample_ID by ID |
||||
|
if(!is.data.frame(dat_chem)) { |
||||
|
stop(dat_chem, "needs to be a data.frame", call. = FALSE) |
||||
|
} else { |
||||
|
colnames(dat_chem)[colnames(dat_chem) == by] <- "ID" |
||||
|
dat_chem$ID <- as.factor(dat_chem$ID) |
||||
|
# Select only chemical data that have no outlier spectra |
||||
|
dat_chem <- dat_chem[dat_spec$data_meta$ID, ] |
||||
|
ID <- as.factor(dat_spec$data_meta$ID) |
||||
|
# Join ref analyses |
||||
|
MIRdata <- data.frame(ID = ID) |
||||
|
MIRdata$MIR <- dat_spec$MIR0 |
||||
|
MIRdata$ori <- dat_spec$MIR_mean |
||||
|
# Joining by ID, type = "inner" |
||||
|
MIRdata_chem <- plyr::join(dat_chem, MIRdata, type = "inner") |
||||
|
# before dplyr::inner_join(dat_chem, MIRdata) |
||||
|
MIRdata_chem |
||||
|
} |
||||
|
} |
@ -0,0 +1,71 @@ |
|||||
|
## Function 1: Read spectra in text form ======================== |
||||
|
#' @title Read an OPUS text file and extract metadata |
||||
|
#' @description |
||||
|
#' Read single text file acquired with |
||||
|
#' an Bruker Vertex FTIR Instrument |
||||
|
#' (as exported from OPUS software) and extract sample metadata |
||||
|
#' provided in the filename |
||||
|
#' @usage |
||||
|
#' read_spectra(path) |
||||
|
#' @param path character of the directory |
||||
|
#' where the spectral text files are stored |
||||
|
#' @return |
||||
|
#' List that contains the following elements: |
||||
|
#' \itemize{ |
||||
|
#' \item \code{MIR}: data.frame that contains all the spectra. |
||||
|
#' The columns of \code{MIR} contain absorbance values at |
||||
|
#' different wavenumber in the MIR range. The wavenumbers |
||||
|
#' rounded to 0.1 are given as column names. The original file |
||||
|
#' names are stored as row names. One line in the data frame |
||||
|
#' \code{MIR} contains one replicate scan of a sample. |
||||
|
#' \item \code{data_rep}: data.frame that constists of sample |
||||
|
#' metadata that was extracted from the file name of |
||||
|
#' individual spectral files. The first vector \code{ID} |
||||
|
#' contains the spectral file name without the repetition number |
||||
|
#' supplied as \code{.<number>} in the file name. |
||||
|
#' Letters 1 to 2 of the spectral |
||||
|
#' file name are used for the country abbreviation, stored |
||||
|
#' as in the \code{} vector \code{data_rep} . Letters |
||||
|
#' 4 to 5 of the file name are used for the landscape (site) |
||||
|
#' abbreviation. |
||||
|
#' } |
||||
|
#' @note: This function is derived from a re-factored and |
||||
|
#' simplified version of the \code{read.opus} function from the |
||||
|
#' \sQuote{soil.spec} package for reading OPUS VERTEX files |
||||
|
#' The function should also work for other OPUS files (eg alpha), |
||||
|
#' see \code{read.opus}. The function readOPUS() was |
||||
|
#' written by Antoine Stevens. |
||||
|
#' @export |
||||
|
read_spectra <- function(path){ |
||||
|
# Needs various utilities for spectral processing that Antoine |
||||
|
# put on github |
||||
|
ID <- NULL |
||||
|
# Load the MIR data exported from OPUS to txt files |
||||
|
# List files in the directory |
||||
|
lf <- list.files(path, full.names = TRUE) |
||||
|
# Read files into R with readOPUS() |
||||
|
# (comes from the github file) |
||||
|
MIR <- readOPUS(lf, in_format = "txt") |
||||
|
# Wavenumber, from 3996.4 to 599.8 cm-1 |
||||
|
colnames(MIR) <- round(as.numeric(colnames(MIR)), 1) |
||||
|
# Remove the txt extension |
||||
|
rownames(MIR) <- sub("\\.txt", "", row.names(MIR)) |
||||
|
# Prepare a dataset with ID, |
||||
|
# Extract country with substring |
||||
|
# and repetition |
||||
|
# (ID = character before the dot; rep = number after the dot) |
||||
|
data_rep <- data.frame(ID = sub("(.+)\\.[[:digit:]]+$", "\\1", |
||||
|
row.names(MIR)), |
||||
|
rep = as.numeric(sub(".+\\.([[:digit:]])+$", "\\1", |
||||
|
row.names(MIR))) |
||||
|
) |
||||
|
data_rep <- cbind(data_rep, |
||||
|
country = substring(data_rep$ID, first = 1, last = 2), |
||||
|
site = substring(data_rep$ID, first = 4, last = 5) |
||||
|
) |
||||
|
list_spectra <- list( |
||||
|
MIR = MIR, |
||||
|
data_rep = data_rep |
||||
|
) |
||||
|
return(list_spectra) |
||||
|
} |
@ -0,0 +1,310 @@ |
|||||
|
## Soil spectroscopy related functions that were compiled by |
||||
|
## Antoine Stevens ============================================== |
||||
|
|
||||
|
#' @title Read an OPUS text file |
||||
|
#' @description |
||||
|
#' Read single text file acquired with |
||||
|
#' an Bruker Vertex FTIR Instrument |
||||
|
#' (as exported from OPUS software) |
||||
|
#' @param file.name Character vector with path to files |
||||
|
#' @usage readOPUS_text(file.name) |
||||
|
#' @export |
||||
|
readOPUS_text <- function(file.name){ |
||||
|
if (file.exists(file.name)) { |
||||
|
out <- read.csv(file.name, header=F, |
||||
|
col.names = c("wavenumber", "absorbance") |
||||
|
) |
||||
|
return(out) |
||||
|
} else { |
||||
|
warning(paste("File", file.name, "does not exist")) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
#' @title Read an OPUS binary file |
||||
|
#' @description |
||||
|
#' Read single binary file acquired with an |
||||
|
#' Bruker Vertex FTIR Instrument |
||||
|
#' @param file.name Character vector with path to files |
||||
|
#' @usage readOPUS_bin(file.name) |
||||
|
#' @export |
||||
|
readOPUS_bin <- function(file.name){ |
||||
|
size <- fileRaw <- NULL |
||||
|
if (file.exists(file.name)) { |
||||
|
try( |
||||
|
pa <- hexView::readRaw(file.name, offset = 0, |
||||
|
nbytes = file.info(file.name)$size, human = "char", |
||||
|
size = 1, endian = "little"), silent = TRUE) |
||||
|
if (!class(.Last.value)[1] == "try-error") { |
||||
|
|
||||
|
pr <- pa$fileRaw |
||||
|
# Get source of instrument |
||||
|
ins <- grepRaw("INS", pr, all = TRUE) |
||||
|
ins <- hexView::readRaw( |
||||
|
file.name, offset = ins[length(ins)] + 7, |
||||
|
nbytes = 3, human = "char", size = 1, endian = "little" |
||||
|
) |
||||
|
ins <- hexView::blockString(ins) |
||||
|
# Get source of infrared to know if NIR or MIR |
||||
|
src <- grepRaw("SRC", pr, all = TRUE) |
||||
|
src <- hexView::readRaw( |
||||
|
file.name, offset = src[length(src)] + 4, |
||||
|
nbytes = 3, human = "char", size = 1, endian = "little" |
||||
|
) |
||||
|
src <- hexView::blockString(src) |
||||
|
instr.range <- tolower(paste(ins, src, sep = "-")) |
||||
|
# Get Beam Splitter |
||||
|
bms <- grepRaw("BMS", pr, all = TRUE) |
||||
|
bms <- hexView::readRaw( |
||||
|
file.name, offset = bms[length(bms)] + 4, |
||||
|
nbytes = 4, human = "char", size = 1, endian = "little" |
||||
|
) |
||||
|
bms <- hexView::blockString(bms) |
||||
|
|
||||
|
z <- grepRaw("ZFF", pr, all = TRUE)[1] + 5 |
||||
|
re <- grepRaw("RES", pr, all = TRUE)[1] + 5 |
||||
|
snm <- grepRaw("SNM", pr, all = TRUE)[1] + 7 |
||||
|
lwn <- grepRaw("LWN", pr, all = TRUE)[1] + 7 |
||||
|
fx <- grepRaw("FXV", pr, all = TRUE)[3] + 7 |
||||
|
lx <- grepRaw("LXV", pr, all = TRUE)[3] + 7 |
||||
|
npt0 <- grepRaw("NPT", pr, all = TRUE)[2] + 3 |
||||
|
npt1 <- grepRaw("NPT", pr, all = TRUE)[3] + 7 |
||||
|
mxy <- grepRaw("MXY", pr, all = TRUE)[1] + 7 |
||||
|
mny <- grepRaw("MNY", pr, all = TRUE)[3] + 7 |
||||
|
end <- grepRaw("END", pr, all = TRUE) + 11 |
||||
|
dat <- grepRaw( "DAT", pr, all = TRUE)[1] + 7 |
||||
|
tim <- grepRaw("TIM", pr, all = TRUE) + 11 |
||||
|
# calculate end and start of each block |
||||
|
offs <- end[5:10] |
||||
|
|
||||
|
byts <- diff(offs) |
||||
|
ZFF <- hexView::readRaw(file.name, offset = z, nbytes = 4, |
||||
|
human = "int", size = 2)[[5]][1] |
||||
|
RES <- hexView::readRaw(file.name, offset = re, nbytes = 4, |
||||
|
human = "int", size = 2)[[5]][1] |
||||
|
snm.lab.material <- hexView::blockString( |
||||
|
hexView::readRaw(file.name, offset = snm, nbytes = 22, |
||||
|
human = "char", size = 1, endian = "little") |
||||
|
) |
||||
|
if (!nzchar(snm.lab.material)) { |
||||
|
SSN <- "" |
||||
|
Material <- "" |
||||
|
warning("Product name not found inside OPUS file...") |
||||
|
} |
||||
|
else { |
||||
|
if (!length(grep(snm.lab.material, pattern = ";")) == 0) { |
||||
|
snm.lab.material <- as.vector( |
||||
|
strsplit(snm.lab.material, ";") |
||||
|
)[[1]] |
||||
|
SSN <- paste0(snm.lab.material[2], snm.lab.material[1]) |
||||
|
Material <- snm.lab.material[3] |
||||
|
} else { |
||||
|
if (!length(grep(snm.lab.material, pattern = "_")) == 0) { |
||||
|
# Don't remove "_" from unique id SSN (@baumann) |
||||
|
# SSN <- sub("_", "", snm.lab.material) |
||||
|
SSN <- snm.lab.material |
||||
|
Material <- "" |
||||
|
} else { |
||||
|
if (!length(snm.lab.material) == 0) { |
||||
|
SSN <- snm.lab.material |
||||
|
Material <- "" |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
# Set three SSN first three characters to lower |
||||
|
# Don't convert to lowercase |
||||
|
# SSN <- paste0(tolower(substr(SSN, 1, 3)), |
||||
|
# substr(SSN, 4, 20)) |
||||
|
Scandate <- hexView::blockString( |
||||
|
hexView::readRaw(file.name, offset = dat, |
||||
|
nbytes = 10, human = "char", size = 1, |
||||
|
endian = "little") |
||||
|
) |
||||
|
Scantime <- hexView::blockString( |
||||
|
hexView::readRaw(file.name, |
||||
|
offset = tim[2] - 4, nbytes = 8, human = "char", |
||||
|
size = 1, endian = "little") |
||||
|
) |
||||
|
Scandate <- paste(Scandate, Scantime) |
||||
|
LWN <- hexView::readRaw( |
||||
|
file.name, offset = lwn, nbytes = 8, |
||||
|
human = "real", size = 8)[[5]][1] |
||||
|
# Combine the above parameters |
||||
|
spectrum.meta <- c(SSN, Material, Scandate, ZFF, RES, LWN) |
||||
|
# Get number of data points for each spectra data block |
||||
|
NPT0 <- hexView::readRaw( |
||||
|
file.name, offset = npt0, nbytes = 12, |
||||
|
human = "int", size = 4)[[5]][2] |
||||
|
NPT1 <- hexView::readRaw( |
||||
|
file.name, offset = npt1, nbytes = 4, |
||||
|
human = "int", size = 4)[[5]][1] |
||||
|
# fxv: Frequency of first point |
||||
|
fxv <- hexView::readRaw( |
||||
|
file.name, offset = fx, nbytes = 16, |
||||
|
human = "real", size = 8)[[5]][1] |
||||
|
# lxv: Frequency of last point |
||||
|
lxv <- hexView::readRaw( |
||||
|
file.name, offset = lx, nbytes = 16, |
||||
|
human = "real", size = 8)[[5]][1] |
||||
|
# Read all through all the data blocks inside the OPUS file |
||||
|
nbytes1 <- NPT0 * 4 # initial parameters |
||||
|
nbytes.f <- NPT1 * 4 |
||||
|
if (offs[1] < 2000) { |
||||
|
offs.f <- offs[3] |
||||
|
nbytes.f <- NPT1 * 4 |
||||
|
wavenumbers <- rev(seq(lxv, fxv, (fxv - lxv)/(NPT1 - 1))) |
||||
|
} |
||||
|
else if (offs[1] > 20000) { |
||||
|
offs.f <- offs[2] |
||||
|
nbytes.f <- NPT1 * 4 |
||||
|
wavenumbers <- rev(seq(lxv, fxv, (fxv - lxv)/(NPT1 - 1))) |
||||
|
} else { # for vert-MIR |
||||
|
offs.f <- 7188 |
||||
|
nbytes.f <- NPT0 * 4 |
||||
|
lxv <- hexView::readRaw( |
||||
|
file.name, offset = 8768, nbytes = 16, |
||||
|
human = "real", size = 8)[[5]][1] |
||||
|
fxv <- hexView::readRaw( |
||||
|
file.name, offset = 8752, nbytes = 16, |
||||
|
human = "real", size = 8)[[5]][1] |
||||
|
wavenumbers <- rev(seq(lxv, fxv, (fxv - lxv)/(NPT0 - 1))) |
||||
|
} |
||||
|
|
||||
|
spectra <- hexView::readRaw(file.name, width = NULL, |
||||
|
offset = offs.f - 4, nbytes = nbytes.f, human = "real", # needs to be -4 according to soil.spec function |
||||
|
size = 4, endian = "little")[[5]] |
||||
|
|
||||
|
# File name |
||||
|
file_name <- sub(".+/(.+)", "\\1", file.name) |
||||
|
|
||||
|
# Create date_time object |
||||
|
date_time <- as.POSIXct(spectrum.meta[3], |
||||
|
format = "%d/%m/%Y %H:%M:%S ") |
||||
|
|
||||
|
# Create unique_id using file_name and time |
||||
|
ymd_id <- format(date_time, "%Y%m%d") |
||||
|
unique_id <- paste0(file_name, "_", ymd_id) |
||||
|
|
||||
|
# Add sample_id: remove extension .0, .1 etc. from OPUS files |
||||
|
sample_id <- sub("(.+)\\.[[:digit:]]+$", "\\1", file_name) |
||||
|
|
||||
|
# Extract repetition number (rep_no) from file name |
||||
|
rep_no <- sub(".+\\.([[:digit:]])+$", "\\1", file.name) |
||||
|
|
||||
|
# Convert spectra to matrix and add dimnames (wavenumbers for columns |
||||
|
# and unique_id for rows) |
||||
|
spc_m <- matrix(spectra, ncol = length(spectra), byrow = FALSE) |
||||
|
rownames(spc_m) <- unique_id |
||||
|
colnames(spc_m) <- round(wavenumbers, 1) |
||||
|
|
||||
|
out <- list( |
||||
|
metadata = data.frame( |
||||
|
unique_id = unique_id, |
||||
|
scan_id = file_name, # changed file_name to scan_id in output list |
||||
|
sample_id = sample_id, |
||||
|
rep_no = rep_no, |
||||
|
date_time = date_time, |
||||
|
sample_info = spectrum.meta[1], |
||||
|
instrument_name = instr.range, |
||||
|
resolution = spectrum.meta[5], |
||||
|
bms = bms, |
||||
|
lwn = spectrum.meta[6] |
||||
|
), |
||||
|
spc = spc_m, |
||||
|
wavenumbers = wavenumbers |
||||
|
) |
||||
|
|
||||
|
# names(out)[-c(1:9)] <- as.character(round(wavenumbers, 1)) |
||||
|
return(out) |
||||
|
} |
||||
|
} else { |
||||
|
warning(paste("File", file.name, "does not exist")) |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
#' @title Read OPUS binary and ASCII files |
||||
|
#' @description |
||||
|
#' Read single or multiple binary and ASCII files acquired with |
||||
|
#' an Bruker Vertex FTIR Instrument |
||||
|
#' @usage |
||||
|
#' readOPUS(fnames, in_format, out_format) |
||||
|
#' @param fnames character \code{vector} of the name(s) |
||||
|
#' (with absolute path) of the file(s) to read |
||||
|
#' @param in_format format of the input file: \code{'binary'} or |
||||
|
#' \code{'txt'} |
||||
|
#' @param out_format format of the output: |
||||
|
#' \code{'matrix'} (default) or \code{'list'} (see below) |
||||
|
#' @return |
||||
|
#' if \code{out_format} = \code{'matrix'}, absorbance values |
||||
|
#' of the input file(s) in a single \code{matrix}. |
||||
|
#' |
||||
|
#' if \code{out_format} = \code{'list'}, a \code{list} of the |
||||
|
#' input file(s) data consisting of a \code{list} with components: |
||||
|
#' \itemize{ |
||||
|
#' \item{\code{Name}}{ name of the file imported} |
||||
|
#' \item{\code{datetime}}{ date and time of acquisition in |
||||
|
#' \code{POSIXct} format (available only when |
||||
|
#' \code{in_format} = 'binary')} |
||||
|
#' \item{\code{metadata}}{ \code{list} with information |
||||
|
#' on instrument configuration (available only when |
||||
|
#' \code{in_format} = 'binary')} |
||||
|
#' \item{\code{absorbance}}{ a numeric \code{vector} |
||||
|
#' of absorbance values} |
||||
|
#' \item{\code{wavenumbers}}{ numeric \code{vector} |
||||
|
#' of the band positions} |
||||
|
#' } |
||||
|
#' @author Antoine Stevens and Andrew Sila (soil.spec package) |
||||
|
#' @note |
||||
|
#' This is essentially a re-factored and simplified version of |
||||
|
#' the \code{read.opus} function from the |
||||
|
#' \sQuote{soil.spec} package for reading OPUS VERTEX files |
||||
|
#' The function should also work for other OPUS files (eg alpha), |
||||
|
#' see \code{read.opus}. |
||||
|
#' @export |
||||
|
readOPUS<- function(fnames, in_format = c("binary", "txt"), |
||||
|
out_format = c("matrix", "list")) { |
||||
|
# hexView and plyr are required |
||||
|
|
||||
|
wavenumbers <- NULL |
||||
|
absorbance <- NULL |
||||
|
|
||||
|
in_format <- match.arg(in_format) |
||||
|
out_format <- match.arg(out_format) |
||||
|
|
||||
|
spc <- vector("list", length(fnames)) |
||||
|
i <- 1 |
||||
|
for (file.name in fnames) { |
||||
|
if (in_format == "binary") { |
||||
|
spc[[i]] <- readOPUS_bin(file.name) |
||||
|
} else { |
||||
|
spc[[i]] <- readOPUS_text(file.name) |
||||
|
} |
||||
|
i <- i + 1 |
||||
|
} |
||||
|
names(spc) <- sub(".+/(.+)(\\.txt)?$", "\\1", fnames) |
||||
|
if (out_format == "matrix") { |
||||
|
test <- sapply(spc, function(x) class(x) != "character") |
||||
|
# warning( |
||||
|
# paste0(paste(names(spc)[!test], collapse = ","), |
||||
|
# " do not exist") |
||||
|
# ) |
||||
|
spc <- spc[test] |
||||
|
if(in_format == "binary"){ |
||||
|
spc <- do.call(plyr::rbind.fill, lapply(spc, function(x){ |
||||
|
x <- t(data.frame( |
||||
|
wav = x$wavenumbers, absorbance = x$absorbance)) |
||||
|
colnames(x) <- x[1,] |
||||
|
data.frame(x[2, , drop = F], check.names = F)})) |
||||
|
|
||||
|
} else { |
||||
|
spc <- do.call(plyr::rbind.fill, lapply(spc, function(x) { |
||||
|
x <- t(x) |
||||
|
colnames(x) <- x[1,] |
||||
|
data.frame(x[2, , drop = F], check.names = F)})) |
||||
|
} |
||||
|
rownames(spc) <- sub(".+/(.+)(\\.txt)?$", "\\1", fnames) |
||||
|
|
||||
|
} |
||||
|
return(spc) |
||||
|
|
||||
|
} |
@ -0,0 +1,466 @@ |
|||||
|
# Perform calibration sampling based on spectral PCA ------------ |
||||
|
#' @title Split |
||||
|
#' @description Perform calibration sampling based on |
||||
|
#' the Kennard-Stones algorithm. |
||||
|
#' @param spec_chem data.frame that contains chemical |
||||
|
#' and IR spectroscopy data |
||||
|
#' @param ratio_val Ratio of number of validation and all samples. |
||||
|
#' @param pc Number of principal components (numeric) |
||||
|
#' @param print logical expression weather calibration |
||||
|
#' @param validation Logical expression weather |
||||
|
#' calibration sampling is performed |
||||
|
#' (\code{TRUE} or \code{FALSE}). |
||||
|
#' @usage ken_stone(spec_chem, ratio_val, pc, print = TRUE, |
||||
|
#' validation = TRUE) |
||||
|
#' @export |
||||
|
ken_stone <- function(spec_chem, ratio_val, pc, |
||||
|
print = TRUE, validation = TRUE) { |
||||
|
MIR <- model <- type <- PC1 <- PC2 <- NULL |
||||
|
# Now with a real dataset |
||||
|
# k = number of samples to select |
||||
|
# pc = if provided, the number of principal components |
||||
|
# (see ?kenStone) |
||||
|
if(validation == TRUE) { |
||||
|
# pc = 0.99 before !!! |
||||
|
sel <- prospectr::kenStone(X = spec_chem$MIR, |
||||
|
k = round(ratio_val * nrow(spec_chem)), pc = 2) |
||||
|
sel$model # The row index of calibration samples |
||||
|
# plot(sel$pc[, 1:2], xlab = 'PC1', ylab = 'PC2') |
||||
|
# Points selected for calibration |
||||
|
# points(sel$pc[sel$model, 1:2], pch = 19, col = 2) |
||||
|
|
||||
|
# Plot samples selected for calibration in ggplot |
||||
|
sel_df_cal <- data.frame(sel$pc[- sel$model,1:2]) |
||||
|
sel_df_cal$type <- as.factor( |
||||
|
rep("calibration", nrow(sel_df_cal)) |
||||
|
) |
||||
|
sel_df_val <- data.frame(sel$pc[sel$model, 1:2]) |
||||
|
sel_df_val$type <- as.factor( |
||||
|
rep("validation", nrow(sel_df_val))) |
||||
|
sel_df <- rbind(sel_df_cal, sel_df_val) |
||||
|
# Compute ratio needed to make the figure square |
||||
|
ratio <- with(sel_df, diff(range(PC1))/diff(range(PC2))) |
||||
|
p_pc <- ggplot2::ggplot(data = sel_df) + |
||||
|
ggplot2::geom_point( |
||||
|
ggplot2::aes(x = PC1, y = PC2, shape = type), size = 4) + |
||||
|
ggplot2::coord_fixed(ratio = 1) + |
||||
|
ggplot2::scale_shape_manual(values=c(1, 19)) + |
||||
|
ggplot2::scale_colour_manual(values=c("black", "red")) + |
||||
|
ggplot2::theme_bw() |
||||
|
# ggplot2::theme.user + |
||||
|
ggplot2::theme(legend.title = ggplot2::element_blank()) |
||||
|
# print(p_pc) |
||||
|
|
||||
|
# Split MIR data into calibration and validation set using |
||||
|
# the results of Kennard-Stone Calibration Sampling |
||||
|
# Selct by row index of calibration samples |
||||
|
val_set <- spec_chem[sel$model, ] |
||||
|
# Check number of observations (rows) for validation set |
||||
|
nrow(val_set) |
||||
|
cal_set <- spec_chem[- sel$model, ] |
||||
|
list_out <- list( |
||||
|
calibration = cal_set, |
||||
|
validation = val_set, |
||||
|
p_pc = p_pc |
||||
|
) |
||||
|
list_out |
||||
|
# Check number of observations (rows) for calibration set |
||||
|
# nrow(cal_set) |
||||
|
} else { |
||||
|
cal_set <- spec_chem |
||||
|
list(calibration = cal_set) |
||||
|
} |
||||
|
} |
||||
|
#' @title Perform model tuning |
||||
|
#' @description Uses function from caret to to model tuning |
||||
|
#' for PLS regression. |
||||
|
#' @param x list from calibration sampling |
||||
|
#' @param variable response variable for PLS regression, supplied |
||||
|
#' as character expression |
||||
|
#' @param validation Logical expression weather an independent |
||||
|
#' validation is performed. |
||||
|
#' @param env Environment where function is evaluated |
||||
|
#' @export |
||||
|
tune_model_q <- function(x, variable, |
||||
|
env = parent.frame(), validation = TRUE) { |
||||
|
calibration <- NULL |
||||
|
# List of calibration and validation samples |
||||
|
# set up a cross-validation scheme |
||||
|
# create 10 folds that we will keep for the different |
||||
|
# modeling approaches to allow comparison |
||||
|
# randomly break the data into 10 partitions |
||||
|
# note that k is the total number of samples for leave-one-out |
||||
|
# use substitute function to make non-standard evaluation |
||||
|
# of variable argument (looks at a function as argument, |
||||
|
# sees code used to compute value; |
||||
|
# see chapter 13.1 Capturing expressions |
||||
|
# in Advanced R (Hadley Wickham) |
||||
|
# !! p. 270 |
||||
|
r <- eval(variable, x$calibration, env) |
||||
|
idx <- caret::createFolds(y = r, k = 10, returnTrain = T) # update *** |
||||
|
idx |
||||
|
# inject the index in the trainControl object |
||||
|
tr_control <- caret::trainControl(method = "cv", index = idx, |
||||
|
savePredictions = T) |
||||
|
if (validation == TRUE) { |
||||
|
tr_control |
||||
|
} else { |
||||
|
tr_control |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
#' @title Perform model tuning |
||||
|
#' @description Uses function from caret to to model tuning |
||||
|
#' for PLS regression. |
||||
|
#' @param x list from calibration sampling |
||||
|
#' @param variable response variable for PLS regression, supplied |
||||
|
#' as character expression |
||||
|
#' @param validation Logical expression weather an independent |
||||
|
#' validation is performed. |
||||
|
#' @param env Environment where function is evaluated |
||||
|
#' @export |
||||
|
tune_model <- function(x, variable, |
||||
|
env = parent.frame(), validation = TRUE) { |
||||
|
tune_model_q(x, substitute(variable), env) |
||||
|
} |
||||
|
|
||||
|
# Fit a PLS regression model using the caret package ------------ |
||||
|
|
||||
|
#' @title Fit a PLS regression model |
||||
|
#' (quoted version of the function) |
||||
|
#' @description Uses the caret package to perform PLS modeling. |
||||
|
#' Spectra are centered and scaled prior to modeling. |
||||
|
#' @param x List that contains calibration |
||||
|
#' set, validation set, and model tuning options |
||||
|
#' @param validation Logical expression weather independent |
||||
|
#' validation is performed |
||||
|
#' @param variable Response variable to be modeled |
||||
|
#' @param tr_control Object that defines controlling parameters |
||||
|
#' of the desired internal validation framework |
||||
|
#' @param env Environment where function is evaluated |
||||
|
#' @export |
||||
|
fit_pls_q <- function(x, validation = TRUE, |
||||
|
variable, tr_control, env = parent.frame()) { |
||||
|
# Fit a partial least square regression (pls) model |
||||
|
# center and scale MIR (you can try without) |
||||
|
calibration <- MIR <- NULL |
||||
|
v <- eval(variable, x$calibration, env) |
||||
|
if (validation == TRUE) { |
||||
|
pls_model <- caret::train(x = x$calibration$MIR, y = v, |
||||
|
method = "pls", |
||||
|
tuneLength = 20, |
||||
|
trControl = tr_control, |
||||
|
preProcess = c("center", "scale") |
||||
|
) |
||||
|
} else { |
||||
|
pls_model <- caret::train(x = x$calibration$MIR, y = v, |
||||
|
method = "pls", |
||||
|
tuneLength = 20, |
||||
|
trControl = tr_control, |
||||
|
preProcess = c("center", "scale") |
||||
|
) |
||||
|
} |
||||
|
# Collect fitted object into a list |
||||
|
# fitList_cal <- list(pls = fit_pls) |
||||
|
# fitList_cal |
||||
|
pls_model |
||||
|
} |
||||
|
|
||||
|
#' @title Fit a PLS regression model |
||||
|
#' @description Uses the caret package to perform PLS modeling. |
||||
|
#' Spectra are centered and scaled prior to modeling. |
||||
|
#' @param x List that contains calibration |
||||
|
#' set, validation set, and model tuning options |
||||
|
#' @param validation Logical expression weather independent |
||||
|
#' validation is performed |
||||
|
#' @param variable Response variable to be modeled |
||||
|
#' @param env Environment where function is evaluated |
||||
|
#' @export |
||||
|
fit_pls <- function(x, validation = TRUE, |
||||
|
variable, env = parent.frame()) { |
||||
|
fit_pls_q(x = x, validation = TRUE, |
||||
|
variable = substitute(variable), env |
||||
|
) |
||||
|
} |
||||
|
|
||||
|
# Evaluate PLS performance (validation and cross-validation) ---- |
||||
|
|
||||
|
#' @title Evaluate PLS performance |
||||
|
#' @description Calculate model performance indices based |
||||
|
#' on observed and predicted values of validation and calibration |
||||
|
#' set, and internal cross-validation |
||||
|
#' @param x List that contains calibration and validation data |
||||
|
#' frame with combined spectral and chemical data |
||||
|
#' @param pls_model List with PLS regression model output from |
||||
|
#' the caret package |
||||
|
#' @param variable Response variable (e.g. chemical property) to be |
||||
|
#' modelled (needs to be non-quoted expression). \code{variable} |
||||
|
#' needs to be a column name in the \code{validation} data.frame |
||||
|
#' (element of \code{x}) |
||||
|
#' @param validation Logical expression if independent validation |
||||
|
#' is performed (split data set into calibration set and |
||||
|
#' validation set) |
||||
|
#' @param print Print observed vs. predicted for calibration |
||||
|
#' and validation. Default is \code{TRUE}. |
||||
|
#' @param env Specifiy the environment in which the function is |
||||
|
#' called. Default argument of \code{env} is |
||||
|
#' \code{parent.frame()} |
||||
|
#' @export |
||||
|
evaluate_pls_q <- function(x, pls_model, variable, |
||||
|
validation = TRUE, print = TRUE, env = parent.frame()) { |
||||
|
# Set global variables to NULL to avoid R CMD check notes |
||||
|
MIR <- object <- model <- dataType <- obs <- pred <- NULL |
||||
|
ncomp <- finalModel <- rmsd <- r2 <- r2 <- rpd <- n <- NULL |
||||
|
rmse <- calibration <- NULL |
||||
|
# Collect fitted object into a list |
||||
|
list_models <- list(pls = pls_model) |
||||
|
# Extract best tuning parameters and associated cv predictions |
||||
|
if(validation == TRUE) { |
||||
|
predobs_cal <- plyr::ldply(list_models, |
||||
|
function(x) plyr::match_df(x$pred, x$bestTune), |
||||
|
.id = "model" |
||||
|
) |
||||
|
# Calculate training (calibration) and test (validation) data |
||||
|
# predictions based on pls model with calibration data |
||||
|
v <- eval(variable, x$validation, env) |
||||
|
predobs_val <- caret::extractPrediction(list_models, |
||||
|
testX = x$validation$MIR, testY = v) # update *** |
||||
|
# Create new data frame column <object> |
||||
|
predobs_val$object <- predobs_val$model |
||||
|
|
||||
|
# Replace levels "Training" and "Test" in dataType column |
||||
|
# by "Calibration" and "Validation" (rename levels of factor) |
||||
|
predobs_val$dataType <- plyr::revalue(predobs_val$dataType, |
||||
|
c("Test" = "Validation", "Training" = "Calibration") |
||||
|
) |
||||
|
# Change the order of rows in the data frame |
||||
|
# Calibration as first level (show Calibration in ggplot graph |
||||
|
# on left panel) |
||||
|
predobs_val$dataType <- factor(predobs_val$dataType, |
||||
|
levels = c("Calibration", "Validation")) |
||||
|
# Calculate model performance indexes by model and dataType |
||||
|
# uses package plyr and function summary.df of SPECmisc.R |
||||
|
stats <- plyr::ddply(predobs_val, c("model", "dataType"), |
||||
|
function(x) summary_df(x, "obs", "pred") |
||||
|
) |
||||
|
|
||||
|
} else { |
||||
|
# Extract best tuning parameters and associated cv predictions |
||||
|
predobs_cv <- plyr::ldply(list_models, |
||||
|
function(x) plyr::match_df(x$pred, x$bestTune), |
||||
|
.id = "model" |
||||
|
) |
||||
|
# Extract auto-prediction |
||||
|
predobs <- caret::extractPrediction(list_models) |
||||
|
predobs_cv$object <- predobs_cv$model |
||||
|
predobs_cv$dataType <- "Cross-validation" |
||||
|
predobs_cv <- dplyr::select( |
||||
|
predobs_cv, obs, pred, model, dataType, object |
||||
|
) |
||||
|
predobs_val <- rbind(predobs, predobs_cv) |
||||
|
stats <- plyr::ddply(predobs_val, c("model", "dataType"), |
||||
|
function(x) summary_df(x, "obs", "pred") |
||||
|
) |
||||
|
} |
||||
|
|
||||
|
# Add number of components to stats; from finalModel list item |
||||
|
# from train() function output (function from caret package) |
||||
|
stats$ncomp <- rep(pls_model$finalModel$ncomp, nrow(stats)) |
||||
|
# Add range of observed values for validation and calibraton |
||||
|
# get range from predicted vs. observed data frame |
||||
|
# stored in object predobs |
||||
|
obs_cal <- subset(predobs_val, dataType == "Calibration")$obs |
||||
|
obs_val <- subset(predobs_val, dataType == "Validation")$obs |
||||
|
# Get name of predicted variable; see p. 261 of book |
||||
|
# "Advanced R" (Hadley Wickham) |
||||
|
variable_name <- deparse(variable) |
||||
|
# before: deparse(substitute(variable)) |
||||
|
df_range <- data.frame( |
||||
|
variable = rep(variable_name, 2), |
||||
|
dataType = c("Calibration", "Validation"), |
||||
|
min_obs = c(range(obs_cal)[1], range(obs_val)[1]), |
||||
|
median_obs = c(median(obs_cal), median(obs_val)), |
||||
|
max_obs = c(range(obs_cal)[2], range(obs_val)[2]), |
||||
|
mean_obs = c(mean(obs_cal), mean(obs_val)), |
||||
|
CV = c(sd(obs_cal) / mean(obs_cal) * 100, |
||||
|
sd(obs_val) / mean(obs_val) * 100) |
||||
|
) |
||||
|
|
||||
|
# Join stats with range data frame (df_range) |
||||
|
stats <- plyr::join(stats, df_range, type = "inner") |
||||
|
annotation <- plyr::mutate(stats, |
||||
|
rmse = as.character(as.expression(paste0("RMSE == ", |
||||
|
round(rmsd, 2)))), |
||||
|
r2 = as.character(as.expression(paste0("italic(R)^2 == ", |
||||
|
round(r2, 2)))), |
||||
|
rpd = as.character(as.expression(paste("RPD == ", |
||||
|
round(rpd, 2)))), |
||||
|
n = as.character(as.expression(paste0("italic(n) == ", n))), |
||||
|
ncomp = as.character(as.expression(paste0("ncomp = ", |
||||
|
ncomp))) |
||||
|
) |
||||
|
|
||||
|
# Plot predicted vs. observed values and model indexes |
||||
|
# update label, xlim, and ylim *** |
||||
|
# Add label number of samples to facet_grid using a |
||||
|
# labeling function |
||||
|
# ! Update labeller API: |
||||
|
# https://github.com/hadley/ggplot2/commit/ef33dc7 |
||||
|
# http://sahirbhatnagar.com/facet_wrap_labels |
||||
|
|
||||
|
# Prepare lookup character vector |
||||
|
make_label <- function(x, validation = TRUE) { |
||||
|
dataType <- n <- NULL |
||||
|
if (validation == TRUE) { |
||||
|
c(`Calibration` = paste0("Calibration", "~(", |
||||
|
x[x$dataType == "Calibration", ]$n, ")" |
||||
|
), |
||||
|
`Validation` = paste0("Validation", "~(", |
||||
|
x[x$dataType == "Validation", ]$n, ")" |
||||
|
) |
||||
|
) |
||||
|
} else{ |
||||
|
c(`Calibration` = paste0("Calibration", "~(", |
||||
|
x[x$dataType == "Calibration", ]$n, ")" |
||||
|
), |
||||
|
`Cross-Validation` = paste0("Cross-Validation", "~(", |
||||
|
x[x$dataType == "Cross-Validation", ]$n, ")" |
||||
|
) |
||||
|
) |
||||
|
} |
||||
|
} |
||||
|
if (validation == TRUE) { |
||||
|
label_validation <- make_label(x = annotation, |
||||
|
validation = TRUE |
||||
|
) |
||||
|
} else { |
||||
|
label_validation <- make_label(x = annotation, |
||||
|
validation = FALSE |
||||
|
) |
||||
|
} |
||||
|
|
||||
|
# Rename labels on the fly with a lookup character vector |
||||
|
to_string <- ggplot2::as_labeller( |
||||
|
x = label_validation, ggplot2::label_parsed |
||||
|
) |
||||
|
|
||||
|
# ------------------------------------------------------------- |
||||
|
|
||||
|
|
||||
|
# http://docs.ggplot2.org/0.9.3.1/label_parsed.html |
||||
|
# some other info: https://coderclub.b.uib.no/tag/plotmath/ |
||||
|
# !!! now depreciated in ggplot2 >= 2.0.0 |
||||
|
# dataType_labeller <- function(variable, value){ |
||||
|
# new <- paste0(dataType_names[value], "~(", annotation$n, ")") |
||||
|
# plyr::llply(as.character(new), function(x) parse(text = x)) |
||||
|
# } |
||||
|
p_pred_obs <- ggplot2::ggplot(data = predobs_val) + |
||||
|
ggplot2::geom_point(ggplot2::aes(x = obs, y = pred), |
||||
|
shape = 1, size = 4) + |
||||
|
ggplot2::geom_text(data = annotation, |
||||
|
ggplot2::aes(x = -Inf, y = Inf, label = r2), size = 7, |
||||
|
hjust = -0.1, vjust = 1.5, parse = TRUE) + |
||||
|
ggplot2::geom_text(data = annotation, |
||||
|
ggplot2::aes(x = -Inf, y = Inf, label = rmse), size = 7, |
||||
|
hjust = -0.075, vjust = 4.25, parse = TRUE) + |
||||
|
ggplot2::geom_text(data = annotation, |
||||
|
ggplot2::aes(x = -Inf, y = Inf, label = rpd), size = 7, |
||||
|
hjust = -0.1, vjust = 6.5, parse = TRUE) + |
||||
|
ggplot2::facet_grid(~ dataType, |
||||
|
labeller =ggplot2::as_labeller(to_string)) + |
||||
|
# ggplot2::facet_grid(~ dataType, |
||||
|
# labeller = dataType_labeller) + |
||||
|
ggplot2::theme_bw() + |
||||
|
ggplot2::geom_abline(col = "red") + |
||||
|
ggplot2::labs(x = "Observed", y = "Predicted") + |
||||
|
ggplot2::xlim(c(min(predobs_val$obs) - |
||||
|
0.05 * diff(range(predobs_val$obs)), |
||||
|
max(predobs_val$obs) + |
||||
|
0.05 * diff(range(predobs_val$obs)))) + |
||||
|
ggplot2::ylim(c(min(predobs_val$obs) - |
||||
|
0.05 * diff(range(predobs_val$obs)), |
||||
|
max(predobs_val$obs) + |
||||
|
0.05 * diff(range(predobs_val$obs)))) # + |
||||
|
# theme.user |
||||
|
|
||||
|
## ggplot graph for model comparison |
||||
|
## (arranged later in panels) |
||||
|
x_label <- paste0("Observed ", |
||||
|
as.character(variable_name)) |
||||
|
y_label <- paste0("Predicted ", |
||||
|
as.character(variable_name)) |
||||
|
p_model <- ggplot2::ggplot(data = predobs_val) + |
||||
|
ggplot2::geom_point(ggplot2::aes(x = obs, y = pred), |
||||
|
shape = 1, size = 2, alpha = 1/2) + |
||||
|
ggplot2::geom_text(data = annotation, |
||||
|
ggplot2::aes(x = Inf, y = -Inf, label = r2), size = 3, |
||||
|
hjust = 1.15, vjust = -3, parse = TRUE) + |
||||
|
ggplot2::geom_text(data = annotation, |
||||
|
ggplot2::aes(x = Inf, y = -Inf, label = rmse), size = 3, |
||||
|
hjust = 1.12, vjust = -2.5, parse = TRUE) + |
||||
|
ggplot2::geom_text(data = annotation, |
||||
|
ggplot2::aes(x = Inf, y = -Inf, label = rpd), size = 3, |
||||
|
hjust = 1.15, vjust = -1.25, parse = TRUE) + |
||||
|
ggplot2::facet_grid(~ dataType, |
||||
|
labeller = ggplot2::as_labeller(to_string)) + |
||||
|
# ggplot2::facet_grid(~ dataType, |
||||
|
# labeller = dataType_labeller) + |
||||
|
ggplot2::theme_bw() + |
||||
|
ggplot2::geom_abline(col = "red") + |
||||
|
ggplot2::labs(x = x_label, y = y_label) + |
||||
|
ggplot2::xlim(c(min(predobs_val$obs) - |
||||
|
0.05 * diff(range(predobs_val$obs)), |
||||
|
max(predobs_val$obs) + |
||||
|
0.05 * diff(range(predobs_val$obs)))) + |
||||
|
ggplot2::ylim(c(min(predobs_val$obs) - |
||||
|
0.05 * diff(range(predobs_val$obs)), |
||||
|
max(predobs_val$obs) + |
||||
|
0.05 * diff(range(predobs_val$obs)))) + |
||||
|
ggplot2::coord_fixed() |
||||
|
if(print == TRUE) { |
||||
|
print(p_model) |
||||
|
} |
||||
|
|
||||
|
list(stats = stats, p_model = p_model) |
||||
|
} |
||||
|
|
||||
|
|
||||
|
## PLS regression modeling in one function ====================== |
||||
|
|
||||
|
#' @title Calibration sampling, model tuning, and PLS regression |
||||
|
#' @description Perform calibration sampling and use selected |
||||
|
#' calibration set for model tuning |
||||
|
#' @param spec_chem data.frame that contains IR spectroscopy |
||||
|
#' and chemical data |
||||
|
#' @param k Number of validation samples |
||||
|
#' @param pc Number of Principal Components used for Calibration |
||||
|
#' sampling (Kennard-Stones algorithm) |
||||
|
#' @param ratio_val Ratio of number of validation and all samples. |
||||
|
#' @param print Logical expression weather graphs shall be printed |
||||
|
#' @param validation Logical expression weather independent |
||||
|
#' validation is performed |
||||
|
#' @param variable Response variable (without quotes) |
||||
|
#' @param env Environment where function is evaluated |
||||
|
#' @export |
||||
|
# Note: check non standard evaluation, argument passing... |
||||
|
pls_ken_stone <- function(spec_chem, ratio_val, pc, |
||||
|
print = TRUE, validation = TRUE, variable, |
||||
|
env = parent.frame()) { |
||||
|
calibration <- 0 |
||||
|
# Calibration sampling |
||||
|
list_sampled <- ken_stone( |
||||
|
spec_chem, ratio_val = ratio_val, pc = 2, validation = TRUE |
||||
|
) |
||||
|
tr_control <- tune_model_q(list_sampled, |
||||
|
substitute(variable), env |
||||
|
) |
||||
|
pls <- fit_pls_q(x = list_sampled, validation = TRUE, |
||||
|
variable = substitute(variable), tr_control = tr_control, env |
||||
|
) |
||||
|
stats <- evaluate_pls_q(x = list_sampled, pls_model = pls, |
||||
|
variable = substitute(variable), env = parent.frame() |
||||
|
) |
||||
|
list(data = list_sampled, p_pc = list_sampled$p_pc, |
||||
|
pls_model = pls, stats = stats$stats, p_model = stats$p_model) |
||||
|
} |
||||
|
|
@ -0,0 +1,42 @@ |
|||||
|
#' @title Predict soil properties of new spectra based on calibration models |
||||
|
#' @description |
||||
|
#' Function that uses pre-processed spectra, additional metadata of new |
||||
|
#' samples, and caret model output for the different soil property models |
||||
|
#' to create predicted values. |
||||
|
#' @param model_list List that contains caret output objects |
||||
|
#' of the different calibration models to predict (one model per soil property) |
||||
|
#' @param spectra_list List that contains spectra and additional data |
||||
|
#' after pre-processing (\code{do_pretreatment()}including metadata |
||||
|
#' (\code{sample_ID}) |
||||
|
#' @usage predict_from_spectra(model_list, spectra_list) |
||||
|
#' @export |
||||
|
predict_from_spectra <- function(model_list, spectra_list) { |
||||
|
|
||||
|
# Use extractPrediction function (caret) and supply model_list that contains |
||||
|
# caret calibration outputs; use pre-processed spectra dataset (list |
||||
|
# resulting from do_pretreatment()) |
||||
|
predictions_caret <- caret::extractPrediction( |
||||
|
models_prediction, |
||||
|
unkX = soilspec_test$MIR0 |
||||
|
) |
||||
|
|
||||
|
# Convert data.frame into long form; one sample should be represented by |
||||
|
# one single row and the predicted values of soil properties should be |
||||
|
# in the different columns |
||||
|
# Use the tidyr::spread() function (from tidyr packge) |
||||
|
# to gather columns into rows |
||||
|
# Add sample_ID column to uniquely identify observations |
||||
|
|
||||
|
# Number of caret model objects used to predict |
||||
|
n <- length(unique(predictions_caret$object)) |
||||
|
# Add sample_ID from metadata of spectra to predicted values |
||||
|
sample_ID <- spectra_list$data_meta$ID |
||||
|
# Repeat meta_data for each of the additional model rows and add |
||||
|
# ID column to long form data frame |
||||
|
id <- rep(sample_ID, n) |
||||
|
predictions_metadata <- cbind(predictions_caret, sample_ID = id) |
||||
|
# Get data into wide form |
||||
|
predictions_wide <- tidyr::spread( |
||||
|
data = predictions_metadata, key = "object", value = "pred" |
||||
|
) |
||||
|
} |
@ -0,0 +1,51 @@ |
|||||
|
#' @title Preprocess spectra |
||||
|
#' @description Use commonly used preprocessing algorithms on |
||||
|
#' the spectra. |
||||
|
#' @param list_spectra List that contains averaged spectra |
||||
|
#' in the list element called \code{MIR_mean} |
||||
|
#' @param select Character string that specifies the predefined |
||||
|
#' pretreatment options. Possible arguments are: |
||||
|
#' \code{select = "MIR0"} for Savitzky Golay smoothing filter |
||||
|
#' without derivative, \code{select = "MIR1"} for Savitky Golay |
||||
|
#' with first derivative, \code{select = "MIR2"} for Savitzky |
||||
|
#' Golay with second derivative, \code{select = "MIR0_snv"} |
||||
|
#' for Standard Normal Variate after Savitzky Golay without |
||||
|
#' derivative, and \code{select = "MIRb"} for |
||||
|
#' baseline correction. |
||||
|
#' @usage do_pretreatment(list_spectra, select) |
||||
|
#' @return list_spectra: List that contains preprocessed |
||||
|
#' spectra in element \code{MIR0} |
||||
|
#' @import hyperSpec |
||||
|
#' @export |
||||
|
do_pretreatment <- function(list_spectra, select) { |
||||
|
MIR_mean <- NULL |
||||
|
MIR_raw <- list_spectra$MIR_mean |
||||
|
# Filter the data using the Savitzky and Golay smoothing filter |
||||
|
# with a window size of 5 spectral variables and |
||||
|
# a polynomial order of 3 (no differentiation) |
||||
|
# p = polynomial order; plot variance vs polynomial order? |
||||
|
# w = window size (must be odd) |
||||
|
# m = m-th derivative of the polynomial coefficients |
||||
|
# (0 = smoothing) |
||||
|
MIR0 <- prospectr::savitzkyGolay(X = list_spectra$MIR_mean, |
||||
|
m = 0, p = 3, w = 9) # smoothing and averaging |
||||
|
MIR1 <- prospectr::savitzkyGolay(X = list_spectra$MIR_mean, |
||||
|
m = 1, p = 3, w = 5) # first derivative *** |
||||
|
MIR2 <- prospectr::savitzkyGolay(X = list_spectra$MIR_mean, |
||||
|
m = 2, p = 3, w = 5) # second derivative *** |
||||
|
# Calculate standard normal variate (SNV) after smoothing |
||||
|
MIR0_snv <- prospectr::standardNormalVariate(MIR0) |
||||
|
MIR1_snv <- prospectr::standardNormalVariate(MIR1) # added 2016-08-05 |
||||
|
# Baseline correction |
||||
|
# Compute baseline but first, create hyperSpec obj |
||||
|
spc <- new("hyperSpec", spc = as.matrix(list_spectra$MIR_mean), |
||||
|
wavelength = as.numeric(colnames(list_spectra$MIR_mean))) |
||||
|
below <- hyperSpec::spc.fit.poly.below( |
||||
|
fit.to = spc[, , 4000 ~ 900], |
||||
|
apply.to = spc, npts.min = 20, poly.order = 2) |
||||
|
spc_corr <- spc - below |
||||
|
MIRb <- spc_corr[[]] |
||||
|
pre <- select |
||||
|
list_spectra$MIR0 <- get(pre) |
||||
|
return(list_spectra) |
||||
|
} |
@ -0,0 +1,162 @@ |
|||||
|
#' @title Remove outlier spectra |
||||
|
#' @description Remove outlier spectra based on the |
||||
|
#' \code{pcout()} function of the \code{mvoutlier} package. |
||||
|
#' @usage remove_outliers(list_spectra, remove = TRUE) |
||||
|
#' @param list_spectra List that contains averaged |
||||
|
#' spectral information |
||||
|
#' in list element \code{MIR_mean} (data.frame) and metadata in |
||||
|
#' \code{data_meta} (data.frame). |
||||
|
#' @param remove logical expression (\code{TRUE} or \code{FALSE}) |
||||
|
#' that specifies weather spectra shall be removed. |
||||
|
#' If \code{rm = FALSE}, there will be no outlier removal |
||||
|
#' @return Returns list \code{spectra_out} that contains: |
||||
|
#' \itemize{ |
||||
|
#' \item \code{MIR_mean}: Outlier removed MIR spectra as |
||||
|
#' data.frame object. If \code{remove = FALSE}, |
||||
|
#' the function will |
||||
|
#' return almost identical list identical to \code{list_spectra}, |
||||
|
#' except that the first \code{indices} column of the spectral |
||||
|
#' data frame \code{MIR_mean} is removed |
||||
|
#' (This is done for both options |
||||
|
#' \code{remove = TRUE} and \code{remove = FALSE}). |
||||
|
#' \item \code{data_meta}: metadata data.frame, identical |
||||
|
#' as in the \code{list_spectra} input list. |
||||
|
#' \item \code{plot_out}: (optional) ggplot2 graph |
||||
|
#' that shows all spectra (absorbance on x-axis and wavenumber |
||||
|
#' on y-axis) with outlier marked, if |
||||
|
#' \code{remove = TRUE}. |
||||
|
#' } |
||||
|
#' @details This is an optional function if one wants to remove |
||||
|
#' outliers. |
||||
|
#' @export |
||||
|
remove_outliers <- function(list_spectra, remove = TRUE) { |
||||
|
# Outlier detection |
||||
|
# Use the mvoutlier package and pcout function to identify |
||||
|
# multivariate outliers |
||||
|
wfinal01 <- ID <- NULL |
||||
|
if (remove == TRUE) { |
||||
|
# Remove the 'indices' column |
||||
|
list_spectra$MIR_mean <- list_spectra$MIR_mean[, -1] |
||||
|
out <- mvoutlier::pcout(list_spectra$MIR_mean, makeplot = T, |
||||
|
outbound = 0.05) # parameters should be adapted |
||||
|
# Plot outlying spectra |
||||
|
plot_out <- plotMIR( |
||||
|
list_spectra$MIR_mean[ |
||||
|
order(out$wfinal01, decreasing = T), ], |
||||
|
col = as.factor(out$wfinal01[order(out$wfinal01, |
||||
|
decreasing = T)])) + |
||||
|
ggplot2::scale_colour_brewer("outlier", palette = "Set1") |
||||
|
out_id <- as.character( |
||||
|
list_spectra$data_meta$ID[!as.logical(out$wfinal01)] |
||||
|
) |
||||
|
# Remove outliers |
||||
|
MIR_mean <- list_spectra$MIR_mean[ |
||||
|
! list_spectra$data_meta$ID %in% out_id, ] |
||||
|
# rep ID and country name |
||||
|
data_meta <- list_spectra$data_meta[ |
||||
|
! list_spectra$data_meta$ID %in% out_id, ] |
||||
|
spectra_out <- list(MIR_mean = MIR_mean, |
||||
|
data_meta = data_meta, |
||||
|
plot_out = plot_out) |
||||
|
} else { |
||||
|
# Remove the 'indices' column |
||||
|
list_spectra$MIR_mean <- list_spectra$MIR_mean[, -1] |
||||
|
spectra_out <- list(MIR_mean = list_spectra$MIR_mean, |
||||
|
data_meta = list_spectra$data_meta) |
||||
|
} |
||||
|
spectra_out |
||||
|
} |
||||
|
|
||||
|
## plotMIR function of Antoine Stevens; don't export this |
||||
|
## function to the NAMESPACE |
||||
|
plotMIR <- function(spc, group = NULL, col = NULL, |
||||
|
linetype = NULL, wr = NULL, brk = NULL, |
||||
|
ylab = "Absorbance", xlab = "Wavenumber /cm-1", |
||||
|
by = NULL, by.wrap = T, ...){ |
||||
|
# Function to plot spectra, based on the ggplot2 package |
||||
|
# spc = spectral matrix, with colnames = wavelengths |
||||
|
# group = grouping variable, usually the id's of the sample |
||||
|
# wr = wavelength range to plot |
||||
|
# brk = breaks of the x-axis |
||||
|
# by = factor variable for which the mean and sd of |
||||
|
# each level will be computed and plotted (optional) |
||||
|
# Requires packages ggplot2; data.table; reshape2 |
||||
|
# Workaround to pass R CMD check: |
||||
|
# http://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when |
||||
|
# Setting the variables to NULL first |
||||
|
variable <- value <- colour <- NULL |
||||
|
spc <- as.data.frame(spc) |
||||
|
if (!is.null(wr)) |
||||
|
spc <- spc[, as.numeric(colnames(spc)) >= min(wr) & |
||||
|
as.numeric(colnames(spc)) <= max(wr)] |
||||
|
if (is.null(brk)) |
||||
|
brk <- pretty(as.numeric(colnames(spc)), n = 10) |
||||
|
if (!is.null(by)) { |
||||
|
spc$by <- by |
||||
|
spc <- data.table::data.table(spc, check.names = F) |
||||
|
mean.spc <- reshape2::melt( |
||||
|
spc[, lapply(data.table::.SD, mean), by = by], |
||||
|
id.vars = "by" |
||||
|
) |
||||
|
sd.spc <- reshape2::melt( |
||||
|
spc[, lapply(data.table::.SD, sd), by = by], |
||||
|
id.vars = "by" |
||||
|
) |
||||
|
mean.spc$min <- mean.spc$value - sd.spc$value |
||||
|
mean.spc$max <- mean.spc$value + sd.spc$value |
||||
|
mean.spc$variable <- as.numeric( |
||||
|
as.character(mean.spc$variable) |
||||
|
) |
||||
|
if (by.wrap) { |
||||
|
p <- ggplot2::ggplot(data = mean.spc) + |
||||
|
ggplot2::geom_ribbon( |
||||
|
ggplot2::aes(x = variable, ymin = min, ymax = max), |
||||
|
fill = "grey", col = "black", size = 0.15) + |
||||
|
ggplot2::theme_bw() |
||||
|
p <- p + ggplot2::geom_line( |
||||
|
ggplot2::aes(x = variable, y = value), |
||||
|
size = 0.25) + |
||||
|
ggplot2::facet_wrap(~ by) + |
||||
|
ggplot2::labs(x = xlab, y = ylab) + |
||||
|
ggplot2::scale_x_reverse(breaks = brk) |
||||
|
} else { |
||||
|
p <- ggplot2::ggplot(data = mean.spc, |
||||
|
ggplot2::aes(x = variable, y = value, group = by, col = by)) + |
||||
|
ggplot2::geom_line(size = 0.25) + |
||||
|
ggplot2::labs(x = xlab, y = ylab) + |
||||
|
ggplot2::scale_x_reverse(breaks = brk) + |
||||
|
ggplot2::theme_bw() |
||||
|
} |
||||
|
return(p) |
||||
|
} else { |
||||
|
if (is.null(group)) |
||||
|
group <- as.character(1:nrow(spc)) |
||||
|
spc$group <- group |
||||
|
spc$colour <- col |
||||
|
spc$linetype <- linetype |
||||
|
id.var <- colnames(spc)[ |
||||
|
grep("group|colour|linetype",colnames(spc))] |
||||
|
tmp <- reshape2::melt(spc, id.var = id.var) |
||||
|
tmp$variable <- as.numeric(as.character(tmp$variable)) |
||||
|
p <- ggplot2::ggplot(tmp, |
||||
|
ggplot2::aes(variable, value, group = group)) + |
||||
|
ggplot2::labs(x = xlab, y = ylab) + |
||||
|
ggplot2::theme_bw() + |
||||
|
ggplot2::scale_x_reverse(breaks = brk) |
||||
|
if (is.null(col) & is.null(linetype)) |
||||
|
p <- p + ggplot2::geom_line( |
||||
|
ggplot2::aes(colour = group)) |
||||
|
else if (!is.null(col) & is.null(linetype)) |
||||
|
p <- p + ggplot2::geom_line( |
||||
|
ggplot2::aes(colour = colour)) |
||||
|
else if (is.null(col) & !is.null(linetype)) |
||||
|
p <- p + ggplot2::geom_line( |
||||
|
ggplot2::aes(colour = group, |
||||
|
linetype = linetype)) |
||||
|
else p <- p + ggplot2::geom_line( |
||||
|
ggplot2::aes(colour = colour, |
||||
|
linetype = linetype)) |
||||
|
return(p) |
||||
|
} |
||||
|
} |
||||
|
|
@ -0,0 +1,19 @@ |
|||||
|
#' @title Resample spectra stored to new |
||||
|
#' @description Calculates model statistics for predicted (y) |
||||
|
#' vs. observed (y) values |
||||
|
#' @param list_spectra List of spectra and metadata |
||||
|
#' @param wn_lower Numerical value for lowest wavenumber in sampling interval |
||||
|
#' @param wn_upper Numerical value for highest wavenumber in sampling interval |
||||
|
#' @export |
||||
|
resample_spectra <- function( |
||||
|
list_spectra, wn_lower = 510, wn_upper = 3988, wn_interval = 2) |
||||
|
{ |
||||
|
# Create sequence of new wavenumbers |
||||
|
wn_seq <- rev(seq(from = wn_lower, wn_upper, by = wn_interval)) |
||||
|
list_spectra$MIR0 <- prospectr::resample( |
||||
|
X = list_spectra$MIR_mean, # spectral matrix to resample |
||||
|
wav = as.numeric(colnames(list_spectra$MIR_mean)), # old wavenumbers |
||||
|
new.wav = wn_seq # new wavenumbers |
||||
|
) |
||||
|
return(list_spectra) |
||||
|
} |
@ -0,0 +1,25 @@ |
|||||
|
#' @title Calculate model statistics |
||||
|
#' @description Calculates model statistics for predicted (y) |
||||
|
#' vs. observed (y) values |
||||
|
#' @param df data.frame with predicted and observed data |
||||
|
#' @param x column with observed values |
||||
|
#' @param y column with predicted values |
||||
|
#' @export |
||||
|
summary_df <- function(df, x, y){ |
||||
|
x <- df[, x] |
||||
|
y <- df[, y] |
||||
|
data.frame(rmse = sqrt(sum((x - y)^2, na.rm = T) / (length(x)-1)), |
||||
|
rmsd = mean((x - y)^2)^.5, |
||||
|
sdev = sd(x, na.rm = T), |
||||
|
rpd = sd(x,na.rm = T) / |
||||
|
sqrt(sum((x - y)^2, na.rm = T) / (length(x) - 1)), |
||||
|
rpiq = (quantile(x, .75, na.rm = T) - quantile(x, .25, na.rm = T)) / |
||||
|
sqrt(sum((x - y)^2, na.rm = T) / (length(x) - 1)), |
||||
|
r2 = cor(x, y, use = "pairwise.complete.obs")^2, |
||||
|
bias = mean(x, na.rm = T) - mean(y, na.rm = T), |
||||
|
SB = (mean(x, na.rm = T) - mean(y, na.rm = T))^2, |
||||
|
NU = var(x, na.rm = T) * (1 - lm(y ~ x)$coefficients[2])^2, |
||||
|
LC = var(y, na.rm = T) * |
||||
|
(1 - cor(x, y, use = "pairwise.complete.obs")^2), |
||||
|
n = length(x)) |
||||
|
} |
@ -0,0 +1,41 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/average-spectra.R |
||||
|
\name{average_spectra} |
||||
|
\alias{average_spectra} |
||||
|
\title{Calculate mean of spectra} |
||||
|
\usage{ |
||||
|
average_spectra(in_spectra) |
||||
|
} |
||||
|
\arguments{ |
||||
|
\item{in_spectra}{List that contains spectral data in the |
||||
|
element \code{MIR} (data.frame) and sample metadata in the |
||||
|
list element \code{data_rep} (data.frame). |
||||
|
The data.frame \code{data_meta} |
||||
|
contains the sample ID stored in the \code{ID} |
||||
|
vector (originally from spectral file names), |
||||
|
country abbreviation stored in \code{contry} (2 letters), |
||||
|
and the vector \code{site} (2 letters) that is the country |
||||
|
abbreviation.} |
||||
|
} |
||||
|
\value{ |
||||
|
\code{out_spectra}: List that contains: |
||||
|
\itemize{ |
||||
|
\item \code{data_meta}: metadata of sample (data.frame) |
||||
|
that is |
||||
|
taken from the element \code{rep} of the input list argument |
||||
|
\code{in_spectra} |
||||
|
\item \code{MIR_mean}: average spectra from replicates of |
||||
|
sample ID |
||||
|
(data.frame) |
||||
|
\item \code{MIR_sd}: standard deviation of spectra calculated |
||||
|
from replicates of sample ID (data.frame) |
||||
|
\item \code{cvar} coefficient of variance over all |
||||
|
wavenumbers of spectra |
||||
|
calculated from replicates of sample ID (vector) |
||||
|
} |
||||
|
} |
||||
|
\description{ |
||||
|
Calculate the mean of each spectral repetitions |
||||
|
(absorbance average per wavenumber) |
||||
|
} |
||||
|
|
@ -0,0 +1,31 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/pretreat-spectra.R |
||||
|
\name{do_pretreatment} |
||||
|
\alias{do_pretreatment} |
||||
|
\title{Preprocess spectra} |
||||
|
\usage{ |
||||
|
do_pretreatment(list_spectra, select) |
||||
|
} |
||||
|
\arguments{ |
||||
|
\item{list_spectra}{List that contains averaged spectra |
||||
|
in the list element called \code{MIR_mean}} |
||||
|
|
||||
|
\item{select}{Character string that specifies the predefined |
||||
|
pretreatment options. Possible arguments are: |
||||
|
\code{select = "MIR0"} for Savitzky Golay smoothing filter |
||||
|
without derivative, \code{select = "MIR1"} for Savitky Golay |
||||
|
with first derivative, \code{select = "MIR2"} for Savitzky |
||||
|
Golay with second derivative, \code{select = "MIR0_snv"} |
||||
|
for Standard Normal Variate after Savitzky Golay without |
||||
|
derivative, and \code{select = "MIRb"} for |
||||
|
baseline correction.} |
||||
|
} |
||||
|
\value{ |
||||
|
list_spectra: List that contains preprocessed |
||||
|
spectra in element \code{MIR0} |
||||
|
} |
||||
|
\description{ |
||||
|
Use commonly used preprocessing algorithms on |
||||
|
the spectra. |
||||
|
} |
||||
|
|
@ -0,0 +1,38 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/pls-modeling.R |
||||
|
\name{evaluate_pls_q} |
||||
|
\alias{evaluate_pls_q} |
||||
|
\title{Evaluate PLS performance} |
||||
|
\usage{ |
||||
|
evaluate_pls_q(x, pls_model, variable, validation = TRUE, print = TRUE, |
||||
|
env = parent.frame()) |
||||
|
} |
||||
|
\arguments{ |
||||
|
\item{x}{List that contains calibration and validation data |
||||
|
frame with combined spectral and chemical data} |
||||
|
|
||||
|
\item{pls_model}{List with PLS regression model output from |
||||
|
the caret package} |
||||
|
|
||||
|
\item{variable}{Response variable (e.g. chemical property) to be |
||||
|
modelled (needs to be non-quoted expression). \code{variable} |
||||
|
needs to be a column name in the \code{validation} data.frame |
||||
|
(element of \code{x})} |
||||
|
|
||||
|
\item{validation}{Logical expression if independent validation |
||||
|
is performed (split data set into calibration set and |
||||
|
validation set)} |
||||
|
|
||||
|
\item{print}{Print observed vs. predicted for calibration |
||||
|
and validation. Default is \code{TRUE}.} |
||||
|
|
||||
|
\item{env}{Specifiy the environment in which the function is |
||||
|
called. Default argument of \code{env} is |
||||
|
\code{parent.frame()}} |
||||
|
} |
||||
|
\description{ |
||||
|
Calculate model performance indices based |
||||
|
on observed and predicted values of validation and calibration |
||||
|
set, and internal cross-validation |
||||
|
} |
||||
|
|
@ -0,0 +1,24 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/pls-modeling.R |
||||
|
\name{fit_pls} |
||||
|
\alias{fit_pls} |
||||
|
\title{Fit a PLS regression model} |
||||
|
\usage{ |
||||
|
fit_pls(x, validation = TRUE, variable, env = parent.frame()) |
||||
|
} |
||||
|
\arguments{ |
||||
|
\item{x}{List that contains calibration |
||||
|
set, validation set, and model tuning options} |
||||
|
|
||||
|
\item{validation}{Logical expression weather independent |
||||
|
validation is performed} |
||||
|
|
||||
|
\item{variable}{Response variable to be modeled} |
||||
|
|
||||
|
\item{env}{Environment where function is evaluated} |
||||
|
} |
||||
|
\description{ |
||||
|
Uses the caret package to perform PLS modeling. |
||||
|
Spectra are centered and scaled prior to modeling. |
||||
|
} |
||||
|
|
@ -0,0 +1,28 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/pls-modeling.R |
||||
|
\name{fit_pls_q} |
||||
|
\alias{fit_pls_q} |
||||
|
\title{Fit a PLS regression model |
||||
|
(quoted version of the function)} |
||||
|
\usage{ |
||||
|
fit_pls_q(x, validation = TRUE, variable, tr_control, env = parent.frame()) |
||||
|
} |
||||
|
\arguments{ |
||||
|
\item{x}{List that contains calibration |
||||
|
set, validation set, and model tuning options} |
||||
|
|
||||
|
\item{validation}{Logical expression weather independent |
||||
|
validation is performed} |
||||
|
|
||||
|
\item{variable}{Response variable to be modeled} |
||||
|
|
||||
|
\item{tr_control}{Object that defines controlling parameters |
||||
|
of the desired internal validation framework} |
||||
|
|
||||
|
\item{env}{Environment where function is evaluated} |
||||
|
} |
||||
|
\description{ |
||||
|
Uses the caret package to perform PLS modeling. |
||||
|
Spectra are centered and scaled prior to modeling. |
||||
|
} |
||||
|
|
@ -0,0 +1,24 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/join-chem-spectra.R |
||||
|
\name{join_chem_spec} |
||||
|
\alias{join_chem_spec} |
||||
|
\title{Join chemical and spectral data frames} |
||||
|
\usage{ |
||||
|
join_chem_spec(dat_chem, dat_spec, by = "sample_ID") |
||||
|
} |
||||
|
\arguments{ |
||||
|
\item{dat_chem}{data.frame that contains chemical values of |
||||
|
the sample} |
||||
|
|
||||
|
\item{dat_spec}{List that contains spectral data} |
||||
|
|
||||
|
\item{by}{character of column name that defines sample_ID} |
||||
|
} |
||||
|
\value{ |
||||
|
List: xxx |
||||
|
} |
||||
|
\description{ |
||||
|
Combines spectral data (data.frame) and chemical |
||||
|
data (data.frame). |
||||
|
} |
||||
|
|
@ -0,0 +1,28 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/pls-modeling.R |
||||
|
\name{ken_stone} |
||||
|
\alias{ken_stone} |
||||
|
\title{Split} |
||||
|
\usage{ |
||||
|
ken_stone(spec_chem, ratio_val, pc, print = TRUE, |
||||
|
validation = TRUE) |
||||
|
} |
||||
|
\arguments{ |
||||
|
\item{spec_chem}{data.frame that contains chemical |
||||
|
and IR spectroscopy data} |
||||
|
|
||||
|
\item{ratio_val}{Ratio of number of validation and all samples.} |
||||
|
|
||||
|
\item{pc}{Number of principal components (numeric)} |
||||
|
|
||||
|
\item{print}{logical expression weather calibration} |
||||
|
|
||||
|
\item{validation}{Logical expression weather |
||||
|
calibration sampling is performed |
||||
|
(\code{TRUE} or \code{FALSE}).} |
||||
|
} |
||||
|
\description{ |
||||
|
Perform calibration sampling based on |
||||
|
the Kennard-Stones algorithm. |
||||
|
} |
||||
|
|
@ -0,0 +1,34 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/pls-modeling.R |
||||
|
\name{pls_ken_stone} |
||||
|
\alias{pls_ken_stone} |
||||
|
\title{Calibration sampling, model tuning, and PLS regression} |
||||
|
\usage{ |
||||
|
pls_ken_stone(spec_chem, ratio_val, pc, print = TRUE, validation = TRUE, |
||||
|
variable, env = parent.frame()) |
||||
|
} |
||||
|
\arguments{ |
||||
|
\item{spec_chem}{data.frame that contains IR spectroscopy |
||||
|
and chemical data} |
||||
|
|
||||
|
\item{ratio_val}{Ratio of number of validation and all samples.} |
||||
|
|
||||
|
\item{pc}{Number of Principal Components used for Calibration |
||||
|
sampling (Kennard-Stones algorithm)} |
||||
|
|
||||
|
\item{print}{Logical expression weather graphs shall be printed} |
||||
|
|
||||
|
\item{validation}{Logical expression weather independent |
||||
|
validation is performed} |
||||
|
|
||||
|
\item{variable}{Response variable (without quotes)} |
||||
|
|
||||
|
\item{env}{Environment where function is evaluated} |
||||
|
|
||||
|
\item{k}{Number of validation samples} |
||||
|
} |
||||
|
\description{ |
||||
|
Perform calibration sampling and use selected |
||||
|
calibration set for model tuning |
||||
|
} |
||||
|
|
@ -0,0 +1,22 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/predict-spectra.R |
||||
|
\name{predict_from_spectra} |
||||
|
\alias{predict_from_spectra} |
||||
|
\title{Predict soil properties of new spectra based on calibration models} |
||||
|
\usage{ |
||||
|
predict_from_spectra(model_list, spectra_list) |
||||
|
} |
||||
|
\arguments{ |
||||
|
\item{model_list}{List that contains caret output objects |
||||
|
of the different calibration models to predict (one model per soil property)} |
||||
|
|
||||
|
\item{spectra_list}{List that contains spectra and additional data |
||||
|
after pre-processing (\code{do_pretreatment()}including metadata |
||||
|
(\code{sample_ID})} |
||||
|
} |
||||
|
\description{ |
||||
|
Function that uses pre-processed spectra, additional metadata of new |
||||
|
samples, and caret model output for the different soil property models |
||||
|
to create predicted values. |
||||
|
} |
||||
|
|
@ -0,0 +1,53 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/load-spectra.R |
||||
|
\name{readOPUS} |
||||
|
\alias{readOPUS} |
||||
|
\title{Read OPUS binary and ASCII files} |
||||
|
\usage{ |
||||
|
readOPUS(fnames, in_format, out_format) |
||||
|
} |
||||
|
\arguments{ |
||||
|
\item{fnames}{character \code{vector} of the name(s) |
||||
|
(with absolute path) of the file(s) to read} |
||||
|
|
||||
|
\item{in_format}{format of the input file: \code{'binary'} or |
||||
|
\code{'txt'}} |
||||
|
|
||||
|
\item{out_format}{format of the output: |
||||
|
\code{'matrix'} (default) or \code{'list'} (see below)} |
||||
|
} |
||||
|
\value{ |
||||
|
if \code{out_format} = \code{'matrix'}, absorbance values |
||||
|
of the input file(s) in a single \code{matrix}. |
||||
|
|
||||
|
if \code{out_format} = \code{'list'}, a \code{list} of the |
||||
|
input file(s) data consisting of a \code{list} with components: |
||||
|
\itemize{ |
||||
|
\item{\code{Name}}{ name of the file imported} |
||||
|
\item{\code{datetime}}{ date and time of acquisition in |
||||
|
\code{POSIXct} format (available only when |
||||
|
\code{in_format} = 'binary')} |
||||
|
\item{\code{metadata}}{ \code{list} with information |
||||
|
on instrument configuration (available only when |
||||
|
\code{in_format} = 'binary')} |
||||
|
\item{\code{absorbance}}{ a numeric \code{vector} |
||||
|
of absorbance values} |
||||
|
\item{\code{wavenumbers}}{ numeric \code{vector} |
||||
|
of the band positions} |
||||
|
} |
||||
|
} |
||||
|
\description{ |
||||
|
Read single or multiple binary and ASCII files acquired with |
||||
|
an Bruker Vertex FTIR Instrument |
||||
|
} |
||||
|
\note{ |
||||
|
This is essentially a re-factored and simplified version of |
||||
|
the \code{read.opus} function from the |
||||
|
\sQuote{soil.spec} package for reading OPUS VERTEX files |
||||
|
The function should also work for other OPUS files (eg alpha), |
||||
|
see \code{read.opus}. |
||||
|
} |
||||
|
\author{ |
||||
|
Antoine Stevens and Andrew Sila (soil.spec package) |
||||
|
} |
||||
|
|
@ -0,0 +1,16 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/load-spectra.R |
||||
|
\name{readOPUS_bin} |
||||
|
\alias{readOPUS_bin} |
||||
|
\title{Read an OPUS binary file} |
||||
|
\usage{ |
||||
|
readOPUS_bin(file.name) |
||||
|
} |
||||
|
\arguments{ |
||||
|
\item{file.name}{Character vector with path to files} |
||||
|
} |
||||
|
\description{ |
||||
|
Read single binary file acquired with an |
||||
|
Bruker Vertex FTIR Instrument |
||||
|
} |
||||
|
|
@ -0,0 +1,17 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/load-spectra.R |
||||
|
\name{readOPUS_text} |
||||
|
\alias{readOPUS_text} |
||||
|
\title{Read an OPUS text file} |
||||
|
\usage{ |
||||
|
readOPUS_text(file.name) |
||||
|
} |
||||
|
\arguments{ |
||||
|
\item{file.name}{Character vector with path to files} |
||||
|
} |
||||
|
\description{ |
||||
|
Read single text file acquired with |
||||
|
an Bruker Vertex FTIR Instrument |
||||
|
(as exported from OPUS software) |
||||
|
} |
||||
|
|
@ -0,0 +1,48 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/load-spectra-yamsys.R |
||||
|
\name{read_spectra} |
||||
|
\alias{read_spectra} |
||||
|
\title{Read an OPUS text file and extract metadata} |
||||
|
\usage{ |
||||
|
read_spectra(path) |
||||
|
} |
||||
|
\arguments{ |
||||
|
\item{path}{character of the directory |
||||
|
where the spectral text files are stored} |
||||
|
} |
||||
|
\value{ |
||||
|
List that contains the following elements: |
||||
|
\itemize{ |
||||
|
\item \code{MIR}: data.frame that contains all the spectra. |
||||
|
The columns of \code{MIR} contain absorbance values at |
||||
|
different wavenumber in the MIR range. The wavenumbers |
||||
|
rounded to 0.1 are given as column names. The original file |
||||
|
names are stored as row names. One line in the data frame |
||||
|
\code{MIR} contains one replicate scan of a sample. |
||||
|
\item \code{data_rep}: data.frame that constists of sample |
||||
|
metadata that was extracted from the file name of |
||||
|
individual spectral files. The first vector \code{ID} |
||||
|
contains the spectral file name without the repetition number |
||||
|
supplied as \code{.<number>} in the file name. |
||||
|
Letters 1 to 2 of the spectral |
||||
|
file name are used for the country abbreviation, stored |
||||
|
as in the \code{} vector \code{data_rep} . Letters |
||||
|
4 to 5 of the file name are used for the landscape (site) |
||||
|
abbreviation. |
||||
|
} |
||||
|
} |
||||
|
\description{ |
||||
|
Read single text file acquired with |
||||
|
an Bruker Vertex FTIR Instrument |
||||
|
(as exported from OPUS software) and extract sample metadata |
||||
|
provided in the filename |
||||
|
} |
||||
|
\note{ |
||||
|
: This function is derived from a re-factored and |
||||
|
simplified version of the \code{read.opus} function from the |
||||
|
\sQuote{soil.spec} package for reading OPUS VERTEX files |
||||
|
The function should also work for other OPUS files (eg alpha), |
||||
|
see \code{read.opus}. The function readOPUS() was |
||||
|
written by Antoine Stevens. |
||||
|
} |
||||
|
|
@ -0,0 +1,46 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/remove-outl-spectra.R |
||||
|
\name{remove_outliers} |
||||
|
\alias{remove_outliers} |
||||
|
\title{Remove outlier spectra} |
||||
|
\usage{ |
||||
|
remove_outliers(list_spectra, remove = TRUE) |
||||
|
} |
||||
|
\arguments{ |
||||
|
\item{list_spectra}{List that contains averaged |
||||
|
spectral information |
||||
|
in list element \code{MIR_mean} (data.frame) and metadata in |
||||
|
\code{data_meta} (data.frame).} |
||||
|
|
||||
|
\item{remove}{logical expression (\code{TRUE} or \code{FALSE}) |
||||
|
that specifies weather spectra shall be removed. |
||||
|
If \code{rm = FALSE}, there will be no outlier removal} |
||||
|
} |
||||
|
\value{ |
||||
|
Returns list \code{spectra_out} that contains: |
||||
|
\itemize{ |
||||
|
\item \code{MIR_mean}: Outlier removed MIR spectra as |
||||
|
data.frame object. If \code{remove = FALSE}, |
||||
|
the function will |
||||
|
return almost identical list identical to \code{list_spectra}, |
||||
|
except that the first \code{indices} column of the spectral |
||||
|
data frame \code{MIR_mean} is removed |
||||
|
(This is done for both options |
||||
|
\code{remove = TRUE} and \code{remove = FALSE}). |
||||
|
\item \code{data_meta}: metadata data.frame, identical |
||||
|
as in the \code{list_spectra} input list. |
||||
|
\item \code{plot_out}: (optional) ggplot2 graph |
||||
|
that shows all spectra (absorbance on x-axis and wavenumber |
||||
|
on y-axis) with outlier marked, if |
||||
|
\code{remove = TRUE}. |
||||
|
} |
||||
|
} |
||||
|
\description{ |
||||
|
Remove outlier spectra based on the |
||||
|
\code{pcout()} function of the \code{mvoutlier} package. |
||||
|
} |
||||
|
\details{ |
||||
|
This is an optional function if one wants to remove |
||||
|
outliers. |
||||
|
} |
||||
|
|
@ -0,0 +1,21 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/resample-spectra.R |
||||
|
\name{resample_spectra} |
||||
|
\alias{resample_spectra} |
||||
|
\title{Resample spectra stored to new} |
||||
|
\usage{ |
||||
|
resample_spectra(list_spectra, wn_lower = 510, wn_upper = 3988, |
||||
|
wn_interval = 2) |
||||
|
} |
||||
|
\arguments{ |
||||
|
\item{list_spectra}{List of spectra and metadata} |
||||
|
|
||||
|
\item{wn_lower}{Numerical value for lowest wavenumber in sampling interval} |
||||
|
|
||||
|
\item{wn_upper}{Numerical value for highest wavenumber in sampling interval} |
||||
|
} |
||||
|
\description{ |
||||
|
Calculates model statistics for predicted (y) |
||||
|
vs. observed (y) values |
||||
|
} |
||||
|
|
@ -0,0 +1,20 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/spectra-utils.R |
||||
|
\name{summary_df} |
||||
|
\alias{summary_df} |
||||
|
\title{Calculate model statistics} |
||||
|
\usage{ |
||||
|
summary_df(df, x, y) |
||||
|
} |
||||
|
\arguments{ |
||||
|
\item{df}{data.frame with predicted and observed data} |
||||
|
|
||||
|
\item{x}{column with observed values} |
||||
|
|
||||
|
\item{y}{column with predicted values} |
||||
|
} |
||||
|
\description{ |
||||
|
Calculates model statistics for predicted (y) |
||||
|
vs. observed (y) values |
||||
|
} |
||||
|
|
@ -0,0 +1,24 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/pls-modeling.R |
||||
|
\name{tune_model} |
||||
|
\alias{tune_model} |
||||
|
\title{Perform model tuning} |
||||
|
\usage{ |
||||
|
tune_model(x, variable, env = parent.frame(), validation = TRUE) |
||||
|
} |
||||
|
\arguments{ |
||||
|
\item{x}{list from calibration sampling} |
||||
|
|
||||
|
\item{variable}{response variable for PLS regression, supplied |
||||
|
as character expression} |
||||
|
|
||||
|
\item{env}{Environment where function is evaluated} |
||||
|
|
||||
|
\item{validation}{Logical expression weather an independent |
||||
|
validation is performed.} |
||||
|
} |
||||
|
\description{ |
||||
|
Uses function from caret to to model tuning |
||||
|
for PLS regression. |
||||
|
} |
||||
|
|
@ -0,0 +1,24 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/pls-modeling.R |
||||
|
\name{tune_model_q} |
||||
|
\alias{tune_model_q} |
||||
|
\title{Perform model tuning} |
||||
|
\usage{ |
||||
|
tune_model_q(x, variable, env = parent.frame(), validation = TRUE) |
||||
|
} |
||||
|
\arguments{ |
||||
|
\item{x}{list from calibration sampling} |
||||
|
|
||||
|
\item{variable}{response variable for PLS regression, supplied |
||||
|
as character expression} |
||||
|
|
||||
|
\item{env}{Environment where function is evaluated} |
||||
|
|
||||
|
\item{validation}{Logical expression weather an independent |
||||
|
validation is performed.} |
||||
|
} |
||||
|
\description{ |
||||
|
Uses function from caret to to model tuning |
||||
|
for PLS regression. |
||||
|
} |
||||
|
|
@ -0,0 +1,21 @@ |
|||||
|
Version: 1.0 |
||||
|
|
||||
|
RestoreWorkspace: Default |
||||
|
SaveWorkspace: Default |
||||
|
AlwaysSaveHistory: Default |
||||
|
|
||||
|
EnableCodeIndexing: Yes |
||||
|
UseSpacesForTab: Yes |
||||
|
NumSpacesForTab: 2 |
||||
|
Encoding: UTF-8 |
||||
|
|
||||
|
RnwWeave: Sweave |
||||
|
LaTeX: pdfLaTeX |
||||
|
|
||||
|
AutoAppendNewline: Yes |
||||
|
StripTrailingWhitespace: Yes |
||||
|
|
||||
|
BuildType: Package |
||||
|
PackageUseDevtools: Yes |
||||
|
PackageInstallArgs: --no-multiarch --with-keep.source |
||||
|
PackageRoxygenize: rd,collate,namespace,vignette |
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
Some files were not shown because too many files changed in this diff
Loading…
Reference in new issue