Add first locally developed version of simplerspec package to github

pull/3/head
commit 7ddbe3ec0e by Philipp Baumann, 4 years ago
320 changed files with 491036 additions and 0 deletions
  1. .Rbuildignore (+2 -0)
  2. .gitignore (+4 -0)
  3. DESCRIPTION (+27 -0)
  4. NAMESPACE (+23 -0)
  5. R/average-spectra.R (+85 -0)
  6. R/join-chem-spectra.R (+34 -0)
  7. R/load-spectra-yamsys.R (+71 -0)
  8. R/load-spectra.R (+310 -0)
  9. R/pls-modeling.R (+466 -0)
  10. R/predict-spectra.R (+42 -0)
  11. R/pretreat-spectra.R (+51 -0)
  12. R/remove-outl-spectra.R (+162 -0)
  13. R/resample-spectra.R (+19 -0)
  14. R/spectra-utils.R (+25 -0)
  15. man/average_spectra.Rd (+41 -0)
  16. man/do_pretreatment.Rd (+31 -0)
  17. man/evaluate_pls_q.Rd (+38 -0)
  18. man/fit_pls.Rd (+24 -0)
  19. man/fit_pls_q.Rd (+28 -0)
  20. man/join_chem_spec.Rd (+24 -0)
  21. man/ken_stone.Rd (+28 -0)
  22. man/pls_ken_stone.Rd (+34 -0)
  23. man/predict_from_spectra.Rd (+22 -0)
  24. man/readOPUS.Rd (+53 -0)
  25. man/readOPUS_bin.Rd (+16 -0)
  26. man/readOPUS_text.Rd (+17 -0)
  27. man/read_spectra.Rd (+48 -0)
  28. man/remove_outliers.Rd (+46 -0)
  29. man/resample_spectra.Rd (+21 -0)
  30. man/summary_df.Rd (+20 -0)
  31. man/tune_model.Rd (+24 -0)
  32. man/tune_model_q.Rd (+24 -0)
  33. simplerspec.Rproj (+21 -0)
  34. tests/.Rapp.history (+0 -0)
  35. tests/testdata/soilchem/soilchem_YAMSYS.csv (+95 -0)
  36. tests/testdata/soilspec/BF_lo_01_soil_cal.0.txt (+1716 -0)
  37. tests/testdata/soilspec/BF_lo_01_soil_cal.1.txt (+1716 -0)
  38. tests/testdata/soilspec/BF_lo_01_soil_cal.2.txt (+1716 -0)
  39. tests/testdata/soilspec/BF_lo_02_soil_cal.0.txt (+1716 -0)
  40. tests/testdata/soilspec/BF_lo_02_soil_cal.1.txt (+1716 -0)
  41. tests/testdata/soilspec/BF_lo_02_soil_cal.2.txt (+1716 -0)
  42. tests/testdata/soilspec/BF_lo_03_soil_cal.0.txt (+1716 -0)
  43. tests/testdata/soilspec/BF_lo_03_soil_cal.1.txt (+1716 -0)
  44. tests/testdata/soilspec/BF_lo_03_soil_cal.2.txt (+1716 -0)
  45. tests/testdata/soilspec/BF_lo_04_soil_cal.0.txt (+1716 -0)
  46. tests/testdata/soilspec/BF_lo_04_soil_cal.1.txt (+1716 -0)
  47. tests/testdata/soilspec/BF_lo_04_soil_cal.2.txt (+1716 -0)
  48. tests/testdata/soilspec/BF_lo_05_soil_cal.0.txt (+1716 -0)
  49. tests/testdata/soilspec/BF_lo_05_soil_cal.1.txt (+1716 -0)
  50. tests/testdata/soilspec/BF_lo_05_soil_cal.2.txt (+1716 -0)
  51. tests/testdata/soilspec/BF_lo_06_soil_cal.0.txt (+1716 -0)
  52. tests/testdata/soilspec/BF_lo_06_soil_cal.1.txt (+1716 -0)
  53. tests/testdata/soilspec/BF_lo_06_soil_cal.2.txt (+1716 -0)
  54. tests/testdata/soilspec/BF_lo_07_soil_cal.0.txt (+1716 -0)
  55. tests/testdata/soilspec/BF_lo_07_soil_cal.1.txt (+1716 -0)
  56. tests/testdata/soilspec/BF_lo_07_soil_cal.2.txt (+1716 -0)
  57. tests/testdata/soilspec/BF_lo_08_soil_cal.0.txt (+1716 -0)
  58. tests/testdata/soilspec/BF_lo_08_soil_cal.1.txt (+1716 -0)
  59. tests/testdata/soilspec/BF_lo_08_soil_cal.2.txt (+1716 -0)
  60. tests/testdata/soilspec/BF_lo_09_soil_cal.0.txt (+1716 -0)
  61. tests/testdata/soilspec/BF_lo_09_soil_cal.1.txt (+1716 -0)
  62. tests/testdata/soilspec/BF_lo_09_soil_cal.2.txt (+1716 -0)
  63. tests/testdata/soilspec/BF_lo_10_soil_cal.0.txt (+1716 -0)
  64. tests/testdata/soilspec/BF_lo_10_soil_cal.1.txt (+1716 -0)
  65. tests/testdata/soilspec/BF_lo_10_soil_cal.2.txt (+1716 -0)
  66. tests/testdata/soilspec/BF_lo_11_soil_cal.0.txt (+1716 -0)
  67. tests/testdata/soilspec/BF_lo_11_soil_cal.1.txt (+1716 -0)
  68. tests/testdata/soilspec/BF_lo_11_soil_cal.2.txt (+1716 -0)
  69. tests/testdata/soilspec/BF_lo_12_soil_cal.0.txt (+1716 -0)
  70. tests/testdata/soilspec/BF_lo_12_soil_cal.1.txt (+1716 -0)
  71. tests/testdata/soilspec/BF_lo_12_soil_cal.2.txt (+1716 -0)
  72. tests/testdata/soilspec/BF_lo_13_soil_cal.3.txt (+1716 -0)
  73. tests/testdata/soilspec/BF_lo_13_soil_cal.4.txt (+1716 -0)
  74. tests/testdata/soilspec/BF_lo_13_soil_cal.5.txt (+1716 -0)
  75. tests/testdata/soilspec/BF_lo_14_soil_cal.0.txt (+1716 -0)
  76. tests/testdata/soilspec/BF_lo_14_soil_cal.1.txt (+1716 -0)
  77. tests/testdata/soilspec/BF_lo_14_soil_cal.2.txt (+1716 -0)
  78. tests/testdata/soilspec/BF_lo_15_soil_cal.0.txt (+1716 -0)
  79. tests/testdata/soilspec/BF_lo_15_soil_cal.1.txt (+1716 -0)
  80. tests/testdata/soilspec/BF_lo_15_soil_cal.2.txt (+1716 -0)
  81. tests/testdata/soilspec/BF_lo_16_soil_cal.0.txt (+1716 -0)
  82. tests/testdata/soilspec/BF_lo_16_soil_cal.1.txt (+1716 -0)
  83. tests/testdata/soilspec/BF_lo_16_soil_cal.2.txt (+1716 -0)
  84. tests/testdata/soilspec/BF_lo_17_soil_cal.0.txt (+1716 -0)
  85. tests/testdata/soilspec/BF_lo_17_soil_cal.1.txt (+1716 -0)
  86. tests/testdata/soilspec/BF_lo_17_soil_cal.2.txt (+1716 -0)
  87. tests/testdata/soilspec/BF_lo_18_soil_cal.0.txt (+1716 -0)
  88. tests/testdata/soilspec/BF_lo_18_soil_cal.1.txt (+1716 -0)
  89. tests/testdata/soilspec/BF_lo_18_soil_cal.2.txt (+1716 -0)
  90. tests/testdata/soilspec/BF_lo_19_soil_cal.0.txt (+1716 -0)
  91. tests/testdata/soilspec/BF_lo_19_soil_cal.1.txt (+1716 -0)
  92. tests/testdata/soilspec/BF_lo_19_soil_cal.2.txt (+1716 -0)
  93. tests/testdata/soilspec/BF_lo_20_soil_cal.0.txt (+1716 -0)
  94. tests/testdata/soilspec/BF_lo_20_soil_cal.1.txt (+1716 -0)
  95. tests/testdata/soilspec/BF_lo_20_soil_cal.2.txt (+1716 -0)
  96. tests/testdata/soilspec/BF_mo_01_soil_cal.0.txt (+1716 -0)
  97. tests/testdata/soilspec/BF_mo_01_soil_cal.1.txt (+1716 -0)
  98. tests/testdata/soilspec/BF_mo_01_soil_cal.2.txt (+1716 -0)
  99. tests/testdata/soilspec/BF_mo_02_soil_cal.0.txt (+1716 -0)
  100. tests/testdata/soilspec/BF_mo_02_soil_cal.1.txt (+1716 -0)

.Rbuildignore (+2 -0)

@@ -0,0 +1,2 @@
^.*\.Rproj$
^\.Rproj\.user$

.gitignore (+4 -0)

@@ -0,0 +1,4 @@
.Rproj.user
.Rhistory
.RData
.Ruserdata

DESCRIPTION (+27 -0)

@@ -0,0 +1,27 @@
Package: simplerspec
Type: Package
Title: Soil and plant spectroscopic model building and prediction
Depends: R (>= 3.2)
Imports:
ggplot2 (>= 2.0.0),
plyr,
data.table,
reshape2,
mvoutlier,
hexView,
Rcpp,
hyperSpec,
prospectr,
dplyr,
caret,
tidyr
Version: 0.1.0
Authors@R: person("Philipp", "Baumann",
email = "baumanph@student.ethz.ch", role = c("aut", "cre"))
Description: Functions that cover
reading of spectral data, outlier removal,
spectral preprocessing, calibration sampling, PLS regression
using caret, and model diagnostic statistics and plots.
License: GPL-2
LazyData: TRUE
RoxygenNote: 5.0.1
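Given these dependencies, a development install of this first version could look like the sketch below; the GitHub repository path is an assumption based on the author's account and is not stated in this commit, so adjust it to the actual remote.

# Hedged install sketch; "philipp-baumann/simplerspec" is an assumed path
# install.packages("devtools")
devtools::install_github("philipp-baumann/simplerspec")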

NAMESPACE (+23 -0)

@@ -0,0 +1,23 @@
# Generated by roxygen2: do not edit by hand
export(average_spectra)
export(do_pretreatment)
export(evaluate_pls_q)
export(fit_pls)
export(fit_pls_q)
export(join_chem_spec)
export(ken_stone)
export(pls_ken_stone)
export(predict_from_spectra)
export(readOPUS)
export(readOPUS_bin)
export(readOPUS_text)
export(read_spectra)
export(remove_outliers)
export(resample_spectra)
export(summary_df)
export(tune_model)
export(tune_model_q)
import(Rcpp)
import(data.table)
import(hyperSpec)

R/average-spectra.R (+85 -0)

@@ -0,0 +1,85 @@
# Helper function written by Antoine Stevens that
# is used for averaging replication scans of one sample
#' @import data.table
#' @import Rcpp
by_spc <- function(spc, indices, fun = mean){
# Fast summary of spectral data
# spc = spectral matrix
# indices = factor variable used to summarize data
# fun = summary function
# Avoid NOTE "no visible binding for global variable '.SD'"
# when checking package by devtools::check()
# .SD <- NULL
spc <- data.table::data.table(indices, spc, check.names = F)
if(is.null(ncol(indices))){
x <- 1
} else {
x <- ncol(indices)
}
as.data.frame(spc[, lapply(.SD, fun),
by = eval(names(spc)[1:x])])
}
#' @title Calculate mean of spectra
#' @description Calculate the mean of the spectral replicates
#' of each sample (average absorbance per wavenumber)
#' @param in_spectra List that contains spectral data in the
#' element \code{MIR} (data.frame) and sample metadata in the
#' list element \code{data_rep} (data.frame).
#' The data.frame \code{data_rep}
#' contains the sample ID stored in the \code{ID}
#' vector (originally from spectral file names),
#' the country abbreviation stored in \code{country} (2 letters),
#' and the vector \code{site} (2 letters) that is the landscape
#' (site) abbreviation.
#' @return \code{out_spectra}: List that contains:
#' \itemize{
#' \item \code{data_meta}: metadata of the samples (data.frame)
#' that is
#' taken from the element \code{data_rep} of the input list
#' argument \code{in_spectra}
#' \item \code{MIR_mean}: average spectra from replicates of
#' sample ID
#' (data.frame)
#' \item \code{MIR_sd}: standard deviation of spectra calculated
#' from replicates of sample ID (data.frame)
#' \item \code{cvar}: coefficient of variation over all
#' wavenumbers of spectra
#' calculated from replicates of sample ID (vector)
#' }
#' @export
average_spectra <- function(in_spectra) {
# Compute mean per sample with by_spc,
# provided by Antoine
# spc = spectral data
# indices = character vector(s) or factor(s) to group the rows by
# fun = summary function
# Also compute the standard deviation (SD)
# of the three measurements
# Identify samples whose spectra have an SD higher than 1.5
# and may need to be re-scanned
MIR <- NULL
data_rep <- NULL
ID <- NULL
MIR_mean <- by_spc(spc = in_spectra$MIR,
indices = in_spectra$data_rep$ID, fun = mean)
MIR_sd <- by_spc(spc = in_spectra$MIR,
indices = in_spectra$data_rep$ID, fun = sd)
# MIR_sd[order(rowMeans(MIR_sd[, -1])), 1]
# rowMeans(MIR_sd[, -1])[order(rowMeans(MIR_sd[, -1]))] /
# rowMeans(MIR_mean[, -1])[order(rowMeans(MIR_sd[, -1]))]
# compute the coefficient of variation
cvar <- rowMeans(MIR_sd[, -1])/rowMeans(MIR_mean[, -1])
# add metadata for each sample; take strings of rownames
# from spectra; first two characters of the sample_ID code
# is country, character pos. 4 and 5 is site abbreviation
data_meta <- data.frame(ID = MIR_mean[, 1])
data_meta <- cbind(data_meta,
country = substring(data_meta$ID, first = 1, last = 2),
site = substring(data_meta$ID, first = 4, last = 5)
)
out_spectra <- list(data_meta = data_meta,
MIR_mean = MIR_mean,
MIR_sd = MIR_sd,
cvar = cvar)
return(out_spectra)
}
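A minimal usage sketch for the averaging step, assuming the spectra were first read with read_spectra() (defined in R/load-spectra-yamsys.R below) from the OPUS text exports shipped under tests/testdata:

# Read replicate scans and average them per sample ID
spec_list <- read_spectra(path = "tests/testdata/soilspec")
spec_avg <- average_spectra(spec_list)
str(spec_avg$MIR_mean)  # mean spectrum per sample
summary(spec_avg$cvar)  # replicate variability per sample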

R/join-chem-spectra.R (+34 -0)

@@ -0,0 +1,34 @@
## Join chemical and spectral data ==============================
#' @title Join chemical and spectral data frames
#' @description Combines spectral data (data.frame) and chemical
#' data (data.frame).
#' @param dat_chem data.frame that contains chemical values of
#' the sample
#' @param dat_spec List that contains spectral data
#' @param by character; name of the column that contains the
#' sample ID
#' @return data.frame that contains the chemical data joined
#' to the spectral data by sample ID (inner join)
#' @export
join_chem_spec <- function(
dat_chem, dat_spec, by = "sample_ID") {
# Alternative when "no visible binding for global variable":
data_meta <- MIR <- MIR0 <- ori <- MIR_mean <- NULL
# http://stackoverflow.com/questions/23475309/in-r-is-it-possible-to-suppress-note-no-visible-binding-for-global-variable
# Replace sample_ID by ID
if(!is.data.frame(dat_chem)) {
stop("dat_chem needs to be a data.frame", call. = FALSE)
} else {
colnames(dat_chem)[colnames(dat_chem) == by] <- "ID"
dat_chem$ID <- as.factor(dat_chem$ID)
# Select only chemical data that have no outlier spectra
dat_chem <- dat_chem[dat_spec$data_meta$ID, ]
ID <- as.factor(dat_spec$data_meta$ID)
# Join ref analyses
MIRdata <- data.frame(ID = ID)
MIRdata$MIR <- dat_spec$MIR0
MIRdata$ori <- dat_spec$MIR_mean
# Joining by ID, type = "inner"
MIRdata_chem <- plyr::join(dat_chem, MIRdata, type = "inner")
# before dplyr::inner_join(dat_chem, MIRdata)
MIRdata_chem
}
}
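A hedged usage sketch: the CSV is the test data added in this commit, but the sample_ID column name inside it is an assumption, and spec_pre stands for the output of the outlier-removal and pre-treatment steps shown further below.

# Join reference chemistry to processed spectra by sample ID
soilchem <- read.csv("tests/testdata/soilchem/soilchem_YAMSYS.csv")
spec_chem <- join_chem_spec(dat_chem = soilchem, dat_spec = spec_pre,
  by = "sample_ID")  # returns a data.frame (inner join)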

R/load-spectra-yamsys.R (+71 -0)

@@ -0,0 +1,71 @@
## Function 1: Read spectra in text form ========================
#' @title Read an OPUS text file and extract metadata
#' @description
#' Read single text file acquired with
#' a Bruker Vertex FTIR Instrument
#' (as exported from OPUS software) and extract sample metadata
#' provided in the filename
#' @usage
#' read_spectra(path)
#' @param path character of the directory
#' where the spectral text files are stored
#' @return
#' List that contains the following elements:
#' \itemize{
#' \item \code{MIR}: data.frame that contains all the spectra.
#' The columns of \code{MIR} contain absorbance values at
#' different wavenumber in the MIR range. The wavenumbers
#' rounded to 0.1 are given as column names. The original file
#' names are stored as row names. One line in the data frame
#' \code{MIR} contains one replicate scan of a sample.
#' \item \code{data_rep}: data.frame that consists of sample
#' metadata that was extracted from the file name of
#' individual spectral files. The first vector \code{ID}
#' contains the spectral file name without the repetition number
#' supplied as \code{.<number>} in the file name.
#' Letters 1 to 2 of the spectral
#' file name are used for the country abbreviation, stored
#' in the \code{country} vector of \code{data_rep}. Letters
#' 4 to 5 of the file name are used for the landscape (site)
#' abbreviation, stored in the \code{site} vector.
#' }
#' @note This function is derived from a re-factored and
#' simplified version of the \code{read.opus} function from the
#' \sQuote{soil.spec} package for reading OPUS VERTEX files.
#' The function should also work for other OPUS files (e.g. ALPHA),
#' see \code{read.opus}. The function \code{readOPUS()} was
#' written by Antoine Stevens.
#' @export
read_spectra <- function(path){
# Needs various utilities for spectral processing that Antoine
# put on github
ID <- NULL
# Load the MIR data exported from OPUS to txt files
# List files in the directory
lf <- list.files(path, full.names = TRUE)
# Read files into R with readOPUS()
# (comes from the github file)
MIR <- readOPUS(lf, in_format = "txt")
# Wavenumber, from 3996.4 to 599.8 cm-1
colnames(MIR) <- round(as.numeric(colnames(MIR)), 1)
# Remove the txt extension
rownames(MIR) <- sub("\\.txt", "", row.names(MIR))
# Prepare a dataset with ID,
# Extract country with substring
# and repetition
# (ID = character before the dot; rep = number after the dot)
data_rep <- data.frame(ID = sub("(.+)\\.[[:digit:]]+$", "\\1",
row.names(MIR)),
rep = as.numeric(sub(".+\\.([[:digit:]])+$", "\\1",
row.names(MIR)))
)
data_rep <- cbind(data_rep,
country = substring(data_rep$ID, first = 1, last = 2),
site = substring(data_rep$ID, first = 4, last = 5)
)
list_spectra <- list(
MIR = MIR,
data_rep = data_rep
)
return(list_spectra)
}
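A short sketch reading the OPUS text exports that ship with this commit:

spec_list <- read_spectra(path = "tests/testdata/soilspec")
head(spec_list$data_rep)  # ID, rep, country, and site per scan
dim(spec_list$MIR)        # one row per replicate scan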

R/load-spectra.R (+310 -0)

@@ -0,0 +1,310 @@
## Soil spectroscopy related functions that were compiled by
## Antoine Stevens ==============================================
#' @title Read an OPUS text file
#' @description
#' Read single text file acquired with
#' a Bruker Vertex FTIR Instrument
#' (as exported from OPUS software)
#' @param file.name Character vector with path to files
#' @usage readOPUS_text(file.name)
#' @export
readOPUS_text <- function(file.name){
if (file.exists(file.name)) {
out <- read.csv(file.name, header = FALSE,
col.names = c("wavenumber", "absorbance")
)
return(out)
} else {
warning(paste("File", file.name, "does not exist"))
}
}
#' @title Read an OPUS binary file
#' @description
#' Read single binary file acquired with a
#' Bruker Vertex FTIR Instrument
#' @param file.name Character vector with path to files
#' @usage readOPUS_bin(file.name)
#' @export
readOPUS_bin <- function(file.name){
size <- fileRaw <- NULL
if (file.exists(file.name)) {
pa <- try(
hexView::readRaw(file.name, offset = 0,
nbytes = file.info(file.name)$size, human = "char",
size = 1, endian = "little"), silent = TRUE)
# Test the try() result directly; .Last.value is unreliable here
if (!inherits(pa, "try-error")) {
pr <- pa$fileRaw
# Get source of instrument
ins <- grepRaw("INS", pr, all = TRUE)
ins <- hexView::readRaw(
file.name, offset = ins[length(ins)] + 7,
nbytes = 3, human = "char", size = 1, endian = "little"
)
ins <- hexView::blockString(ins)
# Get source of infrared to know if NIR or MIR
src <- grepRaw("SRC", pr, all = TRUE)
src <- hexView::readRaw(
file.name, offset = src[length(src)] + 4,
nbytes = 3, human = "char", size = 1, endian = "little"
)
src <- hexView::blockString(src)
instr.range <- tolower(paste(ins, src, sep = "-"))
# Get Beam Splitter
bms <- grepRaw("BMS", pr, all = TRUE)
bms <- hexView::readRaw(
file.name, offset = bms[length(bms)] + 4,
nbytes = 4, human = "char", size = 1, endian = "little"
)
bms <- hexView::blockString(bms)
z <- grepRaw("ZFF", pr, all = TRUE)[1] + 5
re <- grepRaw("RES", pr, all = TRUE)[1] + 5
snm <- grepRaw("SNM", pr, all = TRUE)[1] + 7
lwn <- grepRaw("LWN", pr, all = TRUE)[1] + 7
fx <- grepRaw("FXV", pr, all = TRUE)[3] + 7
lx <- grepRaw("LXV", pr, all = TRUE)[3] + 7
npt0 <- grepRaw("NPT", pr, all = TRUE)[2] + 3
npt1 <- grepRaw("NPT", pr, all = TRUE)[3] + 7
mxy <- grepRaw("MXY", pr, all = TRUE)[1] + 7
mny <- grepRaw("MNY", pr, all = TRUE)[3] + 7
end <- grepRaw("END", pr, all = TRUE) + 11
dat <- grepRaw( "DAT", pr, all = TRUE)[1] + 7
tim <- grepRaw("TIM", pr, all = TRUE) + 11
# calculate end and start of each block
offs <- end[5:10]
byts <- diff(offs)
ZFF <- hexView::readRaw(file.name, offset = z, nbytes = 4,
human = "int", size = 2)[[5]][1]
RES <- hexView::readRaw(file.name, offset = re, nbytes = 4,
human = "int", size = 2)[[5]][1]
snm.lab.material <- hexView::blockString(
hexView::readRaw(file.name, offset = snm, nbytes = 22,
human = "char", size = 1, endian = "little")
)
if (!nzchar(snm.lab.material)) {
SSN <- ""
Material <- ""
warning("Product name not found inside OPUS file...")
}
else {
if (!length(grep(snm.lab.material, pattern = ";")) == 0) {
snm.lab.material <- as.vector(
strsplit(snm.lab.material, ";")
)[[1]]
SSN <- paste0(snm.lab.material[2], snm.lab.material[1])
Material <- snm.lab.material[3]
} else {
if (!length(grep(snm.lab.material, pattern = "_")) == 0) {
# Don't remove "_" from unique id SSN (@baumann)
# SSN <- sub("_", "", snm.lab.material)
SSN <- snm.lab.material
Material <- ""
} else {
if (!length(snm.lab.material) == 0) {
SSN <- snm.lab.material
Material <- ""
}
}
}
}
# Set three SSN first three characters to lower
# Don't convert to lowercase
# SSN <- paste0(tolower(substr(SSN, 1, 3)),
# substr(SSN, 4, 20))
Scandate <- hexView::blockString(
hexView::readRaw(file.name, offset = dat,
nbytes = 10, human = "char", size = 1,
endian = "little")
)
Scantime <- hexView::blockString(
hexView::readRaw(file.name,
offset = tim[2] - 4, nbytes = 8, human = "char",
size = 1, endian = "little")
)
Scandate <- paste(Scandate, Scantime)
LWN <- hexView::readRaw(
file.name, offset = lwn, nbytes = 8,
human = "real", size = 8)[[5]][1]
# Combine the above parameters
spectrum.meta <- c(SSN, Material, Scandate, ZFF, RES, LWN)
# Get number of data points for each spectra data block
NPT0 <- hexView::readRaw(
file.name, offset = npt0, nbytes = 12,
human = "int", size = 4)[[5]][2]
NPT1 <- hexView::readRaw(
file.name, offset = npt1, nbytes = 4,
human = "int", size = 4)[[5]][1]
# fxv: Frequency of first point
fxv <- hexView::readRaw(
file.name, offset = fx, nbytes = 16,
human = "real", size = 8)[[5]][1]
# lxv: Frequency of last point
lxv <- hexView::readRaw(
file.name, offset = lx, nbytes = 16,
human = "real", size = 8)[[5]][1]
# Read all through all the data blocks inside the OPUS file
nbytes1 <- NPT0 * 4 # initial parameters
nbytes.f <- NPT1 * 4
if (offs[1] < 2000) {
offs.f <- offs[3]
nbytes.f <- NPT1 * 4
wavenumbers <- rev(seq(lxv, fxv, (fxv - lxv)/(NPT1 - 1)))
}
else if (offs[1] > 20000) {
offs.f <- offs[2]
nbytes.f <- NPT1 * 4
wavenumbers <- rev(seq(lxv, fxv, (fxv - lxv)/(NPT1 - 1)))
} else { # for vert-MIR
offs.f <- 7188
nbytes.f <- NPT0 * 4
lxv <- hexView::readRaw(
file.name, offset = 8768, nbytes = 16,
human = "real", size = 8)[[5]][1]
fxv <- hexView::readRaw(
file.name, offset = 8752, nbytes = 16,
human = "real", size = 8)[[5]][1]
wavenumbers <- rev(seq(lxv, fxv, (fxv - lxv)/(NPT0 - 1)))
}
spectra <- hexView::readRaw(file.name, width = NULL,
offset = offs.f - 4, nbytes = nbytes.f, human = "real", # needs to be -4 according to soil.spec function
size = 4, endian = "little")[[5]]
# File name
file_name <- sub(".+/(.+)", "\\1", file.name)
# Create date_time object
date_time <- as.POSIXct(spectrum.meta[3],
format = "%d/%m/%Y %H:%M:%S ")
# Create unique_id using file_name and time
ymd_id <- format(date_time, "%Y%m%d")
unique_id <- paste0(file_name, "_", ymd_id)
# Add sample_id: remove extension .0, .1 etc. from OPUS files
sample_id <- sub("(.+)\\.[[:digit:]]+$", "\\1", file_name)
# Extract repetition number (rep_no) from file name
rep_no <- sub(".+\\.([[:digit:]])+$", "\\1", file.name)
# Convert spectra to matrix and add dimnames (wavenumbers for columns
# and unique_id for rows)
spc_m <- matrix(spectra, ncol = length(spectra), byrow = FALSE)
rownames(spc_m) <- unique_id
colnames(spc_m) <- round(wavenumbers, 1)
out <- list(
metadata = data.frame(
unique_id = unique_id,
scan_id = file_name, # changed file_name to scan_id in output list
sample_id = sample_id,
rep_no = rep_no,
date_time = date_time,
sample_info = spectrum.meta[1],
instrument_name = instr.range,
resolution = spectrum.meta[5],
bms = bms,
lwn = spectrum.meta[6]
),
spc = spc_m,
wavenumbers = wavenumbers
)
# names(out)[-c(1:9)] <- as.character(round(wavenumbers, 1))
return(out)
}
} else {
warning(paste("File", file.name, "does not exist"))
}
}
#' @title Read OPUS binary and ASCII files
#' @description
#' Read single or multiple binary and ASCII files acquired with
#' a Bruker Vertex FTIR Instrument
#' @usage
#' readOPUS(fnames, in_format, out_format)
#' @param fnames character \code{vector} of the name(s)
#' (with absolute path) of the file(s) to read
#' @param in_format format of the input file: \code{'binary'} or
#' \code{'txt'}
#' @param out_format format of the output:
#' \code{'matrix'} (default) or \code{'list'} (see below)
#' @return
#' if \code{out_format} = \code{'matrix'}, absorbance values
#' of the input file(s) in a single \code{matrix}.
#'
#' if \code{out_format} = \code{'list'}, a \code{list} of the
#' input file(s) data consisting of a \code{list} with components:
#' \itemize{
#' \item{\code{Name}}{ name of the file imported}
#' \item{\code{datetime}}{ date and time of acquisition in
#' \code{POSIXct} format (available only when
#' \code{in_format} = 'binary')}
#' \item{\code{metadata}}{ \code{list} with information
#' on instrument configuration (available only when
#' \code{in_format} = 'binary')}
#' \item{\code{absorbance}}{ a numeric \code{vector}
#' of absorbance values}
#' \item{\code{wavenumbers}}{ numeric \code{vector}
#' of the band positions}
#' }
#' @author Antoine Stevens and Andrew Sila (soil.spec package)
#' @note
#' This is essentially a re-factored and simplified version of
#' the \code{read.opus} function from the
#' \sQuote{soil.spec} package for reading OPUS VERTEX files.
#' The function should also work for other OPUS files (e.g. ALPHA),
#' see \code{read.opus}.
#' @export
readOPUS <- function(fnames, in_format = c("binary", "txt"),
out_format = c("matrix", "list")) {
# hexView and plyr are required
wavenumbers <- NULL
absorbance <- NULL
in_format <- match.arg(in_format)
out_format <- match.arg(out_format)
spc <- vector("list", length(fnames))
i <- 1
for (file.name in fnames) {
if (in_format == "binary") {
spc[[i]] <- readOPUS_bin(file.name)
} else {
spc[[i]] <- readOPUS_text(file.name)
}
i <- i + 1
}
names(spc) <- sub(".+/(.+)(\\.txt)?$", "\\1", fnames)
if (out_format == "matrix") {
test <- sapply(spc, function(x) class(x) != "character")
# warning(
# paste0(paste(names(spc)[!test], collapse = ","),
# " do not exist")
# )
spc <- spc[test]
if(in_format == "binary"){
spc <- do.call(plyr::rbind.fill, lapply(spc, function(x){
# readOPUS_bin() returns the absorbance values in element 'spc'
x <- t(data.frame(
wav = x$wavenumbers, absorbance = as.numeric(x$spc)))
colnames(x) <- x[1,]
data.frame(x[2, , drop = F], check.names = F)}))
} else {
spc <- do.call(plyr::rbind.fill, lapply(spc, function(x) {
x <- t(x)
colnames(x) <- x[1,]
data.frame(x[2, , drop = F], check.names = F)}))
}
rownames(spc) <- sub(".+/(.+)(\\.txt)?$", "\\1", fnames)
}
return(spc)
}
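A sketch of the lower-level readers; only text exports ship with this commit, so the binary call is illustrative:

# Text exports -> matrix of absorbance values (wavenumbers as columns)
lf <- list.files("tests/testdata/soilspec", full.names = TRUE)
spc <- readOPUS(lf, in_format = "txt", out_format = "matrix")
# Binary OPUS files would instead use in_format = "binary"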

R/pls-modeling.R (+466 -0)

@@ -0,0 +1,466 @@
# Perform calibration sampling based on spectral PCA ------------
#' @title Split data into calibration and validation sets
#' @description Perform calibration sampling based on
#' the Kennard-Stone algorithm.
#' @param spec_chem data.frame that contains chemical
#' and IR spectroscopy data
#' @param ratio_val Ratio of the number of validation samples
#' to the total number of samples.
#' @param pc Number of principal components (numeric)
#' @param print Logical expression whether the calibration
#' sampling plot is printed
#' @param validation Logical expression whether
#' calibration sampling is performed
#' (\code{TRUE} or \code{FALSE}).
#' @usage ken_stone(spec_chem, ratio_val, pc, print = TRUE,
#' validation = TRUE)
#' @export
ken_stone <- function(spec_chem, ratio_val, pc,
print = TRUE, validation = TRUE) {
MIR <- model <- type <- PC1 <- PC2 <- NULL
# Now with a real dataset
# k = number of samples to select
# pc = if provided, the number of principal components
# (see ?kenStone)
if(validation == TRUE) {
# pc = 0.99 before !!!
# Use the pc argument instead of a hardcoded value
sel <- prospectr::kenStone(X = spec_chem$MIR,
k = round(ratio_val * nrow(spec_chem)), pc = pc)
sel$model # The row index of calibration samples
# plot(sel$pc[, 1:2], xlab = 'PC1', ylab = 'PC2')
# Points selected for calibration
# points(sel$pc[sel$model, 1:2], pch = 19, col = 2)
# Plot samples selected for calibration in ggplot
sel_df_cal <- data.frame(sel$pc[- sel$model,1:2])
sel_df_cal$type <- as.factor(
rep("calibration", nrow(sel_df_cal))
)
sel_df_val <- data.frame(sel$pc[sel$model, 1:2])
sel_df_val$type <- as.factor(
rep("validation", nrow(sel_df_val)))
sel_df <- rbind(sel_df_cal, sel_df_val)
# Compute ratio needed to make the figure square
ratio <- with(sel_df, diff(range(PC1))/diff(range(PC2)))
p_pc <- ggplot2::ggplot(data = sel_df) +
ggplot2::geom_point(
ggplot2::aes(x = PC1, y = PC2, shape = type), size = 4) +
ggplot2::coord_fixed(ratio = 1) +
ggplot2::scale_shape_manual(values=c(1, 19)) +
ggplot2::scale_colour_manual(values=c("black", "red")) +
ggplot2::theme_bw() +
# ggplot2::theme.user +
ggplot2::theme(legend.title = ggplot2::element_blank())
# print(p_pc)
# Split MIR data into calibration and validation set using
# the results of Kennard-Stone Calibration Sampling
# Select by row index of calibration samples
val_set <- spec_chem[sel$model, ]
# Check number of observations (rows) for validation set
nrow(val_set)
cal_set <- spec_chem[- sel$model, ]
list_out <- list(
calibration = cal_set,
validation = val_set,
p_pc = p_pc
)
list_out
# Check number of observations (rows) for calibration set
# nrow(cal_set)
} else {
cal_set <- spec_chem
list(calibration = cal_set)
}
}
#' @title Perform model tuning (quoted version of the function)
#' @description Uses functions from the caret package to do model
#' tuning for PLS regression.
#' @param x list from calibration sampling
#' @param variable response variable for PLS regression, supplied
#' as character expression
#' @param validation Logical expression whether an independent
#' validation is performed.
#' @param env Environment where function is evaluated
#' @export
tune_model_q <- function(x, variable,
env = parent.frame(), validation = TRUE) {
calibration <- NULL
# List of calibration and validation samples
# set up a cross-validation scheme
# create 10 folds that we will keep for the different
# modeling approaches to allow comparison
# randomly break the data into 10 partitions
# note that k is the total number of samples for leave-one-out
# use substitute function to make non-standard evaluation
# of variable argument (looks at a function as argument,
# sees code used to compute value;
# see chapter 13.1 Capturing expressions
# in Advanced R (Hadley Wickham)
# !! p. 270
r <- eval(variable, x$calibration, env)
idx <- caret::createFolds(y = r, k = 10, returnTrain = T) # update ***
# inject the index in the trainControl object
tr_control <- caret::trainControl(method = "cv", index = idx,
savePredictions = T)
# Both branches of the former if/else returned the same
# trainControl object, so a single expression suffices
tr_control
}
#' @title Perform model tuning
#' @description Uses functions from the caret package to do model
#' tuning for PLS regression.
#' @param x list from calibration sampling
#' @param variable response variable for PLS regression, supplied
#' as character expression
#' @param validation Logical expression whether an independent
#' validation is performed.
#' @param env Environment where function is evaluated
#' @export
tune_model <- function(x, variable,
env = parent.frame(), validation = TRUE) {
tune_model_q(x, substitute(variable), env, validation)
}
# Fit a PLS regression model using the caret package ------------
#' @title Fit a PLS regression model
#' (quoted version of the function)
#' @description Uses the caret package to perform PLS modeling.
#' Spectra are centered and scaled prior to modeling.
#' @param x List that contains calibration
#' set, validation set, and model tuning options
#' @param validation Logical expression whether independent
#' validation is performed
#' @param variable Response variable to be modeled
#' @param tr_control Object that defines controlling parameters
#' of the desired internal validation framework
#' @param env Environment where function is evaluated
#' @export
fit_pls_q <- function(x, validation = TRUE,
variable, tr_control, env = parent.frame()) {
# Fit a partial least square regression (pls) model
# center and scale MIR (you can try without)
calibration <- MIR <- NULL
v <- eval(variable, x$calibration, env)
# Both branches of the former if/else fitted the identical model,
# so a single call suffices
pls_model <- caret::train(x = x$calibration$MIR, y = v,
method = "pls",
tuneLength = 20,
trControl = tr_control,
preProcess = c("center", "scale")
)
# Collect fitted object into a list
# fitList_cal <- list(pls = fit_pls)
# fitList_cal
pls_model
}
#' @title Fit a PLS regression model
#' @description Uses the caret package to perform PLS modeling.
#' Spectra are centered and scaled prior to modeling.
#' @param x List that contains calibration
#' set, validation set, and model tuning options
#' @param validation Logical expression whether independent
#' validation is performed
#' @param variable Response variable to be modeled
#' @param tr_control Object that defines controlling parameters
#' of the desired internal validation framework
#' @param env Environment where function is evaluated
#' @export
fit_pls <- function(x, validation = TRUE,
variable, tr_control, env = parent.frame()) {
# Pass validation, tr_control, and env through explicitly;
# previously env was matched positionally to tr_control
fit_pls_q(x = x, validation = validation,
variable = substitute(variable), tr_control = tr_control,
env = env
)
}
# Evaluate PLS performance (validation and cross-validation) ----
#' @title Evaluate PLS performance
#' @description Calculate model performance indices based
#' on observed and predicted values of validation and calibration
#' set, and internal cross-validation
#' @param x List that contains calibration and validation data
#' frame with combined spectral and chemical data
#' @param pls_model List with PLS regression model output from
#' the caret package
#' @param variable Response variable (e.g. chemical property) to be
#' modelled (needs to be non-quoted expression). \code{variable}
#' needs to be a column name in the \code{validation} data.frame
#' (element of \code{x})
#' @param validation Logical expression if independent validation
#' is performed (split data set into calibration set and
#' validation set)
#' @param print Print observed vs. predicted for calibration
#' and validation. Default is \code{TRUE}.
#' @param env Specify the environment in which the function is
#' called. Default argument of \code{env} is
#' \code{parent.frame()}
#' @export
evaluate_pls_q <- function(x, pls_model, variable,
validation = TRUE, print = TRUE, env = parent.frame()) {
# Set global variables to NULL to avoid R CMD check notes
MIR <- object <- model <- dataType <- obs <- pred <- NULL
ncomp <- finalModel <- rmsd <- r2 <- rpd <- n <- NULL
rmse <- calibration <- NULL
# Collect fitted object into a list
list_models <- list(pls = pls_model)
# Extract best tuning parameters and associated cv predictions
if(validation == TRUE) {
predobs_cal <- plyr::ldply(list_models,
function(x) plyr::match_df(x$pred, x$bestTune),
.id = "model"
)
# Calculate training (calibration) and test (validation) data
# predictions based on pls model with calibration data
v <- eval(variable, x$validation, env)
predobs_val <- caret::extractPrediction(list_models,
testX = x$validation$MIR, testY = v) # update ***
# Create new data frame column <object>
predobs_val$object <- predobs_val$model
# Replace levels "Training" and "Test" in dataType column
# by "Calibration" and "Validation" (rename levels of factor)
predobs_val$dataType <- plyr::revalue(predobs_val$dataType,
c("Test" = "Validation", "Training" = "Calibration")
)
# Change the order of rows in the data frame
# Calibration as first level (show Calibration in ggplot graph
# on left panel)
predobs_val$dataType <- factor(predobs_val$dataType,
levels = c("Calibration", "Validation"))
# Calculate model performance indices by model and dataType
# uses package plyr and the function summary_df()
# from R/spectra-utils.R
stats <- plyr::ddply(predobs_val, c("model", "dataType"),
function(x) summary_df(x, "obs", "pred")
)
} else {
# Extract best tuning parameters and associated cv predictions
predobs_cv <- plyr::ldply(list_models,
function(x) plyr::match_df(x$pred, x$bestTune),
.id = "model"
)
# Extract auto-prediction
predobs <- caret::extractPrediction(list_models)
predobs_cv$object <- predobs_cv$model
predobs_cv$dataType <- "Cross-validation"
predobs_cv <- dplyr::select(
predobs_cv, obs, pred, model, dataType, object
)
predobs_val <- rbind(predobs, predobs_cv)
stats <- plyr::ddply(predobs_val, c("model", "dataType"),
function(x) summary_df(x, "obs", "pred")
)
}
# Add number of components to stats; from finalModel list item
# from train() function output (function from caret package)
stats$ncomp <- rep(pls_model$finalModel$ncomp, nrow(stats))
# Add range of observed values for validation and calibration
# get range from predicted vs. observed data frame
# stored in object predobs
obs_cal <- subset(predobs_val, dataType == "Calibration")$obs
obs_val <- subset(predobs_val, dataType == "Validation")$obs
# Get name of predicted variable; see p. 261 of book
# "Advanced R" (Hadley Wickham)
variable_name <- deparse(variable)
# before: deparse(substitute(variable))
df_range <- data.frame(
variable = rep(variable_name, 2),
dataType = c("Calibration", "Validation"),
min_obs = c(range(obs_cal)[1], range(obs_val)[1]),
median_obs = c(median(obs_cal), median(obs_val)),
max_obs = c(range(obs_cal)[2], range(obs_val)[2]),
mean_obs = c(mean(obs_cal), mean(obs_val)),
CV = c(sd(obs_cal) / mean(obs_cal) * 100,
sd(obs_val) / mean(obs_val) * 100)
)
# Join stats with range data frame (df_range)
stats <- plyr::join(stats, df_range, type = "inner")
annotation <- plyr::mutate(stats,
rmse = as.character(as.expression(paste0("RMSE == ",
round(rmsd, 2)))),
r2 = as.character(as.expression(paste0("italic(R)^2 == ",
round(r2, 2)))),
rpd = as.character(as.expression(paste("RPD == ",
round(rpd, 2)))),
n = as.character(as.expression(paste0("italic(n) == ", n))),
ncomp = as.character(as.expression(paste0("ncomp = ",
ncomp)))
)
# Plot predicted vs. observed values and model indexes
# update label, xlim, and ylim ***
# Add label number of samples to facet_grid using a
# labeling function
# ! Update labeller API:
# https://github.com/hadley/ggplot2/commit/ef33dc7
# http://sahirbhatnagar.com/facet_wrap_labels
# Prepare lookup character vector
make_label <- function(x, validation = TRUE) {
dataType <- n <- NULL
if (validation == TRUE) {
c(`Calibration` = paste0("Calibration", "~(",
x[x$dataType == "Calibration", ]$n, ")"
),
`Validation` = paste0("Validation", "~(",
x[x$dataType == "Validation", ]$n, ")"
)
)
} else{
c(`Calibration` = paste0("Calibration", "~(",
x[x$dataType == "Calibration", ]$n, ")"
),
`Cross-validation` = paste0("Cross-validation", "~(",
x[x$dataType == "Cross-validation", ]$n, ")"
)
)
}
}
if (validation == TRUE) {
label_validation <- make_label(x = annotation,
validation = TRUE
)
} else {
label_validation <- make_label(x = annotation,
validation = FALSE
)
}
# Rename labels on the fly with a lookup character vector
to_string <- ggplot2::as_labeller(
x = label_validation, ggplot2::label_parsed
)
# -------------------------------------------------------------
# http://docs.ggplot2.org/0.9.3.1/label_parsed.html
# some other info: https://coderclub.b.uib.no/tag/plotmath/
# !!! now deprecated in ggplot2 >= 2.0.0
# dataType_labeller <- function(variable, value){
# new <- paste0(dataType_names[value], "~(", annotation$n, ")")
# plyr::llply(as.character(new), function(x) parse(text = x))
# }
p_pred_obs <- ggplot2::ggplot(data = predobs_val) +
ggplot2::geom_point(ggplot2::aes(x = obs, y = pred),
shape = 1, size = 4) +
ggplot2::geom_text(data = annotation,
ggplot2::aes(x = -Inf, y = Inf, label = r2), size = 7,
hjust = -0.1, vjust = 1.5, parse = TRUE) +
ggplot2::geom_text(data = annotation,
ggplot2::aes(x = -Inf, y = Inf, label = rmse), size = 7,
hjust = -0.075, vjust = 4.25, parse = TRUE) +
ggplot2::geom_text(data = annotation,
ggplot2::aes(x = -Inf, y = Inf, label = rpd), size = 7,
hjust = -0.1, vjust = 6.5, parse = TRUE) +
ggplot2::facet_grid(~ dataType,
labeller = ggplot2::as_labeller(to_string)) +
# ggplot2::facet_grid(~ dataType,
# labeller = dataType_labeller) +
ggplot2::theme_bw() +
ggplot2::geom_abline(col = "red") +
ggplot2::labs(x = "Observed", y = "Predicted") +
ggplot2::xlim(c(min(predobs_val$obs) -
0.05 * diff(range(predobs_val$obs)),
max(predobs_val$obs) +
0.05 * diff(range(predobs_val$obs)))) +
ggplot2::ylim(c(min(predobs_val$obs) -
0.05 * diff(range(predobs_val$obs)),
max(predobs_val$obs) +
0.05 * diff(range(predobs_val$obs)))) # +
# theme.user
## ggplot graph for model comparison
## (arranged later in panels)
x_label <- paste0("Observed ",
as.character(variable_name))
y_label <- paste0("Predicted ",
as.character(variable_name))
p_model <- ggplot2::ggplot(data = predobs_val) +
ggplot2::geom_point(ggplot2::aes(x = obs, y = pred),
shape = 1, size = 2, alpha = 1/2) +
ggplot2::geom_text(data = annotation,
ggplot2::aes(x = Inf, y = -Inf, label = r2), size = 3,
hjust = 1.15, vjust = -3, parse = TRUE) +
ggplot2::geom_text(data = annotation,
ggplot2::aes(x = Inf, y = -Inf, label = rmse), size = 3,
hjust = 1.12, vjust = -2.5, parse = TRUE) +
ggplot2::geom_text(data = annotation,
ggplot2::aes(x = Inf, y = -Inf, label = rpd), size = 3,
hjust = 1.15, vjust = -1.25, parse = TRUE) +
ggplot2::facet_grid(~ dataType,
labeller = ggplot2::as_labeller(to_string)) +
# ggplot2::facet_grid(~ dataType,
# labeller = dataType_labeller) +
ggplot2::theme_bw() +
ggplot2::geom_abline(col = "red") +
ggplot2::labs(x = x_label, y = y_label) +
ggplot2::xlim(c(min(predobs_val$obs) -
0.05 * diff(range(predobs_val$obs)),
max(predobs_val$obs) +
0.05 * diff(range(predobs_val$obs)))) +
ggplot2::ylim(c(min(predobs_val$obs) -
0.05 * diff(range(predobs_val$obs)),
max(predobs_val$obs) +
0.05 * diff(range(predobs_val$obs)))) +
ggplot2::coord_fixed()
if(print == TRUE) {
print(p_model)
}
list(stats = stats, p_model = p_model)
}
## PLS regression modeling in one function ======================
#' @title Calibration sampling, model tuning, and PLS regression
#' @description Perform calibration sampling and use selected
#' calibration set for model tuning
#' @param spec_chem data.frame that contains IR spectroscopy
#' and chemical data
#' @param ratio_val Ratio of the number of validation samples
#' to the total number of samples.
#' @param pc Number of principal components used for calibration
#' sampling (Kennard-Stone algorithm)
#' @param print Logical expression whether graphs shall be printed
#' @param validation Logical expression whether independent
#' validation is performed
#' @param variable Response variable (without quotes)
#' @param env Environment where function is evaluated
#' @export
# Note: check non standard evaluation, argument passing...
pls_ken_stone <- function(spec_chem, ratio_val, pc,
print = TRUE, validation = TRUE, variable,
env = parent.frame()) {
calibration <- 0
# Calibration sampling
list_sampled <- ken_stone(
spec_chem, ratio_val = ratio_val, pc = pc,
validation = validation
)
tr_control <- tune_model_q(list_sampled,
substitute(variable), env
)
pls <- fit_pls_q(x = list_sampled, validation = validation,
variable = substitute(variable), tr_control = tr_control,
env = env
)
stats <- evaluate_pls_q(x = list_sampled, pls_model = pls,
variable = substitute(variable), env = env
)
list(data = list_sampled, p_pc = list_sampled$p_pc,
pls_model = pls, stats = stats$stats, p_model = stats$p_model)
}
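An end-to-end modeling sketch under stated assumptions: spec_chem is the joined data.frame from join_chem_spec(), and C (total carbon) is an assumed column name in the chemical reference data.

# Calibration sampling, 10-fold CV tuning, PLS fit, and evaluation
pls_C <- pls_ken_stone(spec_chem = spec_chem,
  ratio_val = 1/3,  # one third of the samples for validation
  pc = 2,           # principal components for Kennard-Stone sampling
  variable = C)     # assumed response column
pls_C$stats    # performance statistics (calibration and validation)
pls_C$p_model  # predicted vs. observed plot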

R/predict-spectra.R (+42 -0)

@@ -0,0 +1,42 @@
#' @title Predict soil properties of new spectra based on calibration models
#' @description
#' Function that uses pre-processed spectra, additional metadata of new
#' samples, and caret model output for the different soil property models
#' to create predicted values.
#' @param model_list List that contains caret output objects
#' of the different calibration models to predict (one model per soil property)
#' @param spectra_list List that contains spectra and additional data
#' after pre-processing (\code{do_pretreatment()}), including metadata
#' (\code{sample_ID})
#' @usage predict_from_spectra(model_list, spectra_list)
#' @export
predict_from_spectra <- function(model_list, spectra_list) {
# Use extractPrediction function (caret) and supply model_list that contains
# caret calibration outputs; use pre-processed spectra dataset (list
# resulting from do_pretreatment())
predictions_caret <- caret::extractPrediction(
model_list,
unkX = spectra_list$MIR0
)
# Convert the data.frame into wide form; one sample should be
# represented by one single row and the predicted values of the
# soil properties should be in the different columns
# Use the tidyr::spread() function (from the tidyr package)
# to spread the predictions into one column per soil property
# Add sample_ID column to uniquely identify observations
# Number of caret model objects used to predict
n <- length(unique(predictions_caret$object))
# Add sample_ID from metadata of spectra to predicted values
sample_ID <- spectra_list$data_meta$ID
# Repeat meta_data for each of the additional model rows and add
# ID column to long form data frame
id <- rep(sample_ID, n)
predictions_metadata <- cbind(predictions_caret, sample_ID = id)
# Get data into wide form
predictions_wide <- tidyr::spread(
data = predictions_metadata, key = "object", value = "pred"
)
# Return the wide data.frame of predictions
predictions_wide
}
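A hedged prediction sketch; the model list and the new-spectra object are assumptions (any named list of caret train() objects, and any pre-treated spectra list with MIR0 and data_meta elements, should fit):

# Predict a soil property for new, pre-processed spectra
models <- list(C = pls_C$pls_model)  # caret train() output(s)
pred <- predict_from_spectra(model_list = models,
  spectra_list = spec_pre)  # wide data.frame, one column per model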

R/pretreat-spectra.R (+51 -0)

@@ -0,0 +1,51 @@
#' @title Preprocess spectra
#' @description Use commonly used preprocessing algorithms on
#' the spectra.
#' @param list_spectra List that contains averaged spectra
#' in the list element called \code{MIR_mean}
#' @param select Character string that specifies the predefined
#' pretreatment options. Possible arguments are:
#' \code{select = "MIR0"} for Savitzky Golay smoothing filter
#' without derivative, \code{select = "MIR1"} for Savitky Golay
#' with first derivative, \code{select = "MIR2"} for Savitzky
#' Golay with second derivative, \code{select = "MIR0_snv"}
#' for Standard Normal Variate after Savitzky Golay without
#' derivative, and \code{select = "MIRb"} for
#' baseline correction.
#' @usage do_pretreatment(list_spectra, select)
#' @return list_spectra: List that contains preprocessed
#' spectra in element \code{MIR0}
#' @import hyperSpec
#' @export
do_pretreatment <- function(list_spectra, select) {
MIR_mean <- NULL
MIR_raw <- list_spectra$MIR_mean
# Filter the data using the Savitzky-Golay smoothing filter
# (window size of 9 spectral variables for smoothing, 5 for the
# derivatives; polynomial order of 3)
# p = polynomial order; plot variance vs polynomial order?
# w = window size (must be odd)
# m = m-th derivative of the polynomial coefficients
# (0 = smoothing)
MIR0 <- prospectr::savitzkyGolay(X = list_spectra$MIR_mean,
m = 0, p = 3, w = 9) # smoothing and averaging
MIR1 <- prospectr::savitzkyGolay(X = list_spectra$MIR_mean,
m = 1, p = 3, w = 5) # first derivative ***
MIR2 <- prospectr::savitzkyGolay(X = list_spectra$MIR_mean,
m = 2, p = 3, w = 5) # second derivative ***
# Calculate standard normal variate (SNV) after smoothing
MIR0_snv <- prospectr::standardNormalVariate(MIR0)
MIR1_snv <- prospectr::standardNormalVariate(MIR1) # added 2016-08-05
# Baseline correction
# Compute baseline but first, create hyperSpec obj
spc <- new("hyperSpec", spc = as.matrix(list_spectra$MIR_mean),
wavelength = as.numeric(colnames(list_spectra$MIR_mean)))
below <- hyperSpec::spc.fit.poly.below(
fit.to = spc[, , 4000 ~ 900],
apply.to = spc, npts.min = 20, poly.order = 2)
spc_corr <- spc - below
MIRb <- spc_corr[[]]
pre <- select
list_spectra$MIR0 <- get(pre)
return(list_spectra)
}
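A pre-treatment sketch continuing the pipeline; remove_outliers() (added below in R/remove-outl-spectra.R) strips the leading ID column that do_pretreatment() does not expect:

spec_out <- remove_outliers(spec_avg, remove = FALSE)  # drops ID column only
spec_pre <- do_pretreatment(list_spectra = spec_out, select = "MIR0")
dim(spec_pre$MIR0)  # smoothed spectra used for modeling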

R/remove-outl-spectra.R (+162 -0)

@@ -0,0 +1,162 @@
#' @title Remove outlier spectra
#' @description Remove outlier spectra based on the
#' \code{pcout()} function of the \code{mvoutlier} package.
#' @usage remove_outliers(list_spectra, remove = TRUE)
#' @param list_spectra List that contains averaged
#' spectral information
#' in list element \code{MIR_mean} (data.frame) and metadata in
#' \code{data_meta} (data.frame).
#' @param remove logical expression (\code{TRUE} or \code{FALSE})
#' that specifies whether outlier spectra shall be removed.
#' If \code{remove = FALSE}, there will be no outlier removal
#' @return Returns list \code{spectra_out} that contains:
#' \itemize{
#' \item \code{MIR_mean}: Outlier removed MIR spectra as
#' data.frame object. If \code{remove = FALSE},
#' the function will
#' return a list almost identical to \code{list_spectra},
#' except that the first \code{indices} column of the spectral
#' data frame \code{MIR_mean} is removed
#' (This is done for both options
#' \code{remove = TRUE} and \code{remove = FALSE}).
#' \item \code{data_meta}: metadata data.frame, identical
#' as in the \code{list_spectra} input list.
#' \item \code{plot_out}: (optional) ggplot2 graph
#' that shows all spectra (wavenumber on the x-axis and absorbance
#' on the y-axis) with outliers marked, if
#' \code{remove = TRUE}.
#' }
#' @details This is an optional function if one wants to remove
#' outliers.
#' @export
remove_outliers <- function(list_spectra, remove = TRUE) {
# Outlier detection
# Use the mvoutlier package and pcout function to identify
# multivariate outliers
wfinal01 <- ID <- NULL
if (remove == TRUE) {
# Remove the 'indices' column
list_spectra$MIR_mean <- list_spectra$MIR_mean[, -1]
out <- mvoutlier::pcout(list_spectra$MIR_mean, makeplot = T,
outbound = 0.05) # parameters should be adapted
# Plot outlying spectra
plot_out <- plotMIR(
list_spectra$MIR_mean[
order(out$wfinal01, decreasing = T), ],
col = as.factor(out$wfinal01[order(out$wfinal01,
decreasing = T)])) +
ggplot2::scale_colour_brewer("outlier", palette = "Set1")
out_id <- as.character(
list_spectra$data_meta$ID[!as.logical(out$wfinal01)]
)
# Remove outliers
MIR_mean <- list_spectra$MIR_mean[
! list_spectra$data_meta$ID %in% out_id, ]
# rep ID and country name
data_meta <- list_spectra$data_meta[
! list_spectra$data_meta$ID %in% out_id, ]
spectra_out <- list(MIR_mean = MIR_mean,
data_meta = data_meta,
plot_out = plot_out)
} else {
# Remove the 'indices' column
list_spectra$MIR_mean <- list_spectra$MIR_mean[, -1]
spectra_out <- list(MIR_mean = list_spectra$MIR_mean,
data_meta = list_spectra$data_meta)
}
spectra_out
}
## plotMIR function of Antoine Stevens; don't export this
## function to the NAMESPACE
plotMIR <- function(spc, group = NULL, col = NULL,
linetype = NULL, wr = NULL, brk = NULL,
ylab = "Absorbance", xlab = "Wavenumber /cm-1",
by = NULL, by.wrap = T, ...){
# Function to plot spectra, based on the ggplot2 package
# spc = spectral matrix, with colnames = wavelengths
# group = grouping variable, usually the id's of the sample
# wr = wavelength range to plot
# brk = breaks of the x-axis
# by = factor variable for which the mean and sd of
# each level will be computed and plotted (optional)
# Requires packages ggplot2; data.table; reshape2
# Workaround to pass R CMD check:
# http://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when
# Setting the variables to NULL first
variable <- value <- colour <- .SD <- NULL
spc <- as.data.frame(spc)
if (!is.null(wr))
spc <- spc[, as.numeric(colnames(spc)) >= min(wr) &
as.numeric(colnames(spc)) <= max(wr)]
if (is.null(brk))
brk <- pretty(as.numeric(colnames(spc)), n = 10)
if (!is.null(by)) {
spc$by <- by
spc <- data.table::data.table(spc, check.names = F)
mean.spc <- reshape2::melt(
spc[, lapply(.SD, mean), by = by],
id.vars = "by"
)
sd.spc <- reshape2::melt(
spc[, lapply(.SD, sd), by = by],
id.vars = "by"
)
mean.spc$min <- mean.spc$value - sd.spc$value
mean.spc$max <- mean.spc$value + sd.spc$value
mean.spc$variable <- as.numeric(
as.character(mean.spc$variable)
)
if (by.wrap) {
p <- ggplot2::ggplot(data = mean.spc) +
ggplot2::geom_ribbon(
ggplot2::aes(x = variable, ymin = min, ymax = max),
fill = "grey", col = "black", size = 0.15) +
ggplot2::theme_bw()
p <- p + ggplot2::geom_line(
ggplot2::aes(x = variable, y = value),
size = 0.25) +
ggplot2::facet_wrap(~ by) +
ggplot2::labs(x = xlab, y = ylab) +
ggplot2::scale_x_reverse(breaks = brk)
} else {
p <- ggplot2::ggplot(data = mean.spc,
ggplot2::aes(x = variable, y = value, group = by, col = by)) +
ggplot2::geom_line(size = 0.25) +
ggplot2::labs(x = xlab, y = ylab) +
ggplot2::scale_x_reverse(breaks = brk) +
ggplot2::theme_bw()
}
return(p)
} else {
if (is.null(group))
group <- as.character(1:nrow(spc))
spc$group <- group
spc$colour <- col
spc$linetype <- linetype
id.var <- colnames(spc)[
grep("group|colour|linetype",colnames(spc))]
tmp <- reshape2::melt(spc, id.var = id.var)
tmp$variable <- as.numeric(as.character(tmp$variable))
p <- ggplot2::ggplot(tmp,
ggplot2::aes(variable, value, group = group)) +
ggplot2::labs(x = xlab, y = ylab) +
ggplot2::theme_bw() +
ggplot2::scale_x_reverse(breaks = brk)
if (is.null(col) & is.null(linetype))
p <- p + ggplot2::geom_line(
ggplot2::aes(colour = group))
else if (!is.null(col) & is.null(linetype))
p <- p + ggplot2::geom_line(
ggplot2::aes(colour = colour))
else if (is.null(col) & !is.null(linetype))
p <- p + ggplot2::geom_line(
ggplot2::aes(colour = group,
linetype = linetype))
else p <- p + ggplot2::geom_line(
ggplot2::aes(colour = colour,
linetype = linetype))
return(p)
}
}
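A short sketch of the outlier step on averaged spectra (spec_avg as in the averaging example above):

spec_out <- remove_outliers(spec_avg, remove = TRUE)
spec_out$plot_out  # all spectra with flagged outliers marked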

R/resample-spectra.R (+19 -0)

@@ -0,0 +1,19 @@
#' @title Resample spectra to a new wavenumber interval
#' @description Resamples spectra to a new sequence of
#' wavenumbers using \code{prospectr::resample()}
#' @param list_spectra List of spectra and metadata
#' @param wn_lower Numerical value for lowest wavenumber in sampling interval
#' @param wn_upper Numerical value for highest wavenumber in sampling interval
#' @param wn_interval Numerical value for the wavenumber increment
#' of the new sampling interval
#' @export
resample_spectra <- function(
list_spectra, wn_lower = 510, wn_upper = 3988, wn_interval = 2)
{
# Create sequence of new wavenumbers
wn_seq <- rev(seq(from = wn_lower, wn_upper, by = wn_interval))
list_spectra$MIR0 <- prospectr::resample(
X = list_spectra$MIR_mean, # spectral matrix to resample
wav = as.numeric(colnames(list_spectra$MIR_mean)), # old wavenumbers
new.wav = wn_seq # new wavenumbers
)
return(list_spectra)
}
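A resampling sketch; note that the function reads from MIR_mean and writes the result to MIR0:

# Resample averaged spectra to a 2 cm-1 grid from 510 to 3988 cm-1
spec_rs <- resample_spectra(spec_out, wn_lower = 510, wn_upper = 3988,
  wn_interval = 2)
dim(spec_rs$MIR0)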

R/spectra-utils.R (+25 -0)

@@ -0,0 +1,25 @@
#' @title Calculate model statistics
#' @description Calculates model statistics for predicted
#' vs. observed (y) values
#' @param df data.frame with predicted and observed data
#' @param x column with observed values
#' @param y column with predicted values
#' @export
summary_df <- function(df, x, y){
x <- df[, x]
y <- df[, y]
data.frame(rmse = sqrt(sum((x - y)^2, na.rm = T) / (length(x)-1)),
rmsd = mean((x - y)^2)^.5,
sdev = sd(x, na.rm = T),
rpd = sd(x,na.rm = T) /
sqrt(sum((x - y)^2, na.rm = T) / (length(x) - 1)),
rpiq = (quantile(x, .75, na.rm = T) - quantile(x, .25, na.rm = T)) /
sqrt(sum((x - y)^2, na.rm = T) / (length(x) - 1)),
r2 = cor(x, y, use = "pairwise.complete.obs")^2,
bias = mean(x, na.rm = T) - mean(y, na.rm = T),
SB = (mean(x, na.rm = T) - mean(y, na.rm = T))^2,
NU = var(x, na.rm = T) * (1 - lm(y ~ x)$coefficients[2])^2,
LC = var(y, na.rm = T) *
(1 - cor(x, y, use = "pairwise.complete.obs")^2),
n = length(x))
}
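A self-contained sketch of the statistics helper on toy data:

df <- data.frame(obs = c(1.0, 2.1, 2.9, 4.2, 5.1),
  pred = c(1.1, 2.0, 3.1, 4.0, 5.3))
summary_df(df, x = "obs", y = "pred")  # rmse, r2, rpd, bias, n, ...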

man/average_spectra.Rd (+41 -0)

@@ -0,0 +1,41 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/average-spectra.R
\name{average_spectra}
\alias{average_spectra}
\title{Calculate mean of spectra}
\usage{
average_spectra(in_spectra)
}
\arguments{
\item{in_spectra}{List that contains spectral data in the
element \code{MIR} (data.frame) and sample metadata in the
list element \code{data_rep} (data.frame).
The data.frame \code{data_rep}
contains the sample ID stored in the \code{ID}
vector (originally from spectral file names),
the country abbreviation stored in \code{country} (2 letters),
and the vector \code{site} (2 letters) that is the landscape
(site) abbreviation.}
}
\value{
\code{out_spectra}: List that contains:
\itemize{
\item \code{data_meta}: metadata of the samples (data.frame)
that is
taken from the element \code{data_rep} of the input list argument
\code{in_spectra}
\item \code{MIR_mean}: average spectra from replicates of
sample ID
(data.frame)
\item \code{MIR_sd}: standard deviation of spectra calculated
from replicates of sample ID (data.frame)
\item \code{cvar}: coefficient of variation over all
wavenumbers of spectra
calculated from replicates of sample ID (vector)
}
}
\description{
Calculate the mean of the spectral replicates
of each sample (average absorbance per wavenumber)
}

man/do_pretreatment.Rd (+31 -0)

@@ -0,0 +1,31 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/pretreat-spectra.R
\name{do_pretreatment}
\alias{do_pretreatment}
\title{Preprocess spectra}
\usage{
do_pretreatment(list_spectra, select)
}
\arguments{
\item{list_spectra}{List that contains averaged spectra
in the list element called \code{MIR_mean}}
\item{select}{Character string that specifies the predefined
pretreatment options. Possible arguments are:
\code{select = "MIR0"} for Savitzky Golay smoothing filter
without derivative, \code{select = "MIR1"} for Savitky Golay
with first derivative, \code{select = "MIR2"} for Savitzky
Golay with second derivative, \code{select = "MIR0_snv"}
for Standard Normal Variate after Savitzky Golay without
derivative, and \code{select = "MIRb"} for
baseline correction.}
}
\value{
list_spectra: List that contains preprocessed
spectra in element \code{MIR0}
}
\description{
Use commonly used preprocessing algorithms on
the spectra.
}

man/evaluate_pls_q.Rd (+38 -0)

@@ -0,0 +1,38 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/pls-modeling.R
\name{evaluate_pls_q}
\alias{evaluate_pls_q}
\title{Evaluate PLS performance}
\usage{
evaluate_pls_q(x, pls_model, variable, validation = TRUE, print = TRUE,
env = parent.frame())
}
\arguments{
\item{x}{List that contains calibration and validation data
frame with combined spectral and chemical data}
\item{pls_model}{List with PLS regression model output from
the caret package}
\item{variable}{Response variable (e.g. chemical property) to be
modelled (needs to be non-quoted expression). \code{variable}
needs to be a column name in the \code{validation} data.frame
(element of \code{x})}
\item{validation}{Logical expression if independent validation
is performed (split data set into calibration set and
validation set)}
\item{print}{Print observed vs. predicted for calibration
and validation. Default is \code{TRUE}.}
\item{env}{Specify the environment in which the function is
called. Default argument of \code{env} is
\code{parent.frame()}}
}
\description{
Calculate model performance indices based
on observed and predicted values of validation and calibration
set, and internal cross-validation
}

man/fit_pls.Rd (+24 -0)

@@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/pls-modeling.R
\name{fit_pls}
\alias{fit_pls}
\title{Fit a PLS regression model}
\usage{
fit_pls(x, validation = TRUE, variable, tr_control, env = parent.frame())
}
\arguments{
\item{x}{List that contains calibration
set, validation set, and model tuning options}
\item{validation}{Logical expression whether independent
validation is performed}
\item{variable}{Response variable to be modeled}
\item{tr_control}{Object that defines controlling parameters
of the desired internal validation framework}
\item{env}{Environment where function is evaluated}
}
\description{
Uses the caret package to perform PLS modeling.
Spectra are centered and scaled prior to modeling.
}

man/fit_pls_q.Rd (+28 -0)

@@ -0,0 +1,28 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/pls-modeling.R
\name{fit_pls_q}
\alias{fit_pls_q}
\title{Fit a PLS regression model
(quoted version of the function)}
\usage{
fit_pls_q(x, validation = TRUE, variable, tr_control, env = parent.frame())
}
\arguments{
\item{x}{List that contains calibration
set, validation set, and model tuning options}
\item{validation}{Logical expression whether independent
validation is performed}
\item{variable}{Response variable to be modeled}
\item{tr_control}{Object that defines controlling parameters
of the desired internal validation framework}
\item{env}{Environment where function is evaluated}
}
\description{
Uses the caret package to perform PLS modeling.
Spectra are centered and scaled prior to modeling.
}

man/join_chem_spec.Rd (+24 -0)

@@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/join-chem-spectra.R
\name{join_chem_spec}
\alias{join_chem_spec}
\title{Join chemical and spectral data frames}
\usage{
join_chem_spec(dat_chem, dat_spec, by = "sample_ID")
}
\arguments{
\item{dat_chem}{data.frame that contains chemical values of
the sample}
\item{dat_spec}{List that contains spectral data}
\item{by}{character of column name that defines sample_ID}
}
\value{
data.frame that contains the chemical data joined to the
spectral data by sample ID (inner join)
}
\description{
Combines spectral data (data.frame) and chemical
data (data.frame).
}

man/ken_stone.Rd (+28 -0)

@@ -0,0 +1,28 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/pls-modeling.R
\name{ken_stone}
\alias{ken_stone}
\title{Split data into calibration and validation sets}
\usage{
ken_stone(spec_chem, ratio_val, pc, print = TRUE,
validation = TRUE)
}
\arguments{
\item{spec_chem}{data.frame that contains chemical
and IR spectroscopy data}
\item{ratio_val}{Ratio of the number of validation samples
to the total number of samples.}
\item{pc}{Number of principal components (numeric)}
\item{print}{Logical expression whether the calibration
sampling plot is printed}
\item{validation}{Logical expression whether
calibration sampling is performed
(\code{TRUE} or \code{FALSE}).}
}
\description{
Perform calibration sampling based on
the Kennard-Stone algorithm.
}

man/pls_ken_stone.Rd (+34 -0)

@@ -0,0 +1,34 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/pls-modeling.R
\name{pls_ken_stone}
\alias{pls_ken_stone}
\title{Calibration sampling, model tuning, and PLS regression}
\usage{
pls_ken_stone(spec_chem, ratio_val, pc, print = TRUE, validation = TRUE,
variable, env = parent.frame())
}
\arguments{
\item{spec_chem}{data.frame that contains IR spectroscopy
and chemical data}
\item{ratio_val}{Ratio of the number of validation samples
to the total number of samples.}
\item{pc}{Number of principal components used for calibration
sampling (Kennard-Stone algorithm)}
\item{print}{Logical expression whether graphs shall be printed}
\item{validation}{Logical expression whether independent
validation is performed}
\item{variable}{Response variable (without quotes)}
\item{env}{Environment where function is evaluated}
}
\description{
Perform calibration sampling and use selected
calibration set for model tuning
}

man/predict_from_spectra.Rd (+22 -0)

@@ -0,0 +1,22 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/predict-spectra.R
\name{predict_from_spectra}
\alias{predict_from_spectra}
\title{Predict soil properties of new spectra based on calibration models}
\usage{
predict_from_spectra(model_list, spectra_list)
}
\arguments{