Completely redesign gather_spc() to support all spectral types. Also implement thorough `data` element testing

6 years ago · c68a2f4142
3 changed files with 258 additions and 54 deletions
--- a/1
+++ b/1
@ -43,5 +43,6 @@ importFrom(foreach,"%dopar%")
 importFrom(magrittr,"%>%")
 importFrom(rlang,ensym)
 importFrom(rlang,quo_name)
+importFrom(rlang,set_names)
 importFrom(stringr,str_replace)
 importFrom(tidyselect,one_of)
--- a/R/gather-spc.R
+++ b/R/gather-spc.R
@ -1,53 +1,237 @@
-#' @title Gather spectra from list of spectral data into a tibble object
-#' @description Gather specta and spectrometer metadata from list into a tibble.
-#' Spectra and wavenumbers are stored in list-columns. A tibble is an extended
-#' data frame and each spectrum can contain complex data and metadata that
-#' are in a rectangular data structure. List-columns is tidy
+#' @title Gather spectra of different types from nested list of spectral data
+#' into a tibble object
+#' @description Gather specta, spectrometer metadata, measurment id's,
+#' x unit values from list into a tibble, one row per spectrum replicate
+#' measurement. Spectra, x values and metadata are stored in list-columns.
+#' A tibble is an extende data frame and each spectrum can contain complex data
+#' and metadata that are in a rectangular data structure. List-columns is tidy
 #' data structure concept that can be combined with functional programming
 #' frameworks provided by e.g. the purrr package.
-#' @param data list with file name elements that contain spectra and metadata
-#' after reading binary OPUS files with \code{simplerspec::read_opus_univ()}
-#' @usage gather_spc(data)
-#' @return Spectral data and metadata in object of class tible
+#' @param data list with file name (`file_id`) elements that contain spectum
+#' types, corresponding x unit type values and metadata; The `data` list
+#' is commonly the output after reading binary OPUS files with
+#' \code{simplerspec::read_opus_univ()}.
+#' @param spc_types Character vector with the spectum types to be extracted
+#' and gathered into list columns. The spectrum type names need to match exactly
+#' the standardised spectrum type names, which are also names of the second list
+#' hierarchy elements of `data`. These values are allowed: `"spc"`: final
+#' spectrum after atmospheric compensation if performed, otherwise spectrum
+#' after ratioing sample and reference single channel reflectance -- referred to
+#' as `AB` in Bruker OPUS software; `"spc_nocomp"`: Spectrum prior to
+#' atmospheric compensation if performed; `"sc_sm"`: Single channel reflectance
+#' spectrum of the sample; `"sc_rf"`: Single channel reflectance spectrum of
+#' the reference (background spectrum); `"ig_sm"`: Interferogram of the sample;
+#' `"ig_rf"`: Interferogram of the reference. Default is only extracting final
+#' spectra, `spc_types = c("spc")`
+#' @usage gather_spc(data = data, spc_types = c("spc", "spc_nocomp", "sc_sm",
+#' "sc_rf", "ig_sm", "ig_rf"))
+#' @return Spectra related spectral data transformed into (list-)columns
+#' of object of class tibble.
+#' @importFrom rlang set_names
 #' @export
-gather_spc <- function(data) {
-
-  ## Extract data from list by dplyr::map variants -----------------------------
-
-  # Extract original spectral matrix for all scans
-  # First, try to map all spectra; if some of the list elements are NULL,
-  # remove all spectra and metadata for Bruker files that were
-  # not successfully read (NULL in final spectra in sublist "spc")
-  map_spc <- purrr::map(data, "spc")
-  which_NULL <- which(sapply(map_spc, is.null))
-  if (length(which_NULL > 0)) {
-    message(paste0("Sample spectra originating from the following",
-      " OPUS files could not gathered because extracted spectra are NULL: <",
-      paste(names(which_NULL), collapse = ";"), ">. ",
-      "The following list positions have therefore been omitted from
-      <data> when gathering spectra from list into tibble: ",
-      paste(which_NULL, collapse = "; ")), ".")
-    data <- data[- which_NULL]
-    map_spc <- map_spc[- which_NULL]
+gather_spc <- function(data,
+                       spc_types = c("spc")) {
+  spc_types <- map(spc_types, rlang::sym)
+  spc_types_chr <- map_chr(spc_types, rlang::quo_name)
+
+  # Duplicate original data to refer to full `data` list in messages
+  data_origin <- data
+
+  ## Make sure first level `data` elements are named
+
+  if (any(names(data) %in% "")) {
+    which_missing <- which(names(data) %in% "")
+    idx_nm_missing <- rlang::set_names(names(data)[which_missing],
+      which_missing)
+    message(paste0(length(which_missing), " `data` ",
+      ifelse(length(which_missing) > 1, "elements", "element"),
+      ifelse(length(which_missing) > 1, " have", " has"),
+      " missing `file_id` names. ",
+      ifelse(length(which_missing) > 1, "These", "This"), "`data` list index",
+      ifelse(length(which_missing) > 1, " positions\n", " position\n"),
+      ifelse(length(which_missing) > 1, "were", "was"),
+      " therefore removed:\n\n",
+      idx_nm_missing %>%
+        purrr::imap_chr(~ paste0(.y, " : ", .x)) %>% paste(collapse = "  "),
+      "\n\n"))
+
+    data <- data[-which_missing]
+  }
+
+  ## Check whether any file names (`file_id`'s) are duplicated;
+  ## remove and raise warning in case duplicated
+
+  is_duplicated <- duplicated(names(data))
+  idx_nm_duplicated <- rlang::set_names(names(data)[is_duplicated],
+    which(is_duplicated))
+  if (any(is_duplicated)) {
+    message(paste0(sum(is_duplicated), " `file_id` named ",
+      ifelse(sum(is_duplicated) > 1, "elements", "element"), " of `data` ",
+      ifelse(sum(is_duplicated) > 1, "are", "is"),
+      " non-unique/duplicated\nand ",
+      ifelse(sum(is_duplicated) > 1, "were", "was"), " therefore removed:\n\n",
+      idx_nm_duplicated %>%
+        purrr::imap_chr(~ paste0(.y, " : ", .x)) %>% paste(collapse = "  "),
+      "\n\n"))
+
+    data <- data[-is_duplicated]
  }
-  # Extract metadata list elements and combine into data.frame
-  map_metadata_df <- purrr::map_df(data, "metadata")
-  map_metadata <- purrr::map(data, "metadata")
-  # Extract rownames of spectra; remove names of rownames vector
-  rownames_spc <- unname(unlist(lapply(map_spc, rownames)))
-  # Extract wavenumbers
-  map_wavenumbers <- purrr::map(data, "wavenumbers")
-
-  ## Create list-column tibble
-  data_tibble <- tibble::as_tibble(
-    map_metadata_df[c("unique_id", "file_id", "sample_id")]
+
+  ## Check whether `spc_types` are allowed and present at the second level of
+  ## the nested list `data`
+
+  spc_types_allowed <- c("spc", "spc_nocomp", "sc_sm", "sc_rf", "ig_sm",
+    "ig_rf")
+  spc_types_present <- spc_types_chr %in% spc_types_allowed
+
+  if (any(!spc_types_present)) {
+    options(useFancyQuotes = FALSE)
+    stop("The following spectrum types specified in `spc_types` are",
+      " not supported:\n\n",
+      paste(dQuote(spc_types_chr[!spc_types_present]), collapse = ", ")) }
+
+  ## Collect names of spectrum types nested at second `file_id` `data` list
+  ## level to check whether all specified `spc_types` are present in `data`;
+  ## access sub-element names at that list level
+
+  data_types_byfile <- purrr::modify_depth(data, .depth = 1,
+    ~ purrr::imap_chr(.x, ~ .y))
+  spc_types_byfile_in <- map(data_types_byfile, ~ spc_types_chr %in% .x)
+
+  if (any(map_lgl(spc_types_byfile_in, ~ any(!.x)))) {
+    which_rm <- map_lgl(spc_types_byfile_in, ~ any(!.x)) %>% which()
+    rm_origin <- names(data_origin[names(which_rm)])
+    rm_origin_lgl <- names(data_origin) %in% rm_origin
+    which_rm_origin <- rlang::set_names(rm_origin, which(rm_origin_lgl))
+
+    message(paste0("Spectrum types (second list level names) specified in",
+      " `spc_types` were not found\nwithin all first level elements ",
+      " of list `data` or are NULL (spectra data by `file_id`).\n",
+      "Therefore, all data elements for the corresponding",
+      " list indices and `file_id`'s\nwere removed from `data`:\n\n",
+      which_rm_origin %>%
+        purrr::imap_chr(~ paste0(.y, " : ", .x)) %>% paste(collapse = "  "),
+      "\n\n"))
+
+    data <- data[- which_rm]
+  }
+
+  ## Extract the spectra of different types by plucking all types specified in
+  ## `spc_types_chr` argment into separate first list level elements
+
+  spc_mapped <- map(rlang::set_names(spc_types_chr),
+    ~ pluck_depth(data = data, .depth = 1, .string = .x))
+
+  ## Match and extract the values for the x unit types
+
+  spc_type_x_unit <- list(
+    "spc" = c("wavenumbers", "wavelengths", "x_values"),
+    "spc_nocomp" = c("wavenumbers", "wavelengths", "x_values"),
+    "sc_sm" = c("wavenumbers_sc_sm", "wavelengths_sc_sm", "x_values_sc_sm"),
+    "sc_rf" = c("wavenumbers_sc_rf", "wavelengths_sc_rf", "x_values_sc_rf")
  )

-  ## Add spectra and wavenumbers
-  tibble::add_column(.data = data_tibble,
-    # raw spectra
-    spc = map_spc,
-    wavenumbers = map_wavenumbers,
-    metadata = map_metadata
+  lcols_x_values_matching <- purrr::flatten_chr(spc_type_x_unit[spc_types_chr])
+  x_values_matching <- lcols_x_values_matching[lcols_x_values_matching
+    %in% unique(purrr::flatten_chr(data_types_byfile))]
+
+  # `data` has eventually been updated (removed `data` elements)
+  data_types_byfile <- purrr::modify_depth(data, .depth = 1,
+    ~ purrr::imap_chr(.x, ~ .y))
+  x_value_types_byfile_in <- map(data_types_byfile, ~ x_values_matching %in% .x)
+
+  if (any(map_lgl(x_value_types_byfile_in, ~ any(!.x)))) {
+    which_rm <- map_lgl(x_value_types_byfile_in, ~ any(!.x)) %>% which()
+    rm_origin <- names(data_origin[names(which_rm)])
+    rm_origin_lgl <- names(data_origin) %in% rm_origin
+    which_rm_origin <- rlang::set_names(rm_origin, which(rm_origin_lgl))
+
+    message(paste0("These x unit types (second list level names)",
+      " corresponding to \nspecified `spc_types` spectrum types were not",
+      " found within\nall first level elements or are NULL",
+      " (spectra data by `file_id`) of list `data`: \n\n",
+      paste(dQuote(x_values_matching), collapse = ", ")), "\n\n",
+      "Therefore, all data elements for the corresponding",
+      " list indices\nand `file_id`'s were removed from `data`:\n\n",
+      which_rm_origin %>%
+        purrr::imap_chr(~ paste0(.y, " : ", .x)) %>% paste(collapse = "  "),
+      "\n\n")
+
+    data <- data[- which_rm]
+    spc_mapped <- map(spc_mapped, ~ .x[- which_rm])
+  }
+
+  x_values_mapped <- map(set_names(x_values_matching),
+    ~ pluck_depth(data = data, .depth = 1, .string = .x))
+
+  ## Combine the mapped spectra and correspoding x unit types, and order
+  ## list before returning spectral tibble (`spc_tbl`)
+
+  spc_x_unit_types_order <- c("spc", "wavenumbers", "wavelengths", "x_values",
+    "spc_nocomp",
+    "sc_sm", "wavenumbers_sc_sm", "wavelengths_sc_sm", "x_values_sc_sm",
+    "sc_rf", "wavenumbers_sc_rf", "wavelengths_sc_rf", "x_values_sc_rf",
+    "ig_sm", "ig_rf")
+
+  spc_x_values_mapped <- c(spc_mapped, x_values_mapped)
+  spc_x_unit_types_matched <- spc_x_unit_types_order[spc_x_unit_types_order %in%
+    names(spc_x_values_mapped)]
+  spc_x_values_mapped <- spc_x_values_mapped[spc_x_unit_types_matched]
+
+  ## Check if `metadata` elements are present for all list first level elements
+
+  # `data` has eventually been updated (removed `data` elements)
+  data_types_byfile <- purrr::modify_depth(data, .depth = 1,
+    ~ purrr::imap_chr(.x, ~ .y))
+  metadata_byfile_in <- map(data_types_byfile, ~ "metadata" %in% .x)
+
+  if (any(map_lgl(metadata_byfile_in, ~ any(!.x)))) {
+    which_rm <- map_lgl(metadata_byfile_in, ~ any(!.x)) %>% which()
+    rm_origin <- names(data_origin[names(which_rm)])
+    rm_origin_lgl <- names(data_origin) %in% rm_origin
+    which_rm_origin <- rlang::set_names(rm_origin, which(rm_origin_lgl))
+
+    message(paste0("`metadata` (second list level names) was not",
+      " found within\n all first level elements (`metadata` by `file_id`)",
+      " of list `data` or are NULL.\n\n",
+    "Therefore, all data elements for the corresponding",
+    " list indices\nand `file_id`'s were removed from `data`:\n\n",
+    which_rm_origin %>%
+      purrr::imap_chr(~ paste0(.y, " : ", .x)) %>% paste(collapse = "  "),
+        "\n\n"))
+
+    data <- data[- which_rm]
+    spc_x_values_mapped <- map(spc_x_values_mapped, ~ .x[- which_rm])
+  }
+
+  ## Extract metadata list elements and combine into tibble
+
+  metadata_mapped <- map(set_names(list("metadata")),
+    ~ pluck_depth(data = data, .depth = 1, .string = .x))
+  metadata_df <- map_df(data, "metadata")
+  id_tbl <- tibble::as_tibble(
+    metadata_df[c("unique_id", "file_id", "sample_id")]
  )
+
+  ## Column bind all tibbles of metadata id's, metadata, spectra, and
+  ## x unit type values; return on combined spectral tibble (`spc_tbl`)
+
+  spc_tbl <- tibble::as_tibble(spc_x_values_mapped)
+  metadata_tbl <- tibble::as_tibble(metadata_mapped)
+  dplyr::bind_cols(list(id_tbl, metadata_tbl, spc_tbl))
+}
+
+
+# Helpers ----------------------------------------------------------------------
+
+# Pluck out single elements from a list hierarchy level (`.depth`)
+# specified in `.string` based on element names; All elements with the specified
+# name will be transposed into a single list named by the element name.
+# `pluck_depth` can be combined with `purrr::map` to extract elements of
+# different names at all list nodes at the specified level, using e.g.
+# map(rlang::set_names(c("spc", "spc_nocomp")),
+#   ~ pluck_depth(data = data, .depth = 1, .string = .x))
+
+pluck_depth <- function(data, .depth = 1, .string = NULL) {
+  purrr::modify_depth(data, .depth = .depth, ~ purrr::pluck(.x, .string))
 }
--- a/man/gather_spc.Rd
+++ b/man/gather_spc.Rd
@ -2,22 +2,41 @@
 % Please edit documentation in R/gather-spc.R
 \name{gather_spc}
 \alias{gather_spc}
-\title{Gather spectra from list of spectral data into a tibble object}
+\title{Gather spectra of different types from nested list of spectral data
+into a tibble object}
 \usage{
-gather_spc(data)
+gather_spc(data = data, spc_types = c("spc", "spc_nocomp", "sc_sm",
+"sc_rf", "ig_sm", "ig_rf"))
 }
 \arguments{
-\item{data}{list with file name elements that contain spectra and metadata
-after reading binary OPUS files with \code{simplerspec::read_opus_univ()}}
+\item{data}{list with file name (`file_id`) elements that contain spectum
+types, corresponding x unit type values and metadata; The `data` list
+is commonly the output after reading binary OPUS files with
+\code{simplerspec::read_opus_univ()}.}
+
+\item{spc_types}{Character vector with the spectum types to be extracted
+and gathered into list columns. The spectrum type names need to match exactly
+the standardised spectrum type names, which are also names of the second list
+hierarchy elements of `data`. These values are allowed: `"spc"`: final
+spectrum after atmospheric compensation if performed, otherwise spectrum
+after ratioing sample and reference single channel reflectance -- referred to
+as `AB` in Bruker OPUS software; `"spc_nocomp"`: Spectrum prior to
+atmospheric compensation if performed; `"sc_sm"`: Single channel reflectance
+spectrum of the sample; `"sc_rf"`: Single channel reflectance spectrum of
+the reference (background spectrum); `"ig_sm"`: Interferogram of the sample;
+`"ig_rf"`: Interferogram of the reference. Default is only extracting final
+spectra, `spc_types = c("spc")`}
 }
 \value{
-Spectral data and metadata in object of class tible
+Spectra related spectral data transformed into (list-)columns
+of object of class tibble.
 }
 \description{
-Gather specta and spectrometer metadata from list into a tibble.
-Spectra and wavenumbers are stored in list-columns. A tibble is an extended
-data frame and each spectrum can contain complex data and metadata that
-are in a rectangular data structure. List-columns is tidy
+Gather specta, spectrometer metadata, measurment id's,
+x unit values from list into a tibble, one row per spectrum replicate
+measurement. Spectra, x values and metadata are stored in list-columns.
+A tibble is an extende data frame and each spectrum can contain complex data
+and metadata that are in a rectangular data structure. List-columns is tidy
 data structure concept that can be combined with functional programming
 frameworks provided by e.g. the purrr package.
 }