working on #14

6 years ago · e2101c435d
7 changed files with 70 additions and 39 deletions
--- a/13
+++ b/13
@ -1,10 +1,12 @@
 Package: docxtractr
 Title: Extract Data Tables and Comments from 'Microsoft' 'Word' Documents
-Version: 0.4.0
+Version: 0.5.0
 Authors@R: c(
-      person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre")),
-      person("Mark", "Dulhunty", role = c("ctb"))
-    )
+    person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"), 
+           comment = c(ORCID = "0000-0001-5670-2640")),
+    person("Mark", "Dulhunty", role = c("ctb")),
+    person("Karlo", "Martins", role = c("ctb"), email = "guidonimartins@gmail.com")
+  )
 Maintainer: Bob Rudis <bob@rud.is>
 Description: 'Microsoft Word' 'docx' files provide an 'XML' structure that is fairly
    straightforward to navigate, especially when it applies to 'Word' tables and
@ -12,6 +14,7 @@ Description: 'Microsoft Word' 'docx' files provide an 'XML' structure that is fa
    and also to extract/clean tables and comments from 'Microsoft Word' 'docx' documents.
 URL: http://github.com/hrbrmstr/docxtractr
 BugReports: https://github.com/hrbrmstr/docxtractr/issues
+Encoding: UTF-8
 Depends:
    R (>= 3.2.0)
 License: MIT + file LICENSE
@ -27,4 +30,4 @@ Imports:
    dplyr,
    utils,
    httr
-RoxygenNote: 6.0.1
+RoxygenNote: 6.0.1.9000
--- a/R/comments.R
+++ b/R/comments.R
@ -0,0 +1,54 @@
+#' Extract all comments from a Word document
+#'
+#' @md
+#' @param docx \code{docx} object read with \code{read_docx}
+#' @param include_text if `TRUE` then the text associated with the comment will
+#'        also be included
+#' @return \code{data_frame} of comment id, author & text
+#' @export
+#' @examples
+#' cmnts <- read_docx(system.file("examples/comments.docx", package="docxtractr"))
+#' docx_cmnt_count(cmnts)
+#' docx_describe_cmnts(cmnts)
+#' docx_extract_all_cmnts(cmnts)
+docx_extract_all_cmnts <- function(docx, include_text=FALSE) {
+
+  ensure_docx(docx)
+  if (docx_cmnt_count(docx) < 1) return(tibble::tibble())
+
+  comments <- docx$cmnts
+
+  # one row per node in `docx$cmnts`; each node's XML attributes become columns
+  purrr::map_df(xml2::xml_attrs(comments), function(x) {
+    tibble::as_tibble(t(cbind.data.frame(x, stringsAsFactors=FALSE)))
+  }) -> meta
+
+  dplyr::bind_cols(
+    meta,
+    cbind.data.frame(comment_text=xml2::xml_text(comments), stringsAsFactors=FALSE)
+  ) -> out
+
+  if (include_text) {
+
+    doc <- docx$docx
+
+    # per comment id: siblings following its commentRangeStart that are also
+    # preceding siblings of its commentRangeEnd (XPath 1.0 node-set intersection)
+    out$word_src <- purrr::map_chr(out$id, ~{
+      xml2::xml_find_all(
+        doc,
+        sprintf("//w:commentRangeStart[@w:id='%s']/following-sibling::*[
+             count(. | //w:commentRangeEnd[@w:id='%s']/preceding-sibling::*) =
+             count(//w:commentRangeEnd[@w:id='%s']/preceding-sibling::*)]",
+                .x, .x, .x)
+      ) -> in_range
+
+      # several nodes may match; collapse their text into one string
+      paste0(xml2::xml_text(in_range), collapse=" ")
+    })
+
+  }
+
+  tibble::as_tibble(out)
+
+}
--- a/R/docxtractr-package.r
+++ b/R/docxtractr-package.r
@ -18,3 +18,4 @@
 #' @importFrom purrr map_df map map_int map_chr map_lgl
 #' @importFrom httr GET stop_for_status write_disk
 NULL
+
--- a/R/extract_all.r
+++ b/R/extract_all.r
@ -59,34 +59,3 @@ docx_extract_all <- function(docx, guess_header=TRUE, preserve=FALSE, trim=TRUE)
  message("docx_extract_all() is deprecated; use docx_extract_all_tbls()")
  docx_extract_all_tbls(docx, guess_header, preserve, trim)
 }
-
-#' Extract all comments from a Word document
-#'
-#' @param docx \code{docx} object read with \code{read_docx}
-#' @return \code{data_frame} of comment id, author & text
-#' @export
-#' @examples
-#' cmnts <- read_docx(system.file("examples/comments.docx", package="docxtractr"))
-#' docx_cmnt_count(cmnts)
-#' docx_describe_cmnts(cmnts)
-#' docx_extract_all_cmnts(cmnts)
-docx_extract_all_cmnts <- function(docx) {
-
-  ensure_docx(docx)
-  if (docx_cmnt_count(docx) < 1) return(data_frame())
-
-  ns <- docx$ns
-
-  comments <- docx$cmnts
-
-  purrr::map_df(xml2::xml_attrs(comments), function(x) {
-    as_data_frame(t(cbind.data.frame(x, stringsAsFactors=FALSE)))
-  }) -> meta
-
-  bind_cols(meta,
-            cbind.data.frame(comment_text=xml2::xml_text(comments),
-                             stringsAsFactors=FALSE)) -> out
-
-  as_tibble(out)
-
-}
--- a/R/utils.r
+++ b/R/utils.r
@ -31,4 +31,4 @@ has_header <- function(tbl, rows, ns) {

 is_url <- function(path) { grepl("^(http|ftp)s?://", path) }

-is_docx <- function(path) { tolower(file_ext(path)) == "docx" }
+is_docx <- function(path) { tolower(tools::file_ext(path)) == "docx" } # case-insensitive check of the file extension
--- a/1
+++ b/1
@ -0,0 +1 @@
+Subproject commit 74d48f22ac94804e5b83d97b18508aafe94eb365
--- a/man/docx_extract_all_cmnts.Rd
+++ b/man/docx_extract_all_cmnts.Rd
@ -1,13 +1,16 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/extract_all.r
+% Please edit documentation in R/comments.R
 \name{docx_extract_all_cmnts}
 \alias{docx_extract_all_cmnts}
 \title{Extract all comments from a Word document}
 \usage{
-docx_extract_all_cmnts(docx)
+docx_extract_all_cmnts(docx, include_text = FALSE)
 }
 \arguments{
 \item{docx}{\code{docx} object read with \code{read_docx}}
+
+\item{include_text}{if \code{TRUE} then the text associated with the comment will
+also be included}
 }
 \value{
 \code{data_frame} of comment id, author & text
			`@ -0,0 +1 @@`
			`Subproject commit 74d48f22ac94804e5b83d97b18508aafe94eb365`