Browse Source

working on #14

pull/15/head
boB Rudis 6 years ago
parent
commit
e2101c435d
No known key found for this signature in database GPG Key ID: 1D7529BE14E2BBA9
  1. 13
      DESCRIPTION
  2. 54
      R/comments.R
  3. 1
      R/docxtractr-package.r
  4. 31
      R/extract_all.r
  5. 2
      R/utils.r
  6. 1
      burrp
  7. 7
      man/docx_extract_all_cmnts.Rd

13
DESCRIPTION

@ -1,10 +1,12 @@
Package: docxtractr
Title: Extract Data Tables and Comments from 'Microsoft' 'Word' Documents
Version: 0.4.0
Version: 0.5.0
Authors@R: c(
person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre")),
person("Mark", "Dulhunty", role = c("ctb"))
)
person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"),
comment = c(ORCID = "0000-0001-5670-2640")),
person("Mark", "Dulhunty", role = c("ctb")),
person("Karlo", "Martins", role = c("ctb"), email = "guidonimartins@gmail.com")
)
Maintainer: Bob Rudis <bob@rud.is>
Description: 'Microsoft Word' 'docx' files provide an 'XML' structure that is fairly
straightforward to navigate, especially when it applies to 'Word' tables and
@ -12,6 +14,7 @@ Description: 'Microsoft Word' 'docx' files provide an 'XML' structure that is fa
and also to extract/clean tables and comments from 'Microsoft Word' 'docx' documents.
URL: http://github.com/hrbrmstr/docxtractr
BugReports: https://github.com/hrbrmstr/docxtractr/issues
Encoding: UTF-8
Depends:
R (>= 3.2.0)
License: MIT + file LICENSE
@ -27,4 +30,4 @@ Imports:
dplyr,
utils,
httr
RoxygenNote: 6.0.1
RoxygenNote: 6.0.1.9000

54
R/comments.R

@ -0,0 +1,54 @@
#' Extract all comments from a Word document
#'
#' Builds one row per comment from the attributes of the comment nodes held in
#' `docx$cmnts` plus each comment's own text. Optionally adds the document
#' text found between each comment's range start and range end markers.
#'
#' @md
#' @param docx `docx` object read with `read_docx`
#' @param include_text if `TRUE` then the text associated with the comment will
#'        also be included (in a `word_src` column)
#' @return tibble of comment id, author & text (the comment's own text is in
#'         `comment_text`); an empty tibble when the document has no comments
#' @export
#' @examples
#' cmnts <- read_docx(system.file("examples/comments.docx", package="docxtractr"))
#' docx_cmnt_count(cmnts)
#' docx_describe_cmnts(cmnts)
#' docx_extract_all_cmnts(cmnts)
docx_extract_all_cmnts <- function(docx, include_text=FALSE) {

  ensure_docx(docx)

  # tibble::data_frame() is deprecated; tibble::tibble() is its replacement
  if (docx_cmnt_count(docx) < 1) return(tibble::tibble())

  comments <- docx$cmnts

  # Each comment node's attributes become a single-row tibble; map_df() stacks
  # them so every attribute ends up as a column.
  meta <- purrr::map_df(xml2::xml_attrs(comments), function(x) {
    tibble::as_tibble(t(cbind.data.frame(x, stringsAsFactors=FALSE)))
  })

  out <- dplyr::bind_cols(
    meta,
    cbind.data.frame(comment_text=xml2::xml_text(comments), stringsAsFactors=FALSE)
  )

  if (include_text) {

    doc <- docx$docx

    # Select the siblings following commentRangeStart that are also siblings
    # preceding the matching commentRangeEnd. The count(. | set) = count(set)
    # test is the XPath 1.0 way of asking "is this node a member of set".
    xpath_template <- paste0(
      "//w:commentRangeStart[@w:id='%s']/following-sibling::*[",
      " count(. | //w:commentRangeEnd[@w:id='%s']/preceding-sibling::*) =",
      " count(//w:commentRangeEnd[@w:id='%s']/preceding-sibling::*)]"
    )

    out$word_src <- purrr::map_chr(out$id, function(id) {
      in_range <- xml2::xml_find_all(doc, sprintf(xpath_template, id, id, id))
      paste0(xml2::xml_text(in_range), collapse=" ")
    })

  }

  tibble::as_tibble(out)

}

1
R/docxtractr-package.r

@ -18,3 +18,4 @@
#' @importFrom purrr map_df map map_int map_chr map_lgl
#' @importFrom httr GET stop_for_status write_disk
NULL

31
R/extract_all.r

@ -59,34 +59,3 @@ docx_extract_all <- function(docx, guess_header=TRUE, preserve=FALSE, trim=TRUE)
message("docx_extract_all() is deprecated; use docx_extract_all_tbls()")
docx_extract_all_tbls(docx, guess_header, preserve, trim)
}
#' Extract all comments from a Word document
#'
#' Combines the attributes of every comment node in the document with the
#' text of the comment itself.
#'
#' @param docx \code{docx} object read with \code{read_docx}
#' @return \code{data_frame} of comment id, author & text
#' @export
#' @examples
#' cmnts <- read_docx(system.file("examples/comments.docx", package="docxtractr"))
#' docx_cmnt_count(cmnts)
#' docx_describe_cmnts(cmnts)
#' docx_extract_all_cmnts(cmnts)
docx_extract_all_cmnts <- function(docx) {

  ensure_docx(docx)

  # No comments in the document: hand back an empty data frame.
  if (docx_cmnt_count(docx) < 1) {
    return(data_frame())
  }

  ns <- docx$ns # carried over from the original; not referenced below
  cmnt_nodes <- docx$cmnts

  # One single-row data frame per comment node, stacked into one table.
  attr_rows <- purrr::map_df(xml2::xml_attrs(cmnt_nodes), function(x) {
    as_data_frame(t(cbind.data.frame(x, stringsAsFactors=FALSE)))
  })

  text_col <- cbind.data.frame(
    comment_text=xml2::xml_text(cmnt_nodes),
    stringsAsFactors=FALSE
  )

  as_tibble(bind_cols(attr_rows, text_col))

}

2
R/utils.r

@ -31,4 +31,4 @@ has_header <- function(tbl, rows, ns) {
# Test whether `path` starts with an http(s):// or ftp(s):// scheme.
# URI schemes are case-insensitive, so e.g. "HTTPS://..." is accepted too.
# Returns a logical vector with one element per element of `path`.
is_url <- function(path) { grepl("^(http|ftp)s?://", path, ignore.case=TRUE) }
# Test whether the file extension of `path` is "docx", ignoring case.
# Returns a logical vector with one element per element of `path`.
is_docx <- function(path) {
  ext <- file_ext(path)
  tolower(ext) == "docx"
}
# Test whether the file extension of `path` is "docx", ignoring case.
# Returns a logical vector with one element per element of `path`.
is_docx <- function(path) {
  ext <- tools::file_ext(path)
  tolower(ext) == "docx"
}

1
burrp

@ -0,0 +1 @@
Subproject commit 74d48f22ac94804e5b83d97b18508aafe94eb365

7
man/docx_extract_all_cmnts.Rd

@ -1,13 +1,16 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/extract_all.r
% Please edit documentation in R/comments.R
\name{docx_extract_all_cmnts}
\alias{docx_extract_all_cmnts}
\title{Extract all comments from a Word document}
\usage{
docx_extract_all_cmnts(docx)
docx_extract_all_cmnts(docx, include_text = FALSE)
}
\arguments{
\item{docx}{\code{docx} object read with \code{read_docx}}
\item{include_text}{if \code{TRUE} then the text associated with the comment will
also be included}
}
\value{
\code{data_frame} of comment id, author & text

Loading…
Cancel
Save