Browse Source

track changes support ref: #19

tags/v0.6.1
boB Rudis 6 years ago
parent
commit
624db148eb
No known key found for this signature in database GPG Key ID: 1D7529BE14E2BBA9
  1. 7
      DESCRIPTION
  2. 2
      NAMESPACE
  3. 5
      NEWS.md
  4. 2
      R/docxtractr-package.r
  5. 50
      R/read_docs.r
  6. 11
      R/utils-pipe.R
  7. 25
      README.Rmd
  8. 50
      README.md
  9. BIN
      inst/examples/trackchanges.docx
  10. 12
      man/pipe.Rd
  11. 19
      man/read_docx.Rd

7
DESCRIPTION

@ -1,6 +1,6 @@
Package: docxtractr
Title: Extract Data Tables and Comments from 'Microsoft' 'Word' Documents
Version: 0.5.0
Version: 0.6.0
Authors@R: c(
person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"),
comment = c(ORCID = "0000-0001-5670-2640")),
@ -24,14 +24,15 @@ LazyData: true
Suggests:
testthat,
covr
Imports:
Imports:
tools,
xml2,
tibble,
purrr,
dplyr,
utils,
httr
httr,
magrittr
RoxygenNote: 6.0.1.9000
SystemRequirements: LibreOffice (<https://www.libreoffice.org/>) required to extract
data from .doc files.

2
NAMESPACE

@ -1,6 +1,7 @@
# Generated by roxygen2: do not edit by hand
S3method(print,docx)
export("%>%")
export(assign_colnames)
export(docx_cmnt_count)
export(docx_describe_cmnts)
@ -20,6 +21,7 @@ importFrom(dplyr,select)
importFrom(httr,GET)
importFrom(httr,stop_for_status)
importFrom(httr,write_disk)
importFrom(magrittr,"%>%")
importFrom(purrr,map)
importFrom(purrr,map_chr)
importFrom(purrr,map_df)

5
NEWS.md

@ -1,3 +1,8 @@
# 0.6.0
- Enable support for accepting or rejecting tracked changes when
reading in the document. Ref #19
# 0.5.0
- .doc input supported (via Chris Muir)

2
R/docxtractr-package.r

@ -10,7 +10,7 @@
#' @docType package
#'
#' @author Bob Rudis (bob@@rud.is)
#' @importFrom xml2 xml_find_all xml_text xml_ns xml_find_first xml_attrs
#' @importFrom xml2 xml_find_all xml_text xml_ns xml_find_first xml_attrs read_xml
#' @importFrom tibble data_frame as_data_frame as_tibble
#' @importFrom dplyr bind_cols count arrange select
#' @importFrom tools file_ext

50
R/read_docs.r

@ -4,20 +4,39 @@
#' \code{.doc} file as input if \code{LibreOffice} is installed
#' (see \url{https://www.libreoffice.org/} for more info and to download).
#'
#' @md
#' @param path path to the Word document
#' @importFrom xml2 read_xml
#' @param track_changes if not `NULL` (the default) then must be one of
#' "`accept`" or "`reject`" which will, respectively, accept all or
#' reject all changes. NOTE: this functionality relies on the
#' `pandoc` utility being available on the system `PATH`. Both
#' system `PATH` and the `RSTUDIO_PANDOC` (RStudio ships with
#' a copy of `pandoc`) environment variables will be checked.
#' If no `pandoc` binary is found then a warning will be issued
#' and the document will be read without integrating or ignoring
#' any tracked changes. The original Word document *will not be modified*
#' and this feature *only works* with `docx` files.
#' @export
#' @examples
#' doc <- read_docx(system.file("examples/data.docx", package="docxtractr"))
#' class(doc)
#'
#' doc <- read_docx(
#' system.file("examples/trackchanges.docx", package="docxtractr"),
#' track_changes = "accept"
#' )
#'
#' \dontrun{
#' # from a URL
#' # budget <- read_docx(
#' #   "http://rud.is/dl/1.DOCX")
#' }
read_docx <- function(path) {
read_docx <- function(path, track_changes=NULL) {
stopifnot(is.character(path))
if (!is.null(track_changes)) {
track_changes <- match.arg(track_changes, c("accept", "reject"))
}
# make temporary things for us to work with
tmpd <- tempdir()
@ -78,6 +97,33 @@ read_docx <- function(path) {
}
}
if (!is.null(track_changes)) {
pandoc_bin <- Sys.which("pandoc")
if (pandoc_bin == "") {
pandoc_bin <- Sys.getenv("RSTUDIO_PANDOC")
if (pandoc_bin == "") {
warning(
"Track changes option was used but no pandoc binary was found. ",
"Please ensure that the directory containing pandoc is available ",
"on the system PATH and restart the R session before trying again. ",
"Reading in document *without* tracking any changes."
)
}
}
if (pandoc_bin != "") {
system2(
command = pandoc_bin,
args = c(
"-f", "docx",
"-t", "docx",
"-o", tmpf,
sprintf("--track-changes=%s", track_changes),
tmpf
)
)
}
}
# unzip it
unzip(tmpf, exdir=sprintf("%s/docdata", tmpd))

11
R/utils-pipe.R

@ -0,0 +1,11 @@
#' Pipe operator
#'
#' See \code{magrittr::\link[magrittr]{\%>\%}} for details.
#'
#' @name %>%
#' @rdname pipe
#' @keywords internal
#' @export
#' @importFrom magrittr %>%
#' @usage lhs \%>\% rhs
NULL

25
README.Rmd

@ -56,6 +56,7 @@ The following data files are included:
- `system.file("examples/complex.docx", package="docxtractr")`: Word docx with non-uniform tables
- `system.file("examples/comments.docx", package="docxtractr")`: Word docx with comments
- `system.file("examples/realworld.docx", package="docxtractr")`: A "real world" Word docx file with tables of all shapes and sizes
- `system.file("examples/trackchanges.docx", package="docxtractr")`: Word docx with track changes in a table
## Installation
@ -176,6 +177,30 @@ print(cmnts)
glimpse(docx_extract_all_cmnts(cmnts))
```
### Track Changes (depends on `pandoc` being available)
```{r track-changes}
# original
read_docx(
system.file("examples/trackchanges.docx", package="docxtractr")
) %>%
docx_extract_all_tbls(guess_header = FALSE)
# accept
read_docx(
system.file("examples/trackchanges.docx", package="docxtractr"),
track_changes = "accept"
) %>%
docx_extract_all_tbls(guess_header = FALSE)
# reject
read_docx(
system.file("examples/trackchanges.docx", package="docxtractr"),
track_changes = "reject"
) %>%
docx_extract_all_tbls(guess_header = FALSE)
```
## Test Results
```{r test}

50
README.md

@ -70,6 +70,8 @@ The following data files are included:
docx with comments
- `system.file("examples/realworld.docx", package="docxtractr")`: A
“real world” Word docx file with tables of all shapes and sizes
- `system.file("examples/trackchanges.docx", package="docxtractr")`:
Word docx with track changes in a table
## Installation
@ -88,7 +90,7 @@ library(dplyr)
# current version
packageVersion("docxtractr")
#> [1] '0.5.0'
#> [1] '0.6.0'
```
``` r
@ -441,6 +443,46 @@ glimpse(docx_extract_all_cmnts(cmnts))
#> $ comment_text <chr> "This is the first comment", "This is the second comment", "This is a reply to the second comm...
```
### Track Changes (depends on `pandoc` being available)
``` r
# original
read_docx(
system.file("examples/trackchanges.docx", package="docxtractr")
) %>%
docx_extract_all_tbls(guess_header = FALSE)
#> NOTE: header=FALSE but table has a marked header row in the Word document
#> [[1]]
#> # A tibble: 1 x 1
#> V1
#> <chr>
#> 1 21
# accept
read_docx(
system.file("examples/trackchanges.docx", package="docxtractr"),
track_changes = "accept"
) %>%
docx_extract_all_tbls(guess_header = FALSE)
#> [[1]]
#> # A tibble: 1 x 1
#> V1
#> <chr>
#> 1 2
# reject
read_docx(
system.file("examples/trackchanges.docx", package="docxtractr"),
track_changes = "reject"
) %>%
docx_extract_all_tbls(guess_header = FALSE)
#> [[1]]
#> # A tibble: 1 x 1
#> V1
#> <chr>
#> 1 1
```
## Test Results
``` r
@ -453,14 +495,14 @@ library(testthat)
#> matches
date()
#> [1] "Sun Sep 16 13:17:27 2018"
#> [1] "Tue Oct 23 08:10:10 2018"
test_dir("tests/")
#> ✔ | OK F W S | Context
#> ══ testthat results ═════════════════════════════════════════════════════════════════════════════════════════
#> ══ testthat results ═════════════════════════════════════════════════
#> OK: 16 SKIPPED: 0 FAILED: 0
#>
#> ══ Results ═══════════════════════════════════════════════════════════════════════════════════════════════════
#> ══ Results ═══════════════════════════════════════════════════════════
#> Duration: 0.2 s
#>
#> OK: 0

BIN
inst/examples/trackchanges.docx

Binary file not shown.

12
man/pipe.Rd

@ -0,0 +1,12 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils-pipe.R
\name{\%>\%}
\alias{\%>\%}
\title{Pipe operator}
\usage{
lhs \%>\% rhs
}
\description{
See \code{magrittr::\link[magrittr]{\%>\%}} for details.
}
\keyword{internal}

19
man/read_docx.Rd

@ -4,10 +4,21 @@
\alias{read_docx}
\title{Read in a Word document for table extraction}
\usage{
read_docx(path)
read_docx(path, track_changes = NULL)
}
\arguments{
\item{path}{path to the Word document}
\item{track_changes}{if not \code{NULL} (the default) then must be one of
"\code{accept}" or "\code{reject}" which will, respectively, accept all or
reject all changes. NOTE: this functionality relies on the
\code{pandoc} utility being available on the system \code{PATH}. Both
system \code{PATH} and the \code{RSTUDIO_PANDOC} (RStudio ships with
a copy of \code{pandoc}) environment variables will be checked.
If no \code{pandoc} binary is found then a warning will be issued
and the document will be read without integrating or ignoring
any tracked changes. The original Word document \emph{will not be modified}
and this feature \emph{only works} with \code{docx} files.}
}
\description{
Local file path or URL pointing to a \code{.docx} file. Can also take
@ -17,6 +28,12 @@ Local file path or URL pointing to a \code{.docx} file. Can also take
\examples{
doc <- read_docx(system.file("examples/data.docx", package="docxtractr"))
class(doc)
doc <- read_docx(
system.file("examples/trackchanges.docx", package="docxtractr"),
track_changes = "accept"
)
\dontrun{
# from a URL
}

Loading…
Cancel
Save