Bob Rudis
9 years ago
26 changed files with 774 additions and 4 deletions
@ -1,2 +1,8 @@ |
|||
^.*\.Rproj$ |
|||
^\.Rproj\.user$ |
|||
^README\.Rmd$ |
|||
^README-.*\.png$ |
|||
^\.travis\.yml$ |
|||
^CONDUCT\.md$ |
|||
^README\.md$ |
|||
^docxtractr-logo\.png$ |
|||
|
@ -0,0 +1,14 @@ |
|||
# Sample .travis.yml for R projects |
|||
|
|||
language: r |
|||
warnings_are_errors: true |
|||
sudo: required |
|||
|
|||
env: |
|||
global: |
|||
- CRAN: http://cran.rstudio.com |
|||
|
|||
notifications: |
|||
email: |
|||
on_success: change |
|||
on_failure: change |
@ -0,0 +1,25 @@ |
|||
# Contributor Code of Conduct |
|||
|
|||
As contributors and maintainers of this project, we pledge to respect all people who |
|||
contribute through reporting issues, posting feature requests, updating documentation, |
|||
submitting pull requests or patches, and other activities. |
|||
|
|||
We are committed to making participation in this project a harassment-free experience for |
|||
everyone, regardless of level of experience, gender, gender identity and expression, |
|||
sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion. |
|||
|
|||
Examples of unacceptable behavior by participants include the use of sexual language or |
|||
imagery, derogatory comments or personal attacks, trolling, public or private harassment, |
|||
insults, or other unprofessional conduct. |
|||
|
|||
Project maintainers have the right and responsibility to remove, edit, or reject comments, |
|||
commits, code, wiki edits, issues, and other contributions that are not aligned to this |
|||
Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed |
|||
from the project team. |
|||
|
|||
Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by |
|||
opening an issue or contacting one or more of the project maintainers. |
|||
|
|||
This Code of Conduct is adapted from the Contributor Covenant |
|||
(http://contributor-covenant.org), version 1.0.0, available at |
|||
http://contributor-covenant.org/version/1/0/0/ |
@ -1,8 +1,13 @@ |
|||
Package: docxtractr |
|||
Title: What the Package Does (one line, title case) |
|||
Title: Extract Tables from Microsoft Word Documents |
|||
Version: 0.0.0.9000 |
|||
Authors@R: c(person("Bob", "Rudis", email = "bob@rudis.net", role = c("aut", "cre"))) |
|||
Description: What the package does (one paragraph). |
|||
Depends: R (>= 3.2.2) |
|||
Description: Microsoft Word docx files provide an XML structure that is fairly |
|||
straightforward to navigate, especially when it applies to Word tables. The |
|||
docxtractr package provides tools to determine table count, table structure and |
|||
extract tables from Microsoft Word docx documents. |
|||
Depends: R (>= 3.0.0) |
|||
License: MIT + file LICENSE |
|||
LazyData: true |
|||
Suggests: testthat |
|||
Imports: tools, xml2, dplyr |
|||
|
@ -1 +1,15 @@ |
|||
exportPattern("^[^\\.]") |
|||
# Generated by roxygen2 (4.1.1): do not edit by hand |
|||
|
|||
S3method(print,docx) |
|||
export(docx_describe_tbls) |
|||
export(docx_extract_tbl) |
|||
export(docx_tbl_count) |
|||
export(read_docx) |
|||
importFrom(dplyr,bind_rows) |
|||
importFrom(tools,file_ext) |
|||
importFrom(xml2,read_xml) |
|||
importFrom(xml2,xml_attrs) |
|||
importFrom(xml2,xml_find_all) |
|||
importFrom(xml2,xml_find_one) |
|||
importFrom(xml2,xml_ns) |
|||
importFrom(xml2,xml_text) |
|||
|
@ -0,0 +1,65 @@ |
|||
#' Returns a description of all the tables in the Word document
#'
#' This function will attempt to discern the structure of each of the tables
#' in \code{docx} and print this information. For every table it prints the
#' total cell count, the row count, whether the rows appear to hold a uniform
#' number of cells and whether the first row is marked as a header.
#'
#' An error is raised when the document contains no tables; call
#' \code{docx_tbl_count} first if that is a possibility.
#'
#' @param docx \code{docx} object read with \code{read_docx}
#' @return \code{NULL}, invisibly; called for its printed output
#' @export
#' @examples
#' complx <- read_docx(system.file("examples/complex.docx", package="docxtractr"))
#' docx_tbl_count(complx)
#' docx_describe_tbls(complx)
docx_describe_tbls <- function(docx) {

  ensure_docx(docx)
  if (!docx_tbl_count(docx) > 0) stop("No tables in document", call.=FALSE)

  ns <- docx$ns
  tbls <- docx$tbls

  cat(sprintf("Word document [%s]\n\n", docx$path))

  for (i in seq_along(tbls)) {

    tbl <- tbls[[i]]

    cells <- xml_find_all(tbl, "./w:tr/w:tc", ns=ns)
    rows <- xml_find_all(tbl, "./w:tr", ns=ns)

    # vapply() guarantees an integer vector even when a table has no rows
    # (sapply() would hand back an empty list in that case)
    cell_count_by_row <- vapply(rows, function(row) {
      length(xml_find_all(row, "./w:tc", ns=ns))
    }, integer(1))
    row_counts <- paste0(unique(cell_count_by_row), collapse=", ")
    # the leading 0L keeps max() well defined for a table without rows
    max_cell_count <- max(0L, cell_count_by_row)

    cat(sprintf("Table %d\n total cells: %d\n row count : %d\n", i, length(cells), length(rows)))

    # simplistic test for whether table is uniform rows x cells == cell count
    if ((max_cell_count * length(rows)) == length(cells)) {
      cat(" uniform : likely!\n")
    } else {
      cat(sprintf(" uniform : unlikely => found differing cell counts (%s) across some rows \n", row_counts))
    }

    # microsoft has a tag for some table structure info. examine it to
    # see if the creator of the header made the first row special which
    # will likely mean it's a header candidate
    hdr <- has_header(tbl, rows, ns)
    if (is.na(hdr)) {
      cat(" has header : unlikely\n")
    } else {
      cat(sprintf(" has header : likely! => possibly [%s]\n", hdr))
    }

    cat("\n")

  }

  invisible(NULL)

}
|||
|
|||
#' Display information about the document
#'
#' Prints the table summary produced by \code{docx_describe_tbls}. A document
#' without any tables is reported as such rather than raising an error, so
#' that auto-printing a \code{docx} object at the console does not fail.
#'
#' @param x \code{docx} object
#' @param ... ignored
#' @return \code{x}, invisibly
#' @export
print.docx <- function(x, ...) {

  # docx_tbl_count() validates 'x' before counting
  if (docx_tbl_count(x) > 0) {
    docx_describe_tbls(x)
  } else {
    # docx_describe_tbls() stops on table-less documents; a print method
    # should not, so summarise the empty case here instead
    cat(sprintf("Word document [%s]\n\n", x$path))
    cat("No tables in document\n")
  }

  # print methods conventionally return their argument invisibly
  invisible(x)

}
@ -0,0 +1,62 @@ |
|||
#' Extract a table from a Word document
#'
#' Given a document read with \code{read_docx} and a table to extract (optionally
#' indicating whether there was a header or not and if cell whitespace trimming is
#' desired) extract the contents of the table to a \code{data.frame}.
#'
#' Every cell is returned as text. When \code{header} is \code{FALSE} the
#' columns are named \code{V1}, \code{V2}, ... and a message is emitted if the
#' Word document marks the first row of the table as a header row.
#'
#' @param docx \code{docx} object read with \code{read_docx}
#' @param tbl_number which table to extract (defaults to \code{1})
#' @param header assume first row of table is a header row? (default: \code{TRUE})
#' @param trim trim leading/trailing whitespace (if any) in cells? (default: \code{TRUE})
#' @return \code{data.frame}
#' @export
#' @examples
#' doc3 <- read_docx(system.file("examples/data3.docx", package="docxtractr"))
#' docx_extract_tbl(doc3, 3)
docx_extract_tbl <- function(docx, tbl_number=1, header=TRUE, trim=TRUE) {

  ensure_docx(docx)

  # scalar, short-circuiting checks: reject anything that is not a single,
  # non-missing number inside the range of available tables
  if (!is.numeric(tbl_number) || length(tbl_number) != 1 || is.na(tbl_number) ||
      (tbl_number < 1) || (tbl_number > docx_tbl_count(docx))) {
    stop("'tbl_number' is invalid.", call.=FALSE)
  }

  ns <- docx$ns
  tbl <- docx$tbls[[tbl_number]]

  rows <- xml_find_all(tbl, "./w:tr", ns=ns)

  # one single-row data.frame per table row; bind_rows() pads rows that have
  # fewer cells than the widest row with NA
  dat <- bind_rows(lapply(rows, function(row) {

    vals <- xml_text(xml_find_all(row, "./w:tc", ns=ns), trim=trim)
    # seq_along() yields no names for a row without cells (1:0 would yield two)
    names(vals) <- sprintf("V%d", seq_along(vals))
    data.frame(as.list(vals), stringsAsFactors=FALSE)

  }))

  if (header) {
    # drop=FALSE keeps a one-column table from collapsing to a bare vector
    colnames(dat) <- as.character(unlist(dat[1, , drop=FALSE], use.names=FALSE))
    dat <- dat[-1, , drop=FALSE]
  } else {
    hdr <- has_header(tbl, rows, ns)
    if (!is.na(hdr)) message("NOTE: header=FALSE but table has a marked header row in the Word document")
  }

  rownames(dat) <- NULL

  dat

}
|||
|
|||
#' Get number of tables in a Word document
#'
#' Counts the table nodes that \code{read_docx} stored in the \code{docx}
#' object.
#'
#' @param docx \code{docx} object read with \code{read_docx}
#' @return numeric
#' @export
#' @examples
#' complx <- read_docx(system.file("examples/complex.docx", package="docxtractr"))
#' docx_tbl_count(complx)
docx_tbl_count <- function(docx) {

  # fail early with a clear message when handed something that is not a docx
  ensure_docx(docx)

  found_tbls <- docx[["tbls"]]
  length(found_tbls)

}
@ -0,0 +1,14 @@ |
|||
#' docxtractr is an R package for extracting tables out of Word documents (docx) |
|||
#' |
|||
#' Microsoft Word docx files provide an XML structure that is fairly |
|||
#' straightforward to navigate, especially when it applies to Word tables. The |
|||
#' docxtractr package provides tools to determine table count, table structure and |
|||
#' extract tables from Microsoft Word docx documents. |
|||
#' |
|||
#' @name docxtractr |
|||
#' @docType package |
|||
#' |
|||
#' @author Bob Rudis (@@hrbrmstr) |
|||
#' @importFrom xml2 xml_find_all xml_text xml_ns xml_find_one xml_attrs |
|||
#' @importFrom dplyr bind_rows |
|||
NULL |
@ -0,0 +1,49 @@ |
|||
#' Read in a Word document for table extraction
#'
#' Path must be local (i.e. not a URL). The file is copied to a temporary
#' location, unzipped, and its \code{word/document.xml} is parsed; all
#' temporary files are removed again before the function returns, including
#' when an error occurs.
#'
#' @param path path to the Word document
#' @return an object of class \code{docx}: a list holding the parsed XML
#'   document (\code{docx}), its namespaces (\code{ns}), the table nodes
#'   found in it (\code{tbls}) and the expanded \code{path}
#' @importFrom xml2 read_xml
#' @importFrom tools file_ext
#' @export
#' @examples
#' doc <- read_docx(system.file("examples/data.docx", package="docxtractr"))
#' class(doc)
read_docx <- function(path) {

  path <- path.expand(path)

  # compare case-insensitively so that e.g. "REPORT.DOCX" is accepted as well
  if (tolower(file_ext(path)) != "docx") stop("read_docx only works with '.docx' files", call.=FALSE)
  if (!file.exists(path)) stop(sprintf("Cannot find '%s'", path), call.=FALSE)

  # make temporary things for us to work with; the extraction directory gets
  # a unique name so leftovers from another call can never be picked up
  tmpf <- tempfile(fileext=".zip")
  tmpd <- tempfile(pattern="docdata")

  # cleanup is registered up front so it also runs when a later step fails
  on.exit(unlink(tmpf), add=TRUE)
  on.exit(unlink(tmpd, recursive=TRUE), add=TRUE)

  # copy docx to zip (not entirely necessary)
  if (!file.copy(path, tmpf)) {
    stop(sprintf("Cannot copy '%s' to a temporary file", path), call.=FALSE)
  }

  # unzip it (exdir is created if necessary)
  unzip(tmpf, exdir=tmpd)

  doc_xml <- file.path(tmpd, "word", "document.xml")
  if (!file.exists(doc_xml)) {
    stop(sprintf("'%s' does not appear to be a valid Word document", path), call.=FALSE)
  }

  # read the actual XML document
  doc <- read_xml(doc_xml)

  # extract the namespace
  ns <- xml_ns(doc)

  # get the tables
  tbls <- xml_find_all(doc, ".//w:tbl", ns=ns)

  # make an object for other functions to work with
  docx <- list(docx=doc, ns=ns, tbls=tbls, path=path)

  # special class helps us work with these things
  class(docx) <- "docx"

  docx

}
@ -0,0 +1,30 @@ |
|||
# used by functions to make sure they are working with a well-formed docx object
#
# Stops with an error when 'docx' does not inherit from class "docx" or when
# any of the components "docx", "ns", "tbls" or "path" is missing from it.
# Returns NULL invisibly otherwise.
ensure_docx <- function(docx) {

  if (!inherits(docx, "docx")) stop("Must pass in a 'docx' object", call.=FALSE)

  # check the names directly; looking components up via exists(where=) turns
  # the list into an environment, which fails with an unrelated error as soon
  # as one element has no name
  required <- c("docx", "ns", "tbls", "path")
  if (!all(required %in% names(docx))) {
    stop("'docx' object missing necessary components", call.=FALSE)
  }

  invisible(NULL)

}
|||
|
|||
# test if a w:tbl has a header row
#
# Word stores some table structure hints in the table's w:tblPr/w:tblLook
# element. Returns NA when that element cannot be looked up, when it has no
# "firstRow" attribute, or when that attribute is "0". Otherwise returns the
# text of the cells of the first element of 'rows', joined with ", ", as a
# candidate header.
has_header <- function(tbl, rows, ns) {

  tbl_look <- try(xml_find_one(tbl, "./w:tblPr/w:tblLook", ns), silent=TRUE)

  # no tblLook element to inspect
  if (inherits(tbl_look, "try-error")) return(NA)

  attrs <- xml_attrs(tbl_look)

  # the creator of the table did not flag the first row at all
  if (!("firstRow" %in% names(attrs))) return(NA)

  # the flag is present but switched off
  if (attrs["firstRow"] == "0") return(NA)

  first_row_cells <- xml_find_all(rows[[1]], "./w:tc", ns)
  paste0(xml_text(first_row_cells), collapse=", ")

}
@ -0,0 +1,122 @@ |
|||
--- |
|||
output: |
|||
md_document: |
|||
variant: markdown_github |
|||
--- |
|||
|
|||
<!-- README.md is generated from README.Rmd. Please edit that file --> |
|||
|
|||
```{r, echo = FALSE} |
|||
knitr::opts_chunk$set( |
|||
collapse = TRUE, |
|||
comment = "#>", |
|||
fig.path = "README-" |
|||
) |
|||
``` |
|||
|
|||
![](docxtractr-logo.png) |
|||
|
|||
docxtractr is an R package for extracting tables out of Word documents (docx) |
|||
|
|||
Microsoft Word docx files provide an XML structure that is fairly |
|||
straightforward to navigate, especially when it applies to Word tables. The |
|||
docxtractr package provides tools to determine table count, table structure and |
|||
extract tables from Microsoft Word docx documents. |
|||
|
|||
The following functions are implemented: |
|||
|
|||
- `read_docx`: Read in a Word document for table extraction |
|||
- `docx_describe_tbls`: Returns a description of all the tables in the Word document |
|||
- `docx_extract_tbl`: Extract a table from a Word document |
|||
- `docx_tbl_count`: Get number of tables in a Word document |
|||
|
|||
The following data files are included: |
|||
|
|||
- `system.file("examples/data.docx", package="docxtractr")`: Word docx with 1 table |
|||
- `system.file("examples/data3.docx", package="docxtractr")`: Word docx with 3 tables |
|||
- `system.file("examples/none.docx", package="docxtractr")`: Word docx with 0 tables |
|||
- `system.file("examples/complex.docx", package="docxtractr")`: Word docx with non-uniform tables |
|||
|
|||
### News |
|||
|
|||
- Version 0.0.0.9000 released |
|||
|
|||
### Installation |
|||
|
|||
```{r eval=FALSE} |
|||
devtools::install_github("hrbrmstr/docxtractr") |
|||
``` |
|||
|
|||
```{r echo=FALSE} |
|||
options(width=120) |
|||
``` |
|||
|
|||
### Usage |
|||
|
|||
```{r sample} |
|||
library(docxtractr) |
|||
|
|||
# current version |
|||
packageVersion("docxtractr") |
|||
|
|||
# one table |
|||
doc <- read_docx(system.file("examples/data.docx", package="docxtractr")) |
|||
|
|||
docx_tbl_count(doc) |
|||
|
|||
docx_describe_tbls(doc) |
|||
|
|||
docx_extract_tbl(doc, 1) |
|||
|
|||
docx_extract_tbl(doc) |
|||
|
|||
docx_extract_tbl(doc, header=FALSE) |
|||
|
|||
# three tables |
|||
doc3 <- read_docx(system.file("examples/data3.docx", package="docxtractr")) |
|||
|
|||
docx_extract_tbl(doc3, 3) |
|||
|
|||
docx_tbl_count(doc3) |
|||
|
|||
docx_describe_tbls(doc3) |
|||
|
|||
# no tables |
|||
none <- read_docx(system.file("examples/none.docx", package="docxtractr")) |
|||
|
|||
docx_tbl_count(none) |
|||
|
|||
# wrapping in try since it will return an error |
|||
# use docx_tbl_count before trying to extract in scripts/production |
|||
try(docx_describe_tbls(none)) |
|||
try(docx_extract_tbl(none, 2)) |
|||
|
|||
# 5 tables, with two in sketchy formats |
|||
complx <- read_docx(system.file("examples/complex.docx", package="docxtractr")) |
|||
|
|||
docx_tbl_count(complx) |
|||
|
|||
docx_describe_tbls(complx) |
|||
|
|||
docx_extract_tbl(complx, 3, header=TRUE) |
|||
|
|||
docx_extract_tbl(complx, 4, header=TRUE) |
|||
|
|||
docx_extract_tbl(complx, 5, header=TRUE) |
|||
``` |
|||
|
|||
### Test Results |
|||
|
|||
```{r} |
|||
library(docxtractr) |
|||
library(testthat) |
|||
|
|||
date() |
|||
|
|||
test_dir("tests/") |
|||
``` |
|||
|
|||
### Code of Conduct |
|||
|
|||
Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). |
|||
By participating in this project you agree to abide by its terms. |
@ -0,0 +1,219 @@ |
|||
<!-- README.md is generated from README.Rmd. Please edit that file --> |
|||
![](docxtractr-logo.png) |
|||
|
|||
docxtractr is an R package for extracting tables out of Word documents (docx) |
|||
|
|||
Microsoft Word docx files provide an XML structure that is fairly straightforward to navigate, especially when it applies to Word tables. The docxtractr package provides tools to determine table count, table structure and extract tables from Microsoft Word docx documents. |
|||
|
|||
The following functions are implemented: |
|||
|
|||
- `read_docx`: Read in a Word document for table extraction |
|||
- `docx_describe_tbls`: Returns a description of all the tables in the Word document |
|||
- `docx_extract_tbl`: Extract a table from a Word document |
|||
- `docx_tbl_count`: Get number of tables in a Word document |
|||
|
|||
The following data files are included: |
|||
|
|||
- `system.file("examples/data.docx", package="docxtractr")`: Word docx with 1 table |
|||
- `system.file("examples/data3.docx", package="docxtractr")`: Word docx with 3 tables |
|||
- `system.file("examples/none.docx", package="docxtractr")`: Word docx with 0 tables |
|||
- `system.file("examples/complex.docx", package="docxtractr")`: Word docx with non-uniform tables |
|||
|
|||
### News |
|||
|
|||
- Version 0.0.0.9000 released |
|||
|
|||
### Installation |
|||
|
|||
``` r |
|||
devtools::install_github("hrbrmstr/docxtractr") |
|||
``` |
|||
|
|||
### Usage |
|||
|
|||
``` r |
|||
library(docxtractr) |
|||
|
|||
# current version |
|||
packageVersion("docxtractr") |
|||
#> [1] '0.0.0.9000' |
|||
|
|||
# one table |
|||
doc <- read_docx(system.file("examples/data.docx", package="docxtractr")) |
|||
|
|||
docx_tbl_count(doc) |
|||
#> [1] 1 |
|||
|
|||
docx_describe_tbls(doc) |
|||
#> Word document [/Library/Frameworks/R.framework/Versions/3.2/Resources/library/docxtractr/examples/data.docx] |
|||
#> |
|||
#> Table 1 |
|||
#> total cells: 16 |
|||
#> row count : 4 |
|||
#> uniform : likely! |
|||
#> has header : likely! => possibly [This, Is, A, Column] |
|||
|
|||
docx_extract_tbl(doc, 1) |
|||
#> Source: local data frame [3 x 4] |
|||
#> |
|||
#> This Is A Column |
|||
#> 1 1 Cat 3.4 Dog |
|||
#> 2 3 Fish 100.3 Bird |
|||
#> 3 5 Pelican -99 Kangaroo |
|||
|
|||
docx_extract_tbl(doc) |
|||
#> Source: local data frame [3 x 4] |
|||
#> |
|||
#> This Is A Column |
|||
#> 1 1 Cat 3.4 Dog |
|||
#> 2 3 Fish 100.3 Bird |
|||
#> 3 5 Pelican -99 Kangaroo |
|||
|
|||
docx_extract_tbl(doc, header=FALSE) |
|||
#> NOTE: header=FALSE but table has a marked header row in the Word document |
|||
#> Source: local data frame [4 x 4] |
|||
#> |
|||
#> V1 V2 V3 V4 |
|||
#> 1 This Is A Column |
|||
#> 2 1 Cat 3.4 Dog |
|||
#> 3 3 Fish 100.3 Bird |
|||
#> 4 5 Pelican -99 Kangaroo |
|||
|
|||
# three tables |
|||
doc3 <- read_docx(system.file("examples/data3.docx", package="docxtractr")) |
|||
|
|||
docx_extract_tbl(doc3, 3) |
|||
#> Source: local data frame [6 x 2] |
|||
#> |
|||
#> Foo Bar |
|||
#> 1 Aa Bb |
|||
#> 2 Dd Ee |
|||
#> 3 Gg Hh |
|||
#> 4 1 2 |
|||
#> 5 Zz Jj |
|||
#> 6 Tt ii |
|||
|
|||
docx_tbl_count(doc3) |
|||
#> [1] 3 |
|||
|
|||
docx_describe_tbls(doc3) |
|||
#> Word document [/Library/Frameworks/R.framework/Versions/3.2/Resources/library/docxtractr/examples/data3.docx] |
|||
#> |
|||
#> Table 1 |
|||
#> total cells: 16 |
|||
#> row count : 4 |
|||
#> uniform : likely! |
|||
#> has header : likely! => possibly [This, Is, A, Column] |
|||
#> |
|||
#> Table 2 |
|||
#> total cells: 12 |
|||
#> row count : 4 |
|||
#> uniform : likely! |
|||
#> has header : likely! => possibly [Foo, Bar, Baz] |
|||
#> |
|||
#> Table 3 |
|||
#> total cells: 14 |
|||
#> row count : 7 |
|||
#> uniform : likely! |
|||
#> has header : likely! => possibly [Foo, Bar] |
|||
|
|||
# no tables |
|||
none <- read_docx(system.file("examples/none.docx", package="docxtractr")) |
|||
|
|||
docx_tbl_count(none) |
|||
#> [1] 0 |
|||
|
|||
# wrapping in try since it will return an error |
|||
# use docx_tbl_count before trying to extract in scripts/production |
|||
try(docx_describe_tbls(none)) |
|||
try(docx_extract_tbl(none, 2)) |
|||
|
|||
# 5 tables, with two in sketchy formats |
|||
complx <- read_docx(system.file("examples/complex.docx", package="docxtractr")) |
|||
|
|||
docx_tbl_count(complx) |
|||
#> [1] 5 |
|||
|
|||
docx_describe_tbls(complx) |
|||
#> Word document [/Library/Frameworks/R.framework/Versions/3.2/Resources/library/docxtractr/examples/complex.docx] |
|||
#> |
|||
#> Table 1 |
|||
#> total cells: 16 |
|||
#> row count : 4 |
|||
#> uniform : likely! |
|||
#> has header : likely! => possibly [This, Is, A, Column] |
|||
#> |
|||
#> Table 2 |
|||
#> total cells: 12 |
|||
#> row count : 4 |
|||
#> uniform : likely! |
|||
#> has header : likely! => possibly [Foo, Bar, Baz] |
|||
#> |
|||
#> Table 3 |
|||
#> total cells: 14 |
|||
#> row count : 7 |
|||
#> uniform : likely! |
|||
#> has header : likely! => possibly [Foo, Bar] |
|||
#> |
|||
#> Table 4 |
|||
#> total cells: 11 |
|||
#> row count : 4 |
|||
#> uniform : unlikely => found differing cell counts (3, 2) across some rows |
|||
#> has header : likely! => possibly [Foo, Bar, Baz] |
|||
#> |
|||
#> Table 5 |
|||
#> total cells: 21 |
|||
#> row count : 7 |
|||
#> uniform : likely! |
|||
#> has header : unlikely |
|||
|
|||
docx_extract_tbl(complx, 3, header=TRUE) |
|||
#> Source: local data frame [6 x 2] |
|||
#> |
|||
#> Foo Bar |
|||
#> 1 Aa Bb |
|||
#> 2 Dd Ee |
|||
#> 3 Gg Hh |
|||
#> 4 1 2 |
|||
#> 5 Zz Jj |
|||
#> 6 Tt ii |
|||
|
|||
docx_extract_tbl(complx, 4, header=TRUE) |
|||
#> Source: local data frame [3 x 3] |
|||
#> |
|||
#> Foo Bar Baz |
|||
#> 1 Aa BbCc NA |
|||
#> 2 Dd Ee Ff |
|||
#> 3 Gg Hh ii |
|||
|
|||
docx_extract_tbl(complx, 5, header=TRUE) |
|||
#> Source: local data frame [6 x 3] |
|||
#> |
|||
#> Foo Bar Baz |
|||
#> 1 Aa Bb Cc |
|||
#> 2 Dd Ee Ff |
|||
#> 3 Gg Hh Ii |
|||
#> 4 Jj88 Kk Ll |
|||
#> 5 Uu Ii |
|||
#> 6 Hh Ii h |
|||
``` |
|||
|
|||
### Test Results |
|||
|
|||
``` r |
|||
library(docxtractr) |
|||
library(testthat) |
|||
|
|||
date() |
|||
#> [1] "Mon Aug 24 13:36:23 2015" |
|||
|
|||
test_dir("tests/") |
|||
#> testthat results ======================================================================================================== |
|||
#> OK: 0 SKIPPED: 0 FAILED: 0 |
|||
#> |
|||
#> DONE |
|||
``` |
|||
|
|||
### Code of Conduct |
|||
|
|||
Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms. |
After Width: | Height: | Size: 19 KiB |
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,21 @@ |
|||
% Generated by roxygen2 (4.1.1): do not edit by hand |
|||
% Please edit documentation in R/describe.r |
|||
\name{docx_describe_tbls} |
|||
\alias{docx_describe_tbls} |
|||
\title{Returns a description of all the tables in the Word document} |
|||
\usage{ |
|||
docx_describe_tbls(docx) |
|||
} |
|||
\arguments{ |
|||
\item{docx}{\code{docx} object read with \code{read_docx}} |
|||
} |
|||
\description{ |
|||
This function will attempt to discern the structure of each of the tables |
|||
in \code{docx} and print this information |
|||
} |
|||
\examples{ |
|||
complx <- read_docx(system.file("examples/complex.docx", package="docxtractr")) |
|||
docx_tbl_count(complx) |
|||
docx_describe_tbls(complx) |
|||
} |
|||
|
@ -0,0 +1,30 @@ |
|||
% Generated by roxygen2 (4.1.1): do not edit by hand |
|||
% Please edit documentation in R/docx_find_tbls.r |
|||
\name{docx_extract_tbl} |
|||
\alias{docx_extract_tbl} |
|||
\title{Extract a table from a Word document} |
|||
\usage{ |
|||
docx_extract_tbl(docx, tbl_number = 1, header = TRUE, trim = TRUE) |
|||
} |
|||
\arguments{ |
|||
\item{docx}{\code{docx} object read with \code{read_docx}} |
|||
|
|||
\item{tbl_number}{which table to extract (defaults to \code{1})} |
|||
|
|||
\item{header}{assume first row of table is a header row? (default: \code{TRUE})} |
|||
|
|||
\item{trim}{trim leading/trailing whitespace (if any) in cells? (default: \code{TRUE})} |
|||
} |
|||
\value{ |
|||
\code{data.frame} |
|||
} |
|||
\description{ |
|||
Given a document read with \code{read_docx} and a table to extract (optionally |
|||
indicating whether there was a header or not and if cell whitespace trimming is |
|||
desired) extract the contents of the table to a \code{data.frame}. |
|||
} |
|||
\examples{ |
|||
doc3 <- read_docx(system.file("examples/data3.docx", package="docxtractr")) |
|||
docx_extract_tbl(doc3, 3) |
|||
} |
|||
|
@ -0,0 +1,22 @@ |
|||
% Generated by roxygen2 (4.1.1): do not edit by hand |
|||
% Please edit documentation in R/docx_find_tbls.r |
|||
\name{docx_tbl_count} |
|||
\alias{docx_tbl_count} |
|||
\title{Get number of tables in a Word document} |
|||
\usage{ |
|||
docx_tbl_count(docx) |
|||
} |
|||
\arguments{ |
|||
\item{docx}{\code{docx} object read with \code{read_docx}} |
|||
} |
|||
\value{ |
|||
numeric |
|||
} |
|||
\description{ |
|||
Get number of tables in a Word document |
|||
} |
|||
\examples{ |
|||
complx <- read_docx(system.file("examples/complex.docx", package="docxtractr")) |
|||
docx_tbl_count(complx) |
|||
} |
|||
|
@ -0,0 +1,17 @@ |
|||
% Generated by roxygen2 (4.1.1): do not edit by hand |
|||
% Please edit documentation in R/docxtractr-package.r |
|||
\docType{package} |
|||
\name{docxtractr} |
|||
\alias{docxtractr} |
|||
\alias{docxtractr-package} |
|||
\title{docxtractr is an R package for extracting tables out of Word documents (docx)} |
|||
\description{ |
|||
Microsoft Word docx files provide an XML structure that is fairly |
|||
straightforward to navigate, especially when it applies to Word tables. The |
|||
docxtractr package provides tools to determine table count, table structure and |
|||
extract tables from Microsoft Word docx documents. |
|||
} |
|||
\author{ |
|||
Bob Rudis (@hrbrmstr) |
|||
} |
|||
|
@ -0,0 +1,17 @@ |
|||
% Generated by roxygen2 (4.1.1): do not edit by hand |
|||
% Please edit documentation in R/describe.r |
|||
\name{print.docx} |
|||
\alias{print.docx} |
|||
\title{Display information about the document} |
|||
\usage{ |
|||
\method{print}{docx}(x, ...) |
|||
} |
|||
\arguments{ |
|||
\item{x}{\code{docx} object} |
|||
|
|||
\item{...}{ignored} |
|||
} |
|||
\description{ |
|||
Display information about the document |
|||
} |
|||
|
@ -0,0 +1,19 @@ |
|||
% Generated by roxygen2 (4.1.1): do not edit by hand |
|||
% Please edit documentation in R/read_docs.r |
|||
\name{read_docx} |
|||
\alias{read_docx} |
|||
\title{Read in a Word document for table extraction} |
|||
\usage{ |
|||
read_docx(path) |
|||
} |
|||
\arguments{ |
|||
\item{path}{path to the Word document} |
|||
} |
|||
\description{ |
|||
Path must be local (i.e. not a URL) |
|||
} |
|||
\examples{ |
|||
doc <- read_docx(system.file("examples/data.docx", package="docxtractr")) |
|||
class(doc) |
|||
} |
|||
|
@ -0,0 +1,4 @@ |
|||
library(testthat) |
|||
library(docxtractr) |
|||
|
|||
test_check("docxtractr") |
@ -0,0 +1,6 @@ |
|||
context("basic functionality")

# Exercises the bundled example documents; the expected counts and shapes are
# the ones shown in the package README.
test_that("example documents are read, counted and extracted", {

  # one table
  doc <- read_docx(system.file("examples/data.docx", package="docxtractr"))
  expect_is(doc, "docx")
  expect_equal(docx_tbl_count(doc), 1)

  tbl <- docx_extract_tbl(doc, 1)
  expect_is(tbl, "data.frame")
  expect_equal(dim(tbl), c(3, 4))
  expect_equal(colnames(tbl), c("This", "Is", "A", "Column"))

  # the first row is marked as a header in Word, so header=FALSE notes that
  expect_message(raw <- docx_extract_tbl(doc, header=FALSE), "header=FALSE")
  expect_equal(dim(raw), c(4, 4))

  # several tables
  doc3 <- read_docx(system.file("examples/data3.docx", package="docxtractr"))
  expect_equal(docx_tbl_count(doc3), 3)

  complx <- read_docx(system.file("examples/complex.docx", package="docxtractr"))
  expect_equal(docx_tbl_count(complx), 5)

  # no tables: counting works, describing and extracting are errors
  none <- read_docx(system.file("examples/none.docx", package="docxtractr"))
  expect_equal(docx_tbl_count(none), 0)
  expect_error(docx_describe_tbls(none), "No tables in document")
  expect_error(docx_extract_tbl(none, 2), "'tbl_number' is invalid.")

  # only docx objects are accepted
  expect_error(docx_tbl_count(list()), "Must pass in a 'docx' object")

})
Loading…
Reference in new issue