Browse Source

cleanup + fix #7

pull/11/head
boB Rudis 7 years ago
parent
commit
7829003217
No known key found for this signature in database GPG Key ID: 2A514A4997464560
  1. 14
      DESCRIPTION
  2. 8
      NAMESPACE
  3. 9
      NEWS.md
  4. 2
      R/aaa.r
  5. 7
      R/assign_colnames.r
  6. 11
      R/describe.r
  7. 17
      R/docx_find_tbls.r
  8. 8
      R/docxtractr-package.r
  9. 10
      R/extract_all.r
  10. 26
      R/mcga.r
  11. 10
      R/read_docs.r
  12. 8
      R/utils.r
  13. 6
      README.Rmd
  14. 194
      README.md
  15. 14
      docxtractr.Rproj
  16. 3
      man/assign_colnames.Rd
  17. 1
      man/docx_cmnt_count.Rd
  18. 1
      man/docx_describe_cmnts.Rd
  19. 1
      man/docx_describe_tbls.Rd
  20. 1
      man/docx_extract_all.Rd
  21. 1
      man/docx_extract_all_cmnts.Rd
  22. 1
      man/docx_extract_all_tbls.Rd
  23. 1
      man/docx_extract_tbl.Rd
  24. 1
      man/docx_tbl_count.Rd
  25. 1
      man/docxtractr.Rd
  26. 23
      man/mcga.Rd
  27. 1
      man/print.docx.Rd
  28. 1
      man/read_docx.Rd
  29. 10
      tests/testthat/test-docxtractr.R

14
DESCRIPTION

@ -1,16 +1,16 @@
Package: docxtractr
Title: Extract Data Tables and Comments from Microsoft Word Documents
Version: 0.2.1
Title: Extract Data Tables and Comments from 'Microsoft' 'Word' Documents
Version: 0.3.0
Authors@R: c(person("Bob", "Rudis", email = "bob@rudis.net", role = c("aut", "cre")))
Maintainer: Bob Rudis <bob@rudis.net>
Description: Microsoft Word docx files provide an XML structure that is fairly
straightforward to navigate, especially when it applies to Word tables and
Description: 'Microsoft Word' 'docx' files provide an 'XML' structure that is fairly
straightforward to navigate, especially when it applies to 'Word' tables and
comments. Tools are provided to determine table count/structure, comment count
and also to extract/clean tables and comments from Microsoft Word docx documents.
and also to extract/clean tables and comments from 'Microsoft Word' 'docx' documents.
URL: http://github.com/hrbrmstr/docxtractr
BugReports: https://github.com/hrbrmstr/docxtractr/issues
Depends:
R (>= 3.0.0)
R (>= 3.1.2)
License: MIT + file LICENSE
LazyData: true
Suggests:
@ -22,4 +22,4 @@ Imports:
purrr,
dplyr,
utils
RoxygenNote: 5.0.1
RoxygenNote: 6.0.1

8
NAMESPACE

@ -10,17 +10,23 @@ export(docx_extract_all_cmnts)
export(docx_extract_all_tbls)
export(docx_extract_tbl)
export(docx_tbl_count)
export(mcga)
export(read_docx)
importFrom(dplyr,arrange)
importFrom(dplyr,bind_cols)
importFrom(dplyr,bind_rows)
importFrom(dplyr,count)
importFrom(dplyr,select)
importFrom(purrr,map)
importFrom(purrr,map_chr)
importFrom(purrr,map_df)
importFrom(purrr,map_int)
importFrom(purrr,map_lgl)
importFrom(tibble,as_data_frame)
importFrom(tibble,as_tibble)
importFrom(tibble,data_frame)
importFrom(tools,file_ext)
importFrom(utils,download.file)
importFrom(utils,globalVariables)
importFrom(utils,unzip)
importFrom(xml2,read_xml)
importFrom(xml2,xml_attrs)

9
NEWS.md

@ -1,3 +1,12 @@
# 0.3.0 WIP
- return tibbles where possible & not stomping on input type (#7)
- change tests to test for `tbl` vs `data.frame` (related to #7)
- don't stomp on data frame-ish input type in `assign_colnames()`
- prefix `::` (non-user facing tweak)
- switch all `*apply()` to `purrr` calls since we bother to import `purrr` (non-user facing tweak)
-
# 0.2.0 released
- update for new xml2 pkg compatibility

2
R/aaa.r

@ -1 +1 @@
n <- author <- meta <- NULL
utils::globalVariables(c("n", "author", "meta"))

7
R/assign_colnames.r

@ -25,7 +25,7 @@
#' docx_tbl_count(real_world)
#'
#' # get all the tables
#' tbls <- docx_extract_all(real_world)
#' tbls <- docx_extract_all_tbls(real_world)
#'
#' # make table 1 better
#' assign_colnames(tbls[[1]], 2)
@ -36,6 +36,8 @@ assign_colnames <- function(dat, row, remove=TRUE, remove_previous=remove) {
if ((row > nrow(dat)) | (row < 1)) return(dat)
d_class <- class(dat)
# just in case someone shoots us a data.table or other stranger things
dat <- data.frame(dat, stringsAsFactors=FALSE)
@ -47,6 +49,9 @@ assign_colnames <- function(dat, row, remove=TRUE, remove_previous=remove) {
dat <- dat[-(start:end),]
rownames(dat) <- NULL
# give them back what they passed in
class(dat) <- d_class
dat
}

11
R/describe.r

@ -26,10 +26,10 @@ docx_describe_tbls <- function(docx) {
tbl <- tbls[[i]]
cells <- xml_find_all(tbl, "./w:tr/w:tc", ns=ns)
rows <- xml_find_all(tbl, "./w:tr", ns=ns)
cells <- xml2::xml_find_all(tbl, "./w:tr/w:tc", ns=ns)
rows <- xml2::xml_find_all(tbl, "./w:tr", ns=ns)
cell_count_by_row <- sapply(rows, function(row) { length(xml_find_all(row, "./w:tc", ns)) })
cell_count_by_row <- purrr::map_int(rows, ~{ length(xml2::xml_find_all(.x, "./w:tc", ns)) })
row_counts <- paste0(unique(cell_count_by_row), collapse=", ")
max_cell_count <- max(cell_count_by_row)
@ -84,16 +84,17 @@ docx_describe_cmnts <- function(docx) {
cat(sprintf("Found %d comments.\n", length(cmnts)))
map_df(xml_attrs(cmnts), function(x) {
purrr::map_df(xml_attrs(cmnts), function(x) {
as_data_frame(t(cbind.data.frame(x, stringsAsFactors=FALSE)))
}) -> meta
cmnt_df <- dplyr::bind_cols(meta,
cbind.data.frame(comment_text=xml_text(cmnts),
cbind.data.frame(comment_text=xml2::xml_text(cmnts),
stringsAsFactors=FALSE))
aut_df <- dplyr::count(cmnt_df, author)
aut_df <- dplyr::arrange(aut_df, -n)
print(select(aut_df, author, `# Comments`=n))
}

17
R/docx_find_tbls.r

@ -25,16 +25,15 @@ docx_extract_tbl <- function(docx, tbl_number=1, header=TRUE, trim=TRUE) {
ns <- docx$ns
tbl <- docx$tbls[[tbl_number]]
cells <- xml_find_all(tbl, "./w:tr/w:tc", ns=ns)
rows <- xml_find_all(tbl, "./w:tr", ns=ns)
cells <- xml2::xml_find_all(tbl, "./w:tr/w:tc", ns=ns)
rows <- xml2::xml_find_all(tbl, "./w:tr", ns=ns)
bind_rows(lapply(rows, function(row) {
vals <- xml_text(xml_find_all(row, "./w:tc", ns=ns), trim=trim)
purrr::map_df(rows, ~{
vals <- xml2::xml_text(xml2::xml_find_all(.x, "./w:tc", ns=ns), trim=trim)
names(vals) <- sprintf("V%d", 1:length(vals))
data.frame(as.list(vals), stringsAsFactors=FALSE)
})) -> dat
as.list(vals)
# data.frame(as.list(vals), stringsAsFactors=FALSE)
}) -> dat
if (header) {
colnames(dat) <- dat[1,]
@ -48,7 +47,7 @@ docx_extract_tbl <- function(docx, tbl_number=1, header=TRUE, trim=TRUE) {
rownames(dat) <- NULL
dat
tibble::as_tibble(dat)
}

8
R/docxtractr-package.r

@ -11,9 +11,9 @@
#'
#' @author Bob Rudis (@@hrbrmstr)
#' @importFrom xml2 xml_find_all xml_text xml_ns xml_find_first xml_attrs
#' @importFrom tibble data_frame as_data_frame
#' @importFrom dplyr bind_rows bind_cols count arrange select
#' @importFrom tibble data_frame as_data_frame as_tibble
#' @importFrom dplyr bind_cols count arrange select
#' @importFrom tools file_ext
#' @importFrom utils download.file unzip
#' @importFrom purrr map_df
#' @importFrom utils download.file unzip globalVariables
#' @importFrom purrr map_df map map_int map_chr map_lgl
NULL

10
R/extract_all.r

@ -27,7 +27,7 @@ docx_extract_all_tbls <- function(docx, guess_header=TRUE, trim=TRUE) {
hdr <- FALSE
if (guess_header) {
tbl <- docx$tbls[[i]]
rows <- xml_find_all(tbl, "./w:tr", ns=ns)
rows <- xml2::xml_find_all(tbl, "./w:tr", ns=ns)
hdr <- !is.na(has_header(tbl, rows, ns))
}
docx_extract_tbl(docx, i, hdr, trim)
@ -77,12 +77,14 @@ docx_extract_all_cmnts <- function(docx) {
comments <- docx$cmnts
map_df(xml_attrs(comments), function(x) {
purrr::map_df(xml2::xml_attrs(comments), function(x) {
as_data_frame(t(cbind.data.frame(x, stringsAsFactors=FALSE)))
}) -> meta
bind_cols(meta,
cbind.data.frame(comment_text=xml_text(comments),
stringsAsFactors=FALSE))
cbind.data.frame(comment_text=xml2::xml_text(comments),
stringsAsFactors=FALSE)) -> out
as_tibble(out)
}

26
R/mcga.r

@ -0,0 +1,26 @@
#' Make Column Names Great Again
#'
#' Cleans up the column names of `tbl`: names are converted to lower case,
#' every run of punctuation and/or whitespace characters is replaced by a
#' single underscore, and any leading or trailing underscore is removed.
#'
#' @md
#' @param tbl a `data.frame`-like object
#' @return `tbl`, with its column names replaced by the cleaned-up versions
#'         described above.
#' @export
#' @examples
#' real_world <- read_docx(system.file("examples/realworld.docx", package="docxtractr"))
#' tbls <- docx_extract_all_tbls(real_world)
#' mcga(assign_colnames(tbls[[1]], 2))
mcga <- function(tbl) {

  # ordered (pattern, replacement) pairs; each one is applied to the result
  # of the previous substitution
  rules <- list(
    c("[[:punct:][:space:]]+", "_"), # punctuation/whitespace runs -> "_"
    c("_+", "_"),                    # squeeze repeated underscores
    c("(^_|_$)", "")                 # drop a leading/trailing underscore
  )

  cleaned <- tolower(colnames(tbl))

  for (rule in rules) {
    cleaned <- gsub(rule[1], rule[2], cleaned)
  }

  colnames(tbl) <- cleaned

  tbl

}

10
R/read_docs.r

@ -36,20 +36,20 @@ read_docx <- function(path) {
unzip(tmpf, exdir=sprintf("%s/docdata", tmpd))
# read the actual XML document
doc <- read_xml(sprintf("%s/docdata/word/document.xml", tmpd))
doc <- xml2::read_xml(sprintf("%s/docdata/word/document.xml", tmpd))
# extract the namespace
ns <- xml_ns(doc)
ns <- xml2::xml_ns(doc)
# get the tables
tbls <- xml_find_all(doc, ".//w:tbl", ns=ns)
tbls <- xml2::xml_find_all(doc, ".//w:tbl", ns=ns)
if (file.exists(sprintf("%s/docdata/word/comments.xml", tmpd))) {
docmnt <- read_xml(sprintf("%s/docdata/word/comments.xml", tmpd))
# get the comments
cmnts <- xml_find_all(docmnt, ".//w:comment", ns=ns)
cmnts <- xml2::xml_find_all(docmnt, ".//w:comment", ns=ns)
} else {
cmnts <- xml_find_all(doc, ".//w:comment", ns=ns)
cmnts <- xml2::xml_find_all(doc, ".//w:comment", ns=ns)
}
# make an object for other functions to work with

8
R/utils.r

@ -1,7 +1,7 @@
# used by functions to make sure they are working with a well-formed docx object
ensure_docx <- function(docx) {
if (!inherits(docx, "docx")) stop("Must pass in a 'docx' object", call.=FALSE)
if (!(all(sapply(c("docx", "ns", "tbls", "path"), exists, where=docx))))
if (!(all(purrr::map_lgl(c("docx", "ns", "tbls", "path"), exists, where=docx))))
stop("'docx' object missing necessary components", call.=FALSE)
}
@ -11,16 +11,16 @@ has_header <- function(tbl, rows, ns) {
# microsoft has a tag for some table structure info. examine it to
# see if the creator of the header made the first row special which
# will likely mean it's a header candidate
look <- try(xml_find_first(tbl, "./w:tblPr/w:tblLook", ns), silent=TRUE)
look <- try(xml2::xml_find_first(tbl, "./w:tblPr/w:tblLook", ns), silent=TRUE)
if (inherits(look, "try-error")) {
return(NA)
} else {
look_attr <- xml_attrs(look)
look_attr <- xml2::xml_attrs(look)
if ("firstRow" %in% names(look_attr)) {
if (look_attr["firstRow"] == "0") {
return(NA)
} else {
return(paste0(xml_text(xml_find_all(rows[[1]], "./w:tc", ns)), collapse=", "))
return(paste0(xml2::xml_text(xml_find_all(rows[[1]], "./w:tc", ns)), collapse=", "))
}
} else {
return(NA)

6
README.Rmd

@ -34,6 +34,7 @@ The following functions are implemented:
- `docx_tbl_count`: Get number of tables in a Word document
- `docx_cmnt_count`: Get number of comments in a Word document
- `assign_colnames`: Make a specific row the column names for the specified data.frame
- `mcga` : Make column names great again
The following data files are included:
@ -134,9 +135,12 @@ tbls <- docx_extract_all(real_world)
# see table 1
tbls[[1]]
#' # make table 1 better
# make table 1 better
assign_colnames(tbls[[1]], 2)
# make table 1's column names great again
mcga(assign_colnames(tbls[[1]], 2))
# see table 5
tbls[[5]]

194
README.md

@ -21,6 +21,7 @@ The following functions are implemented:
- `docx_tbl_count`: Get number of tables in a Word document
- `docx_cmnt_count`: Get number of comments in a Word document
- `assign_colnames`: Make a specific row the column names for the specified data.frame
- `mcga` : Make column names great again
The following data files are included:
@ -56,7 +57,7 @@ library(dplyr)
# current version
packageVersion("docxtractr")
#> [1] '0.2.0'
#> [1] '0.3.0'
# one table
doc <- read_docx(system.file("examples/data.docx", package="docxtractr"))
@ -65,7 +66,7 @@ docx_tbl_count(doc)
#> [1] 1
docx_describe_tbls(doc)
#> Word document [/Library/Frameworks/R.framework/Versions/3.3/Resources/library/docxtractr/examples/data.docx]
#> Word document [/Library/Frameworks/R.framework/Versions/3.4/Resources/library/docxtractr/examples/data.docx]
#>
#> Table 1
#> total cells: 16
@ -74,24 +75,30 @@ docx_describe_tbls(doc)
#> has header : likely! => possibly [This, Is, A, Column]
docx_extract_tbl(doc, 1)
#> This Is A Column
#> 1 1 Cat 3.4 Dog
#> 2 3 Fish 100.3 Bird
#> 3 5 Pelican -99 Kangaroo
#> # A tibble: 3 x 4
#> This Is A Column
#> <chr> <chr> <chr> <chr>
#> 1 1 Cat 3.4 Dog
#> 2 3 Fish 100.3 Bird
#> 3 5 Pelican -99 Kangaroo
docx_extract_tbl(doc)
#> This Is A Column
#> 1 1 Cat 3.4 Dog
#> 2 3 Fish 100.3 Bird
#> 3 5 Pelican -99 Kangaroo
#> # A tibble: 3 x 4
#> This Is A Column
#> <chr> <chr> <chr> <chr>
#> 1 1 Cat 3.4 Dog
#> 2 3 Fish 100.3 Bird
#> 3 5 Pelican -99 Kangaroo
docx_extract_tbl(doc, header=FALSE)
#> NOTE: header=FALSE but table has a marked header row in the Word document
#> V1 V2 V3 V4
#> 1 This Is A Column
#> 2 1 Cat 3.4 Dog
#> 3 3 Fish 100.3 Bird
#> 4 5 Pelican -99 Kangaroo
#> # A tibble: 4 x 4
#> V1 V2 V3 V4
#> <chr> <chr> <chr> <chr>
#> 1 This Is A Column
#> 2 1 Cat 3.4 Dog
#> 3 3 Fish 100.3 Bird
#> 4 5 Pelican -99 Kangaroo
# url
@ -116,22 +123,23 @@ docx_describe_tbls(budget)
#> has header : unlikely
docx_extract_tbl(budget, 1)
#> Short-term Portfolio Long-term Portfolio Total Portfolio Values
#> 1 Portfolio Balance (Market Value) * $ 123,651,911 $ 294,704,136 $ 418,356,047
#> 2 Effective Yield 0.16 % 1.42 % 1.05 %
#> 3 Avg. Weighted Maturity 11 Days 2.4 Years 1.7 Years
#> 4 Net Earnings $ 18,470 $ 350,554 $ 369,024
#> 5 Benchmark** 0.02 % 0.41 % 0.27 %
#> # A tibble: 5 x 4
#> `` `Short-term Portfolio` `Long-term Portfolio` `Total Portfolio Values`
#> <chr> <chr> <chr> <chr>
#> 1 Portfolio Balance (Market Value) * $ 123,651,911 $ 294,704,136 $ 418,356,047
#> 2 Effective Yield 0.16 % 1.42 % 1.05 %
#> 3 Avg. Weighted Maturity 11 Days 2.4 Years 1.7 Years
#> 4 Net Earnings $ 18,470 $ 350,554 $ 369,024
#> 5 Benchmark** 0.02 % 0.41 % 0.27 %
docx_extract_tbl(budget, 2)
#> Amount of Funds (Market Value) Maturity Effective Yield Interpolated Yield
#> 1 Short-Term Portfolio $ 123,651,911 11 days 0.16 % 0.01 %
#> 2 Long-Term Portfolio $ 294,704,136 2.4 years 1.42 % 0.41 %
#> 3 Total Portfolio $ 418,356,047 1.7 years 1.05 % 0.27 %
#> Total Return Monthly Total Return Annual
#> 1 0.013 0.160
#> 2 0.437 0.250
#> 3 0.298 0.222
#> # A tibble: 3 x 7
#> `` `Amount of Funds (Market Value)` Maturity `Effective Yield` `Interpolated Yield`
#> <chr> <chr> <chr> <chr> <chr>
#> 1 Short-Term Portfolio $ 123,651,911 11 days 0.16 % 0.01 %
#> 2 Long-Term Portfolio $ 294,704,136 2.4 years 1.42 % 0.41 %
#> 3 Total Portfolio $ 418,356,047 1.7 years 1.05 % 0.27 %
#> # ... with 2 more variables: `Total Return Monthly` <chr>, `Total Return Annual` <chr>
# three tables
doc3 <- read_docx(system.file("examples/data3.docx", package="docxtractr"))
@ -140,7 +148,7 @@ docx_tbl_count(doc3)
#> [1] 3
docx_describe_tbls(doc3)
#> Word document [/Library/Frameworks/R.framework/Versions/3.3/Resources/library/docxtractr/examples/data3.docx]
#> Word document [/Library/Frameworks/R.framework/Versions/3.4/Resources/library/docxtractr/examples/data3.docx]
#>
#> Table 1
#> total cells: 16
@ -161,13 +169,15 @@ docx_describe_tbls(doc3)
#> has header : likely! => possibly [Foo, Bar]
docx_extract_tbl(doc3, 3)
#> Foo Bar
#> 1 Aa Bb
#> 2 Dd Ee
#> 3 Gg Hh
#> 4 1 2
#> 5 Zz Jj
#> 6 Tt ii
#> # A tibble: 6 x 2
#> Foo Bar
#> <chr> <chr>
#> 1 Aa Bb
#> 2 Dd Ee
#> 3 Gg Hh
#> 4 1 2
#> 5 Zz Jj
#> 6 Tt ii
# no tables
none <- read_docx(system.file("examples/none.docx", package="docxtractr"))
@ -188,7 +198,7 @@ docx_tbl_count(complx)
#> [1] 5
docx_describe_tbls(complx)
#> Word document [/Library/Frameworks/R.framework/Versions/3.3/Resources/library/docxtractr/examples/complex.docx]
#> Word document [/Library/Frameworks/R.framework/Versions/3.4/Resources/library/docxtractr/examples/complex.docx]
#>
#> Table 1
#> total cells: 16
@ -221,28 +231,34 @@ docx_describe_tbls(complx)
#> has header : unlikely
docx_extract_tbl(complx, 3, header=TRUE)
#> Foo Bar
#> 1 Aa Bb
#> 2 Dd Ee
#> 3 Gg Hh
#> 4 1 2
#> 5 Zz Jj
#> 6 Tt ii
#> # A tibble: 6 x 2
#> Foo Bar
#> <chr> <chr>
#> 1 Aa Bb
#> 2 Dd Ee
#> 3 Gg Hh
#> 4 1 2
#> 5 Zz Jj
#> 6 Tt ii
docx_extract_tbl(complx, 4, header=TRUE)
#> Foo Bar Baz
#> 1 Aa BbCc <NA>
#> 2 Dd Ee Ff
#> 3 Gg Hh ii
#> # A tibble: 3 x 3
#> Foo Bar Baz
#> <chr> <chr> <chr>
#> 1 Aa BbCc <NA>
#> 2 Dd Ee Ff
#> 3 Gg Hh ii
docx_extract_tbl(complx, 5, header=TRUE)
#> Foo Bar Baz
#> 1 Aa Bb Cc
#> 2 Dd Ee Ff
#> 3 Gg Hh Ii
#> 4 Jj88 Kk Ll
#> 5 Uu Ii
#> 6 Hh Ii h
#> # A tibble: 6 x 3
#> Foo Bar Baz
#> <chr> <chr> <chr>
#> 1 Aa Bb Cc
#> 2 Dd Ee Ff
#> 3 Gg Hh Ii
#> 4 Jj88 Kk Ll
#> 5 Uu Ii
#> 6 Hh Ii h
# a "real" Word doc
real_world <- read_docx(system.file("examples/realworld.docx", package="docxtractr"))
@ -256,7 +272,9 @@ tbls <- docx_extract_all(real_world)
# see table 1
tbls[[1]]
#> # A tibble: 9 x 9
#> V1 V2 V3 V4 V5
#> <chr> <chr> <chr> <chr> <chr>
#> 1 Lesson 1: Step 1 <NA> <NA> <NA> <NA>
#> 2 Country Birthrate Death Rate Population Growth 2005 Population Growth 2050
#> 3 USA 2.06 0.51% 0.92% -0.06%
@ -266,20 +284,27 @@ tbls[[1]]
#> 7 Italy 1.28 0.72% 0.35% -1.33%
#> 8 Mexico 2.43 0.25% 1.41% 0.96%
#> 9 Nigeria 4.78 0.26% 2.46% 3.58%
#> V6 V7 V8 V9
#> 1 <NA> <NA> <NA> <NA>
#> 2 Relative place in Transition Social Factors 1 Social Factors 2 Social Factors 3
#> 3 Post- Industrial Female Independence Stable Birth Rate Good technology
#> 4 Post- Industrial Government intervention Technology Urbanization
#> 5 Mature Industrial Not yet industrialized More children needed Slightly higher life expectancy
#> 6 Post Industrial Economic growth Poverty Becoming more industrialized
#> 7 Late Post industrial Stable birth rate People marry later Better health care
#> 8 Mature Industrial Better health care Emigration Economic growth
#> 9 End of Mechanization of Agriculture Disease People marry early People have many children
#' # make table 1 better
#> # ... with 4 more variables: V6 <chr>, V7 <chr>, V8 <chr>, V9 <chr>
# make table 1 better
assign_colnames(tbls[[1]], 2)
#> Country Birthrate Death Rate Population Growth 2005 Population Growth 2050 Relative place in Transition
#> # A tibble: 7 x 9
#> Country Birthrate `Death Rate` `Population Growth 2005` `Population Growth 2050` `Relative place in Transition`
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 USA 2.06 0.51% 0.92% -0.06% Post- Industrial
#> 2 China 1.62 0.3% 0.6% -0.58% Post- Industrial
#> 3 Egypt 2.83 0.41% 2.0% 1.32% Mature Industrial
#> 4 India 2.35 0.34% 1.56% 0.76% Post Industrial
#> 5 Italy 1.28 0.72% 0.35% -1.33% Late Post industrial
#> 6 Mexico 2.43 0.25% 1.41% 0.96% Mature Industrial
#> 7 Nigeria 4.78 0.26% 2.46% 3.58% End of Mechanization of Agriculture
#> # ... with 3 more variables: `Social Factors 1` <chr>, `Social Factors 2` <chr>, `Social Factors 3` <chr>
# make table 1's column names great again
mcga(assign_colnames(tbls[[1]], 2))
#> # A tibble: 7 x 9
#> country birthrate death_rate population_growth_2005 population_growth_2050 relative_place_in_transition
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 USA 2.06 0.51% 0.92% -0.06% Post- Industrial
#> 2 China 1.62 0.3% 0.6% -0.58% Post- Industrial
#> 3 Egypt 2.83 0.41% 2.0% 1.32% Mature Industrial
@ -287,18 +312,13 @@ assign_colnames(tbls[[1]], 2)
#> 5 Italy 1.28 0.72% 0.35% -1.33% Late Post industrial
#> 6 Mexico 2.43 0.25% 1.41% 0.96% Mature Industrial
#> 7 Nigeria 4.78 0.26% 2.46% 3.58% End of Mechanization of Agriculture
#> Social Factors 1 Social Factors 2 Social Factors 3
#> 1 Female Independence Stable Birth Rate Good technology
#> 2 Government intervention Technology Urbanization
#> 3 Not yet industrialized More children needed Slightly higher life expectancy
#> 4 Economic growth Poverty Becoming more industrialized
#> 5 Stable birth rate People marry later Better health care
#> 6 Better health care Emigration Economic growth
#> 7 Disease People marry early People have many children
#> # ... with 3 more variables: social_factors_1 <chr>, social_factors_2 <chr>, social_factors_3 <chr>
# see table 5
tbls[[5]]
#> # A tibble: 5 x 6
#> V1 V2 V3 V4 V5 V6
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 Lesson 2: Step 1 <NA> <NA> <NA> <NA> <NA>
#> 2 Nigeria Default Prediction + 5 years +15 years -5 years
#> 3 Birth rate 4.78 Goes Down 4.76 4.72 4.79
@ -307,23 +327,25 @@ tbls[[5]]
# make table 5 better
assign_colnames(tbls[[5]], 2)
#> Nigeria Default Prediction + 5 years +15 years -5 years
#> 1 Birth rate 4.78 Goes Down 4.76 4.72 4.79
#> 2 Death rate 0.36% Stay the Same 0.42% 0.52% 0.3%
#> 3 Population growth 3.58% Goes Down 3.02% 2.32% 4.38%
#> # A tibble: 3 x 6
#> Nigeria Default Prediction `+ 5 years` `+15 years` `-5 years`
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 Birth rate 4.78 Goes Down 4.76 4.72 4.79
#> 2 Death rate 0.36% Stay the Same 0.42% 0.52% 0.3%
#> 3 Population growth 3.58% Goes Down 3.02% 2.32% 4.38%
# comments
cmnts <- read_docx(system.file("examples/comments.docx", package="docxtractr"))
print(cmnts)
#> No tables in document
#> Word document [/Library/Frameworks/R.framework/Versions/3.3/Resources/library/docxtractr/examples/comments.docx]
#> Word document [/Library/Frameworks/R.framework/Versions/3.4/Resources/library/docxtractr/examples/comments.docx]
#>
#> Found 3 comments.
#> # A tibble: 1 x 2
#> author # Comments
#> <chr> <int>
#> 1 boB Rudis 3
#> author `# Comments`
#> <chr> <int>
#> 1 boB Rudis 3
glimpse(docx_extract_all_cmnts(cmnts))
#> Observations: 3
@ -347,7 +369,7 @@ library(testthat)
#> matches
date()
#> [1] "Tue Jul 19 22:56:37 2016"
#> [1] "Mon Jun 19 05:52:59 2017"
test_dir("tests/")
#> testthat results ========================================================================================================

14
docxtractr.Rproj

@ -5,21 +5,19 @@ SaveWorkspace: No
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX
AutoAppendNewline: Yes
StripTrailingWhitespace: Yes
BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace
UseSpacesForTab: Yes
NumSpacesForTab: 2
RnwWeave: Sweave
LaTeX: pdfLaTeX
PackageBuildArgs: --resave-data
PackageCheckArgs: --as-cran
PackageRoxygenize: rd,collate,namespace

3
man/assign_colnames.Rd

@ -37,7 +37,7 @@ real_world <- read_docx(system.file("examples/realworld.docx", package="docxtrac
docx_tbl_count(real_world)
# get all the tables
tbls <- docx_extract_all(real_world)
tbls <- docx_extract_all_tbls(real_world)
# make table 1 better
assign_colnames(tbls[[1]], 2)
@ -48,4 +48,3 @@ assign_colnames(tbls[[5]], 2)
\seealso{
\code{\link{docx_extract_all}}, \code{\link{docx_extract_tbl}}
}

1
man/docx_cmnt_count.Rd

@ -19,4 +19,3 @@ Get number of comments in a Word document
cmnts <- read_docx(system.file("examples/comments.docx", package="docxtractr"))
docx_cmnt_count(cmnts)
}

1
man/docx_describe_cmnts.Rd

@ -17,4 +17,3 @@ cmnts <- read_docx(system.file("examples/comments.docx", package="docxtractr"))
docx_cmnt_count(cmnts)
docx_describe_cmnts(cmnts)
}

1
man/docx_describe_tbls.Rd

@ -18,4 +18,3 @@ complx <- read_docx(system.file("examples/complex.docx", package="docxtractr"))
docx_tbl_count(complx)
docx_describe_tbls(complx)
}

1
man/docx_extract_all.Rd

@ -33,4 +33,3 @@ tbls <- docx_extract_all_tbls(real_world)
\seealso{
\code{\link{assign_colnames}}, \code{\link{docx_extract_tbl}}
}

1
man/docx_extract_all_cmnts.Rd

@ -21,4 +21,3 @@ docx_cmnt_count(cmnts)
docx_describe_cmnts(cmnts)
docx_extract_all_cmnts(cmnts)
}

1
man/docx_extract_all_tbls.Rd

@ -33,4 +33,3 @@ tbls <- docx_extract_all_tbls(real_world)
\seealso{
\code{\link{assign_colnames}}, \code{\link{docx_extract_tbl}}
}

1
man/docx_extract_tbl.Rd

@ -31,4 +31,3 @@ docx_extract_tbl(doc3, 3)
\code{\link{docx_extract_all}}, \code{\link{docx_extract_tbl}},
\code{\link{assign_colnames}}
}

1
man/docx_tbl_count.Rd

@ -19,4 +19,3 @@ Get number of tables in a Word document
complx <- read_docx(system.file("examples/complex.docx", package="docxtractr"))
docx_tbl_count(complx)
}

1
man/docxtractr.Rd

@ -15,4 +15,3 @@ comment count and extract comments from Word docx documents.
\author{
Bob Rudis (@hrbrmstr)
}

23
man/mcga.Rd

@ -0,0 +1,23 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/mcga.r
\name{mcga}
\alias{mcga}
\title{Make Column Names Great Again}
\usage{
mcga(tbl)
}
\arguments{
\item{tbl}{a \code{data.frame}-like object}
}
\value{
whatever class \code{tbl} was but with truly great, really great column names. They're amazing.
Trust me. They'll be incredible column names once we're done.
}
\description{
Remove punctuation and spaces and turn them to underscores plus convert to lower case.
}
\examples{
real_world <- read_docx(system.file("examples/realworld.docx", package="docxtractr"))
tbls <- docx_extract_all_tbls(real_world)
mcga(assign_colnames(tbls[[1]], 2))
}

1
man/print.docx.Rd

@ -14,4 +14,3 @@
\description{
Display information about the document
}

1
man/read_docx.Rd

@ -21,4 +21,3 @@ budget <- read_docx(
"http://rud.is/dl/1.DOCX")
}
}

10
tests/testthat/test-docxtractr.R

@ -1,11 +1,11 @@
context("basic functionality")
context("docx extraction works")
test_that("we can do something", {
doc <- read_docx(system.file("examples/data.docx", package="docxtractr"))
expect_that(doc, is_a("docx"))
expect_that(docx_tbl_count(doc), equals(1))
expect_that(docx_extract_tbl(doc, 1), is_a("data.frame"))
expect_that(docx_extract_tbl(doc, 1), is_a("tbl"))
complx <- read_docx(system.file("examples/complex.docx", package="docxtractr"))
expect_that(docx_tbl_count(complx), equals(5))
@ -14,9 +14,9 @@ test_that("we can do something", {
tmp_4 <- docx_extract_tbl(complx, 4)
tmp_5 <- docx_extract_tbl(complx, 5)
expect_that(tmp_3, is_a("data.frame"))
expect_that(tmp_4, is_a("data.frame"))
expect_that(tmp_5, is_a("data.frame"))
expect_that(tmp_3, is_a("tbl"))
expect_that(tmp_4, is_a("tbl"))
expect_that(tmp_5, is_a("tbl"))
expect_that(nrow(tmp_3), equals(6))
expect_that(ncol(tmp_4), equals(3))

Loading…
Cancel
Save