From 7829003217382327d11e303292823d0bc18e6529 Mon Sep 17 00:00:00 2001 From: boB Rudis Date: Mon, 19 Jun 2017 05:53:49 -0400 Subject: [PATCH] cleanup + fix #7 --- DESCRIPTION | 14 +-- NAMESPACE | 8 +- NEWS.md | 9 ++ R/aaa.r | 2 +- R/assign_colnames.r | 7 +- R/describe.r | 11 ++- R/docx_find_tbls.r | 17 ++-- R/docxtractr-package.r | 8 +- R/extract_all.r | 10 +- R/mcga.r | 26 ++++++ R/read_docs.r | 10 +- R/utils.r | 8 +- README.Rmd | 6 +- README.md | 194 ++++++++++++++++++++++----------------- docxtractr.Rproj | 14 ++- man/assign_colnames.Rd | 3 +- man/docx_cmnt_count.Rd | 1 - man/docx_describe_cmnts.Rd | 1 - man/docx_describe_tbls.Rd | 1 - man/docx_extract_all.Rd | 1 - man/docx_extract_all_cmnts.Rd | 1 - man/docx_extract_all_tbls.Rd | 1 - man/docx_extract_tbl.Rd | 1 - man/docx_tbl_count.Rd | 1 - man/docxtractr.Rd | 1 - man/mcga.Rd | 23 +++++ man/print.docx.Rd | 1 - man/read_docx.Rd | 1 - tests/testthat/test-docxtractr.R | 10 +- 29 files changed, 237 insertions(+), 154 deletions(-) create mode 100644 R/mcga.r create mode 100644 man/mcga.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 7aa074d..336f769 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,16 +1,16 @@ Package: docxtractr -Title: Extract Data Tables and Comments from Microsoft Word Documents -Version: 0.2.1 +Title: Extract Data Tables and Comments from 'Microsoft' 'Word' Documents +Version: 0.3.0 Authors@R: c(person("Bob", "Rudis", email = "bob@rudis.net", role = c("aut", "cre"))) Maintainer: Bob Rudis -Description: Microsoft Word docx files provide an XML structure that is fairly - straightforward to navigate, especially when it applies to Word tables and +Description: 'Microsoft Word' 'docx' files provide an 'XML' structure that is fairly + straightforward to navigate, especially when it applies to 'Word' tables and comments. Tools are provided to determine table count/structure, comment count - and also to extract/clean tables and comments from Microsoft Word docx documents. 
+ and also to extract/clean tables and comments from 'Microsoft Word' 'docx' documents. URL: http://github.com/hrbrmstr/docxtractr BugReports: https://github.com/hrbrmstr/docxtractr/issues Depends: - R (>= 3.0.0) + R (>= 3.1.2) License: MIT + file LICENSE LazyData: true Suggests: @@ -22,4 +22,4 @@ Imports: purrr, dplyr, utils -RoxygenNote: 5.0.1 +RoxygenNote: 6.0.1 diff --git a/NAMESPACE b/NAMESPACE index 39de290..82bd419 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,17 +10,23 @@ export(docx_extract_all_cmnts) export(docx_extract_all_tbls) export(docx_extract_tbl) export(docx_tbl_count) +export(mcga) export(read_docx) importFrom(dplyr,arrange) importFrom(dplyr,bind_cols) -importFrom(dplyr,bind_rows) importFrom(dplyr,count) importFrom(dplyr,select) +importFrom(purrr,map) +importFrom(purrr,map_chr) importFrom(purrr,map_df) +importFrom(purrr,map_int) +importFrom(purrr,map_lgl) importFrom(tibble,as_data_frame) +importFrom(tibble,as_tibble) importFrom(tibble,data_frame) importFrom(tools,file_ext) importFrom(utils,download.file) +importFrom(utils,globalVariables) importFrom(utils,unzip) importFrom(xml2,read_xml) importFrom(xml2,xml_attrs) diff --git a/NEWS.md b/NEWS.md index 8e07745..c2a49ab 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,12 @@ +# 0.3.0 WIP + +- return tibbles where possible & not stomping on input type (#7) +- change tests to test for `tbl` vs `data.frame` (related to #7) +- don't stomp on data frame-ish input type in `assign_colnames()` +- prefix `::` (non-user facing tweak) +- switch all `*apply()` to `purrr` calls since we bother to import `purrr` (non-user facing tweak) +- + # 0.2.0 released - update for new xml2 pkg compatibility diff --git a/R/aaa.r b/R/aaa.r index ee6fa33..e3396d0 100644 --- a/R/aaa.r +++ b/R/aaa.r @@ -1 +1 @@ -n <- author <- meta <- NULL +utils::globalVariables(c("n", "author", "meta")) diff --git a/R/assign_colnames.r b/R/assign_colnames.r index 744281f..ac30abc 100644 --- a/R/assign_colnames.r +++ b/R/assign_colnames.r @@ -25,7 
+25,7 @@ #' docx_tbl_count(real_world) #' #' # get all the tables -#' tbls <- docx_extract_all(real_world) +#' tbls <- docx_extract_all_tbls(real_world) #' #' # make table 1 better #' assign_colnames(tbls[[1]], 2) @@ -36,6 +36,8 @@ assign_colnames <- function(dat, row, remove=TRUE, remove_previous=remove) { if ((row > nrow(dat)) | (row < 1)) return(dat) + d_class <- class(dat) + # just in case someone shoots us a data.table or other stranger things dat <- data.frame(dat, stringsAsFactors=FALSE) @@ -47,6 +49,9 @@ assign_colnames <- function(dat, row, remove=TRUE, remove_previous=remove) { dat <- dat[-(start:end),] rownames(dat) <- NULL + # give them back what they passed in + class(dat) <- d_class + dat } diff --git a/R/describe.r b/R/describe.r index ca77285..1159ddb 100644 --- a/R/describe.r +++ b/R/describe.r @@ -26,10 +26,10 @@ docx_describe_tbls <- function(docx) { tbl <- tbls[[i]] - cells <- xml_find_all(tbl, "./w:tr/w:tc", ns=ns) - rows <- xml_find_all(tbl, "./w:tr", ns=ns) + cells <- xml2::xml_find_all(tbl, "./w:tr/w:tc", ns=ns) + rows <- xml2::xml_find_all(tbl, "./w:tr", ns=ns) - cell_count_by_row <- sapply(rows, function(row) { length(xml_find_all(row, "./w:tc", ns)) }) + cell_count_by_row <- purrr::map_int(rows, ~{ length(xml2::xml_find_all(.x, "./w:tc", ns)) }) row_counts <- paste0(unique(cell_count_by_row), collapse=", ") max_cell_count <- max(cell_count_by_row) @@ -84,16 +84,17 @@ docx_describe_cmnts <- function(docx) { cat(sprintf("Found %d comments.\n", length(cmnts))) - map_df(xml_attrs(cmnts), function(x) { + purrr::map_df(xml_attrs(cmnts), function(x) { as_data_frame(t(cbind.data.frame(x, stringsAsFactors=FALSE))) }) -> meta cmnt_df <- dplyr::bind_cols(meta, - cbind.data.frame(comment_text=xml_text(cmnts), + cbind.data.frame(comment_text=xml2::xml_text(cmnts), stringsAsFactors=FALSE)) aut_df <- dplyr::count(cmnt_df, author) aut_df <- dplyr::arrange(aut_df, -n) + print(select(aut_df, author, `# Comments`=n)) } diff --git a/R/docx_find_tbls.r 
b/R/docx_find_tbls.r index f34442f..64e205d 100644 --- a/R/docx_find_tbls.r +++ b/R/docx_find_tbls.r @@ -25,16 +25,15 @@ docx_extract_tbl <- function(docx, tbl_number=1, header=TRUE, trim=TRUE) { ns <- docx$ns tbl <- docx$tbls[[tbl_number]] - cells <- xml_find_all(tbl, "./w:tr/w:tc", ns=ns) - rows <- xml_find_all(tbl, "./w:tr", ns=ns) + cells <- xml2::xml_find_all(tbl, "./w:tr/w:tc", ns=ns) + rows <- xml2::xml_find_all(tbl, "./w:tr", ns=ns) - bind_rows(lapply(rows, function(row) { - - vals <- xml_text(xml_find_all(row, "./w:tc", ns=ns), trim=trim) + purrr::map_df(rows, ~{ + vals <- xml2::xml_text(xml2::xml_find_all(.x, "./w:tc", ns=ns), trim=trim) names(vals) <- sprintf("V%d", 1:length(vals)) - data.frame(as.list(vals), stringsAsFactors=FALSE) - - })) -> dat + as.list(vals) + # data.frame(as.list(vals), stringsAsFactors=FALSE) + }) -> dat if (header) { colnames(dat) <- dat[1,] @@ -48,7 +47,7 @@ docx_extract_tbl <- function(docx, tbl_number=1, header=TRUE, trim=TRUE) { rownames(dat) <- NULL - dat + tibble::as_tibble(dat) } diff --git a/R/docxtractr-package.r b/R/docxtractr-package.r index aeebe1c..84ceaa7 100644 --- a/R/docxtractr-package.r +++ b/R/docxtractr-package.r @@ -11,9 +11,9 @@ #' #' @author Bob Rudis (@@hrbrmstr) #' @importFrom xml2 xml_find_all xml_text xml_ns xml_find_first xml_attrs -#' @importFrom tibble data_frame as_data_frame -#' @importFrom dplyr bind_rows bind_cols count arrange select +#' @importFrom tibble data_frame as_data_frame as_tibble +#' @importFrom dplyr bind_cols count arrange select #' @importFrom tools file_ext -#' @importFrom utils download.file unzip -#' @importFrom purrr map_df +#' @importFrom utils download.file unzip globalVariables +#' @importFrom purrr map_df map map_int map_chr map_lgl NULL diff --git a/R/extract_all.r b/R/extract_all.r index f7ced3d..23a0d65 100644 --- a/R/extract_all.r +++ b/R/extract_all.r @@ -27,7 +27,7 @@ docx_extract_all_tbls <- function(docx, guess_header=TRUE, trim=TRUE) { hdr <- FALSE if 
(guess_header) { tbl <- docx$tbls[[i]] - rows <- xml_find_all(tbl, "./w:tr", ns=ns) + rows <- xml2::xml_find_all(tbl, "./w:tr", ns=ns) hdr <- !is.na(has_header(tbl, rows, ns)) } docx_extract_tbl(docx, i, hdr, trim) @@ -77,12 +77,14 @@ docx_extract_all_cmnts <- function(docx) { comments <- docx$cmnts - map_df(xml_attrs(comments), function(x) { + purrr::map_df(xml2::xml_attrs(comments), function(x) { as_data_frame(t(cbind.data.frame(x, stringsAsFactors=FALSE))) }) -> meta bind_cols(meta, - cbind.data.frame(comment_text=xml_text(comments), - stringsAsFactors=FALSE)) + cbind.data.frame(comment_text=xml2::xml_text(comments), + stringsAsFactors=FALSE)) -> out + + as_tibble(out) } diff --git a/R/mcga.r b/R/mcga.r new file mode 100644 index 0000000..15183f0 --- /dev/null +++ b/R/mcga.r @@ -0,0 +1,26 @@ +#' Make Column Names Great Again +#' +#' Remove punctuation and spaces and turn them to underscores plus convert to lower case. +#' +#' @md +#' @param tbl a `data.frame`-like object +#' @return whatever class `tbl` was but with truly great, really great column names. They're amazing. +#' Trust me. They'll be incredible column names once we're done. 
+#' @export +#' @examples +#' real_world <- read_docx(system.file("examples/realworld.docx", package="docxtractr")) +#' tbls <- docx_extract_all_tbls(real_world) +#' mcga(assign_colnames(tbls[[1]], 2)) +mcga <- function(tbl) { + + x <- colnames(tbl) + x <- tolower(x) + x <- gsub("[[:punct:][:space:]]+", "_", x) + x <- gsub("_+", "_", x) + x <- gsub("(^_|_$)", "", x) + + colnames(tbl) <- x + + tbl + +} diff --git a/R/read_docs.r b/R/read_docs.r index 6aea6f8..4c7bbec 100644 --- a/R/read_docs.r +++ b/R/read_docs.r @@ -36,20 +36,20 @@ read_docx <- function(path) { unzip(tmpf, exdir=sprintf("%s/docdata", tmpd)) # read the actual XML document - doc <- read_xml(sprintf("%s/docdata/word/document.xml", tmpd)) + doc <- xml2::read_xml(sprintf("%s/docdata/word/document.xml", tmpd)) # extract the namespace - ns <- xml_ns(doc) + ns <- xml2::xml_ns(doc) # get the tables - tbls <- xml_find_all(doc, ".//w:tbl", ns=ns) + tbls <- xml2::xml_find_all(doc, ".//w:tbl", ns=ns) if (file.exists(sprintf("%s/docdata/word/comments.xml", tmpd))) { docmnt <- read_xml(sprintf("%s/docdata/word/comments.xml", tmpd)) # get the comments - cmnts <- xml_find_all(docmnt, ".//w:comment", ns=ns) + cmnts <- xml2::xml_find_all(docmnt, ".//w:comment", ns=ns) } else { - cmnts <- xml_find_all(doc, ".//w:comment", ns=ns) + cmnts <- xml2::xml_find_all(doc, ".//w:comment", ns=ns) } # make an object for other functions to work with diff --git a/R/utils.r b/R/utils.r index 85861c6..2fd3210 100644 --- a/R/utils.r +++ b/R/utils.r @@ -1,7 +1,7 @@ # used by cuntions to make sure they are working with a well-formed docx object ensure_docx <- function(docx) { if (!inherits(docx, "docx")) stop("Must pass in a 'docx' object", call.=FALSE) - if (!(all(sapply(c("docx", "ns", "tbls", "path"), exists, where=docx)))) + if (!(all(purrr::map_lgl(c("docx", "ns", "tbls", "path"), exists, where=docx)))) stop("'docx' object missing necessary components", call.=FALSE) } @@ -11,16 +11,16 @@ has_header <- function(tbl, rows, ns) { # 
microsoft has a tag for some table structure info. examine it to # see if the creator of the header made the first row special which # will likely mean it's a header candidate - look <- try(xml_find_first(tbl, "./w:tblPr/w:tblLook", ns), silent=TRUE) + look <- try(xml2::xml_find_first(tbl, "./w:tblPr/w:tblLook", ns), silent=TRUE) if (inherits(look, "try-error")) { return(NA) } else { - look_attr <- xml_attrs(look) + look_attr <- xml2::xml_attrs(look) if ("firstRow" %in% names(look_attr)) { if (look_attr["firstRow"] == "0") { return(NA) } else { - return(paste0(xml_text(xml_find_all(rows[[1]], "./w:tc", ns)), collapse=", ")) + return(paste0(xml2::xml_text(xml_find_all(rows[[1]], "./w:tc", ns)), collapse=", ")) } } else { return(NA) diff --git a/README.Rmd b/README.Rmd index 4b92022..d28f0b6 100644 --- a/README.Rmd +++ b/README.Rmd @@ -34,6 +34,7 @@ The following functions are implemented: - `docx_tbl_count`: Get number of tables in a Word document - `docx_cmnt_count`: Get number of comments in a Word document - `assign_colnames`: Make a specific row the column names for the specified data.frame +- `mcga` : Make column names great again The following data file are included: @@ -134,9 +135,12 @@ tbls <- docx_extract_all(real_world) # see table 1 tbls[[1]] -#' # make table 1 better +# make table 1 better assign_colnames(tbls[[1]], 2) +# make table 1's column names great again +mcga(assign_colnames(tbls[[1]], 2)) + # see table 5 tbls[[5]] diff --git a/README.md b/README.md index 597e4a7..f97db7c 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ The following functions are implemented: - `docx_tbl_count`: Get number of tables in a Word document - `docx_cmnt_count`: Get number of comments in a Word document - `assign_colnames`: Make a specific row the column names for the specified data.frame +- `mcga` : Make column names great again The following data file are included: @@ -56,7 +57,7 @@ library(dplyr) # current verison packageVersion("docxtractr") -#> [1] '0.2.0' 
+#> [1] '0.3.0' # one table doc <- read_docx(system.file("examples/data.docx", package="docxtractr")) @@ -65,7 +66,7 @@ docx_tbl_count(doc) #> [1] 1 docx_describe_tbls(doc) -#> Word document [/Library/Frameworks/R.framework/Versions/3.3/Resources/library/docxtractr/examples/data.docx] +#> Word document [/Library/Frameworks/R.framework/Versions/3.4/Resources/library/docxtractr/examples/data.docx] #> #> Table 1 #> total cells: 16 @@ -74,24 +75,30 @@ docx_describe_tbls(doc) #> has header : likely! => possibly [This, Is, A, Column] docx_extract_tbl(doc, 1) -#> This Is A Column -#> 1 1 Cat 3.4 Dog -#> 2 3 Fish 100.3 Bird -#> 3 5 Pelican -99 Kangaroo +#> # A tibble: 3 x 4 +#> This Is A Column +#> +#> 1 1 Cat 3.4 Dog +#> 2 3 Fish 100.3 Bird +#> 3 5 Pelican -99 Kangaroo docx_extract_tbl(doc) -#> This Is A Column -#> 1 1 Cat 3.4 Dog -#> 2 3 Fish 100.3 Bird -#> 3 5 Pelican -99 Kangaroo +#> # A tibble: 3 x 4 +#> This Is A Column +#> +#> 1 1 Cat 3.4 Dog +#> 2 3 Fish 100.3 Bird +#> 3 5 Pelican -99 Kangaroo docx_extract_tbl(doc, header=FALSE) #> NOTE: header=FALSE but table has a marked header row in the Word document -#> V1 V2 V3 V4 -#> 1 This Is A Column -#> 2 1 Cat 3.4 Dog -#> 3 3 Fish 100.3 Bird -#> 4 5 Pelican -99 Kangaroo +#> # A tibble: 4 x 4 +#> V1 V2 V3 V4 +#> +#> 1 This Is A Column +#> 2 1 Cat 3.4 Dog +#> 3 3 Fish 100.3 Bird +#> 4 5 Pelican -99 Kangaroo # url @@ -116,22 +123,23 @@ docx_describe_tbls(budget) #> has header : unlikely docx_extract_tbl(budget, 1) -#> Short-term Portfolio Long-term Portfolio Total Portfolio Values -#> 1 Portfolio Balance (Market Value) * $ 123,651,911 $ 294,704,136 $ 418,356,047 -#> 2 Effective Yield 0.16 % 1.42 % 1.05 % -#> 3 Avg. 
Weighted Maturity 11 Days 2.4 Years 1.7 Years -#> 4 Net Earnings $ 18,470 $ 350,554 $ 369,024 -#> 5 Benchmark** 0.02 % 0.41 % 0.27 % +#> # A tibble: 5 x 4 +#> `` `Short-term Portfolio` `Long-term Portfolio` `Total Portfolio Values` +#> +#> 1 Portfolio Balance (Market Value) * $ 123,651,911 $ 294,704,136 $ 418,356,047 +#> 2 Effective Yield 0.16 % 1.42 % 1.05 % +#> 3 Avg. Weighted Maturity 11 Days 2.4 Years 1.7 Years +#> 4 Net Earnings $ 18,470 $ 350,554 $ 369,024 +#> 5 Benchmark** 0.02 % 0.41 % 0.27 % docx_extract_tbl(budget, 2) -#> Amount of Funds (Market Value) Maturity Effective Yield Interpolated Yield -#> 1 Short-Term Portfolio $ 123,651,911 11 days 0.16 % 0.01 % -#> 2 Long-Term Portfolio $ 294,704,136 2.4 years 1.42 % 0.41 % -#> 3 Total Portfolio $ 418,356,047 1.7 years 1.05 % 0.27 % -#> Total Return Monthly Total Return Annual -#> 1 0.013 0.160 -#> 2 0.437 0.250 -#> 3 0.298 0.222 +#> # A tibble: 3 x 7 +#> `` `Amount of Funds (Market Value)` Maturity `Effective Yield` `Interpolated Yield` +#> +#> 1 Short-Term Portfolio $ 123,651,911 11 days 0.16 % 0.01 % +#> 2 Long-Term Portfolio $ 294,704,136 2.4 years 1.42 % 0.41 % +#> 3 Total Portfolio $ 418,356,047 1.7 years 1.05 % 0.27 % +#> # ... with 2 more variables: `Total Return Monthly` , `Total Return Annual` # three tables doc3 <- read_docx(system.file("examples/data3.docx", package="docxtractr")) @@ -140,7 +148,7 @@ docx_tbl_count(doc3) #> [1] 3 docx_describe_tbls(doc3) -#> Word document [/Library/Frameworks/R.framework/Versions/3.3/Resources/library/docxtractr/examples/data3.docx] +#> Word document [/Library/Frameworks/R.framework/Versions/3.4/Resources/library/docxtractr/examples/data3.docx] #> #> Table 1 #> total cells: 16 @@ -161,13 +169,15 @@ docx_describe_tbls(doc3) #> has header : likely! 
=> possibly [Foo, Bar] docx_extract_tbl(doc3, 3) -#> Foo Bar -#> 1 Aa Bb -#> 2 Dd Ee -#> 3 Gg Hh -#> 4 1 2 -#> 5 Zz Jj -#> 6 Tt ii +#> # A tibble: 6 x 2 +#> Foo Bar +#> +#> 1 Aa Bb +#> 2 Dd Ee +#> 3 Gg Hh +#> 4 1 2 +#> 5 Zz Jj +#> 6 Tt ii # no tables none <- read_docx(system.file("examples/none.docx", package="docxtractr")) @@ -188,7 +198,7 @@ docx_tbl_count(complx) #> [1] 5 docx_describe_tbls(complx) -#> Word document [/Library/Frameworks/R.framework/Versions/3.3/Resources/library/docxtractr/examples/complex.docx] +#> Word document [/Library/Frameworks/R.framework/Versions/3.4/Resources/library/docxtractr/examples/complex.docx] #> #> Table 1 #> total cells: 16 @@ -221,28 +231,34 @@ docx_describe_tbls(complx) #> has header : unlikely docx_extract_tbl(complx, 3, header=TRUE) -#> Foo Bar -#> 1 Aa Bb -#> 2 Dd Ee -#> 3 Gg Hh -#> 4 1 2 -#> 5 Zz Jj -#> 6 Tt ii +#> # A tibble: 6 x 2 +#> Foo Bar +#> +#> 1 Aa Bb +#> 2 Dd Ee +#> 3 Gg Hh +#> 4 1 2 +#> 5 Zz Jj +#> 6 Tt ii docx_extract_tbl(complx, 4, header=TRUE) -#> Foo Bar Baz -#> 1 Aa BbCc -#> 2 Dd Ee Ff -#> 3 Gg Hh ii +#> # A tibble: 3 x 3 +#> Foo Bar Baz +#> +#> 1 Aa BbCc +#> 2 Dd Ee Ff +#> 3 Gg Hh ii docx_extract_tbl(complx, 5, header=TRUE) -#> Foo Bar Baz -#> 1 Aa Bb Cc -#> 2 Dd Ee Ff -#> 3 Gg Hh Ii -#> 4 Jj88 Kk Ll -#> 5 Uu Ii -#> 6 Hh Ii h +#> # A tibble: 6 x 3 +#> Foo Bar Baz +#> +#> 1 Aa Bb Cc +#> 2 Dd Ee Ff +#> 3 Gg Hh Ii +#> 4 Jj88 Kk Ll +#> 5 Uu Ii +#> 6 Hh Ii h # a "real" Word doc real_world <- read_docx(system.file("examples/realworld.docx", package="docxtractr")) @@ -256,7 +272,9 @@ tbls <- docx_extract_all(real_world) # see table 1 tbls[[1]] +#> # A tibble: 9 x 9 #> V1 V2 V3 V4 V5 +#> #> 1 Lesson 1: Step 1 #> 2 Country Birthrate Death Rate Population Growth 2005 Population Growth 2050 #> 3 USA 2.06 0.51% 0.92% -0.06% @@ -266,20 +284,27 @@ tbls[[1]] #> 7 Italy 1.28 0.72% 0.35% -1.33% #> 8 Mexico 2.43 0.25% 1.41% 0.96% #> 9 Nigeria 4.78 0.26% 2.46% 3.58% -#> V6 V7 V8 V9 -#> 1 -#> 2 Relative place in Transition 
Social Factors 1 Social Factors 2 Social Factors 3 -#> 3 Post- Industrial Female Independence Stable Birth Rate Good technology -#> 4 Post- Industrial Government intervention Technology Urbanization -#> 5 Mature Industrial Not yet industrialized More children needed Slightly higher life expectancy -#> 6 Post Industrial Economic growth Poverty Becoming more industrialized -#> 7 Late Post industrial Stable birth rate People marry later Better health care -#> 8 Mature Industrial Better health care Emigration Economic growth -#> 9 End of Mechanization of Agriculture Disease People marry early People have many children - -#' # make table 1 better +#> # ... with 4 more variables: V6 , V7 , V8 , V9 + +# make table 1 better assign_colnames(tbls[[1]], 2) -#> Country Birthrate Death Rate Population Growth 2005 Population Growth 2050 Relative place in Transition +#> # A tibble: 7 x 9 +#> Country Birthrate `Death Rate` `Population Growth 2005` `Population Growth 2050` `Relative place in Transition` +#> +#> 1 USA 2.06 0.51% 0.92% -0.06% Post- Industrial +#> 2 China 1.62 0.3% 0.6% -0.58% Post- Industrial +#> 3 Egypt 2.83 0.41% 2.0% 1.32% Mature Industrial +#> 4 India 2.35 0.34% 1.56% 0.76% Post Industrial +#> 5 Italy 1.28 0.72% 0.35% -1.33% Late Post industrial +#> 6 Mexico 2.43 0.25% 1.41% 0.96% Mature Industrial +#> 7 Nigeria 4.78 0.26% 2.46% 3.58% End of Mechanization of Agriculture +#> # ... 
with 3 more variables: `Social Factors 1` , `Social Factors 2` , `Social Factors 3` + +# make table 1's column names great again +mcga(assign_colnames(tbls[[1]], 2)) +#> # A tibble: 7 x 9 +#> country birthrate death_rate population_growth_2005 population_growth_2050 relative_place_in_transition +#> #> 1 USA 2.06 0.51% 0.92% -0.06% Post- Industrial #> 2 China 1.62 0.3% 0.6% -0.58% Post- Industrial #> 3 Egypt 2.83 0.41% 2.0% 1.32% Mature Industrial @@ -287,18 +312,13 @@ assign_colnames(tbls[[1]], 2) #> 5 Italy 1.28 0.72% 0.35% -1.33% Late Post industrial #> 6 Mexico 2.43 0.25% 1.41% 0.96% Mature Industrial #> 7 Nigeria 4.78 0.26% 2.46% 3.58% End of Mechanization of Agriculture -#> Social Factors 1 Social Factors 2 Social Factors 3 -#> 1 Female Independence Stable Birth Rate Good technology -#> 2 Government intervention Technology Urbanization -#> 3 Not yet industrialized More children needed Slightly higher life expectancy -#> 4 Economic growth Poverty Becoming more industrialized -#> 5 Stable birth rate People marry later Better health care -#> 6 Better health care Emigration Economic growth -#> 7 Disease People marry early People have many children +#> # ... 
with 3 more variables: social_factors_1 , social_factors_2 , social_factors_3 # see table 5 tbls[[5]] +#> # A tibble: 5 x 6 #> V1 V2 V3 V4 V5 V6 +#> #> 1 Lesson 2: Step 1 #> 2 Nigeria Default Prediction + 5 years +15 years -5 years #> 3 Birth rate 4.78 Goes Down 4.76 4.72 4.79 @@ -307,23 +327,25 @@ tbls[[5]] # make table 5 better assign_colnames(tbls[[5]], 2) -#> Nigeria Default Prediction + 5 years +15 years -5 years -#> 1 Birth rate 4.78 Goes Down 4.76 4.72 4.79 -#> 2 Death rate 0.36% Stay the Same 0.42% 0.52% 0.3% -#> 3 Population growth 3.58% Goes Down 3.02% 2.32% 4.38% +#> # A tibble: 3 x 6 +#> Nigeria Default Prediction `+ 5 years` `+15 years` `-5 years` +#> +#> 1 Birth rate 4.78 Goes Down 4.76 4.72 4.79 +#> 2 Death rate 0.36% Stay the Same 0.42% 0.52% 0.3% +#> 3 Population growth 3.58% Goes Down 3.02% 2.32% 4.38% # comments cmnts <- read_docx(system.file("examples/comments.docx", package="docxtractr")) print(cmnts) #> No tables in document -#> Word document [/Library/Frameworks/R.framework/Versions/3.3/Resources/library/docxtractr/examples/comments.docx] +#> Word document [/Library/Frameworks/R.framework/Versions/3.4/Resources/library/docxtractr/examples/comments.docx] #> #> Found 3 comments. 
#> # A tibble: 1 x 2 -#> author # Comments -#> -#> 1 boB Rudis 3 +#> author `# Comments` +#> +#> 1 boB Rudis 3 glimpse(docx_extract_all_cmnts(cmnts)) #> Observations: 3 @@ -347,7 +369,7 @@ library(testthat) #> matches date() -#> [1] "Tue Jul 19 22:56:37 2016" +#> [1] "Mon Jun 19 05:52:59 2017" test_dir("tests/") #> testthat results ======================================================================================================== diff --git a/docxtractr.Rproj b/docxtractr.Rproj index 773de7e..9f58d70 100644 --- a/docxtractr.Rproj +++ b/docxtractr.Rproj @@ -5,21 +5,19 @@ SaveWorkspace: No AlwaysSaveHistory: Default EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 Encoding: UTF-8 +RnwWeave: Sweave +LaTeX: pdfLaTeX + AutoAppendNewline: Yes StripTrailingWhitespace: Yes BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source -PackageRoxygenize: rd,collate,namespace - -UseSpacesForTab: Yes -NumSpacesForTab: 2 - -RnwWeave: Sweave -LaTeX: pdfLaTeX - PackageBuildArgs: --resave-data PackageCheckArgs: --as-cran +PackageRoxygenize: rd,collate,namespace diff --git a/man/assign_colnames.Rd b/man/assign_colnames.Rd index 3f24f98..43531a1 100644 --- a/man/assign_colnames.Rd +++ b/man/assign_colnames.Rd @@ -37,7 +37,7 @@ real_world <- read_docx(system.file("examples/realworld.docx", package="docxtrac docx_tbl_count(real_world) # get all the tables -tbls <- docx_extract_all(real_world) +tbls <- docx_extract_all_tbls(real_world) # make table 1 better assign_colnames(tbls[[1]], 2) @@ -48,4 +48,3 @@ assign_colnames(tbls[[5]], 2) \seealso{ \code{\link{docx_extract_all}}, \code{\link{docx_extract_tbl}} } - diff --git a/man/docx_cmnt_count.Rd b/man/docx_cmnt_count.Rd index e316727..0ca7ccc 100644 --- a/man/docx_cmnt_count.Rd +++ b/man/docx_cmnt_count.Rd @@ -19,4 +19,3 @@ Get number of comments in a Word document cmnts <- read_docx(system.file("examples/comments.docx", package="docxtractr")) docx_cmnt_count(cmnts) } - diff 
--git a/man/docx_describe_cmnts.Rd b/man/docx_describe_cmnts.Rd index 3b6e897..0ad06d2 100644 --- a/man/docx_describe_cmnts.Rd +++ b/man/docx_describe_cmnts.Rd @@ -17,4 +17,3 @@ cmnts <- read_docx(system.file("examples/comments.docx", package="docxtractr")) docx_cmnt_count(cmnts) docx_describe_cmnts(cmnts) } - diff --git a/man/docx_describe_tbls.Rd b/man/docx_describe_tbls.Rd index 5324c3f..8a9aca6 100644 --- a/man/docx_describe_tbls.Rd +++ b/man/docx_describe_tbls.Rd @@ -18,4 +18,3 @@ complx <- read_docx(system.file("examples/complex.docx", package="docxtractr")) docx_tbl_count(complx) docx_describe_tbls(complx) } - diff --git a/man/docx_extract_all.Rd b/man/docx_extract_all.Rd index d21169e..e4c8c31 100644 --- a/man/docx_extract_all.Rd +++ b/man/docx_extract_all.Rd @@ -33,4 +33,3 @@ tbls <- docx_extract_all_tbls(real_world) \seealso{ \code{\link{assign_colnames}}, \code{\link{docx_extract_tbl}} } - diff --git a/man/docx_extract_all_cmnts.Rd b/man/docx_extract_all_cmnts.Rd index 3116149..612c181 100644 --- a/man/docx_extract_all_cmnts.Rd +++ b/man/docx_extract_all_cmnts.Rd @@ -21,4 +21,3 @@ docx_cmnt_count(cmnts) docx_describe_cmnts(cmnts) docx_extract_all_cmnts(cmnts) } - diff --git a/man/docx_extract_all_tbls.Rd b/man/docx_extract_all_tbls.Rd index 1014e49..6c51aac 100644 --- a/man/docx_extract_all_tbls.Rd +++ b/man/docx_extract_all_tbls.Rd @@ -33,4 +33,3 @@ tbls <- docx_extract_all_tbls(real_world) \seealso{ \code{\link{assign_colnames}}, \code{\link{docx_extract_tbl}} } - diff --git a/man/docx_extract_tbl.Rd b/man/docx_extract_tbl.Rd index 89beb22..04e76e3 100644 --- a/man/docx_extract_tbl.Rd +++ b/man/docx_extract_tbl.Rd @@ -31,4 +31,3 @@ docx_extract_tbl(doc3, 3) \code{\link{docx_extract_all}}, \code{\link{docx_extract_tbl}}, \code{\link{assign_colnames}} } - diff --git a/man/docx_tbl_count.Rd b/man/docx_tbl_count.Rd index b7e91f2..eeb2280 100644 --- a/man/docx_tbl_count.Rd +++ b/man/docx_tbl_count.Rd @@ -19,4 +19,3 @@ Get number of tables in a Word document 
complx <- read_docx(system.file("examples/complex.docx", package="docxtractr")) docx_tbl_count(complx) } - diff --git a/man/docxtractr.Rd b/man/docxtractr.Rd index 820a44d..dc130a5 100644 --- a/man/docxtractr.Rd +++ b/man/docxtractr.Rd @@ -15,4 +15,3 @@ comment count and extract comments from Word docx documents. \author{ Bob Rudis (@hrbrmstr) } - diff --git a/man/mcga.Rd b/man/mcga.Rd new file mode 100644 index 0000000..9150451 --- /dev/null +++ b/man/mcga.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mcga.r +\name{mcga} +\alias{mcga} +\title{Make Column Names Great Again} +\usage{ +mcga(tbl) +} +\arguments{ +\item{tbl}{a \code{data.frame}-like object} +} +\value{ +whatever class \code{tbl} was but with truly great, really great column names. They're amazing. +Trust me. They'll be incredible column names once we're done. +} +\description{ +Remove punctuation and spaces and turn them to underscores plus convert to lower case. +} +\examples{ +real_world <- read_docx(system.file("examples/realworld.docx", package="docxtractr")) +tbls <- docx_extract_all_tbls(real_world) +mcga(assign_colnames(tbls[[1]], 2)) +} diff --git a/man/print.docx.Rd b/man/print.docx.Rd index a908af0..1771e52 100644 --- a/man/print.docx.Rd +++ b/man/print.docx.Rd @@ -14,4 +14,3 @@ \description{ Display information about the document } - diff --git a/man/read_docx.Rd b/man/read_docx.Rd index 6a358d3..63a6eca 100644 --- a/man/read_docx.Rd +++ b/man/read_docx.Rd @@ -21,4 +21,3 @@ budget <- read_docx( "http://rud.is/dl/1.DOCX") } } - diff --git a/tests/testthat/test-docxtractr.R b/tests/testthat/test-docxtractr.R index 69d8ab6..f02caad 100644 --- a/tests/testthat/test-docxtractr.R +++ b/tests/testthat/test-docxtractr.R @@ -1,11 +1,11 @@ -context("basic functionality") +context("docx extraction works") test_that("we can do something", { doc <- read_docx(system.file("examples/data.docx", package="docxtractr")) expect_that(doc, is_a("docx")) 
expect_that(docx_tbl_count(doc), equals(1)) - expect_that(docx_extract_tbl(doc, 1), is_a("data.frame")) + expect_that(docx_extract_tbl(doc, 1), is_a("tbl")) complx <- read_docx(system.file("examples/complex.docx", package="docxtractr")) expect_that(docx_tbl_count(complx), equals(5)) @@ -14,9 +14,9 @@ test_that("we can do something", { tmp_4 <- docx_extract_tbl(complx, 4) tmp_5 <- docx_extract_tbl(complx, 5) - expect_that(tmp_3, is_a("data.frame")) - expect_that(tmp_4, is_a("data.frame")) - expect_that(tmp_5, is_a("data.frame")) + expect_that(tmp_3, is_a("tbl")) + expect_that(tmp_4, is_a("tbl")) + expect_that(tmp_5, is_a("tbl")) expect_that(nrow(tmp_3), equals(6)) expect_that(ncol(tmp_4), equals(3))