0.1.0.9000 - two new functions

9 years ago · 761ecad7ff
11 changed files with 328 additions and 42 deletions
--- a/4
+++ b/4
@ -1,11 +1,11 @@
 Package: docxtractr
 Title: Extract Tables from Microsoft Word Documents
-Version: 0.0.1.9001
+Version: 0.1.0.9000
 Authors@R: c(person("Bob", "Rudis", email = "bob@rudis.net", role = c("aut", "cre")))
 Description: Microsoft Word docx files provide an XML structure that is fairly
    straightforward to navigate, especially when it applies to Word tables. The
    docxtractr package provides tools to determine table count, table structure and
-    extract tables from Microsoft Word docx documents.
+    extract + clean tables from Microsoft Word docx documents.
 Depends: R (>= 3.0.0)
 License: MIT + file LICENSE
 LazyData: true
--- a/2
+++ b/2
@ -1,7 +1,9 @@
 # Generated by roxygen2 (4.1.1): do not edit by hand

 S3method(print,docx)
+export(assign_colnames)
 export(docx_describe_tbls)
+export(docx_extract_all)
 export(docx_extract_tbl)
 export(docx_tbl_count)
 export(read_docx)
--- a/R/assign_colnames.r
+++ b/R/assign_colnames.r
@ -0,0 +1,52 @@
+#' Make a specific row the column names for the specified data.frame
+#'
+#' Many tables in Word documents are in twisted formats where there may be
+#' labels or other oddities mixed in that make it difficult to work with the
+#' underlying data. This function makes it easy to identify a particular row
+#' in a scraped \code{data.frame} as the one containing column names and
+#' have it become the column names, (optionally) removing it and
+#' (optionally) all of the rows before it (since that's usually what needs
+#' to be done).
+#'
+#' The input is first converted to a plain \code{data.frame}. If \code{row}
+#' is less than 1 or greater than the number of rows in \code{dat}, then
+#' \code{dat} is returned untouched.
+#'
+#' @param dat can be any \code{data.frame} but is intended for use with
+#'        ones returned by this package
+#' @param row numeric value indicating the row number that is to become
+#'        the column names
+#' @param remove remove row specified by \code{row} after making it
+#'        the column names? (Default: \code{TRUE})
+#' @param remove_previous remove any rows preceding \code{row}? (Default:
+#'        \code{TRUE} but will be assigned whatever is given for
+#'        \code{remove}).
+#' @return \code{data.frame} with the new column names and row names reset
+#'         to the default sequence
+#' @seealso \code{\link{docx_extract_all}}, \code{\link{docx_extract_tbl}}
+#' @export
+#' @examples
+#' # a "real" Word doc
+#' real_world <- read_docx(system.file("examples/realworld.docx", package="docxtractr"))
+#' docx_tbl_count(real_world)
+#'
+#' # get all the tables
+#' tbls <- docx_extract_all(real_world)
+#'
+#' # make table 1 better
+#' assign_colnames(tbls[[1]], 2)
+#'
+#' # make table 5 better
+#' assign_colnames(tbls[[5]], 2)
+assign_colnames <- function(dat, row, remove=TRUE, remove_previous=remove) {
+
+  # scalar test, so use the short-circuiting operator
+  if ((row > nrow(dat)) || (row < 1)) return(dat)
+
+  # just in case someone shoots us a data.table or other stranger things
+  dat <- data.frame(dat, stringsAsFactors=FALSE)
+
+  # convert each cell of the header row separately so that factor columns
+  # contribute their labels rather than their integer codes
+  colnames(dat) <- vapply(dat[row, , drop=FALSE], as.character,
+                          character(1), USE.NAMES=FALSE)
+
+  # collect the rows to discard; `remove` and `remove_previous` are honoured
+  # independently of each other
+  drop_rows <- integer(0)
+  if (remove_previous) drop_rows <- seq_len(row - 1)
+  if (remove) drop_rows <- c(drop_rows, row)
+
+  # negative indexing with an empty vector would select zero rows, so only
+  # subset when there is something to drop; drop=FALSE keeps one-column
+  # tables as data.frames
+  if (length(drop_rows) > 0) dat <- dat[-drop_rows, , drop=FALSE]
+
+  rownames(dat) <- NULL
+
+  dat
+
+}
--- a/R/docx_find_tbls.r
+++ b/R/docx_find_tbls.r
@ -9,6 +9,8 @@
 #' @param header assume first row of table is a header row? (default; \code{TRUE})
 #' @param trim trim leading/trailing whitespace (if any) in cells? (default: \code{TRUE})
 #' @return \code{data.frame}
+#' @seealso \code{\link{docx_extract_all}},
+#'          \code{\link{assign_colnames}}
 #' @export
 #' @examples
 #' doc3 <- read_docx(system.file("examples/data3.docx", package="docxtractr"))
--- a/R/extract_all.r
+++ b/R/extract_all.r
@ -0,0 +1,38 @@
+#' Extract all tables from a Word document
+#'
+#' Walks every table found in \code{docx} and extracts each one with
+#' \code{\link{docx_extract_tbl}}, returning the results as a list. When
+#' \code{guess_header} is \code{TRUE} each table is checked for a header
+#' row and the first row is used as the header only if one is detected;
+#' when it is \code{FALSE} every table is extracted without a header.
+#'
+#' @param docx \code{docx} object read with \code{read_docx}
+#' @param guess_header should the function make a guess as to the existence of
+#'        a header in a table? (Default: \code{TRUE})
+#' @param trim trim leading/trailing whitespace (if any) in cells? (default: \code{TRUE})
+#' @return \code{list} of \code{data.frame}s or an empty \code{list} if no
+#'         tables exist in \code{docx}
+#' @seealso \code{\link{assign_colnames}}, \code{\link{docx_extract_tbl}}
+#' @export
+#' @examples
+#' # a "real" Word doc
+#'
+#' real_world <- read_docx(system.file("examples/realworld.docx", package="docxtractr"))
+#' docx_tbl_count(real_world)
+#'
+#' # get all the tables
+#' tbls <- docx_extract_all(real_world)
+docx_extract_all <- function(docx, guess_header=TRUE, trim=TRUE) {
+
+  ensure_docx(docx)
+
+  # count once; reused for the empty check and for the iteration below
+  tbl_count <- docx_tbl_count(docx)
+  if (tbl_count < 1) return(list())
+
+  ns <- docx$ns
+
+  lapply(seq_len(tbl_count), function(i) {
+    hdr <- FALSE
+    if (guess_header) {
+      tbl <- docx$tbls[[i]]
+      rows <- xml_find_all(tbl, "./w:tr", ns=ns)
+      # any non-NA result from has_header() is treated as "header present"
+      hdr <- !is.na(has_header(tbl, rows, ns))
+    }
+    docx_extract_tbl(docx, i, header=hdr, trim=trim)
+  })
+
+}
--- a/README.Rmd
+++ b/README.Rmd
@ -28,7 +28,9 @@ The following functions are implemented:
 - `read_docx`:	Read in a Word document for table extraction
 - `docx_describe_tbls`:	Returns a description of all the tables in the Word document
 - `docx_extract_tbl`:	Extract a table from a Word document
+- `docx_extract_all`:	Extract all tables from a Word document
 - `docx_tbl_count`:	Get number of tables in a Word document
+- `assign_colnames`:	Make a specific row the column names for the specified data.frame

 The following data files are included:

@ -36,9 +38,11 @@ The following data file are included:
 - `system.file("examples/data3.docx", package="docxtractr")`: Word docx with 3 tables
 - `system.file("examples/none.docx", package="docxtractr")`: Word docx with 0 tables
 - `system.file("examples/complex.docx", package="docxtractr")`: Word docx with non-uniform tables
+- `system.file("examples/realworld.docx", package="docxtractr")`: A "real world" Word docx file with tables of all shapes and sizes

 ### News

+- Version 0.1.0.9000 released - new function to extract all tables and a function to cleanup column names in scraped tables
 - Version 0.0.1.9001 released - pre-CRAN flight check
 - Version 0.0.1.9000 released - read from URL
 - Version 0.0.0.9000 released
@ -117,6 +121,26 @@ docx_extract_tbl(complx, 3, header=TRUE)
 docx_extract_tbl(complx, 4, header=TRUE)

 docx_extract_tbl(complx, 5, header=TRUE)
+
+# a "real" Word doc
+real_world <- read_docx(system.file("examples/realworld.docx", package="docxtractr"))
+
+docx_tbl_count(real_world)
+
+# get all the tables
+tbls <- docx_extract_all(real_world)
+
+# see table 1
+tbls[[1]]
+
+# make table 1 better
+assign_colnames(tbls[[1]], 2)
+
+# see table 5
+tbls[[5]]
+
+# make table 5 better
+assign_colnames(tbls[[5]], 2)
 ```

 ### Test Results
--- a/README.md
+++ b/README.md
@ -1,7 +1,7 @@
 <!-- README.md is generated from README.Rmd. Please edit that file -->
 ![](docxtractr-logo.png)

-docxtractr is an R pacakge for extracting tables out of Word documents (docx)
+docxtractr is an R package for extracting tables out of Word documents (docx)

 Microsoft Word docx files provide an XML structure that is fairly straightforward to navigate, especially when it applies to Word tables. The docxtractr package provides tools to determine table count, table structure and extract tables from Microsoft Word docx documents.

@ -10,7 +10,9 @@ The following functions are implemented:
 -   `read_docx`: Read in a Word document for table extraction
 -   `docx_describe_tbls`: Returns a description of all the tables in the Word document
 -   `docx_extract_tbl`: Extract a table from a Word document
+-   `docx_extract_all`: Extract all tables from a Word document
 -   `docx_tbl_count`: Get number of tables in a Word document
+-   `assign_colnames`: Make a specific row the column names for the specified data.frame

 The following data files are included:

@ -18,9 +20,11 @@ The following data file are included:
 -   `system.file("examples/data3.docx", package="docxtractr")`: Word docx with 3 tables
 -   `system.file("examples/none.docx", package="docxtractr")`: Word docx with 0 tables
 -   `system.file("examples/complex.docx", package="docxtractr")`: Word docx with non-uniform tables
+-   `system.file("examples/realworld.docx", package="docxtractr")`: A "real world" Word docx file with tables of all shapes and sizes

 ### News

+-   Version 0.1.0.9000 released - new function to extract all tables and a function to cleanup column names in scraped tables
 -   Version 0.0.1.9001 released - pre-CRAN flight check
 -   Version 0.0.1.9000 released - read from URL
 -   Version 0.0.0.9000 released
@ -58,28 +62,31 @@ docx_describe_tbls(doc)
 docx_extract_tbl(doc, 1)
 #> Source: local data frame [3 x 4]
 #> 
-#>   This      Is     A   Column
-#> 1    1     Cat   3.4      Dog
-#> 2    3    Fish 100.3     Bird
-#> 3    5 Pelican   -99 Kangaroo
+#>    This      Is     A   Column
+#>   (chr)   (chr) (chr)    (chr)
+#> 1     1     Cat   3.4      Dog
+#> 2     3    Fish 100.3     Bird
+#> 3     5 Pelican   -99 Kangaroo

 docx_extract_tbl(doc)
 #> Source: local data frame [3 x 4]
 #> 
-#>   This      Is     A   Column
-#> 1    1     Cat   3.4      Dog
-#> 2    3    Fish 100.3     Bird
-#> 3    5 Pelican   -99 Kangaroo
+#>    This      Is     A   Column
+#>   (chr)   (chr) (chr)    (chr)
+#> 1     1     Cat   3.4      Dog
+#> 2     3    Fish 100.3     Bird
+#> 3     5 Pelican   -99 Kangaroo

 docx_extract_tbl(doc, header=FALSE)
 #> NOTE: header=FALSE but table has a marked header row in the Word document
 #> Source: local data frame [4 x 4]
 #> 
-#>     V1      V2    V3       V4
-#> 1 This      Is     A   Column
-#> 2    1     Cat   3.4      Dog
-#> 3    3    Fish 100.3     Bird
-#> 4    5 Pelican   -99 Kangaroo
+#>      V1      V2    V3       V4
+#>   (chr)   (chr) (chr)    (chr)
+#> 1  This      Is     A   Column
+#> 2     1     Cat   3.4      Dog
+#> 3     3    Fish 100.3     Bird
+#> 4     5 Pelican   -99 Kangaroo

 # url 

@ -107,6 +114,7 @@ docx_extract_tbl(budget, 1)
 #> Source: local data frame [5 x 4]
 #> 
 #>                                      Short-term Portfolio Long-term Portfolio Total Portfolio Values
+#>                                (chr)                (chr)               (chr)                  (chr)
 #> 1 Portfolio Balance (Market Value) *       $  123,651,911       $ 294,704,136          $ 418,356,047
 #> 2                    Effective Yield               0.16 %              1.42 %                 1.05 %
 #> 3             Avg. Weighted Maturity              11 Days           2.4 Years              1.7 Years
@ -117,6 +125,7 @@ docx_extract_tbl(budget, 2)
 #> Source: local data frame [3 x 7]
 #> 
 #>                        Amount of Funds (Market Value)  Maturity Effective Yield Interpolated Yield
+#>                  (chr)                          (chr)     (chr)           (chr)              (chr)
 #> 1 Short-Term Portfolio                  $ 123,651,911   11 days          0.16 %             0.01 %
 #> 2  Long-Term Portfolio                  $ 294,704,136 2.4 years          1.42 %             0.41 %
 #> 3      Total Portfolio                  $ 418,356,047 1.7 years          1.05 %             0.27 %
@ -152,13 +161,14 @@ docx_describe_tbls(doc3)
 docx_extract_tbl(doc3, 3)
 #> Source: local data frame [6 x 2]
 #> 
-#>   Foo Bar
-#> 1  Aa  Bb
-#> 2  Dd  Ee
-#> 3  Gg  Hh
-#> 4   1   2
-#> 5  Zz  Jj
-#> 6  Tt  ii
+#>     Foo   Bar
+#>   (chr) (chr)
+#> 1    Aa    Bb
+#> 2    Dd    Ee
+#> 3    Gg    Hh
+#> 4     1     2
+#> 5    Zz    Jj
+#> 6    Tt    ii

 # no tables
 none <- read_docx(system.file("examples/none.docx", package="docxtractr"))
@ -214,32 +224,99 @@ docx_describe_tbls(complx)
 docx_extract_tbl(complx, 3, header=TRUE)
 #> Source: local data frame [6 x 2]
 #> 
-#>   Foo Bar
-#> 1  Aa  Bb
-#> 2  Dd  Ee
-#> 3  Gg  Hh
-#> 4   1   2
-#> 5  Zz  Jj
-#> 6  Tt  ii
+#>     Foo   Bar
+#>   (chr) (chr)
+#> 1    Aa    Bb
+#> 2    Dd    Ee
+#> 3    Gg    Hh
+#> 4     1     2
+#> 5    Zz    Jj
+#> 6    Tt    ii

 docx_extract_tbl(complx, 4, header=TRUE)
 #> Source: local data frame [3 x 3]
 #> 
-#>   Foo  Bar Baz
-#> 1  Aa BbCc  NA
-#> 2  Dd   Ee  Ff
-#> 3  Gg   Hh  ii
+#>     Foo   Bar   Baz
+#>   (chr) (chr) (chr)
+#> 1    Aa  BbCc    NA
+#> 2    Dd    Ee    Ff
+#> 3    Gg    Hh    ii

 docx_extract_tbl(complx, 5, header=TRUE)
 #> Source: local data frame [6 x 3]
 #> 
-#>    Foo Bar Baz
-#> 1   Aa  Bb  Cc
-#> 2   Dd  Ee  Ff
-#> 3   Gg  Hh  Ii
-#> 4 Jj88  Kk  Ll
-#> 5       Uu  Ii
-#> 6   Hh  Ii   h
+#>     Foo   Bar   Baz
+#>   (chr) (chr) (chr)
+#> 1    Aa    Bb    Cc
+#> 2    Dd    Ee    Ff
+#> 3    Gg    Hh    Ii
+#> 4  Jj88    Kk    Ll
+#> 5          Uu    Ii
+#> 6    Hh    Ii     h
+
+# a "real" Word doc
+real_world <- read_docx(system.file("examples/realworld.docx", package="docxtractr"))
+
+docx_tbl_count(real_world)
+#> [1] 8
+
+# get all the tables
+tbls <- docx_extract_all(real_world)
+
+# see table 1
+tbls[[1]]
+#> Source: local data frame [9 x 9]
+#> 
+#>                  V1        V2         V3                     V4                     V5
+#>               (chr)     (chr)      (chr)                  (chr)                  (chr)
+#> 1 Lesson 1:  Step 1        NA         NA                     NA                     NA
+#> 2           Country Birthrate Death Rate Population Growth 2005 Population Growth 2050
+#> 3               USA      2.06      0.51%                  0.92%                 -0.06%
+#> 4             China      1.62       0.3%                   0.6%                 -0.58%
+#> 5             Egypt      2.83      0.41%                   2.0%                  1.32%
+#> 6             India      2.35      0.34%                  1.56%                  0.76%
+#> 7             Italy      1.28      0.72%                  0.35%                 -1.33%
+#> 8            Mexico      2.43      0.25%                  1.41%                  0.96%
+#> 9           Nigeria      4.78      0.26%                  2.46%                  3.58%
+#> Variables not shown: V6 (chr), V7 (chr), V8 (chr), V9 (chr)
+
+# make table 1 better
+assign_colnames(tbls[[1]], 2)
+#>   Country Birthrate Death Rate Population Growth 2005 Population Growth 2050        Relative place in Transition
+#> 1     USA      2.06      0.51%                  0.92%                 -0.06%                    Post- Industrial
+#> 2   China      1.62       0.3%                   0.6%                 -0.58%                    Post- Industrial
+#> 3   Egypt      2.83      0.41%                   2.0%                  1.32%                   Mature Industrial
+#> 4   India      2.35      0.34%                  1.56%                  0.76%                     Post Industrial
+#> 5   Italy      1.28      0.72%                  0.35%                 -1.33%                Late Post industrial
+#> 6  Mexico      2.43      0.25%                  1.41%                  0.96%                   Mature Industrial
+#> 7 Nigeria      4.78      0.26%                  2.46%                  3.58% End of Mechanization of Agriculture
+#>          Social Factors 1     Social Factors 2                Social Factors 3
+#> 1     Female Independence    Stable Birth Rate                 Good technology
+#> 2 Government intervention           Technology                    Urbanization
+#> 3  Not yet industrialized More children needed Slightly higher life expectancy
+#> 4         Economic growth              Poverty    Becoming more industrialized
+#> 5       Stable birth rate   People marry later              Better health care
+#> 6      Better health care           Emigration                 Economic growth
+#> 7                 Disease   People marry early       People have many children
+
+# see table 5
+tbls[[5]]
+#> Source: local data frame [5 x 6]
+#> 
+#>                  V1      V2            V3        V4        V5       V6
+#>               (chr)   (chr)         (chr)     (chr)     (chr)    (chr)
+#> 1 Lesson 2:  Step 1      NA            NA        NA        NA       NA
+#> 2           Nigeria Default    Prediction + 5 years +15 years -5 years
+#> 3        Birth rate    4.78     Goes Down      4.76      4.72     4.79
+#> 4        Death rate   0.36% Stay the Same     0.42%     0.52%     0.3%
+#> 5 Population growth   3.58%     Goes Down     3.02%     2.32%    4.38%
+
+# make table 5 better
+assign_colnames(tbls[[5]], 2)
+#>             Nigeria Default    Prediction + 5 years +15 years -5 years
+#> 1        Birth rate    4.78     Goes Down      4.76      4.72     4.79
+#> 2        Death rate   0.36% Stay the Same     0.42%     0.52%     0.3%
+#> 3 Population growth   3.58%     Goes Down     3.02%     2.32%    4.38%
 ```

 ### Test Results
@ -249,7 +326,7 @@ library(docxtractr)
 library(testthat)

 date()
-#> [1] "Mon Aug 24 19:59:01 2015"
+#> [1] "Tue Aug 25 23:25:22 2015"

 test_dir("tests/")
 #> testthat results ========================================================================================================
--- a/inst/examples/realworld.docx
+++ b/inst/examples/realworld.docx
--- a/man/assign_colnames.Rd
+++ b/man/assign_colnames.Rd
@ -0,0 +1,51 @@
+% Generated by roxygen2 (4.1.1): do not edit by hand
+% Please edit documentation in R/assign_colnames.r
+\name{assign_colnames}
+\alias{assign_colnames}
+\title{Make a specific row the column names for the specified data.frame}
+\usage{
+assign_colnames(dat, row, remove = TRUE, remove_previous = remove)
+}
+\arguments{
+\item{dat}{can be any \code{data.frame} but is intended for use with
+ones returned by this package}
+
+\item{row}{numeric value indicating the row number that is to become
+the column names}
+
+\item{remove}{remove row specified by \code{row} after making it
+the column names? (Default: \code{TRUE})}
+
+\item{remove_previous}{remove any rows preceding \code{row}? (Default:
+\code{TRUE} but will be assigned whatever is given for
+\code{remove}).}
+}
+\value{
+\code{data.frame}
+}
+\description{
+Many tables in Word documents are in twisted formats where there may be
+labels or other oddities mixed in that make it difficult to work with the
+underlying data. This function makes it easy to identify a particular row
+in a scraped \code{data.frame} as the one containing column names and
+have it become the column names, removing it and (optionally) all of the
+rows before it (since that's usually what needs to be done).
+}
+\examples{
+# a "real" Word doc
+real_world <- read_docx(system.file("examples/realworld.docx", package="docxtractr"))
+docx_tbl_count(real_world)
+
+# get all the tables
+tbls <- docx_extract_all(real_world)
+
+# make table 1 better
+assign_colnames(tbls[[1]], 2)
+
+# make table 5 better
+assign_colnames(tbls[[5]], 2)
+}
+\seealso{
+\code{\link{docx_extract_all}}, \code{\link{docx_extract_tbl}}
+}
+
--- a/man/docx_extract_all.Rd
+++ b/man/docx_extract_all.Rd
@ -0,0 +1,36 @@
+% Generated by roxygen2 (4.1.1): do not edit by hand
+% Please edit documentation in R/extract_all.r
+\name{docx_extract_all}
+\alias{docx_extract_all}
+\title{Extract all tables from a Word document}
+\usage{
+docx_extract_all(docx, guess_header = TRUE, trim = TRUE)
+}
+\arguments{
+\item{docx}{\code{docx} object read with \code{read_docx}}
+
+\item{guess_header}{should the function make a guess as to the existence of
+a header in a table? (Default: \code{TRUE})}
+
+\item{trim}{trim leading/trailing whitespace (if any) in cells? (default: \code{TRUE})}
+}
+\value{
+\code{list} of \code{data.frame}s or an empty \code{list} if no
+        tables exist in \code{docx}
+}
+\description{
+Extracts every table in \code{docx} with \code{\link{docx_extract_tbl}},
+optionally guessing whether each table has a header row.
+}
+\examples{
+# a "real" Word doc
+
+real_world <- read_docx(system.file("examples/realworld.docx", package="docxtractr"))
+docx_tbl_count(real_world)
+
+# get all the tables
+tbls <- docx_extract_all(real_world)
+}
+\seealso{
+\code{\link{assign_colnames}}, \code{\link{docx_extract_tbl}}
+}
+
--- a/man/docx_extract_tbl.Rd
+++ b/man/docx_extract_tbl.Rd
@ -27,4 +27,8 @@ desired) extract the contents of the table to a \code{data.frame}.
 doc3 <- read_docx(system.file("examples/data3.docx", package="docxtractr"))
 docx_extract_tbl(doc3, 3)
 }
+\seealso{
+\code{\link{docx_extract_all}},
+         \code{\link{assign_colnames}}
+}