Browse Source

0.1.0.9000 - two new functions

pull/4/head
Bob Rudis 9 years ago
parent
commit
761ecad7ff
  1. 4
      DESCRIPTION
  2. 2
      NAMESPACE
  3. 52
      R/assign_colnames.r
  4. 2
      R/docx_find_tbls.r
  5. 38
      R/extract_all.r
  6. 24
      README.Rmd
  7. 157
      README.md
  8. BIN
      inst/examples/realworld.docx
  9. 51
      man/assign_colnames.Rd
  10. 36
      man/docx_extract_all.Rd
  11. 4
      man/docx_extract_tbl.Rd

4
DESCRIPTION

@ -1,11 +1,11 @@
Package: docxtractr
Title: Extract Tables from Microsoft Word Documents
Version: 0.0.1.9001
Version: 0.1.0.9000
Authors@R: c(person("Bob", "Rudis", email = "bob@rudis.net", role = c("aut", "cre")))
Description: Microsoft Word docx files provide an XML structure that is fairly
straightforward to navigate, especially when it applies to Word tables. The
docxtractr package provides tools to determine table count, table structure and
extract tables from Microsoft Word docx documents.
extract + clean tables from Microsoft Word docx documents.
Depends: R (>= 3.0.0)
License: MIT + file LICENSE
LazyData: true

2
NAMESPACE

@ -1,7 +1,9 @@
# Generated by roxygen2 (4.1.1): do not edit by hand
S3method(print,docx)
export(assign_colnames)
export(docx_describe_tbls)
export(docx_extract_all)
export(docx_extract_tbl)
export(docx_tbl_count)
export(read_docx)

52
R/assign_colnames.r

@ -0,0 +1,52 @@
#' Make a specific row the column names for the specified data.frame
#'
#' Many tables in Word documents are in twisted formats where there may be
#' labels or other oddities mixed in that make it difficult to work with the
#' underlying data. This function makes it easy to identify a particular row
#' in a scraped \code{data.frame} as the one containing column names and
#' have it become the column names, removing it and (optionally) all of the
#' rows before it (since that's usually what needs to be done).
#'
#' If \code{row} is smaller than 1 or larger than the number of rows in
#' \code{dat}, \code{dat} is returned unchanged.
#'
#' @param dat can be any \code{data.frame} but is intended for use with
#'        ones returned by this package
#' @param row numeric value indicating the row number that is to become
#'        the column names
#' @param remove remove row specified by \code{row} after making it
#'        the column names? (Default: \code{TRUE})
#' @param remove_previous remove any rows preceding \code{row}? (Default:
#'        whatever is given for \code{remove}, i.e. \code{TRUE} unless
#'        \code{remove} is changed).
#' @return \code{data.frame} with the values of \code{row} as column names,
#'         the requested rows removed and row names reset
#' @seealso \code{\link{docx_extract_all}}, \code{\link{docx_extract_tbl}}
#' @export
#' @examples
#' # a "real" Word doc
#' real_world <- read_docx(system.file("examples/realworld.docx", package="docxtractr"))
#' docx_tbl_count(real_world)
#'
#' # get all the tables
#' tbls <- docx_extract_all(real_world)
#'
#' # make table 1 better
#' assign_colnames(tbls[[1]], 2)
#'
#' # make table 5 better
#' assign_colnames(tbls[[5]], 2)
assign_colnames <- function(dat, row, remove=TRUE, remove_previous=remove) {

  # an out-of-range row cannot supply names; hand the input back untouched
  if ((row > nrow(dat)) || (row < 1)) return(dat)

  # just in case someone shoots us a data.table or other stranger things
  dat <- data.frame(dat, stringsAsFactors=FALSE)

  # flatten the chosen row into a plain character vector (one label per
  # column); converting cell by cell yields factor labels, not integer codes
  colnames(dat) <- vapply(dat[row, , drop=FALSE], as.character, character(1),
                          USE.NAMES=FALSE)

  # build the set of rows to drop from the two independent switches
  drop_rows <- integer(0)
  if (remove_previous) drop_rows <- seq_len(row - 1)
  if (remove) drop_rows <- c(drop_rows, row)

  # negative indexing with an empty vector would select zero rows, so only
  # subset when there is something to drop; drop=FALSE keeps a one-column
  # table a data.frame instead of collapsing it to a vector
  if (length(drop_rows) > 0) dat <- dat[-drop_rows, , drop=FALSE]

  rownames(dat) <- NULL

  dat

}

2
R/docx_find_tbls.r

@ -9,6 +9,8 @@
#' @param header assume first row of table is a header row? (default; \code{TRUE})
#' @param trim trim leading/trailing whitespace (if any) in cells? (default: \code{TRUE})
#' @return \code{data.frame}
#' @seealso \code{\link{docx_extract_all}}, \code{\link{docx_extract_tbl}},
#' \code{\link{assign_colnames}}
#' @export
#' @examples
#' doc3 <- read_docx(system.file("examples/data3.docx", package="docxtractr"))

38
R/extract_all.r

@ -0,0 +1,38 @@
#' Extract all tables from a Word document
#'
#' Walks over every table in \code{docx} and extracts each one with
#' \code{\link{docx_extract_tbl}}. When \code{guess_header} is \code{TRUE}
#' each table is inspected individually to decide whether its first row
#' should be treated as a header; otherwise no table is given a header.
#'
#' @param docx \code{docx} object read with \code{read_docx}
#' @param guess_header should the function make a guess as to the existence of
#'        a header in a table? (Default: \code{TRUE})
#' @param trim trim leading/trailing whitespace (if any) in cells? (default: \code{TRUE})
#' @return \code{list} of \code{data.frame}s or an empty \code{list} if no
#'         tables exist in \code{docx}
#' @seealso \code{\link{assign_colnames}}, \code{\link{docx_extract_tbl}}
#' @export
#' @examples
#' # a "real" Word doc
#'
#' real_world <- read_docx(system.file("examples/realworld.docx", package="docxtractr"))
#' docx_tbl_count(real_world)
#'
#' # get all the tables
#' tbls <- docx_extract_all(real_world)
docx_extract_all <- function(docx, guess_header=TRUE, trim=TRUE) {

  ensure_docx(docx)

  # nothing to extract: return an empty list rather than iterating
  n_tbls <- docx_tbl_count(docx)
  if (n_tbls < 1) return(list())

  ns <- docx$ns

  # extract a single table, deciding per table whether to use a header row
  extract_one <- function(idx) {

    use_header <- FALSE

    if (guess_header) {
      tbl_node <- docx$tbls[[idx]]
      tbl_rows <- xml_find_all(tbl_node, "./w:tr", ns=ns)
      # a non-NA result from has_header() is taken as "header present"
      use_header <- !is.na(has_header(tbl_node, tbl_rows, ns))
    }

    docx_extract_tbl(docx, idx, use_header, trim)

  }

  lapply(seq_len(n_tbls), extract_one)

}

24
README.Rmd

@ -28,7 +28,9 @@ The following functions are implemented:
- `read_docx`: Read in a Word document for table extraction
- `docx_describe_tbls`: Returns a description of all the tables in the Word document
- `docx_extract_tbl`: Extract a table from a Word document
- `docx_extract_all`: Extract all tables from a Word document
- `docx_tbl_count`: Get number of tables in a Word document
- `assign_colnames`: Make a specific row the column names for the specified data.frame
The following data files are included:
@ -36,9 +38,11 @@ The following data file are included:
- `system.file("examples/data3.docx", package="docxtractr")`: Word docx with 3 tables
- `system.file("examples/none.docx", package="docxtractr")`: Word docx with 0 tables
- `system.file("examples/complex.docx", package="docxtractr")`: Word docx with non-uniform tables
- `system.file("examples/realworld.docx", package="docxtractr")`: A "real world" Word docx file with tables of all shapes and sizes
### News
- Version 0.1.0.9000 released - new function to extract all tables and a function to cleanup column names in scraped tables
- Version 0.0.1.9001 released - pre-CRAN flight check
- Version 0.0.1.9000 released - read from URL
- Version 0.0.0.9000 released
@ -117,6 +121,26 @@ docx_extract_tbl(complx, 3, header=TRUE)
docx_extract_tbl(complx, 4, header=TRUE)
docx_extract_tbl(complx, 5, header=TRUE)
# a "real" Word doc
real_world <- read_docx(system.file("examples/realworld.docx", package="docxtractr"))
docx_tbl_count(real_world)
# get all the tables
tbls <- docx_extract_all(real_world)
# see table 1
tbls[[1]]
# make table 1 better
assign_colnames(tbls[[1]], 2)
# see table 5
tbls[[5]]
# make table 5 better
assign_colnames(tbls[[5]], 2)
```
### Test Results

157
README.md

@ -1,7 +1,7 @@
<!-- README.md is generated from README.Rmd. Please edit that file -->
![](docxtractr-logo.png)
docxtractr is an R pacakge for extracting tables out of Word documents (docx)
docxtractr is an R package for extracting tables out of Word documents (docx)
Microsoft Word docx files provide an XML structure that is fairly straightforward to navigate, especially when it applies to Word tables. The docxtractr package provides tools to determine table count, table structure and extract tables from Microsoft Word docx documents.
@ -10,7 +10,9 @@ The following functions are implemented:
- `read_docx`: Read in a Word document for table extraction
- `docx_describe_tbls`: Returns a description of all the tables in the Word document
- `docx_extract_tbl`: Extract a table from a Word document
- `docx_extract_all`: Extract all tables from a Word document
- `docx_tbl_count`: Get number of tables in a Word document
- `assign_colnames`: Make a specific row the column names for the specified data.frame
The following data files are included:
@ -18,9 +20,11 @@ The following data file are included:
- `system.file("examples/data3.docx", package="docxtractr")`: Word docx with 3 tables
- `system.file("examples/none.docx", package="docxtractr")`: Word docx with 0 tables
- `system.file("examples/complex.docx", package="docxtractr")`: Word docx with non-uniform tables
- `system.file("examples/realworld.docx", package="docxtractr")`: A "real world" Word docx file with tables of all shapes and sizes
### News
- Version 0.1.0.9000 released - new function to extract all tables and a function to cleanup column names in scraped tables
- Version 0.0.1.9001 released - pre-CRAN flight check
- Version 0.0.1.9000 released - read from URL
- Version 0.0.0.9000 released
@ -58,28 +62,31 @@ docx_describe_tbls(doc)
docx_extract_tbl(doc, 1)
#> Source: local data frame [3 x 4]
#>
#> This Is A Column
#> 1 1 Cat 3.4 Dog
#> 2 3 Fish 100.3 Bird
#> 3 5 Pelican -99 Kangaroo
#> This Is A Column
#> (chr) (chr) (chr) (chr)
#> 1 1 Cat 3.4 Dog
#> 2 3 Fish 100.3 Bird
#> 3 5 Pelican -99 Kangaroo
docx_extract_tbl(doc)
#> Source: local data frame [3 x 4]
#>
#> This Is A Column
#> 1 1 Cat 3.4 Dog
#> 2 3 Fish 100.3 Bird
#> 3 5 Pelican -99 Kangaroo
#> This Is A Column
#> (chr) (chr) (chr) (chr)
#> 1 1 Cat 3.4 Dog
#> 2 3 Fish 100.3 Bird
#> 3 5 Pelican -99 Kangaroo
docx_extract_tbl(doc, header=FALSE)
#> NOTE: header=FALSE but table has a marked header row in the Word document
#> Source: local data frame [4 x 4]
#>
#> V1 V2 V3 V4
#> 1 This Is A Column
#> 2 1 Cat 3.4 Dog
#> 3 3 Fish 100.3 Bird
#> 4 5 Pelican -99 Kangaroo
#> V1 V2 V3 V4
#> (chr) (chr) (chr) (chr)
#> 1 This Is A Column
#> 2 1 Cat 3.4 Dog
#> 3 3 Fish 100.3 Bird
#> 4 5 Pelican -99 Kangaroo
# url
@ -107,6 +114,7 @@ docx_extract_tbl(budget, 1)
#> Source: local data frame [5 x 4]
#>
#> Short-term Portfolio Long-term Portfolio Total Portfolio Values
#> (chr) (chr) (chr) (chr)
#> 1 Portfolio Balance (Market Value) * $ 123,651,911 $ 294,704,136 $ 418,356,047
#> 2 Effective Yield 0.16 % 1.42 % 1.05 %
#> 3 Avg. Weighted Maturity 11 Days 2.4 Years 1.7 Years
@ -117,6 +125,7 @@ docx_extract_tbl(budget, 2)
#> Source: local data frame [3 x 7]
#>
#> Amount of Funds (Market Value) Maturity Effective Yield Interpolated Yield
#> (chr) (chr) (chr) (chr) (chr)
#> 1 Short-Term Portfolio $ 123,651,911 11 days 0.16 % 0.01 %
#> 2 Long-Term Portfolio $ 294,704,136 2.4 years 1.42 % 0.41 %
#> 3 Total Portfolio $ 418,356,047 1.7 years 1.05 % 0.27 %
@ -152,13 +161,14 @@ docx_describe_tbls(doc3)
docx_extract_tbl(doc3, 3)
#> Source: local data frame [6 x 2]
#>
#> Foo Bar
#> 1 Aa Bb
#> 2 Dd Ee
#> 3 Gg Hh
#> 4 1 2
#> 5 Zz Jj
#> 6 Tt ii
#> Foo Bar
#> (chr) (chr)
#> 1 Aa Bb
#> 2 Dd Ee
#> 3 Gg Hh
#> 4 1 2
#> 5 Zz Jj
#> 6 Tt ii
# no tables
none <- read_docx(system.file("examples/none.docx", package="docxtractr"))
@ -214,32 +224,99 @@ docx_describe_tbls(complx)
docx_extract_tbl(complx, 3, header=TRUE)
#> Source: local data frame [6 x 2]
#>
#> Foo Bar
#> 1 Aa Bb
#> 2 Dd Ee
#> 3 Gg Hh
#> 4 1 2
#> 5 Zz Jj
#> 6 Tt ii
#> Foo Bar
#> (chr) (chr)
#> 1 Aa Bb
#> 2 Dd Ee
#> 3 Gg Hh
#> 4 1 2
#> 5 Zz Jj
#> 6 Tt ii
docx_extract_tbl(complx, 4, header=TRUE)
#> Source: local data frame [3 x 3]
#>
#> Foo Bar Baz
#> 1 Aa BbCc NA
#> 2 Dd Ee Ff
#> 3 Gg Hh ii
#> Foo Bar Baz
#> (chr) (chr) (chr)
#> 1 Aa BbCc NA
#> 2 Dd Ee Ff
#> 3 Gg Hh ii
docx_extract_tbl(complx, 5, header=TRUE)
#> Source: local data frame [6 x 3]
#>
#> Foo Bar Baz
#> 1 Aa Bb Cc
#> 2 Dd Ee Ff
#> 3 Gg Hh Ii
#> 4 Jj88 Kk Ll
#> 5 Uu Ii
#> 6 Hh Ii h
#> Foo Bar Baz
#> (chr) (chr) (chr)
#> 1 Aa Bb Cc
#> 2 Dd Ee Ff
#> 3 Gg Hh Ii
#> 4 Jj88 Kk Ll
#> 5 Uu Ii
#> 6 Hh Ii h
# a "real" Word doc
real_world <- read_docx(system.file("examples/realworld.docx", package="docxtractr"))
docx_tbl_count(real_world)
#> [1] 8
# get all the tables
tbls <- docx_extract_all(real_world)
# see table 1
tbls[[1]]
#> Source: local data frame [9 x 9]
#>
#> V1 V2 V3 V4 V5
#> (chr) (chr) (chr) (chr) (chr)
#> 1 Lesson 1: Step 1 NA NA NA NA
#> 2 Country Birthrate Death Rate Population Growth 2005 Population Growth 2050
#> 3 USA 2.06 0.51% 0.92% -0.06%
#> 4 China 1.62 0.3% 0.6% -0.58%
#> 5 Egypt 2.83 0.41% 2.0% 1.32%
#> 6 India 2.35 0.34% 1.56% 0.76%
#> 7 Italy 1.28 0.72% 0.35% -1.33%
#> 8 Mexico 2.43 0.25% 1.41% 0.96%
#> 9 Nigeria 4.78 0.26% 2.46% 3.58%
#> Variables not shown: V6 (chr), V7 (chr), V8 (chr), V9 (chr)
# make table 1 better
assign_colnames(tbls[[1]], 2)
#> Country Birthrate Death Rate Population Growth 2005 Population Growth 2050 Relative place in Transition
#> 1 USA 2.06 0.51% 0.92% -0.06% Post- Industrial
#> 2 China 1.62 0.3% 0.6% -0.58% Post- Industrial
#> 3 Egypt 2.83 0.41% 2.0% 1.32% Mature Industrial
#> 4 India 2.35 0.34% 1.56% 0.76% Post Industrial
#> 5 Italy 1.28 0.72% 0.35% -1.33% Late Post industrial
#> 6 Mexico 2.43 0.25% 1.41% 0.96% Mature Industrial
#> 7 Nigeria 4.78 0.26% 2.46% 3.58% End of Mechanization of Agriculture
#> Social Factors 1 Social Factors 2 Social Factors 3
#> 1 Female Independence Stable Birth Rate Good technology
#> 2 Government intervention Technology Urbanization
#> 3 Not yet industrialized More children needed Slightly higher life expectancy
#> 4 Economic growth Poverty Becoming more industrialized
#> 5 Stable birth rate People marry later Better health care
#> 6 Better health care Emigration Economic growth
#> 7 Disease People marry early People have many children
# see table 5
tbls[[5]]
#> Source: local data frame [5 x 6]
#>
#> V1 V2 V3 V4 V5 V6
#> (chr) (chr) (chr) (chr) (chr) (chr)
#> 1 Lesson 2: Step 1 NA NA NA NA NA
#> 2 Nigeria Default Prediction + 5 years +15 years -5 years
#> 3 Birth rate 4.78 Goes Down 4.76 4.72 4.79
#> 4 Death rate 0.36% Stay the Same 0.42% 0.52% 0.3%
#> 5 Population growth 3.58% Goes Down 3.02% 2.32% 4.38%
# make table 5 better
assign_colnames(tbls[[5]], 2)
#> Nigeria Default Prediction + 5 years +15 years -5 years
#> 1 Birth rate 4.78 Goes Down 4.76 4.72 4.79
#> 2 Death rate 0.36% Stay the Same 0.42% 0.52% 0.3%
#> 3 Population growth 3.58% Goes Down 3.02% 2.32% 4.38%
```
### Test Results
@ -249,7 +326,7 @@ library(docxtractr)
library(testthat)
date()
#> [1] "Mon Aug 24 19:59:01 2015"
#> [1] "Tue Aug 25 23:25:22 2015"
test_dir("tests/")
#> testthat results ========================================================================================================

BIN
inst/examples/realworld.docx

Binary file not shown.

51
man/assign_colnames.Rd

@ -0,0 +1,51 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/assign_colnames.r
\name{assign_colnames}
\alias{assign_colnames}
\title{Make a specific row the column names for the specified data.frame}
\usage{
assign_colnames(dat, row, remove = TRUE, remove_previous = remove)
}
\arguments{
\item{dat}{can be any \code{data.frame} but is intended for use with
ones returned by this package}
\item{row}{numeric value indicating the row number that is to become
the column names}
\item{remove}{remove row specified by \code{row} after making it
the column names? (Default: \code{TRUE})}
\item{remove_previous}{remove any rows preceding \code{row}? (Default:
\code{TRUE} but will be assigned whatever is given for
\code{remove}).}
}
\value{
\code{data.frame}
}
\description{
Many tables in Word documents are in twisted formats where there may be
labels or other oddities mixed in that make it difficult to work with the
underlying data. This function makes it easy to identify a particular row
in a scraped \code{data.frame} as the one containing column names and
have it become the column names, removing it and (optionally) all of the
rows before it (since that's usually what needs to be done).
}
\examples{
# a "real" Word doc
real_world <- read_docx(system.file("examples/realworld.docx", package="docxtractr"))
docx_tbl_count(real_world)
# get all the tables
tbls <- docx_extract_all(real_world)
# make table 1 better
assign_colnames(tbls[[1]], 2)
# make table 5 better
assign_colnames(tbls[[5]], 2)
}
\seealso{
\code{\link{docx_extract_all}}, \code{\link{docx_extract_tbl}}
}

36
man/docx_extract_all.Rd

@ -0,0 +1,36 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/extract_all.r
\name{docx_extract_all}
\alias{docx_extract_all}
\title{Extract all tables from a Word document}
\usage{
docx_extract_all(docx, guess_header = TRUE, trim = TRUE)
}
\arguments{
\item{docx}{\code{docx} object read with \code{read_docx}}
\item{guess_header}{should the function make a guess as to the existence of
a header in a table? (Default: \code{TRUE})}
\item{trim}{trim leading/trailing whitespace (if any) in cells? (default: \code{TRUE})}
}
\value{
\code{list} of \code{data.frame}s or an empty \code{list} if no
tables exist in \code{docx}
}
\description{
Extracts every table found in \code{docx}, optionally guessing for each
table whether its first row is a header row.
}
\examples{
# a "real" Word doc
real_world <- read_docx(system.file("examples/realworld.docx", package="docxtractr"))
docx_tbl_count(real_world)
# get all the tables
tbls <- docx_extract_all(real_world)
}
\seealso{
\code{\link{assign_colnames}}, \code{\link{docx_extract_tbl}}
}

4
man/docx_extract_tbl.Rd

@ -27,4 +27,8 @@ desired) extract the contents of the table to a \code{data.frame}.
doc3 <- read_docx(system.file("examples/data3.docx", package="docxtractr"))
docx_extract_tbl(doc3, 3)
}
\seealso{
\code{\link{docx_extract_all}}, \code{\link{docx_extract_tbl}},
\code{\link{assign_colnames}}
}

Loading…
Cancel
Save