Bob Rudis
9 years ago
11 changed files with 328 additions and 42 deletions
@ -0,0 +1,52 @@ |
|||
#' Make a specific row the column names for the specified data.frame
#'
#' Many tables in Word documents are in twisted formats where there may be
#' labels or other oddities mixed in that make it difficult to work with the
#' underlying data. This function makes it easy to identify a particular row
#' in a scraped \code{data.frame} as the one containing column names and
#' have it become the column names, removing it and (optionally) all of the
#' rows before it (since that's usually what needs to be done).
#'
#' If \code{row} is smaller than 1 or larger than the number of rows in
#' \code{dat}, \code{dat} is returned unchanged.
#'
#' @param dat can be any \code{data.frame} but is intended for use with
#'        ones returned by this package
#' @param row numeric value indicating the row number that is to become
#'        the column names
#' @param remove remove row specified by \code{row} after making it
#'        the column names? (Default: \code{TRUE})
#' @param remove_previous remove any rows preceding \code{row}? (Default:
#'        \code{TRUE} but will be assigned whatever is given for
#'        \code{remove}).
#' @return \code{data.frame} with the new column names, the requested rows
#'         removed and row names reset
#' @seealso \code{\link{docx_extract_all}}, \code{\link{docx_extract_tbl}}
#' @export
#' @examples
#' # a "real" Word doc
#' real_world <- read_docx(system.file("examples/realworld.docx", package="docxtractr"))
#' docx_tbl_count(real_world)
#'
#' # get all the tables
#' tbls <- docx_extract_all(real_world)
#'
#' # make table 1 better
#' assign_colnames(tbls[[1]], 2)
#'
#' # make table 5 better
#' assign_colnames(tbls[[5]], 2)
assign_colnames <- function(dat, row, remove=TRUE, remove_previous=remove) {

  # an out-of-range row cannot supply names, so hand the input back untouched
  if ((row > nrow(dat)) || (row < 1)) return(dat)

  # just in case someone shoots us a data.table or other stranger things
  dat <- data.frame(dat, stringsAsFactors=FALSE)

  # convert cell by cell so factor cells contribute their labels rather than
  # their integer codes
  new_names <- vapply(dat[row, , drop=FALSE], as.character, character(1),
                      USE.NAMES=FALSE)
  # keep missing header cells as the literal string "NA" (never a missing name)
  new_names[is.na(new_names)] <- "NA"
  colnames(dat) <- new_names

  # collect the rows to discard: everything before `row` and/or `row` itself
  drop_rows <- integer(0)
  if (remove_previous) drop_rows <- seq_len(row - 1)
  if (remove) drop_rows <- c(drop_rows, row)

  # negative indexing with an empty vector would select zero rows, so only
  # subset when there is something to drop; drop=FALSE keeps one-column
  # tables as data.frames
  if (length(drop_rows) > 0) dat <- dat[-drop_rows, , drop=FALSE]
  rownames(dat) <- NULL

  dat

}
@ -0,0 +1,38 @@ |
|||
#' Extract all tables from a Word document
#'
#' Every table in \code{docx} is extracted with \code{docx_extract_tbl}. When
#' \code{guess_header} is \code{TRUE}, a per-table guess about the presence of
#' a header is made first and handed to the extraction.
#'
#' @param docx \code{docx} object read with \code{read_docx}
#' @param guess_header should the function make a guess as to the existence of
#'        a header in a table? (Default: \code{TRUE})
#' @param trim trim leading/trailing whitespace (if any) in cells? (default: \code{TRUE})
#' @return \code{list} of \code{data.frame}s or an empty \code{list} if no
#'         tables exist in \code{docx}
#' @seealso \code{\link{assign_colnames}}, \code{\link{docx_extract_tbl}}
#' @export
#' @examples
#' # a "real" Word doc
#'
#' real_world <- read_docx(system.file("examples/realworld.docx", package="docxtractr"))
#' docx_tbl_count(real_world)
#'
#' # get all the tables
#' tbls <- docx_extract_all(real_world)
docx_extract_all <- function(docx, guess_header=TRUE, trim=TRUE) {

  ensure_docx(docx)

  # nothing to extract: hand back an empty list
  n_tbls <- docx_tbl_count(docx)
  if (n_tbls < 1) return(list())

  ns <- docx$ns

  # extract table number `idx`, guessing its header first when requested
  extract_one <- function(idx) {

    if (!guess_header) return(docx_extract_tbl(docx, idx, FALSE, trim))

    tbl_node <- docx$tbls[[idx]]
    tbl_rows <- xml_find_all(tbl_node, "./w:tr", ns=ns)

    # any non-NA result from has_header() is treated as "a header was found"
    found_header <- !is.na(has_header(tbl_node, tbl_rows, ns))

    docx_extract_tbl(docx, idx, found_header, trim)

  }

  lapply(seq_len(n_tbls), extract_one)

}
Binary file not shown.
@ -0,0 +1,51 @@ |
|||
% Generated by roxygen2 (4.1.1): do not edit by hand |
|||
% Please edit documentation in R/assign_colnames.r |
|||
\name{assign_colnames} |
|||
\alias{assign_colnames} |
|||
\title{Make a specific row the column names for the specified data.frame} |
|||
\usage{ |
|||
assign_colnames(dat, row, remove = TRUE, remove_previous = remove) |
|||
} |
|||
\arguments{ |
|||
\item{dat}{can be any \code{data.frame} but is intended for use with |
|||
ones returned by this package} |
|||
|
|||
\item{row}{numeric value indicating the row number that is to become |
|||
the column names} |
|||
|
|||
\item{remove}{remove row specified by \code{row} after making it |
|||
the column names? (Default: \code{TRUE})} |
|||
|
|||
\item{remove_previous}{remove any rows preceding \code{row}? (Default: |
|||
\code{TRUE} but will be assigned whatever is given for |
|||
\code{remove}).} |
|||
} |
|||
\value{ |
|||
\code{data.frame} |
|||
} |
|||
\description{ |
|||
Many tables in Word documents are in twisted formats where there may be |
|||
labels or other oddities mixed in that make it difficult to work with the |
|||
underlying data. This function makes it easy to identify a particular row |
|||
in a scraped \code{data.frame} as the one containing column names and |
|||
have it become the column names, removing it and (optionally) all of the |
|||
rows before it (since that's usually what needs to be done). |
|||
} |
|||
\examples{ |
|||
# a "real" Word doc |
|||
real_world <- read_docx(system.file("examples/realworld.docx", package="docxtractr")) |
|||
docx_tbl_count(real_world) |
|||
|
|||
# get all the tables |
|||
tbls <- docx_extract_all(real_world) |
|||
|
|||
# make table 1 better |
|||
assign_colnames(tbls[[1]], 2) |
|||
|
|||
# make table 5 better |
|||
assign_colnames(tbls[[5]], 2) |
|||
} |
|||
\seealso{ |
|||
\code{\link{docx_extract_all}}, \code{\link{docx_extract_tbl}} |
|||
} |
|||
|
@ -0,0 +1,36 @@ |
|||
% Generated by roxygen2 (4.1.1): do not edit by hand |
|||
% Please edit documentation in R/extract_all.r |
|||
\name{docx_extract_all} |
|||
\alias{docx_extract_all} |
|||
\title{Extract all tables from a Word document} |
|||
\usage{ |
|||
docx_extract_all(docx, guess_header = TRUE, trim = TRUE) |
|||
} |
|||
\arguments{ |
|||
\item{docx}{\code{docx} object read with \code{read_docx}} |
|||
|
|||
\item{guess_header}{should the function make a guess as to the existence of |
|||
a header in a table? (Default: \code{TRUE})} |
|||
|
|||
\item{trim}{trim leading/trailing whitespace (if any) in cells? (default: \code{TRUE})} |
|||
} |
|||
\value{ |
|||
\code{list} of \code{data.frame}s or an empty \code{list} if no |
|||
tables exist in \code{docx} |
|||
} |
|||
\description{ |
|||
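This function makes no assumptions about an individual table's layout: every table in the document is extracted and, optionally, a guess is made as to whether each one has a header. |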
|||
} |
|||
\examples{ |
|||
# a "real" Word doc |
|||
|
|||
real_world <- read_docx(system.file("examples/realworld.docx", package="docxtractr")) |
|||
docx_tbl_count(real_world) |
|||
|
|||
# get all the tables |
|||
tbls <- docx_extract_all(real_world) |
|||
} |
|||
\seealso{ |
|||
\code{\link{assign_colnames}}, \code{\link{docx_extract_tbl}} |
|||
} |
|||
|
Loading…
Reference in new issue