diff --git a/.Rbuildignore b/.Rbuildignore index 9a37f74..ae879f1 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -7,3 +7,4 @@ ^NOTES\.*html$ ^tools$ ^cran-comments\.md$ +^builder$ diff --git a/DESCRIPTION b/DESCRIPTION index 7834441..6823edf 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,15 +3,19 @@ Type: Package Title: Retrieve 'Magic' Attributes from Files and Directories Version: 0.2.0 Date: 2016-08-14 -Author: Bob Rudis (@hrbrmstr), Christos Zoulas [libmagic], Mans Rullgard [file] +Author: Bob Rudis (@hrbrmstr), Christos Zoulas [libmagic], Mans Rullgard [file], + Jonathan Ong [mime-db] Maintainer: Bob Rudis Description: The 'libmagic' library provides functions to determine 'MIME' type and other metadata from files through their "magic" attributes. This is useful when you do not wish to rely solely on - the honesty of a user or the extension on a file name. + the honesty of a user or the extension on a file name. It also + incorporates other metadata from the mime-db database + <https://github.com/jshttp/mime-db>. URL: http://github.com/hrbrmstr/wand BugReports: https://github.com/hrbrmstr/wand/issues NeedsCompilation: yes +LazyData: true SystemRequirements: libmagic (>= 5.14) for Unix/Linux/macOS; Rtools 3.3+ for Windows License: AGPL Suggests: @@ -28,5 +32,6 @@ Imports: tidyr, utils, Rcpp +Encoding: UTF-8 LinkingTo: Rcpp RoxygenNote: 5.0.1 diff --git a/INSTALL b/INSTALL new file mode 100644 index 0000000..a01a56f --- /dev/null +++ b/INSTALL @@ -0,0 +1,13 @@ +For Linux/UNIX/macOS you need 'libmagic' installed which is a component of the +'file' utility: . You can find out more information +on 'libmagic' and 'file' at this URL: . 
+ +Here are the incantations you must use to get magic for your environment: + +- `apt-get install libmagic-dev` on Ubuntu/Debian-ish systems +- `brew install libmagic` on macOS +- `yum install file-devel` on RHEL/CentOS/Fedora + +For Windows you will need Rtools +version 3.3 or higher (it may work with older ones, but it's only been tested on +Rtools version 3.3 & 3.4). \ No newline at end of file diff --git a/R/aaa.r b/R/aaa.r index 4f4d52d..3ac56ed 100644 --- a/R/aaa.r +++ b/R/aaa.r @@ -1 +1 @@ -response <- encoding <- NULL \ No newline at end of file +extensions <- mime_type <- response <- encoding <- NULL \ No newline at end of file diff --git a/R/datasets.r b/R/datasets.r new file mode 100644 index 0000000..ae41e2d --- /dev/null +++ b/R/datasets.r @@ -0,0 +1,31 @@ +#' @title MIME Types Database +#' @description This is a dataset of all mime types. It aggregates data from the +#' following sources: +#' +#' \itemize{ +#' \item \url{http://www.iana.org/assignments/media-types/media-types.xhtml} +#' \item \url{http://svn.apache.org/repos/asf/httpd/httpd/trunk/docs/conf/mime.types} +#' \item \url{http://hg.nginx.org/nginx/raw-file/default/conf/mime.types} +#' } +#' +#' There are a total of four possible fields per element: +#' +#' \itemize{ +#' \item \code{source}: where the mime type is defined. If not set, it's +#' probably a custom media type. One of \code{apache}, \code{iana} or \code{nginx}. +#' \item \code{extensions}: a character vector of known extensions associated with this mime type. +#' \item \code{compressible}: whether a file of this type can be "gzipped" (mostly +#' useful in the context of serving up web content). +#' \item \code{charset}: the default charset associated with this type, if any. +#' } +#' +#' @docType data +#' @keywords datasets +#' @name mime_db +#' +#' @references Ingested from \url{https://github.com/jshttp/mime-db}. 
+#' @usage data(mime_db) +#' @note Last updated 2016-08-14; the only guaranteed field is \code{source} +#' @format A list with 1,883 elements and four named fields: \code{source}, +#' \code{compressible}, \code{extensions} & \code{charset}. +NULL \ No newline at end of file diff --git a/R/wand-package.R b/R/wand-package.R index fba0632..ce90ab8 100644 --- a/R/wand-package.R +++ b/R/wand-package.R @@ -1,5 +1,13 @@ #' Retrieve 'Magic' Attributes from Files and Directories #' +#' The 'libmagic' library provides functions to determine 'MIME' type and other +#' metadata from files through their "magic" attributes. This is useful when you +#' do not wish to rely solely on the honesty of a user or the extension on a +#' file name. It also incorporates other metadata from the mime-db database +#' <https://github.com/jshttp/mime-db> +#' +#' Based on \code{file} / \code{libmagic} - \url{https://github.com/file/file} +#' #' @name wand #' @docType package #' @author Bob Rudis (@@hrbrmstr) diff --git a/R/wand.r b/R/wand.r index ff1e6de..197214a 100644 --- a/R/wand.r +++ b/R/wand.r @@ -17,7 +17,7 @@ #' @examples #' library(dplyr) #' -#' system.file("img", package="filemagic") %>% +#' system.file("extdata/img", package="filemagic") %>% #' list.files(full.names=TRUE) %>% #' incant() %>% #' glimpse() @@ -37,8 +37,8 @@ incant <- function(path, magic_db="system") { if (!found_file) { stop(paste0("'file.exe' not found. Please install 'Rtools' and restart R. 
", - "See 'https://github.com/stan-dev/rstan/wiki/Install-Rtools-for-Windows' ", - "for more information on how to install 'Rtools'", collapse=""), + "See 'https://github.com/stan-dev/rstan/wiki/Install-Rtools-for-Windows' ", + "for more information on how to install 'Rtools'", collapse=""), call.=FALSE) } @@ -49,17 +49,17 @@ incant <- function(path, magic_db="system") { suppressMessages( suppressWarnings( - system2(file_exe, - c("--mime-type", "--mime-encoding", "--no-buffer", "--preserve-date", - '--separator "||"', - sprintf('--files-from "%s"', tf)), - stdout=TRUE))) -> output_1 + system2(file_exe, + c("--mime-type", "--mime-encoding", "--no-buffer", "--preserve-date", + '--separator "||"', + sprintf('--files-from "%s"', tf)), + stdout=TRUE))) -> output_1 suppressMessages( suppressWarnings(system2(file_exe, - c("--no-buffer", "--preserve-date", '--separator "||"', - sprintf('--files-from "%s"', tf)), - stdout=TRUE))) -> output_2 + c("--no-buffer", "--preserve-date", '--separator "||"', + sprintf('--files-from "%s"', tf)), + stdout=TRUE))) -> output_2 unlink(tf) @@ -74,13 +74,36 @@ incant <- function(path, magic_db="system") { setNames(c("file", "description")) -> df2 left_join(df1, df2, by="file") %>% - mutate_all(stri_trim_both) + mutate_all(stri_trim_both) -> ret } else { - incant_(path, magic_db) + ret <- incant_(path, magic_db) } + + if (!("extensions" %in% colnames(ret))) ret$extensions <- NA + + mutate(ret, extensions=ifelse(extensions=="???", NA, extensions)) %>% + mutate(extensions=map_exts(mime_type, extensions)) + } +map_exts <- function(mime_type, current_extensions) { + + exts <- stri_split_regex(current_extensions, "/") + + map2(mime_type, exts, function(mt, xt) { + + ret <- wand::mime_db[[mt]]$extensions %||% NA + ret <- sort(unique(c(xt, ret))) + ret <- ret[!is.na(ret)] + if (length(ret)==0) ret <- NA + ret + + }) + +} + + #' ripped from rappdirs (ty Hadley!) 
get_os <- function () { if (.Platform$OS.type == "windows") { diff --git a/R/zzz.r b/R/zzz.r index 86c15f9..31aaecc 100644 --- a/R/zzz.r +++ b/R/zzz.r @@ -18,7 +18,7 @@ #' @examples #' library(dplyr) #' -#' system.file("img", package="filemagic") %>% +#' system.file("extdata/img", package="filemagic") %>% #' list.files(full.names=TRUE) %>% #' incant(magic_wand_file()) %>% #' glimpse() @@ -32,7 +32,7 @@ magic_wand_file <- function(refresh=FALSE) { if (lib_version() >= 528) vers <- "new" else vers <- "old" if (refresh | (!file.exists(file.path(rappdirs::user_cache_dir("wandr"), "magic.mgc")))) { - unzip(system.file("db", vers, "magic.mgc.zip", package="wand"), + unzip(system.file("extdata", "db", vers, "magic.mgc.zip", package="wand"), exdir=cache, overwrite=TRUE) } diff --git a/README.Rmd b/README.Rmd index 23df354..2f7494f 100644 --- a/README.Rmd +++ b/README.Rmd @@ -8,8 +8,9 @@ output: rmarkdown::github_document The `libmagic` library must be installed on *nix/macOS and available to use this. -- `apt-get install libmagic-dev` on Debian-ish systems +- `apt-get install libmagic-dev` on Ubuntu/Debian-ish systems - `brew install libmagic` on macOS +- `yum install file-devel` on RHEL/CentOS/Fedora While the package was developed using the 5.28 version of `libmagic` it has been configured to work with older versions. Note that some fields in the resultant data frame might not be available with older library versions. When using the function `magic_wand_file()` it checks for which version of `libmagic` is installed on your system and provides a suitable `magic.mgc` file for it. 
@@ -20,6 +21,10 @@ The following functions are implemented: - `incant` : returns the "magic" metadata of the files in the input vector (as a data frame) - `magic_wand_file` : provides a full path to the package-provided `magic` file +The following datasets are included: + +- `mime_db` : a database of all mime types from <https://github.com/jshttp/mime-db> + ### Installation ```{r eval=FALSE} @@ -34,22 +39,34 @@ options(width=120) ```{r message=FALSE} library(wand) -library(magrittr) library(dplyr) -system.file("img", package="wand") %>% +system.file("extdata", "img", package="wand") %>% list.files(full.names=TRUE) %>% incant() %>% glimpse() +``` +```{r message=FALSE} # Use a non-system magic-file -system.file("img", package="wand") %>% +system.file("extdata", "img", package="wand") %>% list.files(full.names=TRUE) %>% incant(magic_wand_file()) %>% select(description) %>% unlist(use.names=FALSE) +``` +```{r message=FALSE} +# what kinds of extensions are associated with these mime types +system.file("extdata", "img", package="wand") %>% + list.files(full.names=TRUE) %>% + incant(magic_wand_file()) %>% + select(extensions) %>% + as.data.frame() +``` + +```{r message=FALSE} # current verison packageVersion("wand") diff --git a/README.md b/README.md index c116cad..e7f84b0 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,10 @@ The following functions are implemented: - `incant` : returns the "magic" metadata of the files in the input vector (as a data frame) - `magic_wand_file` : provides a full path to the package-provided `magic` file +The following datasets are included: + +- `mime_db` : a database of all mime types from <https://github.com/jshttp/mime-db> + ### Installation ``` r @@ -27,10 +31,9 @@ devtools::install_github("hrbrmstr/wand") ``` r library(wand) -library(magrittr) library(dplyr) -system.file("img", package="wand") %>% +system.file("extdata", "img", package="wand") %>% list.files(full.names=TRUE) %>% incant() %>% glimpse() @@ -38,16 +41,16 @@ system.file("img", package="wand") %>% ## Observations: 10 ## Variables: 5 - ## $ 
file "/Library/Frameworks/R.framework/Versions/3.3/Resources/library/wand/img/example_dir", "/Librar... + ## $ file "/Library/Frameworks/R.framework/Versions/3.3/Resources/library/wand/extdata/img/example_dir", ... ## $ mime_type "inode/directory", "text/x-c", "text/html", "text/plain", "text/rtf", "image/jpeg", "applicatio... ## $ encoding "binary", "us-ascii", "us-ascii", "us-ascii", "us-ascii", "binary", "binary", "binary", "us-asc... - ## $ extensions NA, "???", "???", "???", "???", "jpeg/jpg/jpe/jfif", "???", "???", "???", "???" + ## $ extensions [NA, <"c", "cc", "cpp", "cxx", "dic", "h", "hh">, <"htm", "html", "shtml">, <"conf", "def", "i... ## $ description "directory", "C source, ASCII text", "HTML document, ASCII text, with CRLF line terminators", "... ``` r # Use a non-system magic-file -system.file("img", package="wand") %>% +system.file("extdata", "img", package="wand") %>% list.files(full.names=TRUE) %>% incant(magic_wand_file()) %>% select(description) %>% @@ -66,6 +69,27 @@ system.file("img", package="wand") %>% ## [10] "TIFF image data, big-endian" ``` r +# what kinds of extensions are associated with these mime types +system.file("extdata", "img", package="wand") %>% + list.files(full.names=TRUE) %>% + incant(magic_wand_file()) %>% + select(extensions) %>% + as.data.frame() +``` + + ## extensions + ## 1 NA + ## 2 c, cc, cpp, cxx, dic, h, hh + ## 3 htm, html, shtml + ## 4 conf, def, in, ini, list, log, text, txt + ## 5 rtf + ## 6 jfif, jpe, jpeg, jpg + ## 7 pdf + ## 8 png + ## 9 conf, def, in, ini, list, log, text, txt + ## 10 tif, tiff + +``` r # current verison packageVersion("wand") ``` @@ -81,7 +105,7 @@ library(testthat) date() ``` - ## [1] "Mon Aug 15 10:19:22 2016" + ## [1] "Mon Aug 15 11:54:15 2016" ``` r test_dir("tests/") diff --git a/builder/make_mime_db.r b/builder/make_mime_db.r new file mode 100644 index 0000000..40b0ba6 --- /dev/null +++ b/builder/make_mime_db.r @@ -0,0 +1,4 @@ +JSON_DB_URL <- 
"https://raw.githubusercontent.com/jshttp/mime-db/master/db.json" + +mime_db <- jsonlite::fromJSON(JSON_DB_URL, flatten=TRUE) +use_data(mime_db) diff --git a/configure b/configure new file mode 100755 index 0000000..d470950 --- /dev/null +++ b/configure @@ -0,0 +1,45 @@ +echo "Checking to see if libmagic is available..." + +: ${R_HOME=`R RHOME`} +if test -z "${R_HOME}"; then + echo "could not determine R_HOME" + exit 1 +fi + +CC=`"${R_HOME}/bin/R" CMD config CC` +CFLAGS=`"${R_HOME}/bin/R" CMD config CFLAGS` +CPPFLAGS=`"${R_HOME}/bin/R" CMD config CPPFLAGS` +CXXFLAGS=`"${R_HOME}/bin/R" CMD config CXXFLAGS` +LDFLAGS=`"${R_HOME}/bin/R" CMD config LDFLAGS` +DYLIB_LDFLAGS=`"${R_HOME}/bin/R" CMD config DYLIB_LDFLAGS` +SHLIB_LDFLAGS=`"${R_HOME}/bin/R" CMD config SHLIB_LDFLAGS` + +temp_src=$(mktemp) +cat > ${temp_src} < /dev/null + +ccerr=$? + +rm ${temp_src} ${temp_exe} + +if [ "$ccerr" == 1 ] ; then + echo + echo + echo "The libmagic library was not found." + echo + echo "Please install it before installing this package." 
+ echo + echo + exit 1 +fi + +exit 0 diff --git a/data/mime_db.rda b/data/mime_db.rda new file mode 100644 index 0000000..dfb2c54 Binary files /dev/null and b/data/mime_db.rda differ diff --git a/inst/db/new/magic.mgc.zip b/inst/extdata/db/new/magic.mgc.zip similarity index 100% rename from inst/db/new/magic.mgc.zip rename to inst/extdata/db/new/magic.mgc.zip diff --git a/inst/db/old/magic.mgc.zip b/inst/extdata/db/old/magic.mgc.zip similarity index 100% rename from inst/db/old/magic.mgc.zip rename to inst/extdata/db/old/magic.mgc.zip diff --git a/inst/img/Rlogo.jpg b/inst/extdata/img/Rlogo.jpg similarity index 100% rename from inst/img/Rlogo.jpg rename to inst/extdata/img/Rlogo.jpg diff --git a/inst/img/Rlogo.pdf b/inst/extdata/img/Rlogo.pdf similarity index 100% rename from inst/img/Rlogo.pdf rename to inst/extdata/img/Rlogo.pdf diff --git a/inst/img/Rlogo.png b/inst/extdata/img/Rlogo.png similarity index 100% rename from inst/img/Rlogo.png rename to inst/extdata/img/Rlogo.png diff --git a/inst/img/Rlogo.svg b/inst/extdata/img/Rlogo.svg similarity index 100% rename from inst/img/Rlogo.svg rename to inst/extdata/img/Rlogo.svg diff --git a/inst/img/Rlogo.tiff b/inst/extdata/img/Rlogo.tiff similarity index 100% rename from inst/img/Rlogo.tiff rename to inst/extdata/img/Rlogo.tiff diff --git a/inst/img/example.c b/inst/extdata/img/example.c similarity index 100% rename from inst/img/example.c rename to inst/extdata/img/example.c diff --git a/inst/img/example.html b/inst/extdata/img/example.html similarity index 100% rename from inst/img/example.html rename to inst/extdata/img/example.html diff --git a/inst/img/example.r b/inst/extdata/img/example.r similarity index 100% rename from inst/img/example.r rename to inst/extdata/img/example.r diff --git a/inst/img/example.rtf b/inst/extdata/img/example.rtf similarity index 100% rename from inst/img/example.rtf rename to inst/extdata/img/example.rtf diff --git a/inst/img/example_dir/test.txt 
b/inst/extdata/img/example_dir/test.txt similarity index 100% rename from inst/img/example_dir/test.txt rename to inst/extdata/img/example_dir/test.txt diff --git a/man/incant.Rd b/man/incant.Rd index fd72c65..d52bd9c 100644 --- a/man/incant.Rd +++ b/man/incant.Rd @@ -30,7 +30,7 @@ Various fields might not be available depending on the version \examples{ library(dplyr) -system.file("img", package="filemagic") \%>\% +system.file("extdata/img", package="filemagic") \%>\% list.files(full.names=TRUE) \%>\% incant() \%>\% glimpse() diff --git a/man/magic_wand_file.Rd b/man/magic_wand_file.Rd index e7b9490..1105b18 100644 --- a/man/magic_wand_file.Rd +++ b/man/magic_wand_file.Rd @@ -28,7 +28,7 @@ cache directory has been cleared. \examples{ library(dplyr) -system.file("img", package="filemagic") \%>\% +system.file("extdata/img", package="filemagic") \%>\% list.files(full.names=TRUE) \%>\% incant(magic_wand_file()) \%>\% glimpse() diff --git a/man/mime_db.Rd b/man/mime_db.Rd new file mode 100644 index 0000000..7dc34ed --- /dev/null +++ b/man/mime_db.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets.r +\docType{data} +\name{mime_db} +\alias{mime_db} +\title{MIME Types Database} +\format{A list with 1,883 elements and four named fields: \code{source}, + \code{compressible}, \code{extensions} & \code{charset}.} +\usage{ +data(mime_db) +} +\description{ +This is a dataset of all mime types. It aggregates data from the +following sources: + +\itemize{ + \item \url{http://www.iana.org/assignments/media-types/media-types.xhtml} + \item \url{http://svn.apache.org/repos/asf/httpd/httpd/trunk/docs/conf/mime.types} + \item \url{http://hg.nginx.org/nginx/raw-file/default/conf/mime.types} +} + +There are a total of four possible fields per element: + +\itemize{ + \item \code{source}: where the mime type is defined. If not set, it's + probably a custom media type. One of \code{apache}, \code{iana} or \code{nginx}. 
+ \item \code{extensions}: a character vector of known extensions associated with this mime type. + \item \code{compressible}: whether a file of this type can be "gzipped" (mostly + useful in the context of serving up web content). + \item \code{charset}: the default charset associated with this type, if any. +} +} +\note{ +Last updated 2016-08-14; the only guaranteed field is \code{source} +} +\references{ +Ingested from \url{https://github.com/jshttp/mime-db}. +} +\keyword{datasets} + diff --git a/man/wand.Rd b/man/wand.Rd index 8192476..3ddadfd 100644 --- a/man/wand.Rd +++ b/man/wand.Rd @@ -1,12 +1,19 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/wand-package.R +% Please edit documentation in R/wand-package.r \docType{package} \name{wand} \alias{wand} \alias{wand-package} \title{Retrieve 'Magic' Attributes from Files and Directories} \description{ -Retrieve 'Magic' Attributes from Files and Directories +The 'libmagic' library provides functions to determine 'MIME' type and other +metadata from files through their "magic" attributes. This is useful when you +do not wish to rely solely on the honesty of a user or the extension on a +file name. 
It also incorporates other metadata from the mime-db database +<https://github.com/jshttp/mime-db> +} +\details{ +Based on \code{file} / \code{libmagic} - \url{https://github.com/file/file} } \author{ Bob Rudis (@hrbrmstr) diff --git a/src/wand.cpp b/src/wand.cpp index 742b287..9c1ee0d 100644 --- a/src/wand.cpp +++ b/src/wand.cpp @@ -1,5 +1,4 @@ #include <Rcpp.h> - using namespace Rcpp; #ifdef _WIN32 diff --git a/tests/testthat/test-wand.R b/tests/testthat/test-wand.R index 2749fed..c3dc1fd 100644 --- a/tests/testthat/test-wand.R +++ b/tests/testthat/test-wand.R @@ -1,7 +1,7 @@ context("basic functionality") test_that("we can do something", { - tmp <- incant(list.files(system.file("img", package="wand"), + tmp <- incant(list.files(system.file("extdata", "img", package="wand"), full.names=TRUE), magic_wand_file()) tmp <- tmp$description