You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

148 lines
5.9 KiB

#' Discover MIME type of a file based on contents
#'
#' There are a limited number of header "magic" bytes checked directly by
#' this function but cover quite a bit of ground. After that, [guess_content_type()]
#' is called which uses file extension-to-MIME mappings.
#'
#' File an issue or PR if more magic-byte-level
#' comparisons are required/desired. If no match is found, `???` is returned
#' (see [guess_content_type()] for how to override this behaviour).
#'
#' @md
#' @param path path to a file
5 years ago
#' @param guess if `TRUE` (the default), calls [guess_content_type()] if
#' no internal rules match the magic header
#' @param ... passed on to [guess_content_type()] if `guess` is `TRUE`
#' @return character vector
#' @export
#' @examples
5 years ago
#' get_content_type(system.file("extdat", "pass-through", "test.pdf", package="wand"))
get_content_type <- function(path, guess = TRUE, ...) {
5 years ago
path <- path.expand(path[1])
if (!file.exists(path)) stop("File not found.", call.=FALSE)
hdr <- readBin(path, "raw", n=1024)
if (all(c(0x4F,0x62,0x6A,0x01) == hdr[1:4])) return("application/vnd.apache.avro+binary")
if (all(c(0x50,0x41,0x52,0x31) == hdr[1:4])) return("application/x-parquet")
if (all(c(0xCA,0xFE,0xBA,0xBE) == hdr[1:4])) return("application/java-vm")
if (all(c(0xD0,0xCF,0x11,0xE0,0xA1,0xB1,0x1A,0xE1) == hdr[1:8])) {
5 years ago
if (guess) {
guessed_name <- guess_content_type(path)
if ((length(guessed_name) == 1) && (guessed_name != "???")) return(guessed_name)
}
return("application/msword")
5 years ago
}
if (all(c(0x25,0x50,0x44,0x46,0x2d,0x31,0x2e) == hdr[1:7])) return("application/pdf")
if (all(c(0x25,0x50,0x44,0x46) == hdr[1:4])) return("application/x-pdf")
if (all(c(0x38,0x42,0x50,0x53,0x00,0x01) == hdr[1:6])) return("image/photoshop")
if (all(c(0x25,0x21,0x50,0x53) == hdr[1:4])) return("application/postscript")
if (all(c(0xff,0xfb,0x30) == hdr[1:3])) return("audio/mp3")
if (all(c(0xff,0xfb,0xd0) == hdr[1:3])) return("audio/mp3")
if (all(c(0xff,0xfb,0x90) == hdr[1:3])) return("audio/mp3")
if (all(c(0x49,0x44,0x33) == hdr[1:3])) return("audio/mp3")
if (all(c(0xAC,0xED) == hdr[1:2])) return("application/x-java-serialized-object")
if (all(c(0x4c,0x5a,0x49,0x50) == hdr[1:4])) return("application/x-lzip")
if (hdr[1] == 0x3c) { # "<"
if (all(c(0x68,0x74,0x6d,0x6c) == hdr[2:5])) return("text/html") # "html"
if (all(c(0x48,0x54,0x4d,0x4c) == hdr[2:5])) return("text/html") # "HTML"
if (all(c(0x48,0x45,0x41,0x44) == hdr[2:5])) return("text/html") # "HEAD"
if (all(c(0x68,0x65,0x61,0x64) == hdr[2:5])) return("text/html") # "head"
if (all(c(0x3f,0x78,0x6d,0x6c,0x20) == hdr[2:6])) return("application/xml")
}
if (all(c(0x0a,0x0d,0x0d,0x0a) == hdr[1:4])) "application/x-pcapng"
if (all(c(0xa1,0xb2,0xc3,0xd4) == hdr[1:4]) ||
all(c(0xd4,0xc3,0xb2,0xa1) == hdr[1:4])) return("application/x-cap")
if (all(c(0xfe,0xff) == hdr[1:2])) {
if (all(c(0x00,0x3c,0x00,0x3f,0x00,0x78) == hdr[3:8])) return("application/xml")
}
if (all(c(0x42,0x4d) == hdr[1:2])) return("image/bmp")
if (all(c(0x49,0x49,0x2a,0x00) == hdr[1:4])) return("image/tiff")
if (all(c(0x4D,0x4D,0x00,0x2a) == hdr[1:4])) return("image/tiff")
if (all(c(0x47,0x49,0x46,0x38) == hdr[1:4])) return("image/gif")
if (all(c(0x23,0x64,0x65,0x66) == hdr[1:4])) return("image/x-bitmap")
if (all(c(0x21,0x20,0x58,0x50,0x4d,0x32) == hdr[1:6])) return("image/x-pixmap")
if (all(c(137,80,78,71,13,10,26,10) == hdr[1:8])) return("image/png")
if (all(c(0x23,0x21,0x2f,0x62,0x69,0x6e,0x2f,0x6e,0x6f,0x64,0x65) == hdr[1:11]))
return("application/javascript")
if (all(c(0x23,0x21,0x2f,0x62,0x69,0x6e,0x2f,0x6e,0x6f,0x64,0x65,0x6a,0x73) == hdr[1:13]))
return("application/javascript")
if (all(c(0x23,0x21,0x2f,0x75,0x73,0x72,0x2f,0x62,0x69,0x6e,0x2f,0x6e,0x6f,0x64,0x65) == hdr[1:15]))
return("application/javascript")
if (all(c(0x23,0x21,0x2f,0x75,0x73,0x72,0x2f,0x62,0x69,0x6e,0x2f,0x6e,0x6f,0x64,0x65,0x6a,0x73) == hdr[1:17]))
return("application/javascript")
if (all(c(0x23,0x21,0x2f,0x75,0x73,0x72,0x2f,0x62,0x69,0x6e,0x2f,0x65,0x6e,0x76,0x20,0x6e,0x6f,0x64,0x65) == hdr[1:19]))
return("application/javascript")
if (all(c(0x23,0x21,0x2f,0x75,0x73,0x72,0x2f,0x62,0x69,0x6e,0x2f,0x65,0x6e,0x76,0x20,0x6e,0x6f,0x64,0x65,0x6a,0x73) == hdr[1:21]))
return("application/javascript")
if (all(c(0xFF,0xD8,0xFF) == hdr[1:3])) {
if (0xDB == hdr[4]) return("image/jpeg")
if (0xE0 == hdr[4]) return("image/jpeg")
if (0xE1 == hdr[4]) {
if (all(c(0x45,0x78,0x69,0x66,0x00) == hdr[7:11])) return("image/jpeg") # Exif
}
if (0xEE == hdr[4]) return("image/jpg")
}
if (all(c(0x41,0x43) == hdr[1:2]) && all(c(0x00,0x00,0x00,0x00,0x00) == hdr[7:11]))
return("application/acad")
if (all(c(0x2E,0x73,0x6E,0x64) == hdr[1:4])) return("audio/basic")
if (all(c(0x64,0x6E,0x73,0x2E) == hdr[1:4])) return("audio/basic")
if (all(c(0x52,0x49,0x46,0x46) == hdr[1:4])) return("audio/x-wav") # "RIFF"
if (all(c(0x50, 0x4b) == hdr[1:2])) { # "PK"
office_type <- check_office(hdr, path)
if (length(office_type) > 0) return(office_type)
5 years ago
if (guess) {
guessed_name <- guess_content_type(path)
if ((length(guessed_name) == 1) && (guessed_name != "???")) return(guessed_name)
}
return("application/zip")
}
if (all(c(0x00,0x61,0x73,0x6d) == hdr[1:4])) return("application/wasm")
if (all(c(0x37,0x7A,0xBC,0xAF,0x27,0x1C) == hdr[1:6])) return("application/x-7z-compressed")
if (all(c(0x5a,0x4d) == hdr[1:2])) return("x-system/exe")
if (all(c(0x75,0x73,0x74,0x61,0x72) == hdr[258:262])) {
if (all(c(0x00,0x30,0x30) == hdr[263:265]) || all(c(0x20,0x20,0x00) == hdr[263:265])) {
return("application/tar")
} else {
return("application/pax")
}
}
if (all(c(0x00,0x00,0x01,0xBA) == hdr[1:4])) return("video/mpeg")
if (all(c(0x00,0x00,0x01,0xB3) == hdr[1:4])) return("video/mpeg")
5 years ago
if (!guess) return("???")
return(guess_content_type(path, ...))
}