#' --- #' title: "CRAN HTTPS Mirrors" #' author: "" #' date: "" #' output: #' html_document: #' df_print: kable #' keep_md: true #' theme: simplex #' highlight: monochrome #' --- #+ init, include=FALSE knitr::opts_chunk$set( message = FALSE, warning = FALSE, dev = "png", fig.retina = 2, fig.width = 10, fig.height = 6 ) #+ libs library(xml2) library(httr) library(curl) library(stringi) library(urltools) library(ipinfo) library(openssl) library(furrr) library(vershist) # install.packages("vershist", repos = "https://cinc.rud.is/") library(ggalt) library(ggbeeswarm) library(hrbrthemes) library(tidyverse) #' ## Collect CRAN mirrors' metadata if (!file.exists(here::here("data/mir-dat.rds"))) { mdoc <- xml2::read_xml(here::here("data/mirrors.html"), as_html = TRUE) xml_find_all(mdoc, ".//td/a[contains(@href, 'https')]") %>% xml_attr("href") %>% unique() -> ssl_mirrors plan(multiprocess) dl_cert <- possibly(openssl::download_ssl_cert, NULL) HEAD_ <- possibly(httr::HEAD, NULL) dig <- possibly(curl::nslookup, NULL) query_ip_ <- possibly(ipinfo::query_ip, NULL) ssl_mirrors %>% future_map(~{ host <- domain(.x) ip <- dig(host, TRUE) ip_info <- if (length(ip)) query_ip_(ip) else NULL list( host = host, cert = dl_cert(host), head = HEAD_(.x), ip = ip, ip_info = ip_info ) }) -> mir_dat saveRDS(mir_dat, here::here("data/mir-dat.rds")) } else { mir_dat <- readRDS(here::here("data/mir-dat.rds")) } str(mir_dat[1], 3) #' Gratuitous map of CRAN mirror locations maps::map("world", ".", exact = FALSE, plot = FALSE, fill = TRUE) %>% fortify() %>% filter(region != "Antarctica") -> world map_chr(mir_dat, ~.x$ip_info$loc) %>% stri_split_fixed(pattern = ",", n = 2, simplify = TRUE) %>% as.data.frame(stringsAsFactors = FALSE) %>% as_tibble() %>% mutate_all(list(as.numeric)) -> wheres_cran #+ cran-map, fig.width=10, fig.height=6 ggplot() + ggalt::geom_cartogram( data = world, map = world, aes(long, lat, map_id=region), color = ft_cols$gray, size = 0.125 ) + geom_point( data = wheres_cran, aes(V2, V1), size = 2, color = ft_cols$slate, fill = alpha(ft_cols$yellow, 3/4), shape = 21 ) + ggalt::coord_proj("+proj=wintri") + labs( x = NULL, y = NULL, title = "Geolocation of HTTPS-'enabled' CRAN Mirrors" ) + theme_ft_rc(grid="") + theme(axis.text = element_blank()) #' ## Look at certificate info map_df(mir_dat, ~{ tibble( host = .x$host, s_issuer = .x$cert[[1]]$issuer %||% NA_character_, i_issuer = .x$cert[[2]]$issuer %||% NA_character_, algo = .x$cert[[1]]$algorithm %||% NA_character_, names = .x$cert[[1]]$alt_names %||% NA_character_, nm_ct = length(.x$cert[[1]]$alt_names), key_size = .x$cert[[1]]$pubkey$size %||% NA_integer_ ) }) -> certs #' ### How many either blocked the connection or don't exist certs[!complete.cases(certs),] certs <- filter(certs, complete.cases(certs)) #' ### How many domains do these certs serve? #+ alt-names-ct, fig.width=8, fig.height=6 count(certs, host, sort=TRUE) %>% ggplot() + geom_quasirandom( aes("", n), size = 2, color = ft_cols$slate, fill = alpha(ft_cols$yellow, 3/4), shape = 21 ) + scale_y_comma() + labs( x = NULL, y = "# Servers", title = "Distribution of the number of alt-names in CRAN mirror certificates" ) + theme_ft_rc(grid="Y") #' Take a look at some of them filter(certs, host == "cran.cnr.berkeley.edu") %>% select(names) %>% head(20) filter(certs, host == "cran.rapporter.net") %>% select(names) %>% head(20) filter(certs, host == "cran-r.c3sl.ufpr.br") %>% select(names) %>% head(20) filter(certs, host == "fourdots.com") %>% select(names) %>% head(20) #' ### Certificate algo/key distinct(certs, host, algo, key_size) %>% count(algo, key_size, sort=TRUE) #' ### Certificate issuers distinct(certs, host, i_issuer) %>% count(i_issuer, sort = TRUE) %>% head(28) #' ## Interactive SSL tests #' #' Using [`testssl.sh`](https://github.com/drwetter/testssl.sh). list.files(here::here("data/ssl"), "json$", full.names = TRUE) %>% map_df(jsonlite::fromJSON) %>% as_tibble() -> ssl_tests sev <- c("OK", "LOW", "MEDIUM", "HIGH", "WARN", "CRITICAL") #+ testssl, fig.width=8, fig.height=12 filter(ssl_tests, severity %in% sev) %>% group_by(ip) %>% count(severity) %>% ungroup() %>% complete(ip = unique(ip), severity = sev) %>% mutate(severity = factor(severity, levels = sev)) %>% arrange(ip) %>% mutate(ip = factor(ip, levels = rev(unique(ip)))) %>% ggplot(aes(severity, ip, fill=n)) + geom_tile(color = "#b2b2b2", size = 0.125) + scale_x_discrete(name = NULL, expand = c(0,0.1), position = "top") + scale_y_discrete(name = NULL, expand = c(0,0)) + viridis::scale_fill_viridis( name = "# Tests", option = "cividis", na.value = ft_cols$gray ) + labs( title = "CRAN Mirror SSL Test Summary Findings by Severity" ) + theme_ft_rc(grid="") + theme(axis.text.y = element_text(size = 8, family = "mono")) -> gg # We're going to move the title vs have too wide of a plot gb <- ggplot2::ggplotGrob(gg) gb$layout$l[gb$layout$name %in% "title"] <- 2 grid::grid.newpage() grid::grid.draw(gb) #' ## Web server headers map_df(mir_dat, ~{ if (length(.x$head$headers) == 0) return(NULL) host <- .x$host flatten_df(.x$head$headers) %>% gather(name, value) %>% mutate(host = host) }) -> hdrs count(hdrs, name, sort=TRUE) %>% head(nrow(.)) #' ### 'Security' Headers c( "content-security-policy", "x-frame-options", "x-xss-protection", "x-content-type-options", "strict-transport-security", "referrer-policy" ) -> secure_headers count(hdrs, name, sort=TRUE) %>% filter(name %in% secure_headers) filter(hdrs, name %in% secure_headers) %>% count(host, sort = TRUE) filter(hdrs, host == "cran.csiro.au", name %in% secure_headers) #' ### 'Server' Types filter(hdrs, name == "server") %>% separate( value, c("kind", "version"), sep="/", fill="right", extra="merge" ) -> svr count(svr, kind, sort=TRUE) #' #### apache apache_httpd_version_history() %>% arrange(rls_date) %>% mutate( vers = factor(as.character(vers), levels = as.character(vers)) ) -> apa_all filter(svr, kind == "Apache") %>% filter(!is.na(version)) %>% mutate(version = stri_replace_all_regex(version, " .*$", "")) %>% count(version) %>% separate(version, c("maj", "min", "pat"), sep="\\.", convert = TRUE, fill = "right") %>% mutate(pat = ifelse(is.na(pat), 1, pat)) %>% mutate(v = sprintf("%s.%s.%s", maj, min, pat)) %>% mutate(v = factor(v, levels = apa_all$vers)) %>% arrange(v) -> apa_vers filter(apa_all, vers %in% apa_vers$v) %>% arrange(rls_date) %>% group_by(rls_year) %>% slice(1) %>% ungroup() %>% arrange(rls_date) -> apa_yrs #+ apache-history, fig.width=12.5, fig.height=5 ggplot() + geom_blank( data = apa_vers, aes(v, n) ) + geom_segment( data = apa_yrs, aes(vers, 0, xend=vers, yend=Inf), linetype = "dotted", size = 0.25, color = "white" ) + geom_segment( data = apa_vers, aes(v, n, xend=v, yend=0), color = ft_cols$gray, size = 8 ) + geom_label( data = apa_yrs, aes(vers, Inf, label = rls_year), family = font_rc, color = "white", fill = "#262a31", size = 4, vjust = 1, hjust = 0, nudge_x = 0.01, label.size = 0 ) + scale_y_comma(limits = c(0, 15)) + labs( x = "Apache Version #", y = "# Servers", title = "CRAN Mirrors Apache Version History" ) + theme_ft_rc(grid="Y") + theme(axis.text.x = element_text(family = "mono", size = 8, color = "white")) #' #### nginx nginx_version_history() %>% arrange(rls_date) %>% mutate( vers = factor(as.character(vers), levels = as.character(vers)) ) -> ngx_all filter(svr, kind == "nginx") %>% filter(!is.na(version)) %>% mutate(version = stri_replace_all_regex(version, " .*$", "")) %>% count(version) %>% separate(version, c("maj", "min", "pat"), sep="\\.", convert = TRUE, fill = "right") %>% mutate(v = sprintf("%s.%s.%s", maj, min, pat)) %>% mutate(v = factor(v, levels = ngx_all$vers)) %>% arrange(v) -> ngx_vers filter(ngx_all, vers %in% ngx_vers$v) %>% arrange(rls_date) %>% group_by(rls_year) %>% slice(1) %>% ungroup() %>% arrange(rls_date) -> ngx_yrs #+ nginx-history, fig.width=8, fig.height=5 ggplot() + geom_blank( data = ngx_vers, aes(v, n) ) + geom_segment( data = ngx_yrs, aes(vers, 0, xend=vers, yend=Inf), linetype = "dotted", size = 0.25, color = "white" ) + geom_segment( data = ngx_vers, aes(v, n, xend=v, yend=0), color = ft_cols$gray, size = 8 ) + geom_label( data = ngx_yrs, aes(vers, Inf, label = rls_year), family = font_rc, color = "white", fill = "#262a31", size = 4, vjust = 1, hjust = 0, nudge_x = 0.01, label.size = 0 ) + scale_y_comma(limits = c(0, 15)) + labs( x = "nginx Version #", y = "# Servers", title = "CRAN Mirrors nginx Version History" ) + theme_ft_rc(grid="Y") + theme(axis.text.x = element_text(family = "mono", color = "white")) #' ### What else do CRAN mirrors run? cran_mirror_other_things <- readRDS(here::here("data/cran-mirror-other-things.rds")) #' #### A look by port distinct(cran_mirror_other_things, ip, port) %>% count(ip, sort = TRUE) %>% head(20) #+ other-stuff, fig.width=6, fig.height=4 distinct(cran_mirror_other_things, ip, port) %>% filter(!(port %in% c(21, 80, 443))) %>% count(ip) %>% count(n) %>% mutate(n = factor(n)) %>% ggplot() + geom_segment( aes(n, nn, xend = n, yend = 0), size = 10, color = ft_cols$gray ) + scale_y_comma() + labs( x = "Total number of running services", y = "# hosts", title = "How many other services do CRAN mirrors run?", subtitle = "NOTE: Not counting 80/443/21" ) + theme_ft_rc(grid="Y") #' Take a look at a few of them distinct(cran_mirror_other_things, ip, port) %>% count(port, sort=TRUE) distinct(cran_mirror_other_things, ip, port) %>% filter(ip == "104.25.94.23") distinct(cran_mirror_other_things, ip, port) %>% filter(ip == "143.107.10.17") distinct(cran_mirror_other_things, ip, port) %>% filter(ip == "137.208.57.37") #' #### What kinds of services were detected? cran_recog <- readRDS(here::here("data/cran-recog.rds")) distinct(cran_recog, ip, cpe) %>% count(cpe, sort = TRUE) %>% head(50) #' ### What other DNS entries use CRAN IPs? cran_dns <- readRDS(here::here("data/cran-dns.rds")) filter(cran_dns, !grepl("CLOUDFLARENET|AMAZON", as_name)) %>% count(value, as_name, sort = TRUE)