Exploring CRAN claims of the "security" of CRAN mirrors
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

402 lines
10 KiB

#' ---
#' title: "CRAN HTTPS Mirrors"
#' author: ""
#' date: ""
#' output:
#' html_document:
#' df_print: kable
#' keep_md: true
#' theme: simplex
#' highlight: monochrome
#' ---
#+ init, include=FALSE
knitr::opts_chunk$set(
message = FALSE, warning = FALSE, dev = "png",
fig.retina = 2, fig.width = 10, fig.height = 6
)
#+ libs
library(xml2)
library(httr)
library(curl)
library(stringi)
library(urltools)
library(ipinfo)
library(openssl)
library(furrr)
library(vershist) # install.packages("vershist", repos = "https://cinc.rud.is/")
library(ggalt)
library(ggbeeswarm)
library(hrbrthemes)
library(tidyverse)
#' ## Collect CRAN mirrors' metadata
if (!file.exists(here::here("data/mir-dat.rds"))) {
mdoc <- xml2::read_xml(here::here("data/mirrors.html"), as_html = TRUE)
xml_find_all(mdoc, ".//td/a[contains(@href, 'https')]") %>%
xml_attr("href") %>%
unique() -> ssl_mirrors
plan(multiprocess)
dl_cert <- possibly(openssl::download_ssl_cert, NULL)
HEAD_ <- possibly(httr::HEAD, NULL)
dig <- possibly(curl::nslookup, NULL)
query_ip_ <- possibly(ipinfo::query_ip, NULL)
ssl_mirrors %>%
future_map(~{
host <- domain(.x)
ip <- dig(host, TRUE)
ip_info <- if (length(ip)) query_ip_(ip) else NULL
list(
host = host,
cert = dl_cert(host),
head = HEAD_(.x),
ip = ip,
ip_info = ip_info
)
}) -> mir_dat
saveRDS(mir_dat, here::here("data/mir-dat.rds"))
} else {
mir_dat <- readRDS(here::here("data/mir-dat.rds"))
}
str(mir_dat[1], 3)
#' Gratuitous map of CRAN mirror locations
maps::map("world", ".", exact = FALSE, plot = FALSE, fill = TRUE) %>%
fortify() %>%
filter(region != "Antarctica") -> world
map_chr(mir_dat, ~.x$ip_info$loc) %>%
stri_split_fixed(pattern = ",", n = 2, simplify = TRUE) %>%
as.data.frame(stringsAsFactors = FALSE) %>%
as_tibble() %>%
mutate_all(list(as.numeric)) -> wheres_cran
#+ cran-map, fig.width=10, fig.height=6
ggplot() +
ggalt::geom_cartogram(
data = world, map = world, aes(long, lat, map_id=region),
color = ft_cols$gray, size = 0.125
) +
geom_point(
data = wheres_cran, aes(V2, V1), size = 2,
color = ft_cols$slate, fill = alpha(ft_cols$yellow, 3/4), shape = 21
) +
ggalt::coord_proj("+proj=wintri") +
labs(
x = NULL, y = NULL,
title = "Geolocation of HTTPS-'enabled' CRAN Mirrors"
) +
theme_ft_rc(grid="") +
theme(axis.text = element_blank())
#' ## Look at certificate info
map_df(mir_dat, ~{
tibble(
host = .x$host,
s_issuer = .x$cert[[1]]$issuer %||% NA_character_,
i_issuer = .x$cert[[2]]$issuer %||% NA_character_,
algo = .x$cert[[1]]$algorithm %||% NA_character_,
names = .x$cert[[1]]$alt_names %||% NA_character_,
nm_ct = length(.x$cert[[1]]$alt_names),
key_size = .x$cert[[1]]$pubkey$size %||% NA_integer_
)
}) -> certs
#' ### How many either blocked the connection or don't exist
certs[!complete.cases(certs),]
certs <- filter(certs, complete.cases(certs))
#' ### How many domains do these certs serve?
#+ alt-names-ct, fig.width=8, fig.height=6
count(certs, host, sort=TRUE) %>%
ggplot() +
geom_quasirandom(
aes("", n), size = 2,
color = ft_cols$slate, fill = alpha(ft_cols$yellow, 3/4), shape = 21
) +
scale_y_comma() +
labs(
x = NULL, y = "# Servers",
title = "Distribution of the number of alt-names in CRAN mirror certificates"
) +
theme_ft_rc(grid="Y")
#' Take a look at some of them
filter(certs, host == "cran.cnr.berkeley.edu") %>%
select(names) %>%
head(20)
filter(certs, host == "cran.rapporter.net") %>%
select(names) %>%
head(20)
filter(certs, host == "cran-r.c3sl.ufpr.br") %>%
select(names) %>%
head(20)
filter(certs, host == "fourdots.com") %>%
select(names) %>%
head(20)
#' ### Certificate algo/key
distinct(certs, host, algo, key_size) %>%
count(algo, key_size, sort=TRUE)
#' ### Certificate issuers
distinct(certs, host, i_issuer) %>%
count(i_issuer, sort = TRUE) %>%
head(28)
#' ## Interactive SSL tests
#'
#' Using [`testssl.sh`](https://github.com/drwetter/testssl.sh).
list.files(here::here("data/ssl"), "json$", full.names = TRUE) %>%
map_df(jsonlite::fromJSON) %>%
as_tibble() -> ssl_tests
sev <- c("OK", "LOW", "MEDIUM", "HIGH", "WARN", "CRITICAL")
#+ testssl, fig.width=8, fig.height=12
filter(ssl_tests, severity %in% sev) %>%
group_by(ip) %>%
count(severity) %>%
ungroup() %>%
complete(ip = unique(ip), severity = sev) %>%
mutate(severity = factor(severity, levels = sev)) %>%
arrange(ip) %>%
mutate(ip = factor(ip, levels = rev(unique(ip)))) %>%
ggplot(aes(severity, ip, fill=n)) +
geom_tile(color = "#b2b2b2", size = 0.125) +
scale_x_discrete(name = NULL, expand = c(0,0.1), position = "top") +
scale_y_discrete(name = NULL, expand = c(0,0)) +
viridis::scale_fill_viridis(
name = "# Tests", option = "cividis", na.value = ft_cols$gray
) +
labs(
title = "CRAN Mirror SSL Test Summary Findings by Severity"
) +
theme_ft_rc(grid="") +
theme(axis.text.y = element_text(size = 8, family = "mono")) -> gg
# We're going to move the title vs have too wide of a plot
gb <- ggplot2::ggplotGrob(gg)
gb$layout$l[gb$layout$name %in% "title"] <- 2
grid::grid.newpage()
grid::grid.draw(gb)
#' ## Web server headers
map_df(mir_dat, ~{
if (length(.x$head$headers) == 0) return(NULL)
host <- .x$host
flatten_df(.x$head$headers) %>%
gather(name, value) %>%
mutate(host = host)
}) -> hdrs
count(hdrs, name, sort=TRUE) %>%
head(nrow(.))
#' ### 'Security' Headers
c(
"content-security-policy", "x-frame-options", "x-xss-protection",
"x-content-type-options", "strict-transport-security", "referrer-policy"
) -> secure_headers
count(hdrs, name, sort=TRUE) %>%
filter(name %in% secure_headers)
filter(hdrs, name %in% secure_headers) %>%
count(host, sort = TRUE)
filter(hdrs, host == "cran.csiro.au", name %in% secure_headers)
#' ### 'Server' Types
filter(hdrs, name == "server") %>%
separate(
value, c("kind", "version"), sep="/", fill="right", extra="merge"
) -> svr
count(svr, kind, sort=TRUE)
#' #### apache
apache_httpd_version_history() %>%
arrange(rls_date) %>%
mutate(
vers = factor(as.character(vers), levels = as.character(vers))
) -> apa_all
filter(svr, kind == "Apache") %>%
filter(!is.na(version)) %>%
mutate(version = stri_replace_all_regex(version, " .*$", "")) %>%
count(version) %>%
separate(version, c("maj", "min", "pat"), sep="\\.", convert = TRUE, fill = "right") %>%
mutate(pat = ifelse(is.na(pat), 1, pat)) %>%
mutate(v = sprintf("%s.%s.%s", maj, min, pat)) %>%
mutate(v = factor(v, levels = apa_all$vers)) %>%
arrange(v) -> apa_vers
filter(apa_all, vers %in% apa_vers$v) %>%
arrange(rls_date) %>%
group_by(rls_year) %>%
slice(1) %>%
ungroup() %>%
arrange(rls_date) -> apa_yrs
#+ apache-history, fig.width=12.5, fig.height=5
ggplot() +
geom_blank(
data = apa_vers, aes(v, n)
) +
geom_segment(
data = apa_yrs, aes(vers, 0, xend=vers, yend=Inf),
linetype = "dotted", size = 0.25, color = "white"
) +
geom_segment(
data = apa_vers, aes(v, n, xend=v, yend=0),
color = ft_cols$gray, size = 8
) +
geom_label(
data = apa_yrs, aes(vers, Inf, label = rls_year),
family = font_rc, color = "white", fill = "#262a31", size = 4,
vjust = 1, hjust = 0, nudge_x = 0.01, label.size = 0
) +
scale_y_comma(limits = c(0, 15)) +
labs(
x = "Apache Version #", y = "# Servers",
title = "CRAN Mirrors Apache Version History"
) +
theme_ft_rc(grid="Y") +
theme(axis.text.x = element_text(family = "mono", size = 8, color = "white"))
#' #### nginx
nginx_version_history() %>%
arrange(rls_date) %>%
mutate(
vers = factor(as.character(vers), levels = as.character(vers))
) -> ngx_all
filter(svr, kind == "nginx") %>%
filter(!is.na(version)) %>%
mutate(version = stri_replace_all_regex(version, " .*$", "")) %>%
count(version) %>%
separate(version, c("maj", "min", "pat"), sep="\\.", convert = TRUE, fill = "right") %>%
mutate(v = sprintf("%s.%s.%s", maj, min, pat)) %>%
mutate(v = factor(v, levels = ngx_all$vers)) %>%
arrange(v) -> ngx_vers
filter(ngx_all, vers %in% ngx_vers$v) %>%
arrange(rls_date) %>%
group_by(rls_year) %>%
slice(1) %>%
ungroup() %>%
arrange(rls_date) -> ngx_yrs
#+ nginx-history, fig.width=8, fig.height=5
ggplot() +
geom_blank(
data = ngx_vers, aes(v, n)
) +
geom_segment(
data = ngx_yrs, aes(vers, 0, xend=vers, yend=Inf),
linetype = "dotted", size = 0.25, color = "white"
) +
geom_segment(
data = ngx_vers, aes(v, n, xend=v, yend=0),
color = ft_cols$gray, size = 8
) +
geom_label(
data = ngx_yrs, aes(vers, Inf, label = rls_year),
family = font_rc, color = "white", fill = "#262a31", size = 4,
vjust = 1, hjust = 0, nudge_x = 0.01, label.size = 0
) +
scale_y_comma(limits = c(0, 15)) +
labs(
x = "nginx Version #", y = "# Servers",
title = "CRAN Mirrors nginx Version History"
) +
theme_ft_rc(grid="Y") +
theme(axis.text.x = element_text(family = "mono", color = "white"))
#' ### What else do CRAN mirrors run?
cran_mirror_other_things <- readRDS(here::here("data/cran-mirror-other-things.rds"))
#' #### A look by port
distinct(cran_mirror_other_things, ip, port) %>%
count(ip, sort = TRUE) %>%
head(20)
#+ other-stuff, fig.width=6, fig.height=4
distinct(cran_mirror_other_things, ip, port) %>%
filter(!(port %in% c(21, 80, 443))) %>%
count(ip) %>%
count(n) %>%
mutate(n = factor(n)) %>%
ggplot() +
geom_segment(
aes(n, nn, xend = n, yend = 0), size = 10, color = ft_cols$gray
) +
scale_y_comma() +
labs(
x = "Total number of running services", y = "# hosts",
title = "How many other services do CRAN mirrors run?",
subtitle = "NOTE: Not counting 80/443/21"
) +
theme_ft_rc(grid="Y")
#' Take a look at a few of them
distinct(cran_mirror_other_things, ip, port) %>%
count(port, sort=TRUE)
distinct(cran_mirror_other_things, ip, port) %>%
filter(ip == "104.25.94.23")
distinct(cran_mirror_other_things, ip, port) %>%
filter(ip == "143.107.10.17")
distinct(cran_mirror_other_things, ip, port) %>%
filter(ip == "137.208.57.37")
#' #### What kinds of services were detected?
cran_recog <- readRDS(here::here("data/cran-recog.rds"))
distinct(cran_recog, ip, cpe) %>%
count(cpe, sort = TRUE) %>%
head(50)
#' ### What other DNS entries use CRAN IPs?
cran_dns <- readRDS(here::here("data/cran-dns.rds"))
filter(cran_dns, !grepl("CLOUDFLARENET|AMAZON", as_name)) %>%
count(value, as_name, sort = TRUE)