You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
402 lines
10 KiB
402 lines
10 KiB
#' ---
|
|
#' title: "CRAN HTTPS Mirrors"
|
|
#' author: ""
|
|
#' date: ""
|
|
#' output:
|
|
#' html_document:
|
|
#' df_print: kable
|
|
#' keep_md: true
|
|
#' theme: simplex
|
|
#' highlight: monochrome
|
|
#' ---
|
|
#+ init, include=FALSE
|
|
knitr::opts_chunk$set(
|
|
message = FALSE, warning = FALSE, dev = "png",
|
|
fig.retina = 2, fig.width = 10, fig.height = 6
|
|
)
|
|
|
|
#+ libs
|
|
library(xml2)
|
|
library(httr)
|
|
library(curl)
|
|
library(stringi)
|
|
library(urltools)
|
|
library(ipinfo)
|
|
library(openssl)
|
|
library(furrr)
|
|
library(vershist) # install.packages("vershist", repos = "https://cinc.rud.is/")
|
|
library(ggalt)
|
|
library(ggbeeswarm)
|
|
library(hrbrthemes)
|
|
library(tidyverse)
|
|
|
|
#' ## Collect CRAN mirrors' metadata
|
|
|
|
if (!file.exists(here::here("data/mir-dat.rds"))) {
|
|
mdoc <- xml2::read_xml(here::here("data/mirrors.html"), as_html = TRUE)
|
|
|
|
xml_find_all(mdoc, ".//td/a[contains(@href, 'https')]") %>%
|
|
xml_attr("href") %>%
|
|
unique() -> ssl_mirrors
|
|
|
|
plan(multiprocess)
|
|
|
|
dl_cert <- possibly(openssl::download_ssl_cert, NULL)
|
|
HEAD_ <- possibly(httr::HEAD, NULL)
|
|
dig <- possibly(curl::nslookup, NULL)
|
|
query_ip_ <- possibly(ipinfo::query_ip, NULL)
|
|
|
|
ssl_mirrors %>%
|
|
future_map(~{
|
|
host <- domain(.x)
|
|
ip <- dig(host, TRUE)
|
|
ip_info <- if (length(ip)) query_ip_(ip) else NULL
|
|
list(
|
|
host = host,
|
|
cert = dl_cert(host),
|
|
head = HEAD_(.x),
|
|
ip = ip,
|
|
ip_info = ip_info
|
|
)
|
|
}) -> mir_dat
|
|
|
|
saveRDS(mir_dat, here::here("data/mir-dat.rds"))
|
|
} else {
|
|
mir_dat <- readRDS(here::here("data/mir-dat.rds"))
|
|
}
|
|
|
|
str(mir_dat[1], 3)
|
|
|
|
#' Gratuitous map of CRAN mirror locations
|
|
|
|
maps::map("world", ".", exact = FALSE, plot = FALSE, fill = TRUE) %>%
|
|
fortify() %>%
|
|
filter(region != "Antarctica") -> world
|
|
|
|
map_chr(mir_dat, ~.x$ip_info$loc) %>%
|
|
stri_split_fixed(pattern = ",", n = 2, simplify = TRUE) %>%
|
|
as.data.frame(stringsAsFactors = FALSE) %>%
|
|
as_tibble() %>%
|
|
mutate_all(list(as.numeric)) -> wheres_cran
|
|
|
|
#+ cran-map, fig.width=10, fig.height=6
|
|
ggplot() +
|
|
ggalt::geom_cartogram(
|
|
data = world, map = world, aes(long, lat, map_id=region),
|
|
color = ft_cols$gray, size = 0.125
|
|
) +
|
|
geom_point(
|
|
data = wheres_cran, aes(V2, V1), size = 2,
|
|
color = ft_cols$slate, fill = alpha(ft_cols$yellow, 3/4), shape = 21
|
|
) +
|
|
ggalt::coord_proj("+proj=wintri") +
|
|
labs(
|
|
x = NULL, y = NULL,
|
|
title = "Geolocation of HTTPS-'enabled' CRAN Mirrors"
|
|
) +
|
|
theme_ft_rc(grid="") +
|
|
theme(axis.text = element_blank())
|
|
|
|
#' ## Look at certificate info
|
|
|
|
map_df(mir_dat, ~{
|
|
tibble(
|
|
host = .x$host,
|
|
s_issuer = .x$cert[[1]]$issuer %||% NA_character_,
|
|
i_issuer = .x$cert[[2]]$issuer %||% NA_character_,
|
|
algo = .x$cert[[1]]$algorithm %||% NA_character_,
|
|
names = .x$cert[[1]]$alt_names %||% NA_character_,
|
|
nm_ct = length(.x$cert[[1]]$alt_names),
|
|
key_size = .x$cert[[1]]$pubkey$size %||% NA_integer_
|
|
)
|
|
}) -> certs
|
|
|
|
#' ### How many either blocked the connection or don't exist
|
|
|
|
certs[!complete.cases(certs),]
|
|
|
|
certs <- filter(certs, complete.cases(certs))
|
|
|
|
#' ### How many domains do these certs serve?
|
|
|
|
#+ alt-names-ct, fig.width=8, fig.height=6
|
|
count(certs, host, sort=TRUE) %>%
|
|
ggplot() +
|
|
geom_quasirandom(
|
|
aes("", n), size = 2,
|
|
color = ft_cols$slate, fill = alpha(ft_cols$yellow, 3/4), shape = 21
|
|
) +
|
|
scale_y_comma() +
|
|
labs(
|
|
x = NULL, y = "# Servers",
|
|
title = "Distribution of the number of alt-names in CRAN mirror certificates"
|
|
) +
|
|
theme_ft_rc(grid="Y")
|
|
|
|
#' Take a look at some of them
|
|
|
|
filter(certs, host == "cran.cnr.berkeley.edu") %>%
|
|
select(names) %>%
|
|
head(20)
|
|
|
|
filter(certs, host == "cran.rapporter.net") %>%
|
|
select(names) %>%
|
|
head(20)
|
|
|
|
filter(certs, host == "cran-r.c3sl.ufpr.br") %>%
|
|
select(names) %>%
|
|
head(20)
|
|
|
|
filter(certs, host == "fourdots.com") %>%
|
|
select(names) %>%
|
|
head(20)
|
|
|
|
#' ### Certificate algo/key
|
|
|
|
distinct(certs, host, algo, key_size) %>%
|
|
count(algo, key_size, sort=TRUE)
|
|
|
|
#' ### Certificate issuers
|
|
|
|
distinct(certs, host, i_issuer) %>%
|
|
count(i_issuer, sort = TRUE) %>%
|
|
head(28)
|
|
|
|
#' ## Interactive SSL tests
|
|
#'
|
|
#' Using [`testssl.sh`](https://github.com/drwetter/testssl.sh).
|
|
|
|
list.files(here::here("data/ssl"), "json$", full.names = TRUE) %>%
|
|
map_df(jsonlite::fromJSON) %>%
|
|
as_tibble() -> ssl_tests
|
|
|
|
sev <- c("OK", "LOW", "MEDIUM", "HIGH", "WARN", "CRITICAL")
|
|
|
|
#+ testssl, fig.width=8, fig.height=12
|
|
filter(ssl_tests, severity %in% sev) %>%
|
|
group_by(ip) %>%
|
|
count(severity) %>%
|
|
ungroup() %>%
|
|
complete(ip = unique(ip), severity = sev) %>%
|
|
mutate(severity = factor(severity, levels = sev)) %>%
|
|
arrange(ip) %>%
|
|
mutate(ip = factor(ip, levels = rev(unique(ip)))) %>%
|
|
ggplot(aes(severity, ip, fill=n)) +
|
|
geom_tile(color = "#b2b2b2", size = 0.125) +
|
|
scale_x_discrete(name = NULL, expand = c(0,0.1), position = "top") +
|
|
scale_y_discrete(name = NULL, expand = c(0,0)) +
|
|
viridis::scale_fill_viridis(
|
|
name = "# Tests", option = "cividis", na.value = ft_cols$gray
|
|
) +
|
|
labs(
|
|
title = "CRAN Mirror SSL Test Summary Findings by Severity"
|
|
) +
|
|
theme_ft_rc(grid="") +
|
|
theme(axis.text.y = element_text(size = 8, family = "mono")) -> gg
|
|
|
|
# We're going to move the title vs have too wide of a plot
|
|
|
|
gb <- ggplot2::ggplotGrob(gg)
|
|
gb$layout$l[gb$layout$name %in% "title"] <- 2
|
|
|
|
grid::grid.newpage()
|
|
grid::grid.draw(gb)
|
|
|
|
#' ## Web server headers
|
|
|
|
map_df(mir_dat, ~{
|
|
|
|
if (length(.x$head$headers) == 0) return(NULL)
|
|
|
|
host <- .x$host
|
|
|
|
flatten_df(.x$head$headers) %>%
|
|
gather(name, value) %>%
|
|
mutate(host = host)
|
|
|
|
}) -> hdrs
|
|
|
|
count(hdrs, name, sort=TRUE) %>%
|
|
head(nrow(.))
|
|
|
|
#' ### 'Security' Headers
|
|
|
|
c(
|
|
"content-security-policy", "x-frame-options", "x-xss-protection",
|
|
"x-content-type-options", "strict-transport-security", "referrer-policy"
|
|
) -> secure_headers
|
|
|
|
count(hdrs, name, sort=TRUE) %>%
|
|
filter(name %in% secure_headers)
|
|
|
|
filter(hdrs, name %in% secure_headers) %>%
|
|
count(host, sort = TRUE)
|
|
|
|
filter(hdrs, host == "cran.csiro.au", name %in% secure_headers)
|
|
|
|
#' ### 'Server' Types
|
|
|
|
filter(hdrs, name == "server") %>%
|
|
separate(
|
|
value, c("kind", "version"), sep="/", fill="right", extra="merge"
|
|
) -> svr
|
|
|
|
count(svr, kind, sort=TRUE)
|
|
|
|
#' #### apache
|
|
|
|
apache_httpd_version_history() %>%
|
|
arrange(rls_date) %>%
|
|
mutate(
|
|
vers = factor(as.character(vers), levels = as.character(vers))
|
|
) -> apa_all
|
|
|
|
filter(svr, kind == "Apache") %>%
|
|
filter(!is.na(version)) %>%
|
|
mutate(version = stri_replace_all_regex(version, " .*$", "")) %>%
|
|
count(version) %>%
|
|
separate(version, c("maj", "min", "pat"), sep="\\.", convert = TRUE, fill = "right") %>%
|
|
mutate(pat = ifelse(is.na(pat), 1, pat)) %>%
|
|
mutate(v = sprintf("%s.%s.%s", maj, min, pat)) %>%
|
|
mutate(v = factor(v, levels = apa_all$vers)) %>%
|
|
arrange(v) -> apa_vers
|
|
|
|
filter(apa_all, vers %in% apa_vers$v) %>%
|
|
arrange(rls_date) %>%
|
|
group_by(rls_year) %>%
|
|
slice(1) %>%
|
|
ungroup() %>%
|
|
arrange(rls_date) -> apa_yrs
|
|
|
|
#+ apache-history, fig.width=12.5, fig.height=5
|
|
ggplot() +
|
|
geom_blank(
|
|
data = apa_vers, aes(v, n)
|
|
) +
|
|
geom_segment(
|
|
data = apa_yrs, aes(vers, 0, xend=vers, yend=Inf),
|
|
linetype = "dotted", size = 0.25, color = "white"
|
|
) +
|
|
geom_segment(
|
|
data = apa_vers, aes(v, n, xend=v, yend=0),
|
|
color = ft_cols$gray, size = 8
|
|
) +
|
|
geom_label(
|
|
data = apa_yrs, aes(vers, Inf, label = rls_year),
|
|
family = font_rc, color = "white", fill = "#262a31", size = 4,
|
|
vjust = 1, hjust = 0, nudge_x = 0.01, label.size = 0
|
|
) +
|
|
scale_y_comma(limits = c(0, 15)) +
|
|
labs(
|
|
x = "Apache Version #", y = "# Servers",
|
|
title = "CRAN Mirrors Apache Version History"
|
|
) +
|
|
theme_ft_rc(grid="Y") +
|
|
theme(axis.text.x = element_text(family = "mono", size = 8, color = "white"))
|
|
|
|
#' #### nginx
|
|
|
|
nginx_version_history() %>%
|
|
arrange(rls_date) %>%
|
|
mutate(
|
|
vers = factor(as.character(vers), levels = as.character(vers))
|
|
) -> ngx_all
|
|
|
|
filter(svr, kind == "nginx") %>%
|
|
filter(!is.na(version)) %>%
|
|
mutate(version = stri_replace_all_regex(version, " .*$", "")) %>%
|
|
count(version) %>%
|
|
separate(version, c("maj", "min", "pat"), sep="\\.", convert = TRUE, fill = "right") %>%
|
|
mutate(v = sprintf("%s.%s.%s", maj, min, pat)) %>%
|
|
mutate(v = factor(v, levels = ngx_all$vers)) %>%
|
|
arrange(v) -> ngx_vers
|
|
|
|
filter(ngx_all, vers %in% ngx_vers$v) %>%
|
|
arrange(rls_date) %>%
|
|
group_by(rls_year) %>%
|
|
slice(1) %>%
|
|
ungroup() %>%
|
|
arrange(rls_date) -> ngx_yrs
|
|
|
|
#+ nginx-history, fig.width=8, fig.height=5
|
|
ggplot() +
|
|
geom_blank(
|
|
data = ngx_vers, aes(v, n)
|
|
) +
|
|
geom_segment(
|
|
data = ngx_yrs, aes(vers, 0, xend=vers, yend=Inf),
|
|
linetype = "dotted", size = 0.25, color = "white"
|
|
) +
|
|
geom_segment(
|
|
data = ngx_vers, aes(v, n, xend=v, yend=0),
|
|
color = ft_cols$gray, size = 8
|
|
) +
|
|
geom_label(
|
|
data = ngx_yrs, aes(vers, Inf, label = rls_year),
|
|
family = font_rc, color = "white", fill = "#262a31", size = 4,
|
|
vjust = 1, hjust = 0, nudge_x = 0.01, label.size = 0
|
|
) +
|
|
scale_y_comma(limits = c(0, 15)) +
|
|
labs(
|
|
x = "nginx Version #", y = "# Servers",
|
|
title = "CRAN Mirrors nginx Version History"
|
|
) +
|
|
theme_ft_rc(grid="Y") +
|
|
theme(axis.text.x = element_text(family = "mono", color = "white"))
|
|
|
|
#' ### What else do CRAN mirrors run?
|
|
|
|
cran_mirror_other_things <- readRDS(here::here("data/cran-mirror-other-things.rds"))
|
|
|
|
#' #### A look by port
|
|
|
|
distinct(cran_mirror_other_things, ip, port) %>%
|
|
count(ip, sort = TRUE) %>%
|
|
head(20)
|
|
|
|
#+ other-stuff, fig.width=6, fig.height=4
|
|
distinct(cran_mirror_other_things, ip, port) %>%
|
|
filter(!(port %in% c(21, 80, 443))) %>%
|
|
count(ip) %>%
|
|
count(n) %>%
|
|
mutate(n = factor(n)) %>%
|
|
ggplot() +
|
|
geom_segment(
|
|
aes(n, nn, xend = n, yend = 0), size = 10, color = ft_cols$gray
|
|
) +
|
|
scale_y_comma() +
|
|
labs(
|
|
x = "Total number of running services", y = "# hosts",
|
|
title = "How many other services do CRAN mirrors run?",
|
|
subtitle = "NOTE: Not counting 80/443/21"
|
|
) +
|
|
theme_ft_rc(grid="Y")
|
|
|
|
#' Take a look at a few of them
|
|
|
|
distinct(cran_mirror_other_things, ip, port) %>%
|
|
count(port, sort=TRUE)
|
|
|
|
distinct(cran_mirror_other_things, ip, port) %>%
|
|
filter(ip == "104.25.94.23")
|
|
|
|
distinct(cran_mirror_other_things, ip, port) %>%
|
|
filter(ip == "143.107.10.17")
|
|
|
|
distinct(cran_mirror_other_things, ip, port) %>%
|
|
filter(ip == "137.208.57.37")
|
|
|
|
#' #### What kinds of services were detected?
|
|
|
|
cran_recog <- readRDS(here::here("data/cran-recog.rds"))
|
|
|
|
distinct(cran_recog, ip, cpe) %>%
|
|
count(cpe, sort = TRUE) %>%
|
|
head(50)
|
|
|
|
#' ### What other DNS entries use CRAN IPs?
|
|
|
|
cran_dns <- readRDS(here::here("data/cran-dns.rds"))
|
|
|
|
filter(cran_dns, !grepl("CLOUDFLARENET|AMAZON", as_name)) %>%
|
|
count(value, as_name, sort = TRUE)
|
|
|