updated debates data; regen'd vignette

5 years ago · 7494388716
14 changed files with 10255 additions and 29 deletions
--- a/4
+++ b/4
@ -1,8 +1,8 @@
 Package: ggchicklet
 Type: Package
 Title: Create 'Chicklet' (Rounded Segmented Column) Charts
-Version: 0.1.0
+Version: 0.2.0
-Date: 2019-06-28
+Date: 2019-09-15
 Authors@R: c( person("Bob", "Rudis", email = "bob@rud.is", role =
        c("aut", "cre"), comment = c(ORCID = "0000-0001-5670-2640")),
        person("Antoine", "Bichat", role = "ctb") )
--- a/R/datasets.R
+++ b/R/datasets.R
@ -1,15 +1,19 @@
 #' @md
-#' @title June 2019 U.S. Democratic Debate Candidate/Topic Times
+#' @title 2019-2020 U.S. Democratic Debate Candidate/Topic Times
 #' @description The New York Times and other media outlets kept track of the time each
 #'              candidate spent talking including the timestamp of the start of the blathering
 #'              and the topic up for debate. This dataset only includes candidates and
 #'              topic times. The complete datasets (See References) also include moderator
 #'              metadata and opening/closing statement records.
-#' @format data frame with columns: `elapsed` (dbl), `timestamp` (drtn), `speaker` (chr), `topic` (chr)
+#' @format data frame with columns: `elapsed` (dbl), `timestamp` (time), `speaker` (chr), `topic` (chr),
 #'         `debate_date` (date), `debate_group` (dbl), `night` (dbl)
 #' @docType data
 #' @keywords datasets
 #' @name debates2019
-#' @references <https://www.nytimes.com/interactive/2019/admin/100000006581096.embedded.html>
+#' @references <https://www.nytimes.com/interactive/2019/06/26/us/elections/debate-speaking-time.html>
-#' @references <https://www.nytimes.com/interactive/2019/admin/100000006584572.embedded.html>
+#' @references <https://www.nytimes.com/interactive/2019/06/27/us/elections/debate-speaking-time.html>
 #' @references <https://www.nytimes.com/interactive/2019/07/30/us/elections/debate-speaking-time.html>
 #' @references <https://www.nytimes.com/interactive/2019/07/31/us/elections/debate-speaking-time.html>
 #' @references <https://www.nytimes.com/interactive/2019/09/12/us/elections/debate-speaking-time.html>
 #' @usage data("debates2019")
 NULL
--- a/README.Rmd
+++ b/README.Rmd
@ -53,6 +53,7 @@ library(tidyverse)
 data("debates2019")
 debates2019 %>%
  filter(debate_group == 1) %>% 
  mutate(speaker = fct_reorder(speaker, elapsed, sum, .desc=FALSE)) %>%
  mutate(topic = fct_other(
    topic,
@ -92,8 +93,6 @@ debates2019 %>%
  theme_ipsum_rc(grid="X") +
  theme(axis.text.x = element_text(color = "gray60", size = 10)) +
  theme(legend.position = "top")
 ```
 ## ggchicklet Metrics
--- a/README.md
+++ b/README.md
@ -26,7 +26,7 @@ segmented column charts (i.e. “chicklets”).
 ## What’s Inside The Tin
-  - `debates2019`: June 2019 U.S. Democratic Debate Candidate/Topic
+  - `debates2019`: 2019-2020 U.S. Democratic Debate Candidate/Topic
    Times
  - `geom_chicklet`: Chicklet (rounded segmented column) charts
@ -58,7 +58,7 @@ library(ggchicklet)
 # current version
 packageVersion("ggchicklet")
-## [1] '0.1.0'
+## [1] '0.2.0'
 ```
 ### From the NYTimes
@ -70,6 +70,7 @@ library(tidyverse)
 data("debates2019")
 debates2019 %>%
  filter(debate_group == 1) %>% 
  mutate(speaker = fct_reorder(speaker, elapsed, sum, .desc=FALSE)) %>%
  mutate(topic = fct_other(
    topic,
@ -115,10 +116,11 @@ debates2019 %>%
 ## ggchicklet Metrics
-| Lang | \# Files | (%) | LoC |  (%) | Blank lines |  (%) | \# Lines |  (%) |
+| Lang | \# Files |  (%) |  LoC |  (%) | Blank lines |  (%) | \# Lines |  (%) |
-| :--- | -------: | --: | --: | ---: | ----------: | ---: | -------: | ---: |
+| :--- | -------: | ---: | ---: | ---: | ----------: | ---: | -------: | ---: |
-| R    |        8 | 0.8 | 119 | 0.52 |          27 | 0.36 |      105 | 0.55 |
+| HTML |        5 | 0.33 | 7860 | 0.94 |        1858 | 0.96 |      239 | 0.54 |
-| Rmd  |        2 | 0.2 | 109 | 0.48 |          47 | 0.64 |       85 | 0.45 |
+| R    |        8 | 0.53 |  350 | 0.04 |          34 | 0.02 |      121 | 0.27 |
 | Rmd  |        2 | 0.13 |  117 | 0.01 |          45 | 0.02 |       85 | 0.19 |
 ## Code of Conduct
--- a/data-raw/2019-06-26-us-elections-debate-speaking-time.html
+++ b/data-raw/2019-06-26-us-elections-debate-speaking-time.html
--- a/data-raw/2019-06-27-us-elections-debate-speaking-time.html
+++ b/data-raw/2019-06-27-us-elections-debate-speaking-time.html
--- a/data-raw/2019-07-30-us-elections-debate-speaking-time.html
+++ b/data-raw/2019-07-30-us-elections-debate-speaking-time.html
--- a/data-raw/2019-07-31-us-elections-debate-speaking-time.html
+++ b/data-raw/2019-07-31-us-elections-debate-speaking-time.html
--- a/data-raw/2019-09-12-us-elections-debate-speaking-time.html
+++ b/data-raw/2019-09-12-us-elections-debate-speaking-time.html
--- a/data-raw/debates2019.R
+++ b/data-raw/debates2019.R
@ -1,14 +1,264 @@
 ## code to prepare `debates2019` dataset goes here
-read_csv(
+# read_csv(
-  file = "https://rud.is/data/2019-dem-debates.csv.gz",
+#   file = "https://rud.is/data/2019-dem-debates.csv.gz",
-  col_types = cols(
+#   col_types = cols(
-    elapsed = col_double(),
+#     elapsed = col_double(),
-    timestamp = col_time(format = ""),
+#     timestamp = col_time(format = ""),
-    speaker = col_character(),
+#     speaker = col_character(),
-    topic = col_character()
+#     topic = col_character()
-  )
+#   )
-) -> debates2019
+# ) -> debates2019
 #
 #
 # usethis::use_data(debates2019, overwrite = TRUE)
 library(rvest)
 library(stringi)
 library(tidyverse)
 # Local cache of the NYT debate speaking-time pages: each name is the path of
 # the cached copy (relative to the project root) and each value is the page it
 # is downloaded from. A page is fetched only when its cached copy is missing.
 nyt_debate_pages <- c(
   "data-raw/2019-06-26-us-elections-debate-speaking-time.html" =
     "https://www.nytimes.com/interactive/2019/06/26/us/elections/debate-speaking-time.html",
   "data-raw/2019-06-27-us-elections-debate-speaking-time.html" =
     "https://www.nytimes.com/interactive/2019/06/27/us/elections/debate-speaking-time.html",
   "data-raw/2019-07-30-us-elections-debate-speaking-time.html" =
     "https://www.nytimes.com/interactive/2019/07/30/us/elections/debate-speaking-time.html",
   "data-raw/2019-07-31-us-elections-debate-speaking-time.html" =
     "https://www.nytimes.com/interactive/2019/07/31/us/elections/debate-speaking-time.html",
   "data-raw/2019-09-12-us-elections-debate-speaking-time.html" =
     "https://www.nytimes.com/interactive/2019/09/12/us/elections/debate-speaking-time.html"
 )
 for (cached_rel_path in names(nyt_debate_pages)) {
   cached_copy <- here::here(cached_rel_path)
   if (!file.exists(cached_copy)) {
     download.file(nyt_debate_pages[[cached_rel_path]], cached_copy)
   }
 }
 # Debate group 1, night 1 (page dated 2019-06-26).
 # Extracts the NYTG_DEMDEBATES JSON embedded in the cached NYT page, normalises
 # speaker/topic labels, drops moderator and unusable rows, and stores the
 # result as the tibble `jun_day_1`.
 read_html(here::here("data-raw/2019-06-26-us-elections-debate-speaking-time.html")) %>%
  html_nodes(xpath = ".//script[contains(., 'NYTG_DEMDEBATES')]") %>%
  html_text() %>%
  stri_split_lines() %>%
  unlist() %>%
  # the third line of the matched script text is taken as the one holding the
  # `NYTG_DEMDEBATES = <json>` assignment for this page
  .[3] %>%
  stri_replace_first_regex("^.*NYTG_DEMDEBATES = ", "") %>%
  jsonlite::fromJSON() %>%
  mutate(
    # presumably seconds -> minutes; confirm against the NYT payload
    elapsed = as.numeric(elapsed)/60,
    # was hard-coded to 2019-09-13; use the date of the page being parsed
    debate_date = as.Date("2019-06-26"),
    speaker = stri_trans_totitle(speaker),
    timestamp = parse_time(timestamp),
    topic = stri_trans_totitle(topic),
    debate_group = 1,
    night = 1
  ) %>%
  # title-casing flattens names with internal capitals/apostrophes; restore them
  mutate(
    speaker = case_when(
      speaker == "Orourke" ~ "O'Rourke",
      speaker == "Deblasio" ~ "de Blasio",
      TRUE ~ speaker
    )
  ) %>%
  # collapse topic variants onto one canonical label per subject (first match
  # wins); blank topics become "Other", unmatched topics are kept as-is
  mutate(
    topic = case_when(
      topic == "" ~ "Other",
      grepl("Campaign", topic) ~ "Campaign Finance Reform",
      grepl("Civil", topic) ~ "Civil Rights",
      grepl("Climate", topic) ~ "Climate",
      grepl("Foreign", topic) ~ "Foreign Policy",
      grepl("Gun", topic) ~ "Gun Control",
      grepl("Election", topic) ~ "Elections Reform",
      grepl("Health", topic) ~ "Healthcare",
      grepl("Party", topic) ~ "Party Strategy",
      grepl("Women", topic) ~ "Womens Rights",
      TRUE ~ topic
    )
  ) %>%
  # keep only candidate rows with a parseable timestamp
  filter(
    !is.na(timestamp),
    speaker != "",
    speaker != "Moderator"
  ) %>%
  as_tibble() -> jun_day_1
 # Debate group 1, night 2 (page dated 2019-06-27).
 # Extracts the NYTG_DEMDEBATES JSON embedded in the cached NYT page, normalises
 # speaker/topic labels, drops moderator and unusable rows, and stores the
 # result as the tibble `jun_day_2`.
 read_html(here::here("data-raw/2019-06-27-us-elections-debate-speaking-time.html")) %>%
  html_nodes(xpath = ".//script[contains(., 'NYTG_DEMDEBATES')]") %>%
  html_text() %>%
  stri_split_lines() %>%
  unlist() %>%
  # the third line of the matched script text is taken as the one holding the
  # `NYTG_DEMDEBATES = <json>` assignment for this page
  .[3] %>%
  stri_replace_first_regex("^.*NYTG_DEMDEBATES = ", "") %>%
  jsonlite::fromJSON() %>%
  mutate(
    # presumably seconds -> minutes; confirm against the NYT payload
    elapsed = as.numeric(elapsed)/60,
    # was hard-coded to 2019-09-13; use the date of the page being parsed
    debate_date = as.Date("2019-06-27"),
    speaker = stri_trans_totitle(speaker),
    timestamp = parse_time(timestamp),
    topic = stri_trans_totitle(topic),
    debate_group = 1,
    night = 2
  ) %>%
  # title-casing flattens names with internal capitals/apostrophes; restore them
  mutate(
    speaker = case_when(
      speaker == "Orourke" ~ "O'Rourke",
      speaker == "Deblasio" ~ "de Blasio",
      TRUE ~ speaker
    )
  ) %>%
  # collapse topic variants onto one canonical label per subject (first match
  # wins); blank topics become "Other", unmatched topics are kept as-is
  mutate(
    topic = case_when(
      topic == "" ~ "Other",
      grepl("Campaign", topic) ~ "Campaign Finance Reform",
      grepl("Civil", topic) ~ "Civil Rights",
      grepl("Climate", topic) ~ "Climate",
      grepl("Foreign", topic) ~ "Foreign Policy",
      grepl("Gun", topic) ~ "Gun Control",
      grepl("Election", topic) ~ "Elections Reform",
      grepl("Health", topic) ~ "Healthcare",
      grepl("Party", topic) ~ "Party Strategy",
      grepl("Women", topic) ~ "Womens Rights",
      TRUE ~ topic
    )
  ) %>%
  # keep only candidate rows with a parseable timestamp
  filter(
    !is.na(timestamp),
    speaker != "",
    speaker != "Moderator"
  ) %>%
  as_tibble() -> jun_day_2
 # Debate group 2, night 1 (page dated 2019-07-30).
 # Extracts the NYTG_DEMDEBATES JSON embedded in the cached NYT page, normalises
 # speaker/topic labels, drops moderator and unusable rows, and stores the
 # result as the tibble `jul_day_1`.
 read_html(here::here("data-raw/2019-07-30-us-elections-debate-speaking-time.html")) %>%
  html_nodes(xpath = ".//script[contains(., 'NYTG_DEMDEBATES')]") %>%
  html_text() %>%
  stri_split_lines() %>%
  unlist() %>%
  # the second line of the matched script text is taken as the one holding the
  # `NYTG_DEMDEBATES = <json>` assignment for this page
  .[2] %>%
  stri_replace_first_regex("^.*NYTG_DEMDEBATES = ", "") %>%
  jsonlite::fromJSON() %>%
  mutate(
    # presumably seconds -> minutes; confirm against the NYT payload
    elapsed = as.numeric(elapsed)/60,
    # was hard-coded to 2019-09-13; use the date of the page being parsed
    debate_date = as.Date("2019-07-30"),
    speaker = stri_trans_totitle(speaker),
    timestamp = parse_time(timestamp),
    topic = stri_trans_totitle(topic),
    debate_group = 2,
    night = 1
  ) %>%
  # title-casing flattens names with internal capitals/apostrophes; restore them
  mutate(
    speaker = case_when(
      speaker == "Orourke" ~ "O'Rourke",
      speaker == "Deblasio" ~ "de Blasio",
      TRUE ~ speaker
    )
  ) %>%
  # collapse topic variants onto one canonical label per subject (first match
  # wins); blank topics become "Other", unmatched topics are kept as-is
  mutate(
    topic = case_when(
      topic == "" ~ "Other",
      grepl("Campaign", topic) ~ "Campaign Finance Reform",
      grepl("Civil", topic) ~ "Civil Rights",
      grepl("Climate", topic) ~ "Climate",
      grepl("Foreign", topic) ~ "Foreign Policy",
      grepl("Gun", topic) ~ "Gun Control",
      grepl("Election", topic) ~ "Elections Reform",
      grepl("Health", topic) ~ "Healthcare",
      grepl("Party", topic) ~ "Party Strategy",
      grepl("Women", topic) ~ "Womens Rights",
      TRUE ~ topic
    )
  ) %>%
  # keep only candidate rows with a parseable timestamp
  filter(
    !is.na(timestamp),
    speaker != "",
    speaker != "Moderator"
  ) %>%
  as_tibble() -> jul_day_1
 # Debate group 2, night 2 (page dated 2019-07-31).
 # Extracts the NYTG_DEMDEBATES JSON embedded in the cached NYT page, normalises
 # speaker/topic labels, drops moderator and unusable rows, and stores the
 # result as the tibble `jul_day_2`.
 read_html(here::here("data-raw/2019-07-31-us-elections-debate-speaking-time.html")) %>%
  html_nodes(xpath = ".//script[contains(., 'NYTG_DEMDEBATES')]") %>%
  html_text() %>%
  stri_split_lines() %>%
  unlist() %>%
  # the second line of the matched script text is taken as the one holding the
  # `NYTG_DEMDEBATES = <json>` assignment for this page
  .[2] %>%
  stri_replace_first_regex("^.*NYTG_DEMDEBATES = ", "") %>%
  jsonlite::fromJSON() %>%
  mutate(
    # presumably seconds -> minutes; confirm against the NYT payload
    elapsed = as.numeric(elapsed)/60,
    # was hard-coded to 2019-09-13; use the date of the page being parsed
    debate_date = as.Date("2019-07-31"),
    speaker = stri_trans_totitle(speaker),
    timestamp = parse_time(timestamp),
    topic = stri_trans_totitle(topic),
    debate_group = 2,
    night = 2
  ) %>%
  # title-casing flattens names with internal capitals/apostrophes; restore them
  mutate(
    speaker = case_when(
      speaker == "Orourke" ~ "O'Rourke",
      speaker == "Deblasio" ~ "de Blasio",
      TRUE ~ speaker
    )
  ) %>%
  # collapse topic variants onto one canonical label per subject (first match
  # wins); blank topics become "Other", unmatched topics are kept as-is
  mutate(
    topic = case_when(
      topic == "" ~ "Other",
      grepl("Campaign", topic) ~ "Campaign Finance Reform",
      grepl("Civil", topic) ~ "Civil Rights",
      grepl("Climate", topic) ~ "Climate",
      grepl("Foreign", topic) ~ "Foreign Policy",
      grepl("Gun", topic) ~ "Gun Control",
      grepl("Election", topic) ~ "Elections Reform",
      grepl("Health", topic) ~ "Healthcare",
      grepl("Party", topic) ~ "Party Strategy",
      grepl("Women", topic) ~ "Womens Rights",
      TRUE ~ topic
    )
  ) %>%
  # keep only candidate rows with a parseable timestamp
  filter(
    !is.na(timestamp),
    speaker != "",
    speaker != "Moderator"
  ) %>%
  as_tibble() -> jul_day_2
 # Debate group 3, night 1 (page dated 2019-09-12).
 # Extracts the NYTG_DEMDEBATES JSON embedded in the cached NYT page, normalises
 # speaker/topic labels, drops moderator and unusable rows, and stores the
 # result as the tibble `sep_day_1`.
 read_html(here::here("data-raw/2019-09-12-us-elections-debate-speaking-time.html")) %>%
  html_nodes(xpath = ".//script[contains(., 'NYTG_DEMDEBATES')]") %>%
  html_text() %>%
  stri_split_lines() %>%
  unlist() %>%
  # the third line of the matched script text is taken as the one holding the
  # `NYTG_DEMDEBATES = <json>` assignment for this page
  .[3] %>%
  stri_replace_first_regex("^.*NYTG_DEMDEBATES = ", "") %>%
  jsonlite::fromJSON() %>%
  mutate(
    # presumably seconds -> minutes; confirm against the NYT payload
    elapsed = as.numeric(elapsed)/60,
    # was 2019-09-13, one day after the date of the page being parsed
    debate_date = as.Date("2019-09-12"),
    speaker = stri_trans_totitle(speaker),
    timestamp = parse_time(timestamp),
    topic = stri_trans_totitle(topic),
    debate_group = 3,
    night = 1
  ) %>%
  # title-casing flattens names with internal capitals/apostrophes; restore them
  mutate(
    speaker = case_when(
      speaker == "Orourke" ~ "O'Rourke",
      speaker == "Deblasio" ~ "de Blasio",
      TRUE ~ speaker
    )
  ) %>%
  # collapse topic variants onto one canonical label per subject (first match
  # wins); blank topics become "Other", unmatched topics are kept as-is
  mutate(
    topic = case_when(
      topic == "" ~ "Other",
      grepl("Campaign", topic) ~ "Campaign Finance Reform",
      grepl("Civil", topic) ~ "Civil Rights",
      grepl("Climate", topic) ~ "Climate",
      grepl("Foreign", topic) ~ "Foreign Policy",
      grepl("Gun", topic) ~ "Gun Control",
      grepl("Election", topic) ~ "Elections Reform",
      grepl("Health", topic) ~ "Healthcare",
      grepl("Party", topic) ~ "Party Strategy",
      grepl("Women", topic) ~ "Womens Rights",
      TRUE ~ topic
    )
  ) %>%
  # keep only candidate rows with a parseable timestamp
  filter(
    !is.na(timestamp),
    speaker != "",
    speaker != "Moderator"
  ) %>%
  as_tibble() -> sep_day_1
 # Stack the five per-night tibbles, in the order they were built, into the
 # `debates2019` data frame and save it as package data, overwriting any
 # previously saved copy.
 debates2019 <- bind_rows(
   list(jun_day_1, jun_day_2, jul_day_1, jul_day_2, sep_day_1)
 )
 usethis::use_data(debates2019, overwrite = TRUE)
--- a/data/debates2019.rda
+++ b/data/debates2019.rda
--- a/man/debates2019.Rd
+++ b/man/debates2019.Rd
@ -3,8 +3,9 @@
 \docType{data}
 \name{debates2019}
 \alias{debates2019}
-\title{June 2019 U.S. Democratic Debate Candidate/Topic Times}
+\title{2019-2020 U.S. Democratic Debate Candidate/Topic Times}
-\format{data frame with columns: \code{elapsed} (dbl), \code{timestamp} (drtn), \code{speaker} (chr), \code{topic} (chr)}
+\format{data frame with columns: \code{elapsed} (dbl), \code{timestamp} (time), \code{speaker} (chr), \code{topic} (chr),
 \code{debate_date} (date), \code{debate_group} (dbl), \code{night} (dbl)}
 \usage{
 data("debates2019")
 }
@ -16,8 +17,14 @@ topic times. The complete datasets (See References) also include moderator
 metadata and opening/closing statement records.
 }
 \references{
-\url{https://www.nytimes.com/interactive/2019/admin/100000006581096.embedded.html}
+\url{https://www.nytimes.com/interactive/2019/06/26/us/elections/debate-speaking-time.html}
-\url{https://www.nytimes.com/interactive/2019/admin/100000006584572.embedded.html}
+\url{https://www.nytimes.com/interactive/2019/06/27/us/elections/debate-speaking-time.html}
 \url{https://www.nytimes.com/interactive/2019/07/30/us/elections/debate-speaking-time.html}
 \url{https://www.nytimes.com/interactive/2019/07/31/us/elections/debate-speaking-time.html}
 \url{https://www.nytimes.com/interactive/2019/09/12/us/elections/debate-speaking-time.html}
 }
 \keyword{datasets}
--- a/man/figures/README-nyt-1.png
+++ b/man/figures/README-nyt-1.png
--- a/vignettes/using-ggchicklet.Rmd
+++ b/vignettes/using-ggchicklet.Rmd
@ -51,7 +51,10 @@ The `elapsed` column contains how long the candidate spoke and `timestamp` is th
 There are also candidates:
 ```{r data-ex-01}
-distinct(debates2019, speaker) %>% 
+debates2019 %>% 
  filter(debate_group == 1) %>% 
  filter(night == 1) %>% 
  distinct(speaker) %>% 
  arrange(speaker) %>% 
  print(n=nrow(.))
 ```
@ -59,7 +62,10 @@ distinct(debates2019, speaker) %>%
 and the topics debated:
 ```{r data-ex-02}
-distinct(debates2019, topic) %>% 
+debates2019 %>% 
  filter(debate_group == 1) %>% 
  filter(night == 1) %>% 
  distinct(topic) %>% 
  arrange(topic) %>% 
  print(n=nrow(.))
 ```
@ -76,6 +82,7 @@ the segments by topic.
 ```{r chicklet, fig.width=600/72, fig.height=600/72}
 debates2019 %>%
  filter(debate_group == 1) %>% 
  mutate(speaker = fct_reorder(speaker, elapsed, sum, .desc=FALSE)) %>%
  mutate(topic = fct_other(
    topic,