## Code to prepare the `debates2019` dataset
# read_csv(
# file = "https://rud.is/data/2019-dem-debates.csv.gz",
# col_types = cols(
# elapsed = col_double(),
# timestamp = col_time(format = ""),
# speaker = col_character(),
# topic = col_character()
# )
# ) -> debates2019
#
#
# usethis::use_data(debates2019, overwrite = TRUE)
library ( rvest )
library ( stringi )
library ( tidyverse )
# Build `debates2019` from the New York Times "debate speaking time"
# interactive pages. Each page is cached under data-raw/ and only downloaded
# when the cached copy is missing. Every page is parsed by the same reader so
# that per-debate metadata (notably `debate_date`) cannot drift between
# copy-pasted pipelines.

# Helpers ----

# URL of the NYT interactive page for the debate held on `page_date`
# ("YYYY-MM-DD"); the date appears in the URL as YYYY/MM/DD.
nyt_page_url <- function(page_date) {
  paste0(
    "https://www.nytimes.com/interactive/",
    gsub("-", "/", page_date, fixed = TRUE),
    "/us/elections/debate-speaking-time.html"
  )
}

# Location of the locally cached HTML for `page_date`, relative to the
# project root as resolved by here::here().
cached_page_path <- function(page_date) {
  here::here(
    paste0("data-raw/", page_date, "-us-elections-debate-speaking-time.html")
  )
}

# Download the page for `page_date` unless a cached copy already exists.
# Returns the cache path invisibly.
fetch_debate_page <- function(page_date) {
  cached <- cached_page_path(page_date)
  if (!file.exists(cached)) {
    download.file(nyt_page_url(page_date), cached)
  }
  invisible(cached)
}

# Title-case the raw speaker ids and repair the surnames that title-casing
# gets wrong.
fix_speaker_names <- function(speaker) {
  speaker <- stri_trans_totitle(speaker)
  case_when(
    speaker == "Orourke" ~ "O'Rourke",
    speaker == "Deblasio" ~ "de Blasio",
    TRUE ~ speaker
  )
}

# Topic recoder for the June, July and September pages: title-case the raw
# topic, then collapse it onto a display label by keyword. Empty topics
# become "Other"; unmatched topics keep their title-cased value.
recode_topic_titled <- function(topic) {
  topic <- stri_trans_totitle(topic)
  case_when(
    topic == "" ~ "Other",
    grepl("Campaign", topic) ~ "Campaign Finance Reform",
    grepl("Civil", topic) ~ "Civil Rights",
    grepl("Climate", topic) ~ "Climate",
    grepl("Foreign", topic) ~ "Foreign Policy",
    grepl("Gun", topic) ~ "Gun Control",
    grepl("Election", topic) ~ "Elections Reform",
    grepl("Health", topic) ~ "Healthcare",
    grepl("Party", topic) ~ "Party Strategy",
    grepl("Women", topic) ~ "Women's Rights",
    TRUE ~ topic
  )
}

# Topic recoder for the October page, whose topics are matched as lower-case
# slugs. Empty topics become "Other"; unmatched topics are left as-is.
recode_topic_slug <- function(topic) {
  case_when(
    topic == "" ~ "Other",
    grepl("impeachment", topic) ~ "Impeachment",
    grepl("economy", topic) ~ "Economy",
    grepl("opioids", topic) ~ "Opioids",
    grepl("candidate-age", topic) ~ "Age",
    grepl("tech-companies", topic) ~ "Tech Companies",
    grepl("middle-east policy", topic) ~ "Foreign Policy",
    grepl("gun-control", topic) ~ "Gun Control",
    grepl("income-inequality", topic) ~ "Income Inequality",
    grepl("health-care", topic) ~ "Healthcare",
    grepl("party-strategy", topic) ~ "Party Strategy",
    grepl("womens-rights", topic) ~ "Women's Rights",
    TRUE ~ topic
  )
}

# Parse one cached debate page into a tibble of speaking segments.
#
# page_date     "YYYY-MM-DD" date of the page; selects the cached file and is
#               stored as `debate_date`.
# debate_group  Debate number (1 = June, 2 = July, 3 = September,
#               4 = October), stored as a double.
# night         Night within the debate (1 or 2), stored as a double.
# json_line     Index of the line, within the <script> text mentioning
#               NYTG_DEMDEBATES, that holds the `NYTG_DEMDEBATES = ...`
#               assignment.
# recode_topic  Function mapping the raw topic vector to display labels.
#
# Rows with an unparseable timestamp, an empty speaker, or the moderator are
# dropped.
read_debate <- function(page_date, debate_group, night, json_line,
                        recode_topic) {
  script_lines <- read_html(cached_page_path(page_date)) %>%
    html_nodes(xpath = ".//script[contains(., 'NYTG_DEMDEBATES')]") %>%
    html_text() %>%
    stri_split_lines() %>%
    unlist()

  script_lines[json_line] %>%
    # Strip everything up to and including the assignment so only the JSON
    # payload remains.
    stri_replace_first_regex("^.*NYTG_DEMDEBATES = ", "") %>%
    jsonlite::fromJSON() %>%
    mutate(
      # NOTE(review): presumably converts seconds to minutes -- confirm units.
      elapsed = as.numeric(elapsed) / 60,
      # `!!` injects the function arguments so a same-named column in the
      # parsed JSON can never shadow them.
      debate_date = as.Date(!!page_date),
      speaker = fix_speaker_names(speaker),
      timestamp = parse_time(timestamp),
      topic = recode_topic(topic),
      debate_group = !!debate_group,
      night = !!night
    ) %>%
    filter(
      !is.na(timestamp),
      speaker != "",
      speaker != "Moderator"
    ) %>%
    as_tibble()
}

# Download (cached) ----

page_dates <- c(
  "2019-06-26", "2019-06-27",
  "2019-07-30", "2019-07-31",
  "2019-09-12",
  "2019-10-15"
)

for (page_date in page_dates) {
  fetch_debate_page(page_date)
}

# Parse each debate night ----

# The July pages carry the JSON on the 2nd line of the script text; all other
# pages carry it on the 3rd.
jun_day_1 <- read_debate(
  "2019-06-26",
  debate_group = 1, night = 1, json_line = 3,
  recode_topic = recode_topic_titled
)

jun_day_2 <- read_debate(
  "2019-06-27",
  debate_group = 1, night = 2, json_line = 3,
  recode_topic = recode_topic_titled
)

jul_day_1 <- read_debate(
  "2019-07-30",
  debate_group = 2, night = 1, json_line = 2,
  recode_topic = recode_topic_titled
)

jul_day_2 <- read_debate(
  "2019-07-31",
  debate_group = 2, night = 2, json_line = 2,
  recode_topic = recode_topic_titled
)

sep_day_1 <- read_debate(
  "2019-09-12",
  debate_group = 3, night = 1, json_line = 3,
  recode_topic = recode_topic_titled
)

oct_day_1 <- read_debate(
  "2019-10-15",
  debate_group = 4, night = 1, json_line = 3,
  recode_topic = recode_topic_slug
)

# Combine and save ----

debates2019 <- bind_rows(
  jun_day_1,
  jun_day_2,
  jul_day_1,
  jul_day_2,
  sep_day_1,
  oct_day_1
)

usethis::use_data(debates2019, overwrite = TRUE)