-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper_html.R
152 lines (125 loc) · 5.09 KB
/
scraper_html.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
library(tidyverse)
library(magrittr)
library(lubridate)
library(rvest)
library(urltools)
library(arrow)
#import rds if available
if (file.exists("corona_update_transcripts.rds")) {
corona_update_transcripts <- readRDS(file = "corona_update_transcripts.rds")
}
extract_last_change <- function(.episode_html) {
.episode_html %>%
html_node(css = ".lastchanged") %>%
html_text() %>%
str_remove(pattern = "[:alpha:]+[:punct:]") %>%
str_remove(pattern = "Uhr") %>%
str_squish() %>%
lubridate::dmy_hm()
}
#TODO get episode length from iframe player
extract_player_iframe_url <- function(.episode_html){
.episode_html %>%
html_node("iframe") %>%
html_attr("src")
}
extract_episode_length <- function(.episode_html) {
.episode_html %>%
html_node(css = ".textcontent h2") %>%
html_text() %>%
str_extract(pattern = "(?<=\\().{2,20}(?=\\)$)")
}
extract_transcript_nodes <- function(.episode_html) {
.episode_html %>%
#get all siblings of node p after the last node a that starts with # for the href attribute
html_nodes(xpath = '//p[a[starts-with(@href, "#")]][last()]/following-sibling::p')
}
extract_speaker_name <- function(.transcript_nodes) {
html_node(x = .transcript_nodes, xpath = "strong") %>%
html_text(trim = TRUE) %>%
str_squish() %>%
str_extract(pattern = "^[:upper:][:alpha:]+.+\\:$") %>%
str_remove(pattern = ":") %>%
#manual fix for episode 38
str_replace(pattern = "Eine Bitte an unsere Hörer", replacement = "Korinna Hennig") %>%
str_squish() %>%
na_if(y = "")
}
extract_transcript <- function(.episode_url) {
#sleep to be polite
Sys.sleep(runif(1, min = 0.5, max = 2))
#get html for episode_url
episode_html <- read_html(.episode_url)
#extract all information via functions
episode_last_change <- extract_last_change(episode_html)
episode_length <- extract_episode_length(episode_html)
transcript_nodes <- extract_transcript_nodes(episode_html)
speaker_names <- extract_speaker_name(transcript_nodes)
speaker_text <- html_text(transcript_nodes, trim = TRUE)
#put it all together and some clean up on the speaker column
tibble(speaker = speaker_names,
text = speaker_text) %>%
tidyr::fill(speaker, .direction = "down") %>%
drop_na() %>%
mutate(text = str_remove(text, pattern = speaker) %>%
str_remove(pattern = "^\\:") %>%
str_squish(),
paragraph_no = row_number()) %>%
nest(speaker = speaker, text = text, paragraph_no = paragraph_no) %>%
mutate(last_change = episode_last_change,
duration_episode = episode_length)
}
all_episodes_url <- "https://www.ndr.de/nachrichten/info/Coronavirus-Update-Alle-Folgen,podcastcoronavirus134.html"
#read html of podcast homepage
corona_update_html <- read_html(all_episodes_url)
#get list of episodes including urls to transcript
corona_update_transcripts <- corona_update_html %>%
html_nodes(css = ".std h2") %>%
map_df(~{
title <- .x %>% html_nodes('a') %>% html_text() %>% str_trim()
link <- .x %>% html_nodes('a') %>% html_attr("href")
tibble(title, link)
}) %>%
mutate(link = paste0("https://", domain(all_episodes_url), link),
episode_no = str_extract(string = title, pattern = "(?<=\\()[:digit:]+(?=\\))"),
episode_no = as.integer(episode_no),
#clean title
title = trimws(str_remove(string = title, pattern = "\\(.+\\)[:blank:]") %>%
str_remove("Coronavirus-Update\\: "))) %>%
#get transcript data and unnest the results to get a big data frame
mutate(result_text = map(.x = link, .f = extract_transcript)) %>%
unnest(result_text) %>%
unnest(c(paragraph_no, speaker, text))
#sanity check of speaker names
speaker_names <- count(corona_update_transcripts, speaker, sort = TRUE)
speaker_names
#manually clean the speaker names from typos and different variants
corona_update_transcripts <- corona_update_transcripts %>%
mutate(speaker = case_when(
speaker %in% c("Hennig",
"Hennig",
"Henning",
"Korinna Hennig") ~ "Korinna Hennig",
speaker %in% c("Drosten") ~ "Christian Drosten",
speaker %in% c("Ciesek",
"Cisek",
"Sandra Cisek") ~ "Sandra Ciesek",
speaker %in% c("Martini") ~ "Anja Martini",
speaker %in% c("Rohde") ~ "Gernot Rohde",
speaker %in% c("Kluge") ~ "Stefan Kluge",
speaker %in% c("Kriegel") ~ "Martin Kriegel",
speaker %in% c("Wieler") ~ "Lothar Wieler",
speaker %in% c("Addo") ~ "Marylyn Addo",
speaker %in% c("Muntau") ~ "Ania Muntau",
speaker %in% c("Buyx") ~ "Alena Buyx",
speaker %in% c("Greiner",
"Wolfang Greiner") ~ "Wolfgang Greiner",
speaker %in% c("Prof. Dr. Hans-Georg Eichler",
"Eichler") ~ "Hans-Georg Eichler",
TRUE ~ speaker
)
)
#export transcript tibble
saveRDS(corona_update_transcripts, file = "corona_update_transcripts.rds")
write_parquet(corona_update_transcripts, sink = "corona_update_transcripts.parquet")
write_csv(corona_update_transcripts, file = "corona_update_transcripts.csv")