-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfunction_scraper.R
111 lines (101 loc) · 3.86 KB
/
function_scraper.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#--------------------------------------------------
## Get all links from each index ###
find_all_links <- function(pages_pbs) {
res_pbs = c()
for (link in pages_pbs){
tmp <- link %>%
read_html %>%
html_nodes('#medicine-item > tbody > tr:nth-child(n) > td > a') %>%
html_attr('href') %>%
paste('https://www.pbs.gov.au',.,sep='')
res_pbs = c(res_pbs, tmp)
}
return(res_pbs)
}
#--------------------------------------------------
## Get all sub links within one drug link ###
find_all_sublinks <- function(all_pbs) {
all_subpbs = c()
for (link in all_pbs){
tryCatch(
tmp_sub <- link %>%
read_html %>%
html_nodes('#content > div > div > div:nth-child(4) > div > table > tbody > tr:nth-child(1) > td > ul > li:nth-child(n) > a') %>%
html_attr('href'), error = function(e){NA}
) #%>%
#paste('https://www.pbs.gov.au',.,sep='')
all_subpbs = c(all_subpbs, tmp_sub)
}
return(all_subpbs)
}
#--------------------------------------------------
## Build data table extracting for each link the following information:
# drug_item, drug_name, source, bodysystem_level1, bodysystem_level2, bodysystem_level3,
# product_form, max_qty_packs, max_units, n_repeat, DPMQ, max_safnet, patient_charge
# Parse nodes/columns within main table
get_node <- function(page, x, n=1, n_el = 7){ # expected number of nodes is 7
if (x != "numerical_summary"){
return (t(matrix(page %>%
html_nodes(x) %>%
html_text(), ncol = n)))
}
l = page %>% html_nodes("#medicine-item")
tmp = lapply(l, function(x) get_first_complete_row(x, n_el))
return (do.call(rbind, tmp))
}
# Scrape first complete row
get_first_complete_row <- function(x, n){
tmp = x %>% html_nodes("tr:nth-child(2) > td.align-top") %>% html_text()
if (length(tmp) == n){
return (tmp)
}
else {
tmp = lapply(x %>% html_nodes("tr"), function(y) y %>% html_nodes("td") %>% html_text())
for (item in tmp){
if (length(item) == n){
return(item)
}
}
}
}
# Build data frame
build_df <- function(links, res) {
#res = vector(mode="list", length = length(links))
for (i in 1:length(links)) {
if (!is.null(res[[i]])){
print(paste0("Already scraped: ", links[i]))
next
}
link = links[i]
download.file(link, destfile="tmp.html", quiet=TRUE)
page = read_html("tmp.html")
N = length(page %>% html_nodes("#medicine-item"))
Nbl = length(page %>% html_nodes("#medicine-summary > tbody > tr:nth-child(2) > td.summary-body > a")) %/% N
## for 7 nodes
cols = c("DrugItem", "DrugName", "Source", paste0("BodyLevel", 1:Nbl),
"ProductForm", "to_drop", "MaxQtyPacks", "MaxUnits", "NRepeat", "DPMQ", "MaxSafeNet", "PatientCharge")
## for 6 nodes
# cols = c("DrugItem", "DrugName", "Source", "to_drop", paste0("BodyLevel", 1:Nbl),
# "ProductForm", "drug", "MaxAmount", "NRepeat", "DPMA", "MaxSafeNet", "PatientCharge")
nodes = c("span.item-code",
"#content > div > div:nth-child(n) > div > h1",
"#medicine-summary > tbody > tr:nth-child(n) > td.summary-body > a",
"span.form-strength",
"numerical_summary")
tmp = tryCatch(as.data.frame(do.call(cbind, lapply(nodes, function(x) get_node(page, x, N, 7)))), # change to 6 when dealing with 6 nodes
error=function(cond) cond)
if(inherits(tmp, "error")) {
message(paste0(i, ": The following link broke: ", link))
next
}
if (ncol(tmp) != length(cols)){
print(paste0("problematic link: ", link))
next
}
colnames(tmp) = cols
res[[i]] <<- tmp#tmp[, names(tmp) != "to_drop"] ## Assigns in the parent environment, not ideal but connection to PBS is flaky
#print()
print(paste0(i,": Successfully scraped ", link))
}
#return(res)
}