library(dplyr)
library(httr)
library(rvest)
library(stringr)

dir.create("data", showWarnings = FALSE)
dir.create("raw", showWarnings = FALSE)
dir.create("docs", showWarnings = FALSE)
for (i in 2015:1990) {

  docs = str_c("data/docs-", i, ".csv")
  if (!file.exists(docs)) {

    d = tibble()

    # (1) get lists of nominative measures
    dir.create(str_c("raw/", i), showWarnings = FALSE)

    # initialization is keyed on the absence of the first results page
    # (years that were already downloaded are skipped)
    f = str_c("raw/", i, "/001.html")
    if (!file.exists(f)) {

      cat("Year", i)
      GET("http://legifrance.gouv.fr/rechExpMesuresNominatives.do",
          query = list(champNom = "", champPrenom = "", champFonction = "",
                       champMinistere = "", champDecoration = "",
                       checkboxPeriode = "on",
                       champDatePublication1J = "01",
                       champDatePublication1M = "01",
                       champDatePublication1A = i,
                       champDatePublication2J = "31",
                       champDatePublication2M = "12",
                       champDatePublication2A = i)) %>%
        content("text", encoding = "UTF-8") %>%
        writeLines(f)

      h = read_html(f, encoding = "UTF-8")
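
      # total result count, extracted from the page's h3 heading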
      n = html_nodes(h, "h3") %>%
        html_text() %>%
        str_extract("\\d+") %>%
        na.omit() %>%
        as.integer()

      if (!length(n)) {
        cat(": empty\n") # skips 1993
        next
      }

      # results come 20 per page; page 1 is already saved, so list pages
      # 2..last (setdiff avoids 2:1 counting down when only one page exists)
      p = setdiff(seq_len(n %/% 20 + (n %% 20 > 0)), 1)

      cat(":", str_pad(n, width = 5), "document(s)",
          str_pad(1 + length(p), 5), "pages to download\n")

      # pagination links carry a session-specific fastReqId, which is the
      # reason why every search is unique and must start on page 1
      r = html_nodes(h, "a") %>%
        html_attr("href") %>%
        str_extract("rechExpMesuresNominatives.do(.*)fastReqId=\\d+") %>%
        na.omit() %>%
        unique()

      if (length(p)) {

        pb = txtProgressBar(max = length(p), style = 3)
        for (j in p) {

          f = str_c("raw/", i, "/", str_pad(j, width = 3, pad = "0"), ".html")
          GET(str_c("http://legifrance.gouv.fr/", r, "&page=", j)) %>%
            content("text", encoding = "UTF-8") %>%
            writeLines(f)

          setTxtProgressBar(pb, which(p == j))

        }

      }

      cat("\n")

    }
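
    # list all cached result pages for the year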
    cat("Year", i)
    f = str_c("raw/", i) %>% list.files(full.names = TRUE)

    if (length(f) == 1) {
      cat(": empty\n") # skips 1993
      next
    }

    # parse the lists
    for (j in f) {

      h = read_html(j, encoding = "UTF-8") %>%
        html_nodes("li.resultat1 a")

      l = html_attr(h, "href") # links
      h = html_text(h) %>%
        str_replace_all("(\\\\r|\\\\n|\\n|\\\\t|\\s)+", " ") %>%
        str_trim() # text
      w = (h != "Article") # drop generic 'Article' links

      d = rbind(d, tibble(year = i, doc = h[ w ], url = l[ w ],
                          title = NA, jorf = NA, nor = NA))

    }

    w = d$year == i
    cat(":", sum(w) %>% str_pad(width = 5), "document(s)")

    # data sample: ambassadors and consuls general
    w = w & grepl("ambassad(eur|rice)|consul(e)? général(e)?", tolower(d$doc))
    w = d$url[ w ]
    cat(length(w) %>% str_pad(width = 5), "documents sampled\n")

    # (2) get the actual documents
    if (length(w)) {

      pb = txtProgressBar(max = length(w), style = 3)
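
      # download each sampled document and extract its metadata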
      for (j in w) {

        # cache each document under its JORFTEXT identifier
        f = str_extract(j, "JORFTEXT\\d+")
        f = str_c("docs/", f, ".html")
        if (!file.exists(f))
          download.file(str_c("http://legifrance.gouv.fr/", j),
                        f, mode = "wb", quiet = TRUE)

        h = read_html(f, encoding = "UTF-8")
        setTxtProgressBar(pb, which(w == j))

        # fill in dataset columns
        d$title[ d$url == j ] = html_nodes(h, "h2") %>%
          html_text() %>%
          str_replace_all("(\\\\r|\\\\n|\\n|\\\\t|\\s)+", " ") %>%
          str_trim()

        # the .enteteTexte header holds the JORF reference and NOR identifier
        h = html_nodes(h, ".enteteTexte") %>% html_text()

        d$jorf[ d$url == j ] = str_extract(h, "JORF(.*)\\d{4}")
        d$nor[ d$url == j ] = str_extract(h, "NOR: (.*)") %>%
          str_replace("NOR:\\s", "")

      }

      cat("\n")

    }

    write.csv(d, docs, row.names = FALSE)

  }

}
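
# when the loop completes, each year has a data/docs-<year>.csv listing its
# documents, with title, JORF reference, and NOR identifier filled in for
# the sampled ones (ambassadors and consuls general)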