ALLSTATisticians in decline? A polite look at ALLSTAT email Archives

Take it slowly

# Base URL of the list server; all queries below are appended to it.
home_url <- "https://www.jiscmail.ac.uk/cgi-bin/webadmin"

# Open a polite scraping session that identifies its author to the host.
session <- polite::bow(
  home_url,
  user_agent = "Maëlle Salmon https://masalmon.eu/"
)
library("magrittr")

# Collect the links found in the list items of the ALLSTAT index page,
# keep those pointing to an archive page, and strip the fixed prefix and
# suffix so that only the identifier of each archive page remains.
date_strings <- polite::scrape(session, params = "?A0=ALLSTAT") %>%
  rvest::xml_nodes("li") %>%
  rvest::xml_nodes("a") %>%
  rvest::html_attr("href") %>%
  purrr::keep(
    function(href) {
      stringr::str_detect(href, "\\/cgi-bin\\/webadmin\\?A1\\=")
    }
  ) %>%
  stringr::str_remove("\\/cgi\\-bin\\/webadmin\\?A1\\=ind") %>%
  stringr::str_remove("\\&L\\=ALLSTAT")

# Drop the identifiers that are exactly two characters long.
is_two_chars <- stringr::str_length(date_strings) == 2
date_strings <- date_strings[!is_two_chars]
# or, equivalently, with purrr (a no-op once the line above has run)
date_strings <- purrr::discard(
  date_strings,
  function(id) stringr::str_length(id) == 2
)
#' Scrape and save the metadata of the emails listed on one archive page
#'
#' Requests the archive page identified by `date_string`, extracts the
#' subject, sender, date and size (in lines) of every email listed in its
#' table, and writes them to `data/emails_meta{date_string}.csv`.
#'
#' The fallback request relies on the global `home_url` being defined, and
#' the `data` directory must already exist.
#'
#' @param date_string Identifier of the archive page, i.e. what sits between
#'   `A1=ind` and `&L=ALLSTAT` in the archive links.
#' @param session Session object passed on to `polite::scrape()`.
#' @return The value of `readr::write_csv()`; the function is called for its
#'   side effect of writing the CSV file.
get_emails_meta_by_date <- function(date_string, session){
  message(date_string)
  params <- glue::glue("?A1=ind{date_string}&L=ALLSTAT&F=&S=&O=T&H=0&D=0&T=0")

  everything <- try(polite::scrape(session, params = params),
                    silent = TRUE)

  # at the time of writing one couldn't pass encoding to scrape
  # but now one can https://github.com/dmi3kno/polite/issues/6#issuecomment-409268730
  # inherits() is base R, so the check does not depend on the methods package
  if(inherits(everything, "try-error")){
    # fallback: plain request with an explicit encoding
    everything <- httr::GET(paste0(home_url,
                                   params)) %>%
      httr::content(encoding = "latin1")
  }

  # the argument is spelled `xpath` (lower case); R argument matching is
  # case-sensitive, so `XPath = ` is rejected as an unused argument
  everything <- everything %>%
    # there are two classes that correspond
    # to the table having two colours of rows!
    rvest::xml_nodes(xpath = '//tr[@class="normalgroup"]|//tr[@class="emphasizedgroup"]') %>%
    rvest::xml_nodes("span")

  # subjects are the text of the links inside the table cells
  subjects <- everything %>%
    rvest::xml_nodes(xpath = "//td") %>%
    rvest::xml_nodes("span") %>%
    rvest::xml_nodes("a") %>%
    rvest::html_text()

  # sender, date and size come interleaved in one vector,
  # three consecutive entries per email
  big_mess <- everything %>%
    rvest::xml_nodes(xpath = "//td[@nowrap]") %>%
    rvest::xml_nodes(xpath = "p[@class='archive']") %>%
    rvest::html_text()

  # 1st, 4th, 7th... entries: senders, minus the masked email address
  senders <- big_mess[seq_along(big_mess) %% 3 == 1]
  senders <- stringr::str_remove(senders, " \\<\\[log in to unmask\\]\\>")

  # 2nd, 5th, 8th... entries: dates
  dates <- big_mess[seq_along(big_mess) %% 3 == 2]
  dates <- lubridate::dmy_hms(dates, tz = "UTC")

  # 3rd, 6th, 9th... entries: sizes, given as "<number> lines"
  sizes <- big_mess[seq_along(big_mess) %% 3 == 0]
  sizes <- stringr::str_remove(sizes, " lines")
  sizes <- as.numeric(sizes)

  tibble::tibble(subject = subjects,
                 sender = senders,
                 date = dates,
                 size = sizes) %>%
    readr::write_csv(glue::glue("data/emails_meta{date_string}.csv"))

}
# Make sure the output folder exists, then scrape the archive pages one
# after the other; only the side effects (messages, CSV files) matter.
fs::dir_create("data")
for (date_string in date_strings) {
  get_emails_meta_by_date(date_string, session = session)
}
library("magrittr")
library("magrittr")

# Read every CSV file of the folder and stack them into one table.
emails <- fs::dir_ls("../../static/data/allstat") %>%
  purrr::map_df(readr::read_csv)

# Job adverts are the emails whose subject contains "job" in any
# capitalisation; flag the ones mentioning each of the two titles.
jobs <- emails %>%
  dplyr::filter(stringr::str_detect(subject, "[Jj][Oo][Bb]")) %>%
  dplyr::mutate(
    data_scientist = stringr::str_detect(subject, "[Dd]ata [Ss]cientist"),
    statistician = stringr::str_detect(subject, "[Ss]tatistician")
  )

# Show the adverts mentioning both titles before they are excluded.
jobs %>%
  dplyr::filter(data_scientist & statistician) %>%
  dplyr::select(subject, sender, date) %>%
  knitr::kable()

# Exclude the adverts mentioning both titles, then give each remaining
# advert a single category (as an ordered factor) and its year.
jobs <- jobs %>%
  dplyr::filter(!data_scientist | !statistician) %>%
  dplyr::mutate(
    category = factor(
      dplyr::case_when(
        data_scientist ~ "data scientist",
        statistician ~ "statistician",
        TRUE ~ "other"
      ),
      levels = c("statistician", "data scientist", "other"),
      ordered = TRUE
    ),
    year = lubridate::year(date)
  )
# First subjects of the "statistician" category
jobs %>%
  dplyr::filter(category == "statistician") %>%
  dplyr::pull(subject) %>%
  head()
## [1] "FW: JOB: Senior/Lead Statistician - GlaxoSmithKline"                   
## [2] "JOB - Clinical Trial Statisticians and SAS Programmers"                
## [3] "JOB - Medical statistician in Salvador, Brazil (2-3 years)"            
## [4] "JOB - Medical Statistician, Oxford"                                    
## [5] "JOB OPPORTUNITY - 39408 - Entry-Level Statisticians - GSK"             
## [6] "JOB: Biostatistician - Contract/Consultancy Opportunity - Switzer land"
# First subjects of the "data scientist" category
jobs %>%
  dplyr::filter(category == "data scientist") %>%
  dplyr::pull(subject) %>%
  head()
## [1] "JOB: Data Scientist, Unilever"                                 
## [2] "JOB: Data Scientist, Unilever"                                 
## [3] "JOB: Data Scientist / Machine Learning (BELFAST)"              
## [4] "JOB: Data Scientists in Belfast"                               
## [5] "JOBS: Data Scientists"                                         
## [6] "[Job] Hacker or Botnet Developer, Data Scientist (Telecommute)"
# First subjects of the "other" category
jobs %>%
  dplyr::filter(category == "other") %>%
  dplyr::pull(subject) %>%
  head()
## [1] "Job - Data manager Dept of Haematology, Imperial College, London"                  
## [2] "Re: JOB - Health Economic Modelling and Value Demonstrations"                      
## [3] "JOB - Lectureship / readership"                                                    
## [4] "JOB - Mathematical Modeller at the Health Protection Agency, Centre for Infections"
## [5] "JOB - PhD position"                                                                
## [6] "JOB - SAS Senior Statistical Programmer, Oxford"
library("ggplot2")

# Number of job adverts per year, stacked by category.
ggplot(jobs) +
  geom_bar(aes(year, fill = category)) +
  viridis::scale_fill_viridis(discrete = TRUE) +
  hrbrthemes::theme_ipsum(base_size = 14) +
  # theme tweaks must come after the complete theme, which would
  # otherwise replace them and reset the legend position
  theme(legend.position = "bottom") +
  xlab("Year (2018 not complete yet)") +
  ylab("Number of job openings") +
  ggtitle("ALLSTAT mailing list 2007-2018")
# Share of each category among the job adverts of each year.
ggplot(jobs) +
  geom_bar(aes(year, fill = category),
           position = "fill") +
  viridis::scale_fill_viridis(discrete = TRUE) +
  hrbrthemes::theme_ipsum(base_size = 14) +
  # theme tweaks must come after the complete theme, which would
  # otherwise replace them and reset the legend position
  theme(legend.position = "bottom") +
  xlab("Year (2018 not complete yet)") +
  # with position = "fill" the bars are rescaled to sum to 1,
  # so the axis shows a proportion rather than a count
  ylab("Proportion of job openings") +
  ggtitle("ALLSTAT mailing list 2007-2018")
# The five most frequent senders within each category: count the adverts
# per category and sender, sort by decreasing count inside each category
# and keep the first five rows of each group.
jobs %>%
  dplyr::count(category, sender) %>%
  dplyr::arrange(category, dplyr::desc(n)) %>%
  dplyr::group_by(category) %>%
  dplyr::filter(dplyr::row_number() <= 5L)
## # A tibble: 15 x 3
## # Groups:   category [3]
##    category       sender                     n
##    <ord>          <chr>                  <int>
##  1 statistician   James Phillips           106
##  2 statistician   Sabrina Andresen          82
##  3 statistician   James Miller              45
##  4 statistician   Angela Smythe             40
##  5 statistician   Helena Newman-Mitchell    37
##  6 data scientist James Phillips            86
##  7 data scientist Sportradar HR              5
##  8 data scientist Jason Howlin               4
##  9 data scientist Deborah Gee                3
## 10 data scientist Christos Mitas             2
## 11 other          James Phillips           671
## 12 other          Angela Smythe            223
## 13 other          Helena Newman-Mitchell   103
## 14 other          Jason Howlin              91
## 15 other          James Miller              85
library("tidytext")
# Load the stop word list used for the filtering below.
data("stop_words")

# One row per word of the subjects of the "statistician" and "data
# scientist" adverts, without the stop words and without the words that
# were used to define the categories in the first place.
words <- jobs %>%
  dplyr::filter(category != "other") %>%
  unnest_tokens(word, subject, token = "words") %>%
  dplyr::filter(
    !word %in% c(
      stop_words$word,
      "job", "statistician",
      "jobs", "statisticians",
      "data", "scientist",
      "scientists",
      "datascientistjobs"
    )
  )

# For every word used at least 10 times overall, compute its smoothed
# share among the words of each category, then the log of the ratio of
# the "data scientist" share to the "statistician" share.
word_ratios <- words %>%
  dplyr::count(word, category) %>%
  dplyr::group_by(word) %>%
  dplyr::filter(sum(n) >= 10) %>%
  dplyr::ungroup() %>%
  # one column of counts per category, 0 when a word is absent from one
  tidyr::spread(category, n, fill = 0) %>%
  # add-one smoothing so that no share is zero; a plain function replaces
  # the deprecated dplyr::funs() and works across dplyr versions
  dplyr::mutate_if(is.numeric, function(x) (x + 1) / sum(x + 1)) %>%
  dplyr::mutate(logratio = log(`data scientist` / statistician)) %>%
  # dplyr is not attached in this script, so desc() is namespaced
  dplyr::arrange(dplyr::desc(logratio))

# Horizontal bar chart of the 15 words with the largest absolute log
# ratio on each side of zero, ordered by log ratio.
word_ratios %>%
  dplyr::group_by(logratio < 0) %>%
  dplyr::top_n(15, abs(logratio)) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(word = stats::reorder(word, logratio)) %>%
  ggplot(aes(word, logratio, fill = logratio < 0)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_fill_manual(
    name = "",
    values = c("#21908CFF", "#440154FF"),
    labels = c("data scientist", "statistician")
  ) +
  hrbrthemes::theme_ipsum(base_size = 11) +
  labs(
    y = "log odds ratio (data scientist / statistician)",
    title = "ALLSTAT mailing list 2007-2018"
  )

subject	sender	date
JOB: Statisticians/Data Scientists with Unilever	Murray, Peter	2008-02-12 08:47:56
Jobs: Senior Data Scientists/Statisticians	Cox, Trevor	2008-03-12 16:24:48
20 new job ads for data scientists, statisticians	Vincent Granville	2013-06-16 00:18:27
23 new jobs for statisticians, data scientists	Vincent Granville	2013-07-27 20:05:48
19 new job ads for statisticians and data scientists	Vincent Granville	2013-10-12 23:21:23
Job Data scientist / Statistician	Andrea Schirru	2014-07-22 16:38:59
JOB: Computational Statistician / Data Scientist	David Hastie	2014-07-16 11:36:34
Job Openings: Statisticians and Data Scientists at Open Analytics (Belgium)	Tobias Verbeke	2015-06-30 09:53:21
JOBS x 2: Data Scientist/Medical Statistician at University of Manchester	Matthew Sperrin	2016-07-28 10:42:52
JOB: Data scientist – Statistician (KTP associate) @ University of Essex	Aris Perperoglou	2016-09-12 10:33:32
JOB: Principal Statistician/ Data Scientist- Pharma – Perm – Centralised Monitoring- Data Safety – Global Search-UK	Sabrina Andresen	2016-10-17 16:05:12
JOB: Research Associate – Statistician / Data Scientist, University of Manchester (Centre for Musculoskeletal Research)	Jamie Sergeant	2016-11-01 10:21:07
JOB – Statistician / Data Scientist at Cefas, UK	David Maxwell	2017-08-04 16:15:25
Job: Data Scientist/Statistician at Essex University	Aris Perperoglou	2017-08-21 13:43:43
Job: Statistician/Data Scientist	Roisin McCarthy	2017-11-06 16:49:18
Job Openings: Statisticians and Data Scientists at Open Analytics (Belgium)	Tobias Verbeke	2017-12-05 20:48:39
JOB | StatsJobs – Data Scientist/Statistician, Zurich Insurance	James Phillips	2018-01-30 08:59:49
JOB | Junior Medical statistician / Real-world data scientist – Centre of Excellence for Retrospective Studies, IQVIA (London)	Venerus, Alessandra	2018-03-05 10:33:09
JOB | Senior Medical statistician / Real-world data scientist – Centre of Excellence for Retrospective Studies, IQVIA (London)	Venerus, Alessandra	2018-03-05 10:32:51
Job: Catalyst Project Research Data Scientist / Statistician	Aris Perperoglou	2018-06-25 14:17:18

ALLSTATisticians in decline? A polite look at ALLSTAT email Archives

Webscraping ALLSTAT

Life on the edge of easy responsible webscraping

Actual webscraping a.k.a. solving XPath puzzles

Analyzing ALLSTAT jobs

Filtering jobs

Are data scientists on the rise?

Who offers data scientists’ jobs?

What are the openings about?

Conclusion