Webscraping Art Auction Data
[This article was first published on NYC Data Science Academy » R, and kindly contributed to R-bloggers]. (You can report an issue about the content on this page here)
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.
Check out my:
Introduction
Goal
Scrape all of the data off of the Blouin Art Sales Index: http://artsalesindex.artinfo.com
Method
Step 1 — Gather URLs for all artists from search directory
Step 2 — Gather URLs for all pieces from Artist Bio Pages
Step 3 — Scrape data for each individual piece
Step 1
#-----------Packages--------------------------------------------------------
library(dplyr)
library(rvest)
library(httr)

#************************************ STEP 1 *****************************************
# Build the A-Z search-directory URLs, collect every artist link found on them,
# and turn those links into the artist profile URLs stored in `pages`.

#------------Definitions-----------------------------------------------------
url <- "http://artsalesindex.artinfo.com/asi/search/artistLanding.ai"
path1 <- "?lastName="
path2 <- "&startRowNum="

#----------Number of increments for each letter------------------------------
# `number` is the largest startRowNum requested for each initial letter.
alpha_length <- data.frame(
  letters = LETTERS,
  number = c(
    9450, 25950, 18600, 12600, 4650, 10200, 15450, 16350, 1500, 5400,
    10800, 15600, 24000, 5550, 3450, 14700, 750, 13200, 25200, 9300,
    900, 6600, 11550, 750, 1950, 3750
  )
)

#-----------Generate all of the A-Z webpages that will be scraped---------------
# One URL per (letter, start row); start rows run from 0 to `number` in steps
# of 150.
pages <- unlist(
  Map(
    function(letter, last_row) {
      offsets <- seq(from = 0, to = last_row, by = 150)
      paste0(url, path1, letter, path2, as.character(offsets))
    },
    as.character(alpha_length$letters),
    alpha_length$number
  ),
  use.names = FALSE
)

#------------------Scrape the Artists' Names--------------------------------
# Return the href of every link inside the ".artist-list" list items of a
# parsed directory page.
parse.function <- function(x) {
  x %>%
    html_nodes(".artist-list li a") %>%
    html_attr(name = "href")
}

# Scrape through each page, collecting results in a preallocated list instead
# of growing a vector on every iteration.
n_pages <- length(pages) #---- <><><><><><><><><><> Change for test
scraped <- vector("list", n_pages)
for (i in seq_len(n_pages)) {
  message(i, " out of ", n_pages)
  Sys.sleep(.5)
  art_site <- rvest::read_html(pages[i])
  scraped[[i]] <- parse.function(art_site)
}
names <- as.character(unlist(scraped))

# Text Manipulation
names <- gsub(" ", "%20", names)
# NOTE(review): the search pattern on the next line is empty as published,
# presumably because a non-ASCII character was lost. A zero-length fixed
# pattern is not a meaningful substitution -- restore the intended character
# before running.
names <- gsub("", "%20", names, fixed=TRUE)
names <- gsub("_Asgar/Gabriel", "_Asgar%20Gabriel", names)
# NOTE(review): "?" is interpreted as a regular expression here, not as a
# literal question mark. If literal removal is intended, use fixed = TRUE --
# confirm first, since it would also strip "?" from any query string.
names <- gsub("?", "", names)

# Keep the one-column data frame (column ".") that later code indexes.
names <- data.frame(names)
colnames(names) <- "."

url <- "http://artsalesindex.artinfo.com"

# Art Profile pages to be scraped. The guard is needed because paste0() would
# otherwise return the bare base URL when there are no artist links.
pages <- if (nrow(names) > 0) paste0(url, names$.) else character(0)
Step 2
###******************************** STEP 2 *************************************
# Visit every artist profile URL in `pages`, collect the link to each listed
# piece, and replace `pages` with the full URLs of those piece pages.

#------------------Scrape the Artists' Profile Pages-----------------------
# Return the href of every link inside ".results-title" on a parsed profile
# page.
parsefunction <- function(x) {
  x %>%
    rvest::html_nodes(".results-title a") %>%
    rvest::html_attr(name = "href")
}

# Collect per-page results in a preallocated list instead of growing a vector
# on every iteration.
n_profiles <- length(pages) #----- <><><><><> Change for test
piece_links <- vector("list", n_profiles)
system.time(
  for (i in seq_len(n_profiles)) {
    message(i, " out of ", n_profiles)
    Sys.sleep(1)
    art_site_bio <- rvest::read_html(pages[i])
    piece_links[[i]] <- parsefunction(art_site_bio)
  }
)

#---------------------------------------------------------------
# String final URL for piece pages
url <- "http://artsalesindex.artinfo.com"
art.pieces <- data.frame(art.pieces = as.character(unlist(piece_links)))

# The guard is needed because paste0() would otherwise return the bare base
# URL when no piece links were found.
pages <- if (nrow(art.pieces) > 0) {
  paste0(url, art.pieces$art.pieces)
} else {
  character(0)
}
Step 3
##************************************* STEP 3 ****************************************
# Visit every piece URL in `pages`, extract one value per field, and assemble
# the results into the `final` data frame (one row per piece).

# Placeholder stored when a field is absent from a page.
missing_value <- "NA"

# Return element `k` of `x`, or the placeholder when `x` has fewer than `k`
# elements.
pick_field <- function(x, k = 1) {
  if (length(x) < k) missing_value else x[k]
}

# Text of every node matching `css` on a parsed page.
text_of <- function(page, css) {
  page %>%
    html_nodes(css) %>%
    html_text()
}

# Value of attribute `attr` for every node matching `css` on a parsed page.
attr_of <- function(page, css, attr) {
  page %>%
    html_nodes(css) %>%
    html_attr(name = attr)
}

n_pieces <- length(pages) #----- <><><><><><><><><><><><><><><><><> Change for test

# Preallocate one slot per piece for every output column.
artist <- character(n_pieces)
artist.nationality <- character(n_pieces)
title <- character(n_pieces)
year <- character(n_pieces)
lot.number <- character(n_pieces)
auction.data <- character(n_pieces)
price <- character(n_pieces)
lot.details <- character(n_pieces)
materials <- character(n_pieces)
measurements <- character(n_pieces)
description <- character(n_pieces)
markings <- character(n_pieces)
image.link <- character(n_pieces)

for (i in seq_len(n_pieces)) {
  message(i, " out of ", n_pieces)
  Sys.sleep(1)
  piece_html <- rvest::read_html(pages[i])

  #-----------------Artist
  artist[i] <- pick_field(attr_of(piece_html, "#artistName", "value"))
  #-----------------Artist Nationality
  artist.nationality[i] <- pick_field(text_of(piece_html, ".artist-nationality"))
  #-----------------Title
  title[i] <- pick_field(text_of(piece_html, ".title"))
  #-----------------Year
  year[i] <- pick_field(text_of(piece_html, "#artworkIndex_0 p:nth-child(3)"))
  #-----------------Lot Number
  lot.number[i] <- pick_field(text_of(piece_html, ".lotnumber"))
  #-----------------Auction Data
  auction.data[i] <- pick_field(text_of(piece_html, ".auctiondata"))
  #-----------------Price
  price[i] <- pick_field(text_of(piece_html, ".price"))

  #-----------------Lot Details
  # Take the second ".lot-details1" node from the scraped page. The previous
  # code read element 2 of the `lot.details` result vector instead, so the
  # scraped text was never stored.
  lot.details[i] <- pick_field(text_of(piece_html, ".lot-details1"), 2)

  #-----------------Details -------------------------------###
  # The ".artworkdetails" nodes are read by position: 1 = materials,
  # 2 = measurements, 3 = description, 4 = markings.
  details <- text_of(piece_html, ".artworkdetails")
  #-----------------Materials
  materials[i] <- pick_field(details, 1)
  #-----------------Measurements
  measurements[i] <- pick_field(details, 2)
  #-----------------Description
  description[i] <- pick_field(details, 3)
  #-----------------Markings
  markings[i] <- pick_field(details, 4)

  #-----------------Image Link
  image.link[i] <- pick_field(attr_of(piece_html, "#imageIndex_0 img", "src"))
}

final <- data.frame(
  artist = artist,
  artist.nationality = artist.nationality,
  title = title,
  year = year,
  lot.number = lot.number,
  auction.data = auction.data,
  price = price,
  lot.details = lot.details,
  materials = materials,
  measurements = measurements,
  description = description,
  markings = markings,
  image.link = image.link
)
To leave a comment for the author, please follow the link and comment on their blog: NYC Data Science Academy » R.
R-bloggers.com offers daily e-mail updates about R news and tutorials about learning R and many other topics. Click here if you're looking to post or find an R/data-science job.
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.