Webscraping Art Auction Data
[This article was first published on NYC Data Science Academy » R, and kindly contributed to R-bloggers]. (You can report an issue about the content on this page here.)
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.
Check out my:
Introduction
Goal
Scrape all of the data off of the Blouin Art Sales Index: http://artsalesindex.artinfo.com
Method
Step 1 — Gather URLs for all artists from search directory
Step 2 — Gather URLs for all pieces from Artist Bio Pages
Step 3 — Scrape data for each individual piece
Step 1

#-----------Packages--------------------------------------------------------
library(dplyr)
library(rvest)
library(httr)
#************************************ STEP 1 *****************************************
#------------Definitions-----------------------------------------------------
# Address of the artist search directory, plus the two query-string fragments
# that select a last-name initial and a starting row offset.
url <- "http://artsalesindex.artinfo.com/asi/search/artistLanding.ai"
path1 <- "?lastName="
path2 <- "&startRowNum="
#----------Number of increments for each letter------------------------------
# One row per initial A-Z; `number` is the highest startRowNum offset that is
# requested for that initial.
alpha_length <- data.frame(
  letters = LETTERS,
  number = c(9450, 25950, 18600, 12600, 4650, 10200, 15450, 16350, 1500, 5400,
             10800, 15600, 24000, 5550, 3450, 14700, 750, 13200, 25200, 9300,
             900, 6600, 11550, 750, 1950, 3750),
  stringsAsFactors = FALSE
)
#-----------Generate all of the A-Z webpages that will be scraped---------------
# Offsets advance in steps of 150 rows, from 0 up to the letter's maximum.
page_size <- 150L
# Build each letter's URLs in one vectorised paste0() call instead of growing
# `pages` one element at a time. Integer offsets are used so that large values
# are never rendered in scientific notation (as.character(1e5) is "1e+05").
pages <- unlist(
  lapply(seq_len(nrow(alpha_length)), function(row) {
    offsets <- seq.int(
      from = 0L,
      to = as.integer(alpha_length$number[row]),
      by = page_size
    )
    # Use the full column name; `$letter` only worked through partial matching.
    paste0(url, path1, alpha_length$letters[row], path2, offsets)
  }),
  use.names = FALSE
)
#------------------Scrape the Artists' Names--------------------------------
# Extract the "href" attribute of every node matched by the CSS selector
# ".artist-list li a" in a parsed page.
#   x: a parsed HTML document, as accepted by html_nodes().
#   Returns the result of html_attr() for the matched nodes.
parse.function <- function(x) {
  artist_anchors <- html_nodes(x, ".artist-list li a")
  html_attr(artist_anchors, name = "href")
}
# Holds every artist href found across the directory pages. The variable keeps
# the name `names` because the code that follows reads it under that name.
names <- c()
#Scrape through each page
# Results are collected per page in a pre-sized list and flattened once at the
# end, rather than re-copying a growing vector on every iteration.
page_links <- vector("list", length(pages))
for (i in seq_along(pages)) { #---- length(pages) <><><><><><><><><><> Change for test
  print(c(i, "out of", length(pages)))
  Sys.sleep(.5)  # pause between requests
  # read_html() replaces html(), which current rvest releases no longer provide.
  art_site <- rvest::read_html(pages[i])
  page_links[[i]] <- parse.function(art_site)
}
names <- unlist(page_links, use.names = FALSE)
#Text Manipulation
# Clean the scraped hrefs so each one can be appended to the site root.
# Every pattern below is a literal string, so fixed = TRUE is used throughout.
names <- gsub(" ", "%20", names, fixed = TRUE)
# NOTE(review): the original pattern here was an empty string, which cannot be
# used as a fixed pattern. The character was presumably lost when the code was
# published; a non-breaking space is assumed -- confirm against real hrefs.
names <- gsub("\u00a0", "%20", names, fixed = TRUE)
names <- gsub("_Asgar/Gabriel", "_Asgar%20Gabriel", names, fixed = TRUE)
# A bare "?" is a regex quantifier, not a literal question mark; fixed = TRUE
# makes this strip the actual character.
names <- gsub("?", "", names, fixed = TRUE)
# Store as a one-column data frame. The column is named "." on purpose: the
# code that builds the profile-page URLs reads it as `names$.`.
names <- setNames(data.frame(names, stringsAsFactors = FALSE), ".")
url <- "http://artsalesindex.artinfo.com"
#Art Profile pages to be scraped
# Prefix every artist href with the site root in one vectorised call.
artist_paths <- as.character(names$.)
# Guard the empty case explicitly: paste0(url, character(0)) would return the
# bare root URL rather than an empty vector.
pages <- if (length(artist_paths) > 0) {
  paste0(url, artist_paths)
} else {
  character(0)
}
Step 2
###******************************** STEP 2 *************************************
#------------------Scrape the Artists' Profile Pages-----------------------
# Extract the "href" attribute of every node matched by the CSS selector
# ".results-title a" in a parsed page.
#   x: a parsed HTML document, as accepted by rvest::html_nodes().
#   Returns the result of rvest::html_attr() for the matched nodes.
parsefunction <- function(x) {
  result_anchors <- rvest::html_nodes(x, ".results-title a")
  rvest::html_attr(result_anchors, name = "href")
}
# Every artwork href found across the artist profile pages.
art.pieces <- character()
# Collected per page in a pre-sized list and flattened once after the loop,
# instead of re-copying a growing vector on every iteration.
piece_links <- vector("list", length(pages))
# system.time() reports how long the whole crawl took.
system.time(
  for (i in seq_along(pages)) { #----- length(pages) <><><><><>Change for test
    print(c(i, "out of", length(pages)))
    Sys.sleep(1)  # pause between requests
    # read_html() replaces html(), which current rvest releases no longer provide.
    art_site_bio <- read_html(pages[i])
    piece_links[[i]] <- parsefunction(art_site_bio)
  })
# as.character() keeps the result a character vector even when nothing was found.
art.pieces <- as.character(unlist(piece_links, use.names = FALSE))
#---------------------------------------------------------------
#String final URL for piece pages
url <- "http://artsalesindex.artinfo.com"
# One-column data frame; the column is named `art.pieces`, as before.
art.pieces <- data.frame(art.pieces = art.pieces, stringsAsFactors = FALSE)
# Prefix every artwork href with the site root in one vectorised call.
piece_paths <- as.character(art.pieces$art.pieces)
# Guard the empty case explicitly: paste0(url, character(0)) would return the
# bare root URL rather than an empty vector.
pages <- if (length(piece_paths) > 0) {
  paste0(url, piece_paths)
} else {
  character(0)
}
Step 3
##************************************* STEP 3 ****************************************
# Pick one scraped value, falling back to the string "NA" when it is absent.
#   values: vector returned by html_text() / html_attr() for a selector.
#   index:  which match to keep (defaults to the first).
# Returns a length-one value, so assigning it into a result slot never
# triggers a replacement-length warning when a selector matches several nodes.
field.or.na <- function(values, index = 1L) {
  if (length(values) < index) {
    return("NA")
  }
  values[index]
}

# Text of every node matched by `css` in `page`.
node.text <- function(page, css) {
  page %>% html_nodes(css) %>% html_text()
}

# One slot per piece page, allocated up front.
n_pieces <- length(pages)
artist <- character(n_pieces)
artist.nationality <- character(n_pieces)
title <- character(n_pieces)
year <- character(n_pieces)
lot.number <- character(n_pieces)
auction.data <- character(n_pieces)
price <- character(n_pieces)
lot.details <- character(n_pieces)
materials <- character(n_pieces)
measurements <- character(n_pieces)
description <- character(n_pieces)
markings <- character(n_pieces)
image.link <- character(n_pieces)

for (i in seq_along(pages)) { #----- length(pages) <><><><><><><><><><><><><><><><><>Change for test
  print(c(i, "out of", length(pages)))
  Sys.sleep(1)  # pause between requests
  # read_html() replaces html(), which current rvest releases no longer provide.
  piece_html <- read_html(pages[i])
  #-----------------Artist
  artist[i] <- field.or.na(
    piece_html %>% html_nodes("#artistName") %>% html_attr(name = "value")
  )
  #-----------------Artist Nationality
  artist.nationality[i] <- field.or.na(node.text(piece_html, ".artist-nationality"))
  #-----------------Title
  title[i] <- field.or.na(node.text(piece_html, ".title"))
  #-----------------Year
  year[i] <- field.or.na(node.text(piece_html, "#artworkIndex_0 p:nth-child(3)"))
  #-----------------Lot Number
  lot.number[i] <- field.or.na(node.text(piece_html, ".lotnumber"))
  #-----------------Auction Data
  auction.data[i] <- field.or.na(node.text(piece_html, ".auctiondata"))
  #-----------------Price
  price[i] <- field.or.na(node.text(piece_html, ".price"))
  #-----------------Lot Details
  # Keep the second ".lot-details1" match. The earlier code indexed the output
  # vector `lot.details` here instead of the scraped nodes, so this column was
  # never populated.
  lot_detail_text <- node.text(piece_html, ".lot-details1")
  lot.details[i] <- field.or.na(lot_detail_text, 2L)
  #-----------------Details -------------------------------###
  # The ".artworkdetails" matches are read by position: 1 = materials,
  # 2 = measurements, 3 = description, 4 = markings.
  details <- node.text(piece_html, ".artworkdetails")
  #-----------------Materials
  materials[i] <- field.or.na(details, 1L)
  #-----------------Measurements
  measurements[i] <- field.or.na(details, 2L)
  #-----------------Description
  description[i] <- field.or.na(details, 3L)
  #-----------------Markings
  markings[i] <- field.or.na(details, 4L)
  #-----------------Image Link
  image.link[i] <- field.or.na(
    piece_html %>% html_nodes("#imageIndex_0 img") %>% html_attr(name = "src")
  )
}
# Assemble the per-piece vectors into one table, one row per scraped page.
# stringsAsFactors = FALSE keeps every column as character text on R < 4.0 as
# well, where data.frame() would otherwise convert the strings to factors.
final <- data.frame(
  artist = artist,
  artist.nationality = artist.nationality,
  title = title,
  year = year,
  lot.number = lot.number,
  auction.data = auction.data,
  price = price,
  lot.details = lot.details,
  materials = materials,
  measurements = measurements,
  description = description,
  markings = markings,
  image.link = image.link,
  stringsAsFactors = FALSE
)
To leave a comment for the author, please follow the link and comment on their blog: NYC Data Science Academy » R.
R-bloggers.com offers daily e-mail updates about R news and tutorials about learning R and many other topics. Click here if you're looking to post or find an R/data-science job.
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.