Mining Twitter for consumer attitudes towards hotels
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.
A couple of months back I read Jeffrey Breen’s presentation on mining Twitter for consumer attitudes towards airlines, and I was curious how the results would look if I estimated the sentiment towards major hotels instead.
So here it is:
# load twitter library
> library(twitteR)
# search for all the hilton tweets
> hilton.tweets=searchTwitter('@hilton',n=1500)
> length(hilton.tweets)
[1] 162
> class(hilton.tweets)
[1] "list"
> tweet=hilton.tweets[[1]]
> class(tweet)
[1] "status"
attr(,"package")
[1] "twitteR"
> tweet$getScreenName()
[1] "i_RAHOFA"
> tweet$getText()
> library("plyr")
> hilton.text=laply(hilton.tweets,function(t)t$getText())
> length(hilton.text)
> head(hilton.text,5)
# load list of positive and negative words for SIMPLE sentiment analysis
# you will have to download the files from the website listed below - make sure you put them in the directory that you
# reference in the scan() calls
> hu.liu.pos=scan('/Users/marcinkulakowski/Downloads/r/positive-words.txt',what='character',comment.char=';')
> hu.liu.neg=scan('/Users/marcinkulakowski/Downloads/r/negative-words.txt',what='character',comment.char=';')
> pos.words=c(hu.liu.pos,'upgrade')
> neg.words=c(hu.liu.neg,'wtf','wait','waiting','epicfail','mechanical')
# sampling
> sample=c("You'reawesomeandIloveyou","Ihateandhateandhate.Soangry.Die!","Impressedandamazed:youarepeerlessinyourachievementofunparalleledmediocrity.")
# Score the sentiment of each sentence by counting opinion words.
#
# Arguments:
#   sentences  character vector (or list) of texts to score
#   pos.words  character vector of positive terms
#   neg.words  character vector of negative terms
#   .progress  name of a plyr progress bar ('none', 'text', ...); passed
#              straight through to laply()
#
# Returns a data.frame with one row per sentence and two columns:
#   score  number of positive-word matches minus negative-word matches
#   text   the original sentence, exactly as it was passed in
score.sentiment <- function(sentences, pos.words, neg.words, .progress = 'none') {
  # library() stops with a clear error when a package is missing, whereas
  # require() only returns FALSE and lets the failure surface later as an
  # obscure "could not find function" error.
  library(plyr)
  library(stringr)

  # plyr treats a list or a vector as an "l" input and we want a simple
  # array ("a") of scores back, hence laply().
  scores <- laply(sentences, function(sentence, pos.words, neg.words) {
    # Tweets regularly contain bytes that are not valid in the session
    # encoding; gsub() and tolower() fail on such input, so drop the
    # offending bytes first. Only the working copy is altered.
    sentence <- iconv(enc2utf8(as.character(sentence)),
                      from = 'UTF-8', to = 'UTF-8', sub = '')

    # strip punctuation, control characters and digits, then lower-case
    sentence <- gsub('[[:punct:]]', '', sentence)
    sentence <- gsub('[[:cntrl:]]', '', sentence)
    sentence <- gsub('\\d+', '', sentence)
    sentence <- tolower(sentence)

    # str_split() returns a list; flatten it to a plain vector of words
    words <- unlist(str_split(sentence, '\\s+'))

    # %in% yields TRUE/FALSE per word, and sum() counts TRUE as 1
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }, pos.words, neg.words, .progress = .progress)

  data.frame(score = scores, text = sentences)
}
> result=score.sentiment(sample,pos.words,neg.words)
> class(result)
[1] "data.frame"
> result$score
[1] 0 0 0
> hilton.scores=score.sentiment(hilton.text,pos.words,neg.words,.progress='text')
> hilton.scores$hotel='Hilton'
> hilton.scores$code='HL'
> hist(hilton.scores$score)
# hilton histogram
> library("ggplot2")
> qplot(hilton.scores$score)
# qplot hilton
# Let's repeat the Hilton workflow for the other major hotels: search for
# tweets mentioning the hotel's handle, extract the tweet text, score it, and
# tag the scores with the hotel name and a two-letter code.
# Note: each class(tweet) call below inspects the `tweet` object assigned
# earlier from hilton.tweets; it is not reassigned in this section.

# Intercontinental
intercontinental.tweets <- searchTwitter("@intercontinental", n = 1500)
class(tweet)
intercontinental.text <- laply(
  intercontinental.tweets,
  function(status) status$getText()
)
intercontinental.scores <- score.sentiment(
  intercontinental.text, pos.words, neg.words,
  .progress = "text"
)
intercontinental.scores$hotel <- "Intercontinental"
intercontinental.scores$code <- "IC"

# Wyndham
wyndham.tweets <- searchTwitter("@wyndham", n = 1500)
class(tweet)
wyndham.text <- laply(
  wyndham.tweets,
  function(status) status$getText()
)
wyndham.scores <- score.sentiment(
  wyndham.text, pos.words, neg.words,
  .progress = "text"
)
wyndham.scores$hotel <- "Wyndham"
wyndham.scores$code <- "WY"

# Marriott
marriott.tweets <- searchTwitter("@marriott", n = 1500)
class(tweet)
marriott.text <- laply(
  marriott.tweets,
  function(status) status$getText()
)
marriott.scores <- score.sentiment(
  marriott.text, pos.words, neg.words,
  .progress = "text"
)
marriott.scores$hotel <- "Marriott"
marriott.scores$code <- "MI"

# BestWestern
bestwestern.tweets <- searchTwitter("@bestwestern", n = 1500)
class(tweet)
bestwestern.text <- laply(
  bestwestern.tweets,
  function(status) status$getText()
)
bestwestern.scores <- score.sentiment(
  bestwestern.text, pos.words, neg.words,
  .progress = "text"
)
bestwestern.scores$hotel <- "Bestwestern"
bestwestern.scores$code <- "BW"

# Starwood
starwood.tweets <- searchTwitter("@starwood", n = 1500)
class(tweet)
starwood.text <- laply(
  starwood.tweets,
  function(status) status$getText()
)
starwood.scores <- score.sentiment(
  starwood.text, pos.words, neg.words,
  .progress = "text"
)
starwood.scores$hotel <- "Starwood"
starwood.scores$code <- "SW"

# Hyatt
hyatt.tweets <- searchTwitter("@hyatt", n = 1500)
class(tweet)
hyatt.text <- laply(
  hyatt.tweets,
  function(status) status$getText()
)
hyatt.scores <- score.sentiment(
  hyatt.text, pos.words, neg.words,
  .progress = "text"
)
hyatt.scores$hotel <- "Hyatt"
hyatt.scores$code <- "HY"
> all.scores=rbind(intercontinental.scores,wyndham.scores,hilton.scores,marriott.scores,bestwestern.scores,starwood.scores,hyatt.scores)
# Make separate plot for each hotel
> ggplot(data=all.scores)+ # ggplot works on data.frames, always
geom_bar(mapping=aes(x=score,fill=hotel),binwidth=1)+
facet_grid(hotel~.)+ # make a separate plot for each hotel
theme_bw()+scale_fill_brewer() # plain display, nicer colors
# Plot
> all.scores$very.pos=as.numeric(all.scores$score>=2)
> all.scores$very.neg=as.numeric(all.scores$score <= -2)
> twitter.df=ddply(all.scores,c('hotel','code'),summarise,pos.count=sum(very.pos),neg.count=sum(very.neg))
> twitter.df$all.count=twitter.df$pos.count+twitter.df$neg.count
> twitter.df$score=round(100*twitter.df$pos.count/twitter.df$all.count)
> install.packages("doBy")
> library("doBy")
> orderBy(~-score,twitter.df)
hotel code pos.count neg.count all.count score
1 Bestwestern BW 6 0 6 100
5 Starwood SW 7 0 7 100
6 Wyndham WY 2 0 2 100
3 Hyatt HY 7 1 8 88
2 Hilton HL 15 3 18 83
4 Marriott MI 13 4 17 76
> install.packages("XML")
> library(XML)
> acsi.url='http://www.theacsi.org/index.php?option=com_content&view=article&id=147&catid=&Itemid=212&i=Hotels'
# scrape acsi website for scores
> acsi.df=readHTMLTable(acsi.url,header=T,which=1,stringsAsFactors=F)
> acsi.df=acsi.df[,c(1,18)]
> head(acsi.df,1)
> colnames(acsi.df)=c('hotel','score')
> acsi.df$score=as.numeric(acsi.df$score)
> View(acsi.df)
# ACSI Dataframe
> acsi.df$code=c('HL','SW','MI','NA','HY','NA','IC','BW','NA','WY','NA','NA','NA')
> acsi.df$score=as.numeric(acsi.df$score)
> compare.df=merge(twitter.df,acsi.df,by='code',suffixes=c('.twitter','.acsi'))
> compare.df=subset(compare.df,all.count>100)
> compare.df=merge(twitter.df,acsi.df,by='code',suffixes=c('.twitter','.acsi'))
> View(compare.df)
# scores compared
> ggplot(compare.df)+geom_point(aes(x=score.twitter,y=score.acsi,color=hotel.twitter),size=6)+ geom_smooth(aes(x=score.twitter,y=score.acsi,group=1),se=F,method="lm")+theme_bw()+opts(legend.position=c(0.85,0.85))
# final plot
Materials used:
——————————————
Jeffrey Breen’s presentation
The American Customer Satisfaction Index (Hotels)
Opinion Mining, Sentiment Analysis, Opinion Extraction
R-bloggers.com offers daily e-mail updates about R news and tutorials about learning R and many other topics. Click here if you're looking to post or find an R/data-science job.
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.