
Taking text data to the next level – Unsupervised and supervised approaches in NLP @R-Ladies Bergen


I had the great pleasure of talking about NLP at R-Ladies Bergen yesterday. Thanks to everyone for making this event so much fun! The talk covers both unsupervised and supervised approaches and introduces you to quanteda, an R package that lets you perform a variety of NLP tasks.

All material (including the slides, the raw and deployed code, and the recording) can be accessed here. The talk itself is heavily based on this blog post.
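All of the snippets below start from a document-feature matrix (DFM) called mydfm, whose construction is not shown here. As a minimal sketch of how such an object is typically built with quanteda (my_texts is a placeholder for your character vector of raw documents):

library(quanteda)

# my_texts is a placeholder for a character vector of raw documents
my_corpus <- quanteda::corpus(my_texts)

# Tokenize and do some light pre-processing
my_tokens <- quanteda::tokens(my_corpus,
                              remove_punct = TRUE,
                              remove_numbers = TRUE)
my_tokens <- quanteda::tokens_tolower(my_tokens)
my_tokens <- quanteda::tokens_remove(my_tokens, quanteda::stopwords("en"))

# Build the document-feature matrix used throughout the examples
mydfm <- quanteda::dfm(my_tokens)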

Here are some further insights into the talk:


Code
# Plot a word cloud
quanteda::textplot_wordcloud(
  # The DFM object to plot
  mydfm,
  # Minimum number of times a word has to occur to be plotted
  min_count = 3,
  # Maximum number of words to plot
  max_words = 500,
  # Define a color palette (wes_palette() comes from the wesanderson package)
  color = wes_palette("Darjeeling1")
)

[Figure: word cloud of the most frequent features in the DFM]

Code
# This code is heavily inspired by Julia Silge's blog post
# (https://juliasilge.com/blog/sherlock-holmes-stm/)

[Figure: top terms per topic from the structural topic model]
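The full topic-model code is part of the deployed material; as a minimal sketch of the approach from Julia Silge's post (assuming the mydfm object from above, and with K = 6 topics chosen purely for illustration):

library(stm)
library(tidytext)
library(dplyr)
library(ggplot2)

# stm() accepts a quanteda DFM directly
topic_model <- stm(mydfm, K = 6, init.type = "Spectral", verbose = FALSE)

# Extract the word-topic probabilities (the beta matrix) ...
td_beta <- tidy(topic_model)

# ... and plot the top ten terms per topic
td_beta %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term, beta, topic), y = beta)) +
  geom_col() +
  tidytext::scale_x_reordered() +
  coord_flip() +
  facet_wrap(~ topic, scales = "free_y")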

Code
# This code is heavily inspired by this blog post:
# (https://www.mzes.uni-mannheim.de/socialsciencedatalab/article/advancing-text-mining/)

[Figure: plot based on the Advancing Text Mining with R and quanteda tutorial]
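The sentiment plots below build on a variable net_perc that is computed earlier in the full script. As a rough sketch of how such a net-sentiment measure can be derived with a dictionary approach (here using quanteda's built-in Lexicoder Sentiment Dictionary; the exact computation in the talk may differ):

library(quanteda)
library(dplyr)

# Count positive and negative words per document using the
# Lexicoder Sentiment Dictionary that ships with quanteda
sentiment_dfm <- quanteda::dfm_lookup(mydfm,
                                      dictionary = data_dictionary_LSD2015)

# Convert the counts into a data frame and compute the net sentiment
# as a percentage of all tokens in each document
# (the ccode and year variables used below would come from the
# corpus' document-level variables, e.g. via quanteda::docvars())
data <- quanteda::convert(sentiment_dfm, to = "data.frame") %>%
  dplyr::mutate(net_perc = (positive - negative) / ntoken(mydfm) * 100)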

Code
data %>%
  # Generate the country name for each country using the
  # `countrycode()` command
  dplyr::mutate(countryname = countrycode(ccode, "iso3c", "country.name")) %>%
  # Keep only the specific countries that we want to compare
  dplyr::filter(countryname %in% c(
    "Germany",
    "France",
    "United Kingdom",
    "Norway",
    "Spain",
    "Sweden"
  )) %>%
  # Now comes the plotting part :-)
  ggplot() +
  # We make a bar plot with the years on the x-axis and the level of the
  # net sentiment on the y-axis; all net sentiments greater than 0
  # get a different color
  geom_col(aes(
    x = year,
    y = net_perc,
    fill = (net_perc > 0)
  )) +
  # Define the colors as well as the labels and title of the legend
  scale_fill_manual(
    name = "Sentiment",
    labels = c("Negative", "Positive"),
    values = c("#C93312", "#446455")
  ) +
  # Add the axis labels
  xlab("Time") +
  ylab("Net sentiment") +
  # And facet by country to get a more meaningful visualization
  facet_wrap(~ countryname)

[Figure: net sentiment over time for selected European countries]

Code
# Inspired by: https://bit.ly/37MCEHg
# Get the 30 top features from the DFM
freq_feature <- topfeatures(mydfm, 30)
# Create a data.frame for ggplot
data <- data.frame(list(
  term = names(freq_feature),
  frequency = unname(freq_feature)
))
# Build the plot
data %>%
  # Call ggplot
  ggplot() +
  # Add geom_segment (this will give us the lines of the lollipops)
  geom_segment(aes(
    x = reorder(term, frequency),
    xend = reorder(term, frequency),
    y = 0,
    yend = frequency
  ), color = "grey") +
  # Add a point plot with the terms on the x-axis
  # and the frequency on the y-axis
  geom_point(aes(x = reorder(term, frequency), y = frequency)) +
  # Flip the plot
  coord_flip() +
  # Add labels for the axes
  xlab("") +
  ylab("Absolute frequency of the features")

[Figure: lollipop chart of the 30 most frequent features]
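A small design note on this one: topfeatures() returns a named vector, which is why the code first builds a data.frame by hand. If you prefer to get a data frame directly, textstat_frequency() is an alternative (a sketch; in recent quanteda versions the function lives in the quanteda.textstats package):

library(quanteda.textstats)

# Returns a data frame with (among others) the columns
# feature and frequency, ready to be piped into ggplot
freq_feature <- textstat_frequency(mydfm, n = 30)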

Code
data %>%
  # Generate the continent for each country using the `countrycode()` command
  dplyr::mutate(continent = countrycode(ccode, "iso3c", "continent",
                                        custom_match = c("YUG" = "Europe"))) %>%
  # Group by continent and year to generate the average sentiment
  # by continent and year
  group_by(continent, year) %>%
  dplyr::mutate(avg = mean(net_perc)) %>%
  # Now plot it
  ggplot() +
  # Use a line chart with year on the x-axis, the average sentiment
  # by continent on the y-axis, colored by continent
  geom_line(aes(x = year, y = avg, col = continent)) +
  # Define the colors
  scale_color_manual(name = "", values = wes_palette("Darjeeling1")) +
  # Label the axes
  xlab("Time") +
  ylab("Average net sentiment")

[Figure: average net sentiment over time by continent]
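One note on the design choice here: mutate() after group_by() keeps one row per document, so geom_line() draws many overlapping points per continent-year combination. Aggregating first with summarise() produces one row per group and the same line (a sketch of the equivalent step):

data %>%
  dplyr::mutate(continent = countrycode(ccode, "iso3c", "continent",
                                        custom_match = c("YUG" = "Europe"))) %>%
  group_by(continent, year) %>%
  # summarise() collapses the data to one row per continent-year
  dplyr::summarise(avg = mean(net_perc), .groups = "drop") %>%
  ggplot() +
  geom_line(aes(x = year, y = avg, col = continent))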

The figures above show the output of the more basic supervised and unsupervised NLP models that we covered during the talk. As you work more with textual data, you will see that the field of NLP offers much more, including document similarity, text generation, and even chatbots, all of which you can build using your knowledge and the same simple steps presented in the talk 👩🏼‍💻
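Document similarity, for instance, is only one function call away once you have a DFM (a minimal sketch using cosine similarity; in recent quanteda versions textstat_simil() lives in the quanteda.textstats package):

library(quanteda.textstats)

# Pairwise cosine similarity between all documents in the DFM
similarities <- textstat_simil(mydfm, method = "cosine")

# Turn the similarity object into a data frame of document pairs
head(as.data.frame(similarities))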

If you want more resources, you can access them here:

Quanteda
- Kohei Watanabe and Stefan Müller: Quanteda Tutorials
- Quanteda Cheat Sheet

More on text mining and NLP
- Cosima Meyer and Cornelius Puschmann: Advancing Text Mining with R and quanteda
- Justin Grimmer and Brandon Stewart: Text as Data: The Promise and Pitfalls of Automatic Content Analysis Methods for Political Texts
- Dan Jurafsky and James H. Martin: Speech and Language Processing

Sentiment analysis
- sentimentr
- Hammerschmidt/Meyer 2020: Money Makes the World Go Frowned - Analyzing the Impact of Chinese Foreign Aid on States' Sentiment Using Natural Language Processing (for an applied example of sentiment analysis)

Model validation
- oolong: Validation of dictionary approaches and topic models
- stminsights

More general resources
- Data Science & Society
- RegEx Cheat Sheet
- Stringr Cheat Sheet
