Chapter 13: Dictionary-based Analysis
Sentiment analysis is fairly straightforward once your data are in tidy text format. Since it essentially boils down to matching the words in the corpus to their sentiment values in a dictionary, it can be done with an inner_join(). tidytext (in conjunction with the textdata package) gives you access to four dictionaries: bing, loughran, afinn, and nrc:
needs(tidyverse, tidytext, sotu, SnowballC, textdata)
walk(c("bing", "loughran", "afinn", "nrc"), \(x) get_sentiments(lexicon = x) |>
head() |>
print())
# A tibble: 6 × 2
word sentiment
<chr> <chr>
1 2-faces negative
2 abnormal negative
3 abolish negative
4 abominable negative
5 abominably negative
6 abominate negative
# A tibble: 6 × 2
word sentiment
<chr> <chr>
1 abandon negative
2 abandoned negative
3 abandoning negative
4 abandonment negative
5 abandonments negative
6 abandons negative
# A tibble: 6 × 2
word value
<chr> <dbl>
1 abandon -2
2 abandoned -2
3 abandons -2
4 abducted -2
5 abduction -2
6 abductions -2
# A tibble: 6 × 2
word sentiment
<chr> <chr>
1 abacus trust
2 abandon fear
3 abandon negative
4 abandon sadness
5 abandoned anger
6 abandoned fear
As you can see here, the dictionaries are mere tibbles with two columns: "word" plus either a categorical "sentiment" or, in AFINN's case, a numeric "value". For easier joining, I make sure that the token column produced in the preprocessing is also called "word".
needs(magrittr)
sotu_20cent_clean <- sotu_meta |>
mutate(text = sotu_text) |>
distinct(text, .keep_all = TRUE) |>
filter(between(year, 1900, 2000)) |>
unnest_tokens(output = word, input = text) |>
anti_join(get_stopwords()) |>
filter(!str_detect(word, "[0-9]")) |>
mutate(word = wordStem(word, language = "en"))
The AFINN dictionary is the only one with numeric values. You might have noticed that its words are not stemmed. Hence, I need to stem them before I can join the dictionary with my tibble. Then, to get the sentiment value per document, I can simply average the values of the matched words.
sotu_20cent_afinn <- get_sentiments("afinn") |>
mutate(word = wordStem(word, language = "en")) |>
inner_join(sotu_20cent_clean) |>
group_by(year) |>
summarize(sentiment = mean(value))
Thereafter, I can just plot it:
sotu_20cent_afinn |>
ggplot() +
geom_line(aes(x = year, y = sentiment))
That’s a bit hard to interpret. geom_smooth() might help:
sotu_20cent_afinn |>
ggplot() +
geom_smooth(aes(x = year, y = sentiment), span = 0.2)
Interesting. If we take the tone of the SOTU addresses as a proxy for the circumstances of the time, the worst phase appears to be the 1920s and 1930s, which makes sense given the economic situation back then. The maximum is reached around the 1960s, and since then the sentiment has apparently remained fairly stable.
Assessing the results
However, we have no idea whether we are capturing a valid signal or not. Let's look at the terms that drive these scores the most, i.e., the words that contribute the most to the overall sentiment score:
sotu_20cent_contribution <- get_sentiments("afinn") |>
mutate(word = wordStem(word, language = "en")) |>
inner_join(sotu_20cent_clean) |>
group_by(word) |>
summarize(occurrences = n(),
contribution = sum(value))
sotu_20cent_contribution |>
slice_max(contribution, n = 10) |>
bind_rows(sotu_20cent_contribution |> slice_min(contribution, n = 10)) |>
mutate(word = reorder(word, contribution)) |>
ggplot(aes(contribution, word, fill = contribution > 0)) +
geom_col(show.legend = FALSE) +
labs(y = NULL)
Let’s split this up per decade:
get_sentiments("afinn") |>
mutate(word = wordStem(word, language = "en")) |>
inner_join(sotu_20cent_clean) |>
mutate(decade = ((year - 1900)/10) |> floor()) |>
group_by(decade, word) |>
summarize(occurrences = n(),
contribution = sum(value)) |>
slice_max(contribution, n = 5) |>
bind_rows(get_sentiments("afinn") |>
mutate(word = wordStem(word, language = "en")) |>
inner_join(sotu_20cent_clean) |>
mutate(decade = ((year - 1900)/10) |> floor()) |>
group_by(decade, word) |>
summarize(occurrences = n(),
contribution = sum(value)) |>
slice_min(contribution, n = 5)) |>
mutate(word = reorder_within(word, contribution, decade)) |>
ggplot(aes(contribution, word, fill = contribution > 0)) +
geom_col(show.legend = FALSE) +
facet_wrap(~decade, ncol = 4, scales = "free") +
scale_y_reordered()
Assessing the quality of the rating
We need to assess the reliability of our classification: would different raters come to the same conclusion, and, if we compare it to a gold standard, how well does the classification live up to it? One measure we can use here is Krippendorff's Alpha, which is defined as
\[\alpha = 1 - \frac{D_o}{D_e}\]
where \(D_{o}\) is the observed disagreement and \(D_{e}\) is the disagreement expected by chance. The actual calculation is more involved, but R can easily take care of that; we just need to feed it properly formatted data. For this example I use a widely used benchmark data set containing IMDb movie reviews and labels indicating whether they are positive or negative.
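Before turning to the reviews, here is a minimal toy sketch of the input format that irr::kripp.alpha() expects: one row per rater, one column per rated unit. The ratings below are invented purely for illustration.
needs(irr)
# two raters, five documents; rows = raters, columns = documents
toy_ratings <- rbind(
  rater_1 = c("positive", "negative", "negative", "positive", "negative"),
  rater_2 = c("positive", "negative", "positive", "positive", "negative")
)
# perfect agreement would give alpha = 1, chance-level agreement roughly 0
kripp.alpha(toy_ratings, method = "nominal")
With only five documents the estimate itself is meaningless; the sketch merely illustrates the format we will reproduce for the full review data below.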
imdb_reviews <- read_csv("https://www.dropbox.com/scl/fi/psgj6ze6at3zovildm728/imdb_reviews.csv?rlkey=ve2s02ydosbweemalvskyiu4s&dl=1")
glimpse(imdb_reviews)
Rows: 25,000
Columns: 2
$ text <chr> "Once again Mr. Costner has dragged out a movie for far long…
$ sentiment <chr> "negative", "negative", "negative", "negative", "negative", …
imdb_reviews_afinn <- imdb_reviews |>
rowid_to_column("doc") |>
unnest_tokens(token, text) |>
anti_join(get_stopwords(), by = c("token" = "word")) |>
mutate(stemmed = wordStem(token)) |>
inner_join(get_sentiments("afinn") |> mutate(stemmed = wordStem(word))) |>
group_by(doc) |>
summarize(sentiment = mean(value)) |>
mutate(sentiment_afinn = case_when(sentiment > 0 ~ "positive",
TRUE ~ "negative") |>
factor(levels = c("positive", "negative")))Now we have two classifications, one “gold standard” from the data and the one obtained through AFINN.
review_coding <- imdb_reviews |>
mutate(true_sentiment = sentiment |>
factor(levels = c("positive", "negative"))) |>
select(-sentiment) |>
rowid_to_column("doc") |>
left_join(imdb_reviews_afinn |> select(doc, sentiment_afinn))
First, we can check how often AFINN got it right, the accuracy:
sum(review_coding$true_sentiment == review_coding$sentiment_afinn, na.rm = TRUE) / nrow(review_coding)
[1] 0.64712
However, accuracy is not a perfect metric because it tells you nothing about the composition of the errors. For instance, your classifier might just predict "positive" all the time. If your gold standard contains 50 percent "positive" cases, that classifier would still achieve an accuracy of 0.5. We can address this with the following measures.
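To make this concrete, here is a quick sanity check (not part of the pipeline above): the accuracy that a constant classifier predicting "positive" for every review would reach on our data.
# share of truly positive reviews = accuracy of always predicting "positive"
mean(review_coding$true_sentiment == "positive")
Since the IMDb benchmark is balanced, this baseline comes out at 0.5, which puts AFINN's 0.647 from above into perspective.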
For the calculation of Krippendorff's Alpha, the data must be in a different format: a matrix with the documents as columns and the respective ratings as rows.
needs(irr)
mat <- review_coding |>
select(-text) |>
as.matrix() |>
t()
mat[1:3, 1:5]
[,1] [,2] [,3] [,4] [,5]
doc " 1" " 2" " 3" " 4" " 5"
true_sentiment "negative" "negative" "negative" "negative" "negative"
sentiment_afinn "positive" "negative" "negative" "positive" "positive"
colnames(mat) <- mat[1,]
mat <- mat[2:3,]
mat[1:2, 1:5]
1 2 3 4 5
true_sentiment "negative" "negative" "negative" "negative" "negative"
sentiment_afinn "positive" "negative" "negative" "positive" "positive"
irr::kripp.alpha(mat, method = "nominal")
Krippendorff's alpha
Subjects = 25000
Raters = 2
alpha = 0.266
Alpha values of around 0.8 or higher are considered good; AFINN clearly misses that mark.
Another way to evaluate the quality of classification is through a confusion matrix.

Now we can calculate precision (when the classifier predicts "positive", how often is it correct?), recall/sensitivity (when a case is actually "positive", how often is it predicted as such?), and specificity (when a case is actually "negative", how often is it predicted as such?). The F1-score is the harmonic mean of precision and recall, defined as \(F_1 = \frac{2}{\frac{1}{recall} + \frac{1}{precision}} = 2\times \frac{precision\times recall}{precision + recall}\), and it is the most commonly used single measure for assessing the quality of a classification. The closer to 1 it is, the better. You can find a more thorough description of the confusion matrix and the different measures in this blog post.
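These quantities can also be computed by hand from a simple cross-tabulation of predictions against the gold standard; the following is a minimal sketch whose results should match the caret output below.
# cross-tabulate predicted vs. true labels (documents without an AFINN match are dropped)
conf_tab <- table(predicted = review_coding$sentiment_afinn,
                  actual = review_coding$true_sentiment)
precision <- conf_tab["positive", "positive"] / sum(conf_tab["positive", ])
recall <- conf_tab["positive", "positive"] / sum(conf_tab[, "positive"])
f1 <- 2 * precision * recall / (precision + recall)
c(precision = precision, recall = recall, f1 = f1)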
More conveniently, we can obtain all of these measures in R using the caret package.
needs(caret)
confusion_matrix <- confusionMatrix(data = review_coding$sentiment_afinn,
reference = review_coding$true_sentiment,
positive = "positive")
confusion_matrix$byClass
Sensitivity Specificity Pos Pred Value
0.8439750 0.4504721 0.6056500
Neg Pred Value Precision Recall
0.7427441 0.6056500 0.8439750
F1 Prevalence Detection Rate
0.7052216 0.5000000 0.4219875
Detection Prevalence Balanced Accuracy
0.6967515 0.6472236
We will revisit this in the chapter on supervised machine learning when we discuss classification algorithms. Spoiler: the results will get better.
Exercises
- Take the abortion-related Tweets from the last chapter. Check for sentiment differences between the parties. Use the AFINN dictionary. Plot your results.
Solution. Click to expand!
needs(tidyverse, tidytext, stopwords, SnowballC)
timelines <- read_csv("https://www.dropbox.com/s/dpu5m3xqz4u4nv7/tweets_house_rep_party.csv?dl=1") |>
filter(!is.na(party))
keywords <- c("abortion", "prolife", " roe ", " wade ", "roevswade", "baby", "fetus", "womb", "prochoice", "leak")
preprocessed <- timelines |>
rowid_to_column("doc_id") |>
filter(str_detect(text, str_c(keywords, collapse = "|"))) |>
unnest_tokens(word, text) |>
anti_join(get_stopwords()) |>
mutate(stemmed = wordStem(word))
sent_per_party <- preprocessed |>
inner_join(get_sentiments("afinn")) |>
mutate(word_sent = case_when(value > 0 ~ "positive",
value < 0 ~ "negative"))
sent_per_party |>
ggplot() +
geom_bar(aes(word_sent)) +
facet_wrap(vars(party))
sent_per_doc <- preprocessed |>
inner_join(get_sentiments("afinn")) |>
group_by(doc_id, party) |>
summarize(mean_sent = mean(value)) |>
mutate(tweet_sent = case_when(mean_sent > 0 ~ "positive",
mean_sent < 0 ~ "negative",
TRUE ~ "neutral"))
sent_per_doc |>
ggplot() +
geom_bar(aes(tweet_sent)) +
facet_wrap(vars(party))
- Have a look at different dictionaries (e.g., Bing or Loughran). Check the words that contributed the most. Do you see any immediate ambiguities or flaws?
Solution. Click to expand!
# afinn
tweets_abortion_tidy_contribution_afinn <- preprocessed |>
inner_join(get_sentiments("afinn")) |>
group_by(party, word) |>
summarize(contribution = sum(value))
bind_rows(
tweets_abortion_tidy_contribution_afinn |>
filter(party == "D") |>
slice_max(contribution, n = 10, with_ties = FALSE) |>
mutate(type = "pos"),
tweets_abortion_tidy_contribution_afinn |>
filter(party == "D") |>
slice_min(contribution, n = 10, with_ties = FALSE) |>
mutate(type = "neg"),
tweets_abortion_tidy_contribution_afinn |>
filter(party == "R") |>
slice_max(contribution, n = 10, with_ties = FALSE) |>
mutate(type = "pos"),
tweets_abortion_tidy_contribution_afinn |>
filter(party == "R") |>
slice_min(contribution, n = 10, with_ties = FALSE) |>
mutate(type = "neg")
) |>
mutate(word = reorder_within(word, contribution, party)) |>
ggplot() +
geom_col(aes(contribution, word), show.legend = FALSE) +
scale_y_reordered() +
facet_wrap(vars(party, type), scales = "free")
# loughran
needs(ggpubr)
tweets_abortion_tidy_contribution_loughran <- preprocessed |>
inner_join(get_sentiments("loughran")) |>
count(party, sentiment, word) |>
group_by(party, sentiment) |>
slice_max(n, n = 10, with_ties = FALSE) |>
group_split()
tweets_abortion_tidy_contribution_loughran |>
map(\(x) x |>
mutate(word = reorder_within(word, n, party)) |>
slice_max(n, n = 10) |>
ggplot() +
geom_col(aes(n, word), show.legend = FALSE) +
scale_y_reordered() +
labs(title = x[["sentiment"]][1])
) |>
ggarrange(plotlist = _)
# bing
tweets_abortion_tidy_contribution_bing <- preprocessed |>
inner_join(get_sentiments("bing")) |>
count(party, sentiment, word) |>
group_by(party, word, sentiment) |>
summarize(contribution = sum(n)) |>
group_by(party, sentiment) |>
slice_max(contribution, n = 10, with_ties = FALSE) |>
mutate(contribution = case_when(
sentiment == "negative" ~ contribution * (-1),
TRUE ~ contribution
))
tweets_abortion_tidy_contribution_bing |>
mutate(word = reorder_within(word, contribution, party)) |>
ggplot() +
geom_col(aes(contribution, word), show.legend = FALSE) +
scale_y_reordered() +
facet_wrap(vars(party, sentiment), scales = "free")
Further links
- Tidy text mining with R.
- A more general introduction by Christopher Bail.
- A guide to using spacyr.