Chapter 13: Dictionary-based Analysis
Sentiment analysis is fairly straightforward once your data are in tidy text format. Since it essentially boils down to matching the words in the corpus to their sentiment values in a dictionary, it can be done with an inner_join(). tidytext (in conjunction with the textdata package) gives you access to four dictionaries: bing, loughran, afinn, and nrc:
needs(tidyverse, tidytext, sotu, SnowballC, textdata)
walk(c("bing", "loughran", "afinn", "nrc"), \(x) get_sentiments(lexicon = x) |>
head() |>
print())
# A tibble: 6 × 2
word sentiment
<chr> <chr>
1 2-faces negative
2 abnormal negative
3 abolish negative
4 abominable negative
5 abominably negative
6 abominate negative
# A tibble: 6 × 2
word sentiment
<chr> <chr>
1 abandon negative
2 abandoned negative
3 abandoning negative
4 abandonment negative
5 abandonments negative
6 abandons negative
# A tibble: 6 × 2
word value
<chr> <dbl>
1 abandon -2
2 abandoned -2
3 abandons -2
4 abducted -2
5 abduction -2
6 abductions -2
# A tibble: 6 × 2
word sentiment
<chr> <chr>
1 abacus trust
2 abandon fear
3 abandon negative
4 abandon sadness
5 abandoned anger
6 abandoned fear
As you can see here, the dictionaries are mere tibbles with two columns: "word" plus either a categorical "sentiment" or, in AFINN's case, a numeric "value". For easier joining, I make sure that the token column produced in the preprocessing is also called "word".
needs(magrittr)
sotu_20cent_clean <- sotu_meta |>
mutate(text = sotu_text) |>
distinct(text, .keep_all = TRUE) |>
filter(between(year, 1900, 2000)) |>
unnest_tokens(output = word, input = text) |>
anti_join(get_stopwords()) |>
filter(!str_detect(word, "[0-9]")) |>
mutate(word = wordStem(word, language = "en"))
The AFINN dictionary is the only one with numeric values. You might have noticed that its words are not stemmed. Hence, I need to stem them before I can join the dictionary with my tibble. Then, to get the sentiment value per document, I can simply average the values of the matched words.
sotu_20cent_afinn <- get_sentiments("afinn") |>
mutate(word = wordStem(word, language = "en")) |>
inner_join(sotu_20cent_clean) |>
group_by(year) |>
summarize(sentiment = mean(value))
Thereafter, I can just plot it:
sotu_20cent_afinn |>
ggplot() +
geom_line(aes(x = year, y = sentiment))
That’s a bit hard to interpret. geom_smooth() might help:
sotu_20cent_afinn |>
ggplot() +
geom_smooth(aes(x = year, y = sentiment), span = 0.2)
Interesting. If we take the tone of the SOTU addresses as a proxy for the circumstances of the time, the worst phase appears to be the 1920s and 1930s, which makes sense given the economic situation back then. The maximum is reached around the 1960s, and since then the sentiment has apparently remained fairly stable.
Assessing the results
However, we have no idea whether we are capturing a valid signal or not. Let's look at the terms that drive these scores the most, i.e., the words that contribute the most to the overall sentiment score:
sotu_20cent_contribution <- get_sentiments("afinn") |>
mutate(word = wordStem(word, language = "en")) |>
inner_join(sotu_20cent_clean) |>
group_by(word) |>
summarize(occurrences = n(),
contribution = sum(value))
sotu_20cent_contribution |>
slice_max(contribution, n = 10) |>
bind_rows(sotu_20cent_contribution |> slice_min(contribution, n = 10)) |>
mutate(word = reorder(word, contribution)) |>
ggplot(aes(contribution, word, fill = contribution > 0)) +
geom_col(show.legend = FALSE) +
labs(y = NULL)
Let’s split this up per decade:
get_sentiments("afinn") |>
mutate(word = wordStem(word, language = "en")) |>
inner_join(sotu_20cent_clean) |>
mutate(decade = ((year - 1900)/10) |> floor()) |>
group_by(decade, word) |>
summarize(occurrences = n(),
contribution = sum(value)) |>
slice_max(contribution, n = 5) |>
bind_rows(get_sentiments("afinn") |>
mutate(word = wordStem(word, language = "en")) |>
inner_join(sotu_20cent_clean) |>
mutate(decade = ((year - 1900)/10) |> floor()) |>
group_by(decade, word) |>
summarize(occurrences = n(),
contribution = sum(value)) |>
slice_min(contribution, n = 5)) |>
mutate(word = reorder_within(word, contribution, decade)) |>
ggplot(aes(contribution, word, fill = contribution > 0)) +
geom_col(show.legend = FALSE) +
facet_wrap(~decade, ncol = 4, scales = "free") +
scale_y_reordered()
Assessing the quality of the rating
We need to assess the reliability of our classification: would different raters come to the same conclusion, and, if we compare it to a gold standard, how well does the classification live up to it? One measure we can use here is Krippendorff's Alpha, which is defined as
\[\alpha = 1 - \frac{D_o}{D_e}\]
where \(D_{o}\) is the observed disagreement and \(D_{e}\) is the disagreement expected by chance. The actual calculation is more involved, but R can easily take care of that; we just need to feed it properly formatted data. For this example I use a widely used benchmark data set containing IMDb movie reviews and labels indicating whether they are positive or negative.
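Before turning to the reviews, here is a minimal toy sketch of the input format that irr::kripp.alpha() expects: one row per rater, one column per rated unit. The ratings below are invented purely for illustration.
needs(irr)
# two raters, five documents; rows = raters, columns = documents
toy_ratings <- rbind(
  rater_1 = c("positive", "negative", "negative", "positive", "negative"),
  rater_2 = c("positive", "negative", "positive", "positive", "negative")
)
# perfect agreement would give alpha = 1, chance-level agreement roughly 0
kripp.alpha(toy_ratings, method = "nominal")
With only five documents the estimate itself is meaningless; the sketch merely illustrates the format we will reproduce for the full review data below.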
imdb_reviews <- read_csv("https://www.dropbox.com/scl/fi/psgj6ze6at3zovildm728/imdb_reviews.csv?rlkey=ve2s02ydosbweemalvskyiu4s&dl=1")
glimpse(imdb_reviews)
Rows: 25,000
Columns: 2
$ text <chr> "Once again Mr. Costner has dragged out a movie for far long…
$ sentiment <chr> "negative", "negative", "negative", "negative", "negative", …
imdb_reviews_afinn <- imdb_reviews |>
rowid_to_column("doc") |>
unnest_tokens(token, text) |>
anti_join(get_stopwords(), by = c("token" = "word")) |>
mutate(stemmed = wordStem(token)) |>
inner_join(get_sentiments("afinn") |> mutate(stemmed = wordStem(word))) |>
group_by(doc) |>
summarize(sentiment = mean(value)) |>
mutate(sentiment_afinn = case_when(sentiment > 0 ~ "positive",
TRUE ~ "negative") |>
factor(levels = c("positive", "negative")))Now we have two classifications, one “gold standard” from the data and the one obtained through AFINN.
review_coding <- imdb_reviews |>
mutate(true_sentiment = sentiment |>
factor(levels = c("positive", "negative"))) |>
select(-sentiment) |>
rowid_to_column("doc") |>
left_join(imdb_reviews_afinn |> select(doc, sentiment_afinn))
First, we can check how often AFINN got it right, the accuracy:
sum(review_coding$true_sentiment == review_coding$sentiment_afinn, na.rm = TRUE) / nrow(review_coding)
[1] 0.64712
However, accuracy is not a perfect metric because it tells you nothing about the composition of the errors. For instance, your classifier might just predict "positive" all the time. If your gold standard contains 50 percent "positive" cases, that classifier would still achieve an accuracy of 0.5. We can address this with the following measures.
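To make this concrete, here is a quick sanity check (not part of the pipeline above): the accuracy that a constant classifier predicting "positive" for every review would reach on our data.
# share of truly positive reviews = accuracy of always predicting "positive"
mean(review_coding$true_sentiment == "positive")
Since the IMDb benchmark is balanced, this baseline comes out at 0.5, which puts AFINN's 0.647 from above into perspective.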
For the calculation of Krippendorff's Alpha, the data must be in a different format: a matrix with the documents as columns and the respective ratings as rows.
needs(irr)
mat <- review_coding |>
select(-text) |>
as.matrix() |>
t()
mat[1:3, 1:5]
[,1] [,2] [,3] [,4] [,5]
doc " 1" " 2" " 3" " 4" " 5"
true_sentiment "negative" "negative" "negative" "negative" "negative"
sentiment_afinn "positive" "negative" "negative" "positive" "positive"
colnames(mat) <- mat[1,]
mat <- mat[2:3,]
mat[1:2, 1:5]
1 2 3 4 5
true_sentiment "negative" "negative" "negative" "negative" "negative"
sentiment_afinn "positive" "negative" "negative" "positive" "positive"
irr::kripp.alpha(mat, method = "nominal")
Krippendorff's alpha
Subjects = 25000
Raters = 2
alpha = 0.266
Alpha values of around 0.8 or higher are considered good; AFINN clearly misses that mark.
Another way to evaluate the quality of classification is through a confusion matrix.

Now we can calculate precision (when the classifier predicts "positive", how often is it correct?), recall/sensitivity (when a case is actually "positive", how often is it predicted as such?), and specificity (when a case is actually "negative", how often is it predicted as such?). The F1-score is the harmonic mean of precision and recall, defined as \(F_1 = \frac{2}{\frac{1}{recall} + \frac{1}{precision}} = 2\times \frac{precision\times recall}{precision + recall}\), and it is the most commonly used single measure for assessing the quality of a classification. The closer to 1 it is, the better. You can find a more thorough description of the confusion matrix and the different measures in this blog post.
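These quantities can also be computed by hand from a simple cross-tabulation of predictions against the gold standard; the following is a minimal sketch whose results should match the caret output below.
# cross-tabulate predicted vs. true labels (documents without an AFINN match are dropped)
conf_tab <- table(predicted = review_coding$sentiment_afinn,
                  actual = review_coding$true_sentiment)
precision <- conf_tab["positive", "positive"] / sum(conf_tab["positive", ])
recall <- conf_tab["positive", "positive"] / sum(conf_tab[, "positive"])
f1 <- 2 * precision * recall / (precision + recall)
c(precision = precision, recall = recall, f1 = f1)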
More conveniently, we can obtain all of these measures in R using the caret package.
needs(caret)
confusion_matrix <- confusionMatrix(data = review_coding$sentiment_afinn,
reference = review_coding$true_sentiment,
positive = "positive")
confusion_matrix$byClass
Sensitivity Specificity Pos Pred Value
0.8439750 0.4504721 0.6056500
Neg Pred Value Precision Recall
0.7427441 0.6056500 0.8439750
F1 Prevalence Detection Rate
0.7052216 0.5000000 0.4219875
Detection Prevalence Balanced Accuracy
0.6967515 0.6472236
We will revisit this in the chapter on supervised machine learning when we discuss classification algorithms. Spoiler: the results will get better.
Exercises
- Take the abortion-related Tweets from the last chapter. Check for sentiment differences between the parties. Use the AFINN dictionary. Plot your results.
Solution. Click to expand!
needs(tidyverse, tidytext, stopwords, SnowballC)
timelines <- read_csv("https://www.dropbox.com/s/dpu5m3xqz4u4nv7/tweets_house_rep_party.csv?dl=1") |>
filter(!is.na(party))
keywords <- c("abortion", "prolife", " roe ", " wade ", "roevswade", "baby", "fetus", "womb", "prochoice", "leak")
preprocessed <- timelines |>
rowid_to_column("doc_id") |>
filter(str_detect(text, str_c(keywords, collapse = "|"))) |>
unnest_tokens(word, text) |>
anti_join(get_stopwords()) |>
mutate(stemmed = wordStem(word))
sent_per_party <- preprocessed |>
inner_join(get_sentiments("afinn")) |>
mutate(word_sent = case_when(value > 0 ~ "positive",
value < 0 ~ "negative"))
sent_per_party |>
ggplot() +
geom_bar(aes(word_sent)) +
facet_wrap(vars(party))
sent_per_doc <- preprocessed |>
inner_join(get_sentiments("afinn")) |>
group_by(doc_id, party) |>
summarize(mean_sent = mean(value)) |>
mutate(tweet_sent = case_when(mean_sent > 0 ~ "positive",
mean_sent < 0 ~ "negative",
TRUE ~ "neutral"))
sent_per_doc |>
ggplot() +
geom_bar(aes(tweet_sent)) +
facet_wrap(vars(party))
- Have a look at different dictionaries (e.g., Bing or Loughran). Check the words that contributed the most. Do you see any immediate ambiguities or flaws?
Solution. Click to expand!
# afinn
tweets_abortion_tidy_contribution_afinn <- preprocessed |>
inner_join(get_sentiments("afinn")) |>
group_by(party, word) |>
summarize(contribution = sum(value))
bind_rows(
tweets_abortion_tidy_contribution_afinn |>
filter(party == "D") |>
slice_max(contribution, n = 10, with_ties = FALSE) |>
mutate(type = "pos"),
tweets_abortion_tidy_contribution_afinn |>
filter(party == "D") |>
slice_min(contribution, n = 10, with_ties = FALSE) |>
mutate(type = "neg"),
tweets_abortion_tidy_contribution_afinn |>
filter(party == "R") |>
slice_max(contribution, n = 10, with_ties = FALSE) |>
mutate(type = "pos"),
tweets_abortion_tidy_contribution_afinn |>
filter(party == "R") |>
slice_min(contribution, n = 10, with_ties = FALSE) |>
mutate(type = "neg")
) |>
mutate(word = reorder_within(word, contribution, party)) |>
ggplot() +
geom_col(aes(contribution, word), show.legend = FALSE) +
scale_y_reordered() +
facet_wrap(vars(party, type), scales = "free")
# loughran
needs(ggpubr)
tweets_abortion_tidy_contribution_loughran <- preprocessed |>
inner_join(get_sentiments("loughran")) |>
count(party, sentiment, word) |>
group_by(party, sentiment) |>
slice_max(n, n = 10, with_ties = FALSE) |>
group_split()
tweets_abortion_tidy_contribution_loughran |>
map(\(x) x |>
mutate(word = reorder_within(word, n, party)) |>
slice_max(n, n = 10) |>
ggplot() +
geom_col(aes(n, word), show.legend = FALSE) +
scale_y_reordered() +
labs(title = x[["sentiment"]][1])
) |>
ggarrange(plotlist = _)
# bing
tweets_abortion_tidy_contribution_bing <- preprocessed |>
inner_join(get_sentiments("bing")) |>
count(party, sentiment, word) |>
group_by(party, word, sentiment) |>
summarize(contribution = sum(n)) |>
group_by(party, sentiment) |>
slice_max(contribution, n = 10, with_ties = FALSE) |>
mutate(contribution = case_when(
sentiment == "negative" ~ contribution * (-1),
TRUE ~ contribution
))
tweets_abortion_tidy_contribution_bing |>
mutate(word = reorder_within(word, contribution, party)) |>
ggplot() +
geom_col(aes(contribution, word), show.legend = FALSE) +
scale_y_reordered() +
facet_wrap(vars(party, sentiment), scales = "free")
Further links
- Tidy text mining with R.
- A more general introduction by Christopher Bail.
- A guide to using spacyr.