The Simpsons Dataset

TidyTuesday
R Programming
Data Visualization
Simpsons
This visualization showcases IMDb ratings for The Simpsons episodes across multiple seasons, where color represents the rating (from 4.5 to 8.4) and dot size reflects the number of votes. Notably, Barthood (S27E09) holds the highest rating (8.4), while Lisa Goes Gaga (S23E22) is the lowest-rated episode (4.5) with the most votes.
Author
Published

February 12, 2025

Figure 1: Created using ggplot2 in R, the chart utilizes color encoding to represent IMDb ratings, ranging from 4.5 (lowest) to 8.4 (highest), as shown in the scale on the right. Size encoding indicates the number of IMDb votes per episode, with reference values (104, 600, and 1,215 votes) displayed in the legend. Notable highlights include “Barthood” (S27E09) as the highest-rated episode (8.4 IMDb, 698 votes) and “Lisa Goes Gaga” (S23E22) as the lowest-rated episode (4.5 IMDb, 1,215 votes), marked by the largest and most saturated pink dot.

How This Graphic Was Made

1. 📦 Load Packages & Setup

# Load necessary packages using pacman for easier dependency management
pacman::p_load(
  tidyverse,   # Collection of R packages for data science (ggplot2, dplyr, etc.)
  showtext,    # Enables custom fonts for ggplot2
  ggtext,      # Adds rich text formatting to ggplot2
  skimr,       # Provides summary statistics in a readable format
  janitor,     # Cleans and formats column names for consistency
  shadowtext,  # Adds text with shadows in ggplot2 for better readability
  glue         # Provides a more readable way to concatenate and format strings
)

# Add Google fonts
font_add_google("Roboto Condensed", family = "Roboto")
font_add("Simpsons", regular = here::here("fonts/Simpsons.otf"))

# Add local font
font_add("Font Awesome 6 Brands", here::here("fonts/otfs/Font Awesome 6 Brands-Regular-400.otf"))

# Automatically enable the use of showtext for all plots
showtext_auto()

# Set DPI for high-resolution text rendering
showtext_opts(dpi = 300)

2. 📖 Read in the Data

# Load the TidyTuesday data
tuesdata <- tidytuesdayR::tt_load(2025, week = 5)

characters <- tuesdata$simpsons_characters %>% clean_names()
episodes <- tuesdata$simpsons_episodes %>% clean_names()
locations <- tuesdata$simpsons_locations %>% clean_names()
script_lines <- tuesdata$simpsons_script_lines %>% clean_names()

tidytuesdayR::readme(tuesdata)  # Display dataset documentation
rm(tuesdata)  # Remove raw data object to free up memory

3. 🕵️ Examine the Data

# Preview the structure of each dataset using glimpse()
glimpse(characters)
glimpse(episodes)
glimpse(locations)
glimpse(script_lines)

4. 🤼 Wrangle Data

# Select the episode with the lowest IMDb rating
episodes %>%
  slice_min(imdb_rating) %>%
  select(imdb_rating, imdb_votes, number_in_season, season, title)

# Select the episode with the highest IMDb rating
episodes %>%
  slice_max(imdb_rating) %>%
  select(imdb_rating, imdb_votes, number_in_season, season, title)

# Select the episode with the fewest IMDb votes
episodes %>%
  slice_min(imdb_votes) %>%
  select(imdb_votes)

# Select the episode with the most IMDb votes
episodes %>%
  slice_max(imdb_votes) %>%
  select(imdb_votes) 

# Remove episodes with missing IMDb ratings
episodes_clean <- episodes %>%
  drop_na(imdb_rating)

# Count the number of episodes per season
episodes_clean %>%
  group_by(season) %>%
  count(season)

5. 🔤 Text

title <- "the Simpsons Ratings"
st <- "Highest and Lowest Rated Episodes: Barthood (S27E09, 8.4 IMDb, 698 votes) vs. Lisa Goes Gaga (S23E22, 4.5 IMDb, 1,215 votes)"

# Create a social media caption with customized colors and font for consistency in visualization
social <- andresutils::social_caption(font_family = "Roboto", icon_color = "black", bg_color = "#C5DCEE") 

# Construct the final plot caption by combining TidyTuesday details, data source, and the social caption
cap <- paste0(
  "#TidyTuesday: Week 05, 2025 | **Source**: The Simpsons Dataset | **Graphic**: ", social
)

6. 📊 Plot

# Create a scatter plot visualizing IMDb ratings and votes for each episode
p <- episodes_clean %>%
  ggplot(aes(x = factor(number_in_season), y = fct_rev(factor(season)))) +
  geom_point(aes(color = imdb_rating, size = imdb_votes)) +
  scale_color_gradientn(colors = LaCroixColoR::lacroix_palette("PassionFruit", type = "continuous"), breaks = c(4.5, 5.5, 6.5, 7.5, 8.4)) +
  scale_size_area(breaks = c(104, 600, 1215), label = scales::comma) +
  guides(color = guide_colorbar(barheight = unit(7, "lines"),
                                barwidth = unit(0.5, "lines"),
                                order = 1,
                                theme = theme(
                                  legend.title = element_text(margin = margin(b = 7))
                                )),
         size = guide_legend(override.aes = list(shape = 21))) +
  labs(x = "Episode",
       y = "Season",
       title = title,
       subtitle = st,
       caption = cap,
       color = "IMDb rating", size = "IMDb\nrating count") +
  theme_minimal(base_family = "Roboto") +
  theme(
    plot.title = element_shadowtext(family = "Simpsons", face = "bold", bg.r = 0.05, size = 14, color = "#FFDF00", margin = margin(b = 20), hjust = 1.15),
    plot.subtitle = element_textbox_simple(size = 7, margin = margin(b = 10), lineheight = 1.2),
    plot.background = element_rect(fill = "#C5DCEE", color = NA),
    axis.text = element_text(color = "black", size = 8),
    axis.title.x=element_text(color = "black", size = 9, face = "bold", margin = margin(t = 3)),
    axis.title.y=element_text(color = "black", size = 9, face = "bold"),
    plot.margin = margin(0.5, 0.5, 0.5, 0.5, unit = "cm"),
    panel.grid = element_line(linetype = "dotted", color = "grey65", size = 0.2),
    legend.title = element_text(size = 8, margin = margin(b = 2)),
    legend.text = element_text(size = 7.5),
    plot.caption.position = "plot",
    plot.caption = element_markdown(size = 6, hjust = 0, margin = margin(t = 10))
  )

7. 💾 Save

# Save plot with dimensions
andresutils::save_plot(p, type = "tidytuesday", year = 2025, week = 5, width = 6, height = 4.5)

8. 🚀 GitHub Repository

The complete code for this analysis is available in tt_05_2025.qmd.

For the full repository, click here.

Back to top

Citation

For attribution, please cite this work as:
Gonzalez, Andres. 2025. “The Simpsons Dataset.” February 12, 2025. https://andresgonzalezstats.com/visualization/TidyTuesday/2025/Week_05/tt_05_2025.html.