The Simpsons Dataset – Andres Gonzalez

Figure 1: Created using ggplot2 in R, the chart utilizes color encoding to represent IMDb ratings, ranging from 4.5 (lowest) to 8.4 (highest), as shown in the scale on the right. Size encoding indicates the number of IMDb votes per episode, with reference values (104, 600, and 1,215 votes) displayed in the legend. Notable highlights include “Barthood” (S27E09) as the highest-rated episode (8.4 IMDb, 698 votes) and “Lisa Goes Gaga” (S23E22) as the lowest-rated episode (4.5 IMDb, 1,215 votes), marked by the largest and most saturated pink dot.

How This Graphic Was Made

1. 📦 Load Packages & Setup

Show code

# Load necessary packages using pacman for easier dependency management
pacman::p_load(
  tidyverse,   # Collection of R packages for data science (ggplot2, dplyr, etc.)
  showtext,    # Enables custom fonts for ggplot2
  ggtext,      # Adds rich text formatting to ggplot2
  skimr,       # Provides summary statistics in a readable format
  janitor,     # Cleans and formats column names for consistency
  shadowtext,  # Adds text with shadows in ggplot2 for better readability
  glue         # Provides a more readable way to concatenate and format strings
)

# Add Google fonts
font_add_google("Roboto Condensed", family = "Roboto")
font_add("Simpsons", regular = here::here("fonts/Simpsons.otf"))

# Add local font
font_add("Font Awesome 6 Brands", here::here("fonts/otfs/Font Awesome 6 Brands-Regular-400.otf"))

# Automatically enable the use of showtext for all plots
showtext_auto()

# Set DPI for high-resolution text rendering
showtext_opts(dpi = 300)

2. 📖 Read in the Data

Show code

# Load the TidyTuesday data
tuesdata <- tidytuesdayR::tt_load(2025, week = 5)

characters <- tuesdata$simpsons_characters %>% clean_names()
episodes <- tuesdata$simpsons_episodes %>% clean_names()
locations <- tuesdata$simpsons_locations %>% clean_names()
script_lines <- tuesdata$simpsons_script_lines %>% clean_names()

tidytuesdayR::readme(tuesdata)  # Display dataset documentation
rm(tuesdata)  # Remove raw data object to free up memory

3. 🕵️ Examine the Data

Show code

# Preview the structure of each dataset using glimpse()
glimpse(characters)
glimpse(episodes)
glimpse(locations)
glimpse(script_lines)

4. 🤼 Wrangle Data

Show code

# Select the episode with the lowest IMDb rating
episodes %>%
  slice_min(imdb_rating) %>%
  select(imdb_rating, imdb_votes, number_in_season, season, title)

# Select the episode with the highest IMDb rating
episodes %>%
  slice_max(imdb_rating) %>%
  select(imdb_rating, imdb_votes, number_in_season, season, title)

# Select the episode with the fewest IMDb votes
episodes %>%
  slice_min(imdb_votes) %>%
  select(imdb_votes)

# Select the episode with the most IMDb votes
episodes %>%
  slice_max(imdb_votes) %>%
  select(imdb_votes) 

# Remove episodes with missing IMDb ratings
episodes_clean <- episodes %>%
  drop_na(imdb_rating)

# Count the number of episodes per season
episodes_clean %>%
  group_by(season) %>%
  count(season)

5. 🔤 Text

Show code

title <- "the Simpsons Ratings"
st <- "Highest and Lowest Rated Episodes: Barthood (S27E09, 8.4 IMDb, 698 votes) vs. Lisa Goes Gaga (S23E22, 4.5 IMDb, 1,215 votes)"

# Create a social media caption with customized colors and font for consistency in visualization
social <- andresutils::social_caption(font_family = "Roboto", icon_color = "black", bg_color = "#C5DCEE") 

# Construct the final plot caption by combining TidyTuesday details, data source, and the social caption
cap <- paste0(
  "#TidyTuesday: Week 05, 2025 | **Source**: The Simpsons Dataset | **Graphic**: ", social
)

6. 📊 Plot

Show code

# Create a scatter plot visualizing IMDb ratings and votes for each episode
p <- episodes_clean %>%
  ggplot(aes(x = factor(number_in_season), y = fct_rev(factor(season)))) +
  geom_point(aes(color = imdb_rating, size = imdb_votes)) +
  scale_color_gradientn(colors = LaCroixColoR::lacroix_palette("PassionFruit", type = "continuous"), breaks = c(4.5, 5.5, 6.5, 7.5, 8.4)) +
  scale_size_area(breaks = c(104, 600, 1215), label = scales::comma) +
  guides(color = guide_colorbar(barheight = unit(7, "lines"),
                                barwidth = unit(0.5, "lines"),
                                order = 1,
                                theme = theme(
                                  legend.title = element_text(margin = margin(b = 7))
                                )),
         size = guide_legend(override.aes = list(shape = 21))) +
  labs(x = "Episode",
       y = "Season",
       title = title,
       subtitle = st,
       caption = cap,
       color = "IMDb rating", size = "IMDb\nrating count") +
  theme_minimal(base_family = "Roboto") +
  theme(
    plot.title = element_shadowtext(family = "Simpsons", face = "bold", bg.r = 0.05, size = 14, color = "#FFDF00", margin = margin(b = 20), hjust = 1.15),
    plot.subtitle = element_textbox_simple(size = 7, margin = margin(b = 10), lineheight = 1.2),
    plot.background = element_rect(fill = "#C5DCEE", color = NA),
    axis.text = element_text(color = "black", size = 8),
    axis.title.x=element_text(color = "black", size = 9, face = "bold", margin = margin(t = 3)),
    axis.title.y=element_text(color = "black", size = 9, face = "bold"),
    plot.margin = margin(0.5, 0.5, 0.5, 0.5, unit = "cm"),
    panel.grid = element_line(linetype = "dotted", color = "grey65", size = 0.2),
    legend.title = element_text(size = 8, margin = margin(b = 2)),
    legend.text = element_text(size = 7.5),
    plot.caption.position = "plot",
    plot.caption = element_markdown(size = 6, hjust = 0, margin = margin(t = 10))
  )

7. 💾 Save

Show code

# Save plot with dimensions
andresutils::save_plot(p, type = "tidytuesday", year = 2025, week = 5, width = 6, height = 4.5)

8. 🚀 GitHub Repository

Expand for GitHub Repo

The complete code for this analysis is available in tt_05_2025.qmd.

For the full repository, click here.

Do you enjoy my blog? Subscribe here to get notifications and updates (it's free!):

--- title: "The Simpsons Dataset" description: "This visualization showcases IMDb ratings for The Simpsons episodes across multiple seasons, where color represents the rating (from 4.5 to 8.4) and dot size reflects the number of votes. Notably, Barthood (S27E09) holds the highest rating (8.4), while Lisa Goes Gaga (S23E22) is the lowest-rated episode (4.5) with the most votes." lightbox: true date: February 12,2025 categories: [TidyTuesday, R Programming, Data Visualization, Simpsons] tags: [ggplot2, data-visualization, tidyverse] image: "tt_05_2025.png" editor_options: chunk_output_type: inline --- ![Created using ggplot2 in R, the chart utilizes color encoding to represent IMDb ratings, ranging from 4.5 (lowest) to 8.4 (highest), as shown in the scale on the right. Size encoding indicates the number of IMDb votes per episode, with reference values (104, 600, and 1,215 votes) displayed in the legend. Notable highlights include "Barthood" (S27E09) as the highest-rated episode (8.4 IMDb, 698 votes) and "Lisa Goes Gaga" (S23E22) as the lowest-rated episode (4.5 IMDb, 1,215 votes), marked by the largest and most saturated pink dot.](tt_05_2025.png){#fig-1} ### <mark> **How This Graphic Was Made** </mark> #### 1. 📦 Load Packages & Setup ```{r} #| label: load #| warning: false #| message: false #| results: hide # Load necessary packages using pacman for easier dependency management pacman::p_load( tidyverse, # Collection of R packages for data science (ggplot2, dplyr, etc.) showtext, # Enables custom fonts for ggplot2 ggtext, # Adds rich text formatting to ggplot2 skimr, # Provides summary statistics in a readable format janitor, # Cleans and formats column names for consistency shadowtext, # Adds text with shadows in ggplot2 for better readability glue # Provides a more readable way to concatenate and format strings ) # Add Google fonts font_add_google("Roboto Condensed", family = "Roboto") font_add("Simpsons", regular = here::here("fonts/Simpsons.otf")) # Add local font font_add("Font Awesome 6 Brands", here::here("fonts/otfs/Font Awesome 6 Brands-Regular-400.otf")) # Automatically enable the use of showtext for all plots showtext_auto() # Set DPI for high-resolution text rendering showtext_opts(dpi = 300) ``` #### 2. 📖 Read in the Data ```{r} #| label: read #| include: true #| eval: true #| warning: false # Load the TidyTuesday data tuesdata <- tidytuesdayR::tt_load(2025, week = 5) characters <- tuesdata$simpsons_characters %>% clean_names() episodes <- tuesdata$simpsons_episodes %>% clean_names() locations <- tuesdata$simpsons_locations %>% clean_names() script_lines <- tuesdata$simpsons_script_lines %>% clean_names() tidytuesdayR::readme(tuesdata) # Display dataset documentation rm(tuesdata) # Remove raw data object to free up memory ``` #### 3. 🕵️ Examine the Data ```{r} #| label: examine #| include: true #| eval: true #| results: hide #| warning: false # Preview the structure of each dataset using glimpse() glimpse(characters) glimpse(episodes) glimpse(locations) glimpse(script_lines) ``` #### 4. 🤼 Wrangle Data ```{r} #| label: wrangle #| warning: false #| results: hide # Select the episode with the lowest IMDb rating episodes %>% slice_min(imdb_rating) %>% select(imdb_rating, imdb_votes, number_in_season, season, title) # Select the episode with the highest IMDb rating episodes %>% slice_max(imdb_rating) %>% select(imdb_rating, imdb_votes, number_in_season, season, title) # Select the episode with the fewest IMDb votes episodes %>% slice_min(imdb_votes) %>% select(imdb_votes) # Select the episode with the most IMDb votes episodes %>% slice_max(imdb_votes) %>% select(imdb_votes) # Remove episodes with missing IMDb ratings episodes_clean <- episodes %>% drop_na(imdb_rating) # Count the number of episodes per season episodes_clean %>% group_by(season) %>% count(season) ``` #### 5. 🔤 Text ```{r} #| label: text #| include: true #| warning: false title <- "the Simpsons Ratings" st <- "Highest and Lowest Rated Episodes: Barthood (S27E09, 8.4 IMDb, 698 votes) vs. Lisa Goes Gaga (S23E22, 4.5 IMDb, 1,215 votes)" # Create a social media caption with customized colors and font for consistency in visualization social <- andresutils::social_caption(font_family = "Roboto", icon_color = "black", bg_color = "#C5DCEE") # Construct the final plot caption by combining TidyTuesday details, data source, and the social caption cap <- paste0( "#TidyTuesday: Week 05, 2025 | **Source**: The Simpsons Dataset | **Graphic**: ", social ) ``` #### 6. 📊 Plot ```{r} #| label: plot #| warning: false # Create a scatter plot visualizing IMDb ratings and votes for each episode p <- episodes_clean %>% ggplot(aes(x = factor(number_in_season), y = fct_rev(factor(season)))) + geom_point(aes(color = imdb_rating, size = imdb_votes)) + scale_color_gradientn(colors = LaCroixColoR::lacroix_palette("PassionFruit", type = "continuous"), breaks = c(4.5, 5.5, 6.5, 7.5, 8.4)) + scale_size_area(breaks = c(104, 600, 1215), label = scales::comma) + guides(color = guide_colorbar(barheight = unit(7, "lines"), barwidth = unit(0.5, "lines"), order = 1, theme = theme( legend.title = element_text(margin = margin(b = 7)) )), size = guide_legend(override.aes = list(shape = 21))) + labs(x = "Episode", y = "Season", title = title, subtitle = st, caption = cap, color = "IMDb rating", size = "IMDb\nrating count") + theme_minimal(base_family = "Roboto") + theme( plot.title = element_shadowtext(family = "Simpsons", face = "bold", bg.r = 0.05, size = 14, color = "#FFDF00", margin = margin(b = 20), hjust = 1.15), plot.subtitle = element_textbox_simple(size = 7, margin = margin(b = 10), lineheight = 1.2), plot.background = element_rect(fill = "#C5DCEE", color = NA), axis.text = element_text(color = "black", size = 8), axis.title.x=element_text(color = "black", size = 9, face = "bold", margin = margin(t = 3)), axis.title.y=element_text(color = "black", size = 9, face = "bold"), plot.margin = margin(0.5, 0.5, 0.5, 0.5, unit = "cm"), panel.grid = element_line(linetype = "dotted", color = "grey65", size = 0.2), legend.title = element_text(size = 8, margin = margin(b = 2)), legend.text = element_text(size = 7.5), plot.caption.position = "plot", plot.caption = element_markdown(size = 6, hjust = 0, margin = margin(t = 10)) ) ``` #### 7. 💾 Save ```{r} #| label: save #| warning: false # Save plot with dimensions andresutils::save_plot(p, type = "tidytuesday", year = 2025, week = 5, width = 6, height = 4.5) ``` #### 8. 🚀 GitHub Repository ::: {.callout-tip collapse="true"} ##### Expand for GitHub Repo The complete code for this analysis is available in [`tt_05_2025.qmd`](https://github.com/OKcomputer626/Andres-Portfolio/blob/master/visualization/TidyTuesday/2025/tt_05_2025.qmd). For the full repository, [click here](https://github.com/OKcomputer626/Andres-Portfolio). :::