This page presents a selection of data visualizations that I believe best represent my skills and successes so far in my career. They are drawn from coursework, original research projects, and side projects. Each entry includes a brief note on the history behind the figure and the R code used to generate it.

Gun Violence in the United States

Plot

Background

This figure was originally created as part of a graduate course on data visualization. We were tasked with recreating and improving upon a plot created by the Pew Research Center showing the trend in the percentage of Americans who favor protecting gun rights over making firearm regulations stricter. My project partner and I decided to plot the trend line over time as seen on Pew Research’s website, but with markers behind the line denoting instances of a mass shooting. By marking these instances with thin light lines, we were able to show how the volume of such shootings varied over time and potentially impacted public opinion. We also highlighted particularly large shootings, marking with red lines and labels any shooting whose total victim count was at least 50% of that of the largest shooting up to that point in time. The effect is striking, as these particularly salient events, including the Columbine, Virginia Tech, Fort Hood, and Aurora movie theater shootings, are all highlighted. Finally, we chose to plot the trend on a y-axis ranging from 0% to 100% to demonstrate the relative stability of mass opinion on this question. From this view, we can see that while the overall trend has been an increase in the percentage of Americans favoring the protection of gun rights, the change has been small, with few events leading to any noticeable interruption of this stability.
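
The running-maximum rule is the only non-obvious transformation in the code, so here is a toy illustration of it before the full script (a sketch, not part of the original code):

# A toy illustration of the highlighting rule: a shooting is flagged when
# its victim count is at least 50% of the running maximum to date
library(dplyr)

data.frame(victims = c(4, 13, 6, 32, 12, 20)) %>%
  mutate(severity = victims / cummax(victims),
         flagged = severity >= 0.5)
# severity: 1.000 1.000 0.462 1.000 0.375 0.625 -> rows 1, 2, 4, and 6 flagged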

Code

library(dplyr)
library(ggplot2)

# data from the Stanford Mass Shootings in America (MSA) database
mass_shooting <- read.csv("Data/Stanford_MSA_Database_for_release_01082016.csv")

# percent favoring protection of gun rights
gun_att <- data.frame(
  Date =
    lubridate::mdy(
      c(
        "12/5/1993", "5/16/1999", "6/14/1999", "3/19/2000", "4/16/2000", 
        "5/6/2000", "6/8/2003", "2/16/2004", "4/22/2007", "11/26/2007", 
        "4/27/2008", "4/21/2009", "3/14/2010", "9/6/2010", "1/16/2011", 
        "3/1/2011", "10/4/2011", "4/15/2012", "7/29/2012", "12/19/2012", 
        "1/13/2013", "2/18/2013", "5/5/2013", "2/9/2014", "12/7/2014", 
        "7/20/2015"
      )
    ),
  Pct_Gun_Rights = c(
    35, 30, 33, 29, 37, 38, 42, 37, 32, 42, 38, 45, 46, 46, 49, 48, 47, 49, 46, 
    42, 45, 46, 48, 49, 52, 47
  )
)

# clean up data and create relevant metrics
mass_shooting <- 
  mass_shooting %>% 
  select(CaseID, Title, Location, City, State, Country,
         Number.of.Victim.Fatalities, Total.Number.of.Fatalities, 
         Number.of.Victims.Injured, Total.Number.of.Victims,
         Date) %>%
  mutate(Date = lubridate::mdy(Date)) %>%
  arrange(Date) %>%
  mutate(Severity_V_Fatalities = Number.of.Victim.Fatalities / 
           cummax(Number.of.Victim.Fatalities),
         Severity_T_Fatalities = Total.Number.of.Fatalities / 
           cummax(Total.Number.of.Fatalities),
         Severity_V_Injuries = Number.of.Victims.Injured / 
           cummax(Number.of.Victims.Injured),
         Severity_T_Victims = Total.Number.of.Victims / 
           cummax(Total.Number.of.Victims),
         Worst_Ever_V_Fatalities = ifelse(Severity_V_Fatalities == 1, 1, 0),
         Worst_Ever_T_Fatalities = ifelse(Severity_T_Fatalities == 1, 1, 0),
         Worst_Ever_V_Injuries = ifelse(Severity_V_Injuries == 1, 1, 0),
         Worst_Ever_T_Victims = ifelse(Severity_T_Victims == 1, 1, 0))

# plot the data
ggplot() +
  geom_vline(data = mass_shooting %>%
               filter(Date >= min(Date),
                      Date <= lubridate::mdy("07-20-2015")), 
             aes(xintercept = Date), color = "white") +
  geom_vline(data = mass_shooting %>% 
               filter(Date >= min(Date),
                      Date <= lubridate::mdy("07-20-2015"),
                      Severity_T_Victims >= 0.5),
             aes(xintercept = Date), color = "red") +
  ggrepel::geom_label_repel(data = mass_shooting %>% 
                              filter(Date >= min(Date),
                                     Date <= lubridate::mdy("07-20-2015"),
                                     Severity_T_Victims >= 0.5) %>%
                              mutate(label_2 = 
                                       case_when(Title == 
                                                   "Fairchild Air Force Base Hospital" ~
                                                   "Fairchild\nAir Force Base Hospital",
                                                 Title ==
                                                   "Long Island Rail Road Commuter Train" ~
                                                   "Long Island Rail Road\nCommuter Train",
                                                 TRUE ~ Title)),
                            aes(x = Date, label = label_2),
                            y = 60,
                            hjust = .5,
                            colour = "#ff0000",
                            segment.size = 0.5,
                            segment.color = "black",
                            size = 3,
                            force = 20) +
  geom_line(data = gun_att,
            aes(x = Date, y = Pct_Gun_Rights),
            color = "#46a9c7",
            size = 2) +
  geom_point(data = gun_att,
             aes(x = Date, y = Pct_Gun_Rights),
             fill = "#46a9c7",
             shape = 21,
             color = "white",
             stroke = 0.6,
             size = 2) +
  scale_x_date(limits = c(lubridate::mdy("1-1-1993"),
                          lubridate::mdy("8-31-2015")),
               date_breaks = "1 year",
               date_labels = "%Y") +
  ylim(c(0, 100)) +
  ylab("% Favoring Protection of Gun Rights") +
  labs(caption = "\nMass shooting data from Stanford and attitude data from Pew Research.
Vertical lines mark instances of a mass shooting on a given date. 
Red lines indicate shootings that had at least 50% of the maximum victims at the time of the shooting.") +
  fischeR::theme_saf_website() +
  theme(panel.background = element_rect(fill = "grey35", color = NA))

Assessing Network Statistics

Plot

Background

This figure is featured in the second empirical chapter of my dissertation, which studies the musical preferences of American partisans. In this study, I build networks linking partisans together if they share most-listened-to artists on Spotify and calculate two statistics, the network assortativity and the network modularity, to assess whether any individual-level features are associated with an increased probability of connections forming in the network. To evaluate whether the observed statistics are statistically significant, I compare them to values drawn from 1,000 random networks with the same degree distribution. The figure plots each observed value as a dashed red line over the distribution of values from the random networks. The middle 95% of each distribution is shaded in gray; an observed value falling in this region is not statistically significant at the 0.05 level.
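
Before the full script, here is a compact sketch of the test for a single statistic, using the spotify_one_mode_graph object loaded below (a restatement of the logic, not the original code):

# Compare the observed assortativity to 1000 degree-preserving rewirings
library(igraph)

obs <- assortativity(spotify_one_mode_graph, V(spotify_one_mode_graph)$Party_ID)
null <- replicate(1000, {
  g_prime <- rewire(spotify_one_mode_graph, keeping_degseq())
  assortativity(g_prime, V(g_prime)$Party_ID)
})
quantile(null, c(0.025, 0.975))  # obs outside this band => significant at p < .05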

Code

library(ggplot2)
library(patchwork)
library(dplyr)
library(igraph)
library(progress)

# load data
load("Data/spotify_one_mode_graph.RData")

# Create function to create random network and collect measures of interest
random_net <- function(g) {
  require(dplyr)
  g_prime = 
    g %>%
    rewire(keeping_degseq())
  
  out_df <-
    data.frame(
      Metric = c(rep("Assortativity", 4),
                 rep("Modularity", 6)),
      Variable = c("Party ID (7 Points)",
                   "Ideology (7 Points)",
                   "Age",
                   "Occupational Prestige",
                   "Party ID",
                   "Ideology",
                   "Gender",
                   "Race",
                   "Urban/Rural",
                   "Social Class"),
      Value = c(
        # PID - 7 Point Scale
        assortativity(g_prime, V(g_prime)$Party_ID),

        # Ideology - 7 Point Scale
        assortativity(g_prime, V(g_prime)$Ideology),
        
        # Age
        assortativity(g_prime, V(g_prime)$Age),
        
        # Occupational Prestige
        assortativity(g_prime, V(g_prime)$Occ_Prestige),
        
        # Party ID - 3 Point Scale
        modularity(g_prime, 
                   as.factor(V(g_prime)$Party_ID_3)),
        
        # Ideology - 3 Point Scale
        modularity(g_prime, 
                   as.factor(V(g_prime)$Ideology_3)),
        
        # Gender
        modularity(g_prime, 
                   as.factor(V(g_prime)$Gender)),
        
        # Race
        modularity(g_prime, 
                   as.factor(V(g_prime)$Race)),
        
        # Urban-Rural
        modularity(g_prime, 
                   as.factor(V(g_prime)$Urban_Rural)),
        
        # Social Class
        modularity(g_prime, 
                   as.factor(V(g_prime)$Class))
      )
    )
  return(out_df)
}

# Generate 1000 random networks
# Apply function for first time
random_net_results <- random_net(spotify_one_mode_graph)
random_net_results$Trial <- 1

# Apply function 999 more times
pb <- progress_bar$new(
  format = "  [:bar] :percent eta: :eta",
  clear = FALSE, total = 999, width = 60)
for (i in 1:999) {
  x <- random_net(spotify_one_mode_graph)
  x$Trial <- i + 1
  random_net_results <-
    random_net_results %>%
    bind_rows(x)
  pb$tick()
  Sys.sleep(2 / 100)
}

# Log the actual values for each measure
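# (these 10 values recycle down the stacked results, aligning with the
#  10 rows produced per trial in a fixed Metric/Variable order)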
random_net_results$Actual <-
  c(
    # PID - 7 Point Scale
    assortativity(spotify_one_mode_graph, V(spotify_one_mode_graph)$Party_ID),
    
    # Ideology - 7 Point Scale
    assortativity(spotify_one_mode_graph, V(spotify_one_mode_graph)$Ideology),
    
    # Age
    assortativity(spotify_one_mode_graph, V(spotify_one_mode_graph)$Age),
    
    # Occupational Prestige
    assortativity(spotify_one_mode_graph, 
                  V(spotify_one_mode_graph)$Occ_Prestige),
    
    # Party ID
    modularity(spotify_one_mode_graph, 
               as.factor(V(spotify_one_mode_graph)$Party_ID_3)),
    
    # Ideology
    modularity(spotify_one_mode_graph, 
               as.factor(V(spotify_one_mode_graph)$Ideology_3)),
    
    # Gender
    modularity(spotify_one_mode_graph, 
               as.factor(V(spotify_one_mode_graph)$Gender)),
    
    # Race
    modularity(spotify_one_mode_graph, 
               as.factor(V(spotify_one_mode_graph)$Race)),
    
    # Urban-Rural
    modularity(spotify_one_mode_graph, 
               as.factor(V(spotify_one_mode_graph)$Urban_Rural)),
    
    # Social Class
    modularity(spotify_one_mode_graph, 
               as.factor(V(spotify_one_mode_graph)$Class))
  )

# Create a dataframe with the cutoffs
cutoffs_df <-
  random_net_results %>%
  group_by(Metric, Variable, Actual) %>%
  summarise(Lower_Bound = quantile(Value, probs = 0.025),
            Upper_Bound = quantile(Value, probs = 0.975))

# Generate initial Assortativity density plot
ggplot(random_net_results %>%
         filter(Metric == "Assortativity"),
       aes(x = Value)) +
  geom_density(color = "white", fill = "white") +
  ylab("Density") +
  xlab("Assortativity") +
  facet_wrap(~ Variable, scales = "free_x") -> assort_distributions

# Collect data from the density plot
assort_diss <- ggplot_build(assort_distributions)$data[[1]]

# Combine density plot data with cutoffs data
assort_diss <-
  assort_diss %>%
  mutate(Variable = case_when(PANEL == 1 ~ "Age",
                              PANEL == 2 ~ "Ideology (7 Points)",
                              PANEL == 3 ~ "Occupational Prestige",
                              PANEL == 4 ~ "Party ID (7 Points)")) %>%
  inner_join(cutoffs_df %>% filter(Metric == "Assortativity"),
             by = c("Variable" = "Variable"))

# Replot using cutoffs data to shade middle 95% region
assort_distributions + geom_area(data = assort_diss %>% filter(x > Lower_Bound,
                                                               x < Upper_Bound), 
                                 aes(x = x, y = y), 
                                 fill = "grey50",
                                 alpha = 0.5) +
  geom_vline(aes(xintercept = Actual), lty = 2, color = "red") +
  facet_wrap(~ Variable, scales = "free") -> assort_final_rand

# Generate initial Modularity density plot
ggplot(random_net_results %>%
         filter(Metric == "Modularity"),
       aes(x = Value)) +
  geom_density(color = "white", fill = "white") +
  ylab("Density") +
  xlab("Modularity") +
  facet_wrap(~ Variable, scales = "free_x") -> mod_distributions

# Collect data from the density plot
mod_diss <- ggplot_build(mod_distributions)$data[[1]]

# Combine density plot data with cutoffs data
mod_diss <-
  mod_diss %>%
  mutate(Variable = case_when(PANEL == 1 ~ "Gender",
                              PANEL == 2 ~ "Ideology",
                              PANEL == 3 ~ "Party ID",
                              PANEL == 4 ~ "Race",
                              PANEL == 5 ~ "Social Class",
                              PANEL == 6 ~ "Urban/Rural")) %>%
  inner_join(cutoffs_df %>% filter(Metric == "Modularity"),
             by = c("Variable" = "Variable"))

# Replot using cutoffs data to shade middle 95% region
mod_distributions + geom_area(data = mod_diss %>% filter(x > Lower_Bound,
                                                         x < Upper_Bound), 
                              aes(x = x, y = y), 
                              fill = "grey50",
                              alpha = 0.5) +
  geom_vline(aes(xintercept = Actual), lty = 2, color = "red") +
  facet_wrap(~ Variable, scales = "free", ncol = 2) -> mod_final_rand

((assort_final_rand + 
    ggtitle("Assortativity") + 
    fischeR::theme_saf_website() +
    theme(panel.background = element_rect(fill = "grey35", color = NA))) /
    (mod_final_rand + 
       ggtitle("Modularity") + 
       fischeR::theme_saf_website() +
       theme(panel.background = element_rect(fill = "grey35", color = NA)))) & 
  theme(plot.background = element_rect(fill = "#fbd2e4", color = NA))

Prevalence of Local News

Plot

Background

This visualization was designed for my journal article with Kokil Jaidka and Yph Lelkes in Nature Human Behaviour. Our research showed that the rates of local and national news outlets in Google News search results varied depending on how deep into the results you scrolled. To communicate this point, we needed a visualization showing how the rates changed based on how many results we considered for each of the 32 queries we examined. The final figure uses a 32-facet design that allowed us to color-code the facet titles, making it easy to see which terms were locally oriented and which were generally oriented. The x-axis, running left to right, reflects looking deeper into the results, a natural extension of English speakers’ intuitive notion of reading.
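
The heavy lifting below is a loop that recomputes outlet-class shares at every result depth. The same computation can be sketched more compactly with purrr (an equivalent restatement, not the code used for the published figure):

# For each depth n, the share of each outlet class among the top n results
# per query, assuming the all_local_results data loaded below
library(dplyr)
library(purrr)

shares_by_depth <- map_dfr(1:104, function(n) {
  all_local_results %>%
    filter(classification %in% c("local", "regional", "national",
                                 "international"),
           rank_2 <= n) %>%
    count(term, classification, name = "N") %>%
    group_by(term) %>%
    mutate(rel_freq = N / sum(N), filt_level = n) %>%
    ungroup()
})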

Code

# load libraries
library(dplyr)
library(ggplot2)
library(fischeR)

# load local and national results
load("Data/all_local_results_with_source_timestamps.RData")
load("Data/all_national_results_with_source_timestamp.RData")

# get the count of outlets in each class for each search term in locally-oriented set
outlet_counts_local <- all_local_results %>%
  filter(classification %in% c("local", "regional", "national", 
                               "international")) %>%
  group_by(term, classification) %>%
  summarise(N = n()) %>%
  ungroup() %>%
  group_by(term) %>%
  mutate(rel_freq = N / sum(N),
         classification = factor(classification, 
                                 levels = c("local", "regional", "national",
                                            "international")))

# reverse term factor levels for alphabetical appearance in plot
outlet_counts_local$term <- factor(outlet_counts_local$term, 
                                   levels = 
                                     rev(unique(outlet_counts_local$term)))

# set maximum N allowed as max N in data
outlet_counts_local$filt_level <- 104

# for each N-1 step recalculate shares
for (i in 103:1) {
  outlet_counts_local_2 <- all_local_results %>%
    filter(classification %in% c("local", "regional", "national", 
                                 "international"),
           rank_2 <= i) %>%
    group_by(term, classification) %>%
    summarise(N = n()) %>%
    ungroup() %>%
    group_by(term) %>%
    mutate(rel_freq = N / sum(N),
           classification = factor(classification, 
                                   levels = c("local", "regional", "national",
                                              "international")))
  
  outlet_counts_local_2$term <- factor(outlet_counts_local_2$term, 
                                       levels = 
                                         rev(unique(outlet_counts_local_2$term)))
  
  outlet_counts_local_2$filt_level <- i
  
  # bind to the original data
  outlet_counts_local <-
    outlet_counts_local %>%
    bind_rows(outlet_counts_local_2)
}

# get the count of outlets in each class for each search term in the generally-oriented set
outlet_counts_national <- all_national_results %>%
  filter(term != "nothing") %>%
  filter(classification %in% c("local", "regional", "national", 
                               "international")) %>%
  group_by(term, classification) %>%
  summarise(N = n()) %>%
  ungroup() %>%
  group_by(term) %>%
  mutate(rel_freq = N / sum(N),
         classification = factor(classification, 
                                 levels = c("local", 
                                            "regional", 
                                            "national", 
                                            "international")))

# reverse term factor levels for proper order in figure
outlet_counts_national$term <- 
  factor(outlet_counts_national$term, 
         levels = 
           rev(unique(outlet_counts_national$term)))

# set maximum N based on data
outlet_counts_national$filt_level <- 105

# repeat for each N-1 step
for (i in 104:1) {
  outlet_counts_national_2 <- all_national_results %>%
    filter(term != "nothing") %>%
    filter(classification %in% c("local", "regional", "national", 
                                 "international"),
           rank_2 <= i) %>%
    group_by(term, classification) %>%
    summarise(N = n()) %>%
    ungroup() %>%
    group_by(term) %>%
    mutate(rel_freq = N / sum(N),
           classification = factor(classification, 
                                   levels = c("local", "regional", "national",
                                              "international")))
  
  outlet_counts_national_2$term <- factor(outlet_counts_national_2$term, 
                                       levels = 
                                         rev(unique(outlet_counts_national_2$term)))
  
  outlet_counts_national_2$filt_level <- i
  
  # merge with original
  outlet_counts_national <-
    outlet_counts_national %>%
    bind_rows(outlet_counts_national_2)
}

# assign group labels
outlet_counts_local$term_type <- "local"
outlet_counts_national$term_type <- "national"

# bind datasets
outlet_counts <-
  outlet_counts_local %>%
  bind_rows(outlet_counts_national)

# convert term labels for figure
outlet_counts <-
  outlet_counts %>%
  ungroup() %>%
  mutate(term = as.character(term),
         term = case_when(term == "accident" ~ "Accident",
                          term == "college" ~ "College",
                          term == "crime" ~ "Crime",
                          term == "death" ~ "Death",
                          term == "emergency%20services" ~ "Emergency Services",
                          term == "governor" ~ "Governor",
                          term == "high%20school" ~ "High School",
                          term == "hospital" ~ "Hospital",
                          term == "mayor" ~ "Mayor",
                          term == "obituary" ~ "Obituary",
                          term == "police" ~ "Police",
                          term == "school%20board" ~ "School Board",
                          term == "traffic" ~ "Traffic",
                          term == "transit" ~ "Transit",
                          term == "university" ~ "University",
                          term == "weather" ~ "Weather",
                          term == "abortion" ~ "Abortion",
                          term == "caravan" ~ "Caravan",
                          term == "climate" ~ "Climate",
                          term == "conservative" ~ "Conservative",
                          term == "corruption" ~ "Corruption",
                          term == "election" ~ "Election",
                          term == "fbi" ~ "FBI",
                          term == "gun" ~ "Gun",
                          term == "immigration" ~ "Immigration",
                          term == "liberal" ~ "Liberal",
                          term == "politics" ~ "Politics",
                          term == "president" ~ "President",
                          term == "scandal" ~ "Scandal",
                          term == "shutdown" ~ "Shutdown",
                          term == "syria" ~ "Syria",
                          term == "taxes" ~ "Taxes"))

# reorder levels for proper order in figure
outlet_counts$term <- factor(outlet_counts$term, levels = c("Accident", "College", "Crime", "Death", "Emergency Services",
                                                            "Governor", "High School", "Hospital", "Mayor",
                                                            "Obituary", "Police", "School Board", "Traffic", "Transit",
                                                            "University", "Weather", "Abortion", "Caravan", "Climate",
                                                            "Conservative", "Corruption", "Election", "FBI", "Gun",
                                                            "Immigration", "Liberal", "Politics", "President", "Scandal",
                                                            "Shutdown", "Syria", "Taxes"))
# create figure in ggplot2
p <-
  ggplot(outlet_counts %>% filter(classification %in% c("local", "national"), filt_level >= 5), 
       aes(x = filt_level, y = rel_freq, lty = classification)) +
  geom_vline(xintercept = 5, color = "grey85") +
  geom_vline(xintercept = 10, color = "grey85") +
  geom_vline(xintercept = 15, color = "grey85") +
  geom_vline(xintercept = 25, color = "grey85") +
  geom_vline(xintercept = 50, color = "grey85") +
  geom_vline(xintercept = 75, color = "grey85") +
  geom_vline(xintercept = 100, color = "grey85") +
  geom_line(size = 1) +
  # scale_color_manual("Term Type", values = c("#670de1", "#d38ddd"), labels = c("Localized", "General")) +
  scale_linetype_discrete("Outlet Type", labels = c("Local", "National")) +
  xlab("Top N Results") +
  ylab("Relative Frequency") +
  labs(caption = "The first 16 purple facets represent locally oriented queries.
The second 16 orange facets represent generally oriented queries.") +
  facet_wrap(~term, ncol = 4) +
  fischeR::theme_saf_website() +
  theme(legend.position = "bottom",
        strip.background = element_rect(color = "black", fill = "white", size = 1, linetype = "solid"),
        strip.text = element_text(color = "black", size = 10),
        panel.border = element_blank(),
        axis.line.x = element_line())

# create label colors using grid and grobs
g <- ggplot_gtable(ggplot_build(p))
strip_both <- which(grepl('strip-', g$layout$name))
fills <- c(rep("#f2ab70", 16), rep("#d38ddd", 16))
k <- 1
for (i in strip_both) {
  j <- which(grepl('rect', g$grobs[[i]]$grobs[[1]]$childrenOrder))
  g$grobs[[i]]$grobs[[1]]$children[[j]]$gp$fill <- fills[k]
  k <- k+1
}

# plot final figure
grid::grid.draw(g)

Network of Musical Preferences

Plot

Background

The network visualized here is based on data I collected by linking a survey to Spotify listening records as part of my dissertation. Every node in the network represents a respondent in my study, colored to indicate political affiliation (blue for Democrats, red for Republicans, and purple for Independents) and sized by number of connections. Nodes are connected if they share any artists among their fifty most-listened-to artists on Spotify. The network seen here is also the product of removing weak ties via a process called the disparity filter, which removes ties that are likely to occur by random chance rather than reflect a meaningful connection. With weak ties removed, we can see that the remaining strong ties exhibit a weak community structure around respondents’ partisan affiliations.
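
The code below loads a graph that has already been filtered (spotify_one_mode_graph_bb), so the filtering step itself is not shown. For reference, a minimal sketch of the disparity filter (Serrano, Boguñá, and Vespignani, 2009) on a weighted igraph object might look like the following; the weight attribute and the 0.05 significance level are assumptions, not the exact settings used here:

# Keep an edge if it is statistically significant from either endpoint
library(igraph)

disparity_filter <- function(g, alpha = 0.05) {
  keep <- rep(FALSE, ecount(g))
  for (v in V(g)) {
    edges <- incident(g, v)
    k <- length(edges)
    if (k < 2) next  # degree-1 edges can only survive via their other endpoint
    p <- edges$weight / sum(edges$weight)  # normalized weights at this node
    a <- (1 - p)^(k - 1)  # p-value under the uniform null
    keep[as.integer(edges)] <- keep[as.integer(edges)] | (a < alpha)
  }
  subgraph.edges(g, E(g)[keep], delete.vertices = FALSE)
}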

Code

library(igraph)
library(ggplot2)
library(ggraph)

load("Data/spotify_one_mode_graph_bb.RData")

ggraph(spotify_one_mode_graph_bb, layout = "graphopt") +
  geom_edge_fan(color = "grey85", alpha = 0.5) +
  geom_node_point(aes(fill = as.factor(Party_ID_3),
                      size = degree), shape = 21, color = "white", stroke = 0.6) +
  scale_fill_manual(values = c("#34AAE0", "#c08ee6", "#FF5600")) +
  scale_size_continuous(range = c(1, 4)) +
  fischeR::theme_saf_website() +
  theme(legend.position = "none",
        axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.background = element_rect(fill = "grey35", color = NA))

Changes in Mobility by State

Plot 1

Plot 2

Background

I have included two plots for this entry because, while similar, they serve slightly different purposes. The first plot, with a smoothed trend line generated by a generalized additive model (GAM), shows how the increase in time spent at home happened rapidly before leveling off; the smoothed curve makes this asymptotic feature quickly recognizable. The second plot, with three straight lines added to (almost all of) the panels, shows the linear trend before March 11th, between March 11th and the implementation of each state’s stay-at-home order (if one was enacted), and after the order took effect. This represents a more concrete analytic strategy of comparing the slopes across the three time periods: the linear trend lines visually capture what we might learn from a standard regression model with an interaction between continuous time and the given window. You can learn more about this project here.
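
As a companion to the second plot, here is a sketch of the kind of regression those lines represent, with a hypothetical period helper built from the same cutoffs used in Code 2 (the other column names follow the data loaded below):

# Linear time trend interacted with a three-level period indicator
library(dplyr)

model_df <- google_mobility_2 %>%
  mutate(period = case_when(
    date < lubridate::mdy("3-11-2020") ~ "pre_pandemic",
    !is.na(order_date) & date > order_date ~ "post_order",
    TRUE ~ "post_who"
  ))

fit <- lm(residential_percent_change_from_baseline ~
            as.numeric(date) * period, data = model_df)
summary(fit)  # the interaction terms capture the slope changes across windows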

Code 1

# load libraries
library(dplyr)
library(ggplot2)
library(patchwork)

# load data
load("Data/mobility_reports_4_30.RData")

# create plot
ggplot(google_mobility_2,
       aes(x = date,
           y = residential_percent_change_from_baseline)) +
  geom_point(pch = 1, alpha = 0.2) +
  geom_vline(xintercept = lubridate::mdy("3-11-2020"), 
             lty = 2, 
             col = "#a184b0") +
  geom_vline(aes(xintercept = order_date), 
             lty = 2, 
             col = "#C70039") +
  geom_smooth(col = "#fbd2e4") +
  ylab("Percentage Change from Baseline in Staying Home") +
  labs(captions = "Purple dashed lines mark March 11, 2020 the day that the WHO classified COVID-19 as a pandemic, the NBA stopped operations,
and Tom Hanks announced he had COVID-19. Red dashed lines mark the dates that states enacted stay-at-home orders. 
(Not all states enacted orders during this window.)") +
  facet_wrap(~ state_abrv, nrow = 10) +
  fischeR::theme_saf_website() +
  theme(axis.title.x = element_blank())

Code 2

# load libraries
library(dplyr)
library(ggplot2)
library(patchwork)

# load data
load("Data/mobility_reports_4_30.RData")

# create plot
ggplot(google_mobility_2,
       aes(x = date,
           y = residential_percent_change_from_baseline)) +
  geom_point(pch = 1, alpha = 0.2) +
  geom_vline(xintercept = lubridate::mdy("3-11-2020"), 
             lty = 2, 
             col = "#a184b0") +
  geom_vline(aes(xintercept = order_date), 
             lty = 2, 
             col = "#C70039") +
  geom_smooth(data = google_mobility_2 %>%
                filter(date < lubridate::mdy("3-11-2020")),
              method = "lm", col = "#fbd2e4") +
  geom_smooth(data = google_mobility_2 %>%
                filter(date >= lubridate::mdy("3-11-2020")) %>%
                group_by(state_abrv) %>%
                filter(date < order_date),
              method = "lm", col = "#fbd2e4") +
  geom_smooth(data = google_mobility_2 %>%
                filter(date >= lubridate::mdy("3-11-2020")) %>%
                group_by(state_abrv) %>%
                filter(date > order_date),
              method = "lm", col = "#fbd2e4") +
  ylab("Percentage Change from Baseline in Staying Home") +
  labs(captions = "Purple dashed lines mark March 11, 2020 the day that the WHO classified COVID-19 as a pandemic, the NBA stopped operations,
and Tom Hanks announced he had COVID-19. Red dashed lines mark the dates that states enacted stay-at-home orders. 
(Not all states enacted orders during this window.)") +
  facet_wrap(~ state_abrv, nrow = 10) +
  fischeR::theme_saf_website() +
  theme(axis.title.x = element_blank())

NBA Awards Voting

Plot

Background

This plot was generated for a side project examining the recent history of NBA awards voting. Using the publicly released results, I calculated how much each vote for a player diverged from the average vote for that player. The plot visualizes the distribution of these within-year standardized squared differences, broken out by year. Each individual vote is represented by a gray point. Violin plots are overlaid on each set of points to show the shape of each distribution and to mark the 25th, 50th, and 75th quantiles. Two particularly divergent votes are labeled.
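
The raw_votes data loaded below arrive with std_diff precomputed, so for context, here is a hedged sketch of how such a metric could be built; the player and points column names are assumptions:

# Squared deviation of each vote from the player's average vote,
# standardized within year (column names are assumed, not from the source)
library(dplyr)

votes_scored <- raw_votes %>%
  group_by(year, player) %>%
  mutate(sq_diff = (points - mean(points))^2) %>%
  group_by(year) %>%
  mutate(std_diff = as.numeric(scale(sq_diff))) %>%
  ungroup()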

Code

# load libraries
library(ggplot2)
library(ggannotate)

# load data
load("Data/raw_votes.RData")

# set seed for random layout
set.seed(101)

# create base figure
ggplot(raw_votes, aes(x = year, y = std_diff)) +
  geom_jitter(alpha = 0.3, 
              shape = 21, 
              fill = "grey95", 
              color = "white",
              stroke = 0.6) +
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
              alpha = 0.5, color = "red", fill = "#ffa7a1") +
  ylab("Standardized Difference") +
  xlab("Year") +
  fischeR::theme_saf_no_font() -> p1

# use ggannotate to add annotations in R
# ggannotate(p1)

# Add annotation snippets to p1 and adjust manually based on output
p2 <- p1 +
  geom_label(data = data.frame(x = 1.5064150341961, 
                               y = 11.1182082489709, 
                               label = "Mike Crispino votes\nSteph Curry\n5th place MVP"),
             mapping = aes(x = x,
                           y = y, 
                           label = label),
             label.padding = unit(0.25, "lines"),
             label.r = unit(0.15, "lines"),
             inherit.aes = FALSE) +
  geom_curve(data = data.frame(x = 1.65, 
                               y = 11.9, 
                               xend = 1.2, 
                               yend = 12.1),
             mapping = aes(x = x, y = y, xend = xend, yend = yend),
             arrow = arrow(30L, unit(0.1, "inches"),
                           "last", "closed"),
             inherit.aes = FALSE) +
  geom_label(data = data.frame(x = 3.5064150341961, 
                               y = 11.1182082489709, 
                               label = "Mark Berman votes\nBen Simmons\n3rd place ROY"),
             mapping = aes(x = x,
                           y = y, 
                           label = label),
             label.padding = unit(0.25, "lines"),
             label.r = unit(0.15, "lines"),
             inherit.aes = FALSE) +
  geom_curve(data = data.frame(x = 3.65, 
                               y = 11.9, 
                               xend = 4.29, 
                               yend = 11.625),
             mapping = aes(x = x, y = y, xend = xend, yend = yend),
             curvature = -0.35,
             arrow = arrow(30L, unit(0.1, "inches"),
                           "last", "closed"),
             inherit.aes = FALSE)

# plot final figure with theme
p2 +
  fischeR::theme_saf_website() +
  theme(legend.position = "none",
        axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.background = element_rect(fill = "grey35", color = NA))