data visualization III

advanced plots

3D scatter plots

phylochemistry contains a function to help you make somewhat decent 3D scatter plots. Let’s look at an example (see below). For this, we use the function points3D. We give it a data argument containing the vectors of data that should be on the x, y, and z axes, along with a vector that uniquely identifies each observation. We also tell it the angle of the z axis that we want, the integer to which ticks should be rounded, and the tick intervals. The function returns data that we can pass to ggplot to make a 3D plot.

# Spread the analytes into one column each, then build a per-row identifier
# by pasting the aquifer code and the well name together.
aquifers <- hawaii_aquifers %>%
  pivot_wider(names_from = "analyte", values_from = "abundance") %>%
  mutate(sample_unique_ID = paste0(aquifer_code, "_", well_name))

# Compute the pieces of a 3D scatter plot with points3D().
#   data            - x/y/z values plus the identifier for each observation
#   angle           - angle of the z axis
#   tick_round      - integer to which tick values are rounded
#   *_tick_interval - spacing between ticks on each axis
# The result is a list of data frames that are plotted with ggplot.
output <- points3D(
  data = data.frame(
    x = aquifers$SiO2,
    y = aquifers$Cl,
    z = aquifers$Mg,
    sample_unique_ID = aquifers$sample_unique_ID
  ),
  angle = pi / 2.4,
  tick_round = 10,
  x_tick_interval = 10,
  y_tick_interval = 20,
  z_tick_interval = 20
)

The output from points3D contains a grid, axes, and ticks, which should all be plotted using geom_segment. It also contains points that should be plotted with geom_point, and point segments that should be plotted with geom_segment. We can take the output from points3D and join it with the original data, which will occur according to our sample_unique_ID column. Then, we can also plot point metadata:

# Attach the sample metadata to the projected points. The join key is named
# explicitly; sample_unique_ID is the column the two tables share.
output$points <- left_join(output$points, aquifers, by = "sample_unique_ID")

# Layer order matters: grid first, then axes/ticks/labels, then the dotted
# drop lines, and finally the points on top.
ggplot() +
  geom_segment(
    data = output$grid, aes(x = x, xend = xend, y = y, yend = yend),
    color = "grey80"
  ) +
  geom_segment(data = output$axes, aes(x = x, xend = xend, y = y, yend = yend)) +
  geom_segment(data = output$ticks, aes(x = x, xend = xend, y = y, yend = yend)) +
  geom_text(
    data = output$labels, aes(x = x, y = y, label = label),
    hjust = 0.5
  ) +
  geom_segment(
    data = output$point_segments,
    aes(x = x, xend = xend, y = y, yend = yend),
    linetype = "dotted", color = "black"
  ) +
  # shape 21 is a filled circle, so `fill` carries the aquifer colour
  geom_point(
    data = output$points, aes(x = x, y = y, fill = aquifer_code),
    size = 3, shape = 21
  ) +
  theme_void() +
  scale_fill_manual(values = discrete_palette)

marginal summaries

# Add a second, artificial grouping variable: "A"/"B" alternating over the
# 150 rows of iris.
i2 <- iris %>%
  mutate(Species2 = rep(c("A", "B"), 75))

# Base scatter plot; the marginal densities below are added to it.
p <- ggplot(i2, aes(Sepal.Width, Sepal.Length, color = Species)) +
  geom_point()

# Side densities along x (filled by Species) and y (filled by Species2).
# after_stat() replaces the deprecated stat() for referring to computed
# variables.
p +
  geom_xsidedensity(
    aes(y = after_stat(density), xfill = Species),
    position = "stack"
  ) +
  geom_ysidedensity(
    aes(x = after_stat(density), yfill = Species2),
    position = "stack"
  ) +
  theme_bw() +
  facet_grid(Species ~ Species2, space = "free", scales = "free") +
  labs(title = "FacetGrid", subtitle = "Collapsing All Side Panels") +
  ggside(collapse = "all") +
  scale_xfill_manual(values = c("darkred", "darkgreen", "darkblue")) +
  scale_yfill_manual(values = c("black", "gold"))

representing distributions

You can also combine geoms to create more detailed representations of distributions:

# Raincloud-style plot of highway mileage by cylinder count: a half-density
# on one side, a narrow boxplot in the middle, and raw dots on the other side.
mpg %>%
  filter(cyl %in% c(4, 6, 8)) %>%
  ggplot(aes(x = factor(cyl), y = hwy, fill = factor(cyl))) +
  # .width = 0 and point_colour = NA hide the interval and point summary,
  # leaving only the density slab
  ggdist::stat_halfeye(
    adjust = 0.5, justification = -0.2, .width = 0, point_colour = NA
  ) +
  geom_boxplot(width = 0.12, outlier.color = NA, alpha = 0.5) +
  ggdist::stat_dots(side = "left", justification = 1.1, binwidth = .25)

venn diagrams

# Toy presence/absence table: 24 attributes (distinct letters) and, for each
# of three plants, a random TRUE/FALSE flag.
df <- data.frame(
  plant1 = sample(c(TRUE, FALSE), 24, replace = TRUE),
  plant2 = sample(c(TRUE, FALSE), 24, replace = TRUE),
  plant3 = sample(c(TRUE, FALSE), 24, replace = TRUE),
  attribute_name = sample(letters, 24, replace = FALSE)
)

# Run vennAnalysis() on the three logical columns and draw one translucent
# circle per row of its result (centre x/y, radius r, filled by category).
# geom_circle() is presumably the ggforce geom - confirm it is attached.
# NOTE(review): the original chain ended in a dangling `+`, so a final layer
# (for example a theme) may have been lost; the chain is simply ended here.
vennAnalysis(df[, 1:3]) %>%
  ggplot() +
  geom_circle(
    aes(x0 = x, y0 = y, r = r, fill = category),
    alpha = 0.4
  ) +
  scale_fill_brewer(palette = "Set1")

ternary plots

# Ternary plot of Ca / S / Na for each lake sample, coloured by park and
# sized by pH. The elements are first spread into one column each.
# ggtern() comes from the ggtern package.
alaska_lake_data %>%
  pivot_wider(names_from = "element", values_from = "mg_per_L") %>%
  ggtern(aes(
    x = Ca,
    y = S,
    z = Na,
    color = park,
    size = pH
  )) +
  geom_point()

map data

plotting boundaries

There is a simple way to plot maps with ggplot. The map data comes with ggplot2! Let’s have a look. See below some of the data sets included. Options included with ggplot are: world, world2, usa, state (US), county (US), nz, italy, and france. geom_polygon() is useful for plotting these, and (at least to me) seems more intuitive than geom_map().

Cool! We can see that lat, long, group, order, region, and subregion are included. That makes plotting easy. Note that coord_map() can help preserve aspect ratios:

# World map drawn as points, one colour per polygon group.
# coord_map() keeps the aspect ratio sensible for longitude/latitude data.
ggplot(map_data("world")) +
  geom_point(aes(x = long, y = lat, color = group), size = 0.5) +
  theme_void() +
  coord_map()

Note that we can use coord_map() to do some pretty cool things!

# The same world map of points, this time drawn with the "albers" projection;
# lat0 and lat1 are handed to coord_map() along with the projection name.
map_data("world") %>%
  ggplot(aes(x = long, y = lat, color = group)) +
  geom_point(size = 0.5) +
  theme_void() +
  coord_map(projection = "albers", lat0 = 39, lat1 = 45)

We can use filtering to produce maps of specific regions.

# County map of Minnesota: one filled polygon per county (subregion),
# outlined in black.
ggplot() +
  geom_polygon(
    data = filter(map_data("county"), region == "minnesota"),
    aes(x = long, y = lat, group = subregion, fill = subregion),
    color = "black"
  ) +
  theme_void() +
  # NOTE(review): the layer after theme_void() was lost; coord_map() is
  # assumed, matching the other maps in this section - confirm.
  coord_map()

maps with plots

Please note that the Great Lakes are in map_data()!

# Outline of the "Superior" subregion of the Great Lakes, drawn as a single
# path through the points in row order.
# NOTE(review): the original chain ended in a dangling `+`; a final layer
# (for example a theme) may have been lost. The chain is simply ended here.
filter(map_data("lakes"), region == "Great Lakes", subregion == "Superior") %>%
  ggplot() +
  geom_path(aes(x = long, y = lat)) +
  coord_map()

We can clean up the map by making different groups for geom_path() whenever two consecutive points are far apart:

# Step 1: Filter and prepare your data for Lake Superior (though yes, that includes Michigan and Huron)
# NOTE(review): the original pipeline ended in a dangling `%>%`; a further
# preparation step may have been lost. The pipeline is simply ended here.
lake_superior <- map_data("lakes") %>%
  filter(region == "Great Lakes", subregion == "Superior")

# Step 2: For every point, measure how far it is from the point in the row
# above it. lag() shifts the coordinates down one row, so the first row has
# no predecessor and its distance comes out as NA.
lake_superior <- mutate(
  lake_superior,
  lag_long = lag(long),
  lag_lat = lag(lat),
  # distHaversine() takes (longitude, latitude) matrices
  dist_to_prev = geosphere::distHaversine(
    cbind(long, lat),
    cbind(lag_long, lag_lat)
  )
)

# Step 3: Define a threshold (e.g., 50 km) and create the "distance_group"
# A new group starts whenever the jump from the previous point exceeds the
# threshold. The first row has no previous point (dist_to_prev is NA), so it
# must also start a group; otherwise the NA would propagate through cumsum()
# and every distance_group would be NA.
threshold <- 50000  # 50 km
lake_superior <- lake_superior %>%
  mutate(
    distance_group = cumsum(
      ifelse(dist_to_prev > threshold | is.na(dist_to_prev), 1, 0)
    )
  )

# Step 4: Plot the map with `distance_group`
# Grouping the path by distance_group stops geom_path() from drawing a line
# across the jump between two far-apart consecutive points.
# NOTE(review): the original chain ended in a dangling `+`; a final layer
# (for example a theme) may have been lost. The chain is simply ended here.
ggplot(lake_superior, aes(x = long, y = lat, group = distance_group)) +
  geom_path() +
  coord_map()

Now we could add some data. We could do something simple like plot total abundances as the size of a point:

# Load the PFAS measurements, then add up the abundance at every sampling
# site (a site is identified by its name plus its coordinates).
lake_superior_PFAS <- readMonolist("/Users/bust0037/Documents/Science/Websites/pfas_data_private.csv")
lake_superior_PFAS_summarized <- summarize(
  group_by(lake_superior_PFAS, site, lon, lat),
  total = sum(abundance)
)

# Lake outline (cropped to lat > 46, long < -84) with one point per sampling
# site, sized by the total abundance at that site.
# NOTE(review): the original chain ended in a dangling `+`; a final layer
# (for example a theme) may have been lost. The chain is simply ended here.
ggplot() +
  geom_path(
    data = filter(lake_superior, lat > 46, long < -84),
    aes(x = long, y = lat, group = distance_group)
  ) +
  geom_point(
    data = lake_superior_PFAS_summarized,
    aes(x = lon, y = lat, size = total),
    color = "black"
  ) +
  coord_map()

Or we could do something more sophisticated like add pie charts at each point:

lake_superior_PFAS <- readMonolist("/Users/bust0037/Documents/Science/Websites/pfas_data_private.csv")

# Keep only the rows of a single component (PFBA) and split them at
# longitude -90 so the two halves can be labelled with different geoms.
# presumably one PFBA row per site, giving one label per site - verify.
grouped_by_site <- filter(lake_superior_PFAS, component == "PFBA")
site_less_than_90lon <- filter(grouped_by_site, lon <= -90)
# Strictly greater than -90: with `>=` a site at exactly -90 would fall into
# both subsets and be labelled twice.
site_more_than_90lon <- filter(grouped_by_site, lon > -90)

# Build one pie chart per sampling site and collect them in a table with the
# site's coordinates (x, y) and the ggplot object itself (list column `plot`).
unique_sites <- unique(lake_superior_PFAS$site)
dataframe_of_pies <- vector("list", length(unique_sites))
for (i in seq_along(unique_sites)) {
  this_site <- filter(lake_superior_PFAS, site == unique_sites[i])
  # A stacked column at x = 1 bent around the y axis becomes a pie chart.
  this_sites_pie <- ggplot(this_site) +
    geom_col(aes(x = 1, y = abundance, fill = class_name), color = "black") +
    coord_polar(theta = "y") +
    theme_void() +
    scale_fill_brewer(palette = "Set1", guide = "none")
  # The pie is positioned at the coordinates of the site's first row.
  dataframe_of_pies[[i]] <- tibble(
    x = this_site$lon[1],
    y = this_site$lat[1],
    plot = list(this_sites_pie)
  )
}
# Stack the one-row tables into a single table.
dataframe_of_pies <- do.call(rbind, dataframe_of_pies)

# Lake outline with a marker at every sample location, a small pie chart
# inset at each site, and site-name labels.
# geom_plot() (presumably from ggpp) draws the ggplot objects stored in the
# `plot` list column as insets; geom_label_repel() is presumably from
# ggrepel - confirm both are attached.
# NOTE(review): the original chain ended in a dangling `+`; a final layer
# (for example a theme) may have been lost. The chain is simply ended here.
ggplot() +
  geom_path(
    data = filter(lake_superior, lat > 46, long < -84),
    aes(x = long, y = lat, group = distance_group)
  ) +
  geom_point(
    data = lake_superior_PFAS,
    aes(x = lon, y = lat),
    color = "black"
  ) +
  geom_plot(
    data = dataframe_of_pies, aes(x = x, y = y, label = plot),
    vp.width = 1 / 20, hjust = 0.5, vjust = 0.5, alpha = 0.5
  ) +
  # Sites at or west of -90 get repelled labels; the rest get plain labels.
  geom_label_repel(
    data = site_less_than_90lon,
    aes(x = lon, y = lat, label = site),
    size = 2.5, min.segment.length = 0.01
  ) +
  geom_label(
    data = site_more_than_90lon,
    aes(x = lon, y = lat, label = site),
    size = 2.5
  ) +
  coord_map()

You can also access a high resolution shoreline dataset for Lake Superior directly from the source() command as lake_superior_shoreline:

# NOTE(review): this path ends at a directory; the file name appears to have
# been truncated - confirm the intended shoreline file.
shore <- readMonolist("/Users/bust0037/Documents/Science/Websites/")

# Whole shoreline, drawn as tiny points.
# NOTE(review): both plots originally ended in a dangling `+`, which made the
# following statement part of the plot expression; a final layer (for example
# a theme) may have been lost. Each chain is simply ended here.
wide_view <- ggplot(shore) +
  geom_point(aes(y = lat, x = lon), size = 0.01) +
  coord_map()

# Zoom on the window 46.6 < lat < 47.2, lon < -90.
zoom_view <- ggplot(filter(shore, lat < 47.2, lat > 46.6, lon < -90)) +
  geom_point(aes(y = lat, x = lon), size = 0.01) +
  coord_map()

# Side by side, with the zoomed panel twice as wide as the overview.
# plot_grid() is presumably cowplot's - confirm it is attached.
plot_grid(wide_view, zoom_view, nrow = 1, rel_widths = c(1, 2))

further reading

For more on plotting maps in R: datavizplyr

For more advanced map plotting: R Spatial

For more on ternary plots: ggtern