data visualization III
advanced plots
3D scatter plots
phylochemistry
contains a function to help you make somewhat decent 3D scatter plots. Let’s look at an example (see below). For this, we use the function points3D
. Se give it a data
argument that gives it vectors of data that should be on the x, y, and z axes, along with a vector that uniquely identifies each observation. We also tell it the angle of the z axis that we want, the integer to which ticks should be rounded, and the tick intervals. The function returns data that we can pass to ggplot to make a 3D plot.
pivot_wider(hawaii_aquifers, names_from = "analyte", values_from = "abundance") %>%
mutate(sample_unique_ID = paste0(aquifer_code, "_", well_name)) -> aquifers
output <- points3D(
data = data.frame(
x = aquifers$SiO2,
y = aquifers$Cl,
z = aquifers$Mg,
sample_unique_ID = aquifers$sample_unique_ID
),
angle = pi/2.4,
tick_round = 10,
x_tick_interval = 10,
y_tick_interval = 20,
z_tick_interval = 20
)
str(output)
## List of 6
## $ grid :'data.frame': 14 obs. of 4 variables:
## ..$ y : num [1:14] 0 0 0 0 0 ...
## ..$ yend: num [1:14] 96.6 96.6 96.6 96.6 96.6 ...
## ..$ x : num [1:14] 10 20 30 40 50 ...
## ..$ xend: num [1:14] 35.9 45.9 55.9 65.9 75.9 ...
## $ ticks :'data.frame': 37 obs. of 4 variables:
## ..$ y : num [1:37] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ yend: num [1:37] 1.93 1.93 1.93 1.93 1.93 ...
## ..$ x : num [1:37] 10 20 30 40 50 60 70 80 10 20 ...
## ..$ xend: num [1:37] 10.5 20.5 30.5 40.5 50.5 ...
## $ labels :'data.frame': 29 obs. of 3 variables:
## ..$ y : num [1:29] -11.2 -11.2 -11.2 -11.2 -11.2 -11.2 -11.2 -11.2 0 20 ...
## ..$ x : num [1:29] 7.2 17.2 27.2 37.2 47.2 57.2 67.2 77.2 4.4 4.4 ...
## ..$ label: num [1:29] 10 20 30 40 50 60 70 80 0 20 ...
## $ axes :'data.frame': 3 obs. of 4 variables:
## ..$ x : num [1:3] 10 10 80
## ..$ xend: num [1:3] 80 10 106
## ..$ y : num [1:3] 0 0 0
## ..$ yend: num [1:3] 0 280 96.6
## $ point_segments:'data.frame': 106 obs. of 4 variables:
## ..$ x : num [1:106] 13 33.1 39.4 53.1 22.5 ...
## ..$ xend: num [1:106] 13 33.1 39.4 53.1 22.5 ...
## ..$ y : num [1:106] 27.3 81.6 82.6 109.5 19.5 ...
## ..$ yend: num [1:106] 7.34 11.59 12.56 15.45 5.51 ...
## $ points :'data.frame': 106 obs. of 3 variables:
## ..$ x : num [1:106] 13 33.1 39.4 53.1 22.5 ...
## ..$ y : num [1:106] 27.3 81.6 82.6 109.5 19.5 ...
## ..$ sample_unique_ID: chr [1:106] "aquifer_1_Alewa_Heights_Spring" "aquifer_1_Beretania_High_Service" "aquifer_1_Beretania_Low_Service" "aquifer_1_Kuliouou_Well" ...
The output from points3D contains a grid, axes, and ticks, which should all be plotted using geom_segment. It also contains points that should be plotted with geom_point, and point segments that should be plotted with geom_segement. We can take the output from points3D and join it with the original data, which will occurr according to our sample_unique_ID column. Then, we can also plot point metadata:
output$points <- left_join(output$points, aquifers)
## Joining with `by = join_by(sample_unique_ID)`
ggplot() +
geom_segment(
data = output$grid, aes(x = x, xend = xend, y = y, yend = yend),
color = "grey80"
) +
geom_segment(data = output$axes, aes(x = x, xend = xend, y = y, yend = yend)) +
geom_segment(data = output$ticks, aes(x = x, xend = xend, y = y, yend = yend)) +
geom_text(
data = output$labels, aes(x = x, y = y, label = label),
hjust = 0.5
) +
geom_segment(
data = output$point_segments,
aes(x = x, xend = xend, y = y, yend = yend),
linetype = "dotted", color = "black"
) +
geom_point(
data = output$points, aes(x = x, y = y, fill = aquifer_code),
size = 3, shape = 21
) +
theme_void() +
scale_fill_manual(values = discrete_palette)
marginal summaries
i2 <- iris %>%
mutate(Species2 = rep(c("A","B"), 75))
p <- ggplot(i2, aes(Sepal.Width, Sepal.Length, color = Species)) +
geom_point()
p + geom_xsidedensity(aes(y=stat(density), xfill = Species), position = "stack")+
geom_ysidedensity(aes(x=stat(density), yfill = Species2), position = "stack") +
theme_bw() +
facet_grid(Species~Species2, space = "free", scales = "free") +
labs(title = "FacetGrid", subtitle = "Collapsing All Side Panels") +
ggside(collapse = "all") +
scale_xfill_manual(values = c("darkred","darkgreen","darkblue")) +
scale_yfill_manual(values = c("black","gold"))
representing distributions
You can also combine geoms to create more detailed representations of distributions:
mpg %>% filter(cyl %in% c(4,6,8)) %>%
ggplot(aes(x = factor(cyl), y = hwy, fill = factor(cyl))) +
ggdist::stat_halfeye(
adjust = 0.5, justification = -0.2, .width = 0, point_colour = NA
) +
geom_boxplot(width = 0.12, outlier.color = NA, alpha = 0.5) +
ggdist::stat_dots(side = "left", justification = 1.1, binwidth = .25)
venn digrams
df <- data.frame(
plant1 = sample(c(TRUE, FALSE), 24, replace = TRUE),
plant2 = sample(c(TRUE, FALSE), 24, replace = TRUE),
plant3 = sample(c(TRUE, FALSE), 24, replace = TRUE),
attribute_name = sample(letters, 24, replace = FALSE)
)
vennAnalysis(df[,1:3]) %>%
ggplot() +
geom_circle(
aes(x0 = x, y0 = y, r = r, fill = category),
alpha = 0.4
) +
scale_fill_brewer(palette = "Set1") +
theme_void()
ternary plots
library(ggplot2)
library(ggtern)
## Registered S3 methods overwritten by 'ggtern':
## method from
## grid.draw.ggplot ggplot2
## plot.ggplot ggplot2
## print.ggplot ggplot2
## --
## Remember to cite, run citation(package = 'ggtern') for further info.
## --
##
## Attaching package: 'ggtern'
## The following objects are masked from 'package:ggtree':
##
## aes, ggplot, ggsave
## The following object is masked from 'package:ggpp':
##
## annotate
## The following object is masked from 'package:rstatix':
##
## mahalanobis_distance
## The following objects are masked from 'package:ggplot2':
##
## aes, annotate, ggplot, ggplot_build,
## ggplot_gtable, ggplotGrob, ggsave, layer_data,
## theme_bw, theme_classic, theme_dark, theme_gray,
## theme_light, theme_linedraw, theme_minimal,
## theme_void
## The following objects are masked from 'package:gridExtra':
##
## arrangeGrob, grid.arrange
alaska_lake_data %>%
pivot_wider(names_from = "element", values_from = "mg_per_L") %>%
ggtern(aes(
x = Ca,
y = S,
z = Na,
color = park,
size = pH
)) +
geom_point()
map data
plotting boundaries
There is a simple way to plot maps with ggplot. The map data comes with ggplot2
! Let’s have a look. See below some of the data sets included. Options included with ggplot are: world
, world2
, usa
, state
(US), county
(US), nz
, italy
, and france
. geom_polygon()
is useful for plotting these, at (at least to me) seems more intuitive than geom_map()
.
head(map_data("world"))
## long lat group order region subregion
## 1 -69.89912 12.45200 1 1 Aruba <NA>
## 2 -69.89571 12.42300 1 2 Aruba <NA>
## 3 -69.94219 12.43853 1 3 Aruba <NA>
## 4 -70.00415 12.50049 1 4 Aruba <NA>
## 5 -70.06612 12.54697 1 5 Aruba <NA>
## 6 -70.05088 12.59707 1 6 Aruba <NA>
head(map_data("state"))
## long lat group order region subregion
## 1 -87.46201 30.38968 1 1 alabama <NA>
## 2 -87.48493 30.37249 1 2 alabama <NA>
## 3 -87.52503 30.37249 1 3 alabama <NA>
## 4 -87.53076 30.33239 1 4 alabama <NA>
## 5 -87.57087 30.32665 1 5 alabama <NA>
## 6 -87.58806 30.32665 1 6 alabama <NA>
head(map_data("county"))
## long lat group order region subregion
## 1 -86.50517 32.34920 1 1 alabama autauga
## 2 -86.53382 32.35493 1 2 alabama autauga
## 3 -86.54527 32.36639 1 3 alabama autauga
## 4 -86.55673 32.37785 1 4 alabama autauga
## 5 -86.57966 32.38357 1 5 alabama autauga
## 6 -86.59111 32.37785 1 6 alabama autauga
head(map_data("france"))
## long lat group order region subregion
## 1 2.557093 51.09752 1 1 Nord <NA>
## 2 2.579995 51.00298 1 2 Nord <NA>
## 3 2.609101 50.98545 1 3 Nord <NA>
## 4 2.630782 50.95073 1 4 Nord <NA>
## 5 2.625894 50.94116 1 5 Nord <NA>
## 6 2.597699 50.91967 1 6 Nord <NA>
Cool! We can see that lat, lon, group, order, region, and subregion are included. That makes plotting easy. Note that coord_map()
can help preserve aspect ratios:
ggplot(map_data("world")) +
geom_point(aes(x = long, y = lat, color = group), size = 0.5) +
theme_void() +
coord_map()
Note that we can use coord_map()
to do some pretty cool things!
ggplot(map_data("world")) +
geom_point(aes(x = long, y = lat, color = group), size = 0.5) +
theme_void() +
coord_map(projection = "albers", lat0 = 39, lat1 = 45)
We can use filtering to produce maps of specific regions.
ggplot() +
geom_polygon(
data = filter(map_data("county"), region == "minnesota"),
aes(x = long, y = lat, group = subregion, fill = subregion),
color = "black"
) +
theme_void() +
coord_map()
maps with plots
Please note that the Great Lakes are in map_data()!
filter(map_data("lakes"), region == "Great Lakes", subregion == "Superior") %>%
ggplot() +
geom_path(aes(x = long, y = lat)) +
coord_map() +
theme_minimal()
We can clean up the map by making different groups for geom_path() whenever two consecutive points are far apart:
# Step 1: Filter and prepare your data for Lake Superior (though yes, that includes Michigan and Huron)
lake_superior <- map_data("lakes") %>%
filter(region == "Great Lakes", subregion == "Superior") %>%
arrange(order)
# Step 2: Calculate distances between consecutive points
lake_superior <- lake_superior %>%
mutate(lag_long = lag(long),
lag_lat = lag(lat),
dist_to_prev = geosphere::distHaversine(cbind(long, lat), cbind(lag_long, lag_lat))) # distHaversine calculates distances
# Step 3: Define a threshold (e.g., 50 km) and create the "distance_group"
threshold <- 50000 # 50 km
lake_superior <- lake_superior %>%
mutate(distance_group = cumsum(ifelse(dist_to_prev > threshold | is.na(dist_to_prev), 1, 0)))
# Step 4: Plot the map with `distance_group`
ggplot(lake_superior, aes(x = long, y = lat, group = distance_group)) +
geom_path() +
coord_map() +
theme_minimal()
Now we could add some data. We could do something simple like plot total abundances as the size of a point:
lake_superior_PFAS <- readMonolist("/Users/bust0037/Documents/Science/Websites/pfas_data_private.csv")
lake_superior_PFAS %>%
group_by(site, lon, lat) %>%
summarize(total = sum(abundance)) -> lake_superior_PFAS_summarized
ggplot() +
geom_path(
data = filter(lake_superior, lat > 46, long < -84),
aes(x = long, y = lat, group = distance_group)
) +
geom_point(
data = lake_superior_PFAS_summarized,
aes(x = lon, y = lat, size = total),
color = "black"
) +
coord_map() +
theme_cowplot()
Or we could do something more sophisticated like add pie charts at each point:
lake_superior_PFAS <- readMonolist("/Users/bust0037/Documents/Science/Websites/pfas_data_private.csv")
grouped_by_site <- filter(lake_superior_PFAS, component == "PFBA")
site_less_than_90lon <- filter(grouped_by_site, lon <= -90)
site_more_than_90lon <- filter(grouped_by_site, lon >= -90)
unique_sites <- unique(lake_superior_PFAS$site)
dataframe_of_pies <- list()
for (i in 1:length(unique_sites)) { #i=1
this_site <- filter(lake_superior_PFAS, site == unique_sites[i])
this_site %>%
ggplot()+
geom_col(aes(x = 1, y = abundance, fill = class_name), color = "black") +
coord_polar(theta = "y") +
theme_void() +
scale_fill_brewer(palette = "Set1", guide = "none") -> this_sites_pie
dataframe_of_pies[[i]] <- tibble(x = this_site$lon[1], y = this_site$lat[1], plot = list(this_sites_pie))
}
dataframe_of_pies <- do.call(rbind, dataframe_of_pies)
ggplot() +
geom_path(
data = filter(lake_superior, lat > 46, long < -84),
aes(x = long, y = lat, group = distance_group)
) +
geom_point(
data = lake_superior_PFAS,
aes(x = lon, y = lat),
color = "black"
) +
geom_plot(
data = dataframe_of_pies, aes(x = x, y = y, label = plot),
vp.width = 1/20, hjust = 0.5, vjust = 0.5, alpha = 0.5
) +
geom_label_repel(data = site_less_than_90lon, aes(x = lon, y = lat, label = site), size = 2.5,min.segment.length = 0.01) +
geom_label(data = site_more_than_90lon, aes(x = lon, y = lat, label = site), size = 2.5) +
coord_map() +
theme_cowplot()
You can also access a high resolution shoreline dataset for Lake Superior directly from the source() command as lake_superior_shoreline
:
shore <- readMonolist("/Users/bust0037/Documents/Science/Websites/thebustalab.github.io/phylochemistry/sample_data/lake_superior_shoreline.csv")
wide_view <- ggplot(shore) +
geom_point(aes(y = lat, x = lon), size = 0.01) +
coord_map() +
theme_minimal()
zoom_view <- ggplot(filter(shore, lat < 47.2, lat > 46.6, lon < -90)) +
geom_point(aes(y = lat, x = lon), size = 0.01) +
coord_map() +
theme_minimal()
plot_grid(wide_view, zoom_view, nrow = 1, rel_widths = c(1,2))
further reading
For more on plotting maps in R: datavizplyr
For more advanced map plotting: R Spatial
For more on ternary plots: ggtern