Let’s create a bar chart and then improve it step by step, based on the Data Visualisation Checklist (http://stephanieevergreen.com/wp-content/uploads/2016/10/DataVizChecklist_May2016.pdf).
library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the YouTube video data set from the tutorial repository.
# (A local copy could be read from "data/Youtube.csv" instead.)
youtube <- read.csv(
  file = "https://raw.githubusercontent.com/KaneSec/ggplot2/main/data/Youtube.csv"
)
# Print the first rows to check which columns are available.
head(x = youtube)
## id duration bitrate bitrate.video. height width frame.rate
## 1 uDNj-_5ty48 267 373 274 568 320 29.97
## 2 WCgt-AactyY 31 1261 1183 640 480 24.00
## 3 h9Kt-GhvVlg 333 727 638 384 288 25.00
## 4 Wscj-i5jTjc 67 633 522 854 480 29.92
## 5 dtMj-hglvaI 1000 2862 2703 1280 720 29.97
## 6 6GVW-EHpDb0 59 480 436 480 270 25.00
## frame.rate.est.. codec category
## 1 0.00 h264 Music
## 2 0.00 h264 People & Blogs
## 3 0.00 h264 People & Blogs
## 4 0.00 h264 Nonprofits & Activis
## 5 29.97 h264 Sports
## 6 0.00 h264 News & Politics
# Starting point: a default bar chart with one bar per category
# (geom_bar() counts the rows in each category).
youtube %>%
  ggplot(mapping = aes(x = category)) +
  geom_bar()
# Readability attempt: rotate the x-axis tick labels by -90 degrees.
youtube %>%
  ggplot(mapping = aes(x = category)) +
  geom_bar() +
  theme(
    axis.text.x = element_text(angle = -90, hjust = 0)
  )
# Checklist item: a 6-12 word descriptive title, left-justified in the
# upper left corner. Axis titles are set explicitly as well.
ggplot(data = youtube, mapping = aes(x = category)) +
  geom_bar() +
  ggtitle("Count of the Most Popular Youtube Videos by Category") +
  xlab("Category") +
  ylab("count") +
  theme(axis.text.x = element_text(angle = -90, hjust = 0))
# Checklist item: text is horizontal.
# coord_flip() swaps the axes so the categories run down the vertical axis.
# The -90 degree rotation of axis.text.x is still applied in this step.
youtube %>%
  ggplot(mapping = aes(x = category)) +
  geom_bar() +
  coord_flip() +
  labs(
    x = "Category",
    y = "count",
    title = "Count of the Most Popular Youtube Videos by Category"
  ) +
  theme(axis.text.x = element_text(angle = -90, hjust = 0))
# Checklist item: text is horizontal (x axis).
# The axis.text.x rotation is no longer applied, so the tick labels fall back
# to the default theme orientation. The title is the same one used by the
# other charts in this walkthrough: each row of the data is a video, so the
# chart counts videos per category.
ggplot(youtube, aes(x = category)) +
  geom_bar() +
  labs(
    title = "Count of the Most Popular Youtube Videos by Category",
    x = "Category",
    y = "count"
  ) +
  coord_flip()
# Checklist item: data are labeled directly.
# Pre-compute one row per category holding its row count, then draw the bars
# from those counts and print each count as text, offset from the bar end
# via hjust.
youtube_cat <- summarise(
  group_by(youtube, category),
  count = n(),
  .groups = "drop"
)
ggplot(data = youtube_cat, mapping = aes(x = category, y = count)) +
  geom_col() +
  geom_text(mapping = aes(label = count), size = 3.5, hjust = -0.2) +
  coord_flip() +
  ggtitle("Count of the Most Popular Youtube Videos by Category") +
  xlab("Category") +
  ylab("count")
# Fix the axis limit: set the count axis explicitly to 0-6000
# (intended to leave room for the direct labels past the longest bar).
youtube_cat %>%
  ggplot(mapping = aes(x = category, y = count)) +
  geom_col() +
  geom_text(mapping = aes(label = count), size = 3.5, hjust = -0.2) +
  scale_y_continuous(limits = c(0, 6000)) +
  coord_flip() +
  labs(
    x = "Category",
    y = "count",
    title = "Count of the Most Popular Youtube Videos by Category"
  )
# Checklist item: data are intentionally ordered.
# reorder() sorts the category levels by their count, so the bars appear
# in order of size instead of alphabetically.
ggplot(
  data = youtube_cat,
  mapping = aes(x = reorder(category, count), y = count)
) +
  geom_col() +
  geom_text(mapping = aes(label = count), size = 3.5, hjust = -0.2) +
  scale_y_continuous(limits = c(0, 6000)) +
  coord_flip() +
  ggtitle("Count of the Most Popular Youtube Videos by Category") +
  xlab("Category") +
  ylab("count")
# Gridlines: switch to theme_classic(), which draws axis lines
# but no gridlines.
youtube_cat %>%
  ggplot(mapping = aes(x = reorder(category, count), y = count)) +
  geom_col() +
  geom_text(mapping = aes(label = count), size = 3.5, hjust = -0.2) +
  scale_y_continuous(limits = c(0, 6000)) +
  coord_flip() +
  labs(
    x = "Category",
    y = "count",
    title = "Count of the Most Popular Youtube Videos by Category"
  ) +
  theme_classic()
# Load the COVID-19 case/death figures and the per-location metadata
# (continent, population) from the Our World in Data repository.
# Both files are fetched over HTTPS.
covid <- read.csv(
  "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/ecdc/full_data.csv"
)
location <- read.csv(
  "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/ecdc/locations.csv"
)
# Keep the rows for 2020-11-08 and drop the "World" rows.
# Inside filter(), `location` refers to the column of `covid`, not to the
# `location` data frame created above.
covid_yesterday <- filter(covid, date == "2020-11-08", location != "World")
# Left join the metadata onto the daily figures. The key is named explicitly
# so the join does not change if the two files ever share another column name.
covid_yesterday <- merge(
  covid_yesterday,
  location,
  by = "location",
  all.x = TRUE
)
head(covid_yesterday)
## location date new_cases new_deaths total_cases total_deaths
## 1 Afghanistan 2020-11-08 126 6 42159 1562
## 2 Albania 2020-11-08 495 8 23705 557
## 3 Algeria 2020-11-08 581 12 61381 2036
## 4 Andorra 2020-11-08 184 0 5319 75
## 5 Angola 2020-11-08 121 1 12223 300
## 6 Anguilla 2020-11-08 0 0 3 NA
## weekly_cases weekly_deaths biweekly_cases biweekly_deaths
## 1 658 26 1391 51
## 2 2830 48 4847 84
## 3 3439 72 5501 129
## 4 563 0 1281 6
## 5 1665 21 3394 35
## 6 0 0 0 0
## countriesAndTerritories continent population_year population
## 1 Afghanistan Asia 2020 38928341
## 2 Albania Europe 2020 2877800
## 3 Algeria Africa 2020 43851043
## 4 Andorra Europe 2020 77265
## 5 Angola Africa 2020 32866268
## 6 Anguilla North America 2020 15002
# Scatter plot of total deaths against total cases, one point per location.
# Both axes use a log10 scale with comma-formatted tick labels.
covid_yesterday %>%
  ggplot(mapping = aes(x = total_cases, y = total_deaths)) +
  geom_point() +
  scale_x_log10(labels = scales::comma) +
  scale_y_log10(labels = scales::comma) +
  ggtitle("Covid-19 Total Cases & Deaths by Country") +
  xlab("Total Cases") +
  ylab("Total Deaths")
## Warning: Removed 22 rows containing missing values (geom_point).
# Same scatter plot, with the points coloured by continent
# using the default discrete colour scale.
covid_yesterday %>%
  ggplot(
    mapping = aes(x = total_cases, y = total_deaths, colour = continent)
  ) +
  geom_point() +
  scale_x_log10(labels = scales::comma) +
  scale_y_log10(labels = scales::comma) +
  ggtitle("Covid-19 Total Cases & Deaths by Country") +
  xlab("Total Cases") +
  ylab("Total Deaths")
## Warning: Removed 22 rows containing missing values (geom_point).
# Fixed colour for each continent, used by the manual colour scale below.
cols <- c(
  "North America" = "#e41a1c",
  "South America" = "#984ea3",
  "Europe" = "#ff7f00",
  "Africa" = "#a6cee3",
  "Asia" = "#377eb8",
  "Oceania" = "#a65628"
)
# Final scatter plot: manual continent colours, a data-source caption
# left-aligned under the plot, and one hand-placed text annotation.
ggplot(
  covid_yesterday,
  aes(x = total_cases, y = total_deaths, colour = continent)
) +
  # na.rm is a layer argument, not an aesthetic: inside aes() it has no
  # effect. Passed to the layer, rows with missing values are dropped
  # without a warning.
  geom_point(na.rm = TRUE) +
  labs(
    title = "Covid-19 Total Cases & Deaths by Country",
    x = "Total Cases",
    y = "Total Deaths",
    caption = "Data source: Our World in Data (https://github.com/owidbot)"
  ) +
  scale_x_continuous(trans = "log10", labels = scales::comma) +
  scale_y_continuous(trans = "log10", labels = scales::comma) +
  scale_colour_manual(values = cols) +
  # NOTE(review): coordinates are hard-coded; presumably chosen to sit next
  # to Singapore's point -- confirm against the data.
  annotate(
    geom = "text",
    x = 100000,
    y = 30,
    label = "Singapore",
    hjust = "left"
  ) +
  theme(
    plot.caption = element_text(hjust = 0)
  )
## Warning: Removed 22 rows containing missing values (geom_point).