Week 3 (Data Visualisation Course)

Let’s create a bar chart and then improve it based on the Data Visualisation Checklist (http://stephanieevergreen.com/wp-content/uploads/2016/10/DataVizChecklist_May2016.pdf)

library(ggplot2)
library(dplyr)

## Warning: package 'dplyr' was built under R version 4.0.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Load the YouTube video metadata from the course repository on GitHub.
# (Local alternative: youtube <- read.csv('data/Youtube.csv'))
youtube <- read.csv(
  file = "https://raw.githubusercontent.com/KaneSec/ggplot2/main/data/Youtube.csv"
)
# Show the first six rows to check the columns that were read.
head(youtube, n = 6)

##            id duration bitrate bitrate.video. height width frame.rate
## 1 uDNj-_5ty48      267     373            274    568   320      29.97
## 2 WCgt-AactyY       31    1261           1183    640   480      24.00
## 3 h9Kt-GhvVlg      333     727            638    384   288      25.00
## 4 Wscj-i5jTjc       67     633            522    854   480      29.92
## 5 dtMj-hglvaI     1000    2862           2703   1280   720      29.97
## 6 6GVW-EHpDb0       59     480            436    480   270      25.00
##   frame.rate.est.. codec             category
## 1             0.00  h264                Music
## 2             0.00  h264       People & Blogs
## 3             0.00  h264       People & Blogs
## 4             0.00  h264 Nonprofits & Activis
## 5            29.97  h264               Sports
## 6             0.00  h264      News & Politics

# A basic bar chart: one bar per category, bar height = number of rows.
ggplot(data = youtube, mapping = aes(x = category)) +
  geom_bar(stat = "count")

# Readable? Rotate the x-axis tick labels by -90 degrees.
ggplot(data = youtube, mapping = aes(x = category)) +
  geom_bar(stat = "count") +
  theme(
    axis.text.x = element_text(angle = -90, hjust = 0)
  )

# Checklist item: 6-12 word descriptive title, left-justified in the upper
# left corner. Add the title and explicit axis labels.
ggplot(data = youtube, mapping = aes(x = category)) +
  geom_bar(stat = "count") +
  theme(
    axis.text.x = element_text(angle = -90, hjust = 0)
  ) +
  labs(
    title = "Count of the Most Popular Youtube Videos by Category",
    x = "Category",
    y = "count"
  )

# Checklist item: text is horizontal. Flip the coordinates so the bars run
# horizontally (the axis.text.x rotation is still applied in this step).
ggplot(data = youtube, mapping = aes(x = category)) +
  geom_bar(stat = "count") +
  theme(
    axis.text.x = element_text(angle = -90, hjust = 0)
  ) +
  labs(
    title = "Count of the Most Popular Youtube Videos by Category",
    x = "Category",
    y = "count"
  ) +
  coord_flip()

# Text is horizontal (x axis): same flipped bar chart, but without any
# axis.text.x rotation in theme(), so the tick labels use the default angle.
ggplot(youtube, aes(x = category)) +
  geom_bar() +
  labs(
    # Same title as the other steps of this walkthrough: the bars count the
    # rows (videos) of `youtube` per category.
    title = "Count of the Most Popular Youtube Videos by Category",
    x = "Category",
    y = "count"
  ) +
  coord_flip()

# Checklist item: data are labeled directly.
# Pre-compute one row per category with its row count, so the count can be
# drawn both as the bar length and as a text label.
youtube_cat <- youtube %>%
  group_by(category) %>%
  summarize(count = n(), .groups = "drop")

ggplot(data = youtube_cat, mapping = aes(x = category, y = count)) +
  geom_col() +
  # Negative hjust places each label just past the end of its bar.
  geom_text(mapping = aes(label = count), hjust = -0.2, size = 3.5) +
  labs(
    title = "Count of the Most Popular Youtube Videos by Category",
    x = "Category",
    y = "count"
  ) +
  coord_flip()

# Fix the axis limit: set the count axis to 0-6000.
ggplot(data = youtube_cat, mapping = aes(x = category, y = count)) +
  geom_col() +
  geom_text(mapping = aes(label = count), hjust = -0.2, size = 3.5) +
  labs(
    title = "Count of the Most Popular Youtube Videos by Category",
    x = "Category",
    y = "count"
  ) +
  scale_y_continuous(limits = c(0, 6000)) +
  coord_flip()

# Checklist item: data are intentionally ordered.
# reorder() sorts the category levels by their count instead of alphabetically.
ggplot(
  data = youtube_cat,
  mapping = aes(x = reorder(category, count), y = count)
) +
  geom_col() +
  geom_text(mapping = aes(label = count), hjust = -0.2, size = 3.5) +
  labs(
    title = "Count of the Most Popular Youtube Videos by Category",
    x = "Category",
    y = "count"
  ) +
  scale_y_continuous(limits = c(0, 6000)) +
  coord_flip()

# Gridlines: switch to theme_classic() for the final version of the chart.
ggplot(
  data = youtube_cat,
  mapping = aes(x = reorder(category, count), y = count)
) +
  geom_col() +
  geom_text(mapping = aes(label = count), hjust = -0.2, size = 3.5) +
  labs(
    title = "Count of the Most Popular Youtube Videos by Category",
    x = "Category",
    y = "count"
  ) +
  scale_y_continuous(limits = c(0, 6000)) +
  coord_flip() +
  theme_classic()

# COVID-19 figures per location and date (Our World in Data GitHub repository).
# Fetched over https, like the other downloads in this file.
covid <- read.csv("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/ecdc/full_data.csv")

# Per-location metadata (continent, population, ...).
location <- read.csv('https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/ecdc/locations.csv')

# Keep the single-day snapshot for 2020-11-08 and drop the "World" row
# (presumably a global aggregate rather than a country -- confirm against the data).
# Inside filter(), `location` refers to the column of `covid`, not to the
# `location` data frame defined above.
covid_yesterday <- filter(covid, date == '2020-11-08', location != "World")

# Left join the metadata onto the snapshot. The key is spelled out so the join
# does not silently change if the two tables ever share another column name.
covid_yesterday <- merge(covid_yesterday, location, by = "location", all.x = TRUE)
head(covid_yesterday)

##      location       date new_cases new_deaths total_cases total_deaths
## 1 Afghanistan 2020-11-08       126          6       42159         1562
## 2     Albania 2020-11-08       495          8       23705          557
## 3     Algeria 2020-11-08       581         12       61381         2036
## 4     Andorra 2020-11-08       184          0        5319           75
## 5      Angola 2020-11-08       121          1       12223          300
## 6    Anguilla 2020-11-08         0          0           3           NA
##   weekly_cases weekly_deaths biweekly_cases biweekly_deaths
## 1          658            26           1391              51
## 2         2830            48           4847              84
## 3         3439            72           5501             129
## 4          563             0           1281               6
## 5         1665            21           3394              35
## 6            0             0              0               0
##   countriesAndTerritories     continent population_year population
## 1             Afghanistan          Asia            2020   38928341
## 2                 Albania        Europe            2020    2877800
## 3                 Algeria        Africa            2020   43851043
## 4                 Andorra        Europe            2020      77265
## 5                  Angola        Africa            2020   32866268
## 6                Anguilla North America            2020      15002

# Total deaths against total cases, one point per location, on log10 axes
# with comma-formatted tick labels.
ggplot(
  data = covid_yesterday,
  mapping = aes(x = total_cases, y = total_deaths)
) +
  geom_point() +
  labs(
    title = "Covid-19 Total Cases & Deaths by Country",
    x = "Total Cases",
    y = "Total Deaths"
  ) +
  scale_x_log10(labels = scales::comma) +
  scale_y_log10(labels = scales::comma)

## Warning: Removed 22 rows containing missing values (geom_point).

# Colour by continent: same log-log scatter, with the point colour mapped to
# the `continent` column.
ggplot(
  data = covid_yesterday,
  mapping = aes(x = total_cases, y = total_deaths, color = continent)
) +
  geom_point() +
  labs(
    title = "Covid-19 Total Cases & Deaths by Country",
    x = "Total Cases",
    y = "Total Deaths"
  ) +
  scale_x_log10(labels = scales::comma) +
  scale_y_log10(labels = scales::comma)

## Warning: Removed 22 rows containing missing values (geom_point).

# Fixed colour (hex code) for each continent, keyed by continent name.
cols <- setNames(
  c("#e41a1c", "#984ea3", "#ff7f00", "#a6cee3", "#377eb8", "#a65628"),
  c("North America", "South America", "Europe", "Africa", "Asia", "Oceania")
)

# Final scatter: total deaths against total cases on log10 axes, coloured by
# continent with the fixed palette in `cols`, plus a data-source caption and a
# hand-placed "Singapore" text label.
ggplot(covid_yesterday, aes(x = total_cases, y = total_deaths, color = continent)) +
  # na.rm is a layer parameter, not an aesthetic: inside aes() it has no
  # effect. Passed to geom_point(), rows with missing values are dropped
  # silently instead of raising the "Removed ... rows" warning.
  geom_point(na.rm = TRUE) +
  labs(
    title = "Covid-19 Total Cases & Deaths by Country",
    x = "Total Cases",
    y = "Total Deaths",
    caption = "Data source: Our World in Data (https://github.com/owidbot)"
  ) +
  scale_x_log10(labels = scales::comma) +
  scale_y_log10(labels = scales::comma) +
  scale_colour_manual(values = cols) +
  # The label position is given in data coordinates (cases, deaths).
  annotate(geom = "text", x = 100000, y = 30, label = "Singapore", hjust = "left") +
  theme(
    # hjust = 0 left-aligns the caption.
    plot.caption = element_text(hjust = 0)
  )

Week 3 (Data Visualisation Course)

Kane

11/11/2020