In this notebook, we show how to produce more sophisticated graphics via the tidyverse
and the ggplot2
library. A few examples are adapted from R. Irizarry’s dslabs
documentation and from other sources.
ggplot2
and the Tidyverse (US Murders, Gapminder, 2016 US Election - Polling, Diseases, Artificial Data, New York Choral Society Singers, University Professors Salaries, MPG, World Phones)library(ggplot2)
ggplot2
and the Tidyverse library("dslabs")
data(package="dslabs")
data("murders")
?murders
head(murders)
## state abb region population total
## 1 Alabama AL South 4779736 135
## 2 Alaska AK West 710231 19
## 3 Arizona AZ West 6392017 232
## 4 Arkansas AR South 2915918 93
## 5 California CA West 37253956 1257
## 6 Colorado CO West 5029196 65
str(murders)
## 'data.frame': 51 obs. of 5 variables:
## $ state : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ abb : chr "AL" "AK" "AZ" "AR" ...
## $ region : Factor w/ 4 levels "Northeast","South",..: 2 4 4 2 4 4 1 2 2 2 ...
## $ population: num 4779736 710231 6392017 2915918 37253956 ...
## $ total : num 135 19 232 93 1257 ...
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✓ tibble 3.0.1 ✓ purrr 0.3.4
## ✓ tidyr 1.0.2 ✓ dplyr 0.8.5
## ✓ readr 1.3.1 ✓ stringr 1.4.0
## ✓ tibble 3.0.1 ✓ forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.2
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
r <- murders %>%
summarize(pop=sum(population), tot=sum(total)) %>%
mutate(rate = tot/pop*10^6) %>% .$rate
r
## [1] 30.34555
library(ggrepel)
library(ggthemes)
murders %>% ggplot(aes(x = population/10^6, y = total, label = abb)) +
geom_abline(intercept = log10(r), lty=2, col="darkgrey") +
geom_point(aes(color=region), size = 3) +
geom_text_repel() +
scale_x_log10() +
scale_y_log10() +
xlab("Populations in millions (log scale)") +
ylab("Total number of murders (log scale)") +
ggtitle("US Gun Murders in 2010") +
scale_color_discrete(name="Region")
data("gapminder")
?gapminder
head(gapminder)
## country year infant_mortality life_expectancy fertility
## 1 Albania 1960 115.40 62.87 6.19
## 2 Algeria 1960 148.20 47.50 7.65
## 3 Angola 1960 208.00 35.98 7.32
## 4 Antigua and Barbuda 1960 NA 62.97 4.43
## 5 Argentina 1960 59.87 65.39 3.11
## 6 Armenia 1960 NA 66.86 4.55
## population gdp continent region
## 1 1636054 NA Europe Southern Europe
## 2 11124892 13828152297 Africa Northern Africa
## 3 5270844 NA Africa Middle Africa
## 4 54681 NA Americas Caribbean
## 5 20619075 108322326649 Americas South America
## 6 1867396 NA Asia Western Asia
str(gapminder)
## 'data.frame': 10545 obs. of 9 variables:
## $ country : Factor w/ 185 levels "Albania","Algeria",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ year : int 1960 1960 1960 1960 1960 1960 1960 1960 1960 1960 ...
## $ infant_mortality: num 115.4 148.2 208 NA 59.9 ...
## $ life_expectancy : num 62.9 47.5 36 63 65.4 ...
## $ fertility : num 6.19 7.65 7.32 4.43 3.11 4.55 4.82 3.45 2.7 5.57 ...
## $ population : num 1636054 11124892 5270844 54681 20619075 ...
## $ gdp : num NA 1.38e+10 NA NA 1.08e+11 ...
## $ continent : Factor w/ 5 levels "Africa","Americas",..: 4 1 1 2 2 3 2 5 4 3 ...
## $ region : Factor w/ 22 levels "Australia and New Zealand",..: 19 11 10 2 15 21 2 1 22 21 ...
summary(gapminder)
## country year infant_mortality
## Albania : 57 Min. :1960 Min. : 1.50
## Algeria : 57 1st Qu.:1974 1st Qu.: 16.00
## Angola : 57 Median :1988 Median : 41.50
## Antigua and Barbuda: 57 Mean :1988 Mean : 55.31
## Argentina : 57 3rd Qu.:2002 3rd Qu.: 85.10
## Armenia : 57 Max. :2016 Max. :276.90
## (Other) :10203 NA's :1453
## life_expectancy fertility population gdp
## Min. :13.20 Min. :0.840 Min. :3.124e+04 Min. :4.040e+07
## 1st Qu.:57.50 1st Qu.:2.200 1st Qu.:1.333e+06 1st Qu.:1.846e+09
## Median :67.54 Median :3.750 Median :5.009e+06 Median :7.794e+09
## Mean :64.81 Mean :4.084 Mean :2.701e+07 Mean :1.480e+11
## 3rd Qu.:73.00 3rd Qu.:6.000 3rd Qu.:1.523e+07 3rd Qu.:5.540e+10
## Max. :83.90 Max. :9.220 Max. :1.376e+09 Max. :1.174e+13
## NA's :187 NA's :185 NA's :2972
## continent region
## Africa :2907 Western Asia :1026
## Americas:2052 Eastern Africa : 912
## Asia :2679 Western Africa : 912
## Europe :2223 Caribbean : 741
## Oceania : 684 South America : 684
## Southern Europe: 684
## (Other) :5586
west <- c("Western Europe","Northern Europe","Southern Europe",
"Northern America","Australia and New Zealand")
gapminder <- gapminder %>%
mutate(group = case_when(
region %in% west ~ "The West",
region %in% c("Eastern Asia", "South-Eastern Asia") ~ "East Asia",
region %in% c("Caribbean", "Central America", "South America") ~ "Latin America",
continent == "Africa" & region != "Northern Africa" ~ "Sub-Saharan Africa",
TRUE ~ "Others"))
gapminder <- gapminder %>%
mutate(group = factor(group, levels = rev(c("Others", "Latin America", "East Asia","Sub-Saharan Africa", "The West"))))
filter(gapminder, year%in%c(1962, 2013) & !is.na(group) &
!is.na(fertility) & !is.na(life_expectancy)) %>%
mutate(population_in_millions = population/10^6) %>%
ggplot( aes(fertility, y=life_expectancy, col = group, size = population_in_millions)) +
geom_point(alpha = 0.8) +
guides(size=FALSE) +
theme(plot.title = element_blank(), legend.title = element_blank()) +
coord_cartesian(ylim = c(30, 85)) +
xlab("Fertility rate (births per woman)") +
ylab("Life Expectancy") +
geom_text(aes(x=7, y=82, label=year), cex=12, color="grey") +
facet_grid(. ~ year) +
theme(strip.background = element_blank(),
strip.text.x = element_blank(),
strip.text.y = element_blank(),
legend.position = "top")
gapminder$gdppc = gapminder$gdp/gapminder$population
gapminder2 <- gapminder[,c(1,2,4,6,8,9,11)]
head(gapminder2)
## country year life_expectancy population continent
## 1 Albania 1960 62.87 1636054 Europe
## 2 Algeria 1960 47.50 11124892 Africa
## 3 Angola 1960 35.98 5270844 Africa
## 4 Antigua and Barbuda 1960 62.97 54681 Americas
## 5 Argentina 1960 65.39 20619075 Americas
## 6 Armenia 1960 66.86 1867396 Asia
## region gdppc
## 1 Southern Europe NA
## 2 Northern Africa 1242.992
## 3 Middle Africa NA
## 4 Caribbean NA
## 5 South America 5253.501
## 6 Western Asia NA
filter(gapminder2, year%in%c(2011) & !is.na(gdppc) & !is.na(life_expectancy)) %>%
mutate(population_in_millions = (population/10^6)) %>%
ggplot( aes(x=gdppc, y=life_expectancy, col = continent, size = population_in_millions, label = country)) +
geom_point(alpha = 1) +
geom_text_repel() +
guides(size=FALSE) +
theme(plot.title = element_blank(), legend.title = element_blank()) +
coord_cartesian(ylim = c(40, 85)) +
scale_x_log10() +
xlab("GDP per capita (log scale)") +
ylab("Life Expectancy (in years)") +
ggtitle("Health and Wealth of Nations (2011)") +
#facet_grid(. ~ year) +
theme(strip.background = element_blank(),
strip.text.x = element_blank(),
strip.text.y = element_blank(),
legend.position = "top")
filter(gapminder2, year%in%c(2011) & !is.na(gdppc) & !is.na(life_expectancy)) %>%
mutate(population_in_millions = (population/10^6)) %>%
ggplot( aes(x=gdppc, y=life_expectancy, col = continent, size = population_in_millions, label = country)) +
geom_point(alpha = 1) +
geom_smooth(method='lm',formula=y~x, se=FALSE) +
guides(size=FALSE) +
theme(plot.title = element_blank(), legend.title = element_blank()) +
coord_cartesian(ylim = c(40, 85)) +
scale_x_log10() +
xlab("GDP per capita (log scale)") +
ylab("Life Expectancy (in years)") +
ggtitle("Health and Wealth of Nations (2011)") +
#facet_grid(. ~ year) +
theme(strip.background = element_blank(),
strip.text.x = element_blank(),
strip.text.y = element_blank(),
legend.position = "top")
filter(gapminder2, year%in%c(2011) & !is.na(gdppc) & !is.na(life_expectancy)) %>%
mutate(population_in_millions = (population/10^6)) %>%
ggplot( aes(x=gdppc, y=life_expectancy, size = population_in_millions, label = country)) +
geom_point(alpha = 1) +
geom_smooth(method='lm',formula=y~x, se=FALSE) +
guides(size=FALSE) +
theme(plot.title = element_blank(), legend.title = element_blank()) +
coord_cartesian(ylim = c(40, 85)) +
scale_x_log10() +
xlab("GDP per capita (log scale)") +
ylab("Life Expectancy (in years)") +
ggtitle("Health and Wealth of Nations (2011)") +
#facet_grid(. ~ year) +
theme(strip.background = element_blank(),
strip.text.x = element_blank(),
strip.text.y = element_blank(),
legend.position = "top")
data(polls_us_election_2016)
?polls_us_election_2016
head(polls_us_election_2016)
## state startdate enddate
## 1 U.S. 2016-11-03 2016-11-06
## 2 U.S. 2016-11-01 2016-11-07
## 3 U.S. 2016-11-02 2016-11-06
## 4 U.S. 2016-11-04 2016-11-07
## 5 U.S. 2016-11-03 2016-11-06
## 6 U.S. 2016-11-03 2016-11-06
## pollster grade
## 1 ABC News/Washington Post A+
## 2 Google Consumer Surveys B
## 3 Ipsos A-
## 4 YouGov B
## 5 Gravis Marketing B-
## 6 Fox News/Anderson Robbins Research/Shaw & Company Research A
## samplesize population rawpoll_clinton rawpoll_trump rawpoll_johnson
## 1 2220 lv 47.00 43.00 4.00
## 2 26574 lv 38.03 35.69 5.46
## 3 2195 lv 42.00 39.00 6.00
## 4 3677 lv 45.00 41.00 5.00
## 5 16639 rv 47.00 43.00 3.00
## 6 1295 lv 48.00 44.00 3.00
## rawpoll_mcmullin adjpoll_clinton adjpoll_trump adjpoll_johnson
## 1 NA 45.20163 41.72430 4.626221
## 2 NA 43.34557 41.21439 5.175792
## 3 NA 42.02638 38.81620 6.844734
## 4 NA 45.65676 40.92004 6.069454
## 5 NA 46.84089 42.33184 3.726098
## 6 NA 49.02208 43.95631 3.057876
## adjpoll_mcmullin
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
str(polls_us_election_2016)
## 'data.frame': 4208 obs. of 15 variables:
## $ state : Factor w/ 57 levels "Alabama","Alaska",..: 50 50 50 50 50 50 50 50 37 50 ...
## $ startdate : Date, format: "2016-11-03" "2016-11-01" ...
## $ enddate : Date, format: "2016-11-06" "2016-11-07" ...
## $ pollster : Factor w/ 196 levels "ABC News/Washington Post",..: 1 63 81 194 65 55 18 113 195 76 ...
## $ grade : Factor w/ 10 levels "D","C-","C","C+",..: 10 6 8 6 5 9 8 8 NA 8 ...
## $ samplesize : int 2220 26574 2195 3677 16639 1295 1426 1282 8439 1107 ...
## $ population : chr "lv" "lv" "lv" "lv" ...
## $ rawpoll_clinton : num 47 38 42 45 47 ...
## $ rawpoll_trump : num 43 35.7 39 41 43 ...
## $ rawpoll_johnson : num 4 5.46 6 5 3 3 5 6 6 7.1 ...
## $ rawpoll_mcmullin: num NA NA NA NA NA NA NA NA NA NA ...
## $ adjpoll_clinton : num 45.2 43.3 42 45.7 46.8 ...
## $ adjpoll_trump : num 41.7 41.2 38.8 40.9 42.3 ...
## $ adjpoll_johnson : num 4.63 5.18 6.84 6.07 3.73 ...
## $ adjpoll_mcmullin: num NA NA NA NA NA NA NA NA NA NA ...
summary(polls_us_election_2016)
## state startdate enddate
## U.S. :1106 Min. :2015-11-06 Min. :2015-11-08
## Florida : 148 1st Qu.:2016-08-10 1st Qu.:2016-08-21
## North Carolina: 125 Median :2016-09-23 Median :2016-09-30
## Pennsylvania : 125 Mean :2016-08-31 Mean :2016-09-06
## Ohio : 115 3rd Qu.:2016-10-20 3rd Qu.:2016-10-28
## New Hampshire : 112 Max. :2016-11-06 Max. :2016-11-07
## (Other) :2477
## pollster grade
## Ipsos : 919 A- :1085
## Google Consumer Surveys : 743 B :1011
## SurveyMonkey : 660 C- : 693
## YouGov : 130 C+ : 329
## Rasmussen Reports/Pulse Opinion Research: 125 B+ : 204
## USC Dornsife/LA Times : 121 (Other): 457
## (Other) :1510 NA's : 429
## samplesize population rawpoll_clinton rawpoll_trump
## Min. : 35.0 Length:4208 Min. :11.04 Min. : 4.00
## 1st Qu.: 447.5 Class :character 1st Qu.:38.00 1st Qu.:35.00
## Median : 772.0 Mode :character Median :43.00 Median :40.00
## Mean : 1148.2 Mean :41.99 Mean :39.83
## 3rd Qu.: 1236.5 3rd Qu.:46.20 3rd Qu.:45.00
## Max. :84292.0 Max. :88.00 Max. :68.00
## NA's :1
## rawpoll_johnson rawpoll_mcmullin adjpoll_clinton adjpoll_trump
## Min. : 0.000 Min. : 9.0 Min. :17.06 Min. : 4.373
## 1st Qu.: 5.400 1st Qu.:22.5 1st Qu.:40.21 1st Qu.:38.429
## Median : 7.000 Median :25.0 Median :44.15 Median :42.765
## Mean : 7.382 Mean :24.0 Mean :43.32 Mean :42.674
## 3rd Qu.: 9.000 3rd Qu.:27.9 3rd Qu.:46.92 3rd Qu.:46.290
## Max. :25.000 Max. :31.0 Max. :86.77 Max. :72.433
## NA's :1409 NA's :4178
## adjpoll_johnson adjpoll_mcmullin
## Min. :-3.668 Min. :11.03
## 1st Qu.: 3.145 1st Qu.:23.11
## Median : 4.384 Median :25.14
## Mean : 4.660 Mean :24.51
## 3rd Qu.: 5.756 3rd Qu.:27.98
## Max. :20.367 Max. :31.57
## NA's :1409 NA's :4178
polls_us_election_2016 %>%
filter(state == "U.S." & enddate>="2016-07-01") %>%
select(enddate, pollster, rawpoll_clinton, rawpoll_trump) %>%
rename(Clinton = rawpoll_clinton, Trump = rawpoll_trump) %>%
gather(candidate, percentage, -enddate, -pollster) %>%
mutate(candidate = factor(candidate, levels = c("Trump","Clinton")))%>%
group_by(pollster) %>%
filter(n()>=10) %>%
ungroup() %>%
ggplot(aes(enddate, percentage, color = candidate)) +
geom_point(show.legend = FALSE, alpha=0.4) +
geom_smooth(method = "loess", span = 0.15) +
scale_y_continuous(limits = c(30,50))
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 22 rows containing non-finite values (stat_smooth).
## Warning: Removed 22 rows containing missing values (geom_point).
library(RColorBrewer)
data("us_contagious_diseases")
?us_contagious_diseases
head(us_contagious_diseases)
## disease state year weeks_reporting count population
## 1 Hepatitis A Alabama 1966 50 321 3345787
## 2 Hepatitis A Alabama 1967 49 291 3364130
## 3 Hepatitis A Alabama 1968 52 314 3386068
## 4 Hepatitis A Alabama 1969 49 380 3412450
## 5 Hepatitis A Alabama 1970 51 413 3444165
## 6 Hepatitis A Alabama 1971 51 378 3481798
str(us_contagious_diseases)
## 'data.frame': 16065 obs. of 6 variables:
## $ disease : Factor w/ 7 levels "Hepatitis A",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ state : Factor w/ 51 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : num 1966 1967 1968 1969 1970 ...
## $ weeks_reporting: num 50 49 52 49 51 51 45 45 45 46 ...
## $ count : num 321 291 314 380 413 378 342 467 244 286 ...
## $ population : num 3345787 3364130 3386068 3412450 3444165 ...
summary(us_contagious_diseases)
## disease state year weeks_reporting
## Hepatitis A:2346 Alabama : 315 Min. :1928 Min. : 0.00
## Measles :3825 Alaska : 315 1st Qu.:1950 1st Qu.:31.00
## Mumps :1785 Arizona : 315 Median :1975 Median :46.00
## Pertussis :2856 Arkansas : 315 Mean :1971 Mean :37.38
## Polio :2091 California: 315 3rd Qu.:1990 3rd Qu.:50.00
## Rubella :1887 Colorado : 315 Max. :2011 Max. :52.00
## Smallpox :1275 (Other) :14175
## count population
## Min. : 0 Min. : 86853
## 1st Qu.: 7 1st Qu.: 1018755
## Median : 69 Median : 2749249
## Mean : 1492 Mean : 4107584
## 3rd Qu.: 525 3rd Qu.: 4996229
## Max. :132342 Max. :37607525
## NA's :214
the_disease <- "Pertussis"
us_contagious_diseases %>%
filter(disease == the_disease) %>%
mutate(rate = count / population * 10000 * 52 / weeks_reporting) %>%
mutate(state = reorder(state, rate)) %>%
ggplot(aes(year, state, fill = rate)) +
geom_tile(color = "grey50") +
scale_x_continuous(expand=c(0,0)) +
scale_fill_gradientn(colors = brewer.pal(9, "Reds"), trans = "sqrt") +
geom_vline(xintercept=1963, col = "blue") +
theme_minimal() + theme(panel.grid = element_blank()) +
ggtitle(the_disease) +
ylab("") +
xlab("")
library("ggplot2")
theme_set(theme_bw()) # use the black and white theme throughout
# artificial data:
d <- data.frame(x = c(1:8, 1:8), y = runif(16),
group1 = rep(gl(2, 4, labels = c("a", "b")), 2),
group2 = gl(2, 8))
head(d)
## x y group1 group2
## 1 1 0.1238609 a 1
## 2 2 0.4514588 a 1
## 3 3 0.3996547 a 1
## 4 4 0.2725954 a 1
## 5 5 0.4586981 b 1
## 6 6 0.3562236 b 1
ggplot(data = d) + geom_point(aes(x, y, colour = group1)) +
facet_grid(~group2)
library("ggplot2")
data(singer, package="lattice")
?singer
summary(singer,8)
## height voice.part
## Min. :60.0 Bass 2 :26
## 1st Qu.:65.0 Bass 1 :39
## Median :67.0 Tenor 2 :21
## Mean :67.3 Tenor 1 :21
## 3rd Qu.:70.0 Alto 2 :27
## Max. :76.0 Alto 1 :35
## Soprano 2:30
## Soprano 1:36
table(singer$height,singer$voice.part)
##
## Bass 2 Bass 1 Tenor 2 Tenor 1 Alto 2 Alto 1 Soprano 2 Soprano 1
## 60 0 0 0 0 0 1 3 1
## 61 0 0 0 0 0 4 2 2
## 62 0 0 0 0 0 3 5 6
## 63 0 0 0 0 3 4 3 3
## 64 0 0 0 2 5 3 4 1
## 65 0 0 0 1 5 4 5 15
## 66 1 2 1 3 6 7 3 6
## 67 2 0 0 2 2 4 3 1
## 68 2 6 3 3 0 2 1 1
## 69 1 3 7 1 1 1 0 0
## 70 4 8 2 2 5 1 1 0
## 71 1 6 6 2 0 0 0 0
## 72 7 6 0 2 0 1 0 0
## 73 0 3 1 1 0 0 0 0
## 74 4 1 0 1 0 0 0 0
## 75 4 4 0 0 0 0 0 0
## 76 0 0 1 1 0 0 0 0
ggplot(singer, aes(x=height)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(singer, aes(x=voice.part, y=height)) + geom_boxplot()
ggplot(data=singer, aes(x=height)) +
geom_histogram() +
facet_wrap(~voice.part, nrow=4)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Salaries=read.csv("Data/Salaries.csv", header = TRUE)
ggplot(Salaries, aes(x=rank, y=salary)) +
geom_boxplot(fill="cornflowerblue",
color="black", notch=TRUE)+
geom_point(position="jitter", color="blue", alpha=.5)+
geom_rug(side="l", color="black")
## Warning: Ignoring unknown parameters: side
ggplot(Salaries, aes(x=yrs.since.phd, y=salary, color=rank,
shape=rank)) + geom_point() + facet_grid(.~sex)
p1 <- ggplot(data=Salaries, aes(x=rank)) + geom_bar()
p2 <- ggplot(data=Salaries, aes(x=sex)) + geom_bar()
p3 <- ggplot(data=Salaries, aes(x=yrs.since.phd, y=salary)) + geom_point()
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(p1, p2, p3, ncol=3)
library(car)
## Loading required package: carData
##
## Attaching package: 'carData'
## The following object is masked _by_ '.GlobalEnv':
##
## Salaries
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(ggplot2)
mytheme <- theme(plot.title=element_text(
face="bold.italic", size="14",
color="brown"), axis.title=
element_text( face="bold.italic",
size=10, color="brown"),
axis.text=element_text(
face="bold", size=9,
color="darkblue"),
panel.background=element_rect(
fill="white",color="darkblue"),
panel.grid.major.y=element_line(
color="grey", linetype=1),
panel.grid.minor.y=element_line(
color="grey", linetype=2),
panel.grid.minor.x=element_blank(),
legend.position="top")
ggplot(Salaries, aes(x=rank, y=salary, fill=sex)) +
geom_boxplot() +
labs(title="Salary by Rank and Sex", x="Rank", y="Salary") +
mytheme
head(mpg)
## # A tibble: 6 x 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto(… f 18 29 p comp…
## 2 audi a4 1.8 1999 4 manua… f 21 29 p comp…
## 3 audi a4 2 2008 4 manua… f 20 31 p comp…
## 4 audi a4 2 2008 4 auto(… f 21 30 p comp…
## 5 audi a4 2.8 1999 6 auto(… f 16 26 p comp…
## 6 audi a4 2.8 1999 6 manua… f 18 26 p comp…
?mpg
ggplot(mpg, aes(cty, hwy)) + geom_point(aes(colour = class))
ggplot(mpg, aes(cty, hwy)) + geom_point(colour = "red")
data("WorldPhones")
head(WorldPhones)
## N.Amer Europe Asia S.Amer Oceania Africa Mid.Amer
## 1951 45939 21574 2876 1815 1646 89 555
## 1956 60423 29990 4708 2568 2366 1411 733
## 1957 64721 32510 5230 2695 2526 1546 773
## 1958 68484 35218 6662 2845 2691 1663 836
## 1959 71799 37598 6856 3000 2868 1769 911
## 1960 76036 40341 8220 3145 3054 1905 1008
help(WorldPhones)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
WorldPhones.m = melt(WorldPhones)
head(WorldPhones.m)
## Var1 Var2 value
## 1 1951 N.Amer 45939
## 2 1956 N.Amer 60423
## 3 1957 N.Amer 64721
## 4 1958 N.Amer 68484
## 5 1959 N.Amer 71799
## 6 1960 N.Amer 76036
colnames(WorldPhones.m) = c("Year", "Continent", "Phones")
head(WorldPhones.m)
## Year Continent Phones
## 1 1951 N.Amer 45939
## 2 1956 N.Amer 60423
## 3 1957 N.Amer 64721
## 4 1958 N.Amer 68484
## 5 1959 N.Amer 71799
## 6 1960 N.Amer 76036
ggplot(WorldPhones.m, aes(x=Year, y=Phones, color=Continent)) + geom_point()
ggplot(WorldPhones.m, aes(x=Year, y=Phones, color=Continent)) + geom_line()
ggplot(WorldPhones.m, aes(x=Year, y=Phones, color=Continent)) + geom_line() + scale_y_log10()
# install.packages("ggplot2")
# load package and data
options(scipen=999) # turn-off scientific notation like 1e+48
library(ggplot2)
theme_set(theme_bw()) # pre-set the bw theme.
data("midwest", package = "ggplot2")
# midwest <- read.csv("http://goo.gl/G1K41K") # bkup data source
# Scatterplot
gg <- ggplot(midwest, aes(x=area, y=poptotal)) +
geom_point(aes(col=state, size=popdensity)) +
geom_smooth(method="loess", se=F) +
xlim(c(0, 0.1)) +
ylim(c(0, 500000)) +
labs(subtitle="Area Vs Population",
y="Population",
x="Area",
title="Scatterplot",
caption = "Source: midwest")
plot(gg)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).
data(mpg, package="ggplot2")
# mpg <- read.csv("http://goo.gl/uEeRGu")
mpg_select <- mpg[mpg$manufacturer %in% c("audi", "ford", "honda", "hyundai"), ]
# Scatterplot
theme_set(theme_bw()) # pre-set the bw theme.
g <- ggplot(mpg_select, aes(displ, cty)) +
labs(subtitle="mpg: Displacement vs City Mileage",
title="Bubble chart")
g + geom_jitter(aes(col=manufacturer, size=hwy)) +
geom_smooth(aes(col=manufacturer), method="lm", se=F)
## `geom_smooth()` using formula 'y ~ x'
# Source: https://github.com/dgrtwo/gganimate
#install.packages("devtools")
# install.packages("cowplot") # a gganimate dependency
# devtools::install_github("dgrtwo/gganimate")
library(ggplot2)
library(gganimate)
## No renderer backend detected. gganimate will default to writing frames to separate files
## Consider installing:
## - the `gifski` package for gif output
## - the `av` package for video output
## and restarting the R session
library(gapminder)
##
## Attaching package: 'gapminder'
## The following object is masked _by_ '.GlobalEnv':
##
## gapminder
## The following object is masked from 'package:dslabs':
##
## gapminder
theme_set(theme_bw()) # pre-set the bw theme.
head(gapminder)
## country year infant_mortality life_expectancy fertility
## 1 Albania 1960 115.40 62.87 6.19
## 2 Algeria 1960 148.20 47.50 7.65
## 3 Angola 1960 208.00 35.98 7.32
## 4 Antigua and Barbuda 1960 NA 62.97 4.43
## 5 Argentina 1960 59.87 65.39 3.11
## 6 Armenia 1960 NA 66.86 4.55
## population gdp continent region group
## 1 1636054 NA Europe Southern Europe The West
## 2 11124892 13828152297 Africa Northern Africa Others
## 3 5270844 NA Africa Middle Africa Sub-Saharan Africa
## 4 54681 NA Americas Caribbean Latin America
## 5 20619075 108322326649 Americas South America Latin America
## 6 1867396 NA Asia Western Asia Others
## gdppc
## 1 NA
## 2 1242.992
## 3 NA
## 4 NA
## 5 5253.501
## 6 NA
ggplot(gapminder, aes(gdppc, life_expectancy, size = population, colour = country)) +
geom_point(alpha = 0.7, show.legend = FALSE) +
#scale_colour_manual(values = country_colors) +
scale_size(range = c(2, 12)) +
scale_x_log10() +
facet_wrap(~continent) +
# Here comes the gganimate specific bits
labs(title = 'Year: {frame_time}', x = 'GDP per capita', y = 'life expectancy') +
transition_time(year) +
ease_aes('linear')
## Warning: No renderer available. Please install the gifski, av, or magick
## package to create animated output
## NULL
# anim_save(file="gapminder.gif") # saved, not plotted
library(ggplot2)
library(ggExtra)
data(mpg, package="ggplot2")
# mpg <- read.csv("http://goo.gl/uEeRGu")
# Scatterplot
theme_set(theme_bw()) # pre-set the bw theme.
mpg_select <- mpg[mpg$hwy >= 35 & mpg$cty > 27, ]
g <- ggplot(mpg, aes(cty, hwy)) +
geom_count() +
geom_smooth(method="lm", se=F)
ggMarginal(g, type = "histogram", fill="transparent")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
ggMarginal(g, type = "boxplot", fill="transparent")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# ggMarginal(g, type = "density", fill="transparent")
library(ggplot2)
theme_set(theme_bw())
# Data Prep
data("mtcars") # load data
mtcars$`car name` <- rownames(mtcars) # create new column for car names
mtcars$mpg_z <- round((mtcars$mpg - mean(mtcars$mpg))/sd(mtcars$mpg), 2) # compute normalized mpg
mtcars$mpg_type <- ifelse(mtcars$mpg_z < 0, "below", "above") # above / below avg flag
mtcars <- mtcars[order(mtcars$mpg_z), ] # sort
mtcars$`car name` <- factor(mtcars$`car name`, levels = mtcars$`car name`) # convert to factor to retain sorted order in plot.
# Diverging Barcharts
ggplot(mtcars, aes(x=`car name`, y=mpg_z, label=mpg_z)) +
geom_bar(stat='identity', aes(fill=mpg_type), width=.5) +
scale_fill_manual(name="Mileage",
labels = c("Above Average", "Below Average"),
values = c("above"="#00ba38", "below"="#f8766d")) +
labs(subtitle="Normalised mileage from 'mtcars'",
title= "Diverging Bars") +
coord_flip()
library(ggplot2)
#install.packages("quantmod")
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Version 0.4-0 included new data defaults. See ?getSymbols.
data("economics", package = "ggplot2")
# Compute % Returns
economics$returns_perc <- c(0, diff(economics$psavert)/economics$psavert[-length(economics$psavert)])
# Create break points and labels for axis ticks
brks <- economics$date[seq(1, length(economics$date), 12)]
#install.packages("lubridate")
lbls <- lubridate::year(economics$date[seq(1, length(economics$date), 12)])
# Plot
ggplot(economics[1:100, ], aes(date, returns_perc)) +
geom_area() +
scale_x_date(breaks=brks, labels=lbls) +
theme(axis.text.x = element_text(angle=90)) +
labs(title="Area Chart",
subtitle = "Perc Returns for Personal Savings",
y="% Returns for Personal savings",
caption="Source: economics")
library(ggplot2)
library(ggthemes)
options(scipen = 999) # turns of scientific notations like 1e+40
# Read data
email_campaign_funnel <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/email_campaign_funnel.csv")
# X Axis Breaks and Labels
brks <- seq(-15000000, 15000000, 5000000)
lbls = paste0(as.character(c(seq(15, 0, -5), seq(5, 15, 5))), "m")
# Plot
ggplot(email_campaign_funnel, aes(x = Stage, y = Users, fill = Gender)) + # Fill column
geom_bar(stat = "identity", width = .6) + # draw the bars
scale_y_continuous(breaks = brks, # Breaks
labels = lbls) + # Labels
coord_flip() + # Flip axes
labs(title="Email Campaign Funnel") +
theme_tufte() + # Tufte theme from ggfortify
theme(plot.title = element_text(hjust = .5),
axis.ticks = element_blank()) + # Centre plot title
scale_fill_brewer(palette = "Dark2") # Color palette
# http://margintale.blogspot.in/2012/04/ggplot2-time-series-heatmaps.html
library(ggplot2)
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(zoo)
df <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/yahoo.csv")
df$date <- as.Date(df$date) # format date
df <- df[df$year >= 2012, ] # filter reqd years
# Create Month Week
df$yearmonth <- as.yearmon(df$date)
df$yearmonthf <- factor(df$yearmonth)
df <- ddply(df,.(yearmonthf), transform, monthweek=1+week-min(week)) # compute week number of month
df <- df[, c("year", "yearmonthf", "monthf", "week", "monthweek", "weekdayf", "VIX.Close")]
head(df)
## year yearmonthf monthf week monthweek weekdayf VIX.Close
## 1 2012 Jan 2012 Jan 1 1 Tue 22.97
## 2 2012 Jan 2012 Jan 1 1 Wed 22.22
## 3 2012 Jan 2012 Jan 1 1 Thu 21.48
## 4 2012 Jan 2012 Jan 1 1 Fri 20.63
## 5 2012 Jan 2012 Jan 2 2 Mon 21.07
## 6 2012 Jan 2012 Jan 2 2 Tue 20.69
#> year yearmonthf monthf week monthweek weekdayf VIX.Close
#> 1 2012 Jan 2012 Jan 1 1 Tue 22.97
#> 2 2012 Jan 2012 Jan 1 1 Wed 22.22
#> 3 2012 Jan 2012 Jan 1 1 Thu 21.48
#> 4 2012 Jan 2012 Jan 1 1 Fri 20.63
#> 5 2012 Jan 2012 Jan 2 2 Mon 21.07
#> 6 2012 Jan 2012 Jan 2 2 Tue 20.69
# Plot
ggplot(df, aes(monthweek, weekdayf, fill = VIX.Close)) +
geom_tile(colour = "white") +
facet_grid(year~monthf) +
scale_fill_gradient(low="red", high="green") +
labs(x="Week of Month",
y="",
title = "Time-Series Calendar Heatmap",
subtitle="Yahoo Closing Price",
fill="Close")
# Prepare data: group mean city mileage by manufacturer.
cty_mpg <- aggregate(mpg$cty, by=list(mpg$manufacturer), FUN=mean) # aggregate
colnames(cty_mpg) <- c("make", "mileage") # change column names
cty_mpg <- cty_mpg[order(cty_mpg$mileage), ] # sort
cty_mpg$make <- factor(cty_mpg$make, levels = cty_mpg$make) # to retain the order in plot.
head(cty_mpg, 4)
## make mileage
## 9 lincoln 11.33333
## 8 land rover 11.50000
## 3 dodge 13.13514
## 10 mercury 13.25000
#> make mileage
#> 9 lincoln 11.33333
#> 8 land rover 11.50000
#> 3 dodge 13.13514
#> 10 mercury 13.25000
#The X variable is now a factor, let's plot.
library(ggplot2)
theme_set(theme_bw())
# Draw plot
ggplot(cty_mpg, aes(x=make, y=mileage)) +
geom_bar(stat="identity", width=.5, fill="tomato3") +
labs(title="Ordered Bar Chart",
subtitle="Make Vs Avg. Mileage",
caption="source: mpg") +
theme(axis.text.x = element_text(angle=65, vjust=0.6))
#install.packages("ggcorrplot")
library(ggplot2)
library(ggcorrplot)
# Correlation matrix
data(mtcars)
corr <- round(cor(mtcars), 1)
# Plot
ggcorrplot(corr, hc.order = TRUE,
type = "lower",
lab = TRUE,
lab_size = 3,
method="circle",
colors = c("tomato2", "white", "springgreen3"),
title="Correlogram of mtcars",
ggtheme=theme_bw)
library(devtools)
## Warning: package 'devtools' was built under R version 3.6.2
## Loading required package: usethis
## Warning: package 'usethis' was built under R version 3.6.2
#devtools::install_github("wilkox/treemapify")
library(treemapify)
library(ggplot2)
data(G20)
head(G20)
## region country gdp_mil_usd hdi econ_classification
## 1 Africa South Africa 384315 0.629 Developing
## 2 North America United States 15684750 0.937 Advanced
## 3 North America Canada 1819081 0.911 Advanced
## 4 North America Mexico 1177116 0.775 Developing
## 5 South America Brazil 2395968 0.730 Developing
## 6 South America Argentina 474954 0.811 Developing
## hemisphere
## 1 Southern
## 2 Northern
## 3 Northern
## 4 Northern
## 5 Southern
## 6 Southern
# region country gdp_mil_usd hdi econ_classification
# Africa South Africa 384315 0.629 Developing
# North America United States 15684750 0.937 Advanced
# North America Canada 1819081 0.911 Advanced
# North America Mexico 1177116 0.775 Developing
# South America Brazil 2395968 0.730 Developing
# South America Argentina 474954 0.811 Developing
ggplot(G20, aes(area = gdp_mil_usd, fill = region, label = country)) +
geom_treemap() +
geom_treemap_text(grow = T, reflow = T, colour = "black") +
facet_wrap( ~ econ_classification) +
scale_fill_brewer(palette = "Set1") +
theme(legend.position = "bottom") +
labs(
title = "The G-20 major economies",
caption = "The area of each country is proportional to its relative GDP
within the economic group (advanced or developing)",
fill = "Region"
)
library(ggplot2)
library(ggnetwork)
library(geomnet)
## Registered S3 methods overwritten by 'geomnet':
## method from
## fortify.igraph ggnetwork
## fortify.network ggnetwork
library(network)
## network: Classes for Relational Data
## Version 1.16.0 created on 2019-11-30.
## copyright (c) 2005, Carter T. Butts, University of California-Irvine
## Mark S. Handcock, University of California -- Los Angeles
## David R. Hunter, Penn State University
## Martina Morris, University of Washington
## Skye Bender-deMoll, University of Washington
## For citation information, type citation("network").
## Type help("network-package") to get started.
##
## Attaching package: 'network'
## The following object is masked from 'package:plyr':
##
## is.discrete
# make the data available
data(madmen, package = 'geomnet')
# data step for ggnetwork
# create undirected network
mm.net <- network(madmen$edges[, 1:2], directed = FALSE)
mm.net # glance at network object
## Network attributes:
## vertices = 45
## directed = FALSE
## hyper = FALSE
## loops = FALSE
## multiple = FALSE
## bipartite = FALSE
## total edges= 39
## missing edges= 0
## non-missing edges= 39
##
## Vertex attribute names:
## vertex.names
##
## No edge attributes
## Network attributes:
## vertices = 45
## directed = FALSE
## hyper = FALSE
## loops = FALSE
## multiple = FALSE
## bipartite = FALSE
## total edges= 39
## missing edges= 0
## non-missing edges= 39
##
## Vertex attribute names:
## vertex.names
## No edge attributes
# create node attribute (gender)
rownames(madmen$vertices) <- madmen$vertices$label
mm.net %v% "gender" <- as.character(
madmen$vertices[ network.vertex.names(mm.net), "Gender"]
)
# gender color palette
mm.col <- c("female" = "#ff0000", "male" = "#00ff00")
set.seed(10052016)
ggplot(data = ggnetwork(mm.net, layout = "kamadakawai"),
aes(x, y, xend = xend, yend = yend)) +
geom_edges(color = "grey50") + # draw edge layer
geom_nodes(aes(colour = gender), size = 2) + # draw node layer
geom_nodetext(aes(colour = gender, label = vertex.names),
size = 3, vjust = -0.6) + # draw node label layer
scale_colour_manual(values = mm.col) +
xlim(c(-0.05, 1.05)) +
theme_blank() +
theme(legend.position = "bottom")
library(triangle)
set.seed(0)
q1_d1 <- round(rtriangle(1000, 1, 7, 5))
q1_d2 <- round(rtriangle(1000, 1, 7, 6))
q1_d3 <- round(rtriangle(1000, 1, 7, 2))
df <- data.frame(q1_d1 = factor(q1_d1), q1_d2 = factor(q1_d2), q1_d3 = factor(q1_d3))
library(dplyr)
# group by combinations and count
df_grouped <- df %>% group_by(q1_d1, q1_d2, q1_d3) %>% count()
# set an id string that denotes the value combination
df_grouped <- df_grouped %>% mutate(id = factor(paste(q1_d1, q1_d2, q1_d3, sep = '-')))
order.freq <- order(df_grouped[,4],decreasing=TRUE)
# sort by count and select top rows
df_grouped <- df_grouped[order.freq[1:25],]
library(reshape2)
library(ggplot2)
# create long format
df_pcp <- melt(df_grouped, id.vars = c('id', 'freq'))
df_pcp$value <- factor(df_pcp$value)
y_levels <- levels(factor(1:7))
ggplot(df_pcp, aes(x = variable, y = value, group = id)) + # group = id is important!
geom_path(aes(size = freq, color = id),
alpha = 0.5,
lineend = 'round', linejoin = 'round') +
scale_y_discrete(limits = y_levels, expand = c(0.5, 0)) +
scale_size(breaks = NULL, range = c(1, 7))
## From Timeseries object (ts)
library(ggplot2)
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 3.6.2
theme_set(theme_classic())
# Plot
autoplot(AirPassengers) +
labs(title="AirPassengers") +
theme(plot.title = element_text(hjust=0.5))
library(ggplot2)
theme_set(theme_classic())
# Allow Default X Axis Labels
ggplot(economics, aes(x=date)) +
geom_line(aes(y=unemploy)) +
labs(title="Time Series Chart",
subtitle="Number of unemployed in thousands from 'Economics-US' Dataset",
caption="Source: Economics",
y="unemploy")
library(ggplot2)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:plyr':
##
## here
## The following object is masked from 'package:base':
##
## date
theme_set(theme_bw())
economics_m <- economics[1:24, ]
# labels and breaks for X axis text
lbls <- paste0(month.abb[month(economics_m$date)], " ", lubridate::year(economics_m$date))
brks <- economics_m$date
# plot
ggplot(economics_m, aes(x=date)) +
geom_line(aes(y=pce)) +
labs(title="Monthly Time Series",
subtitle="Personal consumption expenditures, in billions of dollars",
caption="Source: Economics",
y="pce") + # title and caption
scale_x_date(labels = lbls,
breaks = brks) + # change to monthly ticks and labels
theme(axis.text.x = element_text(angle = 90, vjust=0.5), # rotate x axis text
panel.grid.minor = element_blank()) # turn off minor grid
library(ggplot2)
library(lubridate)
theme_set(theme_bw())
economics_y <- economics[1:90, ]
# labels and breaks for X axis text
brks <- economics_y$date[seq(1, length(economics_y$date), 12)]
lbls <- lubridate::year(brks)
# plot
ggplot(economics_y, aes(x=date)) +
geom_line(aes(y=psavert)) +
labs(title="Yearly Time Series",
subtitle="Personal savings rate",
caption="Source: Economics",
y="psavert") + # title and caption
scale_x_date(labels = lbls,
breaks = brks) + # change to monthly ticks and labels
theme(axis.text.x = element_text(angle = 90, vjust=0.5), # rotate x axis text
panel.grid.minor = element_blank()) # turn off minor grid
data(economics_long, package = "ggplot2")
head(economics_long)
## # A tibble: 6 x 4
## date variable value value01
## <date> <chr> <dbl> <dbl>
## 1 1967-07-01 pce 507. 0
## 2 1967-08-01 pce 510. 0.000265
## 3 1967-09-01 pce 516. 0.000762
## 4 1967-10-01 pce 512. 0.000471
## 5 1967-11-01 pce 517. 0.000916
## 6 1967-12-01 pce 525. 0.00157
#> date variable value value01
#> <date> <fctr> <dbl> <dbl>
#> 1 1967-07-01 pce 507.4 0.0000000000
#> 2 1967-08-01 pce 510.5 0.0002660008
#> 3 1967-09-01 pce 516.3 0.0007636797
#> 4 1967-10-01 pce 512.9 0.0004719369
#> 5 1967-11-01 pce 518.1 0.0009181318
#> 6 1967-12-01 pce 525.8 0.0015788435
library(ggplot2)
library(lubridate)
theme_set(theme_bw())
df <- economics_long[economics_long$variable %in% c("psavert", "uempmed"), ]
df <- df[lubridate::year(df$date) %in% c(1967:1981), ]
# labels and breaks for X axis text
brks <- df$date[seq(1, length(df$date), 12)]
lbls <- lubridate::year(brks)
# plot
ggplot(df, aes(x=date)) +
geom_line(aes(y=value, col=variable)) +
labs(title="Time Series of Returns Percentage",
subtitle="Drawn from Long Data format",
caption="Source: Economics",
y="Returns %",
color=NULL) + # title and caption
scale_x_date(labels = lbls, breaks = brks) + # change to monthly ticks and labels
scale_color_manual(labels = c("psavert", "uempmed"),
values = c("psavert"="#00ba38", "uempmed"="#f8766d")) + # line color
theme(axis.text.x = element_text(angle = 90, vjust=0.5, size = 8), # rotate x axis text
panel.grid.minor = element_blank()) # turn off minor grid
library(ggplot2)
library(lubridate)
theme_set(theme_bw())
df <- economics[, c("date", "psavert", "uempmed")]
df <- df[lubridate::year(df$date) %in% c(1967:1981), ]
# labels and breaks for X axis text
brks <- df$date[seq(1, length(df$date), 12)]
lbls <- lubridate::year(brks)
# plot
ggplot(df, aes(x=date)) +
geom_area(aes(y=psavert+uempmed, fill="psavert")) +
geom_area(aes(y=uempmed, fill="uempmed")) +
labs(title="Area Chart of Returns Percentage",
subtitle="From Wide Data format",
caption="Source: Economics",
y="Returns %") + # title and caption
scale_x_date(labels = lbls, breaks = brks) + # change to monthly ticks and labels
scale_fill_manual(name="",
values = c("psavert"="#00ba38", "uempmed"="#f8766d")) + # line color
theme(panel.grid.minor = element_blank()) # turn off minor grid
library(ggplot2)
library(forecast)
## Warning: package 'forecast' was built under R version 3.6.2
## Registered S3 methods overwritten by 'forecast':
## method from
## autoplot.Arima ggfortify
## autoplot.acf ggfortify
## autoplot.ar ggfortify
## autoplot.bats ggfortify
## autoplot.decomposed.ts ggfortify
## autoplot.ets ggfortify
## autoplot.forecast ggfortify
## autoplot.stl ggfortify
## autoplot.ts ggfortify
## fitted.ar ggfortify
## fortify.ts ggfortify
## residuals.ar ggfortify
theme_set(theme_classic())
# Subset data
nottem_small <- window(nottem, start=c(1920, 1), end=c(1925, 12)) # subset a smaller timewindow
# Plot
ggseasonplot(AirPassengers) + labs(title="Seasonal plot: International Airline Passengers")
ggseasonplot(nottem_small) + labs(title="Seasonal plot: Air temperatures at Nottingham Castle")
# devtools::install_github("hrbrmstr/ggalt")
library(ggplot2)
library(ggalt)
## Registered S3 methods overwritten by 'ggalt':
## method from
## fortify.table ggfortify
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
library(ggfortify)
theme_set(theme_classic())
# Compute data with principal components
df <- iris[c(1, 2, 3, 4)]
pca_mod <- prcomp(df) # compute principal components
# Data frame of principal components -
df_pc <- data.frame(pca_mod$x, Species=iris$Species) # dataframe of principal components
df_pc_vir <- df_pc[df_pc$Species == "virginica", ] # df for 'virginica'
df_pc_set <- df_pc[df_pc$Species == "setosa", ] # df for 'setosa'
df_pc_ver <- df_pc[df_pc$Species == "versicolor", ] # df for 'versicolor'
# Plot -
ggplot(df_pc, aes(PC1, PC2, col=Species)) +
geom_point(aes(shape=Species), size=2) + # draw points
labs(title="Iris Clustering",
subtitle="With principal components PC1 and PC2 as X and Y axis",
caption="Source: Iris") +
coord_cartesian(xlim = 1.2 * c(min(df_pc$PC1), max(df_pc$PC1)),
ylim = 1.2 * c(min(df_pc$PC2), max(df_pc$PC2))) + # change axis limits
geom_encircle(data = df_pc_vir, aes(x=PC1, y=PC2)) + # draw circles
geom_encircle(data = df_pc_set, aes(x=PC1, y=PC2)) +
geom_encircle(data = df_pc_ver, aes(x=PC1, y=PC2))
# devtools::install_github("hrbrmstr/ggalt")
library(ggplot2)
library(ggalt)
theme_set(theme_classic())
health <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/health.csv")
# for right ordering of the dumbells
health$Area <- factor(health$Area, levels=as.character(health$Area))
# health$Area <- factor(health$Area)
gg <- ggplot(health, aes(x=pct_2013, xend=pct_2014, y=Area, group=Area)) +
geom_dumbbell(color="#a3c4dc",
size=0.75,
point.colour.l="#0e668b") +
scale_x_continuous(label=waiver()) +
labs(x=NULL,
y=NULL,
title="Dumbbell Chart",
subtitle="Pct Change: 2013 vs 2014",
caption="Source: https://github.com/hrbrmstr/ggalt") +
theme(plot.title = element_text(hjust=0.5, face="bold"),
plot.background=element_rect(fill="#f7f7f7"),
panel.background=element_rect(fill="#f7f7f7"),
panel.grid.minor=element_blank(),
panel.grid.major.y=element_blank(),
panel.grid.major.x=element_line(),
axis.ticks=element_blank(),
legend.position="top",
panel.border=element_blank())
## Warning: Ignoring unknown parameters: point.colour.l
plot(gg)
library(dplyr)
theme_set(theme_classic())
source_df <- read.csv("https://raw.githubusercontent.com/jkeirstead/r-slopegraph/master/cancer_survival_rates.csv")
# Define functions. Source: https://github.com/jkeirstead/r-slopegraph
tufte_sort <- function(df, x="year", y="value", group="group", method="tufte", min.space=0.05) {
## First rename the columns for consistency
ids <- match(c(x, y, group), names(df))
df <- df[,ids]
names(df) <- c("x", "y", "group")
## Expand grid to ensure every combination has a defined value
tmp <- expand.grid(x=unique(df$x), group=unique(df$group))
tmp <- merge(df, tmp, all.y=TRUE)
df <- mutate(tmp, y=ifelse(is.na(y), 0, y))
## Cast into a matrix shape and arrange by first column
require(reshape2)
tmp <- dcast(df, group ~ x, value.var="y")
ord <- order(tmp[,2])
tmp <- tmp[ord,]
min.space <- min.space*diff(range(tmp[,-1]))
yshift <- numeric(nrow(tmp))
## Start at "bottom" row
## Repeat for rest of the rows until you hit the top
for (i in 2:nrow(tmp)) {
## Shift subsequent row up by equal space so gap between
## two entries is >= minimum
mat <- as.matrix(tmp[(i-1):i, -1])
d.min <- min(diff(mat))
yshift[i] <- ifelse(d.min < min.space, min.space - d.min, 0)
}
tmp <- cbind(tmp, yshift=cumsum(yshift))
scale <- 1
tmp <- melt(tmp, id=c("group", "yshift"), variable.name="x", value.name="y")
## Store these gaps in a separate variable so that they can be scaled ypos = a*yshift + y
tmp <- transform(tmp, ypos=y + scale*yshift)
return(tmp)
}
plot_slopegraph <- function(df) {
ylabs <- subset(df, x==head(x,1))$group
yvals <- subset(df, x==head(x,1))$ypos
fontSize <- 3
gg <- ggplot(df,aes(x=x,y=ypos)) +
geom_line(aes(group=group),colour="grey80") +
geom_point(colour="white",size=8) +
geom_text(aes(label=y), size=fontSize, family="American Typewriter") +
scale_y_continuous(name="", breaks=yvals, labels=ylabs)
return(gg)
}
## Prepare data
df <- tufte_sort(source_df,
x="year",
y="value",
group="group",
method="tufte",
min.space=0.05)
df <- transform(df,
x=factor(x, levels=c(5,10,15,20),
labels=c("5 years","10 years","15 years","20 years")),
y=round(y))
## Plot
plot_slopegraph(df) + labs(title="Estimates of % survival rates") +
theme(axis.title=element_blank(),
axis.ticks = element_blank(),
plot.title = element_text(hjust=0.5,
family = "American Typewriter",
face="bold"),
axis.text = element_text(family = "American Typewriter",
face="bold"))
#install.packages("ggdendro")
library("ggplot2")
library("ggdendro")
theme_set(theme_bw())
hc <- hclust(dist(USArrests), "ave") # hierarchical clustering
# plot
ggdendrogram(hc, rotate = TRUE, size = 2)
library(ggplot2)
theme_set(theme_classic())
# Plot
g <- ggplot(mpg, aes(cty))
g + geom_density(aes(fill=factor(cyl)), alpha=0.8) +
labs(title="Density Plot",
subtitle="City Mileage Grouped by Number of cylinders",
caption="Source: mpg",
x="City Mileage",
fill="# Cylinders")
library(ggplot2)
theme_set(theme_classic())
# Plot
g <- ggplot(mpg, aes(class, cty))
g + geom_boxplot(varwidth=T, fill="plum") +
labs(title="Boxplot",
subtitle="City Mileage grouped by Class of vehicle",
caption="Source: mpg",
x="Class of Vehicle",
y="City Mileage")
library(ggplot2)
theme_set(theme_bw())
# plot
g <- ggplot(mpg, aes(manufacturer, cty))
g + geom_boxplot() +
geom_dotplot(binaxis='y',
stackdir='center',
dotsize = .5,
fill="red") +
theme(axis.text.x = element_text(angle=65, vjust=0.6)) +
labs(title="Boxplot + Dotplot",
subtitle="City Mileage vs Class: Each dot represents 1 row in source data",
caption="Source: mpg",
x="Class of Vehicle",
y="City Mileage")
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
library(ggplot2)
var <- mpg$class # the categorical data
## Prep data (nothing to change here)
nrows <- 10
df <- expand.grid(y = 1:nrows, x = 1:nrows)
categ_table <- round(table(var) * ((nrows*nrows)/(length(var))))
categ_table
## var
## 2seater compact midsize minivan pickup subcompact
## 2 20 18 5 14 15
## suv
## 26
#> 2seater compact midsize minivan pickup subcompact suv
#> 2 20 18 5 14 15 26
df$category <- factor(rep(names(categ_table), categ_table))
# NOTE: if sum(categ_table) is not 100 (i.e. nrows^2), it will need adjustment to make the sum to 100.
## Plot
ggplot(df, aes(x = x, y = y, fill = category)) +
geom_tile(color = "black", size = 0.5) +
scale_x_continuous(expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0), trans = 'reverse') +
scale_fill_brewer(palette = "Set3") +
labs(title="Waffle Chart", subtitle="'Class' of vehicles",
caption="Source: mpg") +
theme(panel.border = element_rect(size = 2),
plot.title = element_text(size = rel(1.2)),
axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
legend.title = element_blank(),
legend.position = "right")