# Challenge time! Export a .pdf of check-ins for everyone who’s NOT a council member
# NEXT! Let's look at coffee
grow <- read.csv(file = "starbucksgrowth.csv")
# Inspect the full table (columns: Year, Worldwide, US)
grow
## Year Worldwide US
## 1 2015 21536 11962
## 2 2010 16858 11131
## 3 2005 10241 7353
## 4 2000 3501 2729
## 5 1999 2498 1996
## 6 1992 165 146
# Quick scatter of US store counts by year -- plot(x, y, ...)
plot(x = grow$Year, y = grow$US)
# Join the points with a line instead -- see ?plot for the `type` argument
plot(x = grow$Year, y = grow$US, type = "l")
# Overlay a second line for the worldwide counts
plot(x = grow$Year, y = grow$US, type = "l")
lines(x = grow$Year, y = grow$Worldwide, type = "l", col = "red")
# Well, that's weird.
# Base graphics draw in layers: the first plot() call fixes the axis ranges,
# so the larger Worldwide values fall outside the US-sized y-axis.
# Start over, drawing the larger series first
plot(x = grow$Year, y = grow$Worldwide, type = "l", col = "red")
lines(x = grow$Year, y = grow$US, type = "l", col = "green")
# Much better. Now tidy the axis titles and add a headline
plot(
  x = grow$Year,
  y = grow$Worldwide,
  type = "l",
  col = "red",
  main = "Starbucks by year",
  xlab = "Year",
  ylab = "Starbucks"
)
lines(x = grow$Year, y = grow$US, type = "l", col = "green")
# Still missing a key for the two colours
legend(
  "topleft",                     # corner in which to draw the legend
  legend = c("Worldwide", "US"), # labels, in the same order as `col`
  lty = c(1, 1),                 # solid line sample for both entries
  lwd = c(2.5, 2.5),             # line width of the samples
  col = c("red", "green")        # sample colours matching the plotted lines
)
# Alright, that's ok. Kinda boring. Let's ggplot it up
qplot(x = Year, y = Worldwide, data = grow, geom = "line")
# Alternatively, build the same chart with the full ggplot() grammar.
# NOTE: this only stores the plot in `g`; nothing is drawn until `g` is printed.
g <- ggplot(data = grow, mapping = aes(x = Year, y = Worldwide)) +
  geom_line()
# A second line is awkward in this wide layout. Reshape to long form first:
# one row per Year/Type pair, with the count in a single `Starbucks` column.
# http://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf
growgg <- gather(data = grow, key = "Type", value = "Starbucks", 2:3)
# Now a single geom_line() draws one coloured line per Type
ggplot(
  data = growgg,
  mapping = aes(x = Year, y = Starbucks, colour = Type)
) +
  geom_line()
# Nice! Same chart again, this time with a title
ggplot(
  data = growgg,
  mapping = aes(x = Year, y = Starbucks, colour = Type)
) +
  geom_line() +
  ggtitle("Starbucks growth since 1992")
# Something fun: Let's export the chart we made to Plot.ly
# First, assign the ggplot to a variable
##plotlyggplot <- ggplot(growgg, aes(x=Year, y=Starbucks, col=Type)) + geom_line() + ggtitle("Starbucks growth since 1992")
# Next, download the library
# Get more thorough instructions here https://plot.ly/r/getting-started/
##library(devtools)
# load the plotly library
##library(plotly)
# set up your authorization. Create a login account and generate your own key
## https://plot.ly/settings/api
# edit this code with your username and API key and run it
## set_credentials_file("PlotlyUserName", "APIKey")
# Now, prepare the plotly environment
##py <- plotly()
# This will send your ggplot to Plotly and render it online
##plotted <- py$ggplotly(plotlyggplot)
# Edit it a bit. Add sourceline, etc.
# Plotly has great documentation, guides for how to use R to make charts
# https://plot.ly/r/
# Another chart maker https://rstudio.github.io/dygraphs/index.html
library(dygraphs)
library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
# Need to convert our years into a time index recognized by R.
# Anchor each year to January 1st explicitly. strptime(x, "%Y") fills the
# unspecified month and day with *today's* values, so the index would change
# depending on the day the script is run (and can be invalid on Feb 29). It
# also returns POSIXlt, which should not be stored in a data-frame column.
grow$Year <- as.Date(paste0(grow$Year, "-01-01"))
# Convert the data frame into an eXtensible Time Series (xts) object:
# every column except Year becomes a series, indexed by the Year dates.
grow <- xts(grow[, -1], order.by = grow$Year)
dygraph(grow)
# Customize it: add a draggable range selector under the chart
dygraph(grow) %>% dyRangeSelector()
# More customization on height and chart type and headline
dygraph(
  grow,
  main = "Starbucks growth worldwide",
  ylab = "Starbucks"
) %>%
  dySeries("Worldwide", label = "World") %>%
  dySeries("US", label = "US") %>%
  dyOptions(stackedGraph = TRUE) %>%
  dyRangeSelector(height = 20)
# Bring in some interesting data
sbux <- read.csv(file = "starbucks.csv")
# Load in some libraries
# Leaflet for R tutorial https://rstudio.github.io/leaflet/
require(leaflet)
## Loading required package: leaflet
require(dplyr)
# Smoke test: one marker on the default basemap
m <- leaflet() %>%
  addTiles() %>% # default OpenStreetMap tiles
  addMarkers(
    lng = -71.101936,
    lat = 42.348799,
    popup = "Storytelling with Data"
  )
m # Print the map
# How many rows are there?
nrow(sbux)
## [1] 11135
# Base map bound to the Starbucks table so layers can refer to ~lon / ~lat
m <- leaflet(data = sbux) %>%
  addTiles()
# NOTE(review): neither result below is assigned back to `m`, so each call
# renders its own widget and the setView() does not carry over to the
# circles map -- confirm that is intended.
setView(m, lng = -98.964844, lat = 38.505191, zoom = 7)
addCircles(m, lng = ~lon, lat = ~lat)
# Close, but needs some cleaning up. Add some map customization
# Add custom map tiles -- look up here http://homepage.ntlworld.com/keir.clarke/leaflet/leafletlayers.htm
m <- leaflet(data = sbux) %>%
  addTiles(
    urlTemplate = "http://{s}.basemaps.cartocdn.com/dark_all/{z}/{x}/{y}.png"
  )
setView(m, lng = -98.964844, lat = 38.505191, zoom = 4)
# Small, borderless, half-transparent green dots, one per store
addCircles(
  m,
  lng = ~lon,
  lat = ~lat,
  weight = 2,
  radius = 1,
  color = "#008000",
  stroke = FALSE,
  fillOpacity = 0.5
)
# Let's try another mapping library for R. This time from Google
library(ggmap)
# https://www.nceas.ucsb.edu/~frazier/RSpatialGuides/ggmap/ggmapCheatsheet.pdf
# Let's bring in another interesting data set
dunk <- read.csv(file = "dunkindonuts.csv")
# Centre the basemap on this place name
myLocation <- "Lebanon, KS"
# Fetch the uncropped Stamen "toner" basemap at zoom level 4
myMap <- get_map(
  location = myLocation,
  source = "stamen",
  maptype = "toner",
  zoom = 4,
  crop = FALSE
)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=Lebanon,+KS&zoom=4&size=640x640&maptype=terrain&sensor=false
## Map from URL : http://tile.stamen.com/toner/4/2/4.png
## Map from URL : http://tile.stamen.com/toner/4/3/4.png
## Map from URL : http://tile.stamen.com/toner/4/4/4.png
## Map from URL : http://tile.stamen.com/toner/4/2/5.png
## Map from URL : http://tile.stamen.com/toner/4/3/5.png
## Map from URL : http://tile.stamen.com/toner/4/4/5.png
## Map from URL : http://tile.stamen.com/toner/4/2/6.png
## Map from URL : http://tile.stamen.com/toner/4/3/6.png
## Map from URL : http://tile.stamen.com/toner/4/4/6.png
## Map from URL : http://tile.stamen.com/toner/4/2/7.png
## Map from URL : http://tile.stamen.com/toner/4/3/7.png
## Map from URL : http://tile.stamen.com/toner/4/4/7.png
# Draw the basemap, then one small translucent orange dot per Dunkin' Donuts
ggmap(myMap) +
  geom_point(
    mapping = aes(x = lng, y = lat),
    data = dunk,
    colour = "orange",
    alpha = 0.5,
    size = 1
  )
# Alright, let's bring it together. We need to put them on one dataframe
# Take just the latitude and longitude columns in Starbucks (and state, too)
sb <- sbux[c("lat", "lon", "City", "Province")]
# Need a separate column to tell SB rows from DD rows once they are stacked
sb[["type"]] <- "Starbucks"
head(sb)
## lat lon City Province type
## 1 41.83619 -87.66409 Chicago IL Starbucks
## 2 33.30424 -111.86316 Chandler AZ Starbucks
## 3 32.58408 -97.13298 Mansfield TX Starbucks
## 4 47.78450 -122.21282 Bothell WA Starbucks
## 5 47.26970 -122.48869 Tacoma WA Starbucks
## 6 38.66510 -121.22398 Fair Oaks CA Starbucks
# The matching four fields from the Dunkin' Donuts table, tagged the same way
dd <- dunk[c("lat", "lng", "city", "state")]
dd[["type"]] <- "Dunkin' Donuts"
# Bring them together!
# sbdd <- rbind(sb, dd)
# Error?? Oh right, the columns are named differently.
# rbind() matches data-frame columns by name, so rename `sb` to dd's names
names(sb) <- c("lat", "lng", "city", "state", "type")
# OK, try it again
sbdd <- rbind(sb, dd)
# Back to leaflet! because it was so pretty
# First, turn `type` into a factor so each chain maps to an integer code,
# then use that code to look up one colour per chain
sbdd$type <- as.factor(sbdd$type)
levels(sbdd$type)
## [1] "Dunkin' Donuts" "Starbucks"
# Colours in level order: orange for Dunkin' Donuts, green for Starbucks
cols2 <- c("#FF8000", "#00ff00")
sbdd$colors <- cols2[as.integer(sbdd$type)]
# new leaflet code. so exciting
m <- leaflet(data = sbdd) %>%
  addTiles(
    urlTemplate = "http://{s}.basemaps.cartocdn.com/dark_all/{z}/{x}/{y}.png"
  )
# NOTE(review): as in the earlier maps, these two results are not assigned,
# so the view set here is not applied to the circles map -- confirm intent.
setView(m, lng = -98.964844, lat = 38.505191, zoom = 4)
addCircles(
  m,
  lng = ~lng,
  lat = ~lat,
  weight = 1,
  radius = 1,
  color = ~colors,
  stroke = FALSE,
  fillOpacity = 0.3
)
# OK, neat visual. Let's do some calculations
# Chart out the top 5 states for Starbucks
# Good guide for barcharts http://www.cookbook-r.com/Graphs/Bar_and_line_graphs_(ggplot2)/
# Count up the Starbucks per state and turn the table into a data frame
sbstate <- data.frame(table(sb$state))
head(sbstate)
## Var1 Freq
## 1 AK 42
## 2 AL 65
## 3 AR 37
## 4 AZ 391
## 5 CA 2456
## 6 CO 421
# Need to name the columns for clarity
colnames(sbstate) <- c("id", "Starbucks")
# Order the data frame in descending order of number of Starbucks
sbstate <- sbstate[order(-sbstate$Starbucks), ]
# head() returns six rows by default; ask for five explicitly so the chart
# really shows the top 5 states
sbgg <- ggplot(data = head(sbstate, 5), aes(x = id, y = Starbucks)) +
  ggtitle("States with the most Starbucks") +
  xlab("State") +
  geom_bar(fill = "darkgreen", stat = "identity")
sbgg
# Hm... Order seems off, right? The bars follow the factor levels of `id`
# (alphabetical state codes), not the row order of the data frame.
# Pull the top five rows out into their own data frame
sbhead <- head(sbstate, 5)
# The subset still carries every level of the original factor
levels(sbhead$id)
## [1] "AK" "AL" "AR" "AZ" "CA" "CO" "CT"
## [8] "DC" "DE" "FL" "GA" "HI" "IA" "ID"
## [15] "IL" "IN" "KS" "KY" "LA" "MA" "MD"
## [22] "ME" "MI" "MN" "MO" "MS" "MT" "MX-BCN"
## [29] "MX-CHH" "NC" "ND" "NE" "NH" "NJ" "NM"
## [36] "NV" "NY" "OH" "OK" "OR" "PA" "PR"
## [43] "RI" "SC" "SD" "TN" "TX" "UT" "VA"
## [50] "VT" "WA" "WI" "WV" "WY"
# Whoa, that's messy. Let's fix it
# First, purge the unused levels by converting to string and back again
sbhead$id <- as.character(sbhead$id)
sbhead$id <- as.factor(sbhead$id)
# Only the five remaining states are left, but still in alphabetical order
levels(sbhead$id)
## [1] "CA" "FL" "NY" "TX" "WA"
# Now reorder the levels to follow the row order (already sorted by count).
# Deriving them from the data avoids a hard-coded list of state codes that
# would silently go stale if the underlying counts change.
sbhead$id <- factor(sbhead$id, levels = as.character(sbhead$id))
levels(sbhead$id)
## [1] "CA" "TX" "WA" "FL" "NY"
# Ok, plot it again
sbgg <- ggplot(data = sbhead, aes(x = id, y = Starbucks)) +
  ggtitle("States with the most Starbucks") +
  xlab("State") +
  geom_bar(fill = "darkgreen", stat = "identity")
sbgg
# Want to see it on plotly? Go for it
# `py` is only created by the `py <- plotly()` setup step, which is commented
# out earlier in this script. Skip the upload unless that step was run, so the
# rest of the script does not stop with "object 'py' not found".
if (exists("py")) {
  plottedsb <- py$ggplotly(sbgg)
}
# Which states have the most SB or DD per capita?
# Bring in the population table
uspop <- read.csv("uspopulation.csv")
# Let's join them together, using the plyr library
# NOTE(review): plyr is attached after dplyr here, so same-named functions
# such as arrange() and desc() below presumably resolve to plyr's versions.
library(plyr)
# `sb` is reused from here on: it now holds one row per state (count plus
# population) rather than the per-store coordinate table built earlier.
sb <- join(sbstate, uspop)
## Joining by: id
head(sb)
## id Starbucks state population
## 1 CA 2456 California 37253956
## 2 TX 830 Texas 25145561
## 3 WA 689 Washington 6724540
## 4 FL 567 Florida 18801310
## 5 NY 516 New York 19378102
## 6 IL 506 Illinois 12830632
# It worked! OK, let's do some calculations
# Stores per 100,000 residents
sb$Per100kPeople <- (sb$Starbucks / sb$population) * 100000
# Rank states by density, highest first, and keep the top five
sb2 <- arrange(sb, desc(Per100kPeople))
sbhead2 <- head(sb2, 5)
# Drop the unused factor levels and order the remaining ones by rank.
# Levels come from the data itself instead of a hard-coded list of state
# codes, so the chart stays correct if the ranking changes.
sbhead2$id <- factor(
  as.character(sbhead2$id),
  levels = as.character(sbhead2$id)
)
levels(sbhead2$id)
## [1] "DC" "WA" "OR" "CO" "NV"
sb2gg <- ggplot(data = sbhead2, aes(x = id, y = Per100kPeople)) +
  ggtitle("Most Starbucks per capita") +
  xlab("State") +
  geom_bar(fill = "darkgreen", stat = "identity")
sb2gg
# Some fancy Chart layout
require(gridExtra)
## Loading required package: gridExtra
## Loading required package: grid
# Show both bar charts side by side under a shared title.
# gridExtra >= 2.0.0 takes the title via `top`; the older `main` argument is
# no longer recognised and gets treated as a (non-grob) plot input.
grid.arrange(sbgg, sb2gg, ncol = 2, top = "Starbucks popularity")
# Want to try it in Plotly? Go ahead.
##test <- grid.arrange(sbgg, sb2gg, ncol=2, main="Starbucks popularity")
##plottedpc <- py$ggplotly(test)
# Well, it won't work all the time...
# Because it used a new library (gridExtra) on top of ggplot
# OK, back to spatial join!
require(gtools)
## Loading required package: gtools
require(rgdal)
## Loading required package: rgdal
## Loading required package: sp
## rgdal: version: 0.9-3, (SVN revision 530)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 1.11.2, released 2015/02/10
## Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.2/Resources/library/rgdal/gdal
## Loaded PROJ.4 runtime: Rel. 4.9.1, 04 March 2015, [PJ_VERSION: 491]
## Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.2/Resources/library/rgdal/proj
## Linking to sp version: 1.1-0
require(scales)
## Loading required package: scales
require(Cairo)
## Loading required package: Cairo
require(gpclib)
## Loading required package: gpclib
## General Polygon Clipper Library for R (version 1.5-5)
## Type 'class ? gpc.poly' for help
require(maptools)
## Loading required package: maptools
## Checking rgeos availability: TRUE
require(reshape)
## Loading required package: reshape
##
## Attaching package: 'reshape'
##
## The following object is masked from 'package:tidyr':
##
## expand
##
## The following object is masked from 'package:dplyr':
##
## rename
##
## The following object is masked from 'package:lubridate':
##
## stamp
##
## The following objects are masked from 'package:plyr':
##
## rename, round_any
# Let's manipulate the Dunkin' Donuts data now.
# Focus on Dunkin' Donuts in Massachusetts only
# Check the structure of the Dunkin' Donuts table before filtering
str(dd)
## 'data.frame': 7794 obs. of 5 variables:
## $ lat : num 42.9 42.6 42.7 40.8 40.8 ...
## $ lng : num -73.8 -70.9 -70.6 -73.1 -74.2 ...
## $ city : Factor w/ 2593 levels "Aberdeen","Abingdon",..: 424 521 1954 1027 637 313 250 2486 2109 2026 ...
## $ state: Factor w/ 42 levels "AL","AR","AZ",..: 30 17 17 30 27 17 17 17 17 17 ...
## $ type : chr "Dunkin' Donuts" "Dunkin' Donuts" "Dunkin' Donuts" "Dunkin' Donuts" ...
# Keep only the Massachusetts stores
massdunk <- dd %>%
  filter(state == "MA")
# Tally stores per town; table() yields one row per city value
masscount <- data.frame(table(massdunk$city))
# Rename the default Var1/Freq columns: `id` is the town, `DD` the store count
names(masscount) <- c("id", "DD")
# Enable gpclib for maptools and confirm the permission took effect
gpclibPermit()
## Warning in gpclibPermit(): support for gpclib will be withdrawn from
## maptools at the next major release
## [1] TRUE
gpclibPermitStatus()
## [1] TRUE
# Read the town boundary shapefile (layer "town_shapes" in folder "towns")
towntracts <- readOGR(
  dsn = "towns",
  layer = "town_shapes"
)
## OGR data source with driver: ESRI Shapefile
## Source: "towns", layer: "town_shapes"
## with 1243 features
## It has 22 fields
# Flatten the polygons into a plain data frame of vertices; the TOWN field
# becomes the `id` column used for joining below
towntracts <- fortify(model = towntracts, region = "TOWN")
## MassData <- left_join(towntracts, masscount)
# That didn't work. Why?
# Because id in towntracts is in uppercase while masscount is not
masscount[["id"]] <- toupper(masscount[["id"]])
# Try again: attach the per-town store count to every polygon vertex
MassData <- towntracts %>%
  left_join(masscount)
## Joining by: "id"
head(MassData)
## long lat order hole piece group id DD
## 1 248314.5 872355.5 1 FALSE 1 ABINGTON.1 ABINGTON 4
## 2 245760.1 871717.6 2 FALSE 1 ABINGTON.1 ABINGTON 4
## 3 243309.1 870887.1 3 FALSE 1 ABINGTON.1 ABINGTON 4
## 4 243309.1 870891.9 4 FALSE 1 ABINGTON.1 ABINGTON 4
## 5 243313.8 870896.0 5 FALSE 1 ABINGTON.1 ABINGTON 4
## 6 243320.5 870898.4 6 FALSE 1 ABINGTON.1 ABINGTON 4
# Nice!
# Ok, now it's going to get a little crazy
# Choropleth of towns, filled by the number of Dunkin' Donuts in each one.
# NOTE(review): the long/lat values printed by head(MassData) look like
# projected coordinates rather than degrees; confirm coord_map() is the
# right coordinate system for them.
ddtowns <- ggplot() +
  geom_polygon(
    data = MassData,
    mapping = aes(x = long, y = lat, group = group, fill = DD),
    colour = "black", # thin black town outlines
    size = 0.2
  ) +
  coord_map() +
  scale_fill_distiller(
    type = "seq",
    palette = "Reds",
    breaks = pretty_breaks(n = 5)
  ) +
  theme_nothing(legend = TRUE) + # strip axes and background, keep the legend
  labs(title = "Dunkin Donut towns", fill = "")
ddtowns
# Now, we sit and wait
# neat!
# There's a slightly easier way
# Back to leaflet! (I love leaflet)
##pal <- colorQuantile("YlGn", NULL, n = 5)
##town_popup <- paste0("<strong>Dunkin' Donuts: </strong>", MassData$DD)
##mb_tiles <- "http://a.tiles.mapbox.com/v3/kwalkertcu.l1fc0hab/{z}/{x}/{y}.png"
##mb_attribution <- 'Mapbox <a href="http://mapbox.com/about/maps" target="_blank">Terms & Feedback</a>'
##leaflet(data = MassData) %>%
## addTiles(urlTemplate = mb_tiles,
## attribution = mb_attribution) %>%
## addPolygons(fillColor = ~pal(order),
## fillOpacity = 0.8,
## color = "#BDBDC3",
## weight = 1,
## popup = town_popup)
# Real quick, let's take a look at this amazing choropleth package
library(acs)
##
## Attaching package: 'acs'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:base':
##
## apply
library(choroplethr)
library(choroplethrMaps)
# Let's play with Census data-- Sign up for an API key
# http://www.census.gov/developers/
##api.key.install("yourkeygoeshere")
# Map ACS table B01003 by state.
# choroplethr_acs("B01003", "state") was deprecated in choroplethr 3.0.0 and,
# per its own deprecation warning, last worked in version 2.1.1. The
# replacements are state_choropleth_acs(), county_choropleth_acs() and
# zip_choropleth_acs(); the state-level one is used here.
state_choropleth_acs("B01003")
# You can look up more Census tables to map out
# http://censusreporter.org/topics/table-codes/
# Try it again but at the county level
##county_choropleth_acs("YourTableofChoice")
# So many choropleth options: Animated, Custom shape files