Friday, February 14, 2020

R and internet data (Brunsdon & Comber 2019 ch 9)

#Brunsdon & Comber ch 9: R and internet data
# See also https://www.r-spatial.org/r/2018/10/25/ggplot2-sf.html
# Spatial data science
# Or Using UNCOMTRADE data with R
#https://comtrade.un.org/Data/Doc/api/ex/r

#################################################################
#In Stata: jsonio rv, file("https://data.police.uk/api/crimes-street/all-crime?lat=52.629729&lng=-1.131592&date=2017-01")

#local url     "https://data.police.uk/api/crimes-street/all-crime?"
#local url_lat "lat=52.629729&"
#local url_lng "lng=-1.131592&"
#local url_dt  "date=2017-01"
#jsonio kv, file("`url'`url_lat'`url_lng'`url_dt'")
#################################################################
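# For comparison, the same request can be made directly from R in one
# line with the jsonlite package (a sketch, not part of the book's code;
# jsonlite is a different package from the rjson used later in this
# script, so it is called with :: here, and this assumes the
# data.police.uk API still serves the 2017-01 archive)
crimes_df <- jsonlite::fromJSON(paste0(
  "https://data.police.uk/api/crimes-street/all-crime?",
  "lat=52.629729&lng=-1.131592&date=2017-01"))
head(crimes_df$category)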

require(devtools)
require(eurostat)
require(ggplot2)
require(dplyr)

#9.1 Introduction
#9.2 Direct Access to Data
fpe <- read.table("http://data.princeton.edu/wws509/datasets/effort.dat") # may need https
head(fpe, 3)
##           setting effort change
## Bolivia        46      0      1
## Brazil         74      0     10
## Chile          89     16     29
pairs(fpe, panel=panel.smooth)

#Graph viz
require(Rgraphviz)
require(datasets)

# Load the state.x77 data
data(state)
# Which ones are 'connected', i.e. absolute correlation above 0.5?
connected <- abs(cor(state.x77)) > 0.5 
# Create the graph - node names are the column names
conn <- graphNEL(colnames(state.x77))
# Populate with edges - join variables that are TRUE in 'connected'
for (i in colnames(connected)) {
  for (j in colnames(connected)) {
    if (i < j && connected[i, j]) {
      conn <- addEdge(i, j, conn, 1)
    }
  }
}
# Create a layout for the graph
conn <- layoutGraph(conn)
# Specify some drawing parameters
attrs <- list(node=list(shape="ellipse", 
                        fixedsize=FALSE, fontsize=12))

plot(conn,attrs=attrs)
#Figure 9.2: Illustration of Rgraphviz
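# Rgraphviz lives on Bioconductor and can be awkward to install; a
# similar connectivity plot can be drawn with the CRAN package igraph
# instead (a sketch, not the book's code)
library(igraph)
g <- graph_from_adjacency_matrix(connected * 1, mode = "undirected",
                                 diag = FALSE)
plot(g, vertex.shape = "none", vertex.label.cex = 0.8)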

#9.3 Using RCurl
library(RCurl) # Load RCurl

# Get the content of the URL and store it into 'temp'
stem <- 'https://www.gov.uk/government/uploads/system/uploads'
file1 <- '/attachment_data/file/15240/1871702.csv'
temp <- getURL(paste0(stem,file1))

# Use textConnection to read the content of temp
# as though it were a CSV file
imd <- read.csv(textConnection(temp))
# Check - this should give the first 10 column names of the data frame.
# Instead it returns a single garbled name: the gov.uk URL now redirects
# to assets.publishing.service.gov.uk, and getURL() does not follow
# redirects by default, so 'temp' holds an HTML redirect page
head(colnames(imd), 10)
## [1] "X.html..body.You.are.being..a.href.https...assets.publishing.service.gov.uk.government.uploads.system.uploads.attachment_data.file.15240.1871702.csv.redirected..a....body...html."
# A helper to read a CSV over https with RCurl; followlocation = TRUE
# makes getURL() follow redirects such as the one above
read.csv.https <- function(url) {
  temp <- getURL(url, followlocation = TRUE)
  return(read.csv(textConnection(temp)))
}
# Download the csv data and put it into 'imd'
imd <- read.csv(paste0(stem,file1))
# Modify the margins around the plot, to fit the GOR
# names into the left-hand margin
par(mar=c(5,12,4,2) + 0.1)
# Create the boxplot. The 'las' parameter specifies that y-axis
# labels are horizontal and x-axis labels are vertical
boxplot(IMD.SCORE~GOR.NAME,data=imd,horizontal=TRUE,las=2)
#Figure 9.3: Boxplot of IMDs by Government Office Regions

#9.4 Working with APIs
require(rjson)
crimes.buf <- getForm("http://data.police.uk/api/crimes-street/all-crime",  #may need https protocol
  lat="52.629729",
  lng="-1.131592",
  date="2017-01")

crimes <- fromJSON(crimes.buf)
crimes[[1]]
## 
## $month
## [1] "2016-04"

getLonLat <- function(x) as.numeric(c(x$location$longitude,
                                      x$location$latitude))
crimes.loc <- t(sapply(crimes,getLonLat))
head(crimes.loc)
##           [,1]     [,2]
## [1,] -2.979198 53.40392
## [2,] -2.987014 53.40565

getAttr <- function(x) c(
  x$category,
  x$location$street$name,
  x$location_type)
crimes.attr <- as.data.frame(t(sapply(crimes,getAttr)))
colnames(crimes.attr) <- c("category","street","location_type")
head(crimes.attr)
##                category                      street location_type
## 1 anti-social-behaviour On or near Back Bold Street         Force
## 2 anti-social-behaviour    On or near Shopping Area         Force

library(sp)
crimes.pts <- SpatialPointsDataFrame(crimes.loc,crimes.attr)
# Specify the projection - in this case just geographical coordinates
proj4string(crimes.pts) <- CRS("+proj=longlat")
# Note that 'head' doesn't work on SpatialPointsDataFrames
crimes.pts[1:2,] 
##             coordinates              category                      street
## 1 (-2.979198, 53.40392) anti-social-behaviour On or near Back Bold Street
## 2 (-2.987014, 53.40565) anti-social-behaviour    On or near Shopping Area
##   location_type
## 1         Force
## 2         Force
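# The same point layer can also be built with the newer sf package,
# which tmap accepts as well (a sketch, not the book's code; assumes
# the sf package is installed)
library(sf)
crimes.sf <- st_as_sf(data.frame(crimes.attr,
                                 lng = crimes.loc[,1],
                                 lat = crimes.loc[,2]),
                      coords = c("lng", "lat"), crs = 4326)
head(crimes.sf)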

asb.pts <- crimes.pts[crimes.pts$category=="anti-social-behaviour",]
cda.pts <- crimes.pts[crimes.pts$category=="criminal-damage-arson",]
require(tmap)
## Loading required package: tmap
require(tmaptools)
## Loading required package: tmaptools
asb_cda.pts <- rbind(asb.pts,cda.pts)
asb_cda.pts$category <- as.character(asb_cda.pts$category)
tm_shape(asb_cda.pts) + tm_dots(col='category',title='Crime Type',
                                labels=c("Antisocial Behaviour","Criminal Damage"),
                                palette=c('indianred','dodgerblue'),size=0.1) 
## Linking to GEOS 3.6.1, GDAL 2.1.3, PROJ 4.9.3
#Figure 9.4: Locations of anti-social behaviour and criminal damage/arson incidents

tmap_mode('view')
## tmap mode set to interactive viewing
tm_shape(asb_cda.pts) + tm_dots(col='category',title='Crime Type',
                                labels=c("Antisocial Behaviour","Criminal Damage"),
                                palette=c('indianred','dodgerblue'),size=0.02) 

#9.5 Creating a Statistical ‘Mashup’
terr3bed.buf <- getForm("https://api.nestoria.co.uk/api",
                        action='search_listings',
                        place_name='liverpool',
                        encoding='json',
                        listing_type='buy',
                        number_of_results=50,
                        bedroom_min=3,bedroom_max=3,
                        keywords='terrace')
## Ignore warning.

terr3bed <- fromJSON(terr3bed.buf)
getHouseAttr <- function(x) {
  as.numeric(c(x$price/1000,x$longitude,x$latitude))
}

terr3bed.attr <- as.data.frame(t(sapply(terr3bed$response$listings,
                                        getHouseAttr)))

colnames(terr3bed.attr) <- c("price","longitude","latitude")
head(terr3bed.attr)
##    price longitude latitude
## 1 120.00 -2.940470 53.41720
## 2  95.00 -2.924914 53.40125

# Create an extra column to contain burglary rates
terr3bed.attr <- transform(terr3bed.attr,burgs=0)

# For each house in the data frame
for (i in 1:50) {
  # Firstly obtain crimes in a 1-mile radius of the house's
  # latitude and longitude and decode them from JSON form
  crimes.near <- getForm(
    "http://data.police.uk/api/crimes-street/all-crime",
    lat=terr3bed.attr$latitude[i],
    lng=terr3bed.attr$longitude[i],
    date="2017-01")
  crimes.near <- fromJSON(crimes.near)
  crimes.near <- as.data.frame(t(sapply(crimes.near,getAttr)))
  # Then from the 'category' column count the number of burglaries
  # and assign it to the burgs column
  terr3bed.attr$burgs[i] <- sum(crimes.near[,1] == 'burglary')
  # Pause before running next API request - to avoid overloading
  # the server
  Sys.sleep(0.7)
  # Note this stage may cause the code to take a minute or two to run
}
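# A more defensive version of one pass through the loop (a sketch, not
# the book's code): tryCatch() returns NA for a failed API call instead
# of stopping the whole loop
safe_burgs <- function(lat, lng, date = "2017-01") {
  tryCatch({
    buf <- getForm("http://data.police.uk/api/crimes-street/all-crime",
                   lat = lat, lng = lng, date = date)
    sum(sapply(fromJSON(buf), function(x) x$category) == "burglary")
  }, error = function(e) NA)
}
# e.g. safe_burgs(terr3bed.attr$latitude[1], terr3bed.attr$longitude[1])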

ggplot(terr3bed.attr,aes(x=burgs,y=price)) + geom_point() +
  geom_smooth(span=1) + 
  labs(x='Burglaries in a 1-Mile Radius',
       y='House Price (1000s Pounds)')
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#Figure 9.5: Scatter plot of Burglary Rate vs. House Price

#9.6 Using Specific Packages: eurostat
tgs00026 <- get_eurostat("tgs00026", time_format = "raw")
## Table tgs00026 cached at /var/folders/t4/3d2rv9_10wx7_xckhgslrmv00000gn/T//RtmpNT6Mtr/eurostat/tgs00026_raw_code_TF.rds
head(tgs00026)
## # A tibble: 6 x 5
##   unit     na_item geo   time  values
##   <fct>    <fct>   <fct> <chr>  <dbl>
## 1 PPCS_HAB B6N     AT11  2005   18100
## 2 PPCS_HAB B6N     AT12  2005   18900

tgs00026_06       <- tgs00026[tgs00026$time=='2006',]
tgs00026_10       <- tgs00026[tgs00026$time=='2010',]
tgs00026_10$delta <- 100*(tgs00026_10$values - tgs00026_06$values)/tgs00026_06$values
tgs00026_10$country <- substr(as.character(tgs00026_10$geo),1,2)
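# The delta calculation above assumes the 2006 and 2010 rows come back
# in the same 'geo' order; a merge by 'geo' avoids relying on that
# (a sketch, not the book's code)
tgs_wide <- merge(tgs00026_06[, c("geo", "values")],
                  tgs00026_10[, c("geo", "values")],
                  by = "geo", suffixes = c("_06", "_10"))
tgs_wide$delta <- 100 * (tgs_wide$values_10 - tgs_wide$values_06) /
  tgs_wide$values_06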
ggplot(tgs00026_10,aes(x=country,y=delta)) + geom_boxplot() +
  labs(x="Country",y="% Change in Household\n Income 2005-2010")
#Figure 9.6: Box plots of disposable income change 2005-10 by country (Eurostat data).

library(forcats)
tgs00026_10$country <- 
  fct_reorder(tgs00026_10$country,tgs00026_10$delta,median)
ggplot(tgs00026_10,aes(x=country,y=delta)) + geom_boxplot() + 
  labs(x="Country",y="% Change in Household\n Income 2005-2010")
#Figure 9.7: Median ordered box plots of disposable income change 2005-10 by country.

tmap_mode('plot')
## tmap mode set to plotting
tgs_es <- tgs00026_10[tgs00026_10$country=='ES',]
tgs_map <- get_eurostat_geospatial(output_class = "sf", resolution = "10", nuts_level = "2") %>%
  right_join(tgs_es)
head(tgs_map)

ggplot() + geom_sf(data = tgs_map, aes(fill=delta))

tgs_mlmap <- tgs_map[!(tgs_map$NUTS_ID %in% c("ES70","ES53")),]
ggplot() + geom_sf(data = tgs_mlmap, aes(fill=delta))

#Figure 9.9: Household Income Change (%, 2005-10) In Mainland Spain


##### From eurostat vignette
require(ggplot2)
require(dplyr)
ct <- c("AT", "BE", "BG", "DE", "ES", "FR", "IT", "SE", "UK")
dat <- get_eurostat(id ="tps00199", filters = list(geo=ct))
head(dat)
## # A tibble: 6 x 4
##   indic_de geo   time       values
##   <fct>    <fct> <date>      <dbl>
## 1 TOTFERRT AT    2007-01-01   1.38
## 2 TOTFERRT AT    2008-01-01   1.42
ggplot(data = dat, mapping = aes(x=time, y=values, color=geo)) +
  geom_line()

# bar chart
dat_2015 <- dat %>% filter(time == "2015-01-01")
ggplot(dat_2015, aes(x = reorder(geo, values), y = values)) +
  geom_col(color = "white", fill = "grey80") +
  labs(title = "Total fertility rate, 2015", y = "%", x = NULL)

mapdata <- get_eurostat_geospatial(nuts_level = 0) %>%
  right_join(dat_2015) %>%
  mutate(categ = cut_to_classes(values, n=4, decimals=1))
head(select(mapdata, geo, values, categ), 2)
ggplot(mapdata, aes(fill = categ)) +
  geom_sf(color = alpha("white", .3), alpha = .6) +
  xlim(c(-12,44)) + ylim(c(35,70)) +
  labs(title = "Total fertility rate, 2015", subtitle = "Avg no life birth per women", fill = "%")


#9.7 Web Scraping
grepl('Chris[0-9]*',c('Chris','Lex','Chris1999','Chris Brunsdon'))
## [1]  TRUE FALSE  TRUE  TRUE
grep('Chris[0-9]*',c('Chris','Lex','Chris1999','Chris Brunsdon'))
## [1] 1 3 4
grep('Chris[0-9]*',c('Chris','Lex','Chris1999','Chris Brunsdon'),
     value=TRUE)
## [1] "Chris"          "Chris1999"      "Chris Brunsdon"

#9.7.1 Scraping Train Times
web.buf <- readLines(
  "https://traintimes.org.uk/durham/leicester/00:00/monday")
times <- grep("strong.*[0-2][0-9]:[0-5][0-9].*[0-2][0-9]:[0-5][0-9]",
              web.buf,value=TRUE)
locs <- gregexpr("[0-2][0-9]:[0-5][0-9]",times)
# Show the match information for times[1]
locs[[1]]
## [1] 46 60
## attr(,"match.length")
## [1] 5 5
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
timedata <- matrix(0,length(locs),4)
ptr <- 1
for (loc in locs) {
  timedata[ptr,1] <- as.numeric(substr(times[ptr],loc[1],loc[1]+1))
  timedata[ptr,2] <- as.numeric(substr(times[ptr],loc[1]+3,loc[1]+4))
  timedata[ptr,3] <- as.numeric(substr(times[ptr],loc[2],loc[2]+1))
  timedata[ptr,4] <- as.numeric(substr(times[ptr],loc[2]+3,loc[2]+4))
  ptr <- ptr + 1
}
colnames(timedata) <- c('h1','m1','h2','m2')
timedata <- transform(timedata,duration = h2 + m2/60 - h1 - m1/60 )
timedata
##   h1 m1 h2 m2 duration
## 1  4 59  8 24 3.416667
## 2  4 59  8 31 3.533333
## 3  6 38  9 18 2.666667
## 4  6 44  9 46 3.033333
## 5  6 57 10  0 3.050000
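# The substr() bookkeeping above can be avoided with regmatches(),
# which returns the matched "HH:MM" strings directly (a sketch, not
# the book's code)
hhmm <- regmatches(times, locs)
dep  <- sapply(hhmm, `[`, 1)  # departure times, e.g. "04:59"
arr  <- sapply(hhmm, `[`, 2)  # arrival times
head(cbind(dep, arr))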
