Friday, February 14, 2020

R and internet data (Brunsdon & Comber 2019 ch 9)

#Brunsdon & Comber ch 9: R and internet data
# See also https://www.r-spatial.org/r/2018/10/25/ggplot2-sf.html
# Spatial data science
# Or Using UNCOMTRADE data with R
#https://comtrade.un.org/Data/Doc/api/ex/r

#################################################################
#In Stata: jsonio rv, file("https://data.police.uk/api/crimes-street/all-crime?lat=52.629729&lng=-1.131592&date=2017-01")

#local url     "https://data.police.uk/api/crimes-street/all-crime?"
#local url_lat "lat=52.629729&"
#local url_lng "lng=-1.131592&"
#local url_dt  "date=2017-01"
#jsonio kv, file("`url'`url_lat'`url_lng'`url_dt'")
#################################################################
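# For comparison, the same request can be made directly from R in one
# line with the jsonlite package (a sketch, not part of the book's code;
# jsonlite is a different package from the rjson used later in this
# script, so it is called with :: here, and this assumes the
# data.police.uk API still serves the 2017-01 archive)
crimes_df <- jsonlite::fromJSON(paste0(
  "https://data.police.uk/api/crimes-street/all-crime?",
  "lat=52.629729&lng=-1.131592&date=2017-01"))
head(crimes_df$category)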

require(devtools)
require(eurostat)
require(ggplot2)
require(dplyr)

#9.1 Introduction
#9.2 Direct Access to Data
fpe <- read.table("http://data.princeton.edu/wws509/datasets/effort.dat") # may need https
head(fpe, 3)
##           setting effort change
## Bolivia        46      0      1
## Brazil         74      0     10
## Chile          89     16     29
pairs(fpe, panel=panel.smooth)

#Graph viz
require(Rgraphviz)
require(datasets)

# Load the state.x77 data
data(state)
# Which ones are 'connected', i.e. absolute correlation above 0.5?
connected <- abs(cor(state.x77)) > 0.5 
# Create the graph - node names are the column names
conn <- graphNEL(colnames(state.x77))
# Populate with edges - join variables that are TRUE in 'connected'
for (i in colnames(connected)) {
  for (j in colnames(connected)) {
    if (i < j && connected[i, j]) {
      conn <- addEdge(i, j, conn, 1)
    }
  }
}
# Create a layout for the graph
conn <- layoutGraph(conn)
# Specify some drawing parameters
attrs <- list(node=list(shape="ellipse", 
                        fixedsize=FALSE, fontsize=12))

plot(conn,attrs=attrs)
#Figure 9.2: Illustration of Rgraphviz
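# Rgraphviz lives on Bioconductor and can be awkward to install; a
# similar connectivity plot can be drawn with the CRAN package igraph
# instead (a sketch, not the book's code)
library(igraph)
g <- graph_from_adjacency_matrix(connected * 1, mode = "undirected",
                                 diag = FALSE)
plot(g, vertex.shape = "none", vertex.label.cex = 0.8)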

#9.3 Using RCurl
library(RCurl) # Load RCurl

# Get the content of the URL and store it into 'temp'
stem <- 'https://www.gov.uk/government/uploads/system/uploads'
file1 <- '/attachment_data/file/15240/1871702.csv'
temp <- getURL(paste0(stem,file1))

# Use textConnection to read the content of temp
# as though it were a CSV file
imd <- read.csv(textConnection(temp))
# Check - this should give the first 10 column names of the data frame.
# Instead it returns a single garbled name: the gov.uk URL now redirects
# to assets.publishing.service.gov.uk, and getURL() does not follow
# redirects by default, so 'temp' holds an HTML redirect page
head(colnames(imd), 10)
## [1] "X.html..body.You.are.being..a.href.https...assets.publishing.service.gov.uk.government.uploads.system.uploads.attachment_data.file.15240.1871702.csv.redirected..a....body...html."
# A helper to read a CSV over https with RCurl; followlocation = TRUE
# makes getURL() follow redirects such as the one above
read.csv.https <- function(url) {
  temp <- getURL(url, followlocation = TRUE)
  return(read.csv(textConnection(temp)))
}
# Download the csv data and put it into 'imd'
imd <- read.csv(paste0(stem,file1))
# Modify the margins around the plot, to fit the GOR
# names into the left-hand margin
par(mar=c(5,12,4,2) + 0.1)
# Create the boxplot. The 'las' parameter specifies that y-axis
# labels are horizontal and x-axis labels are vertical
boxplot(IMD.SCORE~GOR.NAME,data=imd,horizontal=TRUE,las=2)
#Figure 9.3: Boxplot of IMDs by Government Office Regions

#9.4 Working with APIs
require(rjson)
crimes.buf <- getForm("http://data.police.uk/api/crimes-street/all-crime",  #may need https protocol
  lat="52.629729",
  lng="-1.131592",
  date="2017-01")

crimes <- fromJSON(crimes.buf)
crimes[[1]]
## 
## $month
## [1] "2016-04"

getLonLat <- function(x) as.numeric(c(x$location$longitude,
                                      x$location$latitude))
crimes.loc <- t(sapply(crimes,getLonLat))
head(crimes.loc)
##           [,1]     [,2]
## [1,] -2.979198 53.40392
## [2,] -2.987014 53.40565

getAttr <- function(x) c(
  x$category,
  x$location$street$name,
  x$location_type)
crimes.attr <- as.data.frame(t(sapply(crimes,getAttr)))
colnames(crimes.attr) <- c("category","street","location_type")
head(crimes.attr)
##                category                      street location_type
## 1 anti-social-behaviour On or near Back Bold Street         Force
## 2 anti-social-behaviour    On or near Shopping Area         Force

library(sp)
crimes.pts <- SpatialPointsDataFrame(crimes.loc,crimes.attr)
# Specify the projection - in this case just geographical coordinates
proj4string(crimes.pts) <- CRS("+proj=longlat")
# Note that 'head' doesn't work on SpatialPointsDataFrames
crimes.pts[1:2,] 
##             coordinates              category                      street
## 1 (-2.979198, 53.40392) anti-social-behaviour On or near Back Bold Street
## 2 (-2.987014, 53.40565) anti-social-behaviour    On or near Shopping Area
##   location_type
## 1         Force
## 2         Force
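# The same point layer can also be built with the newer sf package,
# which tmap accepts as well (a sketch, not the book's code; assumes
# the sf package is installed)
library(sf)
crimes.sf <- st_as_sf(data.frame(crimes.attr,
                                 lng = crimes.loc[,1],
                                 lat = crimes.loc[,2]),
                      coords = c("lng", "lat"), crs = 4326)
head(crimes.sf)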

asb.pts <- crimes.pts[crimes.pts$category=="anti-social-behaviour",]
cda.pts <- crimes.pts[crimes.pts$category=="criminal-damage-arson",]
require(tmap)
## Loading required package: tmap
require(tmaptools)
## Loading required package: tmaptools
asb_cda.pts <- rbind(asb.pts,cda.pts)
asb_cda.pts$category <- as.character(asb_cda.pts$category)
tm_shape(asb_cda.pts) + tm_dots(col='category',title='Crime Type',
                                labels=c("Antisocial Behaviour","Criminal Damage"),
                                palette=c('indianred','dodgerblue'),size=0.1) 
## Linking to GEOS 3.6.1, GDAL 2.1.3, PROJ 4.9.3
#Figure 9.4: Locations of anti-social behaviour and criminal damage/arson incidents

tmap_mode('view')
## tmap mode set to interactive viewing
tm_shape(asb_cda.pts) + tm_dots(col='category',title='Crime Type',
                                labels=c("Antisocial Behaviour","Criminal Damage"),
                                palette=c('indianred','dodgerblue'),size=0.02) 

#9.5 Creating a Statistical ‘Mashup’
terr3bed.buf <- getForm("https://api.nestoria.co.uk/api",
                        action='search_listings',
                        place_name='liverpool',
                        encoding='json',
                        listing_type='buy',
                        number_of_results=50,
                        bedroom_min=3,bedroom_max=3,
                        keywords='terrace')
## Ignore warning.

terr3bed <- fromJSON(terr3bed.buf)
getHouseAttr <- function(x) {
  as.numeric(c(x$price/1000,x$longitude,x$latitude))
}

terr3bed.attr <- as.data.frame(t(sapply(terr3bed$response$listings,
                                        getHouseAttr)))

colnames(terr3bed.attr) <- c("price","longitude","latitude")
head(terr3bed.attr)
##    price longitude latitude
## 1 120.00 -2.940470 53.41720
## 2  95.00 -2.924914 53.40125

# Create an extra column to contain burglary rates
terr3bed.attr <- transform(terr3bed.attr,burgs=0)

# For each house in the data frame
for (i in 1:50) {
  # Firstly obtain crimes in a 1-mile radius of the house's
  # latitude and longitude and decode them from JSON form
  crimes.near <- getForm(
    "http://data.police.uk/api/crimes-street/all-crime",
    lat=terr3bed.attr$latitude[i],
    lng=terr3bed.attr$longitude[i],
    date="2017-01")
  crimes.near <- fromJSON(crimes.near)
  crimes.near <- as.data.frame(t(sapply(crimes.near,getAttr)))
  # Then from the 'category' column count the number of burglaries
  # and assign it to the burgs column
  terr3bed.attr$burgs[i] <- sum(crimes.near[,1] == 'burglary')
  # Pause before running next API request - to avoid overloading
  # the server
  Sys.sleep(0.7)
  # Note this stage may cause the code to take a minute or two to run
}
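# A more defensive version of one pass through the loop (a sketch, not
# the book's code): tryCatch() returns NA for a failed API call instead
# of stopping the whole loop
safe_burgs <- function(lat, lng, date = "2017-01") {
  tryCatch({
    buf <- getForm("http://data.police.uk/api/crimes-street/all-crime",
                   lat = lat, lng = lng, date = date)
    sum(sapply(fromJSON(buf), function(x) x$category) == "burglary")
  }, error = function(e) NA)
}
# e.g. safe_burgs(terr3bed.attr$latitude[1], terr3bed.attr$longitude[1])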

ggplot(terr3bed.attr,aes(x=burgs,y=price)) + geom_point() +
  geom_smooth(span=1) + 
  labs(x='Burglaries in a 1-Mile Radius',
       y='House Price (1000s Pounds)')
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#Figure 9.5: Scatter plot of Burglary Rate vs. House Price

#9.6 Using Specific Packages: eurostat
tgs00026 <- get_eurostat("tgs00026", time_format = "raw")
## Table tgs00026 cached at /var/folders/t4/3d2rv9_10wx7_xckhgslrmv00000gn/T//RtmpNT6Mtr/eurostat/tgs00026_raw_code_TF.rds
head(tgs00026)
## # A tibble: 6 x 5
##   unit     na_item geo   time  values
##   <fct>    <fct>   <fct> <chr>  <dbl>
## 1 PPCS_HAB B6N     AT11  2005   18100
## 2 PPCS_HAB B6N     AT12  2005   18900

tgs00026_06       <- tgs00026[tgs00026$time=='2006',]
tgs00026_10       <- tgs00026[tgs00026$time=='2010',]
tgs00026_10$delta <- 100*(tgs00026_10$values - tgs00026_06$values)/tgs00026_06$values
tgs00026_10$country <- substr(as.character(tgs00026_10$geo),1,2)
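# The delta calculation above assumes the 2006 and 2010 rows come back
# in the same 'geo' order; a merge by 'geo' avoids relying on that
# (a sketch, not the book's code)
tgs_wide <- merge(tgs00026_06[, c("geo", "values")],
                  tgs00026_10[, c("geo", "values")],
                  by = "geo", suffixes = c("_06", "_10"))
tgs_wide$delta <- 100 * (tgs_wide$values_10 - tgs_wide$values_06) /
  tgs_wide$values_06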
ggplot(tgs00026_10,aes(x=country,y=delta)) + geom_boxplot() +
  labs(x="Country",y="% Change in Household\n Income 2005-2010")
#Figure 9.6: Box plots of disposable income change 2005-10 by country (Eurostat data).

library(forcats)
tgs00026_10$country <- 
  fct_reorder(tgs00026_10$country,tgs00026_10$delta,median)
ggplot(tgs00026_10,aes(x=country,y=delta)) + geom_boxplot() + 
  labs(x="Country",y="% Change in Household\n Income 2005-2010")
#Figure 9.7: Median ordered box plots of disposable income change 2005-10 by country.

tmap_mode('plot')
## tmap mode set to plotting
tgs_es <- tgs00026_10[tgs00026_10$country=='ES',]
tgs_map <- get_eurostat_geospatial(output_class = "sf", resolution = "10", nuts_level = "2") %>%
  right_join(tgs_es)
head(tgs_map)

ggplot() + geom_sf(data = tgs_map, aes(fill=delta))

tgs_mlmap <- tgs_map[!(tgs_map$NUTS_ID %in% c("ES70","ES53")),]
ggplot() + geom_sf(data = tgs_mlmap, aes(fill=delta))

#Figure 9.9: Household Income Change (%, 2005-10) In Mainland Spain


##### From eurostat vignette
require(ggplot2)
require(dplyr)
ct <- c("AT", "BE", "BG", "DE", "ES", "FR", "IT", "SE", "UK")
dat <- get_eurostat(id ="tps00199", filters = list(geo=ct))
head(dat)
## # A tibble: 6 x 4
##   indic_de geo   time       values
##   <fct>    <fct> <date>      <dbl>
## 1 TOTFERRT AT    2007-01-01   1.38
## 2 TOTFERRT AT    2008-01-01   1.42
ggplot(data = dat, mapping = aes(x=time, y=values, color=geo)) +
  geom_line()

# bar chart
dat_2015 <- dat %>% filter(time == "2015-01-01")
ggplot(dat_2015, aes(x = reorder(geo, values), y = values)) +
  geom_col(color = "white", fill = "grey80") +
  labs(title = "Total fertility rate, 2015", y = "%", x = NULL)

mapdata <- get_eurostat_geospatial(nuts_level = 0) %>%
  right_join(dat_2015) %>%
  mutate(categ = cut_to_classes(values, n=4, decimals=1))
head(select(mapdata, geo, values, categ), 2)
ggplot(mapdata, aes(fill = categ)) +
  geom_sf(color = alpha("white", .3), alpha = .6) +
  xlim(c(-12,44)) + ylim(c(35,70)) +
  labs(title = "Total fertility rate, 2015", subtitle = "Avg no life birth per women", fill = "%")


#9.7 Web Scraping
grepl('Chris[0-9]*',c('Chris','Lex','Chris1999','Chris Brunsdon'))
## [1]  TRUE FALSE  TRUE  TRUE
grep('Chris[0-9]*',c('Chris','Lex','Chris1999','Chris Brunsdon'))
## [1] 1 3 4
grep('Chris[0-9]*',c('Chris','Lex','Chris1999','Chris Brunsdon'),
     value=TRUE)
## [1] "Chris"          "Chris1999"      "Chris Brunsdon"

#9.7.1 Scraping Train Times
web.buf <- readLines(
  "https://traintimes.org.uk/durham/leicester/00:00/monday")
times <- grep("strong.*[0-2][0-9]:[0-5][0-9].*[0-2][0-9]:[0-5][0-9]",
              web.buf,value=TRUE)
locs <- gregexpr("[0-2][0-9]:[0-5][0-9]",times)
# Show the match information for times[1]
locs[[1]]
## [1] 46 60
## attr(,"match.length")
## [1] 5 5
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
timedata <- matrix(0,length(locs),4)
ptr <- 1
for (loc in locs) {
  timedata[ptr,1] <- as.numeric(substr(times[ptr],loc[1],loc[1]+1))
  timedata[ptr,2] <- as.numeric(substr(times[ptr],loc[1]+3,loc[1]+4))
  timedata[ptr,3] <- as.numeric(substr(times[ptr],loc[2],loc[2]+1))
  timedata[ptr,4] <- as.numeric(substr(times[ptr],loc[2]+3,loc[2]+4))
  ptr <- ptr + 1
}
colnames(timedata) <- c('h1','m1','h2','m2')
timedata <- transform(timedata,duration = h2 + m2/60 - h1 - m1/60 )
timedata
##   h1 m1 h2 m2 duration
## 1  4 59  8 24 3.416667
## 2  4 59  8 31 3.533333
## 3  6 38  9 18 2.666667
## 4  6 44  9 46 3.033333
## 5  6 57 10  0 3.050000
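# The substr() bookkeeping above can be avoided with regmatches(),
# which returns the matched "HH:MM" strings directly (a sketch, not
# the book's code)
hhmm <- regmatches(times, locs)
dep  <- sapply(hhmm, `[`, 1)  # departure times, e.g. "04:59"
arr  <- sapply(hhmm, `[`, 2)  # arrival times
head(cbind(dep, arr))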
