Shannon Gallagher
February 24, 2017
spew
library(devtools)
if(!require(rgeos)) install.packages("rgeos")
install_github("leerichardson/spew")
## Coming to CRAN soon!
if(!require(rgeos)) install.packages("rgeos")
install_github("leerichardson/spew")
library(spew)
library(ggplot2)
library(maptools)
library(plyr)
library(knitr)
library(devtools)
library(osmar)
src <- osmsource_api()
my_bbox <- corner_bbox(left = 10.4935, right = 10.5165,
bottom = 43.8376, top = 43.8504)
lucca <- get_osm(my_bbox, src)
plot(lucca, xlab = "", ylab = "")
## Bounding box coordinates
left <- 10.4935
right <- 10.5165
bottom <- 43.8376
top = 43.8504
mid <- top - bottom
## Making into a spatial polygons object
xym <- matrix(c(left, bottom + mid, right, bottom + mid, right, bottom, left, bottom, left, bottom + mid ),
byrow = TRUE, ncol = 2)
p <- Polygon(xym)
ps <- Polygons(list(p), "A")
sps <- SpatialPolygons(list(ps))
## Add projection
proj4string(sps) <- CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0")
## Make into SpatialPolygonsDataFrame
data <- data.frame(row.names= c("A"), place_id = c("Lucca"))
spdf <- SpatialPolygonsDataFrame(sps, data)
## Write our shapefile
writeSpatialShape(spdf, "lucca_bdry")
## Extract highways
hw_ids <- find(lucca, way(tags(k == "highway")))
hw_ids <- find_down(lucca, way(hw_ids))
hw <- subset(lucca, ids = hw_ids)
hw_line <- as_sp(hw, "lines")
hw_line@data$timestamp <- as.character(hw_line@data$timestamp)
## Get rid of single point lines to work in gIntersection
inds <- lapply(1:length(hw_line), function(ii) hw_line[ii,]@lines[[1]]@Lines[[1]]@coords)
npts <- sapply(inds, nrow)
single_pt_inds <- which(npts <= 1)
hw_line <- hw_line[-single_pt_inds,]
## Writing the new shapefile
writeSpatialShape(hw_line, "lucca_roads")
pop_table <- data.frame(place_id = c("Lucca"), puma_id = rep("italy", 1), n_house = c(3500))
pop_table
place_id puma_id n_house
1 Lucca italy 3500
write.csv(pop_table, "lucca_pop.csv", row.names = FALSE)
Households
## Household microdata
## From IPUMS-I
head(pums_h[,2:5])
YEAR SAMPLE SERIALNO PERSONS
1 2001 380200101 344465000 1
2 2001 380200101 156930000 3
3 2001 380200101 223988000 5
4 2001 380200101 213919000 1
5 2001 380200101 1073889000 2
6 2001 380200101 1062517000 1
dim(pums_h)
[1] 10000 21
Persons
## Person microdata
## From IPUMS-I
head(pums_p[,2:5])
YEAR SAMPLE SERIALNO PERSONS
1 2001 380200101 344465000 1
2 2001 380200101 156930000 3
3 2001 380200101 156930000 3
4 2001 380200101 156930000 3
5 2001 380200101 223988000 5
6 2001 380200101 223988000 5
dim(pums_p)
[1] 25763 40
library(osmar)
## Get the Lucca geography from OSM
src <- osmsource_osmosis(file = "map.osm")
my_bbox <- corner_bbox(left = 10.4935, right = 10.5165,
bottom = 43.8376, top = 43.8504)
lucca_osm<- get_osm(my_bbox, src)
## Look at available environments
sort(table(lucca_osm$nodes$tag$k))
#Extract shops and tourism places, which were the largest categories
shop_inds <- which(lucca_osm$nodes$tag$k == "shop")
shops <- lucca_osm$nodes$attrs[shop_inds,]
tour_inds<- which(lucca_osm$nodes$tag$k == "tourism")
tour <- lucca_osm$nodes$attrs[tour_inds,]
shop_coords <- shops[, c("lon", "lat")]
tour_coords <- tour[, c("lon", "lat")]
colnames(tour_coords) <- c("longitude", "latitude")
colnames(shop_coords) <- c("longitude", "latitude")
shop_coords$ID <- lucca_osm$nodes$tag$v[shop_inds]
tour_coords$ID <- lucca_osm$nodes$tag$v[tour_inds]
shop_coords$Type <- "shop"
tour_coords$Type <- "tour"
## Write it out
env <- rbind(tour_coords, shop_coords)
write.csv(env, "lucca_env.csv", row.names = FALSE)
head(envs)
longitude latitude ID Type
1 10.50573 43.84744 information tour
2 10.51380 43.84713 hostel tour
3 10.50314 43.84311 information tour
4 10.50804 43.84867 information tour
5 10.50047 43.84460 information tour
6 10.50318 43.84317 hotel tour
## You have to read the shapefiles locally
shp <- readShapeSpatial("data/lucca_bdry.shp")
roads <- readShapeSpatial("data/lucca_roads.shp")
###
counts <- read.csv("http://stat.cmu.edu/~spew/assets/lucca_pop.csv")
## Microdata from IPUMS-I, but to follow along use the below
data(tartanville)
pums_p <- tartanville$pums_p
pums_h <- tartanville$pums_h
env <- read.csv("http://stat.cmu.edu/~spew/assets/lucca_env.csv")
## Show in a plot
plot(shp, main = "Lucca Syneco!")
lines(roads, col = "blue")
points(env$lon, env$lat, pch = 16, col = "red")
Italy counts region names
[1] "Nord-Ovest"
[2] "Regione Piemonte"
[3] "Regione Autonoma Valle d'Aosta"
[4] "Regione Lombardia"
[5] "Regione Liguria"
[6] "Nord-Est"
[7] "Regione Autonoma Trentino-Alto Adige"
[8] "Regione del Veneto"
[9] "Regione Autonoma Friuli-Venezia Giulia"
[10] "Regione Emilia-Romagna"
[11] "Centro"
[12] "Regione Toscana"
[13] "Regione Umbria"
[14] "Regione Marche"
[15] "Regione Lazio"
[16] "Sud"
[17] "Regione Abruzzo"
[18] "Regione Molise"
[19] "Regione Campania"
[20] "Regione Puglia"
Italy geography region names
[1] "Abruzzo" "Apulia"
[3] "Basilicata" "Calabria"
[5] "Campania" "Emilia-Romagna"
[7] "Friuli-Venezia Giulia" "Lazio"
[9] "Liguria" "Lombardia"
[11] "Marche" "Molise"
[13] "Piemonte" "Sardegna"
[15] "Sicily" "Toscana"
[17] "Trentino-Alto Adige" "Umbria"
[19] "Valle d'Aosta" "Veneto"
# ----------------------------------------------
# MAKE CHANGES HERE ----------------------------
# ----------------------------------------------
count_names <- spew:::remove_words("regione", count_names)
count_names <- spew:::remove_words("autonoma", count_names)
count_names <- spew:::remove_words("della", count_names)
count_names <- spew:::remove_words("del", count_names)
sort(count_names)
# Verify the changes work and write
# the edited files ------------------------------------
shapefile_indices <- amatch(count_names, shapefile_names, method = "jw", maxDist = .3)
stopifnot(!any(is.na(shapefile_indices)))
stopifnot(!any(duplicated(shapefile_indices)))
# Write the "revised" version of the
# pop_table AND shapefile if necessary
pop_table[count_indices, "place_id"] <- shapefile_names[shapefile_indices]
write.table(pop_table,
file = out_pt,
sep = ",",
row.names = FALSE,
qmethod = "double")
shapefile$place_id <- shapefile_names
writeSpatialShape(shapefile, out_shape)
count_names <- spew:::replace_word(word = "laichau",replace ="laichau,dienbien,laocai,andyenbai", names = count_names)
# VIETNAM
count_names <- spew:::replace_word(word = "backan", replace ="backan,andthainguyen", names = count_names)
#
count_names <- spew:::replace_word(word = "hagiang", replace ="hagiang,andtuyenquang", names = count_names)
#
count_names <- spew:::replace_word(word = "ninhbinh", replace ="ninhbinh,hoabinh,hanoi,phutho,vinhphuc,hanam,andnamdinh", names = count
_names)
#
count_names <- spew:::replace_word(word = "haiduong", replace ="haiduong,andhungyen", names = count_names)
#
count_names <- spew:::replace_word(word = "bacgiang", replace ="bacgiang,andbacninh", names = count_names)
#
count_names <- spew:::replace_word(word = "nghean", replace ="nghean,andhatinh", names = count_names)
#
count_names <- spew:::replace_word(word = "daklak", replace ="daklakanddaknong", names = count_names)
#
count_names <- spew:::replace_word(word = "gialai", replace ="gialai,andkontum", names = count_names)
#
count_names <- spew:::replace_word(word = "phuyen", replace ="phuyenandkhanhhoa", names = count_names)
Consider:
## Simple Random Sampling
lucca_syneco <- spewr(counts, shp, pums_h, pums_p, convert_count = TRUE)
nrow(lucca_syneco[[1]]$households)
out <- summarize_syneco(lucca_syneco,
vars_df = data.frame(var_name = c("PERSONS", "INCTOT", "AGE", "SEX"),
pop_type = c("hh", "hh", "p", "p"),
var_type = c("cont", "cont", "cont", "cat"),
stringsAsFactors = FALSE), verbose = FALSE)
head(out$SEX, 3)
place_id 1 2
1 Lucca 2856 627
### Seems surprising...
## But if we look at the input data, we see that the SEX categories are disproportionate
table(pums_p$SEX)
1 2
21294 4469
## Shows that uniform sampling can be unreliable!
lucca <- list(pop_table = counts, shapefile = shp, pums_h = pums_h, pums_p = pums_p,
roads = roads, environments = env)
plot_syneco(lucca, lucca_syneco,
region_name = "Lucca", pretty = TRUE)
shapefile <- list(boundaries = shp, roads = roads)
lucca_roads <- spewr(counts, shapefile,
pums_h, pums_p,
locations_method = "roads", noise = .0001)
plot_syneco(lucca, lucca_roads,
region_name = "Lucca", pretty = TRUE)
left <- 10.50322; right <- 10.51130; top <- 43.84811; bottom <- 43.84449
shop_coords <- env
road_pts <- lucca_roads[[1]]$people
shop_sub <- shop_coords[ which(shop_coords$longitude >= left & shop_coords$longitude <= right & shop_coords$latitude >= bottom & shop_coords$longitude <= top),]
people_sub <- road_pts[ which(road_pts$longitude >= left & road_pts$longitude <= right & road_pts$latitude >= bottom & road_pts$longitude <= top),]
set.seed(42)
shop_loc <- shop_sub[sample(1:nrow(shop_sub), 6),]
assigned_df <- spew:::assign_place_coords(people_sub, shop_loc, method = "uniform")
SPEW uses MPI
data(sd_data)
sd_syneco <- spewr(pop_table = sd_data$pop_table,
shapefile = sd_data$shapefiles$shapefile,
pums_h = sd_data$pums$pums_h,
pums_p = sd_data$pums$pums_p,
locations_method = "uniform",
do_parallel = TRUE)
do_parallel= TRUE
;
4 cores, 222 regions
[1] "Location runs in: 35.8"
do_parallel= FALSE
;
1 core, 222 regions
[1] "Location runs in: 85.6"
saveRDS(lucca_syneco, "lucca_syneco.RDS")