
# Create Gridded Counts for Grid-Mesh Optimisation ------------------------

# This R script takes the Los Angeles crime data as well as the socio-economic variables on the census tract levels and interpolates them on to the required grid resolutions. The crime data is turned into a point pattern and then use quadratcount() to generate the necessary counts. The goal of this output data was to generate the covariates for the Grid-Mesh Optimisation method, using the finest grid here at 200m-by-200m. For the data for the actual modelling using INLA and INLA within MCMC, please see CountDataGen_final.R.

# NOTE: in the below code, the loading of and assignment of socio-economic variables other than total population and average income have been commented out. When run these were included for my data, however as they are not used for any modelling, they are not necessary and the only variables of interest would be population and income. Therefore, while the lines of code are kept in order to maintain all code used to generate the final count data used, they are commented out and can be un-commented as needed.

# Author: Nadeen Khaleel

# Setwd and Load Libraries ------------------------------------------------

library("rstudioapi")
# Either setwd() to the source file location, or run the following:
setwd(dirname(getActiveDocumentContext()$path))

library(sp)
library(spatstat)
library(sf)
library(dplyr)
library(readr)
library(maptools)

dim.ind <- 1 # 1 2 3 4 5 # which grid cell dimensions do we want? 200mx200m (1) 500mx500m (2) 1kmx1km (3) or 2kmx2km (4) or 5kmx5km (5)

# Los Angeles -------------------------------------------------------------
# We want to create gridded count data for the Los Angeles homicides and motor vehicle thefts where the aim of the data create in this R script is to generate covariates for the Grid-Mesh Optimisation method.

# Los Angeles: Set-Up -----------------------------------------------------
# Change the working directory as we want to save the data in separate folders for each city. We then load the census tract data produced in DATA/RAW_DATA/SHAPEFILES/CENSUS_TRACTS and project these to UTM coordinates. We also load the spatial point patterns for the two crimes and the socio-economic variables where we project the points of homicide and motor vehicle thefts.

# LA Data
setwd("./LA")

# LA Census Tracts
load("../../../SHAPEFILES/CENSUS_TRACTS/LACityCT.rda")

# LA Point Patterns
# Load and project to UTM 
la_hom <- readRDS("../../POINT_PATTERN/la_hom.rds")
la_gta <- readRDS("../../POINT_PATTERN/la_gta.rds")
la_hom_sf <- readRDS("../../POINT_PATTERN/la_hom_sf.rds")
la_gta_sf <- readRDS("../../POINT_PATTERN/la_gta_sf.rds")
# la_hom_sf.proj <- lwgeom::st_transform_proj(la_hom_sf,"+init=epsg:32611") # originally run code, however with newer package, need to use the line below, without '+init='
# la_gta_sf.proj <- lwgeom::st_transform_proj(la_gta_sf,"+init=epsg:32611") # originally run code, however with newer package, need to use the line below, without '+init='
la_hom_sf.proj <- lwgeom::st_transform_proj(la_hom_sf,"epsg:32611")
la_gta_sf.proj <- lwgeom::st_transform_proj(la_gta_sf,"epsg:32611")

# LA Socio-economic Variables
LA_ctpop_15 <- readRDS("../../../COVARIATES/LA_CTPop_15_proj.rds")
LA_ctinc_15 <- readRDS("../../../COVARIATES/LA_CTInc_15_imp_proj.rds") # use the imputed average income where all missing data is treated the same, regardless of whether there is an estimated total household = 0 for some census tracts.
# LA_ctsex_15 <- readRDS("../../../COVARIATES/LA_CTSex_15_proj.rds")
# LA_ctage_15 <- readRDS("../../../COVARIATES/LA_CTAge_15_proj.rds")
# LA_ctocc_15 <- readRDS("../../../COVARIATES/LA_CTOcc_15_proj.rds")
# LA_ctfood_15 <- readRDS("../../../COVARIATES/LA_CTFood_15_proj.rds")


# Project to UTM: both the census tracts and their union - which will form a boundary for the city.
# ct_LA.proj <- lwgeom::st_transform_proj(ct_LA,"+init=epsg:32611") # older package version
ct_LA.proj <- lwgeom::st_transform_proj(ct_LA,"epsg:32611")
lacity_boundary <- st_union(ct_LA)
# lacity_boundary.proj <- lwgeom::st_transform_proj(lacity_boundary,"+init=epsg:32611") # older package version
lacity_boundary.proj <- lwgeom::st_transform_proj(lacity_boundary,"epsg:32611")
lacity_geom <- as(lacity_boundary,"Spatial")
lacity_geom.proj <- as(lacity_boundary.proj,"Spatial")
lacity_win <- as.owin.SpatialPolygons((lacity_geom))
lacity_win.proj <- as.owin.SpatialPolygons((lacity_geom.proj))


# Los Angeles: Set-Up Grids -----------------------------------------------
# We now want to set-up the grids for different cell-widths, namely: 5km, 2km, 1km, 500m and 200m. The important output from this would the 200m-by-200m grid resolution as this is used to generate the covariates for the Grid-Mesh Optimisation method for the LA polygon, with the code found in GridMeshOptimIrreg*_final.R. In this section we just ensure that we have the required grid values to use quadratcount() later for the point patterns.

diff(lacity_win.proj$yrange)
# [1] 71787.39
diff(lacity_win.proj$xrange)
# [1] 47170.91

# So for approximately 1kmx1km, 2kmx2km  grids etc need following dims
x.range <- diff(lacity_win.proj$xrange)
y.range <- diff(lacity_win.proj$yrange)

grid_cellsxvec <- ceiling(x.range/(1e3*c(0.2,0.5,1,2,5)))
grid_cellsyvec <- ceiling(y.range/(1e3*c(0.2,0.5,1,2,5)))

grid_cellsx <- grid_cellsxvec[dim.ind]
grid_cellsy <- grid_cellsyvec[dim.ind]


# Los-Angeles: Create Crime Point Pattern Data ----------------------------
# Here we turn the location data for homicides and motor vehicle thefts into point patterns, object of type ppp, in order to use quadratcount() to later generate the count data over the grids.

# Extract 2015 data
la_hom15_sf.proj <- la_hom_sf.proj[which(la_hom_sf.proj$Y==2015),]
la_gta15_sf.proj <- la_gta_sf.proj[which(la_gta_sf.proj$Y==2015),]

la_hom15_sp.proj <- as(la_hom15_sf.proj,"Spatial")
la_gta15_sp.proj <- as(la_gta15_sf.proj,"Spatial")

la_hom15_loc.proj <- la_hom15_sp.proj@coords
colnames(la_hom15_loc.proj) <- c("x","y")
la_gta15_loc.proj <- la_gta15_sp.proj@coords
colnames(la_gta15_loc.proj) <- c("x","y")

# Turn in to ppp object
# Note that for some of these the points are lying slightly outside the window of interest, possibly due to the holes and we do not want to shift their location too much as we are gridded to reasonably fine resolutions, so we will leave the data unaltered. This will be consistent across all grids. This is also consistent with the census tract count data.
la_hom15.proj.ppp <- as.ppp(la_hom15_loc.proj,lacity_win.proj) # 1 point lying outside window?
la_gta15.proj.ppp <- as.ppp(la_gta15_loc.proj,lacity_win.proj) # 35 points lying outside window?


# Los Angeles: Generate Grids ---------------------------------------------
# We now use the grid dimensions calculated above to create the grid cells using quadrats() and store these as spatial objects.

la_cells <- quadrats(lacity_win.proj,grid_cellsx,grid_cellsy)
la_cells <- as(la_cells,"SpatialPolygons")
la_cells_centre <- t(sapply(la_cells@polygons, function(x){x@Polygons[[1]]@labpt}))
la_cells <- st_as_sf(la_cells)
la_cells$ID <- 1:length(la_cells$geometry)
p4s <- lacity_geom.proj@proj4string
la_cells <- st_set_crs(la_cells,p4s@projargs)

# Save
save(la_cells,file=paste0("LAGridCells",grid_cellsx,grid_cellsy,"_proj.rda"))

# Calculate cell areas for data frame and interpolation.
cell_areas <- st_area(la_cells)


# Los Angeles: Generate Gridded Count Data --------------------------------
# We now use the grid dimensions with quadratcount() to generate the count data sets with centroids provided from the above calculations. We include these locations as well as the counts and cell areas and an identifier for the cell for the interpolation of the socio-economic variables.

la_hom15.proj.qcount <- quadratcount(la_hom15.proj.ppp,nx=grid_cellsx,ny=grid_cellsy)
la_gta15.proj.qcount <- quadratcount(la_gta15.proj.ppp,nx=grid_cellsx,ny=grid_cellsy)


la_hom15.proj.count <- data.frame(x=la_cells_centre[,1],y=la_cells_centre[,2],count=unname(la_hom15.proj.qcount))
la_hom15.proj.count$count.Var1 <- as.numeric(la_hom15.proj.count$count.Var1)
colnames(la_hom15.proj.count) <- c("x","y","cell","count")
la_hom15.proj.count$cellarea <- cell_areas

la_gta15.proj.count <- data.frame(x=la_cells_centre[,1],y=la_cells_centre[,2],count=unname(la_gta15.proj.qcount))
la_gta15.proj.count$count.Var1 <- as.numeric(la_gta15.proj.count$count.Var1)
colnames(la_gta15.proj.count) <- c("x","y","cell","count")
la_gta15.proj.count$cellarea <- cell_areas


# Los Angeles: Socio-economic Variable Generation -------------------------
# We will now use the intersections between the grid cells and census tracts to calculate weighted sums for the interpolation of the socio-economic variables.

# Calculate the intersection of the grid cells and census tracts and calculate the areas of these intersections
s_intLA <- st_intersection(ct_LA.proj,la_cells)
s_areaLA <- as.numeric(st_area(s_intLA))

# Create a data frame for these intersections, with identifiers for the census tracts and cells
int_areaLA <- data.frame(ct=s_intLA$TRACTCE,cell=s_intLA$ID,area=as.numeric(s_areaLA))
int_areaLA <- int_areaLA[order(int_areaLA$ct),]

# Create a data frame for the census tracts only and calculate the areas of the census tracts
ct_areaLA <- data.frame(ct=ct_LA$TRACTCE,totpoly_area=as.numeric(st_area(ct_LA.proj)))
ct_areaLA <- ct_areaLA[order(ct_areaLA$ct),]

# Add the census tract areas to the intersection data frame and calculate the proportion of the intersected areas with respect to the census tract total areas.
int_areaLA$totpoly_area <- ct_areaLA$totpoly_area[match(int_areaLA$ct,ct_areaLA$ct)]
int_areaLA$proppoly_area <- int_areaLA$area/int_areaLA$totpoly_area

# Create a data frame for the grid cells only and calculate the areas of the grid cells
cell_areaLA <- data.frame(cell=la_cells$ID,totcell_area=as.numeric(cell_areas))
cell_areaLA <- cell_areaLA[order(cell_areaLA$cell),]

# Add the grid cell areas to the intersection data frame and calculate the proportion of the intersected areas with respect to the grid cell total areas.
int_areaLA$totcell_area <- cell_areaLA$totcell_area[match(int_areaLA$cell,cell_areaLA$cell)]
int_areaLA$propcell_area <- int_areaLA$area/int_areaLA$totcell_area


geo2ctLA <- function(x){ct_LA$TRACTCE[which(ct_LA$GEOID==x)]}

# Assign the census tract level socio-economic variable values for each census tract in the intersection data frame
int_areaLA$pop <- LA_ctpop_15$pop[match(int_areaLA$ct,as.character(sapply(1:length(LA_ctpop_15$geoid2),function(i){geo2ctLA(LA_ctpop_15$geoid2[i])})))]
int_areaLA$inc <- LA_ctinc_15$inc[match(int_areaLA$ct,as.character(sapply(1:length(LA_ctinc_15$geoid2),function(i){geo2ctLA(LA_ctinc_15$geoid2[i])})))]
# int_areaLA$msex <- LA_ctsex_15$mprop[match(int_areaLA$ct,as.character(sapply(1:length(LA_ctsex_15$geoid2),function(i){geo2ctLA(LA_ctsex_15$geoid2[i])})))]
# int_areaLA$fsex <- LA_ctsex_15$fprop[match(int_areaLA$ct,as.character(sapply(1:length(LA_ctsex_15$geoid2),function(i){geo2ctLA(LA_ctsex_15$geoid2[i])})))]
# int_areaLA$food <- LA_ctfood_15$proprec[match(int_areaLA$ct,as.character(sapply(1:length(LA_ctfood_15$geoid2),function(i){geo2ctLA(LA_ctfood_15$geoid2[i])})))]
# int_areaLA$own <- LA_ctocc_15$ownprop[match(int_areaLA$ct,as.character(sapply(1:length(LA_ctocc_15$geoid2),function(i){geo2ctLA(LA_ctocc_15$geoid2[i])})))]
# int_areaLA$rent <- LA_ctocc_15$rentprop[match(int_areaLA$ct,as.character(sapply(1:length(LA_ctocc_15$geoid2),function(i){geo2ctLA(LA_ctocc_15$geoid2[i])})))]

# For each proportion of areas, proportion of the cell or proportion of the census tract polygons, we calculate the product of the proportion with the value of the socio-economic variables for the relevant census tract and grid cell intersection.

int_areaLA$propcell.pop <- int_areaLA$propcell_area*int_areaLA$pop
int_areaLA$proppoly.pop <- int_areaLA$proppoly_area*int_areaLA$pop

int_areaLA$propcell.inc <- int_areaLA$propcell_area*int_areaLA$inc
int_areaLA$proppoly.inc <- int_areaLA$proppoly_area*int_areaLA$inc

# int_areaLA$propcell.msex <- int_areaLA$propcell_area*int_areaLA$msex
# int_areaLA$proppoly.msex <- int_areaLA$proppoly_area*int_areaLA$msex
# 
# int_areaLA$propcell.fsex <- int_areaLA$propcell_area*int_areaLA$fsex
# int_areaLA$proppoly.fsex <- int_areaLA$proppoly_area*int_areaLA$fsex
# 
# int_areaLA$propcell.food <- int_areaLA$propcell_area*int_areaLA$food
# int_areaLA$proppoly.food <- int_areaLA$proppoly_area*int_areaLA$food
# 
# int_areaLA$propcell.own <- int_areaLA$propcell_area*int_areaLA$own
# int_areaLA$proppoly.own <- int_areaLA$proppoly_area*int_areaLA$own
# 
# int_areaLA$propcell.rent <- int_areaLA$propcell_area*int_areaLA$rent
# int_areaLA$proppoly.rent <- int_areaLA$proppoly_area*int_areaLA$rent


int_areaLA$cellx <-  la_cells_centre[int_areaLA$cell,1]
int_areaLA$celly <-  la_cells_centre[int_areaLA$cell,2]

# Save
saveRDS(int_areaLA,file=paste0("LA_Cov_",grid_cellsx,grid_cellsy,"_proj.rds"))

# Combine counts with socio-economic variables
# First, calculate the areal weighted sums for the variables
# a. population and average income
# these are both weighted sums using the proportion of the area of the census tract that is intersected with each grid
sumLA2525.a <- aggregate(int_areaLA[, c(16,18)], list(cell=int_areaLA$cell), sum, na.rm = TRUE)

# This new line of code is to create sumLA2525 where, previously, it was created by combining sumLA2525.a and sumLA2525.b, however as the only socio-economic variables considered are total population and average income we have commented out the code to include the other variables, although they remain in this R script as it was the original code to create the data sets. If further variables are used, the code for sumLA2525.b and sumLA2525 where combines sumLA2525.a nd sumLA2525.b can be uncommented, and the following line of code must be commented.
sumLA2525 <- sumLA2525.a

# b. all remaining socio-economic variables of interest
# these are all weighted sums using the proportion of the area of the grid cell that is intersected with each census tract
# sumLA2525.b <- aggregate(int_areaLA[, c(19,21,23,25,27)],list(cell=int_areaLA$cell), sum, na.rm = TRUE)
# sumLA2525 <- merge(sumLA2525.a,sumLA2525.b,by="cell",all=TRUE)


# Los Angeles: Generate Final Count Data Sets -----------------------------
# Merge the counts for each crime with the interpolated variables

hom_countdf <- data.frame("cell"=la_hom15.proj.count$cell,"hom"=la_hom15.proj.count$count,"area"=la_hom15.proj.count$cellarea,"x"=la_hom15.proj.count$x,"y"=la_hom15.proj.count$y)
hom_countdf$pop <- sumLA2525$proppoly.pop
hom_countdf$inc <- sumLA2525$proppoly.inc
# hom_countdf$msex <- sumLA2525$propcell.msex
# hom_countdf$fsex <- sumLA2525$propcell.fsex
# hom_countdf$food <- sumLA2525$propcell.food
# hom_countdf$own <- sumLA2525$propcell.own
# hom_countdf$rent <- sumLA2525$propcell.rent

# Turn homicide data frame into spatial objects, in particular sf 
spdf_homcount <- SpatialPointsDataFrame(coords=hom_countdf[,c("x","y")],data=hom_countdf,proj4string = p4s)
sf_homcount <- st_as_sf(spdf_homcount)


gta_countdf <- data.frame("cell"=la_gta15.proj.count$cell,"gta"=la_gta15.proj.count$count,"area"=la_gta15.proj.count$cellarea,"x"=la_gta15.proj.count$x,"y"=la_gta15.proj.count$y)
gta_countdf$pop <- sumLA2525$proppoly.pop
gta_countdf$inc <- sumLA2525$proppoly.inc
# gta_countdf$msex <- sumLA2525$propcell.msex
# gta_countdf$fsex <- sumLA2525$propcell.fsex
# gta_countdf$food <- sumLA2525$propcell.food
# gta_countdf$own <- sumLA2525$propcell.own
# gta_countdf$rent <- sumLA2525$propcell.rent

# Turn motor vehicle theft data frame into spatial objects, in particular sf 
spdf_gtacount <- SpatialPointsDataFrame(coords=gta_countdf[,c("x","y")],data=gta_countdf,proj4string = p4s)
sf_gtacount <- st_as_sf(spdf_gtacount)

# Save
save(hom_countdf,gta_countdf,file=paste0("LA2015CT",grid_cellsx,grid_cellsy,"CountData_proj.rda"))
save(sf_homcount,sf_gtacount,file=paste0("LA2015CT",grid_cellsx,grid_cellsy,"SFCountData_proj.rda"))



# sessionInfo() -----------------------------------------------------------

sessionInfo()
 