
# Los Angeles 2014: Count Data and Poisson Models -------------------------

# While we initially used quick inla Poisson models for the 2015 data to guide the direction and scale of our covariate effects for the Grid-Mesh Optimisation method, we were later also interested in seeing if the behaviour was similar for the 2014 data, which had been used to guided the maximum possible grid and mesh resolution through the minimum contrast method. The below code generates the necessary count data (at the 200m-by-200m scale only) and interpolates the population and income on to the discretisation grid. Note that while we generate the relevant income variable below (as we wanted to match the methodology for the variable to that of the creation of the second covariate effect for the GMO method and therefore does not match the income variable created within the MinimumContrast_final.R script where census tracts with missing data are assigned zero if the estimate of the total households for that census tract is zero), we assume that the MinumumContrast.R script has already been run, generating the necessary population data set.
# After the necessary data has been generated we implement the inla() function to run simple Poisson GLMs to the count data.

# Author: Nadeen Khaleel

# Setwd and Load Libraries ------------------------------------------------

library("rstudioapi")
# Either setwd() to the source file location, or run the following:
setwd(dirname(getActiveDocumentContext()$path))

library(sp)
library(spatstat)
library(sf)
library(dplyr)
library(zipcode)
library(readr)
library(maptools)


# Los Angeles 2014: Income Set-Up -----------------------------------------
# This section loads up the census tracts and projects them to UTM coordinates in preparation for the creation of the 2014 imputed Average Income data.
load("../../PROCESSED_DATA/SHAPEFILES/CENSUS_TRACTS/LACityCT.rda")

# LA Census Tracts
ct_LA.proj <- lwgeom::st_transform_proj(ct_LA,"epsg:32611")
latlon <- sapply(1:length(ct_LA.proj$geometry),function(i){return(as.numeric(st_centroid(ct_LA.proj$geometry[[i]])))}) # returns a 2xlength(b$geometry) matrix, where each column is for each geometry and row one is longitude and row two is latitude

ct_LA.proj$x <- latlon[1,]
ct_LA.proj$y <- latlon[2,]


# Los Angeles 2014: Average Income (Imputed-GMO) --------------------------
# We will now extract the average income data, similarly to the population data. However, it is important to note that the code for the average income in this R script treats all the missing data the same, even though some of the missingness may be due to zero population estimated within that census tract and no households in particular census tracts. This is done to match the 2015 data set-up for the Grid-Mesh Optimisation method. Originally, the code to generate this data was not in this function and was created elsewhere in a unnecessary R script and, therefore, in order to ensure we have the data necessary for the models below the below code was copid over. (It is very similar to the code to generate the 2015 imputed average income, with the only different being the column numbers for the necessary variables due to a slight change in the data set-up from the ACS.)

# Average Income ##
la_inc_2014 <- read_csv("../../RAW_DATA/COVARIATES/LA/ACSST5Y2014.S1902_2020-07-24T133414/ACSST5Y2014.S1902_data_with_overlays_2020-07-24T133400.csv")

# Sort out the census tract identification for merging, as in the 2015 LA data we had geoid2.
geo2 <- str_extract_all(la_inc_2014$GEO_ID[2:length(la_inc_2014$GEO_ID)],"\\d+")
la_inc_2014$GEO.id2 <- la_inc_2014$GEO_ID
la_inc_2014$GEO.id2[1] <- "id2"
la_inc_2014$GEO.id2[2:length(la_inc_2014$GEO_ID)] <- map_chr(geo2,2)

# Extract the data for the census tracts of interest
LA_ctinc_14_0 <- la_inc_2014[la_inc_2014$GEO.id2%in%ct_LA.proj$GEOID,]

# Assign the relevant (UTM) coordinates
LA_ctinc_14_0$y <- ct_LA.proj[ct_LA.proj$GEOID%in%LA_ctinc_14_0$GEO.id2,]$y
LA_ctinc_14_0$x <- ct_LA.proj[ct_LA.proj$GEOID%in%LA_ctinc_14_0$GEO.id2,]$x

# Take the mean income and margin of error, which according to the metadata is given by S1902_C01_001E/S1902_C01_001M and is in slot 57 and 58, while lat and lon are in slot 112 and 113
LA_ctinc_14 <- LA_ctinc_14_0[,c(1,111,2,57,58,112,113)]
colnames(LA_ctinc_14) <- c("geoid","geoid2","name","inc","err","y","x")
LA_ctinc_14$inc <- as.numeric(LA_ctinc_14$inc) # THERE IS MISSING DATA

# There is some missing data.
sum(is.na(LA_ctinc_14$inc))
# [1] 9
# We will need to impute

# Which census tracts have the missing data?
ind.na <- which(is.na(LA_ctinc_14$inc))
plot(ct_LA.proj$geometry,axes=T,main="Census Tracts with Missing Icnome Data")
ind.na.ct <- match(LA_ctinc_14$geoid2[ind.na],ct_LA.proj$GEOID)
plot(ct_LA.proj[ind.na.ct,]$geometry,add=T,col="red")

# Load the LA County census tract to plot and get a visual representation of the census tracts with the missing data.
load("../../PROCESSED_DATA/SHAPEFILES/CENSUS_TRACTS/LACountyCT.rda")
ct_LA_County.proj <- lwgeom::st_transform_proj(ct_LA_County,"+init=epsg:32611")
ct_LA_Countysp.proj <- as(ct_LA_County.proj,"Spatial")

plot(ct_LA_County.proj$geometry,border="red")
plot(ct_LA.proj[ct_LA.proj$GEOID%in%LA_ctinc_14$geoid2[ind.na],]$geometry,col="red",add=T)
plot(ct_LA.proj$geometry,add=T,border="blue")

# In order to deal with the missing data in the income, I will use the average of each census tracts neighbours to impute the missing data. Therefore we must load in the neighbourhood matrix for Los Angeles county.
load("../../PROCESSED_DATA/SHAPEFILES/CENSUS_TRACTS/LACountyNB.rda")

# Extract only the rows from the neighbourhood matrix for the census tracts that have missing income data.
ct_ind <- match(LA_ctinc_14$geoid2[ind.na],ct_LA_Countysp$GEOID) # which geometries are the missings from
nb_ind <- sapply(1:length(ct_ind),function(i){which(unname(nb[ct_ind[i],]))}) # who are their neighbours
nb_ct <- sapply(1:length(nb_ind),function(i){match(ct_LA_Countysp$GEOID[nb_ind[[i]]],ct_LA_County.proj$GEOID)}) # which geometries on the county scale have we selected
plot(ct_LA_County.proj[unlist(nb_ct),]$geometry,col="magenta",add=T)

# Find the indices for the county with the indices of the income data frame
inc_ind <- sapply(1:length(nb_ct),function(i){match(ct_LA_County.proj$GEOID[nb_ct[[i]]],la_inc_2014$GEO.id2)})
sum(is.na(as.numeric(unlist(la_inc_2014[unique(unlist(inc_ind)),57]))))

# Extract the indices for census tracts with NAs for the average income in the list elements of neighbours for each of the missing data census tracts
cty_na_ind <- sapply(1:length(inc_ind),function(i){inc_ind[[i]][is.na(as.numeric(unlist(la_inc_2014[inc_ind[[i]],6])))]})
# Extract the indices within the county data
cty_na_ct <- sapply(1:length(cty_na_ind),function(i){match(la_inc_2014$GEO.id2[cty_na_ind[[i]]],ct_LA_County.proj$GEOID)})

# These are just to visually inspect missing data between the county and city.
plot(ct_LA_County.proj[unlist(cty_na_ct),]$geometry,col="yellow",border="blue",add=T) # which are NA in county data
plot(ct_LA.proj[ct_LA.proj$GEOID%in%LA_ctinc_14$geoid2[ind.na],]$geometry,col="red",border="yellow",add=T) # but they are the ones also in the missing data within the city..

# Now to impute the average of the neighbours (recall nb_ind is a list for each ind.na)
# nb_inc extracts the average income for each element of the list of neighbours for each of the 10 census tracts within missing data.
nb_inc <- sapply(1:length(nb_ind),function(i){as.numeric(unlist(la_inc_2014[match(ct_LA_Countysp$GEOID[nb_ind[[i]]],la_inc_2014$GEO.id2),6]))})


# Impute the mean, making sure to ignore any missing data.
LA_ctinc_14_imp <- LA_ctinc_14
mean.inc <- lapply(nb_inc,mean,na.rm=T)
LA_ctinc_14_imp$inc[ind.na] <- unlist(mean.inc)

# Check that we no longer have missing data for the average income.
sum(is.na(LA_ctinc_14))
# [1] 9
sum(is.na(LA_ctinc_14_imp))
# [1] 0

saveRDS(LA_ctinc_14_imp,"LA_ctinc_14_imp_proj.rds")
LA_ctinc_14_imp <- readRDS("LA_ctinc_14_imp_proj.rds")


#  Los Angeles 2014: Count Data: Set-Up -----------------------------------


# The finest resolution is in the first value of the number of cells in the x and y direction.
dim.ind <- 1 # want the finest resolution

# Now load the necessary data
# LA Census Tracts
load("../../SHAPEFILES/CENSUS_TRACTS/LACityCT.rda")

# LA Point Patterns
# Load and project to UTM
la_hom <- readRDS("../../PROCESSED_DATA/CRIME/POINT_PATTERN/la_hom.rds")
la_gta <- readRDS("../../PROCESSED_DATA/CRIME/POINT_PATTERN/la_gta.rds")
la_hom_sf <- readRDS("../../PROCESSED_DATA/CRIME/POINT_PATTERN/la_hom_sf.rds")
la_gta_sf <- readRDS("../../PROCESSED_DATA/CRIME/POINT_PATTERN/la_gta_sf.rds")
la_hom_sf.proj <- lwgeom::st_transform_proj(la_hom_sf,"epsg:32611")
la_gta_sf.proj <- lwgeom::st_transform_proj(la_gta_sf,"epsg:32611")

# LA 2014 Socio-economic variables
LA_ctpop_14 <- readRDS("./LA_CTPop_14_proj.rds")
LA_ctinc_14 <- readRDS("./LA_CTInc_14_imp_proj.rds")

# Extract 2014 data
la_hom14 <- la_hom_sf[which(la_hom_sf$Y==2014),]
la_gta14 <- la_gta_sf[which(la_gta_sf$Y==2014),]
la_hom14_sf.proj <- la_hom_sf.proj[which(la_hom_sf.proj$Y==2014),]
la_gta14_sf.proj <- la_gta_sf.proj[which(la_gta_sf.proj$Y==2014),]

# Project to UTM: both the census tracts and their union - which will form a boundary for the city which is used to create the point patterns.
ct_LA.proj <- lwgeom::st_transform_proj(ct_LA,"epsg:32611")
lacity_boundary <- st_union(ct_LA)
lacity_boundary.proj <- lwgeom::st_transform_proj(lacity_boundary,"epsg:32611")
lacity_geom <- as(lacity_boundary,"Spatial")
lacity_geom.proj <- as(lacity_boundary.proj,"Spatial")
lacity_win.proj <- as.owin.SpatialPolygons((lacity_geom.proj))
p4s <- lacity_geom.proj@proj4string



# Los Angeles 2014: Count Data: Set-Up Grids ------------------------------
# We now want to set-up the grids for different cell-widths, namely: 5km, 2km, 1km, 500m and 200m. The grid resolution that we are interested in this instance is the finest resolution, 200m, hence dim.ind <- 1 was assigned above.

# So for approximately 1kmx1km, 2kmx2km  grids etc need following dims
x.range <- diff(lacity_win.proj$xrange)
y.range <- diff(lacity_win.proj$yrange)

grid_cellsxvec <- ceiling(x.range/(1e3*c(0.2,0.5,1,2,5)))
grid_cellsyvec <- ceiling(y.range/(1e3*c(0.2,0.5,1,2,5)))

grid_cellsx <- grid_cellsxvec[dim.ind]
grid_cellsy <- grid_cellsyvec[dim.ind]


# Los Angeles 2014: Count Data: Point Patterns ----------------------------

# Turn the already extracted and projected 2014 point data into spatial objects
la_hom14_sp.proj <- as(la_hom14_sf.proj,"Spatial")
la_gta14_sp.proj <- as(la_gta14_sf.proj,"Spatial")

la_hom14_loc.proj <- la_hom14_sp.proj@coords
colnames(la_hom14_loc.proj) <- c("x","y")
la_gta14_loc.proj <- la_gta14_sp.proj@coords
colnames(la_gta14_loc.proj) <- c("x","y")

# Turn in to ppp object
# Note that for some of these the points are lying slightly outside the window of interest, possibly due to the holes and we do not want to shift their location too much as we are gridded to reasonably fine resolutions, so we will leave the data unaltered. This will be consistent across all grids and is consis tent with the census tract count data.
la_hom14.proj.ppp <- as.ppp(la_hom14_loc.proj,lacity_win.proj) # 1 point lying outside window?
la_gta14.proj.ppp <- as.ppp(la_gta14_loc.proj,lacity_win.proj) # 41 points lying outside window?


# Los Angeles 2014: Count Data: Generate Gridded Counts -------------------
# As wel have already produced the necessary grid cells over the Los Angeles city polygon, we can just load this up, instead of the time consuming re-creation of such a fine resolution quadrats. The load grids were created when generating the gridded count data in preparation for the Grid-Mesh Optimisation method.

load("../../PROCESSED DATA/CRIME/COUNT_DATA_GMO/LA/LAGridCells236359_proj.rda")
cell_areas <- st_area(la_cells)

la_hom14.proj.qcount <- quadratcount(la_hom14.proj.ppp,nx=grid_cellsx,ny=grid_cellsy)
la_gta14.proj.qcount <- quadratcount(la_gta14.proj.ppp,nx=grid_cellsx,ny=grid_cellsy)


# Los Angeles 2014: Count Data: Generate Gridded Variables ----------------


# Calculate the intersection of the grid cells and census tracts and calculate the areas of these intersections
s_intLA <- st_intersection(ct_LA.proj,la_cells)
s_areaLA <- as.numeric(st_area(s_intLA))

# Create a data frame for these intersections, with identifiers for the census tracts and cells
int_areaLA <- data.frame(ct=s_intLA$TRACTCE,cell=s_intLA$ID,area=as.numeric(s_areaLA))
int_areaLA <- int_areaLA[order(int_areaLA$ct),]

# Create a data frame for the census tracts only and calculate the areas of the census tracts
ct_areaLA <- data.frame(ct=ct_LA$TRACTCE,totpoly_area=as.numeric(st_area(ct_LA.proj)))
ct_areaLA <- ct_areaLA[order(ct_areaLA$ct),]

# Add the census tract areas to the intersection data frame and calculate the proportion of the intersected areas with respect to the census tract total areas.
int_areaLA$totpoly_area <- ct_areaLA$totpoly_area[match(int_areaLA$ct,ct_areaLA$ct)]
int_areaLA$proppoly_area <- int_areaLA$area/int_areaLA$totpoly_area

# Create a data frame for the grid cells only and calculate the areas of the grid cells
cell_areaLA <- data.frame(cell=la_cells$ID,totcell_area=as.numeric(cell_areas))
cell_areaLA <- cell_areaLA[order(cell_areaLA$cell),]

# Add the grid cell areas to the intersection data frame and calculate the proportion of the intersected areas with respect to the grid cell total areas.
int_areaLA$totcell_area <- cell_areaLA$totcell_area[match(int_areaLA$cell,cell_areaLA$cell)]
int_areaLA$propcell_area <- int_areaLA$area/int_areaLA$totcell_area

geo2ctLA <- function(x){ct_LA$TRACTCE[which(ct_LA$GEOID==x)]}

# Assign the census tract level socio-economic variable values for each census tract in the intersection data frame
int_areaLA$pop <- LA_ctpop_14$pop[match(int_areaLA$ct,as.character(sapply(1:length(LA_ctpop_14$geoid2),function(i){geo2ctLA(LA_ctpop_14$geoid2[i])})))]
int_areaLA$inc <- LA_ctinc_14$inc[match(int_areaLA$ct,as.character(sapply(1:length(LA_ctinc_14$geoid2),function(i){geo2ctLA(LA_ctinc_14$geoid2[i])})))]

# For each proportion of areas, proportion of the cell or proportion of the census tract polygons, we calculate the product of the proportion with the value of the socio-economic variables for the relevant census tract and grid cell intersection.

int_areaLA$propcell.pop <- int_areaLA$propcell_area*int_areaLA$pop
int_areaLA$proppoly.pop <- int_areaLA$proppoly_area*int_areaLA$pop

int_areaLA$propcell.inc <- int_areaLA$propcell_area*int_areaLA$inc
int_areaLA$proppoly.inc <- int_areaLA$proppoly_area*int_areaLA$inc

# Combine counts with socio-economic variables
# First, calculate the areal weighted sums for the variables: population and average income
# these are both weighted sums using the proportion of the area of the census tract that is intersected with each grid
names(int_areaLA)[c(11,13)]
sumLA2525.a <- aggregate(int_areaLA[, c(11,13)], list(cell=int_areaLA$cell), sum, na.rm = TRUE)
sumLA2525 <- sumLA2525.a


# Los Angeles 2014: Count Data: Final Count Data Set ----------------------
# Place the counts into a data frame with tge grid centres and then include the necessary interpolated socio-economic variables to produce two data frames, one for the homicides in 2014 and one for the motor vehicle thefts in 2014. These will also be turned into sf objects and saved (in addition to the original data frames).

# Generate the count data frames using the counts previously generated.
la_cells <- as_Spatial(la_cells)
la_cells_centre <- t(sapply(la_cells@polygons, function(x){x@Polygons[[1]]@labpt}))

la_hom14.proj.count <- data.frame(x=la_cells_centre[,1],y=la_cells_centre[,2],count=unname(la_hom14.proj.qcount))
la_hom14.proj.count$count.Var1 <- as.numeric(la_hom14.proj.count$count.Var1)
colnames(la_hom14.proj.count) <- c("x","y","cell","count")
la_hom14.proj.count$cellarea <- cell_areas

la_gta14.proj.count <- data.frame(x=la_cells_centre[,1],y=la_cells_centre[,2],count=unname(la_gta14.proj.qcount))
la_gta14.proj.count$count.Var1 <- as.numeric(la_gta14.proj.count$count.Var1)
colnames(la_gta14.proj.count) <- c("x","y","cell","count")
la_gta14.proj.count$cellarea <- cell_areas

# Assign the relevant socio-economic interpolated variables
hom_countdf <- data.frame("cell"=la_hom14.proj.count$cell,"hom"=la_hom14.proj.count$count,"area"=la_hom14.proj.count$cellarea,"x"=la_hom14.proj.count$x,"y"=la_hom14.proj.count$y)
hom_countdf$pop <- sumLA2525$proppoly.pop
hom_countdf$inc <- sumLA2525$proppoly.inc

# Turn homicide data frame into spatial objects, in particular sf 
spdf_homcount <- SpatialPointsDataFrame(coords=hom_countdf[,c("x","y")],data=hom_countdf,proj4string = p4s)
sf_homcount <- st_as_sf(spdf_homcount)


gta_countdf <- data.frame("cell"=la_gta14.proj.count$cell,"gta"=la_gta14.proj.count$count,"area"=la_gta14.proj.count$cellarea,"x"=la_gta14.proj.count$x,"y"=la_gta14.proj.count$y)
gta_countdf$pop <- sumLA2525$proppoly.pop
gta_countdf$inc <- sumLA2525$proppoly.inc

# Turn motor vehicle theft data frame into spatial objects, in particular sf 
spdf_gtacount <- SpatialPointsDataFrame(coords=gta_countdf[,c("x","y")],data=gta_countdf,proj4string = p4s)
sf_gtacount <- st_as_sf(spdf_gtacount)


# Save
save(hom_countdf,gta_countdf,file=paste0("LA2014CT",grid_cellsx,grid_cellsy,"CountData_proj.rda"))
save(sf_homcount,sf_gtacount,file=paste0("LA2014CT",grid_cellsx,grid_cellsy,"SFCountData_proj.rda"))


# Los Angeles 2014: INLA GLM Modelling ------------------------------------
# Here we take the count data generated above and model them using a Poisson GLM with the inla function.

# Load count data
load("./LA2014CT236359CountData_proj.rda") # added in-case the above is run individually from the models

# Homicide Models (without loag(area) offset, with log(area) offset and where area is divided by 10^8 to be on the same scale as the shifted and scaled data that will be used for the GMO).

# Standardise variables
hom_countdf$zpop <- (hom_countdf$pop - mean(hom_countdf$pop))/sd(hom_countdf$pop)
hom_countdf$zinc <- (hom_countdf$inc - mean(hom_countdf$inc))/sd(hom_countdf$inc)

fit.glm <- inla(hom ~ 1+ zpop + zinc,data=hom_countdf,family="poisson")
fit.glm$summary.fixed

# Include log(area) offset.
fit.glm <- inla(hom ~ 1 + offset(log(area)) + zpop + zinc,data=hom_countdf,family="poisson")
fit.glm$summary.fixed

# Include log(area) offset but scale the area so that it matched the final grid cell areas for the shifted and scaled data used in the GMO.
fit.glm <- inla(hom ~ 1 + offset(log(area/1e8)) + zpop + zinc,data=hom_countdf,family="poisson")
fit.glm$summary.fixed


# Motor Vehicle Theft Models (without loag(area) offset, with log(area) offset and where area is divided by 10^8 to be on the same scale as the shifted and scaled data that will be used for the GMO).

# Standardise variables
gta_countdf$zpop <- (gta_countdf$pop - mean(gta_countdf$pop))/sd(gta_countdf$pop)
gta_countdf$zinc <- (gta_countdf$inc - mean(gta_countdf$inc))/sd(gta_countdf$inc)

fit.glm <- inla(gta ~ 1 + zpop + zinc,data=gta_countdf,family="poisson")
fit.glm$summary.fixed

# Include log(area) offset.
fit.glm <- inla(gta ~ 1 + offset(log(area)) + zpop + zinc,data=gta_countdf,family="poisson")
fit.glm$summary.fixed

# Include log(area) offset but scale the area so that it matched the final grid cell areas for the shifted and scaled data used in the GMO.
fit.glm <- inla(gta ~ 1 + offset(log(area/1e8)) + zpop + zinc,data=gta_countdf,family="poisson")
fit.glm$summary.fixed


# sessionInfo() -----------------------------------------------------------

sessionInfo()
