
# Census Tract Covariate Data Generation ----------------------------------

# In this R script we take the ACS data over the census tracts in the relevant counties and extract the values of the census tracts for the cities only. We additionally have to deal with any missing data.
# The output data will then be saved in DATA/PROCESSED_DATA/COVARIATES.
# Important note: the average income data set created in this R script is only used within the Grid-Mesh Optimisation method implementation for the Los Angeles polygon in order to create one of the two covariates used. For the average income used within the modelling of the crime data itself, please see CovDatGen_Inc_final.R in the same directory as this R script.

# Note: the data from the ACS may be updated, especially with respect to the Los Angeles 2015, which was originally accessed through the American FactFinder (which was later retired), and so newly accessed data may have different column set-ups, more like the New York and Portland data, for reference.

# Author: Nadeen Khaleel


# Setwd and Load Libraries ------------------------------------------------

library("rstudioapi")
# Make the folder holding this script the working directory so that the
# relative paths used below resolve. Alternatively, skip the next line and
# setwd() to the source file location by hand.
setwd(dirname(getActiveDocumentContext()[["path"]]))

library(dplyr)
library(readr)
library(revgeo)
library(sf)
library(lwgeom)
library(stringr)
library(purrr)
library(sp)


# Los Angeles -------------------------------------------------------------
# In this section we use the census tract shapefiles in combination with the downloaded socio-economic variables to extract the values of the variables over the required census tracts. If there is missing data we will use the neighbourhood matrices to impute the missing data using the average values of the neighbouring census tracts (within both the city and the county itself).


# Los Angeles: Set-Up -----------------------------------------------------
# Change the working directory as we want to save the data in separate folders for each city. We then load the census tract data produced in DATA/RAW_DATA/SHAPEFILES/CENSUS_TRACTS and project these to UTM coordinates. These census tracts are saved in the DATA/PROCESSED_DATA/SHAPEFILES/CENSUS_TRACTS directory and we will access them through this file path.

# LA Data
# Relative paths below are resolved against the LA folder until the working
# directory is reset at the end of the Los Angeles section.
setwd("./LA")

# LA Census Tracts
# NOTE(review): presumably this .rda provides the `ct_LA` object used below -
# confirm against the script that created it.
load("../../../PROCESSED_DATA/SHAPEFILES/CENSUS_TRACTS/LACityCT.rda")

# Project to UTM
# ct_LA.proj <- lwgeom::st_transform_proj(ct_LA,"+init=epsg:32611") # originally run code, however with newer package, need to use the line below, without '+init='
ct_LA.proj <- lwgeom::st_transform_proj(ct_LA, "epsg:32611")

# Centroid of every census tract, computed on the projected geometries.
# vapply() guarantees a 2 x (number of tracts) numeric matrix, also when there
# is only one tract: each column is one geometry, row one is the x coordinate
# (easting) and row two is the y coordinate (northing). Despite its name,
# `latlon` therefore holds projected coordinates, not longitude/latitude.
latlon <- vapply(
  seq_along(ct_LA.proj$geometry),
  function(i) as.numeric(st_centroid(ct_LA.proj$geometry[[i]])),
  numeric(2)
)

ct_LA.proj$x <- latlon[1, ]
ct_LA.proj$y <- latlon[2, ]


# Los Angeles: Population -------------------------------------------------
# We will now extract the necessary population data and check for missing values before saving the final data set. The ACS data contains estimates for the population as well as errors for these estimates, so we will store these as well.

fiveyear_15 <- read_csv("./ACS_15_5YR_B01003/ACS_15_5YR_B01003_with_ann.csv")

# Extract the data for the census tracts of interest
LA_ctpop_15_0 <- fiveyear_15[fiveyear_15$GEO.id2 %in% ct_LA.proj$GEOID, ]

# Assign the relevant (UTM) coordinates.
# match() finds, for every ACS row, the row of the same tract in the shapefile,
# so each centroid stays attached to the right tract even if the ACS table and
# the shapefile list the tracts in a different order.
pop_ct_row <- match(LA_ctpop_15_0$GEO.id2, ct_LA.proj$GEOID)
LA_ctpop_15_0$y <- ct_LA.proj$y[pop_ct_row]
LA_ctpop_15_0$x <- ct_LA.proj$x[pop_ct_row]

# These columns were selected by checking the necessary column names in the metadata
LA_ctpop_15 <- LA_ctpop_15_0[, 1:7]
colnames(LA_ctpop_15) <- c("geoid", "geoid2", "name", "pop", "err", "y", "x")
LA_ctpop_15$pop <- as.numeric(LA_ctpop_15$pop)
# No missing data (usually, the as.numeric would have picked up on these and produce a warning)

# Save
saveRDS(LA_ctpop_15, "LA_CTPop_15_proj.rds")
# LA_ctpop_15 <- readRDS("LA_CTPop_15_proj.rds")


# Los Angeles: Average Income (Imputed-GMO) -------------------------------
# We will now extract the average income data, similarly to the population data. However, it is important to note that the code for the average income in this R script treats all the missing data the same, even though some of the missingness may be due to zero population estimated within that census tract and no households in particular census tracts. This imputed data is further used within the Grid-Mesh Optimisation implementation for Los Angeles, but not for the final models. For the Average Income data for the modelling, we look to CovDataGen_Inc_final.R where we assign zero average income to census tracts with an estimated zero total households.

fiveyearinc_15 <- read_csv("./ACS_15_5YR_S1902/ACS_15_5YR_S1902_with_ann.csv")

# Extract the data for the census tracts of interest
LA_ctinc_15_0 <- fiveyearinc_15[fiveyearinc_15$GEO.id2 %in% ct_LA.proj$GEOID, ]

# Assign the relevant (UTM) coordinates.
# match() finds, for every ACS row, the row of the same tract in the shapefile,
# so each centroid stays attached to the right tract even if the ACS table and
# the shapefile list the tracts in a different order.
inc_ct_row <- match(LA_ctinc_15_0$GEO.id2, ct_LA.proj$GEOID)
LA_ctinc_15_0$y <- ct_LA.proj$y[inc_ct_row]
LA_ctinc_15_0$x <- ct_LA.proj$x[inc_ct_row]

# Take the mean income and margin of error, which according to the metadata is given by HCO2_EST/MOE_VCO2 and is in slot 6 and 7, while the y and x coordinates added above are in slot 112 and 113
LA_ctinc_15 <- LA_ctinc_15_0[, c(1, 2, 3, 6, 7, 112, 113)]
colnames(LA_ctinc_15) <- c("geoid", "geoid2", "name", "inc", "err", "y", "x")
# Entries that are not numbers become NA here (with a coercion warning).
LA_ctinc_15$inc <- as.numeric(LA_ctinc_15$inc)

# There is some missing data.
sum(is.na(LA_ctinc_15$inc))
# [1] 10
# We will need to impute.

# Which census tracts have the missing data?
# ind.na indexes rows of LA_ctinc_15; ind.na.ct indexes the same tracts as rows
# of the city shapefile.
ind.na <- which(is.na(LA_ctinc_15$inc))
plot(ct_LA.proj$geometry, axes = TRUE, main = "Census Tracts with Missing Income Data")
ind.na.ct <- match(LA_ctinc_15$geoid2[ind.na], ct_LA.proj$GEOID)
plot(ct_LA.proj[ind.na.ct, ]$geometry, add = TRUE, col = "red")

# Load the LA County census tract to plot and get a visual representation of the census tracts with the missing data.
# NOTE(review): presumably this .rda provides `ct_LA_County` - confirm.
load("../../../PROCESSED_DATA/SHAPEFILES/CENSUS_TRACTS/LACountyCT.rda")
# ct_LA_County.proj <- lwgeom::st_transform_proj(ct_LA_County,"+init=epsg:32611") # older package version
ct_LA_County.proj <- lwgeom::st_transform_proj(ct_LA_County, "epsg:32611")
# sp ("Spatial") copy of the projected county tracts.
ct_LA_Countysp.proj <- as(ct_LA_County.proj, "Spatial")

# County tracts outlined in red, city tracts with missing income filled in red,
# all city tracts outlined in blue on top.
plot(ct_LA_County.proj$geometry, border = "red")
plot(ct_LA.proj[ct_LA.proj$GEOID %in% LA_ctinc_15$geoid2[ind.na], ]$geometry, col = "red", add = TRUE)
plot(ct_LA.proj$geometry, add = TRUE, border = "blue")

# In order to deal with the missing data in the income, I will use the average of each census tracts neighbours to impute the missing data. Therefore we must load in the neighbourhood matrix for Los Angeles county.
# NOTE(review): `nb` and `ct_LA_Countysp` are not created in this script;
# presumably LACountyNB.rda provides them - confirm.
load("../../../PROCESSED_DATA/SHAPEFILES/CENSUS_TRACTS/LACountyNB.rda")

# Extract only the rows from the neighbourhood matrix for the census tracts that have missing income data.
# lapply() is used throughout so that every object below is a list with one
# element per missing tract, whatever the number of neighbours (sapply() would
# collapse to a matrix when all tracts have the same number of neighbours).
ct_ind <- match(LA_ctinc_15$geoid2[ind.na], ct_LA_Countysp$GEOID) # which geometries are the missings from
nb_ind <- lapply(ct_ind, function(ct_row) which(unname(nb[ct_row, ]))) # who are their neighbours
nb_ct <- lapply(nb_ind, function(nb_rows) {
  match(ct_LA_Countysp$GEOID[nb_rows], ct_LA_County.proj$GEOID)
}) # which geometries on the county scale have we selected

# Visually inspect that we have the correct census tracts
plot(ct_LA_County.proj[unlist(nb_ct), ]$geometry, col = "magenta", add = TRUE)

# Find the indices for the county with the indices of the income data frame
inc_ind <- lapply(nb_ct, function(cty_rows) {
  match(ct_LA_County.proj$GEOID[cty_rows], fiveyearinc_15$GEO.id2)
})
# Count the neighbours whose own average income is missing. The average income
# is in column 6 of the Los Angeles table, the same column used for the tracts
# themselves.
sum(is.na(as.numeric(unlist(fiveyearinc_15[unique(unlist(inc_ind)), 6])))) # Have NAs, neighbours

# Extract the indices for census tracts with NAs for the average income in the list elements of neighbours for each of the missing data census tracts
cty_na_ind <- lapply(inc_ind, function(acs_rows) {
  acs_rows[is.na(as.numeric(unlist(fiveyearinc_15[acs_rows, 6])))]
})
# Extract the indices within the county data
cty_na_ct <- lapply(cty_na_ind, function(acs_rows) {
  match(fiveyearinc_15$GEO.id2[acs_rows], ct_LA_County.proj$GEOID)
})

# These are just to visually inspect missing data between the county and city.
plot(ct_LA_County.proj[unlist(cty_na_ct), ]$geometry, col = "yellow", border = "blue", add = TRUE) # which are NA in county data
plot(ct_LA.proj[ct_LA.proj$GEOID %in% LA_ctinc_15$geoid2[ind.na], ]$geometry, col = "red", border = "yellow", add = TRUE) # but they are the ones also in the missing data within the city..

# Now to impute the average of the neighbours (recall nb_ind is a list for each ind.na, but some are neighbours of others)
# nb_inc extracts the average income (column 6 of the ACS table) for each element of the list of neighbours for each of the census tracts with missing data.
# lapply() keeps one list element per tract whatever the number of neighbours.
nb_inc <- lapply(nb_ind, function(nb_rows) {
  acs_rows <- match(ct_LA_Countysp$GEOID[nb_rows], fiveyearinc_15$GEO.id2)
  as.numeric(unlist(fiveyearinc_15[acs_rows, 6]))
})

# Impute the mean, making sure to ignore the missing data.
LA_ctinc_15_imp <- LA_ctinc_15
mean.inc <- lapply(nb_inc, mean, na.rm = TRUE)
LA_ctinc_15_imp$inc[ind.na] <- unlist(mean.inc)

# A tract whose neighbours are all missing (or that has no neighbours) gets a
# NaN mean; flag this rather than saving it unnoticed.
if (anyNA(LA_ctinc_15_imp$inc)) {
  warning("Some census tracts still have no average income after imputation.",
          call. = FALSE)
}

# Check that we no longer have missing data for the average income.
sum(is.na(LA_ctinc_15))
# [1] 10
sum(is.na(LA_ctinc_15_imp))
# [1] 0

# Save
saveRDS(LA_ctinc_15_imp, "LA_CTInc_15_imp_proj.rds")
# LA_ctinc_15 <- readRDS("LA_CTInc_15_imp_proj.rds")



# # Los Angeles: Sex and Age ------------------------------------------------
# # We will now extract the data on the sex and age of the population within each census tract. We will first begin by considering the data available on the Sex of the population before moving on to age.
# 
# fiveyearagesex_15 <- read_csv("./ACS_15_5YR_S0101/ACS_15_5YR_S0101_with_ann.csv")
# 
# # Sex
# # Extract the data for the census tracts of interest
# LA_ctsex_15_0 <- fiveyearagesex_15[fiveyearagesex_15$GEO.id2%in%ct_LA.proj$GEOID,]
# 
# # Assign the relevant (UTM) coordinates
# LA_ctsex_15_0$y <- ct_LA.proj[ct_LA.proj$GEOID%in%LA_ctsex_15_0$GEO.id2,]$y
# LA_ctsex_15_0$x <- ct_LA.proj[ct_LA.proj$GEOID%in%LA_ctsex_15_0$GEO.id2,]$x
# 
# # Take the following rows:
# # - total pop (margin of error), which according to the metadata is given by HCO1_EST(MOE)_VCO2 and is in slot 4 and 5;
# # - total male pop (margin of error), which according to the metadata is given by HCO2_EST(MOE)_VCO2 and is in slot 6 and 7;
# # - total female pop (margin of error), which according to the metadata is given by HCO1_EST(MOE)_VCO2 and is in slot 8 and 9;
# # - while lat and lon are in slot 220 and 221
# LA_ctsex_15 <- LA_ctsex_15_0[,c(1,2,3,4,5,6,7,8,9,220,221)]
# colnames(LA_ctsex_15) <- c("geoid","geoid2","name","totp","totperr","mtotp","mtotperr","ftotp","ftotperr","y","x")
# LA_ctsex_15[,4:9] <- sapply(LA_ctsex_15[,4:9],as.numeric)
# 
# # Calculate proportions
# LA_ctsex_15$mprop <- LA_ctsex_15$mtotp/LA_ctsex_15$totp
# LA_ctsex_15$fprop <- LA_ctsex_15$ftotp/LA_ctsex_15$totp
# 
# # Missing data: this could be due to the calculation of the proportions where the denominator for the total population is 0.
# # NAs produced for the proportions where the total population is 0, 11 CTs: let us compare and check.
# sum(is.na(LA_ctsex_15$mprop))
# # [1] 4
# sum(is.na(LA_ctsex_15$fprop))
# # [1] 4
# sum(LA_ctsex_15$totp==0)
# # [1] 4
# sum(LA_ctsex_15$mtotp==0)
# # [1] 5
# sum(LA_ctsex_15$ftotp==0)
# # [1] 5
# sum(LA_ctsex_15$fprop==1,na.rm=T)
# # [1] 1
# sum(LA_ctsex_15$mprop==1,na.rm=T)
# # [1] 1
# which(LA_ctsex_15$mtotp==0)
# # [1] 988 993 994 995 999
# which(LA_ctsex_15$ftotp==0)
# # [1] 990 993 994 995 999
# which(LA_ctsex_15$totp==0)
# # [1] 993 994 995 999
# LA_ctsex_15[988,]$fprop
# # [1] 1
# LA_ctsex_15[990,]$mprop
# # [1] 1
# # Set these values to 0!
# which(LA_ctsex_15$totp==0)
# # [1] 993 994 995 999
# which(is.na(LA_ctsex_15$mprop))
# # [1] 993 994 995 999
# which(is.na(LA_ctsex_15$fprop))
# # [1] 993 994 995 999
# 
# # Okay, so with no population set the values for these missing data to 0.
# LA_ctsex_15$fprop[which(LA_ctsex_15$totp==0)] <- 0
# LA_ctsex_15$mprop[which(LA_ctsex_15$totp==0)] <- 0
# 
# # Check we have removed all missing data.
# sum(is.na(LA_ctsex_15))
# # [1] 0
# 
# # Save
# saveRDS(LA_ctsex_15,"LA_CTSex_15_proj.rds")
# # LA_ctsex_15 <- readRDS("LA_CTSex_15_proj.rds")
# 
# 
# # Age
# # Extract the data for the census tracts of interest
# LA_ctage_15_0 <- fiveyearagesex_15[fiveyearagesex_15$GEO.id2%in%ct_LA.proj$GEOID,]
# 
# # Assign the relevant (UTM) coordinates
# LA_ctage_15_0$y <- ct_LA.proj[ct_LA.proj$GEOID%in%LA_ctage_15_0$GEO.id2,]$y
# LA_ctage_15_0$x <- ct_LA.proj[ct_LA.proj$GEOID%in%LA_ctage_15_0$GEO.id2,]$x
# 
# # Take the following rows:
# # Rather than above for the smaller number of rows required, we want to automate the collection. There are 18 age groups in 5 year increments <5,5-9,10-14,...,70-74,75-79,80-84,>85 with the labelling as follows:
# # HC*_EST(MOE)_VC** 
# # where
# #     01 - TOTAL
# # * = 02 - MALE
# #     03 - FEMALE
# #       01 - < 5
# #       03 - 5-9 (THERE IS NO VC02)
# #       04 - 10-14
# # ** =  ..
# #       18 - 74-79
# #       19 - 80-84
# #       20 - > 85
# # - while lat and lon are in slot 220 and 221 as for the previous data set
# 
# # Generate the column names (as descrbied in the above comment) for the necessary groups for the ages
# sex_grp <- paste0("HC0",rep(1:3,each=2),rep(c("_EST_","_MOE_"),3))
# age_grp <- paste0("VC0",c(1,3,4,5,6,7,8,9))
# age_grp <- append(age_grp,paste0("VC",10:20))
# comb_grp <- paste0(rep(sex_grp,length(age_grp)),rep(age_grp,each=length(sex_grp)))
# grp_sel <- append(c("GEO.id2","GEO.display-label"),comb_grp)
# grp_sel <- grp_sel %>% append(c("y","x"))
# 
# # Extract the necessary columns
# LA_ctage_15 <- LA_ctage_15_0[,grp_sel]
# 
# # For new labels follow the previous code to select the groups so then the labels so follow the correct order
# newlab_sex <- paste0(rep(c("tot","m","f"),each=2),rep(c("","err"),3)) 
# newlab_age <- "tot"
# newlab_age <- newlab_age %>% append(paste0(seq(0,84,by=5),"-",seq(4,84,by=5))) %>% append("85-")
# comb_lab <- paste0(rep(newlab_sex,length(newlab_age)),rep(newlab_age,each=length(newlab_sex)))
# grp_names <- c("geoid2","name")
# grp_names <- grp_names %>% append(comb_lab) %>% append(c("y","x"))
# # Assign the new column names
# colnames(LA_ctage_15) <- grp_names
# LA_ctage_15[,3:(ncol(LA_ctage_15)-2)] <- sapply(LA_ctage_15[,3:(ncol(LA_ctage_15)-2)],as.numeric)
# 
# # Check the total population for all ages and for each sex
# sum(LA_ctage_15$tottot==0)
# # [1] 4
# sum(LA_ctage_15$ftot==0)
# # [1] 5
# # Check if missing proportion census tracts are for 0 females
# fage <- grep("f",colnames(LA_ctage_15))
# for (i in 1:length(fage)){
#   sum(is.na(LA_ctage_15[,fage[i]]))
#   print(sum(which(is.na(LA_ctage_15[,fage[i]]))%in%which(LA_ctage_15$ftot==0)))
# }
# # [1] 5
# 
# sum(LA_ctage_15$tottot==0)
# # [1] 4
# sum(LA_ctage_15$mtot==0)
# # [1] 5
# # Check if missing proportion census tracts are for 0 males
# mage <- grep("m",colnames(LA_ctage_15))
# mage <- mage[-1] # first element is for "name"!! Don't want to alter this!
# for (i in 1:length(mage)){
#   sum(is.na(LA_ctage_15[,mage[i]]))
#   print(sum(which(is.na(LA_ctage_15[,mage[i]]))%in%which(LA_ctage_15$mtot==0)))
# }
# # [1] 5
# 
# which(LA_ctage_15$tottot==0)
# # [1] 993 994 995 999
# which(LA_ctage_15$ftot==0)
# # [1] 990 993 994 995 999
# which(LA_ctage_15$mtot==0)
# # [1] 988 993 994 995 999
# 
# # The for loops show that the 5 0s for both males and female match the locations of the NAs in the proportions
# # Except, of course for the first couple of elements of fage and mage because they are the TOTALs and so need to remove these (after we have already removed the first element of mage above).
# frep <- fage[-c(1,2)]
# mrep <- mage[-c(1,2)]
# 
# # Set the necessary proportions to 0.
# ind.f <- which(LA_ctage_15$ftot==0)
# ind.m <- which(LA_ctage_15$mtot==0)
# LA_ctage_15[ind.f,frep] <- 0
# LA_ctage_15[ind.m,mrep] <- 0
# 
# # Now do the same for the totals of each section
# tage <- grep("tot",colnames(LA_ctage_15))
# for (i in 1:length(tage)){
#   sum(is.na(LA_ctage_15[,tage[i]]))
#   print(sum(which(is.na(LA_ctage_15[,tage[i]]))%in%which(LA_ctage_15$tottot==0)))
# }
# # [1] 4
# tage <- tage[-c(1:6)] # Main totals, not the age groups where the NAs are so don't want to affect this
# 
# ind.t <- which(LA_ctage_15$tottot==0)
# LA_ctage_15[ind.t,tage] <- 0
# 
# # Check we've removed all missing data
# sum(is.na(LA_ctage_15))
# # [1] 0
# 
# # Save
# saveRDS(LA_ctage_15,"LA_CTAge_15_proj.rds")
# # LA_ctage_15 <- readRDS("LA_CTAge_15_proj.rds")
# 
# 
# 
# # Los Angeles: Home Occupier ----------------------------------------------
# # We will now extract the necessary home occupier data (property owned or rented?) and check for missing values before saving the final data set.
# 
# fiveyearocc_15 <- read_csv("./ACS_15_5YR_B25003/ACS_15_5YR_B25003_with_ann.csv")
# 
# # Extract the data for the census tracts of interest
# LA_ctocc_15_0 <- fiveyearocc_15[fiveyearocc_15$GEO.id2%in%ct_LA.proj$GEOID,]
# 
# # Assign the relevant (UTM) coordinates
# LA_ctocc_15_0$y <- ct_LA.proj[ct_LA.proj$GEOID%in%LA_ctocc_15_0$GEO.id2,]$y
# LA_ctocc_15_0$x <- ct_LA.proj[ct_LA.proj$GEOID%in%LA_ctocc_15_0$GEO.id2,]$x
# 
# # Take the total homes and their error as well as number of those owned and rented to calculate the proportions, while lat and lon are in slot 10 and 11
# LA_ctocc_15 <- LA_ctocc_15_0 # Want all the columns
# colnames(LA_ctocc_15) <- c("geoid","geoid2","name","tot","err","totown","errown","totrent","errrent","y","x")
# LA_ctocc_15[,4:9] <- sapply(LA_ctocc_15[,4:9],as.numeric)
# 
# # Calculate proportions
# LA_ctocc_15$ownprop <- LA_ctocc_15$totown/LA_ctocc_15$tot
# LA_ctocc_15$rentprop <- LA_ctocc_15$totrent/LA_ctocc_15$tot
# 
# # NAs - match the total=0, understandably given how they were calculated
# sum(is.na(LA_ctocc_15$ownprop))
# # [1] 6
# sum(is.na(LA_ctocc_15$rentprop))
# # [1] 6
# sum(LA_ctocc_15$tot==0)
# # [1] 6
# sum(LA_ctocc_15$totown==0)
# # [1] 25
# sum(LA_ctocc_15$totrent==0)
# # [1] 7
# 
# which(LA_ctocc_15$tot==0)
# # [1] 847 988 993 994 995 999
# which(is.na(LA_ctocc_15$ownprop))
# # [1] 847 988 993 994 995 999
# which(is.na(LA_ctocc_15$rentprop))
# # [1] 847 988 993 994 995 999
# 
# # Set values to 0
# LA_ctocc_15$ownprop[which(LA_ctocc_15$tot==0)] <- 0
# LA_ctocc_15$rentprop[which(LA_ctocc_15$tot==0)] <- 0
# 
# # Double check all missing data has been accounted for
# sum(is.na(LA_ctocc_15))
# # [1] 0
# 
# # Save
# saveRDS(LA_ctocc_15,"LA_CTOcc_15_proj.rds")
# # LA_ctocc_15 <- readRDS("LA_CTOcc_15_proj.rds")
# 
# 
# 
# # Los Angeles: Food Stamps ------------------------------------------------
# # We will now extract the necessary food stamp (SNAPs) data, which we are interested in as a proxy for poverty, and check for missing values before saving the final data set.
# 
# fiveyearfood_15 <- read_csv("./ACS_15_5YR_S2201/ACS_15_5YR_S2201_with_ann.csv")
# 
# # Extract the data for the census tracts of interest
# LA_ctfood_15_0 <- fiveyearfood_15[fiveyearfood_15$GEO.id2%in%ct_LA.proj$GEOID,]
# 
# # Assign the relevant (UTM) coordinates
# LA_ctfood_15_0$y <- ct_LA.proj[ct_LA.proj$GEOID%in%LA_ctfood_15_0$GEO.id2,]$y
# LA_ctfood_15_0$x <- ct_LA.proj[ct_LA.proj$GEOID%in%LA_ctfood_15_0$GEO.id2,]$x
# 
# # Take the total households and their error (columns 4 and 5) as well as the number as well as percentage of those that receive food stamps and their error (8 and 9, 10 and 11), while lat and lon are in slot 460 and 461
# LA_ctfood_15 <- LA_ctfood_15_0[,c(1,2,3,4,5,8,9,10,11,460,461)] # Want all the columns!!
# colnames(LA_ctfood_15) <- c("geoid","geoid2","name","tot","err","totrec","errrec","proprec","errproprec","y","x")
# LA_ctfood_15$tot <- as.numeric(LA_ctfood_15$tot)
# LA_ctfood_15$err <- as.numeric(LA_ctfood_15$err)
# LA_ctfood_15$totrec <- as.numeric(LA_ctfood_15$totrec)
# LA_ctfood_15$errrec <- as.numeric(LA_ctfood_15$errrec)
# LA_ctfood_15$proprec <- as.numeric(LA_ctfood_15$proprec)
# LA_ctfood_15$errproprec <- as.numeric(LA_ctfood_15$errproprec)
# 
# # NAs - matches total = 0 unsurprisingly
# sum(is.na(LA_ctfood_15$proprec))
# # [1] 6
# sum(LA_ctfood_15$tot==0)
# # [1] 6
# sum(LA_ctfood_15$totrec==0)
# # [1] 67
# 
# which(LA_ctfood_15$tot==0)
# # [1] 847 988 993 994 995 999
# which(is.na(LA_ctfood_15$proprec))
# # [1] 847 988 993 994 995 999
# which(is.na(LA_ctfood_15$errproprec))
# # [1] 847 988 993 994 995 999
# 
# # Set these to 0
# LA_ctfood_15$proprec[which(LA_ctfood_15$tot==0)] <- 0
# LA_ctfood_15$errproprec[which(LA_ctfood_15$tot==0)] <- 0
# 
# # Double check all of the missing data as been dealt with
# sum(is.na(LA_ctfood_15))
# # [1] 0
# 
# # Save
# saveRDS(LA_ctfood_15,"LA_CTFood_15_proj.rds")
# # LA_ctfood_15 <- readRDS("LA_CTFood_15_proj.rds")
# 

# Los Angeles: Re-set WD --------------------------------------------------

library("rstudioapi")
# Return to the folder holding this script before moving on to the next city.
# Alternatively, skip the next line and setwd() to the source file location by
# hand.
setwd(dirname(getActiveDocumentContext()[["path"]]))


# New York City -----------------------------------------------------------
# In this section we use the census tract shapefiles in combination with the downloaded socio-economic variables to extract the values of the variables over the required census tracts. If there is missing data we will use the neighbourhood matrices to impute the missing data using the average values of the neighbouring census tracts (within both the city and the county itself).

# New York: Set-Up --------------------------------------------------------
# Change the working directory as we want to save the data in separate folders for each city. We then load the census tract data produced in DATA/RAW_DATA/SHAPEFILES/CENSUS_TRACTS and project these to UTM coordinates. These census tracts are saved in the DATA/PROCESSED_DATA/SHAPEFILES/CENSUS_TRACTS directory and we will access them through this file path.

# NYC Data
# Relative paths below are resolved against the NYC folder.
setwd("./NYC")

# NYC Census Tracts
# NOTE(review): presumably this .rda provides the `ct_NY` object used below -
# confirm against the script that created it.
load("../../../PROCESSED_DATA/SHAPEFILES/CENSUS_TRACTS/NYCityCT.rda")

# Project to UTM
# ct_NY.proj <- lwgeom::st_transform_proj(ct_NY,"+init=epsg:32618") # originally run code, however with newer package, need to use the line below, without '+init='
ct_NY.proj <- lwgeom::st_transform_proj(ct_NY, "epsg:32618")

# Centroid of every census tract, computed on the projected geometries.
# vapply() guarantees a 2 x (number of tracts) numeric matrix, also when there
# is only one tract: each column is one geometry, row one is the x coordinate
# (easting) and row two is the y coordinate (northing). Despite its name,
# `latlon` therefore holds projected coordinates, not longitude/latitude.
latlon <- vapply(
  seq_along(ct_NY.proj$geometry),
  function(i) as.numeric(st_centroid(ct_NY.proj$geometry[[i]])),
  numeric(2)
)


ct_NY.proj$x <- latlon[1, ]
ct_NY.proj$y <- latlon[2, ]


# New York: Population ----------------------------------------------------
# We will now extract the necessary population data and check for missing values before saving the final data set. The ACS data contains estimates for the population as well as errors for these estimates, so we will store these as well.

# First row is titles...even though that should be dealt with in read_csv
fiveyear_15 <- read_csv("./ACSDT5Y2015.B01003_2020-06-15T083125/ACSDT5Y2015.B01003_data_with_overlays_2020-06-15T083120.csv")

# Sort out the census tract identification for merging, as in the LA data we had geoid2.
# Every row after the title row has its GEO_ID split into runs of digits and
# the second run is kept as the tract identifier.
# seq_along()[-1] is empty for a table with only the title row, whereas
# 2:length() would count backwards.
data_rows <- seq_along(fiveyear_15$GEO_ID)[-1]
geo2 <- str_extract_all(fiveyear_15$GEO_ID[data_rows], "\\d+")
fiveyear_15$GEO.id2 <- fiveyear_15$GEO_ID
fiveyear_15$GEO.id2[1] <- "id2"
fiveyear_15$GEO.id2[data_rows] <- map_chr(geo2, 2)

# Extract the data for the census tracts of interest
NY_ctpop_15_0 <- fiveyear_15[fiveyear_15$GEO.id2 %in% ct_NY.proj$GEOID, ]

# Assign the relevant (UTM) coordinates.
# match() finds, for every ACS row, the row of the same tract in the shapefile,
# so each centroid stays attached to the right tract even if the ACS table and
# the shapefile list the tracts in a different order.
pop_ct_row <- match(NY_ctpop_15_0$GEO.id2, ct_NY.proj$GEOID)
NY_ctpop_15_0$y <- ct_NY.proj$y[pop_ct_row]
NY_ctpop_15_0$x <- ct_NY.proj$x[pop_ct_row]
# Reorder so that the derived GEO.id2 (column 5) follows the original id,
# giving the same column layout as the Los Angeles population data.
NY_ctpop_15 <- NY_ctpop_15_0[, c(1, 5, 2, 3, 4, 6, 7)]
colnames(NY_ctpop_15) <- c("geoid", "geoid2", "name", "pop", "err", "y", "x")
NY_ctpop_15$pop <- as.numeric(NY_ctpop_15$pop)
# No missing data (usually, the as.numeric would have picked up on these and produce a warning)

# Save
saveRDS(NY_ctpop_15, "NY_CTPop_15_proj.rds")
# NY_ctpop_15 <- readRDS("NY_CTPop_15_proj.rds")


# New York: Average Income (Imputed-GMO) ----------------------------------
# We will now extract the average income data, similarly to the population data. However, it is important to note that the code for the average income in this R script treats all the missing data the same, even though some of the missingness may be due to zero population estimated within that census tract. This imputed data is further used within the Grid-Mesh Optimisation implementation for Los Angeles, but not for the final models (both on the census tracts-level and gridded). For the Average Income data for the modelling, we look to CovDataGen_Inc_final.R where we assign zero average income to census tracts with an estimated zero total households.

fiveyearinc_15 <- read_csv("./ACSST5Y2015.S1902_2020-06-15T083620/ACSST5Y2015.S1902_data_with_overlays_2020-06-15T083605.csv")

# Sort out the census tract identification for merging, as in the LA data we had geoid2.
# Every row after the title row has its GEO_ID split into runs of digits and
# the second run is kept as the tract identifier.
# seq_along()[-1] is empty for a table with only the title row, whereas
# 2:length() would count backwards.
data_rows <- seq_along(fiveyearinc_15$GEO_ID)[-1]
geo2 <- str_extract_all(fiveyearinc_15$GEO_ID[data_rows], "\\d+")
fiveyearinc_15$GEO.id2 <- fiveyearinc_15$GEO_ID
fiveyearinc_15$GEO.id2[1] <- "id2"
fiveyearinc_15$GEO.id2[data_rows] <- map_chr(geo2, 2)

# Extract the data for the census tracts of interest
NY_ctinc_15_0 <- fiveyearinc_15[fiveyearinc_15$GEO.id2 %in% ct_NY.proj$GEOID, ]

# Assign the relevant (UTM) coordinates.
# match() finds, for every ACS row, the row of the same tract in the shapefile,
# so each centroid stays attached to the right tract even if the ACS table and
# the shapefile list the tracts in a different order.
inc_ct_row <- match(NY_ctinc_15_0$GEO.id2, ct_NY.proj$GEOID)
NY_ctinc_15_0$y <- ct_NY.proj$y[inc_ct_row]
NY_ctinc_15_0$x <- ct_NY.proj$x[inc_ct_row]

# Take the mean income and margin of error, which according to the metadata is given by S1902_C01_001E/S1902_C01_001M and is in slot 57 and 58, while the derived GEO.id2 is in slot 111 and the y and x coordinates added above are in slot 112 and 113
NY_ctinc_15 <- NY_ctinc_15_0[, c(1, 111, 2, 57, 58, 112, 113)]
colnames(NY_ctinc_15) <- c("geoid", "geoid2", "name", "inc", "err", "y", "x")
# Entries that are not numbers become NA here (with a coercion warning).
NY_ctinc_15$inc <- as.numeric(NY_ctinc_15$inc)

# Locate the census tracts without an average income and report how many there
# are; these will need to be imputed.
ind.na <- which(is.na(NY_ctinc_15$inc))
length(ind.na)

# Rows of the city shapefile belonging to those tracts.
ind.na.ct <- match(NY_ctinc_15$geoid2[ind.na], ct_NY.proj$GEOID)

# Map of the city with the tracts lacking income data filled in red.
plot(
  ct_NY.proj$geometry,
  axes = TRUE,
  main = "Census Tracts with Missing Income Data"
)
plot(ct_NY.proj[ind.na.ct, ]$geometry, col = "red", add = TRUE)

# Bring in the census tracts of the counties containing the city and project
# them to the same UTM zone, for a visual check of where the gaps sit.
load("../../../PROCESSED_DATA/SHAPEFILES/CENSUS_TRACTS/NYCCountyCT.rda")
# ct_NY_County.proj <- lwgeom::st_transform_proj(ct_NY_County,"+init=epsg:32618") # older package version
ct_NY_County.proj <- lwgeom::st_transform_proj(ct_NY_County, "epsg:32618")

# County tracts outlined in red, city tracts with missing income filled in red,
# all city tracts outlined in blue on top.
plot(ct_NY_County.proj$geometry, border = "red")
plot(
  ct_NY.proj[ct_NY.proj$GEOID %in% NY_ctinc_15$geoid2[ind.na], ]$geometry,
  col = "red",
  add = TRUE
)
plot(ct_NY.proj$geometry, add = TRUE, border = "blue")

# In order to deal with the missing data in the income, I will use the average of each census tracts neighbours to impute the missing data. Therefore we must load in the neighbourhood matrix for counties containing the city.
# NOTE(review): `nb` and `ct_NY_Countysp` are not created in this script;
# presumably NYCountyNB.rda provides them - confirm.
load("../../../PROCESSED_DATA/SHAPEFILES/CENSUS_TRACTS/NYCountyNB.rda")
# lapply() is used throughout so that every object below is a list with one
# element per missing tract, whatever the number of neighbours (sapply() would
# collapse to a matrix when all tracts have the same number of neighbours).
ct_ind <- match(NY_ctinc_15$geoid2[ind.na], ct_NY_Countysp$GEOID) # which geometries are the missings from
nb_ind <- lapply(ct_ind, function(ct_row) which(unname(nb[ct_row, ]))) # who are their neighbours
nb_ct <- lapply(nb_ind, function(nb_rows) {
  match(ct_NY_Countysp$GEOID[nb_rows], ct_NY_County.proj$GEOID)
}) # which geometries on the county scale have we selected

# Visually inspect that we have the correct census tracts
plot(ct_NY_County.proj[unlist(nb_ct), ]$geometry, col = "magenta", add = TRUE)


# Find the indices for the county with the indices of the income data frame
inc_ind <- lapply(nb_ct, function(cty_rows) {
  match(ct_NY_County.proj$GEOID[cty_rows], fiveyearinc_15$GEO.id2)
})
# Count the neighbours whose own average income (column 57) is missing.
sum(is.na(as.numeric(unlist(fiveyearinc_15[unique(unlist(inc_ind)), 57]))))

# Extract the indices for census tracts with NAs for the average income in the list elements of neighbours for each of the missing data census tracts
cty_na_ind <- lapply(inc_ind, function(acs_rows) {
  acs_rows[is.na(as.numeric(unlist(fiveyearinc_15[acs_rows, 57])))]
})
# Extract the indices within the county data
cty_na_ct <- lapply(cty_na_ind, function(acs_rows) {
  match(fiveyearinc_15$GEO.id2[acs_rows], ct_NY_County.proj$GEOID)
})

# These are just to visually inspect missing data between the county and city.
plot(ct_NY_County.proj[unlist(cty_na_ct), ]$geometry, col = "yellow", border = "blue", add = TRUE) # which are NA in county data
plot(ct_NY.proj[ct_NY.proj$GEOID %in% NY_ctinc_15$geoid2[ind.na], ]$geometry, col = "red", border = "yellow", add = TRUE) # but they are the ones also in the missing data within the city..

# Now to impute the average of the neighbours (recall nb_ind is a list for each ind.na)
# nb_inc extracts the average income (column 57 of the ACS table) for each element of the list of neighbours for each of the census tracts with missing data.
# lapply() keeps one list element per tract whatever the number of neighbours.
nb_inc <- lapply(nb_ind, function(nb_rows) {
  acs_rows <- match(ct_NY_Countysp$GEOID[nb_rows], fiveyearinc_15$GEO.id2)
  as.numeric(unlist(fiveyearinc_15[acs_rows, 57]))
})

# Impute the mean, making sure to ignore any missing data.
NY_ctinc_15_imp <- NY_ctinc_15
mean.inc <- lapply(nb_inc, mean, na.rm = TRUE)

# A tract with no neighbours (or whose neighbours are all missing) has a NaN
# mean. Such tracts are found from the data rather than by a fixed list
# position, so the step still works if the set of missing tracts changes.
no_nb_mean <- which(is.nan(unlist(mean.inc)))
mean.inc[no_nb_mean]

# Where such a tract also has no population, set its mean income to zero.
zero_pop_id <- NY_ctpop_15$geoid2[which(NY_ctpop_15$pop == 0)]
set_zero <- no_nb_mean[NY_ctinc_15$geoid2[ind.na[no_nb_mean]] %in% zero_pop_id]
mean.inc[set_zero] <- 0
if (length(set_zero) < length(no_nb_mean)) {
  warning("Some populated census tracts have no neighbouring income to impute from.",
          call. = FALSE)
}
NY_ctinc_15_imp$inc[ind.na] <- unlist(mean.inc)

# Check that we no longer have missing data for the average income.
sum(is.na(NY_ctinc_15_imp))
# [1] 0

# Save
saveRDS(NY_ctinc_15_imp, "NY_CTInc_15_imp_proj.rds")
# NY_ctinc_15 <- readRDS("NY_CTInc_15_imp_proj.rds")


# # New York: Sex and Age ---------------------------------------------------
# # We will now extract the data on the sex and age of the population within each census tract. We will first begin by considering the data available on the Sex of the population before moving on to age.
# 
# fiveyearagesex_15 <- read_csv("./ACSST5Y2015.S0101_2020-06-15T083507/ACSST5Y2015.S0101_data_with_overlays_2020-06-15T083459.csv")
# 
# # Sort out the census tract identification for merging, as in the LA data we had geoid2.
# geo2 <- str_extract_all(fiveyearagesex_15$GEO_ID[2:length(fiveyearagesex_15$GEO_ID)],"\\d+")
# fiveyearagesex_15$GEO.id2 <- fiveyearagesex_15$GEO_ID
# fiveyearagesex_15$GEO.id2[1] <- "id2"
# fiveyearagesex_15$GEO.id2[2:length(fiveyearagesex_15$GEO_ID)] <- map_chr(geo2,2)
# 
# # Sex
# # Extract the data for the census tracts of interest
# NY_ctsex_15_0 <- fiveyearagesex_15[fiveyearagesex_15$GEO.id2%in%ct_NY.proj$GEOID,]
# 
# # Assign the relevant (UTM) coordinates
# NY_ctsex_15_0$y <- ct_NY.proj[ct_NY.proj$GEOID%in%NY_ctsex_15_0$GEO.id2,]$y
# NY_ctsex_15_0$x <- ct_NY.proj[ct_NY.proj$GEOID%in%NY_ctsex_15_0$GEO.id2,]$x
# 
# # Take the following rows:
# # - total pop (margin of error), which according to the metadata is given by S0101_C01_001E(M) and is in slot 13 and 14;
# # - total male pop (margin of error), which according to the metadata is given by S0101_C02_001E(M) and is in slot 85 and 86;
# # - total female pop (margin of error), which according to the metadata is given by S0101_C03_001E(M) and is in slot 157 and 158;
# # - while lat and lon are in slot 220 and 221
# NY_ctsex_15 <- NY_ctsex_15_0[,c(1,219,2,13,14,85,86,157,158,220,221)]
# colnames(NY_ctsex_15) <- c("geoid","geoid2","name","totp","totperr","mtotp","mtotperr","ftotp","ftotperr","y","x")
# NY_ctsex_15[,4:9] <- sapply(NY_ctsex_15[,4:9],as.numeric)
# 
# # Calculate proportions
# NY_ctsex_15$mprop <- NY_ctsex_15$mtotp/NY_ctsex_15$totp
# NY_ctsex_15$fprop <- NY_ctsex_15$ftotp/NY_ctsex_15$totp
# 
# # Missing data: this could be due to the calculation of the proportions where the denominator for the total population is 0.
# # NAs produced for the proportions where the total population is 0, 36 CTs: let us compare and check.
# sum(is.na(NY_ctsex_15$mprop))
# # [1] 36
# sum(is.na(NY_ctsex_15$fprop))
# # [1] 36
# sum(NY_ctsex_15$totp==0)
# # [1] 36
# sum(NY_ctsex_15$mtotp==0)
# # [1] 41
# sum(NY_ctsex_15$ftotp==0)
# # [1] 38
# 
# which(is.na(NY_ctsex_15$mprop))
# # [1] 166  172  249  346  443  497  547  565  567  779  974  992 1297 1300 1301 1304 1385 1413 1449 1458 1503 1514 1538 1539 1689 1720 1750 1769 1835 1849 1884 1926 1987 2017 2025 2096
# which(is.na(NY_ctsex_15$fprop))
# # [1] 166  172  249  346  443  497  547  565  567  779  974  992 1297 1300 1301 1304 1385 1413 1449 1458 1503 1514 1538 1539 1689 1720 1750 1769 1835 1849 1884 1926 1987 2017 2025 2096
# which(NY_ctsex_15$totp==0)
# # [1]  166  172  249  346  443  497  547  565  567  779  974  992 1297 1300 1301 1304 1385 1413 1449 1458 1503 1514 1538 1539 1689 1720 1750 1769 1835 1849 1884 1926 1987 2017 2025 2096
# 
# sum(which(is.na(NY_ctsex_15$mprop))%in%which(NY_ctsex_15$totp==0))
# # [1] 36
# sum(which(is.na(NY_ctsex_15$fprop))%in%which(NY_ctsex_15$totp==0))
# # [1] 36
# 
# # Okay, so with no population set the values for these missing data to 0.
# NY_ctsex_15$mprop[which(NY_ctsex_15$totp==0)] <- 0
# NY_ctsex_15$fprop[which(NY_ctsex_15$totp==0)] <- 0
# 
# # Check we have removed all missing data.
# sum(is.na(NY_ctsex_15))
# # [1] 0
# 
# # Save
# saveRDS(NY_ctsex_15,"NY_CTSex_15_proj.rds")
# # NY_ctsex_15 <- readRDS("NY_CTSex_15_proj.rds")
# 
# 
# # Age
# # Extract the data for the census tracts of interest
# NY_ctage_15_0 <- fiveyearagesex_15[fiveyearagesex_15$GEO.id2%in%ct_NY.proj$GEOID,]
# 
# # Assign the relevant (UTM) coordinates
# NY_ctage_15_0$y <- ct_NY.proj[ct_NY.proj$GEOID%in%NY_ctage_15_0$GEO.id2,]$y
# NY_ctage_15_0$x <- ct_NY.proj[ct_NY.proj$GEOID%in%NY_ctage_15_0$GEO.id2,]$x
# 
# # Take the following rows:
# # Unlike above, where only a small number of columns was required, we want to automate the collection. There are 18 age groups in 5 year increments <5,5-9,10-14,...,70-74,75-79,80-84,>85 with the labelling as follows:
# # S0101_C*_**E(M)
# # where
# #     01 - TOTAL
# # * = 02 - MALE
# #     03 - FEMALE
# #      ( 002 - < 5
# #      ( 003 - 5-9 
# #      ( 004 - 10-14
# # ** =  (..
# #      ( 017 - 75-79
# #      ( 018 - 80-84
# #      ( 019 - > 85
# # - while lat and lon are in slot 220 and 221 as for the previous data set
# 
# # Generate the column names (as described in the above comment) for the necessary groups for the ages
# sex_grp <- paste0("S0101_C0",rep(1:3,each=1))
#                   # ,rep(c("_EST_","_MOE_"),3))
# age_grp <- paste0("_00",rep(c(1,2,3,4,5,6,7,8,9),each=2),rep(c("E","M"),9))
# age_grp <- append(age_grp,paste0("_0",rep(10:19,each=2),rep(c("E","M"),10)))
# comb_grp <- paste0(rep(sex_grp,length(age_grp)),rep(age_grp,each=length(sex_grp)))
# grp_sel <- append(c("GEO_ID","GEO.id2","NAME"),comb_grp)
# grp_sel <- grp_sel %>% append(c("y","x"))
# 
# # Extract necessary columns
# NY_ctage_15 <- NY_ctage_15_0[,grp_sel]
# 
# # For new labels follow the previous code to select the groups so then the labels so follow the correct order
# newlab_sex <- paste0(rep(c("tot","m","f"),2),rep(c("","err"),each=3)) # compare newlab_sex with sex_grp - yes tot, m, f est and then tot, m, f moe
# newlab_age <- "tot"
# newlab_age <- newlab_age %>% append(paste0(seq(0,84,by=5),"-",seq(4,84,by=5))) %>% append("85-")
# comb_lab <- paste0(rep(newlab_sex,length(newlab_age)),rep(newlab_age,each=length(newlab_sex)))
# grp_names <- c("geoid","geoid2","name")
# grp_names <- grp_names %>% append(comb_lab) %>% append(c("y","x"))
# colnames(NY_ctage_15) <- grp_names
# NY_ctage_15[,4:(ncol(NY_ctage_15)-2)] <- sapply(NY_ctage_15[,4:(ncol(NY_ctage_15)-2)],as.numeric)
# 
# # Check the total population for all ages and for each sex
# sum(NY_ctage_15$tottot==0)
# # [1] 36
# sum(NY_ctage_15$ftot==0)
# # [1] 38
# # Check if missing proportion census tracts are for 0 females
# fage <- grep("f",colnames(NY_ctage_15))
# for (i in 1:length(fage)){
#   sum(is.na(NY_ctage_15[,fage[i]]))
#   print(sum(which(is.na(NY_ctage_15[,fage[i]]))%in%which(NY_ctage_15$ftot==0)))
# }
# 
# sum(NY_ctage_15$mtot==0)
# # [1] 41
# # Check if missing proportion census tracts are for 0 males
# mage <- grep("m",colnames(NY_ctage_15))
# for (i in 1:length(mage)){
#   sum(is.na(NY_ctage_15[,mage[i]]))
#   print(sum(which(is.na(NY_ctage_15[,mage[i]]))%in%which(NY_ctage_15$mtot==0)))
# }
# mage <- mage[-1] # first element corresponds to "name" column, don't want to mess with that column
# 
# # The for loops show that the 5 0s for both males and female match the locations of the NAs in the proportions
# # Except, of course for the first couple of elements of fage and mage because they are the TOTALs and so need to remove these (after we have already removed the first element of mage above).
# frep <- fage[-c(1,2)]
# mrep <- mage[-c(1,2)]
# 
# # Set the necessary proportions to 0.
# ind.f <- which(NY_ctage_15$ftot==0)
# ind.m <- which(NY_ctage_15$mtot==0)
# NY_ctage_15[ind.f,frep] <- 0
# NY_ctage_15[ind.m,mrep] <- 0
# 
# # Now do the same for the totals of each section
# tage <- grep("tot",colnames(NY_ctage_15))
# for (i in 1:length(tage)){
#   sum(is.na(NY_ctage_15[,tage[i]]))
#   print(sum(which(is.na(NY_ctage_15[,tage[i]]))%in%which(NY_ctage_15$tottot==0)))
# }
# tage <- tage[-c(1:6)] # first 6 elements correspond to main totals, not where the NAs are, in the groups for tot
# 
# ind.t <- which(NY_ctage_15$tottot==0)
# NY_ctage_15[ind.t,tage] <- 0
# 
# # Check we've removed all missing data
# sum(is.na(NY_ctage_15))
# # [1] 0
# 
# # Save
# saveRDS(NY_ctage_15,"NY_CTAge_15_proj.rds")
# # NY_ctage_15 <- readRDS("NY_CTAge_15_proj.rds")
# 
# 
# 
# # New York: Home Occupier -------------------------------------------------
# # We will now extract the necessary home occupier data (property owned or rented?) and check for missing values before saving the final data set.
# 
# fiveyearocc_15 <- read_csv("X:/Maths/ResearchProjects/TRSmith/TRSmith2/DATA_LAPTOP/DATA/RAW DATA/COVARIATES/COV_CT/NYC/ACSDT5Y2015.B25003_2020-06-15T084809/ACSDT5Y2015.B25003_data_with_overlays_2020-06-15T084802.csv")
# 
# # Sort out the census tract identification for merging, as in the LA data we had geoid2.
# geo2 <- str_extract_all(fiveyearocc_15$GEO_ID[2:length(fiveyearocc_15$GEO_ID)],"\\d+")
# fiveyearocc_15$GEO.id2 <- fiveyearocc_15$GEO_ID
# fiveyearocc_15$GEO.id2[1] <- "id2"
# fiveyearocc_15$GEO.id2[2:length(fiveyearocc_15$GEO_ID)] <- map_chr(geo2,2)
# 
# # Extract the data for the census tracts of interest
# NY_ctocc_15_0 <- fiveyearocc_15[fiveyearocc_15$GEO.id2%in%ct_NY.proj$GEOID,]
# 
# # Assign the relevant (UTM) coordinates
# NY_ctocc_15_0$y <- ct_NY.proj[ct_NY.proj$GEOID%in%NY_ctocc_15_0$GEO.id2,]$y
# NY_ctocc_15_0$x <- ct_NY.proj[ct_NY.proj$GEOID%in%NY_ctocc_15_0$GEO.id2,]$x
# 
# # Take the total homes and their error as well as number of those owned and rented to calculate the proportions, while lat and lon are in slot 10 and 11
# NY_ctocc_15 <- NY_ctocc_15_0[,c(1,9,2,3,4,5,6,7,8,10,11)] # Want all the columns!!
# colnames(NY_ctocc_15) <- c("geoid","geoid2","name","tot","err","totown","errown","totrent","errrent","y","x")
# NY_ctocc_15[,4:9] <- sapply(NY_ctocc_15[,4:9],as.numeric)
# 
# # Calculate proportions
# NY_ctocc_15$ownprop <- NY_ctocc_15$totown/NY_ctocc_15$tot
# NY_ctocc_15$rentprop <- NY_ctocc_15$totrent/NY_ctocc_15$tot
# 
# # NA checks - where the total (denominator is 0)
# sum(NY_ctocc_15$tot==0)
# # [1] 44
# sum(is.na(NY_ctocc_15$ownprop))
# # [1] 44
# sum(is.na(NY_ctocc_15$rentprop))
# # [1] 44
# # Show that we have NAs because zero pop
# sum(which(NY_ctocc_15$tot==0)%in%which(is.na(NY_ctocc_15$ownprop)))
# # [1] 44
# sum(which(NY_ctocc_15$tot==0)%in%which(is.na(NY_ctocc_15$rentprop)))
# # [1] 44
# 
# # Set values to 0
# NY_ctocc_15$ownprop[which(NY_ctocc_15$tot==0)] <- 0
# NY_ctocc_15$rentprop[which(NY_ctocc_15$tot==0)] <- 0
# 
# # Double check all missing data has been accounted for
# sum(is.na(NY_ctocc_15))
# # [1] 0
# 
# # Save
# saveRDS(NY_ctocc_15,"NY_CTOcc_15_proj.rds")
# # NY_ctocc_15 <- readRDS("NY_CTOcc_15_proj.rds")
# 
# 
# # New York: Food Stamps ---------------------------------------------------
# # We will now extract the necessary food stamp (SNAPs) data, which we are interested in as a proxy for poverty, and check for missing values before saving the final data set.
# 
# fiveyearfood_15 <- read_csv("./ACSST5Y2015.S2201_2020-06-15T083905/ACSST5Y2015.S2201_data_with_overlays_2020-06-15T083854.csv")
# 
# # Sort out the census tract identification for merging, as in the LA data we had geoid2.
# geo2 <- str_extract_all(fiveyearfood_15$GEO_ID[2:length(fiveyearfood_15$GEO_ID)],"\\d+")
# fiveyearfood_15$GEO.id2 <- fiveyearfood_15$GEO_ID
# fiveyearfood_15$GEO.id2[1] <- "id2"
# fiveyearfood_15$GEO.id2[2:length(fiveyearfood_15$GEO_ID)] <- map_chr(geo2,2)
# 
# # Extract the data for the census tracts of interest
# NY_ctfood_15_0 <- fiveyearfood_15[fiveyearfood_15$GEO.id2%in%ct_NY.proj$GEOID,]
# 
# # Assign the relevant (UTM) coordinates
# NY_ctfood_15_0$y <- ct_NY.proj[ct_NY.proj$GEOID%in%NY_ctfood_15_0$GEO.id2,]$y
# NY_ctfood_15_0$x <- ct_NY.proj[ct_NY.proj$GEOID%in%NY_ctfood_15_0$GEO.id2,]$x
# 
# # Take the total households and their error (columns 3 and 4-"S2201_C01_001E") as well as the percentage and total number of those that receive food stamps and their error (EMPTY CELLS-231 and 232-"S2201_C04_001E", 155 and 156), while lat and lon are in slot 460 and 461
# NY_ctfood_15 <- NY_ctfood_15_0[,c(1,459,2,3,4,155,156,231,232,460,461)] # Want all the columns!!
# colnames(NY_ctfood_15) <- c("geoid","geoid2","name","tot","err","totrec","errrec","proprec","errproprec","y","x")
# NY_ctfood_15$tot <- as.numeric(NY_ctfood_15$tot)
# NY_ctfood_15$err <- as.numeric(NY_ctfood_15$err)
# NY_ctfood_15$totrec <- as.numeric(NY_ctfood_15$totrec)
# NY_ctfood_15$errrec <- as.numeric(NY_ctfood_15$errrec)
# NY_ctfood_15$proprec <- as.numeric(NY_ctfood_15$proprec)
# NY_ctfood_15$errproprec <- as.numeric(NY_ctfood_15$errproprec)
# 
# # NA checks - NAs in percentages match the TOTALs not the total receiving as denominator
# sum(NY_ctfood_15$totrec==0)
# # [1] 86
# sum(NY_ctfood_15$tot==0)
# # [1] 44
# sum(is.na(NY_ctfood_15$proprec))
# # [1] 44
# sum(is.na(NY_ctfood_15$errproprec))
# # [1] 44
# sum(which(NY_ctfood_15$tot==0)%in%which(is.na(NY_ctfood_15$proprec)))
# # [1] 44
# sum(which(NY_ctfood_15$tot==0)%in%which(is.na(NY_ctfood_15$errproprec)))
# # [1] 44
# 
# # Set these to 0
# NY_ctfood_15$proprec[which(NY_ctfood_15$tot==0)] <- 0
# NY_ctfood_15$errproprec[which(NY_ctfood_15$tot==0)] <- 0
# 
# # Double check all of the missing data has been dealt with
# sum(is.na(NY_ctfood_15))
# # [1] 0
# 
# # Save
# saveRDS(NY_ctfood_15,"NY_CTFood_15_proj.rds")
# # NY_ctfood_15 <- readRDS("NY_CTFood_15_proj.rds")
# 

# New York: Re-set WD -----------------------------------------------------

library("rstudioapi")
# Return to the directory holding this source file before starting the next
# city: either call setwd() on that location by hand, or run the following,
# which asks rstudioapi for the path of the active document.
setwd(dirname(rstudioapi::getActiveDocumentContext()[["path"]]))


# Portland ----------------------------------------------------------------
# In this section we use the census tract shapefiles in combination with the downloaded socio-economic variables to extract the values of the variables over the required census tracts. If there is missing data we will use the neighbourhood matrices to impute the missing data using the average values of the neighbouring census tracts (within both the city and the county itself).

# Portland: Set-Up --------------------------------------------------------
# Change the working directory as we want to save the data in separate folders for each city. We then load the census tract data produced in DATA/RAW_DATA/SHAPEFILES/CENSUS_TRACTS and project these to UTM coordinates. These census tracts are saved in the DATA/PROCESSED_DATA/SHAPEFILES/CENSUS_TRACTS directory and we will access them through this file path.

# Portland Data
# Move into the Portland sub-directory so that the relative paths used below
# (ACS downloads read in, .rds files written out) resolve for this city.
setwd("./Portland")

# Portland Census Tracts
# The .rda file is expected to provide the census tract object `ct_P` used below.
load("../../../PROCESSED_DATA/SHAPEFILES/CENSUS_TRACTS/PCityCT.rda")

# Project to UTM
# ct_P.proj <- lwgeom::st_transform_proj(ct_P,"+init=epsg:32610") # originally run code, however with newer package, need to use the line below, without '+init='
ct_P.proj <- lwgeom::st_transform_proj(ct_P, "epsg:32610")

# Centroid of every projected tract geometry. The result is a 2 x n matrix with
# one column per geometry. As the tracts have already been projected, row one
# holds the x coordinate and row two the y coordinate (not longitude/latitude;
# the object name `latlon` is only kept for consistency with the rest of the
# script). vapply() over seq_along() guarantees the matrix shape, including
# when there are no geometries, where sapply() over 1:length() would not.
latlon <- vapply(
  seq_along(ct_P.proj$geometry),
  function(i) as.numeric(st_centroid(ct_P.proj$geometry[[i]])),
  numeric(2)
)


ct_P.proj$x <- latlon[1, ]
ct_P.proj$y <- latlon[2, ]


# Portland: Population ----------------------------------------------------
# We will now extract the necessary population data and check for missing values before saving the final data set. The ACS data contains estimates for the population as well as errors for these estimates, so we will store these as well.

# First Row is titles...even though that should be dealt with in read_csv
fiveyear_15 <- read_csv("./ACSDT5Y2015.B01003_2020-06-13T080147/ACSDT5Y2015.B01003_data_with_overlays_2020-06-13T080143.csv")

# Sort out the census tract identification for merging, as in the LA data we had geoid2.
# Each GEO_ID (from the second row on) is split into its runs of digits and the
# second run is kept as the tract identifier; the first row is the title row
# and is simply labelled "id2".
geo2 <- str_extract_all(fiveyear_15$GEO_ID[2:length(fiveyear_15$GEO_ID)], "\\d+")
fiveyear_15$GEO.id2 <- fiveyear_15$GEO_ID
fiveyear_15$GEO.id2[1] <- "id2"
fiveyear_15$GEO.id2[2:length(fiveyear_15$GEO_ID)] <- map_chr(geo2, 2)

# Extract the data for the census tracts of interest
P_ctpop_15_0 <- fiveyear_15[fiveyear_15$GEO.id2 %in% ct_P.proj$GEOID, ]

# Assign the relevant (UTM) coordinates
# match() looks up, for every row of the ACS table, the row of ct_P.proj with
# the same GEOID, so each tract receives its own centroid even if the two
# objects are not sorted identically (subsetting ct_P.proj with %in% would
# return the coordinates in shapefile order instead).
ct_row <- match(P_ctpop_15_0$GEO.id2, ct_P.proj$GEOID)
P_ctpop_15_0$y <- ct_P.proj$y[ct_row]
P_ctpop_15_0$x <- ct_P.proj$x[ct_row]

# Re-order the columns (selected by position) to match the names assigned below.
P_ctpop_15 <- P_ctpop_15_0[, c(1, 5, 2, 3, 4, 6, 7)]
colnames(P_ctpop_15) <- c("geoid", "geoid2", "name", "pop", "err", "y", "x")
P_ctpop_15$pop <- as.numeric(P_ctpop_15$pop)
# No missing data (usually, the as.numeric would have picked up on these and produce a warning)

# Save
saveRDS(P_ctpop_15, "P_CTPop_15_proj.rds")
# P_ctpop_15 <- readRDS("P_CTPop_15_proj.rds")


# Portland: Average Income (Imputed-GMO) ----------------------------------
# We will now extract the average income data, similarly to the population data. However, it is important to note that the code for the average income in this R script treats all the missing data the same, even though some of the missingness may be due to zero population estimated within that census tract. This imputed data is further used within the Grid-Mesh Optimisation implementation for Los Angeles, but not for the final models (both on the census tracts-level and gridded). For the Average Income data for the modelling, we look to CovDataGen_Inc_final.R where we assign zero average income to census tracts with an estimated zero total households.

# Read the ACS S1902 download for Portland; as with the population table, the
# first row holds the column titles rather than data.
fiveyearinc_15 <- read_csv("./ACSST5Y2015.S1902_2020-06-13T081245/ACSST5Y2015.S1902_data_with_overlays_2020-06-13T081241.csv")

# Sort out the census tract identification for merging, as in the LA data we had geoid2.
geo2 <- str_extract_all(fiveyearinc_15$GEO_ID[2:length(fiveyearinc_15$GEO_ID)], "\\d+")
fiveyearinc_15$GEO.id2 <- fiveyearinc_15$GEO_ID
fiveyearinc_15$GEO.id2[1] <- "id2"
fiveyearinc_15$GEO.id2[2:length(fiveyearinc_15$GEO_ID)] <- map_chr(geo2, 2)

# Extract the data for the census tracts of interest
P_ctinc_15_0 <- fiveyearinc_15[fiveyearinc_15$GEO.id2 %in% ct_P.proj$GEOID, ]

# Assign the relevant (UTM) coordinates
# match() pairs every row of the ACS table with the row of ct_P.proj that has
# the same GEOID, so the coordinates stay attached to the right tract even if
# the two objects are ordered differently (subsetting ct_P.proj with %in%
# would return the coordinates in shapefile order instead).
ct_row <- match(P_ctinc_15_0$GEO.id2, ct_P.proj$GEOID)
P_ctinc_15_0$y <- ct_P.proj$y[ct_row]
P_ctinc_15_0$x <- ct_P.proj$x[ct_row]

# Take the mean income and margin of error, which according to the metadata is given by S1902_C01_001E/S1902_C01_001M and is in slot 57 and 58, while the y and x coordinates added above are in slot 112 and 113
P_ctinc_15 <- P_ctinc_15_0[, c(1, 111, 2, 57, 58, 112, 113)]
colnames(P_ctinc_15) <- c("geoid", "geoid2", "name", "inc", "err", "y", "x")
# Entries that cannot be read as numbers become NA here (with a warning).
P_ctinc_15$inc <- as.numeric(P_ctinc_15$inc)

# There is some missing data - for a single census tract to be specific.
sum(is.na(P_ctinc_15$inc))
ind.na <- which(is.na(P_ctinc_15$inc))
# Map all tracts and highlight in red those with a missing income value.
plot(ct_P.proj$geometry, axes = TRUE, main = "Census Tracts with Missing Income Data")
ind.na.ct <- match(P_ctinc_15$geoid2[ind.na], ct_P.proj$GEOID)
plot(ct_P.proj[ind.na.ct, ]$geometry, col = "red", add = TRUE)
# We will need to impute


# Finding neighbours in Portland city only as the census tract with the missing data is surrounded by other census tract within the city itself, so unlike for the other cities we will not need to consider the county census tracts for neighbours.
# Therefore, in order to deal with the missing data in the income, I will use the average of each census tracts neighbours to impute the missing data. Therefore we must load in the neighbourhood matrix for the city.
# The .rda file is expected to provide the neighbourhood matrix `nb` and the
# matching tract object `ct_Psp` used below.
load("../../../PROCESSED_DATA/SHAPEFILES/CENSUS_TRACTS/PCityNB.rda")

# Every step below is carried out per missing tract (one list element per
# entry of ind.na), so that each tract is imputed from its own neighbours. With
# the single missing tract in this data the result is the same as indexing the
# matrix directly, but the code stays correct if more tracts were missing.
ct_ind <- match(P_ctinc_15$geoid2[ind.na], ct_Psp$GEOID) # which geometries are the missings from
nb_ind <- lapply(ct_ind, function(i) which(unname(nb[i, ]))) # who are their neighbours
nb_ct <- lapply(nb_ind, function(j) match(ct_Psp$GEOID[j], ct_P.proj$GEOID)) # which geometries in the city have we selected

# Visually inspect that we have the correct census tracts
# (added to the map of missing tracts drawn previously)
plot(ct_P.proj[unlist(nb_ct), ]$geometry, col = "magenta", add = TRUE)

# Find the indices for the city with the indices of the income data frame
inc_ind <- lapply(nb_ct, function(j) match(ct_P.proj$GEOID[j], fiveyearinc_15$GEO.id2))
# How many of the selected neighbours are themselves missing an income value?
sum(is.na(as.numeric(unlist(fiveyearinc_15[unique(unlist(inc_ind)), 57]))))

# Now to impute the average of the neighbours (nb_ind is a list with one element for each ind.na)
# nb_inc extracts the average income (column 57 of the raw table) for each element of the list of neighbours for the census tracts with the missing value
nb_inc <- lapply(nb_ind, function(j) {
  as.numeric(unlist(fiveyearinc_15[match(ct_Psp$GEOID[j], fiveyearinc_15$GEO.id2), 57]))
})

# Impute the mean.
# One mean per missing tract; neighbours without an income value are ignored.
P_ctinc_15_imp <- P_ctinc_15
mean.inc <- vapply(nb_inc, mean, numeric(1), na.rm = TRUE)
P_ctinc_15_imp$inc[ind.na] <- mean.inc

# Check that we no longer have missing data for the average income.
sum(is.na(P_ctinc_15_imp))
# [1] 0

# Save
saveRDS(P_ctinc_15_imp, "P_CTInc_15_imp_proj.rds")
# P_ctinc_15 <- readRDS("P_CTInc_15_imp_proj.rds")


# # Portland: Sex and Age ---------------------------------------------------
# # We will now extract the data on the sex and age of the population within each census tract. We will first begin by considering the data available on the Sex of the population before moving on to age.
# 
# fiveyearagesex_15 <- read_csv("X:/Maths/ResearchProjects/TRSmith/TRSmith2/DATA_LAPTOP/DATA/RAW DATA/COVARIATES/COV_CT/Portland/ACSST5Y2015.S0101_2020-06-13T080302/ACSST5Y2015.S0101_data_with_overlays_2020-06-13T080259.csv")
# 
# # Sort out the census tract identification for merging, as in the LA data we had geoid2.
# geo2 <- str_extract_all(fiveyearagesex_15$GEO_ID[2:length(fiveyearagesex_15$GEO_ID)],"\\d+")
# fiveyearagesex_15$GEO.id2 <- fiveyearagesex_15$GEO_ID
# fiveyearagesex_15$GEO.id2[1] <- "id2"
# fiveyearagesex_15$GEO.id2[2:length(fiveyearagesex_15$GEO_ID)] <- map_chr(geo2,2)
# 
# # Sex
# # Extract the data for the census tracts of interest
# P_ctsex_15_0 <- fiveyearagesex_15[fiveyearagesex_15$GEO.id2%in%ct_P.proj$GEOID,]
# 
# # Assign the relevant (UTM) coordinates
# P_ctsex_15_0$y <- ct_P.proj[ct_P.proj$GEOID%in%P_ctsex_15_0$GEO.id2,]$y
# P_ctsex_15_0$x <- ct_P.proj[ct_P.proj$GEOID%in%P_ctsex_15_0$GEO.id2,]$x
# 
# # Take the following rows:
# # - total pop (margin of error), which according to the metadata is given by S0101_C01_001E(M) and is in slot 13 and 14;
# # - total male pop (margin of error), which according to the metadata is given by S0101_C02_001E(M) and is in slot 85 and 86;
# # - total female pop (margin of error), which according to the metadata is given by S0101_C03_001E(M) and is in slot 157 and 158;
# # - while lat and lon are in slot 220 and 221
# P_ctsex_15 <- P_ctsex_15_0[,c(1,219,2,13,14,85,86,157,158,220,221)]
# colnames(P_ctsex_15) <- c("geoid","geoid2","name","totp","totperr","mtotp","mtotperr","ftotp","ftotperr","y","x")
# P_ctsex_15[,4:9] <- sapply(P_ctsex_15[,4:9],as.numeric)
# 
# # Calculate the proportions
# P_ctsex_15$mprop <- P_ctsex_15$mtotp/P_ctsex_15$totp
# P_ctsex_15$fprop <- P_ctsex_15$ftotp/P_ctsex_15$totp
# 
# # Missing data: this could be due to the calculation of the proportions where the denominator for the total population is 0.
# # NAs produced for the proportions where the total population is 0, 1 CT: let us compare and check.
# sum(P_ctsex_15$totp==0)
# # [1] 1
# sum(P_ctsex_15$mtotp==0)
# # [1] 1
# sum(P_ctsex_15$ftotp==0)
# # [1] 1
# sum(is.na(P_ctsex_15$mprop))
# # [1] 1
# sum(is.na(P_ctsex_15$fprop))
# 
# which(P_ctsex_15$totp==0)
# # [1] 151
# which(P_ctsex_15$mtotp==0)
# # [1] 151
# which(P_ctsex_15$ftotp==0)
# # [1] 151
# which(is.na(P_ctsex_15$mprop))
# # [1] 151
# which(is.na(P_ctsex_15$fprop))
# 
# # Okay, so with no population set the values for these missing data to 0.
# P_ctsex_15$mprop[which(P_ctsex_15$totp==0)] <- 0
# P_ctsex_15$fprop[which(P_ctsex_15$totp==0)] <- 0
# 
# # Check we have removed all missing data
# sum(is.na(P_ctsex_15))
# # [1] 0
# 
# # Save
# saveRDS(P_ctsex_15,"P_CTSex_15_proj.rds")
# # P_ctsex_15 <- readRDS("P_CTSex_15_proj.rds")
# 
# 
# # Age
# # Extract the data for the census tracts of interest
# P_ctage_15_0 <- fiveyearagesex_15[fiveyearagesex_15$GEO.id2%in%ct_P.proj$GEOID,]
# 
# # Assign the relevant (UTM) coordinates
# P_ctage_15_0$y <- ct_P.proj[ct_P.proj$GEOID%in%P_ctage_15_0$GEO.id2,]$y
# P_ctage_15_0$x <- ct_P.proj[ct_P.proj$GEOID%in%P_ctage_15_0$GEO.id2,]$x
# 
# # Take the following rows:
# # Unlike above, where only a small number of columns was required, we want to automate the collection. There are 18 age groups in 5 year increments <5,5-9,10-14,...,70-74,75-79,80-84,>85 with the labelling as follows:
# # S0101_C*_**E(M)
# # where
# #     01 - TOTAL
# # * = 02 - MALE
# #     03 - FEMALE
# #      ( 002 - < 5
# #      ( 003 - 5-9 
# #      ( 004 - 10-14
# # ** =  (..
# #      ( 017 - 75-79
# #      ( 018 - 80-84
# #      ( 019 - > 85
# # - while lat and lon are in slot 220 and 221 as for the previous data set
# 
# # Generate the column names (as described in the above comment) for the necessary groups for the ages
# sex_grp <- paste0("S0101_C0",rep(1:3,each=1))
# # ,rep(c("_EST_","_MOE_"),3))
# age_grp <- paste0("_00",rep(c(1,2,3,4,5,6,7,8,9),each=2),rep(c("E","M"),9))
# age_grp <- append(age_grp,paste0("_0",rep(10:19,each=2),rep(c("E","M"),10)))
# comb_grp <- paste0(rep(sex_grp,length(age_grp)),rep(age_grp,each=length(sex_grp)))
# grp_sel <- append(c("GEO_ID","GEO.id2","NAME"),comb_grp)
# grp_sel <- grp_sel %>% append(c("y","x"))
# 
# # Extract necessary columns
# P_ctage_15 <- P_ctage_15_0[,grp_sel]
# 
# # For new labels follow the previous code to select the groups so then the labels so follow the correct order
# newlab_sex <- paste0(rep(c("tot","m","f"),2),rep(c("","err"),each=3)) # compare newlab_sex with sex_grp - yes tot, m, f est and then tot, m, f moe
# newlab_age <- "tot"
# newlab_age <- newlab_age %>% append(paste0(seq(0,84,by=5),"-",seq(4,84,by=5))) %>% append("85-")
# comb_lab <- paste0(rep(newlab_sex,length(newlab_age)),rep(newlab_age,each=length(newlab_sex)))
# grp_names <- c("geoid","geoid2","name")
# grp_names <- grp_names %>% append(comb_lab) %>% append(c("y","x"))
# colnames(P_ctage_15) <- grp_names
# P_ctage_15[,4:(ncol(P_ctage_15)-2)] <- sapply(P_ctage_15[,4:(ncol(P_ctage_15)-2)],as.numeric)
# 
# 
# # Check the total population for all ages and for each sex
# sum(P_ctage_15$tottot==0)
# # [1] 1
# sum(P_ctage_15$mtot==0)
# # [1] 1
# sum(P_ctage_15$ftot==0)
# # [1] 1
# which(P_ctage_15$tottot==0)
# # [1] 151
# which(P_ctage_15$mtot==0)
# # [1] 151
# which(P_ctage_15$ftot==0)
# # [1] 151
# # Check if missing proportion census tracts are for 0 females
# fage <- grep("f",colnames(P_ctage_15))
# for (i in 1:length(fage)){
#   sum(is.na(P_ctage_15[,fage[i]]))
#   print(sum(which(is.na(P_ctage_15[,fage[i]]))%in%which(P_ctage_15$ftot==0)))
# }
# 
# # Check if missing proportion census tracts are for 0 males
# sum(P_ctage_15$mtot==0)
# mage <- grep("m",colnames(P_ctage_15))
# for (i in 1:length(mage)){
#   sum(is.na(P_ctage_15[,mage[i]]))
#   print(sum(which(is.na(P_ctage_15[,mage[i]]))%in%which(P_ctage_15$mtot==0)))
# }
# mage <- mage[-1] # first one is just "name" column, don't want to alter that
# 
# # The for loops show that the 5 0s for both males and female match the locations of the NAs in the proportions
# # Except, of course for the first couple of elements of fage and mage because they are the TOTALs and so need to remove these (after we have already removed the first element of mage above).
# frep <- fage[-c(1,2)]
# mrep <- mage[-c(1,2)]
# 
# # Set the necessary proportions to 0.
# ind.f <- which(P_ctage_15$ftot==0)
# ind.m <- which(P_ctage_15$mtot==0)
# P_ctage_15[ind.f,frep] <- 0
# P_ctage_15[ind.m,mrep] <- 0
# 
# # Now do the same for the total of each section
# sum(P_ctage_15$tottot==0)
# tage <- grep("tot",colnames(P_ctage_15))
# for (i in 1:length(mage)){
#   sum(is.na(P_ctage_15[,tage[i]]))
#   print(sum(which(is.na(P_ctage_15[,tage[i]]))%in%which(P_ctage_15$tottot==0)))
# }
# tage <- tage[-c(1:6)] # first 1-6 are main total columns, not where NAs are
# 
# ind.t <- which(P_ctage_15$tottot==0)
# P_ctage_15[ind.t,tage] <- 0
# 
# # Check we've removed all missing data
# sum(is.na(P_ctage_15))
# # [1] 0
# 
# # Save
# saveRDS(P_ctage_15,"P_CTAge_15_proj.rds")
# # P_ctage_15 <- readRDS("P_CTAge_15_proj.rds")
# 
# 
# 
# # Portland: Home Occupier -------------------------------------------------
# # We will now extract the necessary home occupier data (property owned or rented?) and check for missing values before saving the final data set.
# 
# fiveyearocc_15 <- read_csv("./ACSDT5Y2015.B25003_2020-06-13T081653/ACSDT5Y2015.B25003_data_with_overlays_2020-06-13T081649.csv")
# 
# # Sort out the census tract identification for merging, as in the LA data we had geoid2.
# geo2 <- str_extract_all(fiveyearocc_15$GEO_ID[2:length(fiveyearocc_15$GEO_ID)],"\\d+")
# fiveyearocc_15$GEO.id2 <- fiveyearocc_15$GEO_ID
# fiveyearocc_15$GEO.id2[1] <- "id2"
# fiveyearocc_15$GEO.id2[2:length(fiveyearocc_15$GEO_ID)] <- map_chr(geo2,2)
# 
# # Extract the data for the census tracts of interest
# P_ctocc_15_0 <- fiveyearocc_15[fiveyearocc_15$GEO.id2%in%ct_P.proj$GEOID,]
# 
# # Assign the relevant (UTM) coordinates
# P_ctocc_15_0$y <- ct_P.proj[ct_P.proj$GEOID%in%P_ctocc_15_0$GEO.id2,]$y
# P_ctocc_15_0$x <- ct_P.proj[ct_P.proj$GEOID%in%P_ctocc_15_0$GEO.id2,]$x
# 
# # Take the total homes and their error as well as number of those owned and rented to calculate the proportions, while lat and lon are in slot 10 and 11
# P_ctocc_15 <- P_ctocc_15_0[,c(1,9,2,3,4,5,6,7,8,10,11)] # Want all the columns!!
# colnames(P_ctocc_15) <- c("geoid","geoid2","name","tot","err","totown","errown","totrent","errrent","y","x")
# P_ctocc_15[,4:9] <- sapply(P_ctocc_15[,4:9],as.numeric)
# 
# # Calculate proportions
# P_ctocc_15$ownprop <- P_ctocc_15$totown/P_ctocc_15$tot
# P_ctocc_15$rentprop <- P_ctocc_15$totrent/P_ctocc_15$tot
# 
# # NA checks
# sum(P_ctocc_15$tot==0)
# # [1] 1
# sum(P_ctocc_15$totown==0)
# # [1] 1
# sum(P_ctocc_15$totrent==0)
# # [1] 1
# sum(is.na(P_ctocc_15$ownprop))
# # [1] 1
# sum(is.na(P_ctocc_15$rentprop))
# # [1] 1
# which(is.na(P_ctocc_15$ownprop))
# # [1] 151
# which(is.na(P_ctocc_15$rentprop))
# # [1] 151
# 
# # Set values to 0
# P_ctocc_15$ownprop[which(P_ctocc_15$tot==0)] <- 0
# P_ctocc_15$rentprop[which(P_ctocc_15$tot==0)] <- 0
# 
# # Double check all missing data has been accounted for
# sum(is.na(P_ctocc_15))
# # [1] 0
# 
# # Save
# saveRDS(P_ctocc_15,"P_CTOcc_15_proj.rds")
# # P_ctocc_15 <- readRDS("P_CTOcc_15_proj.rds")
# 
# 
# # Portland: Food Stamps ---------------------------------------------------
# # We will now extract the necessary food stamp (SNAPs) data, which we are interested in as a proxy for poverty, and check for missing values before saving the final data set.
# 
# # Food Stamps (as a proxy for poverty)
# fiveyearfood_15 <- read_csv("./ACSST5Y2015.S2201_2020-06-13T081424/ACSST5Y2015.S2201_data_with_overlays_2020-06-13T081420.csv")
# 
# # Sort out the census tract identification for merging, as in the LA data we had geoid2.
# geo2 <- str_extract_all(fiveyearfood_15$GEO_ID[2:length(fiveyearfood_15$GEO_ID)],"\\d+")
# fiveyearfood_15$GEO.id2 <- fiveyearfood_15$GEO_ID
# fiveyearfood_15$GEO.id2[1] <- "id2"
# fiveyearfood_15$GEO.id2[2:length(fiveyearfood_15$GEO_ID)] <- map_chr(geo2,2)
# 
# # Extract the data for the census tracts of interest
# P_ctfood_15_0 <- fiveyearfood_15[fiveyearfood_15$GEO.id2%in%ct_P.proj$GEOID,]
# 
# # Assign the relevant (UTM) coordinates
# P_ctfood_15_0$y <- ct_P.proj[ct_P.proj$GEOID%in%P_ctfood_15_0$GEO.id2,]$y
# P_ctfood_15_0$x <- ct_P.proj[ct_P.proj$GEOID%in%P_ctfood_15_0$GEO.id2,]$x
# 
# # Take the total households and their error (columns 3 and 4-"S2201_C01_001E") as well as the percentage and total number of those that receive food stamps and their error (231 and 232-"S2201_C04_001E", 155 and 156), while lat and lon are in slot 460 and 461
# P_ctfood_15 <- P_ctfood_15_0[,c(1,459,2,3,4,155,156,231,232,460,461)] # Want all the columns!!
# colnames(P_ctfood_15) <- c("geoid","geoid2","name","tot","err","totrec","errrec","proprec","errproprec","y","x")
# P_ctfood_15$tot <- as.numeric(P_ctfood_15$tot)
# P_ctfood_15$err <- as.numeric(P_ctfood_15$err)
# P_ctfood_15$totrec <- as.numeric(P_ctfood_15$totrec)
# P_ctfood_15$errrec <- as.numeric(P_ctfood_15$errrec)
# P_ctfood_15$proprec <- as.numeric(P_ctfood_15$proprec)
# P_ctfood_15$errproprec <- as.numeric(P_ctfood_15$errproprec)
# 
# # NA checks
# sum(is.na(P_ctfood_15$totrec))
# # [1] 0
# sum(is.na(P_ctfood_15$proprec))
# # [1] 1
# sum(P_ctfood_15$tot==0)
# # [1] 1
# sum(P_ctfood_15$totrec==0)
# # [1] 1
# sum(is.na(P_ctfood_15$errproprec))
# # [1] 1
# which(P_ctfood_15$tot==0)
# # [1] 151
# which(P_ctfood_15$totrec==0)
# # [1] 151
# which(is.na(P_ctfood_15$proprec))
# # [1] 151
# which(is.na(P_ctfood_15$errproprec))
# # [1] 151
# 
# # Set these to 0
# P_ctfood_15$proprec[which(P_ctfood_15$tot==0)] <- 0
# P_ctfood_15$errproprec[which(P_ctfood_15$tot==0)] <- 0
# 
# # Double check all of the missing data has been dealt with
# sum(is.na(P_ctfood_15))
# # [1] 0
# 
# # Save
# saveRDS(P_ctfood_15,"P_CTFood_15_proj.rds")
# # P_ctfood_15 <- readRDS("P_CTFood_15_proj.rds")
# 
# 
# 

# Portland: Re-set WD -----------------------------------------------------

library("rstudioapi")
# Return to the directory holding this source file now that the Portland data
# has been processed: either call setwd() on that location by hand, or run the
# following, which asks rstudioapi for the path of the active document.
setwd(dirname(rstudioapi::getActiveDocumentContext()[["path"]]))


# sessionInfo() -----------------------------------------------------------

# Record the R version and the packages in use for this run.
sessionInfo()
