
# Census Tract Average Income Imputation Including Total Households -------

# In this R script we approach the average income missing data slightly differently for use within out final data set for modelling. Here we take into account the estimate of the total households in the data provided by the ACS, if there are no households we set the value of the average income to 0. If there is still missing data, then we impute the missing values as we have done for the average income variable in CountDataGen_final.R which is for use within the Grid-Mesh Optimisation method.
# The output data will then be saved in DATA/PROCESSED_DATA/COVARIATES.

# Note: the data from the ACS may be updated, especially with respect to the Los Angeles 2015, which was originally accessed through the American FactFinder (which was later retired), and so newly accessed data may have different column set-ups, more like the New York and Portland data, for reference.

# Author: Nadeen Khaleel


# Setwd and Load Libraries ------------------------------------------------

library("rstudioapi")
# Either setwd() to the source file location, or run the following:
setwd(dirname(getActiveDocumentContext()$path))

library(dplyr)
library(readr)
library(revgeo)
library(sf)
library(lwgeom)
library(stringr)
library(purrr)
library(sp)


# Los Angeles -------------------------------------------------------------
# We need to create the average income data set where we take into account the estimate of total households.

# Los Angeles: Set-Up -----------------------------------------------------
# Change the working directory as we want to save the data in separate folders for each city. We then load the census tract data produced in DATA/RAW_DATA/SHAPEFILES/CENSUS_TRACTS and project these to UTM coordinates. These census tracts are saved in the DATA/PROCESSED_DATA/SHAPEFILES/CENSUS_TRACTS directory and we will access them through this file path.

# LA Data
setwd("./LA")

# LA Census Tracts
load("../../../PROCESSED_DATA/SHAPEFILES/CENSUS_TRACTS/LACityCT.rda")

# Project to UTM
ct_LA.proj <- lwgeom::st_transform_proj(ct_LA,"epsg:32611")
latlon <- sapply(1:length(ct_LA.proj$geometry),function(i){return(as.numeric(st_centroid(ct_LA.proj$geometry[[i]])))}) # returns a 2xlength(b$geometry) matrix, where each column is for each geometry and row one is longitude and row two is latitude

ct_LA.proj$x <- latlon[1,]
ct_LA.proj$y <- latlon[2,]


# Los Angeles: Average Income (Imputed-Models) ----------------------------
# We will now extract the average income data. It is important to note that the results from the following code assigns missing average income data for a census tract a value of zero if there is an estimated zero total households therefore total income of zero. This differs from the output of CovDataGen_final.R where all missing data was treated the same. This version of the imputed average income is intended for use within the models.

fiveyearinc_15 <- read_csv("./ACS_15_5YR_S1902/ACS_15_5YR_S1902_with_ann.csv")

# Extract the data for the census tracts of interest
LA_ctinc_15_0 <- fiveyearinc_15[fiveyearinc_15$GEO.id2%in%ct_LA.proj$GEOID,]

# Assign the relevant (UTM) coordinates
LA_ctinc_15_0$y <- ct_LA.proj[ct_LA.proj$GEOID%in%LA_ctinc_15_0$GEO.id2,]$y
LA_ctinc_15_0$x <- ct_LA.proj[ct_LA.proj$GEOID%in%LA_ctinc_15_0$GEO.id2,]$x

# Take the mean income and margin of error, which according to the metadata is given by HCO2_EST/MOE_VCO2 and is in slot 6 and 7, while lat and lon are in slot 112 and 113
LA_ctinc_15 <- LA_ctinc_15_0[,c(1,2,3,4,6,7,112,113)]
colnames(LA_ctinc_15) <- c("geoid","geoid2","name","total.inc","inc","err","y","x")
LA_ctinc_15$inc <- as.numeric(LA_ctinc_15$inc)
LA_ctinc_15$total.inc <- as.numeric(LA_ctinc_15$total.inc)

# We have missing data
sum(is.na(LA_ctinc_15$inc))
# [1] 10

# Assign those with zero estimated total households a zero value.
LA_ctinc_15$inc[LA_ctinc_15$total.inc==0] <- 0
sum(is.na(LA_ctinc_15$inc))
# [1] 4
# There is still some missing data, so still need to impute.

# Which census tracts have the missing data?
ind.na <- which(is.na(LA_ctinc_15$inc))
plot(ct_LA.proj$geometry,axes=T,main="Census Tracts with Missing Income Data")
ind.na.ct <- match(LA_ctinc_15$geoid2[ind.na],ct_LA.proj$GEOID)
plot(ct_LA.proj[ind.na.ct,]$geometry,add=T,col="red")

# Load the LA County census tract to plot and get a visual representation of the census tracts with the missing data.
load("../../../PROCESSED_DATA/SHAPEFILES/CENSUS_TRACTS/LACountyCT.rda")
ct_LA_County.proj <- lwgeom::st_transform_proj(ct_LA_County,"epsg:32611")
ct_LA_Countysp.proj <- as(ct_LA_County.proj,"Spatial")

plot(ct_LA_County.proj$geometry,border="red")
plot(ct_LA.proj[ct_LA.proj$GEOID%in%LA_ctinc_15$geoid2[ind.na],]$geometry,col="red",add=T)
plot(ct_LA.proj$geometry,add=T,border="blue")

# In order to deal with the missing data in the income, I will use the average of each census tracts neighbours to impute the missing data. Therefore we must load in the neighbourhood matrix for Los Angeles county.
load("../../../PROCESSED_DATA/SHAPEFILES/CENSUS_TRACTS/LACountyNB.rda")
ct_ind <- match(LA_ctinc_15$geoid2[ind.na],ct_LA_Countysp$GEOID) # which geometries are the missings from
nb_ind <- sapply(1:length(ct_ind),function(i){which(unname(nb[ct_ind[i],]))}) # who are their neighbours
nb_ct <- sapply(1:length(nb_ind),function(i){match(ct_LA_Countysp$GEOID[nb_ind[[i]]],ct_LA_County.proj$GEOID)}) # which geometries on the county scale have we selected

# Visually inspect that we have the correct census tracts
plot(ct_LA_County.proj[unlist(nb_ct),]$geometry,col="magenta",add=T)

# Find the indices for the county with the indices of the income data frame
inc_ind <- sapply(1:length(nb_ct),function(i){match(ct_LA_County.proj$GEOID[nb_ct[[i]]],fiveyearinc_15$GEO.id2)})
sum(is.na(as.numeric(unlist(fiveyearinc_15[unique(unlist(inc_ind)),57]))))

# Extract the indices for census tracts with NAs for the average income in the list elements of neighbours for each of the missing data census tracts
cty_na_ind <- sapply(1:length(inc_ind),function(i){inc_ind[[i]][is.na(as.numeric(unlist(fiveyearinc_15[inc_ind[[i]],6])))]})
# Extract the indices within the county data
cty_na_ct <- sapply(1:length(cty_na_ind),function(i){match(fiveyearinc_15$GEO.id2[cty_na_ind[[i]]],ct_LA_County.proj$GEOID)})

# These are just to visually inspect missing data between the county and city.
plot(ct_LA_County.proj[unlist(cty_na_ct),]$geometry,col="yellow",border="blue",add=T) # which are NA in county data
plot(ct_LA.proj[ct_LA.proj$GEOID%in%LA_ctinc_15$geoid2[ind.na],]$geometry,col="red",border="yellow",add=T) # but they are the ones also in the missing data within the city..

# Now to impute the average of the neighbours (recall nb_ind is a list for each ind.na, but some are neighbours of others)
# nb_inc extracts the average income for each element of the list of neighbours for each of the 10 census tracts within missing data.
nb_inc <- sapply(1:length(nb_ind),function(i){as.numeric(unlist(fiveyearinc_15[match(ct_LA_Countysp$GEOID[nb_ind[[i]]],fiveyearinc_15$GEO.id2),6]))})

# Impute the mean, making sure to ignore any missing data.
LA_ctinc_15_imp <- LA_ctinc_15
mean.inc <- lapply(nb_inc,mean,na.rm=T)
LA_ctinc_15_imp$inc[ind.na] <- unlist(mean.inc)

# Check that we no longer have missing data for the average income.
sum(is.na(LA_ctinc_15))
# [1] 10
sum(is.na(LA_ctinc_15_imp))
# [1] 0

# Save
saveRDS(LA_ctinc_15_imp,"LA_CTInc_15_0imp_proj.rds")
# LA_ctinc_15 <- readRDS("LA_CTInc_15_0imp_proj.rds")


# Los Angeles: Re-set WD --------------------------------------------------

library("rstudioapi")
# Either setwd() to the source file location, or run the following:
setwd(dirname(getActiveDocumentContext()$path))


# New York City -----------------------------------------------------------
# We need to create the average income data set where we take into account the estimate of total households.

# New York: Set-Up --------------------------------------------------------
# Change the working directory as we want to save the data in separate folders for each city. We then load the census tract data produced in DATA/RAW_DATA/SHAPEFILES/CENSUS_TRACTS and project these to UTM coordinates. These census tracts are saved in the DATA/PROCESSED_DATA/SHAPEFILES/CENSUS_TRACTS directory and we will access them through this file path.

# NYC Data
setwd("./NYC")

# NYC Census Tracts
load("../../../PROCESSED_DATA/SHAPEFILES/CENSUS_TRACTS/NYCityCT.rda")

# Project to UTM
ct_NY.proj <- lwgeom::st_transform_proj(ct_NY,"epsg:32618")
latlon <- sapply(1:length(ct_NY.proj$geometry),function(i){return(as.numeric(st_centroid(ct_NY.proj$geometry[[i]])))}) # returns a 2xlength(b$geometry) matrix, where each column is for each geometry and row one is longitude and row two is latitude


ct_NY.proj$x <- latlon[1,]
ct_NY.proj$y <- latlon[2,]


# New York: Average Income (Imputed-Models) -------------------------------
# We will now extract the average income data. It is important to note that the results from the following code assigns missing average income data for a census tract a value of zero if there is an estimated zero total households therefore total income of zero. This differs from the output of CovDataGen_final.R where all missing data was treated the same. This version of the imputed average income is intended for use within the models.

fiveyearinc_15 <- read_csv("./ACSST5Y2015.S1902_2020-06-15T083620/ACSST5Y2015.S1902_data_with_overlays_2020-06-15T083605.csv")

# Sort out the census tract identification for merging, as in the LA data we had geoid2.
geo2 <- str_extract_all(fiveyearinc_15$GEO_ID[2:length(fiveyearinc_15$GEO_ID)],"\\d+")
fiveyearinc_15$GEO.id2 <- fiveyearinc_15$GEO_ID
fiveyearinc_15$GEO.id2[1] <- "id2"
fiveyearinc_15$GEO.id2[2:length(fiveyearinc_15$GEO_ID)] <- map_chr(geo2,2)

# Extract the data for the census tracts of interest
NY_ctinc_15_0 <- fiveyearinc_15[fiveyearinc_15$GEO.id2%in%ct_NY.proj$GEOID,]

# Extract the data for the census tracts of interest
NY_ctinc_15_0$y <- ct_NY.proj[ct_NY.proj$GEOID%in%NY_ctinc_15_0$GEO.id2,]$y
NY_ctinc_15_0$x <- ct_NY.proj[ct_NY.proj$GEOID%in%NY_ctinc_15_0$GEO.id2,]$x
# Take the mean income and margin of error, which according to the metadata is given by S1902_C01_001E/S1902_C01_001M and is in slot 57 and 58, while lat and lon are in slot 112 and 113
NY_ctinc_15 <- NY_ctinc_15_0[,c(1,111,2,3,57,58,112,113)]
colnames(NY_ctinc_15) <- c("geoid","geoid2","name","total.inc","inc","err","y","x")
NY_ctinc_15$inc <- as.numeric(NY_ctinc_15$inc)
NY_ctinc_15$total.inc <- as.numeric(NY_ctinc_15$total.inc)

# We have missing data
sum(is.na(NY_ctinc_15$inc))
# [1] 54

# Assign those with zero estimated total households a zero value.
NY_ctinc_15$inc[NY_ctinc_15$total.inc==0] <- 0
sum(is.na(NY_ctinc_15$inc))
# [1] 10
# There is still some missing data, so still need to impute.

ind.na <- which(is.na(NY_ctinc_15$inc))
plot(ct_NY.proj$geometry,axes=T,main="Census Tracts with Missing Income Data")
ind.na.ct <- match(NY_ctinc_15$geoid2[ind.na],ct_NY.proj$GEOID)
plot(ct_NY.proj[ind.na.ct,]$geometry,col="red",add=T)

# Load the NYC County census tract to plot and get a visual representation of the census tracts with the missing data.
load("../../../PROCESSED_DATA/SHAPEFILES/CENSUS_TRACTS/NYCCountyCT.rda")
ct_NY_County.proj <- lwgeom::st_transform_proj(ct_NY_County,"epsg:32618")

plot(ct_NY_County.proj$geometry,border="red")
plot(ct_NY.proj[ct_NY.proj$GEOID%in%NY_ctinc_15$geoid2[ind.na],]$geometry,col="red",add=T)
plot(ct_NY.proj$geometry,add=T,border="blue")

# In order to deal with the missing data in the income, I will use the average of each census tracts neighbours to impute the missing data. Therefore we must load in the neighbourhood matrix for counties containing the city.
load("../../../PROCESSED_DATA/SHAPEFILES/CENSUS_TRACTS/NYCountyNB.rda")
ct_ind <- match(NY_ctinc_15$geoid2[ind.na],ct_NY_Countysp$GEOID) # which geometries are the missings from
nb_ind <- sapply(1:length(ct_ind),function(i){which(unname(nb[ct_ind[i],]))}) # who are their neighbours
nb_ct <- sapply(1:length(nb_ind),function(i){match(ct_NY_Countysp$GEOID[nb_ind[[i]]],ct_NY_County.proj$GEOID)}) # which geometries on the county scale have we selected

# Visually inspect that we have the correct census tracts
plot(ct_NY_County.proj[unlist(nb_ct),]$geometry,col="magenta",add=T)

# Find the indices for the county with the indices of the income data frame
inc_ind <- sapply(1:length(nb_ct),function(i){match(ct_NY_County.proj$GEOID[nb_ct[[i]]],fiveyearinc_15$GEO.id2)})
sum(is.na(as.numeric(unlist(fiveyearinc_15[unique(unlist(inc_ind)),57]))))

# Extract the indices for census tracts with NAs for the average income in the list elements of neighbours for each of the missing data census tracts
cty_na_ind <- sapply(1:length(inc_ind),function(i){inc_ind[[i]][is.na(as.numeric(unlist(fiveyearinc_15[inc_ind[[i]],57])))]})
# Extract the indices within the county data
cty_na_ct <- sapply(1:length(cty_na_ind),function(i){match(fiveyearinc_15$GEO.id2[cty_na_ind[[i]]],ct_NY_County.proj$GEOID)})

# These are just to visually inspect missing data between the county and city.
plot(ct_NY_County.proj[unlist(cty_na_ct),]$geometry,col="yellow",border="blue",add=T) # which are NA in county data
plot(ct_NY.proj[ct_NY.proj$GEOID%in%NY_ctinc_15$geoid2[ind.na],]$geometry,col="red",border="yellow",add=T)

# Now to impute the average of the neighbours (recall nb_ind is a list for each ind.na)
# nb_inc extracts the average income for each element of the list of neighbours for each of the census tracts with missing data 
nb_inc <- sapply(1:length(nb_ind),function(i){as.numeric(unlist(fiveyearinc_15[match(ct_NY_Countysp$GEOID[nb_ind[[i]]],fiveyearinc_15$GEO.id2),57]))})

# Impute the mean, making sure to ignore any missing data.
NY_ctinc_15_imp <- NY_ctinc_15
mean.inc <- lapply(nb_inc,mean,na.rm=T)
NY_ctinc_15_imp$inc[ind.na] <- unlist(mean.inc)

# Check that we no longer have missing data for the average income.
sum(is.na(NY_ctinc_15_imp))
# [1] 0

# Save
saveRDS(NY_ctinc_15_imp,"NY_CTInc_15_0imp_proj.rds")
# NY_ctinc_15 <- readRDS("NY_CTInc_15_0imp_proj.rds")

# New York: Re-set WD -----------------------------------------------------

library("rstudioapi")
# Either setwd() to the source file location, or run the following:
setwd(dirname(getActiveDocumentContext()$path))


# Portland ----------------------------------------------------------------
# We need to create the average income data set where we take into account the estimate of total households.

# Portland: Set-Up --------------------------------------------------------
# Change the working directory as we want to save the data in separate folders for each city. We then load the census tract data produced in DATA/RAW_DATA/SHAPEFILES/CENSUS_TRACTS and project these to UTM coordinates. These census tracts are saved in the DATA/PROCESSED_DATA/SHAPEFILES/CENSUS_TRACTS directory and we will access them through this file path.

# Portland Data
setwd("./Portland")

# Portland Census Tracts
load("../../../PROCESSED_DATA/SHAPEFILES/CENSUS_TRACTS/PCityCT.rda")

# Project to UTM
ct_P.proj <- lwgeom::st_transform_proj(ct_P,"epsg:32610")
latlon <- sapply(1:length(ct_P.proj$geometry),function(i){return(as.numeric(st_centroid(ct_P.proj$geometry[[i]])))}) # returns a 2xlength(b$geometry) matrix, where each column is for each geometry and row one is longitude and row two is latitude

ct_P.proj$x <- latlon[1,]
ct_P.proj$y <- latlon[2,]


# Portland: Average Income (Imputed-Models) -------------------------------
# We will now extract the average income data. It is important to note that the results from the following code assigns missing average income data for a census tract a value of zero if there is an estimated zero total households therefore total income of zero. This differs from the output of CovDataGen_final.R where all missing data was treated the same. This version of the imputed average income is intended for use within the models.

fiveyearinc_15 <- read_csv("./ACSST5Y2015.S1902_2020-06-13T081245/ACSST5Y2015.S1902_data_with_overlays_2020-06-13T081241.csv")

# Sort out the census tract identification for merging, as in the LA data we had geoid2.
geo2 <- str_extract_all(fiveyearinc_15$GEO_ID[2:length(fiveyearinc_15$GEO_ID)],"\\d+")
fiveyearinc_15$GEO.id2 <- fiveyearinc_15$GEO_ID
fiveyearinc_15$GEO.id2[1] <- "id2"
fiveyearinc_15$GEO.id2[2:length(fiveyearinc_15$GEO_ID)] <- map_chr(geo2,2)

# Extract the data for the census tracts of interest
P_ctinc_15_0 <- fiveyearinc_15[fiveyearinc_15$GEO.id2%in%ct_P.proj$GEOID,]

# Assign the relevant (UTM) coordinates
P_ctinc_15_0$y <- ct_P.proj[ct_P.proj$GEOID%in%P_ctinc_15_0$GEO.id2,]$y
P_ctinc_15_0$x <- ct_P.proj[ct_P.proj$GEOID%in%P_ctinc_15_0$GEO.id2,]$x

# Take the mean income and margin of error, which according to the metadata is given by S1902_C01_001E/S1902_C01_001M and is in slot 57 and 58, while lat and lon are in slot 112 and 113
P_ctinc_15 <- P_ctinc_15_0[,c(1,111,2,3,57,58,112,113)]
colnames(P_ctinc_15) <- c("geoid","geoid2","name","total.inc","inc","err","y","x")
P_ctinc_15$inc <- as.numeric(P_ctinc_15$inc)
P_ctinc_15$total.inc <- as.numeric(P_ctinc_15$total.inc)

# There is some missing data - for a single census tract to be specific.
sum(is.na(P_ctinc_15$inc))
# [1] 1

# Assign those with zero estimated total households a zero value.
P_ctinc_15$inc[P_ctinc_15$total.inc==0] <- 0
sum(is.na(P_ctinc_15$inc))
# [1] 0
# We no longer have any missing data within the average income data, so we will not need to impute anything, we can just go on to save the data set.

P_ctinc_15_imp <- P_ctinc_15

# Save
saveRDS(P_ctinc_15_imp,"P_CTInc_15_0imp_proj.rds")
# P_ctinc_15 <- readRDS("P_CTInc_15_0imp_proj.rds")


# Portland: Re-set WD -----------------------------------------------------

library("rstudioapi")
# Either setwd() to the source file location, or run the following:
setwd(dirname(getActiveDocumentContext()$path))


# sessionInfo() -----------------------------------------------------------

sessionInfo()    
