
# New York Crime Data -----------------------------------------------------

# This code takes in the full NYPD crime incidence data set, "NYPD_Complaint_Data_Historic", and manipulates it so that the text Location column is turned into numeric longitude and latitude columns. It also subsets and saves two data sets: one containing the location and time of homicide incidents, and the other containing the location and time of vehicle thefts, the two crimes considered in this thesis.
# Note: here we load the New York data as originally accessed, so the code to read in the data will differ depending on the user's download and file name. Additionally, because the data have been updated since the original access, column names and types may vary, and this will need to be taken into account in the code below.

# Author: Nadeen Khaleel


# Setwd and Load Libraries ------------------------------------------------

library("rstudioapi")
# Move to the directory holding this script so that the relative paths used
# below (the crime CSV, the boundary shapefile and the saved .rds files)
# resolve. getActiveDocumentContext() only works inside RStudio, so the call
# is guarded: when the script is run elsewhere, setwd() to the source file
# location manually before running the rest of the script.
if (rstudioapi::isAvailable()) {
  setwd(dirname(getActiveDocumentContext()$path))
} else {
  message("RStudio is not available: set the working directory to the source file location manually.")
}

library(readr)
library(ggplot2)
library(ggmap)
library(dplyr)
library(stringr)
library(revgeo)
library(sp)
library(sf)
library(lwgeom)

# Load in Crime Data ------------------------------------------------------

# Read the complaint records, parsing the date and time columns on import
NYPD <- read_csv(
  "NYPD_Complaint_Data_Historic.csv",
  col_types = cols(
    CMPLNT_FR_DT = col_date(format = "%m/%d/%Y"),
    CMPLNT_FR_TM = col_time(format = "%H:%M:%S"),
    CMPLNT_TO_DT = col_date(format = "%m/%d/%Y"),
    CMPLNT_TO_TM = col_time(format = "%H:%M:%S"),
    RPT_DT = col_date(format = "%m/%d/%Y")
  )
)

# Drop every record that is missing Latitude, Longitude, KY_CD or OFNS_DESC
NYPD <- NYPD[complete.cases(NYPD[, c("Latitude", "Longitude", "KY_CD", "OFNS_DESC")]), ]

# Data Manipulation -------------------------------------------------------

# Derive the occurrence date from the complaint start date, together with its
# numeric value and its year-month / year labels
NYPD[["DT_OCC"]] <- as.Date(NYPD[["CMPLNT_FR_DT"]], format = "%m/%d/%Y")
NYPD[["DT_OCC_NUM"]] <- as.numeric(NYPD[["DT_OCC"]])
# DT_OCC is already a Date, so it can be formatted directly
NYPD[["MY"]] <- format(NYPD[["DT_OCC"]], "%Y-%m")
NYPD[["Y"]] <- format(NYPD[["DT_OCC"]], "%Y")

# Pick out the offence descriptions matching homicides and stolen vehicles,
# then keep the records carrying one of those descriptions
offence_labels <- unique(NYPD$OFNS_DESC)
hom_sub <- offence_labels[grepl("HOMICIDE|MANSLAUGHTER|NEGLIGENT", offence_labels)]
gta_sub <- offence_labels[grepl("GRAND LARCENY OF MOTOR VEHICLE", offence_labels)]

NYPD_Hom <- NYPD[is.element(NYPD$OFNS_DESC, hom_sub), ]
NYPD_GTA <- NYPD[is.element(NYPD$OFNS_DESC, gta_sub), ]


# Retain points only within the city: read the borough boundary shapefile and
# project it to the CRS used for the spatial comparison below
nyc_boundary <- st_read(
  dsn = "../SHAPEFILES/BOUNDARIES/Borough Boundaries NYC/geo_export_3dcef28d-8b06-44cb-b8a3-5254b5580b2c.shp"
)
# Originally run as:
#   nyc_boundary.proj <- lwgeom::st_transform_proj(nyc_boundary,"+init=epsg:32618")
# The "+init=" prefix is no longer in use, so the plain EPSG string is passed.
nyc_boundary.proj <- lwgeom::st_transform_proj(x = nyc_boundary, crs = "epsg:32618")


# Homicides: build point geometries in the boundary's CRS, project them, and
# find which points each boundary feature covers.
# (Originally projected with "+init=epsg:32618"; the "+init=" prefix is no
# longer in use.)
nyc_hom_sf <- st_as_sf(
  x = NYPD_Hom,
  coords = c("Longitude", "Latitude"),
  crs = st_crs(nyc_boundary)
)
nyc_hom_sf.proj <- lwgeom::st_transform_proj(x = nyc_hom_sf, crs = "epsg:32618")
int_points_hom <- st_covers(x = nyc_boundary.proj, y = nyc_hom_sf.proj)

# Vehicle thefts: same steps as for homicides
nyc_gta_sf <- st_as_sf(
  x = NYPD_GTA,
  coords = c("Longitude", "Latitude"),
  crs = st_crs(nyc_boundary)
)
nyc_gta_sf.proj <- lwgeom::st_transform_proj(x = nyc_gta_sf, crs = "epsg:32618")
int_points_gta <- st_covers(x = nyc_boundary.proj, y = nyc_gta_sf.proj)

# st_covers() returns one vector of point indices per boundary feature. Pool
# all of them instead of hard-coding five features (which fails with fewer
# features and silently drops points with more), and remove repeats so that a
# point covered by more than one feature is kept only once.
keep.hom <- unique(unlist(int_points_hom))
keep.gta <- unique(unlist(int_points_gta))

# Keep only the records whose points fall within the city boundary
nyc_hom <- NYPD_Hom[keep.hom, ]
nyc_gta <- NYPD_GTA[keep.gta, ]

# Rebuild the sf point objects from the retained records
nyc_hom_sf <- st_as_sf(nyc_hom, coords = c("Longitude", "Latitude"), crs = st_crs(nyc_boundary))
nyc_gta_sf <- st_as_sf(nyc_gta, coords = c("Longitude", "Latitude"), crs = st_crs(nyc_boundary))


# Save the Data Sets ------------------------------------------------------

# Plain data-frame versions of the two filtered crime data sets
saveRDS(object = nyc_hom, file = "nyc_hom.rds")
saveRDS(object = nyc_gta, file = "nyc_gta.rds")

# sf (point geometry) versions of the same data sets
saveRDS(object = nyc_hom_sf, file = "nyc_hom_sf.rds")
saveRDS(object = nyc_gta_sf, file = "nyc_gta_sf.rds")



# SessionInfo() -----------------------------------------------------------

# Record the R version and attached packages used for this run
sessionInfo()
 