# Simulate Data for Testing Multivariate INLA within MCMC -----------------

# This R script contains the code used to simulate the multiple data sets for different scenarios across two study regions in order to assess the behavioUr of the Multivariate INLA within MCMC under these different situations.
# Window 1: will only simulate one data set, with no post-processing the data.
# Window 2: will simulate multiple data sets, one un-altered data set and then a sparse data set and data sets with different covariate effects than those in Window 1.

# Author: Nadeen Khaleel

# Set Working Directory and Load Libraries --------------------------------

# Either setwd() to the source file location, or run the following:
library("rstudioapi")
setwd(dirname(getActiveDocumentContext()$path))

library(INLA)
library(INLABMA)
library(spatstat)
library(sp)
library(raster)
library(maptools)
library(purrr)
library(stringr)

# Data Simulation ---------------------------------------------------------
# Want to simulate two data sets on regular polygons, which are disjoint, so say
# W_1 = [0,2.5]x[0,2.5] and W_2 = [2.5,5]x[2.5,5]
# Simulate a data set once and save and load for testing the functions.

# Covariates
cov.gen <- function(W,seed,save.file){
  
  set.seed(seed)
  
  reg.poly <- W$type=="rectangle" # True if rectangular window/false if different polygon.

  # Points at which to simulate the data.
  nx <- 250; ny <- 250;
  # Create (fine) regular grid over the window in order to be able to simulate the covariate. Want to use intersect function to make sure that the overlayed grid points retained lie within the boundary, this is not needed for the rectangular grid, the function gridcentres should work fine there.
  W.gridc <- gridcentres(W, nx, ny)
  # However, we can use the function (in spatstat), inside.owin to create a logical vector for which points lie inside/outside the grid.
  keep.c <- inside.owin(W.gridc$x,W.gridc$y,W)

  W.gridc$x <- W.gridc$x[keep.c]; W.gridc$y <- W.gridc$y[keep.c];


  # Simulate the covariate data (and save covariates too!)
  max.y <- max(W.gridc$y)
  cov1.df <- data.frame(x=W.gridc$x,y=W.gridc$y)
  cov1.df$val <- W.gridc$y/max.y + rnorm(length(W.gridc$x),mean = 0,sd=0.25)

  max.norm <- sqrt(max(W.gridc$x)^2+max(W.gridc$y)^2)
  cov2.df <- data.frame(x=W.gridc$x,y=W.gridc$y)
  cov2.df$val <- sqrt(W.gridc$x^2+W.gridc$y^2)/max.norm + rnorm(length(W.gridc$x),mean = 0,sd = 0.25)

  int.df <- data.frame(x=W.gridc$x,y=W.gridc$y,val=rep(1,length(W.gridc$x)))

  cov1.ras <- rasterFromXYZ(cov1.df)
  cov2.ras <- rasterFromXYZ(cov2.df)
  int.ras <- rasterFromXYZ(int.df,digits = 10)

  cov1.im <- as.im.RasterLayer(cov1.ras)
  cov2.im <- as.im.RasterLayer(cov2.ras)
  int.im <- as.im.RasterLayer(int.ras)

  save(cov1.ras,cov2.ras,int.ras,cov1.im,cov2.im,int.im,file=save.file)

  return(list("cov1.ras"=cov1.ras,"cov2.ras"=cov2.ras,"int.ras"=int.ras,"cov1.im"=cov1.im,"cov2.im"=cov2.im,"int.im"=int.im))
}

#
# Meshes
mesh.gen <- function(W,disc,save.file){
    M <- disc; N <- disc
    cellsize <- c((W$xrange[2]-W$xrange[1])/M,(W$yrange[2]-W$yrange[1])/N)
    
    g <- quadrats(W,M,N)
    g.sp <- as(g,"SpatialPolygons")
    ord <- sapply(1:length(g.sp),function(ii){unlist(centroid.owin(g.sp[ii]))})
    
    m <- matrix(as.numeric(unlist(str_extract_all(names(g.sp),"\\d*\\d"))),ncol=2,byrow=TRUE)
    ord.df <- data.frame(ind=1:g$n,row=m[,1],col=m[,2]) # col matches the raster definition of col=x, and row=y which makes sense visually from the plots as x<--> and y ^inc(^)
    ord.df <- ord.df[order(ord.df$col),]
    
    x.ord <- ord[1,ord.df$ind]; y.ord <- ord[2,ord.df$ind]
    
    df <- data.frame(x=x.ord,y=y.ord)

    coords <- df[,c("x","y")]
    coordinates(coords) <- ~ x + y
    boundary <- as(W,"SpatialPolygons") # For the meshes
    
    mesh <- inla.mesh.2d(loc=coords, boundary=boundary, max.edge=c(max(cellsize), max(cellsize)+0.5), min.angle=c(30, 21),
                         max.n=c(48000, 16000), ## Safeguard against large meshes.
                         max.n.strict=c(128000, 128000), ## Don't build a huge mesh!
                         cutoff=0.01, ## Filter away adjacent points.
                         offset=c(0.1, 1)) ## Offset for extra boundaries, if needed.

  save(mesh,file=save.file)
  
  return("mesh"=mesh)
}

# Data Set
data.gen <- function(W=owin(c(0,1),c(0,1)),theta,cov.name,disc=25,seed,sparse=FALSE,percent.keep=50,save.file){

  # Load Covariates
  load(cov.name)

  # Create Mean of GP
  mu <- theta[[1]]*int.im + theta[[2]]*cov1.im + theta[[3]]*cov2.im
  
  # Simulate LGCP
  set.seed(seed)
  lgcp.ppp <- rLGCP(model="matern",mu,var=(theta[[4]])^2,scale=theta[[5]]/2,nu=1,win = W) # Note: default resolution of the simulated data is 128x128, if this is too coarse for the data aggregations (if one of the data aggregation requires a grid finer than 128x128), then also set dimyx=c(ny,nx).

  if (sparse==TRUE){ # uniformly keep a particular percentage of points
    loc.df <- data.frame(x=lgcp.ppp$x,y=lgcp.ppp$y)
    perc <- percent.keep/100
    n.keep <- ceiling(perc*lgcp.ppp$n)
    samp.keep <- sample(lgcp.ppp$n,n.keep)
    loc.keep <- loc.df[samp.keep,]
    lgcp.ppp <- as.ppp(loc.keep,W)
  }

  # Aggregate to count data
  M <- disc; N <- disc
  g <- quadrats(W,M,N)
  g.sp <- as(g,"SpatialPolygons")
  ord <- sapply(1:length(g.sp),function(ii){unlist(centroid.owin(g.sp[ii]))})
  a <- unname(rgeos::gArea(g.sp,byid = T))
  
  m <- matrix(as.numeric(unlist(str_extract_all(names(g.sp),"\\d*\\d"))),ncol=2,byrow=TRUE)
  ord.df <- data.frame(ind=1:g$n,row=m[,1],col=m[,2]) # col matches the raster definition of col=x, and row=y which makes sense visually from the plots as x<--> and y ^inc(^)
  ord.df <- ord.df[order(ord.df$col),]
  
  x.ord <- ord[1,ord.df$ind]; y.ord <- ord[2,ord.df$ind]
  
  cell.area <- a[ord.df$ind]
  
  q <- quadratcount(lgcp.ppp,tess=g)
  count.df <- data.frame(x=x.ord,y=y.ord,count=as.vector(unname(q)),area=cell.area) # no need to re-order q, for regular polygons, appears to go down y for fixed x.
  
  count.df$cov1 <- raster::extract(cov1.ras,g.sp,weights=T,fun=mean,method="simple")[ord.df$ind]
  count.df$cov2 <- raster::extract(cov2.ras,g.sp,weights=T,fun=mean,method="simple")[ord.df$ind]

  save(count.df,file=save.file)
  
  return("count.df"=count.df)
}



# Compare LA and Portland Data --------------------------------------------

# Compare LA and Portland Crime Incidence Counts - these are not the final-final versions of the data sets, rather those used in the Grid-Mesh simulation checks, and are using the finest scale possible, but this is mostly to check the ratio of counts for the different crimes in LA and Portland and the quick models are just to give some ideas for parameter values. However, the results for the final-final data can be found in the results of the GLM models, and are very similar to these results. Outputs at the time of running this code can be found in the file "LAPortlandOldDataModelComp.txt" in this directory.
#
# # load("../../DATA/PROCESSED_DATA/CRIME/COUNT_DATA_FINAL/LA/LA2015CT236359CountData_proj.rda") # check file path
# # la_hom <- hom_countdf
# # la_gta <- gta_countdf
# # load("../../DATA/PROCESSED_DATA/CRIME/COUNT_DATA_FINAL/Portland/P2015CT190129CountData_proj.rda") # check file path
# # p_hom <- hom_countdf
# # p_gta <- gta_countdf
# # sum(la_hom$hom)
# # sum(la_gta$gta)
# # sum(p_hom$hom)
# # sum(p_gta$gta)
# # sum(p_hom$hom)/sum(la_hom$hom)
# # sum(p_hom$hom)/sum(la_hom$hom)*100
# # sum(p_gta$gta)/sum(la_gta$gta)
# # sum(p_gta$gta)/sum(la_gta$gta)*100
# 
# # la_hom$zpop <- (la_hom$pop - mean(la_hom$pop))/sd(la_hom$pop)
# # la_hom$zinc <- (la_hom$inc - mean(la_hom$inc))/sd(la_hom$inc)
# # la_gta$zpop <- (la_gta$pop - mean(la_gta$pop))/sd(la_gta$pop)
# # la_gta$zinc <- (la_gta$inc - mean(la_gta$inc))/sd(la_gta$inc)
# # 
# # la_hom.fit <- inla(hom ~ 1 + zpop + zinc + offset(log(area/1e8)),data=la_hom,family="poisson")
# # # la_hom.fit$summary.fixed
# # la_gta.fit <- inla(gta ~ 1 + zpop + zinc + offset(log(area/1e8)),data=la_gta,family="poisson")
# # # la_gta.fit$summary.fixed
# # 
# # p_gta$zpop <- (p_gta$pop - mean(p_gta$pop))/sd(p_gta$pop)
# # p_gta$zinc <- (p_gta$inc - mean(p_gta$inc))/sd(p_gta$inc)
# # 
# # p_gta.fit <- inla(gta ~ 1 + zpop + zinc + offset(log(area/1e8)),data=p_gta,family="poisson")
# # # p_gta.fit$summary.fixed
# 




# Set-Up Simulated Data ---------------------------------------------------

# Set-Up Windows

W_1 <- owin(c(0,2.5),c(0,2.5)) # this will always have 100% data
W_2 <- owin(c(2.5,5),c(2.5,5)) # this will have 100% or 20% data and will also have different covariates


# Set-Up Covariates
cov.name_1 <- "Window1Covariates.rda"
cov.name_2 <- "Window2Covariates.rda"

seed.cov_1 <- 50
seed.cov_2 <- 1250
cov_1 <- cov.gen(W_1,seed=seed.cov_1,save.file=cov.name_1)
cov_2 <- cov.gen(W_2,seed=seed.cov_2,save.file=cov.name_2)

# Set-Up Count Data

# Test Parameter Values
# #
# theta.s <- list(beta0=2,beta1=2,beta2=-2,sigma=2,rho=0.75)
# theta.d <- list(beta0=2,beta1=2,beta2=-1,sigma=2,rho=0.75)
# 
# # Testing LGCP numbers
# mu.s <- theta.s[[1]]*cov_2$int.im + theta.s[[2]]*cov_2$cov1.im + theta.s[[3]]*cov_2$cov2.im
# mu.d <- theta.d[[1]]*cov_2$int.im + theta.d[[2]]*cov_2$cov1.im + theta.d[[3]]*cov_2$cov2.im
# 
# quantile(mu.s)
# # 0%       25%       50%       75%      100% 
# # -1.192775  1.491434  1.986505  2.484174  4.824086 
# quantile(mu.d)
# # 0%       25%       50%       75%      100% 
# # 0.2616252 2.3423639 2.7447304 3.1483677 5.1961587 
# 
# n.s <- rep(NA,50)
# n.d <- rep(NA,50)
# for (i in 1:50){
#   lgcp.ppp.s <- rLGCP(model="matern",mu.s,var=(theta.s[[4]])^2,scale=theta.s[[5]]/2,nu=1,win = W_2)
#   lgcp.ppp.d <- rLGCP(model="matern",mu.d,var=(theta.d[[4]])^2,scale=theta.d[[5]]/2,nu=1,win = W_2)
#   n.s[i] <- lgcp.ppp.s$n
#   n.d[i] <- lgcp.ppp.d$n
# }
# 
# mean(n.s)
# # [1] 603.48
# mean(n.d)
# # [1] 831.92
# quantile(n.s)
# # 0%     25%     50%     75%    100% 
# # 96.00  221.50  340.00  769.75 3114.00 
# quantile(n.d)
# # 0%    25%    50%    75%   100% 
# # 77.0  358.0  540.5 1039.5 5304.0 

disc <- 25 # what discretisation? 2.5/M,2.5/N (N=M)

theta.s <- list(beta0=2,beta1=2,beta2=-2,sigma=1,rho=0.75)
theta.d <- list(beta0=2,beta1=2,beta2=-1,sigma=1,rho=0.75)
theta.d2 <- list(beta0=2,beta1=2,beta2=2,sigma=1,rho=0.75)

data.name_1 <- "Window1FullData.rda"
data.name_2 <- "Window2FullData.rda"
data.name_3 <- "Window2SparseData.rda"
data.name_4 <- "Window2DifferentBeta2Data.rda"
# data.name_5 <- "Window2DifferentBeta2SparseData.rda" # while this data set was created it was never used or modelled and so I will keep the relevant code for interest, but just commented out.
data.name_6 <- "Window2DifferentSignBeta2Data.rda"

seed_1_100 <- 125
seed_2_100 <- 625
seed_3_100 <- 3250
seed_4_100 <- 15625

data_1_100 <- data.gen(W = W_1, theta=theta.s,cov.name=cov.name_1,disc=disc,seed=seed_1_100,sparse=FALSE,percent.keep=NULL,save.file=data.name_1)
data_2_100 <- data.gen(W = W_2, theta=theta.s,cov.name=cov.name_2,disc=disc,seed=seed_2_100,sparse=FALSE,percent.keep=NULL,save.file=data.name_2)
data_2_20 <- data.gen(W = W_2, theta=theta.s,cov.name=cov.name_2,disc=disc,seed=seed_2_100,sparse=TRUE,percent.keep=20,save.file=data.name_3)
data_3_100 <- data.gen(W = W_2, theta=theta.d,cov.name=cov.name_2,disc=disc,seed=seed_3_100,sparse=FALSE,percent.keep=NULL,save.file=data.name_4)
# data_3_20 <- data.gen(W = W_2, theta=theta.d,cov.name=cov.name_2,disc=disc,seed=seed_3_100,sparse=TRUE,percent.keep=20,save.file=data.name_5)
data_4_100 <-  data.gen(W = W_2, theta=theta.d2,cov.name=cov.name_2,disc=disc,seed=seed_4_100,sparse=FALSE,percent.keep=NULL,save.file=data.name_6)

# Set-Up Meshes

mesh.name_1 <- "MeshWindow1.rda"
mesh.name_2 <- "MeshWindow2.rda"

mesh_1 <- mesh.gen(W=W_1,disc=disc,save.file=mesh.name_1)
mesh_2 <- mesh.gen(W=W_2,disc=disc,save.file=mesh.name_2)



# sessionInfo() -----------------------------------------------------------

sessionInfo()
