
# Assessing Timings SBC Error Re-Runs for LA Polygon ----------------------

# Here I want to consider the processes that were re-run, either for known timing issues, potential timing issues or space issues.
# If there are re-runs which ran for more than 12 hours, we want to remove the outputs and place them as ERRORS, for consistency as any runs that I caught that extended beyond the 12 hour limit were manually stopped due to this.
# Any other timing errors that could not be re-run and were stopped before the 12 hour time limit still retained their errors. Therefore, for consistency, if any of the re-runs completed after the 12 hour time limit, we will remove the outputs for this particular run.

# Author: Nadeen Khaleel

# Either setwd() to the source file location, or run the following:
library("rstudioapi")
setwd(dirname(getActiveDocumentContext()$path))

library(stringr)


# Check Run Times and Highlight Long Runs ---------------------------------

procs <- 1:40

# Find the processors with warnings that resulted from errors.
warn.count.vec <- rep(0,40)
for (i in 1:40){
  load(paste0("../GridMeshIrregPolLGCPSBCSS",i,".rda"))
  for (j in 1:4){
    for (l in 1:4){
      warn.count.vec[i] <- warn.count.vec[i] + sum(!is.na(run.out[[j]][[l]]$mess.ls$warning))
    }
  }  
}

print("procs with warnings:")
warn.procs <- which(warn.count.vec>0)

warn.procs

warn.df.list <- vector(mode="list",length=length(warn.procs))
names(warn.df.list) <- paste0("Process_",warn.procs)

regexp <- "[[:digit:]]+"
for (i in 1:length(warn.procs)){
  proc <- warn.procs[i]
  load(paste0("../GridMeshIrregPolLGCPSBCSS",proc,".rda"))
  
  warn.df <- data.frame(sim=numeric(),grid=numeric(),mesh=numeric(),timing_mins=numeric(),warn.time=numeric())
  
  warn.count <- 0
  for (j in 1:4){
    for (l in 1:4){
      s <- which(str_detect(run.out[[j]][[l]]$mess.ls$warning,"ERROR"))
      if (length(s)>0){
        warn.df[(warn.count+1):(warn.count+length(s)),1] <- s
        warn.df[(warn.count+1):(warn.count+length(s)),2] <- rep(j,length(s))
        warn.df[(warn.count+1):(warn.count+length(s)),3] <- rep(l,length(s))
        warn.df[(warn.count+1):(warn.count+length(s)),4] <- run.out[[j]][[l]]$run.df$time[s]/60
        warn.df[(warn.count+1):(warn.count+length(s)),5] <- str_extract(run.out[[j]][[l]]$mess.ls$warning[s],regexp)
      }
      warn.count <- warn.count + length(s)
    }
  }
  warn.df$warn.time[is.na(warn.df$warn.time)] <- 1
  
  warn.df <- warn.df[order(warn.df$sim),]
  
  warn.df.list[[i]] <- warn.df
  
}


warn.time.full <- cbind(data.frame("Process"=rep(names(warn.df.list)[1],nrow(warn.df.list[[1]]))),warn.df.list[[1]])
for (i in 2:length(warn.df.list)){
  warn.time.full <- rbind(warn.time.full,cbind(data.frame("Process"=rep(names(warn.df.list)[i],nrow(warn.df.list[[i]]))),warn.df.list[[i]]))
}

# Let us see which runs took less than 3 hours - for interest
less3hr <- which(warn.time.full$timing_mins<3*60)

warn.time.full[less3hr,]

more12hr <- which(warn.time.full$timing_mins>=12*60)

long.runs <- warn.time.full[more12hr,]

long.runs
# Process sim grid mesh timing_mins warn.time
# 16  Process_10  20    4    4    742.4530         1
# 111 Process_18   9    4    1    768.0719         1
# 118 Process_33  16    4    1    723.2593         1

save(long.runs,file="longrunstimeerror.rda") # Once saved, just to make sure it's not saved over, comment out.


# Re-Set Long Runs (>12hrs) -----------------------------------------------
# Runs for the time error re-runs taking over 12 hours were manually stopped if caught, however some were not caught. Therefore, for consistency we want to ensure that no results from runs that took over 12 hours are kept as they should have been stopped. Therefore we will manually replace the results with the necessary NAs and place an error in the original place.
# I have changed their names to *_FULL.rda

load("longrunstimeerror.rda")

regexp <- "[[:digit:]]+"
procs <- as.numeric(str_extract(long.runs$Process,regexp))


for (ii in 1:length(procs)){
  total.nodes = 20 # how many nodes am I using?
  Nprocs.vec = rep(2,20) # vector where each element contains the number of processors for a particular node
  Nprocs.total = 40 # total number of processors across ALL nodes (/jobs)
  N = 1000 # how many simulations?
  L = 100
  sim = 0 # start new or re-starting at last saved simulation?
  
  procs <- as.numeric(str_extract(long.runs$Process,regexp))
  print(procs)
  
  paste("# ",1:length(procs),sep="",collapse=" ")
  k.ind <- ii
  k <- procs[k.ind]
  this.node <- k%/%2 + as.numeric(k%%2!=0)
  
  sink(paste0("ResettingTimingError",k,".txt"))
  
  old.file <- paste0("../GridMeshIrregPolLGCPSBCSS",k,"_FULL.rda")
  load(old.file)
  run.out.old <- run.out
  rm(run.out)
  gm.old <- gm
  rm(gm)
  
  
  # Saving the output
  save.file <- paste0("../GridMeshIrregPolLGCPSBCSS",k,".rda")
  print(save.file)
  
  N.g <- 4; N.m <- 4
  
  p.rep <- long.runs$sim[ii]
  grid.rep <- long.runs$grid[ii]
  mesh.rep <- long.runs$mesh[ii]
  
  print(p.rep)
  print(grid.rep)
  print(mesh.rep)
  
  # if (stop.val==0){
  print("Pre-change mess.ls")
  print(run.out.old[[grid.rep]][[mesh.rep]]$mess.ls$error[p.rep])
  print(run.out.old[[grid.rep]][[mesh.rep]]$mess.ls$warning[p.rep])
  print(run.out.old[[grid.rep]][[mesh.rep]]$mess.ls$FFT[p.rep])
  print(run.out.old[[grid.rep]][[mesh.rep]]$mess.ls$message[[p.rep]])
  print("Pre-change est.df")
  print(run.out.old[[grid.rep]][[mesh.rep]]$est.df[p.rep,])
  print("Pre-change run.df")
  print(run.out.old[[grid.rep]][[mesh.rep]]$run.df$time[p.rep]/3600)
  print(run.out.old[[grid.rep]][[mesh.rep]]$run.df$waic[p.rep])
  print(run.out.old[[grid.rep]][[mesh.rep]]$run.df$dic[p.rep])
  
  ind <- (grid.rep-1)*4 + mesh.rep
  print(sum(is.na(gm.old[[ind]]$ranks.param[p.rep,])))
  print(sum(is.na(gm.old[[ind]]$ranks.mf[p.rep,])))
  
  print("Sum FFT not NA")
  print(sum(!is.na(run.out.old[[grid.rep]][[mesh.rep]]$mess.ls$FFT)))
  print("est.df for p.rep not NA")
  print(sum(!is.na(run.out.old[[grid.rep]][[mesh.rep]]$est.df[p.rep,])))
  print("gm NA")
  iind <- (grid.rep-1)*N.m + mesh.rep
  print(sum(!is.na(gm.old[[iind]]$ranks.param[p.rep,])))
  print(sum(!is.na(gm.old[[iind]]$ranks.mf[p.rep,])))
  
  print("FFT NAs")
  print(sum(is.na(run.out.old[[grid.rep]][[mesh.rep]]$mess.ls$FFT)))
  print("est.df for p.rep NA")
  print(sum(is.na(run.out.old[[grid.rep]][[mesh.rep]]$est.df[p.rep,])))
  
  print("mess.ls")
  print("Error")
  print(run.out.old[[grid.rep]][[mesh.rep]]$mess.ls$error[p.rep])
  print("Warning")
  print(run.out.old[[grid.rep]][[mesh.rep]]$mess.ls$warning[p.rep])
  print("FFT")
  print(run.out.old[[grid.rep]][[mesh.rep]]$mess.ls$FFT[p.rep])
  print("Message")
  print(run.out.old[[grid.rep]][[mesh.rep]]$mess.ls$message[[p.rep]])
  
  run.out.old[[grid.rep]][[mesh.rep]]$mess.ls$error[p.rep] <- run.out.old[[grid.rep]][[mesh.rep]]$mess.ls$warning[p.rep] #
  run.out.old[[grid.rep]][[mesh.rep]]$mess.ls$warning[p.rep] <- NA
  run.out.old[[grid.rep]][[mesh.rep]]$mess.ls$message[p.rep] <- paste0("This combination of grid and mesh for this simulation has been rerun with 16 processors, however it still took longer than 12 hours (",long.runs$timing_mins[ii]," mins) to run and so we will not include the output and replace the error and remove the warning and outputs, see *_FULL.rda for any results that may have been produced here.")
  run.out.old[[grid.rep]][[mesh.rep]]$mess.ls$FFT[p.rep] <- NA
  
  run.out.old[[grid.rep]][[mesh.rep]]$est.df[p.rep,] <- rep(NA,length(run.out.old[[grid.rep]][[mesh.rep]]$est.df[p.rep,]))
  
  # The below was used previously, and resulted in 25 NAs for each of the elements below in the final data set
  # run.out.old[[grid.rep]][[mesh.rep]]$run.df$time <- NA
  # run.out.old[[grid.rep]][[mesh.rep]]$run.df$waic <- NA
  # run.out.old[[grid.rep]][[mesh.rep]]$run.df$dic <- NA
  run.out.old[[grid.rep]][[mesh.rep]]$run.df$time[p.rep] <- NA
  run.out.old[[grid.rep]][[mesh.rep]]$run.df$waic[p.rep] <- NA
  run.out.old[[grid.rep]][[mesh.rep]]$run.df$dic[p.rep] <- NA
  
  print("mess.ls")
  print("Error")
  print(run.out.old[[grid.rep]][[mesh.rep]]$mess.ls$error[p.rep])
  print("Warning")
  print(run.out.old[[grid.rep]][[mesh.rep]]$mess.ls$warning[p.rep])
  print("FFT")
  print(run.out.old[[grid.rep]][[mesh.rep]]$mess.ls$FFT[p.rep])
  print("Message")
  print(run.out.old[[grid.rep]][[mesh.rep]]$mess.ls$message[[p.rep]])
  
  ind <- (grid.rep-1)*4 + mesh.rep
  gm.old[[ind]]$ranks.param[p.rep,] <- rep(NA,length(gm.old[[ind]]$ranks.param[p.rep,]))
  gm.old[[ind]]$ranks.mf[p.rep,] <- rep(NA,length(gm.old[[ind]]$ranks.mf[p.rep,]))
  
  print("Post-change mess.ls")
  print(run.out.old[[grid.rep]][[mesh.rep]]$mess.ls$error[p.rep])
  print(run.out.old[[grid.rep]][[mesh.rep]]$mess.ls$warning[p.rep])
  print(run.out.old[[grid.rep]][[mesh.rep]]$mess.ls$FFT[p.rep])
  print(run.out.old[[grid.rep]][[mesh.rep]]$mess.ls$message[[p.rep]])
  print("Post-change est.df")
  print(run.out.old[[grid.rep]][[mesh.rep]]$est.df[p.rep,])
  print("Post-change run.df")
  print(run.out.old[[grid.rep]][[mesh.rep]]$run.df$time[p.rep]/3600)
  print(run.out.old[[grid.rep]][[mesh.rep]]$run.df$waic[p.rep])
  print(run.out.old[[grid.rep]][[mesh.rep]]$run.df$dic[p.rep])
  
  
  ind <- (grid.rep-1)*4 + mesh.rep
  print(sum(is.na(gm.old[[ind]]$ranks.param[p.rep,])))
  print(sum(is.na(gm.old[[ind]]$ranks.mf[p.rep,])))
  
  run.out <- run.out.old
  gm <- gm.old
  
  save(run.out,gm,true.theta,data.err.tracker,seed.vec,file=save.file)
  
  sink()
  
}

sink()
rm(list=ls())
# }



# Checking Re-Set for Long Runs -------------------------------------------


load("longrunstimeerror.rda")
regexp <- regexp <- "[[:digit:]]+"
procs <- as.numeric(str_extract(long.runs$Process,regexp))


sink("DoubleCheckingTimingErrorResetOverRun.txt")
for (ii in 1:length(procs)){
  
  k.ind <- ii
  k <- procs[k.ind]
  this.node <- k%/%2 + as.numeric(k%%2!=0)
  
  print(paste0("Process ",k))
  
  
  old.file <- paste0("../GridMeshIrregPolLGCPSBCSS",k,"_FULL.rda")
  load(old.file)
  run.out.old <- run.out
  rm(run.out)
  gm.old <- gm
  rm(gm)
  true.theta.old <- true.theta
  rm(true.theta)
  seed.vec.old <- seed.vec
  rm(seed.vec)
  data.err.tracker.old <- data.err.tracker
  rm(data.err.tracker)
  
  save.file <- paste0("../GridMeshIrregPolLGCPSBCSS",k,".rda")
  load(save.file)
  
  N.g <- 4; N.m <- 4
  
  nn <- dim(gm[[N.g*N.m]]$ranks.mf)[2]
  p.length <- sum(!is.na(gm[[N.g*N.m]]$ranks.mf[,nn])) + sum(!is.na(run.out[[N.g]][[N.m]]$mess.ls$error)) + 1 # no ranks in trad ss
  
  gm.s <- function(g){sapply(1:length(g),function(i){sum(!is.na(g[[i]]$ranks.mf[,dim(g[[i]]$ranks.mf)[2]]))})}
  s <- matrix(gm.s(gm),nrow=N.m) # fills in down the columns, so for each grid, fills in row i with mesh i, following the output from e.s below
  e.s <- function(g){sapply(1:length(g),function(i){sum(!is.na(g[[i]]$mess.ls$error))})}
  s <- s + sapply(1:length(run.out),function(i){e.s(run.out[[i]])})
  ds <- diff(s)
  if (sum(ds)!=0){
    w <- which(ds!=0,arr.ind = T)
    grid.start.ind <- unname(w)[2]
    mesh.start.ind <- unname(w)[1] + 1
  } else if (sum(diff(t(s))!=0)){
    w <- which(diff(t(s))!=0,arr.ind=TRUE) # should be easy to extract the common row, then +1 to get the required GRID that needs to begin running...
    grid.start.ind <- unname(w)[1,1] + 1
    mesh.start.ind <- unname(w)[1,2]
  } else {
    grid.start.ind <- 1
    mesh.start.ind <- 1
  }
  
  
  print("Names gm")
  print(names(gm.old))
  print(names(gm))
  
  print(sum(names(gm.old)==names(gm)))
  print(sum(names(gm.old)!=names(gm)))
  
  
  for (i in 1:4){
    for (j in 1:4){
      print("Length Message")
      lm <- length(run.out[[i]][[j]]$mess.ls$message)
      print(lm)
      print("Null Messages Counts")
      print(sum(sapply(1:lm,function(l){is.null(run.out.old[[i]][[j]]$mess.ls$message[[l]])})))
      print(sum(sapply(1:lm,function(l){is.null(run.out[[i]][[j]]$mess.ls$message[[l]])})))
    }
  }
  
  for (i in 1:4){
    for (j in 1:4){
      print("Length FFT")
      lm <- length(run.out[[i]][[j]]$mess.ls$FFT)
      print(lm)
      print("Length Error")
      lm2 <- length(run.out[[i]][[j]]$mess.ls$error)
      print(lm2)
      print("FFT Count")
      print(sum((run.out.old[[i]][[j]]$mess.ls$FFT),na.rm=TRUE))
      print(sum((run.out[[i]][[j]]$mess.ls$FFT),na.rm=TRUE))
      print("Warnings Present")
      print(sum(!is.na(run.out.old[[i]][[j]]$mess.ls$warning)))
      print(sum(!is.na(run.out[[i]][[j]]$mess.ls$warning)))
      print("Errors Present")
      print(sum(!is.na(run.out.old[[i]][[j]]$mess.ls$error)))
      print(sum(!is.na(run.out[[i]][[j]]$mess.ls$error)))
    }
  }
  
  ######
  # CHECKS ####
  
  if (grid.start.ind==1&mesh.start.ind==1){ # for time error addition
    p.check <- p.length - 1
  } else {
    p.check <- p.length
  }
  
  print("Compare true.theta and seed.vec")
  print(sum(true.theta[1:p.check,]-true.theta.old[1:p.check,]))
  print(max(abs(true.theta[1:p.check,]-true.theta.old[1:p.check,])))
  print(sum(seed.vec[1:p.check]-seed.vec.old[1:p.check]))
  print(max(abs(seed.vec[1:p.check]-seed.vec.old[1:p.check])))
  
  print("data.err.tracker comparison")
  print(data.err.tracker)
  print(data.err.tracker.old)
  
  print("Sum and Max-Abs Different and sum is.na in est.df")
  for (jj in 1:4){
    for (ll in 1:4){
      if (jj <= grid.start.ind&ll<(mesh.start.ind-1)){
        replacement.max <- p.length
      } else {
        replacement.max <- p.length - 1
      }
      
      print(sum(run.out[[jj]][[ll]]$est.df[1:replacement.max,] - run.out.old[[jj]][[ll]]$est.df[1:replacement.max,],na.rm=TRUE))
      print(max(abs(run.out[[jj]][[ll]]$est.df[1:replacement.max,] - run.out.old[[jj]][[ll]]$est.df[1:replacement.max,])))
      
      print(sum(is.na(run.out.old[[jj]][[ll]]$est.df)))
      print(sum(is.na(run.out[[jj]][[ll]]$est.df)))
      
      print(sum(!is.na(run.out.old[[jj]][[ll]]$est.df)))
      print(sum(!is.na(run.out[[jj]][[ll]]$est.df)))
    }
  }
  
  print("Sum and Max-Abs difference in FFT")
  for (jj in 1:4){
    for (ll in 1:4){
      if (jj <= grid.start.ind&ll<(mesh.start.ind-1)){
        replacement.max <- p.length
      } else {
        replacement.max <- p.length - 1
      }
      
      print(sum(run.out[[jj]][[ll]]$mess.ls$FFT[1:replacement.max] - run.out.old[[jj]][[ll]]$mess.ls$FFT[1:replacement.max],na.rm=TRUE))
      print(max(abs(run.out[[jj]][[ll]]$mess.ls$FFT[1:replacement.max] - run.out.old[[jj]][[ll]]$mess.ls$FFT[1:replacement.max])))
      
      print(sum(is.na(run.out.old[[jj]][[ll]]$mess.ls$FFT)))
      print(sum(is.na(run.out[[jj]][[ll]]$mess.ls$FFT)))
      
      print(sum(!is.na(run.out.old[[jj]][[ll]]$mess.ls$FFT)))
      print(sum(!is.na(run.out[[jj]][[ll]]$mess.ls$FFT)))
    }
  }
  
  N.g <- 4; N.m <- 4
  
  print("Sum and Max-Abs difference in ranks.param")
  for (jj in 1:4){
    for (ll in 1:4){
      if (jj <= grid.start.ind&ll<(mesh.start.ind-1)){
        replacement.max <- p.length
      } else {
        replacement.max <- p.length - 1
      }
      ind <- (jj-1)*N.m + ll
      
      print(sum(gm[[ind]]$ranks.param[1:replacement.max,] - gm.old[[ind]]$ranks.param[1:replacement.max,],na.rm=TRUE))
      print(max(abs(gm[[ind]]$ranks.param[1:replacement.max,] - gm.old[[ind]]$ranks.param[1:replacement.max,])))
      
      print(sum(is.na(gm.old[[ind]]$ranks.param)))
      print(sum(is.na(gm[[ind]]$ranks.param)))
      
      print(sum(!is.na(gm.old[[ind]]$ranks.param)))
      print(sum(!is.na(gm[[ind]]$ranks.param)))
    }
  }
  
  print("Sum and Max-Abs difference in ranks.mf")
  for (jj in 1:4){
    for (ll in 1:4){
      if (jj <= grid.start.ind&ll<(mesh.start.ind-1)){
        replacement.max <- p.length
      } else {
        replacement.max <- p.length - 1
      }
      ind <- (jj-1)*N.m + ll
      
      print(sum(gm[[ind]]$ranks.mf[1:replacement.max,] - gm.old[[ind]]$ranks.mf[1:replacement.max,],na.rm=TRUE))
      print(max(abs(gm[[ind]]$ranks.mf[1:replacement.max,] - gm.old[[ind]]$ranks.mf[1:replacement.max,])))
      
      print(sum(is.na(gm.old[[ind]]$ranks.mf)))
      print(sum(is.na(gm[[ind]]$ranks.mf)))
      
      print(sum(!is.na(gm.old[[ind]]$ranks.mf)))
      print(sum(!is.na(gm[[ind]]$ranks.mf)))
    }
  }
  
  for (i in 1:16){
    print(paste0("Dimension gm.old ", i))
    print(dim(gm.old[[i]]$ranks.param))
    print(dim(gm.old[[i]]$ranks.mf))
    print(paste0("Dimension gm ", i))
    print(dim(gm[[i]]$ranks.param))
    print(dim(gm[[i]]$ranks.mf))
  }
  
  
  
}

sink()

rm(list=ls())


# sessionInfo() -----------------------------------------------------------

sessionInfo()
