#Cleaning GNAT data for citalopram study
#Catherine Hobbs, University of Bath c.hobbs@bath.ac.uk
#24.03.2020
#RStudio Version 1.2.1335 Mac OS

#Trying to sort out Z-scores

#Packages Required
library(readxl)
library(tidyverse)
library(doBy)
library(data.table)
library(zoo)

#Setting working directory: all relative paths used below are resolved against this data folder
setwd("/Volumes/files/RDSSD/Katie Studies/Citalopram/8. Data/")

#Reading the merged, anonymised trial-level GNAT export
GNAT_trial_raw <- read_excel(path = "Raw/Cognitive Tasks/GNAT/GNAT_Merged_trial_raw_anon.xlsx")

#Note: date and time columns were removed during the anonymisation process. If running the task on a fresh export you may want to keep these or remove them (code below removes them)
#GNAT_trial_raw <- GNAT_trial_raw %>%
  #select(- date, - time, - stimulusitem1)

#Keeping only test trials (practice and training trials are dropped)
GNAT_trial_clean <- GNAT_trial_raw %>%
  filter(values.trialtype == "test")

#Stripping the "item." and then the "values." fragments out of the column names
#NOTE(review): the "." in both patterns is a regex wildcard and the patterns are unanchored, so any "item" followed by one character (e.g. "item1") is also removed - confirm this is intended
names(GNAT_trial_clean) <- gsub("values.", "", gsub("item.", "", names(GNAT_trial_clean)))

#Checking subject IDs (rows per subject, then the number of distinct subjects)
table(GNAT_trial_clean$subject)
GNAT_trial_clean %>%
  summarise(n_distinct(subject))

#Identifying if same key pressed on all trials within a block
#overall_keys = mean response code within each subject x pairing combination.
#ungroup() afterwards so the (subject, pairing) grouping is not carried into the later steps,
#where the pairing column itself is deleted
GNAT_trial_clean <- GNAT_trial_clean %>%
  group_by(subject, pairing) %>%
  mutate(overall_keys = mean(response)) %>%
  ungroup()

summary(GNAT_trial_clean$overall_keys)

#Noting blocks where all responses were gos (mean code 57) or no gos (mean code 0)
GNAT_trial_clean$exclude <- ifelse(GNAT_trial_clean$overall_keys == 0 | GNAT_trial_clean$overall_keys == 57, 1, 0)

table(GNAT_trial_clean$exclude) #1 person had all the same response

#Number of distinct subjects per exclusion flag and pairing
GNAT_trial_clean %>%
  group_by(exclude, pairing) %>%
  summarise(n_distinct(subject))

#One row per flagged subject x pairing, with the (constant) mean response code for that block
excluded_participants <- GNAT_trial_clean %>%
  filter(exclude == 1) %>%
  select(subject, pairing, response) %>%
  group_by(subject, pairing) %>%
  summarise(response = mean(response)) %>%
  ungroup()

print(excluded_participants)

###Note: participant 778f60bb21f8739dfcad392243499c233f5b8c03 excluded as they made no response on any trial in any block (i.e. refrained from pressing spacebar throughout)
GNAT_trial_clean <- GNAT_trial_clean %>%
  filter(subject != "778f60bb21f8739dfcad392243499c233f5b8c03")

#Checking no. participants remaining
length(unique(GNAT_trial_clean$subject))

#Dropping the helper columns now that the exclusion has been applied
GNAT_trial_clean$exclude <- NULL
GNAT_trial_clean$overall_keys <- NULL

#Recoding the raw key code into a Go / No Go factor (code 57 = spacebar pressed = go; anything else = no go), then dropping the raw column
GNAT_trial_clean$resp <- factor(as.numeric(GNAT_trial_clean$response == 57),
                                levels = c(0, 1),
                                labels = c("No Go", "Go"))
GNAT_trial_clean$response <- NULL

#Labelling signal variable to show which response is the correct one on that trial
GNAT_trial_clean$signal <- factor(
  GNAT_trial_clean$signal,
  levels = c(0, 1),
  labels = c("No Go Correct (Noise trial)", "Go Correct (Signal Trial)")
)

#Creating stimulus category from the pairing code (AA/AB/BA/BB; any other value becomes NA), then dropping pairing
GNAT_trial_clean$stimulus_cat <- factor(
  GNAT_trial_clean$pairing,
  levels = c("AA", "AB", "BA", "BB"),
  labels = c("Self_Pos", "Self_Neg", "Other_Pos", "Other_Neg")
)
GNAT_trial_clean$pairing <- NULL

#Generating response type from trial kind (signal/noise) and accuracy; rows matching none of the four cases are NA
GNAT_trial_clean$response_type <- with(
  GNAT_trial_clean,
  factor(
    case_when(
      signal == "Go Correct (Signal Trial)" & correct == 1 ~ "Hit",
      signal == "Go Correct (Signal Trial)" & correct == 0 ~ "Miss",
      signal == "No Go Correct (Noise trial)" & correct == 1 ~ "Correct Rejection",
      signal == "No Go Correct (Noise trial)" & correct == 0 ~ "False Alarm"
    ),
    levels = c("Hit", "Miss", "Correct Rejection", "False Alarm")
  )
)

#Creating separate 0/1 indicator variables for each response type (NA where response type is NA)
GNAT_trial_clean$hit <- as.numeric(GNAT_trial_clean$response_type == "Hit")
GNAT_trial_clean$miss <- as.numeric(GNAT_trial_clean$response_type == "Miss")
GNAT_trial_clean$correct_reject <- as.numeric(GNAT_trial_clean$response_type == "Correct Rejection")
GNAT_trial_clean$fa <- as.numeric(GNAT_trial_clean$response_type == "False Alarm")

#Re-basing trial and block numbers by subtracting the offsets (18 trials, 7 blocks) taken up by practice and training
GNAT_trial_clean$trial <- with(GNAT_trial_clean, trialnum - 18)
GNAT_trial_clean$block <- with(GNAT_trial_clean, blocknum - 7)

#Checking number of trials correct: range and count of trial numbers within each block
summarise(group_by(GNAT_trial_clean, block), min(trial), max(trial), n_distinct(trial))

#Calculating running (_cum) and total (_sum) counts of each outcome per subject and stimulus category,
#then keeping only the relevant variables
#Note: the running counts follow the current row order within each group
GNAT_trial_clean <- GNAT_trial_clean %>%
  group_by(subject, stimulus_cat) %>%
  mutate(
    hit_cum = cumsum(hit),
    hit_sum = sum(hit),
    miss_cum = cumsum(miss),
    miss_sum = sum(miss),
    correct_reject_cum = cumsum(correct_reject),
    correct_reject_sum = sum(correct_reject),
    fa_cum = cumsum(fa),
    fa_sum = sum(fa),
    correct_cum = cumsum(correct),
    correct_sum = sum(correct)
  ) %>%
  select(
    subject, block, trial, stimulus_cat, signal, resp, latency, correct, response_type,
    hit, miss, correct_reject, fa,
    hit_cum, hit_sum, miss_cum, miss_sum,
    correct_reject_cum, correct_reject_sum, fa_cum, fa_sum,
    correct_cum, correct_sum
  )

#Creating separate trial counters for go (signal) vs. no go (noise) trials for the cumulative graphs:
#hits/misses can only occur on signal trials and correct rejections/false alarms only on noise trials,
#so each cumulative proportion needs its own denominator
GNAT_trial_clean_signal <- GNAT_trial_clean %>%
  filter(signal == "Go Correct (Signal Trial)") %>%
  group_by(subject, stimulus_cat) %>%
  mutate(
    subtrial_signal = row_number(),
    subtrial_noise = NA
  )

GNAT_trial_clean_noise <- GNAT_trial_clean %>%
  filter(signal == "No Go Correct (Noise trial)") %>%
  group_by(subject, stimulus_cat) %>%
  mutate(
    subtrial_noise = row_number(),
    subtrial_signal = NA
  )

#Stacking the two halves back together (noise rows first) and restoring trial order
GNAT_trial_clean_subtrials <- rbind(GNAT_trial_clean_noise, GNAT_trial_clean_signal)
GNAT_trial_clean <- arrange(GNAT_trial_clean_subtrials, subject, stimulus_cat, trial)

#Calculating cumulative proportions (running count divided by the matching trial counter; NA where the counter does not apply)
GNAT_trial_clean$correct_prop_cum <- with(GNAT_trial_clean, correct_cum / trial)
GNAT_trial_clean$hit_prop_cum <- with(GNAT_trial_clean, hit_cum / subtrial_signal)
GNAT_trial_clean$miss_prop_cum <- with(GNAT_trial_clean, miss_cum / subtrial_signal)
GNAT_trial_clean$correct_reject_prop_cum <- with(GNAT_trial_clean, correct_reject_cum / subtrial_noise)
GNAT_trial_clean$fa_prop_cum <- with(GNAT_trial_clean, fa_cum / subtrial_noise)

#Calculating overall proportions using fixed denominators (48 for correct responses, 24 for each signal/noise outcome)
GNAT_trial_clean$correct_prop <- with(GNAT_trial_clean, correct_sum / 48)
GNAT_trial_clean$hit_prop <- with(GNAT_trial_clean, hit_sum / 24)
GNAT_trial_clean$miss_prop <- with(GNAT_trial_clean, miss_sum / 24)
GNAT_trial_clean$correct_reject_prop <- with(GNAT_trial_clean, correct_reject_sum / 24)
GNAT_trial_clean$fa_prop <- with(GNAT_trial_clean, fa_sum / 24)

#Saving the trial-level data frame
save(GNAT_trial_clean, file = "Analysis/GNAT/GNAT_trial_clean")

#Collapsing trial level data to one row per subject and stimulus category (mean of each listed variable, NAs ignored)
GNAT_trial_clean <- as.data.frame(GNAT_trial_clean)
GNAT_agg_clean <- summaryBy(
  latency + correct_prop + hit_prop + miss_prop + correct_reject_prop + fa_prop +
    correct_sum + hit_sum + miss_sum + correct_reject_sum + fa_sum ~ subject + stimulus_cat,
  FUN = c(mean), na.rm = TRUE, data = GNAT_trial_clean, keep.names = TRUE
)

#Helper: moves a proportion of exactly 0 up by 0.005 and a proportion of exactly 1 down by 0.005
#(qnorm is infinite at 0 and 1); every other value is returned unchanged
nudge_extreme_prop <- function(p) {
  ifelse(p == 0, p + 0.005, ifelse(p == 1, p - 0.005, p))
}

#Helper: z-score such that P[X > z] = p under a standard normal distribution
upper_tail_z <- function(p) {
  qnorm(p, lower.tail = FALSE)
}

#Adjusting extreme proportions ahead of the z-transform, printing before/after summaries for each measure
GNAT_agg_clean$hit_adj <- nudge_extreme_prop(GNAT_agg_clean$hit_prop)

summary(GNAT_agg_clean$hit_prop)
summary(GNAT_agg_clean$hit_adj)

GNAT_agg_clean$miss_adj <- nudge_extreme_prop(GNAT_agg_clean$miss_prop)

summary(GNAT_agg_clean$miss_prop)
summary(GNAT_agg_clean$miss_adj)

GNAT_agg_clean$correct_reject_adj <- nudge_extreme_prop(GNAT_agg_clean$correct_reject_prop)

summary(GNAT_agg_clean$correct_reject_prop)
summary(GNAT_agg_clean$correct_reject_adj)

GNAT_agg_clean$fa_adj <- nudge_extreme_prop(GNAT_agg_clean$fa_prop)

summary(GNAT_agg_clean$fa_prop)
summary(GNAT_agg_clean$fa_adj)

#Calculating z-scores with probability P[X > x]
GNAT_agg_clean$z_Hits <- upper_tail_z(GNAT_agg_clean$hit_adj)
GNAT_agg_clean$z_Misses <- upper_tail_z(GNAT_agg_clean$miss_adj)
GNAT_agg_clean$z_Correct_Reject <- upper_tail_z(GNAT_agg_clean$correct_reject_adj)
GNAT_agg_clean$z_FA <- upper_tail_z(GNAT_agg_clean$fa_adj)

#Creating d'. Because the z-scores above are upper-tail quantiles, z_FA - z_Hits is the same quantity as
#the lower-tail z(hit rate) - z(false alarm rate).
#Larger absolute values of d' mean that a person is more sensitive to the difference between the Signal Present
#and Signal Absent distributions. d' values near zero indicate chance performance.
GNAT_agg_clean$d <- with(GNAT_agg_clean, z_FA - z_Hits)

#Creating a wide dataset: one row per subject, one column per measure x stimulus category (named <measure>_<category>)
setDT(GNAT_agg_clean)
GNAT_agg_wide <- dcast(
  GNAT_agg_clean,
  subject ~ stimulus_cat,
  value.var = c(
    'latency', 'correct_prop', 'hit_prop', 'miss_prop', 'correct_reject_prop', 'fa_prop',
    'correct_sum', 'hit_sum', 'miss_sum', 'correct_reject_sum', 'fa_sum',
    'z_Hits', 'z_Misses', 'z_Correct_Reject', 'z_FA', 'd'
  )
)

#Negative self-other bias: d' for other-negative minus d' for self-negative
GNAT_agg_wide$Neg_Self_Other_bias <- with(GNAT_agg_wide, d_Other_Neg - d_Self_Neg)

#Positive self-other bias: d' for self-positive minus d' for other-positive
GNAT_agg_wide$Pos_Self_Other_bias <- with(GNAT_agg_wide, d_Self_Pos - d_Other_Pos)

#Self bias (negative values = negative bias, positive values = positive bias)
GNAT_agg_wide$Self_bias <- with(GNAT_agg_wide, d_Self_Pos - d_Self_Neg)

#Other bias (negative values = negative bias, positive values = positive bias)
GNAT_agg_wide$Other_bias <- with(GNAT_agg_wide, d_Other_Pos - d_Other_Neg)

#Overall bias: sum of the positive and negative self-other differences
GNAT_agg_wide$Overall_bias <- with(
  GNAT_agg_wide,
  (d_Self_Pos - d_Other_Pos) + (d_Other_Neg - d_Self_Neg)
)

#Creating long dataset: one row per subject x condition
#Each regex in patterns() is paired BY POSITION with the entry at the same index of value.name, so the two
#lists must be kept in the same order. Other_bias and Self_bias are listed in the same order in both so
#that each output column receives its own values.
#The bias scores exist once per subject, so after melting they are filled for the first condition only
#and are NA for the others (filled in below).
#Note: the long column for the positive self-other bias is spelled "Pos_Self_other_bias" (lower-case "other"),
#unlike the wide column "Pos_Self_Other_bias"; the spelling is kept so the saved column names are unchanged.
GNAT_agg_long <- melt(
  setDT(GNAT_agg_wide),
  measure.vars = patterns(
    '^latency', '^correct_prop', '^hit_prop', '^miss_prop', '^correct_reject_prop', '^fa_prop',
    '^correct_sum', '^hit_sum', '^miss_sum', '^correct_reject_sum', '^fa_sum',
    '^z_Hits', '^z_Misses', '^z_Correct_Reject', '^z_FA', '^d_',
    '^Other_bias', '^Self_bias', '^Pos_Self_Other_bias', '^Neg_Self_Other_bias', '^Overall_bias'
  ),
  value.name = c(
    'latency', 'correct_prop', 'hit_prop', 'miss_prop', 'correct_reject_prop', 'fa_prop',
    'correct_sum', 'hit_sum', 'miss_sum', 'correct_reject_sum', 'fa_sum',
    'z_Hits', 'z_Misses', 'z_Correct_Reject', 'z_FA', 'd',
    'Other_bias', 'Self_bias', 'Pos_Self_other_bias', 'Neg_Self_Other_bias', 'Overall_bias'
  ),
  variable.name = 'Condition'
)

GNAT_agg_long <- GNAT_agg_long %>%
  arrange(subject, Condition)

#Bringing bias scores down for all conditions (carry the subject's value forward over the NA rows);
#ungroup afterwards so the per-subject grouping is not carried into the merge below
GNAT_agg_long <- GNAT_agg_long %>%
  group_by(subject) %>%
  mutate(
    Other_bias = na.locf0(Other_bias),
    Self_bias = na.locf0(Self_bias),
    Pos_Self_other_bias = na.locf0(Pos_Self_other_bias),
    Neg_Self_Other_bias = na.locf0(Neg_Self_Other_bias),
    Overall_bias = na.locf0(Overall_bias)
  ) %>%
  ungroup()

#Labelling Conditions (melt numbers them 1-4 in the order of the wide columns)
GNAT_agg_long$Condition <- factor(GNAT_agg_long$Condition,
                                  levels = c(1, 2, 3, 4),
                                  labels = c("Self Positive", "Self Negative", "Other Positive", "Other Negative"))

#Creating separate referent (Self/Other) & valence (Positive/Negative) factors from Condition
GNAT_agg_long <- GNAT_agg_long %>%
  mutate(
    ref_condition = factor(
      ifelse(Condition == "Self Positive" | Condition == "Self Negative", 1, 2),
      levels = c(1, 2),
      labels = c("Self", "Other")
    ),
    valence = factor(
      ifelse(Condition == "Self Positive" | Condition == "Other Positive", 1, 2),
      levels = c(1, 2),
      labels = c("Positive", "Negative")
    )
  )

#Merging with drug group data (only participants present in both tables are kept)
GNAT_agg_long <- GNAT_agg_long %>%
  rename(participant = subject)

drug_matching <- read_excel("Raw/Group Allocation/Blinded_drug_matching_KH_format_for_R_anon.xlsx")
GNAT_agg_long <- merge(drug_matching, GNAT_agg_long, by = "participant")

GNAT_agg_long$group <- as.factor(GNAT_agg_long$group)

#Checking no. subjects
GNAT_agg_long %>%
  summarise(n_distinct(participant))

#Multiplying proportion data by 100 for easier analysis
GNAT_agg_long <- GNAT_agg_long %>%
  mutate(
    correct_prop = correct_prop * 100,
    hit_prop = hit_prop * 100,
    miss_prop = miss_prop * 100,
    correct_reject_prop = correct_reject_prop * 100,
    fa_prop = fa_prop * 100
  )

#Saving Dataframe
save(GNAT_agg_long, file = "Analysis/GNAT/GNAT_agg_long")
write.csv(GNAT_agg_long, file = "Analysis/GNAT/GNAT_agg_long.csv")


#Removing non-compliant subjects - repeating the above cleaning with the long dataset but with participants with a priori data exclusion removed
##Calculating number of hits and false alarms
##Overall discrimination = mean hit count across conditions minus mean false alarm count across conditions;
##participants scoring below 5 are flagged as ineligible
GNAT_agg_wide <- mutate(
  GNAT_agg_wide,
  discrim = rowMeans(select(GNAT_agg_wide, starts_with("hit_sum")), na.rm = TRUE) -
    rowMeans(select(GNAT_agg_wide, starts_with("fa_sum")), na.rm = TRUE)
)
summary(GNAT_agg_wide$discrim)
GNAT_agg_wide$discrim_ineligible <- as.numeric(GNAT_agg_wide$discrim < 5)
table(GNAT_agg_wide$discrim_ineligible)
summarise(
  group_by(GNAT_agg_wide, discrim_ineligible),
  mean(discrim), sd(discrim), min(discrim), max(discrim)
)

##Overall response bias = mean hit count plus mean false alarm count across conditions;
##participants below 12 or above 36 are flagged as ineligible
GNAT_agg_wide <- mutate(
  GNAT_agg_wide,
  bias = rowMeans(select(GNAT_agg_wide, starts_with("hit_sum")), na.rm = TRUE) +
    rowMeans(select(GNAT_agg_wide, starts_with("fa_sum")), na.rm = TRUE)
)
summary(GNAT_agg_wide$bias)
GNAT_agg_wide$bias_ineligible <- as.numeric(GNAT_agg_wide$bias < 12 | GNAT_agg_wide$bias > 36)
table(GNAT_agg_wide$bias_ineligible)
summarise(
  group_by(GNAT_agg_wide, bias_ineligible),
  mean(bias), sd(bias), min(bias), max(bias)
)

##Combining both criteria: a participant failing either one is dropped (Note: 9 participants dropped)
GNAT_agg_wide$ineligible <- as.numeric(
  GNAT_agg_wide$discrim_ineligible == 1 | GNAT_agg_wide$bias_ineligible == 1
)
table(GNAT_agg_wide$ineligible)
summarise(group_by(GNAT_agg_wide, ineligible), n_distinct(subject))
GNAT_agg_wide <- subset(GNAT_agg_wide, ineligible == 0)

#Creating long dataset from the wide data that remains after the eligibility exclusions: one row per subject x condition
#Each regex in patterns() is paired BY POSITION with the entry at the same index of value.name, so the two
#lists must be kept in the same order. Other_bias and Self_bias are listed in the same order in both so
#that each output column receives its own values.
#The bias scores exist once per subject, so after melting they are filled for the first condition only
#and are NA for the others (filled in below).
#Note: the long column for the positive self-other bias is spelled "Pos_Self_other_bias" (lower-case "other"),
#unlike the wide column "Pos_Self_Other_bias"; the spelling is kept so the saved column names are unchanged.
GNAT_agg_long_excluded <- melt(
  setDT(GNAT_agg_wide),
  measure.vars = patterns(
    '^latency', '^correct_prop', '^hit_prop', '^miss_prop', '^correct_reject_prop', '^fa_prop',
    '^correct_sum', '^hit_sum', '^miss_sum', '^correct_reject_sum', '^fa_sum',
    '^z_Hits', '^z_Misses', '^z_Correct_Reject', '^z_FA', '^d_',
    '^Other_bias', '^Self_bias', '^Pos_Self_Other_bias', '^Neg_Self_Other_bias', '^Overall_bias'
  ),
  value.name = c(
    'latency', 'correct_prop', 'hit_prop', 'miss_prop', 'correct_reject_prop', 'fa_prop',
    'correct_sum', 'hit_sum', 'miss_sum', 'correct_reject_sum', 'fa_sum',
    'z_Hits', 'z_Misses', 'z_Correct_Reject', 'z_FA', 'd',
    'Other_bias', 'Self_bias', 'Pos_Self_other_bias', 'Neg_Self_Other_bias', 'Overall_bias'
  ),
  variable.name = 'Condition'
)

GNAT_agg_long_excluded <- GNAT_agg_long_excluded %>%
  arrange(subject, Condition)

#Bringing bias scores down for all conditions (carry the subject's value forward over the NA rows);
#ungroup afterwards so the per-subject grouping is not carried into the merge below
GNAT_agg_long_excluded <- GNAT_agg_long_excluded %>%
  group_by(subject) %>%
  mutate(
    Other_bias = na.locf0(Other_bias),
    Self_bias = na.locf0(Self_bias),
    Pos_Self_other_bias = na.locf0(Pos_Self_other_bias),
    Neg_Self_Other_bias = na.locf0(Neg_Self_Other_bias),
    Overall_bias = na.locf0(Overall_bias)
  ) %>%
  ungroup()

#Labelling Conditions (melt numbers them 1-4 in the order of the wide columns)
GNAT_agg_long_excluded$Condition <- factor(GNAT_agg_long_excluded$Condition,
                                  levels = c(1, 2, 3, 4),
                                  labels = c("Self Positive", "Self Negative", "Other Positive", "Other Negative"))

#Creating separate referent (Self/Other) & valence (Positive/Negative) factors from Condition
GNAT_agg_long_excluded <- GNAT_agg_long_excluded %>%
  mutate(
    ref_condition = factor(
      ifelse(Condition == "Self Positive" | Condition == "Self Negative", 1, 2),
      levels = c(1, 2),
      labels = c("Self", "Other")
    ),
    valence = factor(
      ifelse(Condition == "Self Positive" | Condition == "Other Positive", 1, 2),
      levels = c(1, 2),
      labels = c("Positive", "Negative")
    )
  )

#Merging with drug group data (only participants present in both tables are kept)
GNAT_agg_long_excluded <- GNAT_agg_long_excluded %>%
  rename(participant = subject)

drug_matching <- read_excel("Raw/Group Allocation/Blinded_drug_matching_KH_format_for_R_anon.xlsx")
GNAT_agg_long_excluded <- merge(drug_matching, GNAT_agg_long_excluded, by = "participant")

GNAT_agg_long_excluded$group <- as.factor(GNAT_agg_long_excluded$group)

#Checking no. subjects
GNAT_agg_long_excluded %>%
  summarise(n_distinct(participant))

#Multiplying proportion data so 100 = 100%
GNAT_agg_long_excluded <- GNAT_agg_long_excluded %>%
  mutate(
    correct_prop = correct_prop * 100,
    hit_prop = hit_prop * 100,
    miss_prop = miss_prop * 100,
    correct_reject_prop = correct_reject_prop * 100,
    fa_prop = fa_prop * 100
  )

#Saving Dataframe
save(GNAT_agg_long_excluded, file = "Analysis/GNAT/GNAT_agg_long_excluded")
write.csv(GNAT_agg_long_excluded, "Analysis/GNAT/GNAT_agg_long_excluded.csv")


