#Cleaning Categorisation and Recall data for citalopram study
#Catherine Hobbs, University of Bath c.hobbs@bath.ac.uk
#25.03.2020
#RStudio Version 1.2.1335 Mac OS

#Packages needed
#Detaching any add-on packages that are currently attached, to avoid compatibility issues.
#In a fresh session sessionInfo()$otherPkgs is NULL; paste0('package:', NULL) would then give "package:" and detach() would stop with an error, so we only detach when something is actually attached.
attached_pkgs <- names(sessionInfo()$otherPkgs)
if (length(attached_pkgs) > 0) {
  invisible(lapply(paste0('package:', attached_pkgs), detach, character.only=TRUE, unload=TRUE, force=TRUE))
}
rm(attached_pkgs) #Temporary helper variable, not needed afterwards
library(tidyverse)
library(readxl)
library(data.table)
library(hunspell)
#Loading the British English dictionary; note the returned object is not assigned to a variable here
dictionary("en_GB")

#Setting working directory and reading in the anonymised raw task data
setwd("/Volumes/files/RDSSD/Katie Studies/Citalopram/8. Data/")
recall_df <- read_excel(path = "Raw/Cognitive Tasks/Recall/recall_merged_raw_anon.xlsx")

#Note: below code not needed for my purposes as have removed this info when anonymising data, however if you're using the task it may be useful to remove these columns to have a cleaner dataframe
#Getting rid of variables not needed
#recall_df <- recall_df[, -grep(".stopped$", colnames(recall_df))]
#recall_df <- recall_df[, -grep(".started$", colnames(recall_df))]
#recall_df <- subset(recall_df, select = -c(date, `Unnamed: 121`, other_name_resp.keys, expName, psychopyVersion, frameRate, mem_intro_resp.keys, mem_intro_resp.rt, keyTime.keys, practice_keyTime.keys)) 

#Preparing the data in a single pipeline; the order of the steps matters:
# 1. key data (what yes or no correspond to) is carried down within each participant
# 2. practice trials (rows with no condition) are removed
# 3. word assignment is carried down within each participant and condition, as it is only recorded on the top row
# 4. only the columns needed later are kept
recall_df <- recall_df %>%
  group_by(participant) %>%
  fill(yes, no, .direction = "down") %>%
  #fill(other_rating_familiarity.history, .direction = "down") %>%
  #fill(other_rating_like.history, .direction = "down") %>%
  ungroup() %>%
  filter(!is.na(condition)) %>%
  group_by(participant, condition) %>%
  fill(word_assignment, .direction = "down") %>%
  ungroup() %>%
  select(
    participant, condition, Word, Likability, Variation, Mean, word_type,
    length, valence, cat_trial_loop.thisN, mem_trial_loop.thisN, yes, no,
    word_assignment, trial_resp.keys, trial_resp.rt, cat_resp, mem_response
  )

#Splitting the task data into its categorisation and recall parts
#Categorisation rows are those with a categorisation trial number; word_assignment is renamed to stimuli_assignment while selecting
cat_df <- recall_df %>%
  filter(!is.na(cat_trial_loop.thisN)) %>%
  select(
    participant, condition, Word, Likability, Variation, Mean, word_type,
    length, valence, cat_trial_loop.thisN, yes, no,
    stimuli_assignment = word_assignment,
    trial_resp.keys, trial_resp.rt, cat_resp
  ) %>%
  arrange(participant, condition, cat_trial_loop.thisN)

#Recall rows are those with a memory trial number
recall_only_df <- recall_df %>%
  filter(!is.na(mem_trial_loop.thisN)) %>%
  select(
    participant, condition, mem_response, mem_trial_loop.thisN,
    stimuli_assignment = word_assignment
  ) %>%
  arrange(participant, condition, mem_trial_loop.thisN) %>%
  filter(!is.na(mem_response)) #Dropping trials where no keys were recorded (participant just pressed enter)

##Sanity checks on the categorisation dataframe
#Number of distinct participants
summarise(cat_df, n_distinct(participant))

#Number of distinct categorisation trials per participant and condition (to be inspected by eye)
trial_no_check <- ungroup(
  summarise(
    group_by(cat_df, participant, condition),
    n_distinct(cat_trial_loop.thisN)
  )
)

#The check table is only needed for inspection, so it is removed again
rm(trial_no_check)

#Identifying memory trials where participant did not submit each word individually (so will have to be separated)
#A response is flagged (separate = 1) when it contains at least one space
recall_only_df$separate <- ifelse(grepl(" ", recall_only_df$mem_response), 1, 0)

#Subsetting trials for separation
recall_only_df_separate <- recall_only_df %>%
  filter(separate == 1)

#Splitting each flagged response on single spaces into up to 38 columns (word1 ... word38)
#Shorter responses are padded with NA on the right; a warning is raised if a response has more than 38 pieces
recall_only_df_separate <- recall_only_df_separate %>%
  separate(mem_response, into = paste0("word", 1:38), sep = " ", fill = "right", extra = "warn")

#Reshaping to one row per word, with word_no recording the word's position in the response
recall_only_df_separate <- melt(setDT(recall_only_df_separate), measure.vars = patterns('^word'), variable.name = 'word_no')

#Turning "word1", "word2", ... into the numbers 1, 2, ...
recall_only_df_separate$word_no <- sub(recall_only_df_separate$word_no, pattern = "word", replacement = "")

recall_only_df_separate$word_no <- as.numeric(recall_only_df_separate$word_no)

#Dropping the NA padding AND empty strings
#Empty strings arise when a response has consecutive, leading or trailing spaces; if kept they would be counted as an entered (incorrect) word later on
recall_only_df_separate <- recall_only_df_separate %>% 
  arrange(participant, condition, word_no) %>%
  filter(!is.na(value), value != "") %>%
  select(participant, condition, word_no, value, stimuli_assignment) %>%
  rename(memN = word_no, mem_response = value)

#Incorporating trials back into main recall dataframe
#Keeping only the trials that did not need separating, with the same column names as the separated trials
recall_only_df <- recall_only_df %>%
  filter(separate == 0) %>%
  select(-separate) %>%
  rename(memN = mem_trial_loop.thisN)

#Note: the two dataframes have the same columns but in a different order
recall_only_df <- rbind(recall_only_df, recall_only_df_separate)

recall_only_df <- recall_only_df%>%
  arrange(participant, condition, memN)
  
#Checking correct no. participants
recall_only_df %>%
  group_by(condition) %>%
  summarise(n_distinct(participant)) %>%
  ungroup

#Removing separated recall dataframe
rm(recall_only_df_separate)


#Building the table of words presented to participants, then one word list per stimuli assignment and word type
#Note same words used for all participants so can just take first participant
presented_words <- select(
  filter(cat_df, participant == "0f9726d6a339cf57027eab5d2e3409cfb311c531"),
  Word, word_type, stimuli_assignment
)

#Stimuli assignment 1
list1_likeable <- presented_words %>%
  filter(stimuli_assignment == 1, word_type == "Likeable") %>%
  select(Word)

list1_dislikeable <- presented_words %>%
  filter(stimuli_assignment == 1, word_type == "Dislikeable") %>%
  select(Word)

#Stimuli assignment 2
list2_likeable <- presented_words %>%
  filter(stimuli_assignment == 2, word_type == "Likeable") %>%
  select(Word)

list2_dislikeable <- presented_words %>%
  filter(stimuli_assignment == 2, word_type == "Dislikeable") %>%
  select(Word)


#Matching recalled words to possible words presented for that condition according to stimuli assignment (e.g. it would not be correct if they recalled a word presented in the self condition when asked to recall words in the other condition)

#Helper so that the matching rules are written once and used both before and after the spelling corrections
#  response   - vector of recalled words
#  assignment - stimuli assignment (1 or 2) belonging to each recalled word
#  like1, dislike1 - likeable / dislikeable words of list 1
#  like2, dislike2 - likeable / dislikeable words of list 2
#Returns a factor with levels "Correct Likeable", "Correct Dislikeable", "Incorrect, word in other condition" and "Incorrect"
classify_recall <- function(response, assignment, like1, dislike1, like2, dislike2) {
  in_like1 <- response %in% like1
  in_dislike1 <- response %in% dislike1
  in_like2 <- response %in% like2
  in_dislike2 <- response %in% dislike2

  #Word belongs to the list that was NOT used for this stimuli assignment
  in_other_list <- (assignment == 1 & (in_like2 | in_dislike2)) |
    (assignment == 2 & (in_like1 | in_dislike1))

  #1 = correct likeable, 2 = correct dislikeable, 3 = word from the other list, 4 = not a presented word
  codes <- ifelse(assignment == 1 & in_like1, 1,
    ifelse(assignment == 1 & in_dislike1, 2,
      ifelse(assignment == 2 & in_like2, 1,
        ifelse(assignment == 2 & in_dislike2, 2,
          ifelse(in_other_list, 3, 4)))))

  factor(codes,
         levels = c(1, 2, 3, 4),
         labels = c("Correct Likeable","Correct Dislikeable", "Incorrect, word in other condition", "Incorrect"))
}

recall_only_df$Match <- classify_recall(
  recall_only_df$mem_response, recall_only_df$stimuli_assignment,
  list1_likeable$Word, list1_dislikeable$Word,
  list2_likeable$Word, list2_dislikeable$Word
)

#Checking spellings
#'wrong' is only used for inspecting unmatched responses by eye; it is not used in later calculations
wrong <- recall_only_df %>%
  filter(Match == "Incorrect")

#Passing the British English dictionary explicitly, as hunspell() otherwise checks against its default (en_US) dictionary
wrong$spelling <- hunspell(wrong$mem_response, dict = dictionary("en_GB"))

#Editing spellings which are clearly typos or slight misspellings
#Lookup table: name = response as typed, value = word it is corrected to
spelling_corrections <- c(
  "socail" = "social",
  "greatful" = "grateful",
  "bosey" = "bossy",
  "comcal" = "comical",
  "cowardy" = "cowardly",
  "obidient" = "obedient",
  "humourous" = "humorous",
  "humurous" = "humorous",
  "listles" = "listless",
  "unruley" = "unruly",
  "frienly" = "friendly",
  "obediant" = "obedient",
  "tactul" = "tactful",
  "gratefyl" = "grateful",
  "humerous" = "humorous",
  "truthfull" = "truthful",
  "likeable" = "likable",
  "insolen" = "insolent",
  "origianl" = "original",
  "comial" = "comical",
  "sociable" = "social",
  "unsociable" = "unsocial"
)

#Only responses that exactly equal one of the typed forms above are changed
to_correct <- recall_only_df$mem_response %in% names(spelling_corrections)
recall_only_df$mem_response[to_correct] <- unname(spelling_corrections[recall_only_df$mem_response[to_correct]])
rm(to_correct)

#Rechecking those correct, now that the spellings have been edited
recall_only_df$Match <- classify_recall(
  recall_only_df$mem_response, recall_only_df$stimuli_assignment,
  list1_likeable$Word, list1_dislikeable$Word,
  list2_likeable$Word, list2_dislikeable$Word
)

#Removing duplicate responses, counting unique words entered and calculating totals, all per participant and condition
recall_only_df <- recall_only_df %>%
  group_by(participant, condition) %>%
  distinct(mem_response, .keep_all = TRUE) %>% #Keeps the first occurrence of each response
  mutate(
    number_words_entered = n(),
    #Row-level indicators (1 = yes, 0 = no)
    correct = as.numeric(Match == "Correct Likeable" | Match == "Correct Dislikeable"),
    correct_likeable = as.numeric(Match == "Correct Likeable"),
    correct_dislikeable = as.numeric(Match == "Correct Dislikeable"),
    incorrect_other_condition = as.numeric(Match == "Incorrect, word in other condition"),
    #Totals of the indicators
    tot_correct = sum(correct),
    tot_correct_likeable = sum(correct_likeable),
    tot_correct_dislikeable = sum(correct_dislikeable),
    tot_incorrect_other_condition = sum(incorrect_other_condition),
    #Unique words entered that were not correct (presumably "fa" = false alarms)
    tot_fa = number_words_entered - tot_correct
  ) %>%
  select(-memN) %>%
  ungroup()

#Checking that output are possible (i.e. shouldn't be negative)
#Minimum of each total
recall_only_df %>%
  summarise_at(vars(tot_correct, tot_correct_likeable, tot_correct_dislikeable, number_words_entered, tot_incorrect_other_condition, tot_fa), min)

#Maximum of each total
recall_only_df %>%
  summarise_at(vars(tot_correct, tot_correct_likeable, tot_correct_dislikeable, number_words_entered, tot_incorrect_other_condition, tot_fa), max)

#Collapsing to one row per participant, condition and stimuli assignment
#The totals are constant within participant and condition, so taking the mean simply returns that value
recall_aggreagte <- recall_only_df %>%
  group_by(participant, condition, stimuli_assignment) %>%
  summarise_at(
    vars(tot_correct, tot_correct_likeable, tot_correct_dislikeable, number_words_entered, tot_incorrect_other_condition, tot_fa),
    mean,
    na.rm = TRUE
  )

#Checking correct no. p.s
ungroup(
  summarise(
    group_by(recall_aggreagte, condition),
    n_distinct(participant)
  )
)

#Renaming columns so can distinguish variables from categorisation data when merged
#"tot" becomes "recall_tot" first, then "words" becomes "recall_words"
names(recall_aggreagte) <- gsub(
  "words", "recall_words",
  gsub("tot", "recall_tot", names(recall_aggreagte))
)

#Summarising categorisation data per participant, condition and word type
cat_df <- cat_df %>%
  arrange(participant, condition, word_type) %>%
  group_by(participant, condition, word_type) %>%
  mutate(
    tot_yes = sum(cat_resp),
    tot_no = 20 - tot_yes #NOTE(review): assumes 20 trials per word type within each condition - confirm against task design
  ) %>%
  summarise_at(vars(tot_yes, tot_no, trial_resp.rt), mean, na.rm = TRUE) %>%
  #Numeric variable for easier spreading of data, 1 = positive / likeable, 2 = negative / dislikable
  mutate(emotion = ifelse(word_type == "Likeable", 1, 2)) %>%
  select(-word_type) %>%
  ungroup()

#Spreading to wide format: one row per participant and condition, one column per measure and emotion (suffix _1 / _2)
cat_df <- dcast(
  setDT(cat_df),
  participant + condition ~ emotion,
  value.var = c('tot_yes', 'tot_no', 'trial_resp.rt')
)

#Renaming columns for clarity and to distinguish from recall data
#Replacements are applied in this order: 1 -> likeable, 2 -> dislikeable, tot -> cat_tot, trial_resp.rt -> cat_rt
colnames(cat_df) <- colnames(cat_df) %>%
  gsub("1", "likeable", x = .) %>%
  gsub("2", "dislikeable", x = .) %>%
  gsub("tot", "cat_tot", x = .) %>%
  gsub("trial_resp.rt", "cat_rt", x = .)

#Combining categorisation and recall data (one row per participant and condition)
#NOTE(review): merge() keeps only rows present in both tables by default, so a participant/condition with no recall rows would be dropped here - confirm this is intended
cat_recall <- merge(x = cat_df, y = recall_aggreagte, by = c("participant", "condition"))

#Adding group allocation
drug_matching <- read_excel(path = "Raw/Group Allocation/Blinded_drug_matching_KH_format_for_R_anon.xlsx")
cat_recall <- merge(x = drug_matching, y = cat_recall, by = "participant")

cat_recall$group <- as.factor(cat_recall$group)

#Making Likeable vs. Dislikeable into long format
##The overall recall total is renamed first so that it is not caught by the '^recall_tot_correct' pattern below
cat_recall <- rename(cat_recall, overall_number_recalled = recall_tot_correct)

#Each pattern picks up the likeable and dislikeable column of one measure; 'valence' indexes which of the two a row comes from
cat_recall_long <- melt(
  setDT(cat_recall),
  measure = patterns('^cat_tot_yes', '^cat_tot_no', 'cat_rt', '^recall_tot_correct'),
  value.name = c('cat_tot_yes', 'cat_tot_no', 'cat_rt', 'recall_tot_correct'),
  variable.name = 'valence'
)

#Ordering rows by participant, condition and valence
cat_recall_long <- arrange(cat_recall_long, participant, condition, valence)

#Labelling valence: 1 = Likeable, 2 = Dislikeable
cat_recall_long$valence <- factor(
  as.factor(cat_recall_long$valence),
  levels = c(1, 2),
  labels = c("Likeable","Dislikeable")
)

#Creating a new variable for condition for better presentation (i.e. want self to be first)
#"self" is coded 1 (Self); any other non-missing condition is coded 2 (Other)
cat_recall_long$condition1 <- factor(
  ifelse(cat_recall_long$condition == "self", 1, 2),
  levels = c(1, 2),
  labels = c("Self","Other")
)

#Replacing the original condition column with the labelled one, then multiplying RT by 1000 for better descriptives
cat_recall_long <- cat_recall_long %>%
  select(-condition) %>%
  rename(condition = condition1) %>%
  mutate(cat_rt = cat_rt * 1000)

#Saving the long dataframe both as an R data file and as a csv (paths are relative to the working directory)
save(list = "cat_recall_long", file = "Analysis/Recall/cat_recall_long")
write.csv(x = cat_recall_long, file = "Analysis/Recall/cat_recall_long.csv")
