library(quanteda)
library(quanteda.dictionaries)
library(tidyverse)
library(readxl)

# Load the cleaned workbook; downstream code treats each row as one participant.
raw_data <- read_excel(path = "2023-03-31_TST_Cleaned-Data.xlsx")

# Reshape data: each participant has 20 offline columns and 20 online columns.
#
# Result: one row per participant x condition ("offline" / "online") with all
# of that participant's statements for the condition pasted into `text`.
# Columns: participant_id, Age, Gender, `Country of residence`, condition, text.
#
# All statement columns are stacked in a SINGLE pivot. Pivoting the offline and
# online columns in two successive steps crosses every offline row with every
# online row (20 x 20 = 400 rows per participant), so each statement would be
# pasted 20 times over.
data_long <- raw_data %>%
  mutate(participant_id = row_number()) %>%
  pivot_longer(
    cols = c(matches("^[0-9]+_Offline$"), matches("^Online I am")),
    names_to = "item",
    values_to = "response",
    # Coerce up front so columns read as different types can still be stacked.
    values_transform = list(response = as.character)
  ) %>%
  mutate(
    # Only the two column families above were pivoted, so anything that is not
    # an "<n>_Offline" column is an online item. matches() ignores case by
    # default, so the same is done here.
    condition = if_else(
      str_detect(item, regex("^[0-9]+_Offline$", ignore_case = TRUE)),
      "offline",
      "online"
    )
  ) %>%
  group_by(participant_id, Age, Gender, `Country of residence`, condition) %>%
  summarise(
    # Drop unanswered items: paste() would otherwise insert the literal word
    # "NA", which inflates the word count and deflates every LIWC percentage.
    # A participant with no answers in a condition keeps a row with text "".
    text = paste(response[!is.na(response)], collapse = " "),
    .groups = "drop"
  )

# Read the LIWC-format dictionary file from the local Dropbox folder.
dictliwc <- dictionary(
  file = "~/Dropbox/Text Mining Dictionaries/LIWC2015_English_Flat.dic",
  format = "LIWC",
  encoding = "UTF-8"
)

# Score every participant-condition text against the dictionary.
liwc_scores <- liwcalike(data_long[["text"]], dictliwc, what = "word")

# Attach the scores to the participant metadata. The two tables are aligned by
# row position, so data_long must not be reordered between scoring and binding.
liwc_data <- data_long %>%
  select(-text) %>%
  bind_cols(liwc_scores) %>%
  select(-c(docname, Segment))

# Sanity-check the shape of the scored data: total rows, number of distinct
# participants, the condition labels present, and the row count per condition.
cat(
  "\n=== DATA STRUCTURE VERIFICATION ===\n",
  sprintf(
    "Total rows: %d (should be 240 = 120 participants × 2 conditions)\n",
    nrow(liwc_data)
  ),
  sprintf(
    "Unique participants: %d (should be 120)\n",
    n_distinct(liwc_data[["participant_id"]])
  ),
  sprintf("Conditions: %s\n", toString(unique(liwc_data[["condition"]]))),
  sprintf(
    "Rows per condition: offline=%d, online=%d\n",
    sum(liwc_data[["condition"]] == "offline"),
    sum(liwc_data[["condition"]] == "online")
  ),
  sep = ""
)

# Build the paired layout used by the t-tests: one row per participant and
# LIWC category, with the offline and online scores side by side.
# WPS, WC, Dic and Sixltr are dropped first so that only the remaining
# score columns are compared.
liwc_wide <- liwc_data %>%
  select(-WPS, -WC, -Dic, -Sixltr) %>%
  pivot_longer(
    cols = -c(participant_id, Age, Gender, `Country of residence`, condition),
    values_to = "score",
    names_to = "liwc_category"
  ) %>%
  pivot_wider(
    id_cols = c(participant_id, liwc_category),
    names_from = condition,
    values_from = score
  )

# Perform paired t-tests for each LIWC category

# Summarise one LIWC category: descriptives, paired t-test and effect size.
#
# offline, online: numeric score vectors, aligned so that element i of each
#   belongs to the same participant.
# Returns a one-row tibble with n, mean_offline, mean_online, sd_offline,
#   sd_online, mean_diff (online - offline), t_stat, df, p_value, cohens_d.
#
# Every statistic is computed on the complete pairs only, so the descriptives
# describe exactly the observations that enter the paired test.
summarise_paired_scores <- function(offline, online) {
  is_pair <- !is.na(offline) & !is.na(online)
  offline <- offline[is_pair]
  online <- online[is_pair]

  # Run the test once. t.test() stops with an error on degenerate input (for
  # example too few pairs); report that as NA for this category instead of
  # aborting the whole analysis.
  test <- tryCatch(
    t.test(online, offline, paired = TRUE),
    error = function(e) {
      warning("Paired t-test failed: ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
  test_failed <- is.null(test)

  tibble(
    n = sum(is_pair),
    mean_offline = mean(offline),
    mean_online = mean(online),
    sd_offline = sd(offline),
    sd_online = sd(online),
    mean_diff = mean_online - mean_offline,
    t_stat = if (test_failed) NA_real_ else unname(test$statistic),
    df = if (test_failed) NA_real_ else unname(test$parameter),
    p_value = if (test_failed) NA_real_ else test$p.value,
    # Standardised by the average of the two condition variances.
    cohens_d = mean_diff / sqrt((sd_offline^2 + sd_online^2) / 2)
  )
}

ttest_results <- liwc_wide %>%
  group_by(liwc_category) %>%
  summarise(summarise_paired_scores(offline, online), .groups = "drop") %>%
  arrange(p_value) %>%
  mutate(
    # `significant` is based on the uncorrected p-value.
    significant = p_value < 0.05,
    direction = case_when(
      !significant ~ "no difference",
      mean_diff > 0 ~ "online > offline",
      mean_diff < 0 ~ "offline > online"
    ),
    # One test is run per LIWC category, so also report Benjamini-Hochberg
    # adjusted p-values for readers who need multiplicity control.
    p_adj_bh = p.adjust(p_value, method = "BH")
  )

# Print the categories flagged as significant, smallest p-value first
# (up to 100 rows), then save the complete results table to CSV.
cat("\n=== SIGNIFICANT LIWC CATEGORY DIFFERENCES (p < 0.05) ===\n\n")
ttest_results %>%
  filter(significant) %>%
  arrange(p_value) %>%
  select(
    liwc_category,
    mean_offline, mean_online, mean_diff,
    cohens_d, t_stat, p_value,
    direction
  ) %>%
  print(n = 100)

write_csv(ttest_results, "LIWC_ttest_results.csv")

# Calculate confidence intervals for plotting
#
# The comparison is paired, so the interval must use the standard error of the
# mean paired difference, not the independent-samples formula
# sqrt(sd_offline^2 / n + sd_online^2 / n). Since t = mean_diff / se_diff, the
# paired standard error is recovered from the test statistic. The critical
# value comes from the t distribution with the test's degrees of freedom
# rather than the normal approximation 1.96, so the interval agrees with the
# reported p-value.
# Assumes mean_diff is the mean of the paired differences, i.e. no participant
# is missing a score in only one condition.
plot_data <- ttest_results %>%
  filter(significant) %>%
  mutate(
    se_diff = abs(mean_diff / t_stat),
    ci_lower = mean_diff - qt(0.975, .data$df) * se_diff,
    ci_upper = mean_diff + qt(0.975, .data$df) * se_diff
  )

# Visualization 1: forest plot of every significant category.
# Categories are ordered by mean difference; the dashed line marks zero.
ggplot(
  data = plot_data,
  mapping = aes(x = mean_diff, y = reorder(liwc_category, mean_diff))
) +
  geom_vline(xintercept = 0, colour = "gray50", linetype = "dashed") +
  geom_errorbarh(
    mapping = aes(xmin = ci_lower, xmax = ci_upper),
    colour = "darkgray",
    height = 0.3
  ) +
  geom_point(mapping = aes(colour = direction), size = 3) +
  scale_colour_manual(
    values = c(
      "online > offline" = "#17a2b8",
      "offline > online" = "#ff7f7f",
      "no difference" = "gray70"
    )
  ) +
  labs(
    title = "Significant LIWC Category Differences",
    subtitle = "Error bars represent 95% confidence intervals",
    x = "Mean Difference (Online - Offline)",
    y = "LIWC Category",
    colour = "Direction"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

# Visualization 2: volcano plot of effect size against -log10(p).
# Categories without a usable p-value are dropped before plotting; labels are
# drawn only for significant categories with |d| > 0.3.
ggplot(
  data = filter(ttest_results, !is.na(p_value), !is.infinite(p_value)),
  mapping = aes(x = cohens_d, y = -log10(p_value))
) +
  geom_point(mapping = aes(colour = significant), alpha = 0.7, size = 3) +
  geom_hline(
    yintercept = -log10(0.05),
    colour = "red",
    linetype = "dashed"
  ) +
  geom_text(
    data = filter(
      ttest_results,
      significant, abs(cohens_d) > 0.3, !is.na(p_value)
    ),
    mapping = aes(label = liwc_category),
    size = 2.5,
    hjust = -0.1,
    vjust = 0.5,
    check_overlap = TRUE
  ) +
  scale_colour_manual(values = c("TRUE" = "#d62728", "FALSE" = "gray70")) +
  labs(
    title = "Volcano Plot: LIWC Category Differences",
    subtitle = "Red line indicates p = 0.05 threshold",
    x = "Effect Size (Cohen's d)",
    y = "-log10(p-value)",
    colour = "Significant"
  ) +
  theme_minimal() +
  theme(legend.position = "top")

# Visualization 3: Error bar chart showing offline vs online with SE bars
#
# Each significant category gets two points (offline and online mean) with
# +/- 1 standard error of that condition's mean. These are per-condition
# standard errors; the significance test is paired, so whether the two bars
# overlap says nothing about significance and the subtitle must not claim it.
plot_data %>%
  transmute(
    liwc_category,
    mean_offline,
    mean_online,
    se_offline = sd_offline / sqrt(n),
    se_online = sd_online / sqrt(n)
  ) %>%
  # Split "<stat>_<condition>" names: the stat part ("mean", "se") becomes a
  # column, the condition part becomes the `condition` key.
  pivot_longer(
    cols = -liwc_category,
    names_to = c(".value", "condition"),
    names_sep = "_"
  ) %>%
  rename(mean_score = mean) %>%
  ggplot(aes(x = mean_score, y = reorder(liwc_category, mean_score), color = condition)) +
  geom_errorbarh(aes(xmin = mean_score - se, xmax = mean_score + se),
                 height = 0.3, alpha = 0.7) +
  geom_point(size = 2) +
  scale_color_manual(
    values = c("offline" = "#ff7f7f", "online" = "#17a2b8"),
    # Explicit breaks pin each label to its condition instead of relying on
    # the default ordering of the levels.
    breaks = c("offline", "online"),
    labels = c("Offline", "Online")
  ) +
  labs(
    x = "Mean Score (%) ± SE",
    y = "LIWC Category",
    title = "LIWC Category Differences: Online vs Offline Identity",
    subtitle = "Error bars represent ±1 standard error of each condition mean",
    color = "Condition"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    panel.grid.minor = element_blank()
  )