library(quanteda)
library(quanteda.dictionaries)
library(tidyverse)
library(readxl)

# LIWC comparison of offline vs. online self-descriptions ----
#
# Pipeline:
#   1. Read the per-participant spreadsheet and collapse each participant's
#      offline and online responses into one text per condition.
#   2. Score every text with the LIWC dictionary.
#   3. Run a paired t-test (online vs. offline) for each LIWC category.
#   4. Write the results table to CSV and draw three plots.

# Helpers ----

# Run one paired t-test (online vs. offline) and return its statistic,
# degrees of freedom and p-value as a one-row tibble.
#
# `online` and `offline` are numeric vectors of equal length, paired by
# position. t.test() is called exactly once; if it signals an error the
# error is turned into a warning and NA values are returned, so a single
# untestable category does not abort the whole analysis.
paired_t_summary <- function(online, offline) {
  result <- tryCatch(
    t.test(online, offline, paired = TRUE),
    error = function(e) {
      warning(
        "Paired t-test skipped: ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
  if (is.null(result)) {
    return(tibble(t_stat = NA_real_, df = NA_real_, p_value = NA_real_))
  }
  tibble(
    t_stat = unname(result$statistic),
    df = unname(result$parameter),
    p_value = result$p.value
  )
}

# Read the per-participant data ----
raw_data <- read_excel("2023-03-31_TST_Cleaned-Data.xlsx")

# Reshape data ----
# Each participant has one set of offline columns and one set of online
# columns. Identify both sets up front so a renamed or missing column is
# reported immediately instead of silently producing empty texts.
offline_cols <- grep("^[0-9]+_Offline$", names(raw_data), value = TRUE)
online_cols <- grep("^Online I am", names(raw_data), value = TRUE)
stopifnot(
  "No offline response columns found" = length(offline_cols) > 0,
  "No online response columns found" = length(online_cols) > 0
)

# Collapse the responses row-wise. unite(na.rm = TRUE) drops missing
# responses instead of pasting the literal token "NA" into the text, and
# working row-wise keeps exactly one copy of each response (chaining two
# pivot_longer() calls would repeat every response once per column of the
# other condition).
data_long <- raw_data %>%
  mutate(participant_id = row_number()) %>%
  unite("text_offline", all_of(offline_cols), sep = " ", na.rm = TRUE) %>%
  unite("text_online", all_of(online_cols), sep = " ", na.rm = TRUE) %>%
  select(
    participant_id, Age, Gender, `Country of residence`,
    text_offline, text_online
  ) %>%
  pivot_longer(
    cols = c(text_offline, text_online),
    names_to = "condition",
    values_to = "text",
    names_prefix = "text_"
  )

# Load LIWC dictionary ----
dictliwc <- dictionary(
  file = "~/Dropbox/Text Mining Dictionaries/LIWC2015_English_Flat.dic",
  format = "LIWC",
  encoding = "UTF-8"
)

# Calculate LIWC scores per participant-condition ----
liwc_scores <- liwcalike(data_long$text, dictliwc, what = "word")

# Combine with metadata; rows of liwc_scores are in the same order as
# data_long, so a column bind is sufficient.
liwc_data <- bind_cols(data_long %>% select(-text), liwc_scores) %>%
  select(-docname, -Segment)

# Verification checks ----
cat("\n=== DATA STRUCTURE VERIFICATION ===\n")
cat(sprintf(
  "Total rows: %d (should be 240 = 120 participants × 2 conditions)\n",
  nrow(liwc_data)
))
cat(sprintf(
  "Unique participants: %d (should be 120)\n",
  n_distinct(liwc_data$participant_id)
))
cat(sprintf(
  "Conditions: %s\n",
  paste(unique(liwc_data$condition), collapse = ", ")
))
cat(sprintf(
  "Rows per condition: offline=%d, online=%d\n",
  sum(liwc_data$condition == "offline"),
  sum(liwc_data$condition == "online")
))

# Reshape to wide for paired t-tests ----
# The structural/meta columns (WPS, WC, Dic, Sixltr) are excluded; the
# result has one row per participant x LIWC category with `offline` and
# `online` score columns.
liwc_wide <- liwc_data %>%
  select(-c(WPS, WC, Dic, Sixltr)) %>%
  pivot_longer(
    cols = -c(participant_id, Age, Gender, `Country of residence`, condition),
    names_to = "liwc_category",
    values_to = "score"
  ) %>%
  pivot_wider(
    names_from = condition,
    values_from = score,
    id_cols = c(participant_id, liwc_category)
  )

# Paired t-tests for each LIWC category ----
# mean_diff and sd_diff are computed on the paired differences so they
# describe the same quantity the paired test evaluates. cohens_d keeps the
# original standardiser: the root mean of the two condition variances.
ttest_results <- liwc_wide %>%
  group_by(liwc_category) %>%
  summarise(
    n = sum(!is.na(offline) & !is.na(online)),
    mean_offline = mean(offline, na.rm = TRUE),
    mean_online = mean(online, na.rm = TRUE),
    sd_offline = sd(offline, na.rm = TRUE),
    sd_online = sd(online, na.rm = TRUE),
    mean_diff = mean(online - offline, na.rm = TRUE),
    sd_diff = sd(online - offline, na.rm = TRUE),
    paired_t_summary(online, offline),
    cohens_d = mean_diff / sqrt((sd_offline^2 + sd_online^2) / 2),
    .groups = "drop"
  ) %>%
  arrange(p_value) %>%
  mutate(
    # Benjamini-Hochberg adjusted p-values, reported alongside the raw
    # ones because many categories are tested at once. The `significant`
    # flag below still uses the unadjusted p-value.
    p_adj_bh = p.adjust(p_value, method = "BH"),
    # A category whose test could not be computed is never significant.
    significant = !is.na(p_value) & p_value < 0.05,
    direction = case_when(
      is.na(p_value) ~ NA_character_,
      !significant ~ "no difference",
      mean_diff > 0 ~ "online > offline",
      mean_diff < 0 ~ "offline > online"
    )
  )

# Display significant differences ----
cat("\n=== SIGNIFICANT LIWC CATEGORY DIFFERENCES (p < 0.05) ===\n\n")
ttest_results %>%
  filter(significant) %>%
  select(
    liwc_category, mean_offline, mean_online, mean_diff,
    cohens_d, t_stat, p_value, direction
  ) %>%
  arrange(p_value) %>%
  print(n = 100)

write_csv(ttest_results, "LIWC_ttest_results.csv")

# Confidence intervals for plotting ----
# Paired-design interval: standard error of the mean difference, with the
# t critical value on the test's degrees of freedom. This is the interval
# that corresponds to the paired t-test, so a significant category never
# shows an interval that crosses zero.
plot_data <- ttest_results %>%
  filter(significant) %>%
  mutate(
    se_diff = sd_diff / sqrt(n),
    t_crit = qt(0.975, df),
    ci_lower = mean_diff - t_crit * se_diff,
    ci_upper = mean_diff + t_crit * se_diff
  )

# Visualization 1: forest plot of all significant differences ----
plot_data %>%
  ggplot(aes(x = mean_diff, y = reorder(liwc_category, mean_diff))) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray50") +
  geom_errorbarh(
    aes(xmin = ci_lower, xmax = ci_upper),
    height = 0.3,
    color = "darkgray"
  ) +
  geom_point(aes(color = direction), size = 3) +
  scale_color_manual(
    values = c(
      "online > offline" = "#17a2b8",
      "offline > online" = "#ff7f7f",
      "no difference" = "gray70"
    )
  ) +
  labs(
    x = "Mean Difference (Online - Offline)",
    y = "LIWC Category",
    title = "Significant LIWC Category Differences",
    subtitle = "Error bars represent 95% confidence intervals",
    color = "Direction"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

# Visualization 2: volcano plot ----
# Effect size against -log10(p); only clearly separated, significant
# categories are labelled.
ttest_results %>%
  filter(!is.na(p_value), !is.infinite(p_value)) %>%
  ggplot(aes(x = cohens_d, y = -log10(p_value))) +
  geom_point(aes(color = significant), size = 3, alpha = 0.7) +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "red") +
  geom_text(
    data = ttest_results %>%
      filter(significant, abs(cohens_d) > 0.3, !is.na(p_value)),
    aes(label = liwc_category),
    hjust = -0.1,
    vjust = 0.5,
    size = 2.5,
    check_overlap = TRUE
  ) +
  scale_color_manual(values = c("TRUE" = "#d62728", "FALSE" = "gray70")) +
  labs(
    x = "Effect Size (Cohen's d)",
    y = "-log10(p-value)",
    title = "Volcano Plot: LIWC Category Differences",
    subtitle = "Red line indicates p = 0.05 threshold",
    color = "Significant"
  ) +
  theme_minimal() +
  theme(legend.position = "top")

# Visualization 3: offline vs. online means with SE bars ----
# The bars are per-condition standard errors. Because the design is
# paired, their overlap says nothing about significance, so the subtitle
# only describes what the bars are.
plot_data %>%
  select(liwc_category, mean_offline, mean_online, sd_offline, sd_online, n) %>%
  mutate(
    se_offline = sd_offline / sqrt(n),
    se_online = sd_online / sqrt(n)
  ) %>%
  pivot_longer(
    cols = c(mean_offline, mean_online),
    names_to = "condition",
    values_to = "mean_score",
    names_prefix = "mean_"
  ) %>%
  mutate(
    se = if_else(condition == "offline", se_offline, se_online)
  ) %>%
  ggplot(aes(
    x = mean_score,
    y = reorder(liwc_category, mean_score),
    color = condition
  )) +
  geom_errorbarh(
    aes(xmin = mean_score - se, xmax = mean_score + se),
    height = 0.3,
    alpha = 0.7
  ) +
  geom_point(size = 2) +
  scale_color_manual(
    values = c("offline" = "#ff7f7f", "online" = "#17a2b8"),
    labels = c("Offline", "Online")
  ) +
  labs(
    x = "Mean Score (%) ± SE",
    y = "LIWC Category",
    title = "LIWC Category Differences: Online vs Offline Identity",
    subtitle = "Error bars represent ±1 standard error of each condition mean",
    color = "Condition"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    panel.grid.minor = element_blank()
  )