# ============================================================
# Paired differential expression analysis for GSE121212
# edgeR and DESeq2
#
# Input file:
#   GSE121212_gene_counts.txt
#
# File format:
#   Column 1: NCBI Gene ID
#   Row 1: patient
#   Row 2: disease state
#   Row 3: GSM number
#   Row 4 onward: gene counts
#
# Comparison:
#   lesional vs non-lesional
#   patient is used as a blocking factor
#
# Outputs:
#   all_pvalue.txt
#   edgeR_p05.txt
#   DESeq2_p05.txt
#   edgeR_p05_x12.txt
#   DESeq2_p05_x12.txt
#   sample_info.txt
# ============================================================

library(edgeR)
library(DESeq2)

# -----------------------------
# 1. Input file
# -----------------------------

input_file <- "GSE121212_gene_counts.txt"

raw <- read.delim(
  input_file,
  header = FALSE,
  sep = "\t",
  check.names = FALSE,
  stringsAsFactors = FALSE,
  quote = ""
)

# -----------------------------
# 2. Read sample information
# -----------------------------

patient <- as.character(raw[1, -1])
disease_state <- as.character(raw[2, -1])
gsm <- as.character(raw[3, -1])

sample_info <- data.frame(
  Sample = gsm,
  Patient = patient,
  DiseaseState = disease_state,
  stringsAsFactors = FALSE
)

# Standardize disease-state labels
sample_info$Condition <- tolower(sample_info$DiseaseState)
sample_info$Condition <- gsub("_", "-", sample_info$Condition)

# Check condition names
print(table(sample_info$Condition))

if (!all(c("non-lesional", "lesional") %in% sample_info$Condition)) {
  stop("Disease state must contain both 'non-lesional' and 'lesional'. Please check row 2.")
}

sample_info$Condition <- factor(
  sample_info$Condition,
  levels = c("non-lesional", "lesional")
)

sample_info$Patient <- factor(sample_info$Patient)

# Save sample information for checking GSM numbers
write.table(
  sample_info,
  file = "sample_info.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

# -----------------------------
# 3. Read count matrix
# -----------------------------

gene_id <- as.character(raw[-c(1:3), 1])

count_data <- raw[-c(1:3), -1]
count_data <- as.data.frame(count_data, stringsAsFactors = FALSE)

# Convert to numeric matrix
count_matrix <- apply(count_data, 2, function(x) as.numeric(as.character(x)))
count_matrix <- as.matrix(count_matrix)

colnames(count_matrix) <- gsm

# Keep original NCBI Gene IDs
gene_id_original <- gene_id

# R row names must be unique
rownames(count_matrix) <- make.unique(gene_id)

# edgeR / DESeq2 require integer counts
count_matrix <- round(count_matrix)

# -----------------------------
# 4. Check paired design
# -----------------------------

pair_table <- table(sample_info$Patient, sample_info$Condition)

if (any(pair_table[, "non-lesional"] == 0 | pair_table[, "lesional"] == 0)) {
  print(pair_table)
  stop("Some patients do not have both non-lesional and lesional samples. Paired analysis cannot be performed.")
}

# Make sure sample order matches count matrix
sample_info <- sample_info[match(colnames(count_matrix), sample_info$Sample), ]

# -----------------------------
# 5. edgeR paired analysis
# -----------------------------

group <- sample_info$Condition
patient <- sample_info$Patient

y <- DGEList(counts = count_matrix)

# No gene filtering is applied
y <- calcNormFactors(y)

design_edgeR <- model.matrix(~ patient + group)

# Check design matrix
print(colnames(design_edgeR))

y <- estimateDisp(y, design_edgeR)

fit <- glmQLFit(y, design_edgeR)

# Coefficient for lesional vs non-lesional
coef_edgeR <- grep("^grouplesional$", colnames(design_edgeR))

if (length(coef_edgeR) != 1) {
  stop("Could not find edgeR coefficient for lesional vs non-lesional.")
}

qlf <- glmQLFTest(fit, coef = coef_edgeR)

edgeR_res <- topTags(qlf, n = Inf)$table

edgeR_out <- data.frame(
  GeneID = gene_id_original[match(rownames(edgeR_res), rownames(count_matrix))],
  edgeR_log2FC = edgeR_res$logFC,
  edgeR_PValue = edgeR_res$PValue,
  edgeR_FDR = edgeR_res$FDR,
  stringsAsFactors = FALSE
)

# Restore original gene order
edgeR_out <- edgeR_out[match(gene_id_original, edgeR_out$GeneID), ]

# -----------------------------
# 6. DESeq2 paired analysis
# -----------------------------

coldata <- data.frame(
  row.names = sample_info$Sample,
  patient = sample_info$Patient,
  condition = sample_info$Condition
)

dds <- DESeqDataSetFromMatrix(
  countData = count_matrix,
  colData = coldata,
  design = ~ patient + condition
)

# No gene filtering is applied
# Use poscounts to avoid size-factor problems when many genes contain zeros
dds <- estimateSizeFactors(dds, type = "poscounts")

dds <- DESeq(dds)

res_deseq2 <- results(
  dds,
  contrast = c("condition", "lesional", "non-lesional")
)

deseq2_out <- data.frame(
  GeneID = gene_id_original,
  DESeq2_log2FC = res_deseq2$log2FoldChange,
  DESeq2_PValue = res_deseq2$pvalue,
  DESeq2_FDR = res_deseq2$padj,
  stringsAsFactors = FALSE
)

# -----------------------------
# 7. Merge all p-values
# -----------------------------

all_out <- data.frame(
  GeneID = gene_id_original,
  edgeR_log2FC = edgeR_out$edgeR_log2FC,
  edgeR_PValue = edgeR_out$edgeR_PValue,
  edgeR_FDR = edgeR_out$edgeR_FDR,
  DESeq2_log2FC = deseq2_out$DESeq2_log2FC,
  DESeq2_PValue = deseq2_out$DESeq2_PValue,
  DESeq2_FDR = deseq2_out$DESeq2_FDR,
  stringsAsFactors = FALSE
)

write.table(
  all_out,
  file = "all_pvalue.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

# -----------------------------
# 8. p < 0.05 lists
# -----------------------------

edgeR_p05 <- all_out[
  !is.na(all_out$edgeR_PValue) & all_out$edgeR_PValue < 0.05,
  c("GeneID", "edgeR_log2FC", "edgeR_PValue", "edgeR_FDR")
]

DESeq2_p05 <- all_out[
  !is.na(all_out$DESeq2_PValue) & all_out$DESeq2_PValue < 0.05,
  c("GeneID", "DESeq2_log2FC", "DESeq2_PValue", "DESeq2_FDR")
]

write.table(
  edgeR_p05,
  file = "edgeR_p05.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

write.table(
  DESeq2_p05,
  file = "DESeq2_p05.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

# -----------------------------
# 9. p < 0.05 and fold change >= 1.2
# -----------------------------
# 1.2-fold corresponds to abs(log2FC) >= log2(1.2)

log2fc_cutoff <- log2(1.2)

edgeR_p05_x12 <- edgeR_p05[
  !is.na(edgeR_p05$edgeR_log2FC) &
    abs(edgeR_p05$edgeR_log2FC) >= log2fc_cutoff,
]

DESeq2_p05_x12 <- DESeq2_p05[
  !is.na(DESeq2_p05$DESeq2_log2FC) &
    abs(DESeq2_p05$DESeq2_log2FC) >= log2fc_cutoff,
]

write.table(
  edgeR_p05_x12,
  file = "edgeR_p05_x12.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

write.table(
  DESeq2_p05_x12,
  file = "DESeq2_p05_x12.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

# -----------------------------
# 10. Summary
# -----------------------------

cat("\nFinished.\n")
cat("All genes:", nrow(all_out), "\n")
cat("edgeR p < 0.05:", nrow(edgeR_p05), "\n")
cat("DESeq2 p < 0.05:", nrow(DESeq2_p05), "\n")
cat("edgeR p < 0.05 and FC >= 1.2:", nrow(edgeR_p05_x12), "\n")
cat("DESeq2 p < 0.05 and FC >= 1.2:", nrow(DESeq2_p05_x12), "\n")