# ============================================================
# edgeR and DESeq2 p-value calculation for GSE121212
#
# Input format:
#   Column 1: NCBI Gene ID
#   Row 1: Patient ID
#   Row 2: Disease Status
#   Row 3: GSM number
#   Row 4 onward: Gene Counts
#
# Output files:
#   1. All genes:
#      GSE121212_edgeR_DESeq2_pvavlues.txt
#
#   2. p < 0.05 genes:
#      GSE121212_edgeR_p005_genes.txt
#      GSE121212_DESeq2_p005_genes.txt
#
#   3. p < 0.05 and fold change >= 1.4 genes:
#      GSE121212_edgeR_p005_FC1.4_genes.txt
#      GSE121212_DESeq2_p005_FC1.4_genes.txt
#
# No gene filtering is applied before edgeR / DESeq2 analysis.
#
# Note:
#   NaN / NA / empty values in the count table are replaced with 0.
# ============================================================

# ----------------------------
# Settings
# ----------------------------

input_file <- "GSE121212_gene_counts.txt"

output_file <- "GSE121212_edgeR_DESeq2_pvavlues.txt"

replace_missing_counts_with_zero <- TRUE

fc_cutoff <- 1.4

# ----------------------------
# Load packages
# ----------------------------

library(edgeR)
library(DESeq2)

# ----------------------------
# Read input file
# ----------------------------

raw <- read.delim(
  input_file,
  header = FALSE,
  stringsAsFactors = FALSE,
  check.names = FALSE,
  na.strings = character(0)
)

# ----------------------------
# Extract sample information
# ----------------------------

patient_id <- as.character(raw[1, -1])
disease_status <- as.character(raw[2, -1])
gsm_id <- as.character(raw[3, -1])

sample_names <- gsm_id

# ----------------------------
# Extract count matrix
# ----------------------------

gene_id <- as.character(raw[-c(1, 2, 3), 1])

count_df_original <- raw[-c(1, 2, 3), -1]

# Convert all count cells to character
count_df_char <- as.data.frame(
  lapply(count_df_original, function(x) {
    x <- as.character(x)
    x <- gsub(",", "", x)
    x <- trimws(x)
    x
  }),
  check.names = FALSE
)

# Convert to matrix for safer replacement
count_mat_char <- as.matrix(count_df_char)

# Detect missing-like values:
# actual NA/NaN, empty strings, "NA", and "NaN"
missing_like <- is.na(count_mat_char) |
  count_mat_char == "" |
  toupper(count_mat_char) == "NA" |
  toupper(count_mat_char) == "NAN"

missing_count <- sum(missing_like, na.rm = TRUE)

if (missing_count > 0) {
  cat("Found", missing_count, "missing-like count values: NA / NaN / empty\n")
  
  if (replace_missing_counts_with_zero) {
    cat("These values will be replaced with 0.\n")
    count_mat_char[missing_like] <- "0"
  } else {
    stop("Missing-like count values were found.")
  }
}

# Convert to numeric matrix
count_matrix <- matrix(
  as.numeric(count_mat_char),
  nrow = nrow(count_mat_char),
  ncol = ncol(count_mat_char),
  dimnames = list(gene_id, sample_names)
)

# Check remaining NA values after numeric conversion
na_pos <- which(is.na(count_matrix), arr.ind = TRUE)

if (nrow(na_pos) > 0) {
  cat("NA values were generated during numeric conversion.\n")
  cat("These are values that are not numeric and were not simple NA/NaN/empty values.\n")
  cat("Please check the following rows and columns in the original count table:\n\n")
  
  problem_table <- data.frame(
    GeneID = gene_id[na_pos[, 1]],
    Sample = sample_names[na_pos[, 2]],
    OriginalValue = as.matrix(count_df_original)[na_pos],
    stringsAsFactors = FALSE
  )
  
  print(head(problem_table, 100))
  
  stop("Non-numeric count values remain. Please fix the values above.")
}

# ----------------------------
# Basic count checks
# ----------------------------

if (any(count_matrix < 0)) {
  stop("Negative count values were found. Gene Counts must be non-negative.")
}

if (any(count_matrix != round(count_matrix))) {
  warning("Some count values are not integers. They will be rounded before analysis.")
  count_matrix <- round(count_matrix)
}

# ----------------------------
# Define groups
# ----------------------------

group <- factor(disease_status)

# Keep only non-lesional and lesional samples
target_samples <- group %in% c("non-lesional", "lesional")

count_matrix <- count_matrix[, target_samples, drop = FALSE]
group <- droplevels(group[target_samples])
patient_id <- patient_id[target_samples]
sample_names <- sample_names[target_samples]

# Check whether both groups are present
if (!all(c("non-lesional", "lesional") %in% levels(group))) {
  stop("Both 'non-lesional' and 'lesional' must be present in Disease Status.")
}

# Set reference level
group <- relevel(group, ref = "non-lesional")

cat("Number of samples used:\n")
print(table(group))

cat("Number of genes used:", nrow(count_matrix), "\n")

# ----------------------------
# edgeR
# ----------------------------

dge <- DGEList(
  counts = count_matrix,
  group = group
)

# No gene filtering is applied
dge <- calcNormFactors(dge)

design_edgeR <- model.matrix(~ group)

dge <- estimateDisp(dge, design_edgeR)

fit <- glmFit(dge, design_edgeR)

lrt <- glmLRT(fit, coef = 2)

edgeR_table <- topTags(
  lrt,
  n = Inf,
  sort.by = "none"
)$table

edgeR_pvalue <- edgeR_table$PValue

edgeR_pvalue <- edgeR_pvalue[
  match(rownames(count_matrix), rownames(edgeR_table))
]

# edgeR logFC is log2(lesional / non-lesional)
edgeR_logFC <- edgeR_table$logFC

edgeR_logFC <- edgeR_logFC[
  match(rownames(count_matrix), rownames(edgeR_table))
]

edgeR_FC <- 2^edgeR_logFC

# ----------------------------
# DESeq2
# ----------------------------

coldata <- data.frame(
  row.names = colnames(count_matrix),
  group = group
)

dds <- DESeqDataSetFromMatrix(
  countData = count_matrix,
  colData = coldata,
  design = ~ group
)

# No gene filtering is applied
dds <- DESeq(dds)

res <- results(
  dds,
  contrast = c("group", "lesional", "non-lesional")
)

DESeq2_pvalue <- res$pvalue

DESeq2_pvalue <- DESeq2_pvalue[
  match(rownames(count_matrix), rownames(res))
]

# DESeq2 log2FoldChange is log2(lesional / non-lesional)
DESeq2_logFC <- res$log2FoldChange

DESeq2_logFC <- DESeq2_logFC[
  match(rownames(count_matrix), rownames(res))
]

DESeq2_FC <- 2^DESeq2_logFC

# ----------------------------
# Output 1:
# All genes with edgeR and DESeq2 p values
# ----------------------------

output_all <- data.frame(
  `NCBI Gene ID` = rownames(count_matrix),
  `edgeR p value` = edgeR_pvalue,
  `DESeq2 p value` = DESeq2_pvalue,
  check.names = FALSE
)

write.table(
  output_all,
  file = output_file,
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)

# ----------------------------
# Output 2:
# p < 0.05 gene lists
# ----------------------------

edgeR_p005 <- data.frame(
  `NCBI Gene ID` = rownames(count_matrix),
  `P value` = edgeR_pvalue,
  check.names = FALSE
)

edgeR_p005 <- edgeR_p005[
  !is.na(edgeR_p005$`P value`) &
    edgeR_p005$`P value` < 0.05,
]

write.table(
  edgeR_p005,
  file = "GSE121212_edgeR_p005_genes.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)

DESeq2_p005 <- data.frame(
  `NCBI Gene ID` = rownames(count_matrix),
  `P value` = DESeq2_pvalue,
  check.names = FALSE
)

DESeq2_p005 <- DESeq2_p005[
  !is.na(DESeq2_p005$`P value`) &
    DESeq2_p005$`P value` < 0.05,
]

write.table(
  DESeq2_p005,
  file = "GSE121212_DESeq2_p005_genes.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)

# ----------------------------
# Output 3:
# p < 0.05 and fold change >= 1.4 gene lists
#
# fold change is lesional / non-lesional.
#
# This includes both:
#   FC >= 1.4
#   FC <= 1 / 1.4
#
# Internally, this is judged by:
#   abs(log2 fold change) >= log2(1.4)
# ----------------------------

logFC_cutoff <- log2(fc_cutoff)

edgeR_p005_fc14 <- data.frame(
  `NCBI Gene ID` = rownames(count_matrix),
  `P value` = edgeR_pvalue,
  `fold change` = edgeR_FC,
  check.names = FALSE
)

edgeR_p005_fc14 <- edgeR_p005_fc14[
  !is.na(edgeR_pvalue) &
    !is.na(edgeR_logFC) &
    edgeR_pvalue < 0.05 &
    abs(edgeR_logFC) >= logFC_cutoff,
]

write.table(
  edgeR_p005_fc14,
  file = "GSE121212_edgeR_p005_FC1.4_genes.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)

DESeq2_p005_fc14 <- data.frame(
  `NCBI Gene ID` = rownames(count_matrix),
  `P value` = DESeq2_pvalue,
  `fold change` = DESeq2_FC,
  check.names = FALSE
)

DESeq2_p005_fc14 <- DESeq2_p005_fc14[
  !is.na(DESeq2_pvalue) &
    !is.na(DESeq2_logFC) &
    DESeq2_pvalue < 0.05 &
    abs(DESeq2_logFC) >= logFC_cutoff,
]

write.table(
  DESeq2_p005_fc14,
  file = "GSE121212_DESeq2_p005_FC1.4_genes.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)

# ----------------------------
# Finish
# ----------------------------

cat("Done.\n")
cat("Output files:\n")
cat(output_file, "\n")
cat("GSE121212_edgeR_p005_genes.txt\n")
cat("GSE121212_DESeq2_p005_genes.txt\n")
cat("GSE121212_edgeR_p005_FC1.4_genes.txt\n")
cat("GSE121212_DESeq2_p005_FC1.4_genes.txt\n")