#####################################################
# Script to find the number of genes per GO-slim term
# and the number of shared genes between GO-slim terms
# Requires a matrix or data frame (read in from a file) 
# containing Swiss-Pro IDs, GO domain, and GO-slim terms.
# Returns two matrices: One with the number of genes per GO-slim terms,
# and one with the number of genes shared between GO-slim terms.
# These matrices are written out as CSV files
# and used to build networks in Python.
#
# Author: Sonia Singhal, 18 Aug 2012
#####################################################

# read in data
filename <- "C:/Users/Sonia/Downloads/ID for DEG in QPX (trimmed).txt"
diff_gene_data <- read.delim(filename, stringsAsFactors = FALSE)
# extract genes involved in biological processes
proc_data <- subset(diff_gene_data, GO.domain2 == "P")

# For each GO-slim term, count the number of genes involved.
GOslim <- unique(proc_data$GO.slim)
genes_per_slim <- data.frame(GOslim = GOslim, num_genes = rep(0, length(GOslim)))
for (s in GOslim){
  sub <- subset(proc_data, GO.slim == s, select = c(Gene_ID, GO.slim))
  genes_per_slim$num_genes[genes_per_slim$GOslim == s] <- length(unique(sub$Gene_ID))
}

# Initialize the network matrix.
# Link the row and column numbers in the matrix to the numbers in the GOslim vector
GOslim_mat <- matrix(nrow = length(GOslim), ncol = length(GOslim), data = 0)

# for each gene, count the number of GO-slim terms
# Insert the values in the matrix
genes <- unique(proc_data$Gene_ID)
for(g in genes){
  sub <-subset(proc_data, Gene_ID == g, select = c(Gene_ID, GO.slim))
  slim_terms <- unique(sub$GO.slim)
  if(length(slim_terms) > 1){
    for(term1 in slim_terms){
      for(term2 in slim_terms){
        if (term1 == term2){
          next
        }
        GOslim_mat[which(GOslim == term1), which(GOslim == term2)] <- GOslim_mat[which(GOslim == term1), which(GOslim == term2)] + 1
      }
    }
  }
}

# write the matrix to a file
write.csv(GOslim_mat, file = "FHL/GOslim_mat.csv")