#####################################################
# Script to create network graphs of genes shared between GO-slim terms.
# Requires a matrix or data frame (read in from a file) 
# containing Swiss-Pro IDs (Gene_ID), GO domain (GO.domain2), and GO-slim terms (GO.slim).
# (If column names are different, code will have to be changed appropriately.)
# Creates data frames with the number of genes per GO-slim terms and
# the number of genes shared between GO-slim terms and
# uses these matrices to create network graphs using iGraph.
# The matrices are also written to csv files for use in
# network_statistics_SUPPL.py, a Python program that obtains basic
# statistics on the networks.
#
# Author: Sonia Singhal, 22 Aug 2012
# singhal@u.washington.edu
#####################################################

library(igraph)

# Function to get a vector of unique GO-slim terms from a data table.
getGOslimNames <- function(GOslim_data){
  return (unique(GOslim_data$GO.slim))
}

# Function to count the number of genes involved in each GO-slim term
getGenesPerSlim <- function(GOslim_data){
  GOslim <- getGOslimNames(GOslim_data)
  genes_per_slim <- data.frame(GOslim = GOslim, num_genes = rep(0, length(GOslim)))
  for (s in GOslim){
    sub <- subset(GOslim_data, GO.slim == s, select = c(Gene_ID, GO.slim))
    genes_per_slim$num_genes[genes_per_slim$GOslim == s] <- length(unique(sub$Gene_ID))
  }
  return (genes_per_slim)
}

# Function to create an adjacency matrix from shared genes between GO-slim terms.
makeAdjancencyMat <- function(GOslim_data){
  # Initialize the network adjacency matrix.
  GOslim <- getGOslimNames(GOslim_data)
  GOslim_mat <- matrix(nrow = length(GOslim), ncol = length(GOslim), data = 0)
  dimnames(GOslim_mat)[[1]] <- GOslim
  dimnames(GOslim_mat)[[2]] <-GOslim
  
  # For each gene, count the number of GO-slim terms
  # Insert the values in the matrix
  genes <- unique(GOslim_data$Gene_ID)
  for(g in genes){
    sub <-subset(GOslim_data, Gene_ID == g, select = c(Gene_ID, GO.slim))
    slim_terms <- unique(sub$GO.slim)
    if(length(slim_terms) > 1){
      for(term1 in slim_terms){
        for(term2 in slim_terms){
          if (term1 == term2){
            next
          }
          GOslim_mat[which(GOslim == term1), which(GOslim == term2)] <- GOslim_mat[which(GOslim == term1), which(GOslim == term2)] + 1
        }
      }
    }
  }
  return (GOslim_mat)
}

# Initialize the graph
makeIGraph <- function(GOslim_mat, genes_per_slim){
  g <- graph.adjacency(GOslim_mat, mode = "undirected", weighted = TRUE)
  g <- set.vertex.attribute(g, "name", value = as.character(genes_per_slim$GOslim))
  g <- set.vertex.attribute(g, "number", value = genes_per_slim$num_genes)
  return (g)
}


# Read in data. Filename will need to be changed for a different file/ computer.
# File must contain columns with the GO-slim terms ("GO.slim");
# the GO domain ("GO.domain2"), and a Swiss-Prot or other unique ID ("Gene_ID").
filename <- "C:/Users/Sonia/Downloads/ID for DEG in QPX (trimmed).txt"
diff_gene_data <- read.delim(filename, stringsAsFactors = FALSE)
# extract genes involved in biological processes
proc_data <- subset(diff_gene_data, GO.domain2 == "P")
# Remove "other" categories in the GO-slim terms
proc_data <- subset(proc_data, GO.slim != "other biological processes")
proc_data <- subset(proc_data, GO.slim != "other metabolic processes")
# Find the number of genes per GO-slim term
proc_genes_per_slim <- getGenesPerSlim(proc_data)
# Create the adjacency matrix of connections
GOslim_proc_mat <- makeAdjancencyMat(proc_data)
# Initialize the graph
proc_graph <- makeIGraph(GOslim_proc_mat, proc_genes_per_slim)

# The figures that appear in the paper were created first 
# by plotting with the tkplot function in iGraph, which allows for
# interactive arrangement of the nodes in the graph, and adjusted by hand.
# The position of each node was then obtained by querying the tkplot for the coordinates
# and tweaked in the basic plotting function (igraph.plot).
# Node names were added in separately, after the graph had been exported.
# Sample functions:
tkplot(proc_graph, 
       vertex.label = V(proc_graph)$name,
       vertex.size = log(V(proc_graph)$number)*10,
       edge.width = E(proc_graph)$weight, 
       edge.label = E(proc_graph)$weight,
       edge.color = "black")

proc_coords <- tkplot.getcoords(2) # 2 is the number of the particular tkplot.

plot.igraph(proc_graph, 
            vertex.label = NA, # Remove node name            
            vertex.size = log(V(proc_graph)$number)*10, # Scale node size by the number of genes
            edge.width = E(proc_graph)$weight, # Change width of connections based on the number of shared genes 
            layout = proc_coords2, # Use the coordinates from the tkplot to arrange nodes
            edge.color = "black")


# Cellular processes:
cell_data <- subset(diff_gene_data, GO.domain2 == "C")
cell_data <- subset(cell_data, GO.slim != "other cellular component")
cell_genes_per_slim <- getGenesPerSlim(cell_data)

GOslim_cell_mat <- makeAdjancencyMat(cell_data)

cell_graph <- makeIGraph(GOslim_cell_mat, cell_genes_per_slim)

tkplot(cell_graph, 
       vertex.label = V(cell_graph)$name,
       vertex.size = log(V(cell_graph)$number)*10,
       edge.width = E(cell_graph)$weight, 
       edge.label = E(cell_graph)$weight,
       edge.color = "black")

cell_coords <- tkplot.getcoords(4)

plot.igraph(cell_graph, 
            vertex.label = NA, 
            vertex.size = log(V(cell_graph)$number)*10,
            edge.width = E(cell_graph)$weight,  
            layout = cell_coords,
            edge.color = "black")

# Write the matrix files to a csv for use in the Python program,
# network_statistics_SUPPL.py.
write.csv(GOslim_proc_mat, "proc_mat.csv")
write.csv(GOslim_cell_mat, "cell_mat.csv")
write.csv(proc_genes_per_slim, "proc_genes.csv")
write.csv(cell_genes_per_slim, "cell_genes.csv")