##################################################### # Script to create network graphs of genes shared between GO-slim terms. # Requires a matrix or data frame (read in from a file) # containing Swiss-Pro IDs (Gene_ID), GO domain (GO.domain2), and GO-slim terms (GO.slim). # (If column names are different, code will have to be changed appropriately.) # Creates data frames with the number of genes per GO-slim terms and # the number of genes shared between GO-slim terms and # uses these matrices to create network graphs using iGraph. # The matrices are also written to csv files for use in # network_statistics_SUPPL.py, a Python program that obtains basic # statistics on the networks. # # Author: Sonia Singhal, 22 Aug 2012 # singhal@u.washington.edu ##################################################### library(igraph) # Function to get a vector of unique GO-slim terms from a data table. getGOslimNames <- function(GOslim_data){ return (unique(GOslim_data$GO.slim)) } # Function to count the number of genes involved in each GO-slim term getGenesPerSlim <- function(GOslim_data){ GOslim <- getGOslimNames(GOslim_data) genes_per_slim <- data.frame(GOslim = GOslim, num_genes = rep(0, length(GOslim))) for (s in GOslim){ sub <- subset(GOslim_data, GO.slim == s, select = c(Gene_ID, GO.slim)) genes_per_slim$num_genes[genes_per_slim$GOslim == s] <- length(unique(sub$Gene_ID)) } return (genes_per_slim) } # Function to create an adjacency matrix from shared genes between GO-slim terms. makeAdjancencyMat <- function(GOslim_data){ # Initialize the network adjacency matrix. GOslim <- getGOslimNames(GOslim_data) GOslim_mat <- matrix(nrow = length(GOslim), ncol = length(GOslim), data = 0) dimnames(GOslim_mat)[[1]] <- GOslim dimnames(GOslim_mat)[[2]] <-GOslim # For each gene, count the number of GO-slim terms # Insert the values in the matrix genes <- unique(GOslim_data$Gene_ID) for(g in genes){ sub <-subset(GOslim_data, Gene_ID == g, select = c(Gene_ID, GO.slim)) slim_terms <- unique(sub$GO.slim) if(length(slim_terms) > 1){ for(term1 in slim_terms){ for(term2 in slim_terms){ if (term1 == term2){ next } GOslim_mat[which(GOslim == term1), which(GOslim == term2)] <- GOslim_mat[which(GOslim == term1), which(GOslim == term2)] + 1 } } } } return (GOslim_mat) } # Initialize the graph makeIGraph <- function(GOslim_mat, genes_per_slim){ g <- graph.adjacency(GOslim_mat, mode = "undirected", weighted = TRUE) g <- set.vertex.attribute(g, "name", value = as.character(genes_per_slim$GOslim)) g <- set.vertex.attribute(g, "number", value = genes_per_slim$num_genes) return (g) } # Read in data. Filename will need to be changed for a different file/ computer. # File must contain columns with the GO-slim terms ("GO.slim"); # the GO domain ("GO.domain2"), and a Swiss-Prot or other unique ID ("Gene_ID"). filename <- "C:/Users/Sonia/Downloads/ID for DEG in QPX (trimmed).txt" diff_gene_data <- read.delim(filename, stringsAsFactors = FALSE) # extract genes involved in biological processes proc_data <- subset(diff_gene_data, GO.domain2 == "P") # Remove "other" categories in the GO-slim terms proc_data <- subset(proc_data, GO.slim != "other biological processes") proc_data <- subset(proc_data, GO.slim != "other metabolic processes") # Find the number of genes per GO-slim term proc_genes_per_slim <- getGenesPerSlim(proc_data) # Create the adjacency matrix of connections GOslim_proc_mat <- makeAdjancencyMat(proc_data) # Initialize the graph proc_graph <- makeIGraph(GOslim_proc_mat, proc_genes_per_slim) # The figures that appear in the paper were created first # by plotting with the tkplot function in iGraph, which allows for # interactive arrangement of the nodes in the graph, and adjusted by hand. # The position of each node was then obtained by querying the tkplot for the coordinates # and tweaked in the basic plotting function (igraph.plot). # Node names were added in separately, after the graph had been exported. # Sample functions: tkplot(proc_graph, vertex.label = V(proc_graph)$name, vertex.size = log(V(proc_graph)$number)*10, edge.width = E(proc_graph)$weight, edge.label = E(proc_graph)$weight, edge.color = "black") proc_coords <- tkplot.getcoords(2) # 2 is the number of the particular tkplot. plot.igraph(proc_graph, vertex.label = NA, # Remove node name vertex.size = log(V(proc_graph)$number)*10, # Scale node size by the number of genes edge.width = E(proc_graph)$weight, # Change width of connections based on the number of shared genes layout = proc_coords2, # Use the coordinates from the tkplot to arrange nodes edge.color = "black") # Cellular processes: cell_data <- subset(diff_gene_data, GO.domain2 == "C") cell_data <- subset(cell_data, GO.slim != "other cellular component") cell_genes_per_slim <- getGenesPerSlim(cell_data) GOslim_cell_mat <- makeAdjancencyMat(cell_data) cell_graph <- makeIGraph(GOslim_cell_mat, cell_genes_per_slim) tkplot(cell_graph, vertex.label = V(cell_graph)$name, vertex.size = log(V(cell_graph)$number)*10, edge.width = E(cell_graph)$weight, edge.label = E(cell_graph)$weight, edge.color = "black") cell_coords <- tkplot.getcoords(4) plot.igraph(cell_graph, vertex.label = NA, vertex.size = log(V(cell_graph)$number)*10, edge.width = E(cell_graph)$weight, layout = cell_coords, edge.color = "black") # Write the matrix files to a csv for use in the Python program, # network_statistics_SUPPL.py. write.csv(GOslim_proc_mat, "proc_mat.csv") write.csv(GOslim_cell_mat, "cell_mat.csv") write.csv(proc_genes_per_slim, "proc_genes.csv") write.csv(cell_genes_per_slim, "cell_genes.csv")