##################################################### # Script to find the number of genes per GO-slim term # and the number of shared genes between GO-slim terms # Requires a matrix or data frame (read in from a file) # containing Swiss-Pro IDs, GO domain, and GO-slim terms. # Returns two matrices: One with the number of genes per GO-slim terms, # and one with the number of genes shared between GO-slim terms. # These matrices are written out as CSV files # and used to build networks in Python. # # Author: Sonia Singhal, 18 Aug 2012 ##################################################### # read in data filename <- "C:/Users/Sonia/Downloads/ID for DEG in QPX (trimmed).txt" diff_gene_data <- read.delim(filename, stringsAsFactors = FALSE) # extract genes involved in biological processes proc_data <- subset(diff_gene_data, GO.domain2 == "P") # For each GO-slim term, count the number of genes involved. GOslim <- unique(proc_data$GO.slim) genes_per_slim <- data.frame(GOslim = GOslim, num_genes = rep(0, length(GOslim))) for (s in GOslim){ sub <- subset(proc_data, GO.slim == s, select = c(Gene_ID, GO.slim)) genes_per_slim$num_genes[genes_per_slim$GOslim == s] <- length(unique(sub$Gene_ID)) } # Initialize the network matrix. # Link the row and column numbers in the matrix to the numbers in the GOslim vector GOslim_mat <- matrix(nrow = length(GOslim), ncol = length(GOslim), data = 0) # for each gene, count the number of GO-slim terms # Insert the values in the matrix genes <- unique(proc_data$Gene_ID) for(g in genes){ sub <-subset(proc_data, Gene_ID == g, select = c(Gene_ID, GO.slim)) slim_terms <- unique(sub$GO.slim) if(length(slim_terms) > 1){ for(term1 in slim_terms){ for(term2 in slim_terms){ if (term1 == term2){ next } GOslim_mat[which(GOslim == term1), which(GOslim == term2)] <- GOslim_mat[which(GOslim == term1), which(GOslim == term2)] + 1 } } } } # write the matrix to a file write.csv(GOslim_mat, file = "FHL/GOslim_mat.csv")