######################################################
# R script to make scatterplots of data of unique reads from genomics/ transcriptomics experiments.
# Requires a text file containing contig names, unique reads for each treatment, and fold change data.
# Returns a scatterplot of the unique reads plotted x vs. y for each pair of treatments.
# Reads can be color-coded according to the fold change data.
# The graphs can either be printed to a file or to the console.
# Printing graphs to console allows for interactive identification of interesting points.
#
# Author: Sonia Singhal, 17 Aug 2012
#####################################################

# Read in the file. File path will need to be changed on a different computer.
filename <- "C:/Users/Sonia/Documents/FHL/Carolyn's data.txt"
c_data <- read.delim(filename, stringsAsFactors = FALSE)
c_data[,5] <- as.numeric(c_data[,5])   # Necessary for this particular file; may not always be needed. 

# Pull out the columns with the contig names, fold change, and unique reads. These numbers may change by data set.
unique_reads <- data.frame(contig = c_data[,1], fold_change = c_data[,5], unique400 = c_data[,8], unique1600 = c_data[,14], unique2200 = c_data[,20], unique1000 = c_data[,26])
unique_reads <- na.omit(unique_reads) # This particular file had 252 NA values; these were removed from analysis.

# Histogram or boxplot of fold change helps determine obvious cutoffs to use for coloring the graph.
hist(c_data[,5])
boxplot(c_data[,5])

# Use the next two commands to print the graphs to a file. In this case, points cannot be actively identified.
dev.set(dev.next())
pdf("Carolyn data.pdf", onefile = TRUE)

# Run through each set of treatment pairs and graph the unique reads from each treatment on a log scale as x vs. y. 
# Note that there is a small bug in this loop, and some of the graphs get made twice with the axes flipped.
for (i in 3:length(names(unique_reads))){
  for (j in 4:length(names(unique_reads))){
    if(j != i){
      xvals <- log(unique_reads[,i])
      yvals <- log(unique_reads[,j])
      xlab <- names(unique_reads)[i]
      ylab <- names(unique_reads)[j]
      # To use different fold-change cutoff values for the colors, change the 50 or the 2 in the command below to the desired number.
      # Colors can also be changed--basic colors are all recognized.
      # Point type can be changed with the argument pch. Using a symbol in quotations will make that symbol a point. Removing pch = "+" results in open circles.
      plot(xvals, yvals, main = "Comparison of differential expression", xlab = paste("Log of ", xlab), ylab = paste("Log of ", ylab), pch = "+", col = ifelse(abs(diffex_data$fold_change) > 50, "red", ifelse(abs(diffex_data$fold_change) > 2, "blue", "black")))
      # add a 1:1 line to the graph.
      abline(0, 1)
      # If the graphs are NOT being printed to a file, the following commands can be used to identify interesting points.
      # Print the particular treatments to the console.
      cat(xlab, ylab)
      # Identify and label points
      identify(xvals, yvals, labels = unique_reads[,1], plot = FALSE)
      # Points can be identified by moving the mouse over them and clicking on them.
      # When all points have been identified (return to console and hit "esc"), the point number will be printed to the consol and the contig name will be printed on the graph.
      # Note that, depending on the length of the contig name, individual names may overlap or get cut off.
      # However, they can always be retrieved by asking for the name at the point value.
      # For example, if one identified point is 189, use:
      # unique_reads$contig[189]
      # to retrieve the contig name.
      
      # Pause and wait for carriage return before starting the next loop. For use with the identify() commands.
      browser()
    }
    
    else {
      next
    }
    
  }
}
# Close the file if the graphs have been printed to a file.
dev.off()

# It is also possible to retrieve any subset of the unique_reads table.
# Example: Get a table only of contigs with a fold change greater than 50 that are in the column "unique400".
# Format: subset(TABLE NAME, EXPRESSION OF ROWS TO KEEP, select = c(NAMES OF COLUMNS TO KEEP))
# All caps get replaced appropriately.
sub_table <- subset(unique_reads, fold_change > 50, select = c(contig, fold_change, unique400))
# Summarize the data in the table (number of rows; means, maxes, and mins for numeric columns):
summary(sub_table)
# View the table in the console
sub_table
# Write the table to a text file
write(sub_table, "Greater_than_50.txt")