source("http://bioconductor.org/biocLite.R")
library(Ringo)
source("http://bioconductor.org/biocLite.R")#
library(Ringo)#
library(ChIPpeakAnno)#
library(rtracklayer)#
#
#1st step from Ryan's code is to generate annotation object --what is an annotation object?:#
#this command takes a Nimblegen POS file a tab delim which #
#maps each probe to it's position in the reference genome#
#we received these files on a CD with the slides - I've saved them here -> http://eagle.fish.washington.edu/bivalvia/index.php?dir=array%2Fnimblegen%2F#
#ok, got it - going to run Ryan's code, but save into my own folder (added the verbose=F so it doesn't write out the whole file in the console)#
#...note: once this file is made once it can always be reference going forward, if you want to use the file Ryan made and sent, it's here -> http://eagle.fish.washington.edu/bivalvia/index.php?dir=array%2Fnimblegen%2F#
#probeAnno<-posToProbeAnno("121101_OysterV9_MG_meth.pos", genome="Pacific Oyster", microarrayPlatform="NimbleGen 121101 Oyster V9 MG meth",verbose = FALSE)#
#save(probeAnno,file="121101_OysterV9_MG_meth.ProbeAnnotation_MG.rda")#
#
#load annotation object - the file Ryan made#
setwd("/Volumes/web/bivalvia/array/Rstuff")#
load("121101_OysterV9_MG_meth.ProbeAnnotation.rda")#
#
#load gff#
gff<-read.delim("oyster.v9.glean.final.rename.mRNA.gff", header=F)#
head (gff)#
names(gff)<-c("chr","source","type","start","end","score","strand","phase","name")#
#
#Ryan's annotation of this step: 'read in altered targets file such that the control samlpe is in the denominator slot (Cy3)'#
#I'm not sure of the necessity of the date thing...#
todaysDate<-substr(format(Sys.time(), "%Y.%m.%d."),1,11)#
#The help for readNimblegen function says it's a function to read in Ng intensity text files into an RGlist#
#Ryan provided a 'spottypes' file and 'targets' file in a folder called 'pair' , the intensity files also need to be in here (see path argument) - #
#spot types are made or read by limma, I think targets is too.  I need to ask him about this a little more..#
#I took the pair files from the raw data from Cassie in the 'Oyster Array' folder, and put them in the 'pair' folder with the spot files and target types#
#renamed them to be consistent with Ryan's naming convention in the Target file#
RG<-readNimblegen("2013.09.24.targets.txt", "spottypes.txt", headerPattern="# software=DEVA", path="pair")#
save(RG, file="2014.03.03.RG.rda")#
#load a dataset written with the function save#
load("2014.03.03.RG.rda")#
#This step removes control probes (labeled as RANDOM (versus BLOCK1))#
RG<-RG[RG$gene$Status != "RANDOM",]#
#
#Preprocessing - taking raw probe intensities into (background corrected) normalized log ratios (M-values), using the loess normalization (note the first round of analysis used the 'nimblegen' method - should see why they changed but I do know there are fewer sites with the loess)#
NG<-preprocess(RG,method="loess")#
#renames the samples to be "targetIDnumerator_vs_targetIDdemoninator"#
sampleNames(NG) <-with(RG$targets, paste(FileNameCy5,"vs",FileNameCy3,sep="_"))#
#
#Ryan's annotation: ProbeIDs all have ":+:" in their names, which has been converted to "..." when the preprocess function is run.  This is in conflict#
#with the probeAnno#
row.names(exprs(NG))<-gsub("...",":+:",row.names(exprs(NG)), fixed=TRUE)#
#
#smooth probe intensities#
#computes running medians on tiling expression sets, arguements include the normalized file, the probe anno made earlier, the column that has the 'phenoData' #
#win half size says this: Half the size of the window centered at a probe position, in which all other probes contribute to the calculation of the median.need to ask about this one#
#in Ryan's original code the winHalfSize was 500 (full size 1000).  When they did the reananlysis the window size was changed to 600, so I changed this to 300#
smoothNG<-computeRunningMedians(NG,probeAnno=probeAnno, modColumn="FileNameCy5", winHalfSize=300,verbose = FALSE)#
###########Not running this next section, but I'm going to leave it in since this is how the original analysis was done.  Ryan and Jeff then suggested more conservative threshold:#
###########thresholds for the final analysis: 2 fold change (log2(2) =1) for A02 and A03, and a less stringent threshold of 1.4 fold change in A01 (log2(1.4)=0.485)#
###########when I asked why different threshold this was response: A lower threshold was used for the input vs input (A01) sample in an attempt to identify the most robust DMRs.  When using the same threshold for all samples, there were a large number of DMRs in A02 and A03, where the same, but slightly weaker pattern of logFC values was found.  #
#the apply means applying functions over error margine, 2 indicates columns, #
#upperBoundNull is a function from Ringo - This function tries to pinpoint the minimum of data values (log2 ratios) that are more likely to arise from the alternative distribution, i.e. an upper bound for values following the null distribution.#
#in the final analysis these thresholds weren't used.  Instead, a more conservative pick of Use a threshold of 2 fold change (log2(2) =1) for A02 and A03, and a less stringent threshold of 1.4 fold change in A01 (log2(1.4)=0.485), #
#y0 <- apply(exprs(smoothNG),2,upperBoundNull)#
# A01_EE2.input_532.pair_vs_A01_Ctrl.input_635.pair.sm     A02_EE2.MBD_532.pair_vs_A02_Ctrl.MBD_635.pair.sm #
# 0.6742440                                            0.6978518 #
# A03_EE2.MBD_635.pair_vs_A03_Ctrl.MBD_532.pair.sm #
# 0.7904904 #
#
#final threshold selection#
y0 <- c(.485,1,1)#
#
#'chers' are ChIP enriched regions, thisfirst one uses the smoothed probe intensities, the enviroment of mapping 'probe anno' and the thresholds for#
#for which you call something enriched, in this case it'sthe vcetor y0, which is the value of the upper bound of the null distribution - values above this are enriched, distCutOff is the#
#max #of bp at which enriched probes are condensed into 1 cher#
chersNG <- findChersOnSmoothed(smoothNG, probeAnno=probeAnno, thresholds=c(y0[1],y0[2],y0[3]), distCutOff=600, verbose = FALSE)#
#
#this function relates found chers to annotated genomic features, such as transcripts#
#gff is the mRNA regions (see above when it was read in)#
chersNG <- relateChers(chersNG, gff)#
#weird, this says called internally by other ringo functions, not normalled called by user#
chersNGD <- as.data.frame.cherList(chersNG)#
#order them in decreasing order by max level#
chersNGD<-chersNGD[order(chersNGD$maxLevel, decreasing=TRUE),]#
nrow(chersNGD) # --number of chers #4012#
#
#chers detected within each array#
nrow(chersNGD[grep("A01",chersNGD$name),])   #3036#
nrow(chersNGD[grep("A02",chersNGD$name),]) #448#
nrow(chersNGD[grep("A03",chersNGD$name),]) #528#
#
###note: I will come back to this later if I have time, but skipping it for now#
##Ryan's annotation: try to write wig using the writeWIG function from Starr#
#library(Starr)#
#writeWIG(smoothNG[,1],probeAnno,file="2013.09.25.A01_smoothed.wig")#
#writeWIG(smoothNG[,2],probeAnno,file="2013.09.25.A02_smoothed.wig")#
#writeWIG(smoothNG[,3],probeAnno,file="2013.09.25.A03_smoothed.wig")#
##had to change NAs to 0s in order to load into IGV#
#
#find overlapping regions#
#create ranged data objects#
#looks like here it's repeating finding chers, but each assay (e.g. A01) is generated separately#
chersA01 <- findChersOnSmoothed(smoothNG[,1], probeAnno=probeAnno, thresholds=c(y0[1]), distCutOff=600, verbose = FALSE)#
#described above#
chersA01 <- relateChers(chersA01, gff)#
#exports cherList into a file of gff or bed#
exportCherList(chersA01,filename="2014.03.03.chers_A01_HYPER.bed",format="bed",genome="oyster")#
#read back in without header#
bedA01<-read.delim("2014.03.03.chers_A01_HYPER.bed", header=F)#
#generate a 6th column with '1' in every row -it's a default for strand column#
bedA01$V6<-1#
#this ranged data is some version of a reformatted bed#
#it's used later for findOverlapping peaks (I wonder why bedtools wouldn't be simpler for this?)#
#this is required for the later step#
rangedA01<-BED2RangedData(bedA01)#
#
#doing the same thing for assay A02 and A03#
chersA02 <- findChersOnSmoothed(smoothNG[,2], probeAnno=probeAnno, thresholds=c(y0[2]), distCutOff=600, verbose = FALSE)#
chersA02<- relateChers(chersA02, gff)#
exportCherList(chersA02,filename="2014.03.03.chers_A02_HYPER.bed",format="bed",genome="oyster")#
bedA02<-read.delim("2014.03.03.chers_A02_HYPER.bed", header=F)#
bedA02$V6<-1#
rangedA02<-BED2RangedData(bedA02)#
#
chersA03 <- findChersOnSmoothed(smoothNG[,3], probeAnno=probeAnno, thresholds=c(y0[3]), distCutOff=600, verbose = FALSE)#
chersA03<- relateChers(chersA03, gff)#
exportCherList(chersA03,filename="2014.03.03.chers_A03_HYPER.bed",format="bed",genome="oyster")#
bedA03<-read.delim("2014.03.03.chers_A03_HYPER.bed", header=F)#
bedA03$V6<-1#
rangedA03<-BED2RangedData(bedA03)#
#
#generate objects that contain overlapping regions#
#this requires the 'ranged' data generated above#
overlapA01.A02<-findOverlappingPeaks(rangedA01,rangedA02)#
overlapA01.A03<-findOverlappingPeaks(rangedA01,rangedA03)#
overlapA02.A03<-findOverlappingPeaks(rangedA02,rangedA03)#
#
#number of overlapping peaks in each pairwise comparison#
nrow(overlapA01.A02$OverlappingPeaks) #397#
nrow(overlapA01.A03$OverlappingPeaks) #394#
nrow(overlapA02.A03$OverlappingPeaks) #353#
#
#This shows how multiple peaks in A02 are covered by 1 peak in A03. Here 346 regions of A02 overlap with A03, but 344 regions of A03 overlap with A02.#
##############
nrow(rangedA02) #448#
A02peaksWithOverlapA03<-overlapA02.A03$Peaks1withOverlaps; nrow(A02peaksWithOverlapA03) #346#
A02noOverlapWithA03<-rangedA02[!rownames(rangedA02) %in% rownames(A02peaksWithOverlapA03),]; nrow(A02noOverlapWithA03) #102#
#
nrow(rangedA03) #528#
A03peaksWithOverlapA02<-overlapA02.A03$Peaks2withOverlaps; nrow(A03peaksWithOverlapA02) #344#
A03noOverlapWithA02<-rangedA03[!rownames(rangedA03) %in% rownames(A03peaksWithOverlapA02),]; nrow(A03noOverlapWithA02) #184#
#################
#
#Ryan's annotation: we now have overlapping and nonoverlapping chers between A02 and A03, what we want now, is to remove those chers that also overlap with A01 from each set#
A02peaksWithOverlapA01<-overlapA01.A02$Peaks2withOverlaps; nrow(A02peaksWithOverlapA01) #394#
A02peaksWithOverlapA03butNotA01<-A02peaksWithOverlapA03[!rownames(A02peaksWithOverlapA03) %in% rownames(A02peaksWithOverlapA01),]; nrow(A02peaksWithOverlapA03butNotA01) #30#
#
A03peaksWithOverlapA01<-overlapA01.A03$Peaks2withOverlaps; nrow(A03peaksWithOverlapA01) #461#
A03peaksWithOverlapA02butNotA01<-A03peaksWithOverlapA02[!rownames(A03peaksWithOverlapA02) %in% rownames(A03peaksWithOverlapA01),]; nrow(A03peaksWithOverlapA02butNotA01) #28#
#
#write RangedData to bed#
#Ryan's annotation: https://stat.ethz.ch/pipermail/bioconductor/2012-April/045050.html#
write.table(as.data.frame(A02peaksWithOverlapA03butNotA01)[,c(1,2,3)], file="2014.03.03.A02overlapA03_butNotA01_HYPER.bed",sep="\t",quote=F,row.names=F,col.names=F)#
write.table(as.data.frame(A03peaksWithOverlapA02butNotA01)[,c(1,2,3)], file="2014.03.03.A03overlapA02_butNotA01_HYPER.bed",sep="\t",quote=F,row.names=F,col.names=F)#
#
#######################################################################################################################
# Hypomethylation#
#
#flip the ratios on these such that we're looking for hypomethylated regions#
exprs(smoothNG) = -exprs(smoothNG)#
chersNG <- findChersOnSmoothed(smoothNG, probeAnno=probeAnno, thresholds=c(y0[1],y0[2],y0[3]), distCutOff=600, verbose = FALSE)#
chersNG <- relateChers(chersNG, gff)#
chersNGD <- as.data.frame.cherList(chersNG)#
chersNGD<-chersNGD[order(chersNGD$maxLevel, decreasing=TRUE),]#
nrow(chersNGD) # --number of chers #3518#
#
#chers detected within each array#
nrow(chersNGD[grep("A01",chersNGD$name),])   #2758#
nrow(chersNGD[grep("A02",chersNGD$name),]) #348#
nrow(chersNGD[grep("A03",chersNGD$name),]) #412#
#repeating finding chers, but each assay (e.g. A01) is generated separately#
chersA01 <- findChersOnSmoothed(smoothNG[,1], probeAnno=probeAnno, thresholds=c(y0[1]), distCutOff=600, verbose = FALSE)#
chersA01 <- relateChers(chersA01, gff)#
exportCherList(chersA01,filename="2014.03.03.chers_A01_HYPO.bed",format="bed",genome="oyster")#
bedA01<-read.delim("2014.03.03.chers_A01_HYPO.bed", header=F)#
bedA01$V6<-1#
rangedA01<-BED2RangedData(bedA01)#
#
#doing the same thing for assay A02 and A03#
chersA02 <- findChersOnSmoothed(smoothNG[,2], probeAnno=probeAnno, thresholds=c(y0[2]), distCutOff=600, verbose = FALSE)#
chersA02<- relateChers(chersA02, gff)#
exportCherList(chersA02,filename="2014.03.03.chers_A02_HYPO.bed",format="bed",genome="oyster")#
bedA02<-read.delim("2014.03.03.chers_A02_HYPO.bed", header=F)#
bedA02$V6<-1#
rangedA02<-BED2RangedData(bedA02)#
#
chersA03 <- findChersOnSmoothed(smoothNG[,3], probeAnno=probeAnno, thresholds=c(y0[3]), distCutOff=600, verbose = FALSE)#
chersA03<- relateChers(chersA03, gff)#
exportCherList(chersA03,filename="2014.03.03.chers_A03_HYPO.bed",format="bed",genome="oyster")#
bedA03<-read.delim("2014.03.03.chers_A03_HYPO.bed", header=F)#
bedA03$V6<-1#
rangedA03<-BED2RangedData(bedA03)#
#
#generate objects that contain overlapping regions#
#this requires the 'ranged' data generated above#
overlapA01.A02<-findOverlappingPeaks(rangedA01,rangedA02)#
overlapA01.A03<-findOverlappingPeaks(rangedA01,rangedA03)#
overlapA02.A03<-findOverlappingPeaks(rangedA02,rangedA03)#
#
#number of overlapping peaks in each pairwise comparison#
nrow(overlapA01.A02$OverlappingPeaks) #317#
nrow(overlapA01.A03$OverlappingPeaks) #369#
nrow(overlapA02.A03$OverlappingPeaks) #269#
#
#This shows how multiple peaks in A03 are covered by 1 peak in A02. Here 255 regions of A02 overlap with A03, but 265 regions of A03 overlap with A02.#
##############
nrow(rangedA02) #348#
A02peaksWithOverlapA03<-overlapA02.A03$Peaks1withOverlaps; nrow(A02peaksWithOverlapA03) #255#
A02noOverlapWithA03<-rangedA02[!rownames(rangedA02) %in% rownames(A02peaksWithOverlapA03),]; nrow(A02noOverlapWithA03) #93#
#
nrow(rangedA03) #412#
A03peaksWithOverlapA02<-overlapA02.A03$Peaks2withOverlaps; nrow(A03peaksWithOverlapA02) #265#
A03noOverlapWithA02<-rangedA03[!rownames(rangedA03) %in% rownames(A03peaksWithOverlapA02),]; nrow(A03noOverlapWithA02) #147#
#################
#
#Ryan's annotation: we now have overlapping and nonoverlapping chers between A02 and A03, what we want now, is to remove those chers that also overlap with A01 from each set#
A02peaksWithOverlapA01<-overlapA01.A02$Peaks2withOverlaps; nrow(A02peaksWithOverlapA01) #315#
A02peaksWithOverlapA03butNotA01<-A02peaksWithOverlapA03[!rownames(A02peaksWithOverlapA03) %in% rownames(A02peaksWithOverlapA01),]; nrow(A02peaksWithOverlapA03butNotA01) #19#
#
A03peaksWithOverlapA01<-overlapA01.A03$Peaks2withOverlaps; nrow(A03peaksWithOverlapA01) #364#
A03peaksWithOverlapA02butNotA01<-A03peaksWithOverlapA02[!rownames(A03peaksWithOverlapA02) %in% rownames(A03peaksWithOverlapA01),]; nrow(A03peaksWithOverlapA02butNotA01) #18#
#
#write RangedData to bed#
#Ryan's annotation: https://stat.ethz.ch/pipermail/bioconductor/2012-April/045050.html#
write.table(as.data.frame(A02peaksWithOverlapA03butNotA01)[,c(1,2,3)], file="2014.03.03.A02overlapA03_butNotA01_HYPO.bed",sep="\t",quote=F,row.names=F,col.names=F)#
write.table(as.data.frame(A03peaksWithOverlapA02butNotA01)[,c(1,2,3)], file="2014.03.03.A03overlapA02_butNotA01_HYPO.bed",sep="\t",quote=F,row.names=F,col.names=F)
load("/Volumes/web/bivalvia/array/Rstuff/121101_OysterV9_MG_meth.ProbeAnnotation.rda")