source("http://bioconductor.org/biocLite.R") library(Ringo) library(ChIPpeakAnno) library(rtracklayer) #1st step from Ryan's code is to generate annotation object --what is an annotation object?: #this command takes a Nimblegen POS file a tab delim which #maps each probe to it's position in the reference genome #we received these files on a CD with the slides - I've saved them here -> http://eagle.fish.washington.edu/bivalvia/index.php?dir=array%2Fnimblegen%2F #ok, got it - going to run Ryan's code, but save into my own folder (added the verbose=F so it doesn't write out the whole file in the console) #...note: once this file is made once it can always be reference going forward, if you want to use the file Ryan made and sent, it's here -> http://eagle.fish.washington.edu/bivalvia/index.php?dir=array%2Fnimblegen%2F #probeAnno<-posToProbeAnno("121101_OysterV9_MG_meth.pos", genome="Pacific Oyster", microarrayPlatform="NimbleGen 121101 Oyster V9 MG meth",verbose = FALSE) #save(probeAnno,file="121101_OysterV9_MG_meth.ProbeAnnotation_MG.rda") #load annotation object - the file Ryan made setwd("/Volumes/web/bivalvia/array/Rstuff") load("121101_OysterV9_MG_meth.ProbeAnnotation.rda") #load gff gff<-read.delim("oyster.v9.glean.final.rename.mRNA.gff", header=F) head (gff) names(gff)<-c("chr","source","type","start","end","score","strand","phase","name") #Ryan's annotation of this step: 'read in altered targets file such that the control samlpe is in the denominator slot (Cy3)' #I'm not sure of the necessity of the date thing... todaysDate<-substr(format(Sys.time(), "%Y.%m.%d."),1,11) #The help for readNimblegen function says it's a function to read in Ng intensity text files into an RGlist #Ryan provided a 'spottypes' file and 'targets' file in a folder called 'pair' , the intensity files also need to be in here (see path argument) - #spot types are made or read by limma, I think targets is too. I need to ask him about this a little more.. #I took the pair files from the raw data from Cassie in the 'Oyster Array' folder, and put them in the 'pair' folder with the spot files and target types #renamed them to be consistent with Ryan's naming convention in the Target file RG<-readNimblegen("2013.09.24.targets.txt", "spottypes.txt", headerPattern="# software=DEVA", path="pair") save(RG, file="2014.03.03.RG.rda") #load a dataset written with the function save load("2014.03.03.RG.rda") #This step removes control probes (labeled as RANDOM (versus BLOCK1)) RG<-RG[RG$gene$Status != "RANDOM",] #Preprocessing - taking raw probe intensities into (background corrected) normalized log ratios (M-values), using the loess normalization (note the first round of analysis used the 'nimblegen' method - should see why they changed but I do know there are fewer sites with the loess) NG<-preprocess(RG,method="loess") #renames the samples to be "targetIDnumerator_vs_targetIDdemoninator" sampleNames(NG) <-with(RG$targets, paste(FileNameCy5,"vs",FileNameCy3,sep="_")) #Ryan's annotation: ProbeIDs all have ":+:" in their names, which has been converted to "..." when the preprocess function is run. This is in conflict #with the probeAnno row.names(exprs(NG))<-gsub("...",":+:",row.names(exprs(NG)), fixed=TRUE) #smooth probe intensities #computes running medians on tiling expression sets, arguements include the normalized file, the probe anno made earlier, the column that has the 'phenoData' #win half size says this: Half the size of the window centered at a probe position, in which all other probes contribute to the calculation of the median.need to ask about this one #in Ryan's original code the winHalfSize was 500 (full size 1000). When they did the reananlysis the window size was changed to 600, so I changed this to 300 smoothNG<-computeRunningMedians(NG,probeAnno=probeAnno, modColumn="FileNameCy5", winHalfSize=300,verbose = FALSE) ###########Not running this next section, but I'm going to leave it in since this is how the original analysis was done. Ryan and Jeff then suggested more conservative threshold: ###########thresholds for the final analysis: 2 fold change (log2(2) =1) for A02 and A03, and a less stringent threshold of 1.4 fold change in A01 (log2(1.4)=0.485) ###########when I asked why different threshold this was response: A lower threshold was used for the input vs input (A01) sample in an attempt to identify the most robust DMRs. When using the same threshold for all samples, there were a large number of DMRs in A02 and A03, where the same, but slightly weaker pattern of logFC values was found. #the apply means applying functions over error margine, 2 indicates columns, #upperBoundNull is a function from Ringo - This function tries to pinpoint the minimum of data values (log2 ratios) that are more likely to arise from the alternative distribution, i.e. an upper bound for values following the null distribution. #in the final analysis these thresholds weren't used. Instead, a more conservative pick of Use a threshold of 2 fold change (log2(2) =1) for A02 and A03, and a less stringent threshold of 1.4 fold change in A01 (log2(1.4)=0.485), #y0 <- apply(exprs(smoothNG),2,upperBoundNull) # A01_EE2.input_532.pair_vs_A01_Ctrl.input_635.pair.sm A02_EE2.MBD_532.pair_vs_A02_Ctrl.MBD_635.pair.sm # 0.6742440 0.6978518 # A03_EE2.MBD_635.pair_vs_A03_Ctrl.MBD_532.pair.sm # 0.7904904 #final threshold selection y0 <- c(.485,1,1) #'chers' are ChIP enriched regions, thisfirst one uses the smoothed probe intensities, the enviroment of mapping 'probe anno' and the thresholds for #for which you call something enriched, in this case it'sthe vcetor y0, which is the value of the upper bound of the null distribution - values above this are enriched, distCutOff is the #max #of bp at which enriched probes are condensed into 1 cher chersNG <- findChersOnSmoothed(smoothNG, probeAnno=probeAnno, thresholds=c(y0[1],y0[2],y0[3]), distCutOff=600, verbose = FALSE) #this function relates found chers to annotated genomic features, such as transcripts #gff is the mRNA regions (see above when it was read in) chersNG <- relateChers(chersNG, gff) #weird, this says called internally by other ringo functions, not normalled called by user chersNGD <- as.data.frame.cherList(chersNG) #order them in decreasing order by max level chersNGD<-chersNGD[order(chersNGD$maxLevel, decreasing=TRUE),] nrow(chersNGD) # --number of chers #4012 #chers detected within each array nrow(chersNGD[grep("A01",chersNGD$name),]) #3036 nrow(chersNGD[grep("A02",chersNGD$name),]) #448 nrow(chersNGD[grep("A03",chersNGD$name),]) #528 ###note: I will come back to this later if I have time, but skipping it for now ##Ryan's annotation: try to write wig using the writeWIG function from Starr #library(Starr) #writeWIG(smoothNG[,1],probeAnno,file="2013.09.25.A01_smoothed.wig") #writeWIG(smoothNG[,2],probeAnno,file="2013.09.25.A02_smoothed.wig") #writeWIG(smoothNG[,3],probeAnno,file="2013.09.25.A03_smoothed.wig") ##had to change NAs to 0s in order to load into IGV #find overlapping regions #create ranged data objects #looks like here it's repeating finding chers, but each assay (e.g. A01) is generated separately chersA01 <- findChersOnSmoothed(smoothNG[,1], probeAnno=probeAnno, thresholds=c(y0[1]), distCutOff=600, verbose = FALSE) #described above chersA01 <- relateChers(chersA01, gff) #exports cherList into a file of gff or bed exportCherList(chersA01,filename="2014.03.03.chers_A01_HYPER.bed",format="bed",genome="oyster") #read back in without header bedA01<-read.delim("2014.03.03.chers_A01_HYPER.bed", header=F) #generate a 6th column with '1' in every row -it's a default for strand column bedA01$V6<-1 #this ranged data is some version of a reformatted bed #it's used later for findOverlapping peaks (I wonder why bedtools wouldn't be simpler for this?) #this is required for the later step rangedA01<-BED2RangedData(bedA01) #doing the same thing for assay A02 and A03 chersA02 <- findChersOnSmoothed(smoothNG[,2], probeAnno=probeAnno, thresholds=c(y0[2]), distCutOff=600, verbose = FALSE) chersA02<- relateChers(chersA02, gff) exportCherList(chersA02,filename="2014.03.03.chers_A02_HYPER.bed",format="bed",genome="oyster") bedA02<-read.delim("2014.03.03.chers_A02_HYPER.bed", header=F) bedA02$V6<-1 rangedA02<-BED2RangedData(bedA02) chersA03 <- findChersOnSmoothed(smoothNG[,3], probeAnno=probeAnno, thresholds=c(y0[3]), distCutOff=600, verbose = FALSE) chersA03<- relateChers(chersA03, gff) exportCherList(chersA03,filename="2014.03.03.chers_A03_HYPER.bed",format="bed",genome="oyster") bedA03<-read.delim("2014.03.03.chers_A03_HYPER.bed", header=F) bedA03$V6<-1 rangedA03<-BED2RangedData(bedA03) #generate objects that contain overlapping regions #this requires the 'ranged' data generated above overlapA01.A02<-findOverlappingPeaks(rangedA01,rangedA02) overlapA01.A03<-findOverlappingPeaks(rangedA01,rangedA03) overlapA02.A03<-findOverlappingPeaks(rangedA02,rangedA03) #number of overlapping peaks in each pairwise comparison nrow(overlapA01.A02$OverlappingPeaks) #397 nrow(overlapA01.A03$OverlappingPeaks) #462 nrow(overlapA02.A03$OverlappingPeaks) #353 #This shows how multiple peaks in A02 are covered by 1 peak in A03. Here 346 regions of A02 overlap with A03, but 344 regions of A03 overlap with A02. ############# nrow(rangedA02) #448 A02peaksWithOverlapA03<-overlapA02.A03$Peaks1withOverlaps; nrow(A02peaksWithOverlapA03) #346 A02noOverlapWithA03<-rangedA02[!rownames(rangedA02) %in% rownames(A02peaksWithOverlapA03),]; nrow(A02noOverlapWithA03) #102 nrow(rangedA03) #528 A03peaksWithOverlapA02<-overlapA02.A03$Peaks2withOverlaps; nrow(A03peaksWithOverlapA02) #344 A03noOverlapWithA02<-rangedA03[!rownames(rangedA03) %in% rownames(A03peaksWithOverlapA02),]; nrow(A03noOverlapWithA02) #184 ################ #Ryan's annotation: we now have overlapping and nonoverlapping chers between A02 and A03, what we want now, is to remove those chers that also overlap with A01 from each set A02peaksWithOverlapA01<-overlapA01.A02$Peaks2withOverlaps; nrow(A02peaksWithOverlapA01) #394 A02peaksWithOverlapA03butNotA01<-A02peaksWithOverlapA03[!rownames(A02peaksWithOverlapA03) %in% rownames(A02peaksWithOverlapA01),]; nrow(A02peaksWithOverlapA03butNotA01) #30 A03peaksWithOverlapA01<-overlapA01.A03$Peaks2withOverlaps; nrow(A03peaksWithOverlapA01) #461 A03peaksWithOverlapA02butNotA01<-A03peaksWithOverlapA02[!rownames(A03peaksWithOverlapA02) %in% rownames(A03peaksWithOverlapA01),]; nrow(A03peaksWithOverlapA02butNotA01) #28 #write RangedData to bed #Ryan's annotation: https://stat.ethz.ch/pipermail/bioconductor/2012-April/045050.html write.table(as.data.frame(A02peaksWithOverlapA03butNotA01)[,c(1,2,3)], file="2014.03.03.A02overlapA03_butNotA01_HYPER.bed",sep="\t",quote=F,row.names=F,col.names=F) write.table(as.data.frame(A03peaksWithOverlapA02butNotA01)[,c(1,2,3)], file="2014.03.03.A03overlapA02_butNotA01_HYPER.bed",sep="\t",quote=F,row.names=F,col.names=F) ###################################################################################################################### # Hypomethylation #flip the ratios on these such that we're looking for hypomethylated regions exprs(smoothNG) = -exprs(smoothNG) chersNG <- findChersOnSmoothed(smoothNG, probeAnno=probeAnno, thresholds=c(y0[1],y0[2],y0[3]), distCutOff=600, verbose = FALSE) chersNG <- relateChers(chersNG, gff) chersNGD <- as.data.frame.cherList(chersNG) chersNGD<-chersNGD[order(chersNGD$maxLevel, decreasing=TRUE),] nrow(chersNGD) # --number of chers #3518 #chers detected within each array nrow(chersNGD[grep("A01",chersNGD$name),]) #2758 nrow(chersNGD[grep("A02",chersNGD$name),]) #348 nrow(chersNGD[grep("A03",chersNGD$name),]) #412 #repeating finding chers, but each assay (e.g. A01) is generated separately chersA01 <- findChersOnSmoothed(smoothNG[,1], probeAnno=probeAnno, thresholds=c(y0[1]), distCutOff=600, verbose = FALSE) chersA01 <- relateChers(chersA01, gff) exportCherList(chersA01,filename="2014.03.03.chers_A01_HYPO.bed",format="bed",genome="oyster") bedA01<-read.delim("2014.03.03.chers_A01_HYPO.bed", header=F) bedA01$V6<-1 rangedA01<-BED2RangedData(bedA01) #doing the same thing for assay A02 and A03 chersA02 <- findChersOnSmoothed(smoothNG[,2], probeAnno=probeAnno, thresholds=c(y0[2]), distCutOff=600, verbose = FALSE) chersA02<- relateChers(chersA02, gff) exportCherList(chersA02,filename="2014.03.03.chers_A02_HYPO.bed",format="bed",genome="oyster") bedA02<-read.delim("2014.03.03.chers_A02_HYPO.bed", header=F) bedA02$V6<-1 rangedA02<-BED2RangedData(bedA02) chersA03 <- findChersOnSmoothed(smoothNG[,3], probeAnno=probeAnno, thresholds=c(y0[3]), distCutOff=600, verbose = FALSE) chersA03<- relateChers(chersA03, gff) exportCherList(chersA03,filename="2014.03.03.chers_A03_HYPO.bed",format="bed",genome="oyster") bedA03<-read.delim("2014.03.03.chers_A03_HYPO.bed", header=F) bedA03$V6<-1 rangedA03<-BED2RangedData(bedA03) #generate objects that contain overlapping regions #this requires the 'ranged' data generated above overlapA01.A02<-findOverlappingPeaks(rangedA01,rangedA02) overlapA01.A03<-findOverlappingPeaks(rangedA01,rangedA03) overlapA02.A03<-findOverlappingPeaks(rangedA02,rangedA03) #number of overlapping peaks in each pairwise comparison nrow(overlapA01.A02$OverlappingPeaks) #317 nrow(overlapA01.A03$OverlappingPeaks) #369 nrow(overlapA02.A03$OverlappingPeaks) #269 #This shows how multiple peaks in A03 are covered by 1 peak in A02. Here 255 regions of A02 overlap with A03, but 265 regions of A03 overlap with A02. ############# nrow(rangedA02) #348 A02peaksWithOverlapA03<-overlapA02.A03$Peaks1withOverlaps; nrow(A02peaksWithOverlapA03) #255 A02noOverlapWithA03<-rangedA02[!rownames(rangedA02) %in% rownames(A02peaksWithOverlapA03),]; nrow(A02noOverlapWithA03) #93 nrow(rangedA03) #412 A03peaksWithOverlapA02<-overlapA02.A03$Peaks2withOverlaps; nrow(A03peaksWithOverlapA02) #265 A03noOverlapWithA02<-rangedA03[!rownames(rangedA03) %in% rownames(A03peaksWithOverlapA02),]; nrow(A03noOverlapWithA02) #147 ################ #Ryan's annotation: we now have overlapping and nonoverlapping chers between A02 and A03, what we want now, is to remove those chers that also overlap with A01 from each set A02peaksWithOverlapA01<-overlapA01.A02$Peaks2withOverlaps; nrow(A02peaksWithOverlapA01) #315 A02peaksWithOverlapA03butNotA01<-A02peaksWithOverlapA03[!rownames(A02peaksWithOverlapA03) %in% rownames(A02peaksWithOverlapA01),]; nrow(A02peaksWithOverlapA03butNotA01) #19 A03peaksWithOverlapA01<-overlapA01.A03$Peaks2withOverlaps; nrow(A03peaksWithOverlapA01) #364 A03peaksWithOverlapA02butNotA01<-A03peaksWithOverlapA02[!rownames(A03peaksWithOverlapA02) %in% rownames(A03peaksWithOverlapA01),]; nrow(A03peaksWithOverlapA02butNotA01) #18 #write RangedData to bed #Ryan's annotation: https://stat.ethz.ch/pipermail/bioconductor/2012-April/045050.html write.table(as.data.frame(A02peaksWithOverlapA03butNotA01)[,c(1,2,3)], file="2014.03.03.A02overlapA03_butNotA01_HYPO.bed",sep="\t",quote=F,row.names=F,col.names=F) write.table(as.data.frame(A03peaksWithOverlapA02butNotA01)[,c(1,2,3)], file="2014.03.03.A03overlapA02_butNotA01_HYPO.bed",sep="\t",quote=F,row.names=F,col.names=F)