Methylation Analysis - P.generosa Bismark on Mox

This is a quick and dirty (i.e. no adaptor/quality trimming) assessment of all of our Panopea generosa bisulfite sequencing efforts to date in order to get a rough idea of methylation mapping, per this GitHub issue. Ran Bismark on Mox on a series of subset of the reads:

100k
500k
1M
2M
5M
10M

This was run as both paired end and single end, as the paired end analysis didn’t actually produce a mapping efficiency (but does have alignment percentages), but the single end ended up yielding mapping efficiencies. Single end analysis consisted of just the R1 reads.

SBATCH scripts are linked and pasted below:

Paired end script:


#!/bin/bash
## Job Name
#SBATCH --job-name=bismark
## Allocation Definition
#SBATCH --account=coenv
#SBATCH --partition=coenv
## Resources
## Nodes
#SBATCH --nodes=1
## Walltime (days-hours:minutes:seconds format)
#SBATCH --time=10-00:00:00
## Memory per node
#SBATCH --mem=120G
##turn on e-mail notification
#SBATCH --mail-type=ALL
#SBATCH --mail-user=samwhite@uw.edu
## Specify the working directory for this job
#SBATCH --workdir=/gscratch/scrubbed/samwhite/outputs/20190222_geo_rrbs_bismark

# Load Python Mox module for Python module availability

module load intel-python3_2017

# Document programs in PATH (primarily for program version ID)

date >> system_path.log
echo "" >> system_path.log
echo "System PATH for $SLURM_JOB_ID" >> system_path.log
echo "" >> system_path.log
printf "%0.s-" {1..10} >> system_path.log
echo ${PATH} | tr : \\n >> system_path.log


# Directories and programs
wd=$(pwd)
bismark_dir="/gscratch/srlab/programs/Bismark-0.19.0"
bowtie2_dir="/gscratch/srlab/programs/bowtie2-2.3.4.1-linux-x86_64/"
samtools="/gscratch/srlab/programs/samtools-1.9/samtools"
reads_dir="/gscratch/scrubbed/samwhite/data/P_generosa/BSeq/rrbs/"

## genomes
genome_v070="/gscratch/scrubbed/samwhite/data/P_generosa/Pgenerosa_v070"
genome_v071="/gscratch/scrubbed/samwhite/data/P_generosa/Pgenerosa_v071"
genome_v073="/gscratch/scrubbed/samwhite/data/P_generosa/Pgenerosa_v073"

genome_array=(${genome_v070} ${genome_v071} ${genome_v073})

## An array of subsets of reads to use in bismark:
## 100k, 500k, 1M, 2M, 5M, 10M
subset_array=(100000 500000 1000000 2000000 5000000 10000000)

## Concatenated FastQ Files
R1="/gscratch/scrubbed/samwhite/data/P_generosa/BSeq/rrbs/pgen_bsseq_all_R1.fastq.gz"
R2="/gscratch/scrubbed/samwhite/data/P_generosa/BSeq/rrbs/pgen_bsseq_all_R2.fastq.gz"

## FastQ files lists
R1_list="/gscratch/scrubbed/samwhite/data/P_generosa/BSeq/rrbs/pgen_bsseq_all_R1.list"
R2_list="/gscratch/scrubbed/samwhite/data/P_generosa/BSeq/rrbs/pgen_bsseq_all_R2.list"

# Check for existence of previous concatenation
# If they exist, delete them

for file in ${R1} ${R1_list} ${R2} ${R2_list}
do
  if [ -e ${file} ]; then
    rm ${file}
  fi
done

# Concatenate R1 reads and generate lists of FastQs
for fastq in ${reads_dir}*R1*.gz
do
  echo ${fastq} >> ${R1_list}
  cat ${fastq} >> ${R1}
done

# Concatenate R2 reads and generate lists of FastQs
for fastq in ${reads_dir}*R2*.gz
do
  echo ${fastq} >> ${R2_list}
  cat ${fastq} >> ${R2}
done

# Run bismark using bisulftie-converted genome

for genome in ${genome_array[@]}
do
  for subset in ${subset_array[@]}
  do
    genome_version=$(echo ${genome##*/})
    mkdir subset_${genome_version}_${subset}
    cd subset_${genome_version}_${subset}
    ${bismark_dir}/bismark \
    --path_to_bowtie ${bowtie2_dir} \
    --genome ${genome} \
    --score_min L,0,-0.6 \
    -u ${subset} \
    -p 28 \
    -1 ${R1} \
    -2 ${R2} \
    2> subset_${genome_version}_${subset}_summary.txt

    # Methylation extraction

    ${bismark_dir}/bismark_methylation_extractor \
    --bedgraph \
    --counts \
    --scaffolds \
    --remove_spaces \
    --multicore 28 \
    --buffer_size 75% \
    *.bam

    # Bismark processing report

    ${bismark_dir}/bismark2report

    #Bismark summary report

    ${bismark_dir}/bismark2summary

    # Sort files for methylkit and IGV

    find *.bam \
    | xargs basename -s .bam \
    | xargs -I{} ${samtools} \
    sort \
    --threads 28 \
    {}.bam \
    -o {}.sorted.bam

    # Index sorted files for IGV
    # The "-@ 56" below specifies number of CPU threads to use.

    find *.sorted.bam \
    | xargs basename -s .sorted.bam \
    | xargs -I{} ${samtools} \
    index -@ 28 \
    {}.sorted.bam
    cd ${wd}
  done
done

Single end script:


#!/bin/bash
## Job Name
#SBATCH --job-name=bismark
## Allocation Definition
#SBATCH --account=coenv
#SBATCH --partition=coenv
## Resources
## Nodes
#SBATCH --nodes=1
## Walltime (days-hours:minutes:seconds format)
#SBATCH --time=10-00:00:00
## Memory per node
#SBATCH --mem=120G
##turn on e-mail notification
#SBATCH --mail-type=ALL
#SBATCH --mail-user=samwhite@uw.edu
## Specify the working directory for this job
#SBATCH --workdir=/gscratch/scrubbed/samwhite/outputs/20190224_pgenerosa_rrbs_se_bismark

# Load Python Mox module for Python module availability

module load intel-python3_2017

# Document programs in PATH (primarily for program version ID)

date >> system_path.log
echo "" >> system_path.log
echo "System PATH for $SLURM_JOB_ID" >> system_path.log
echo "" >> system_path.log
printf "%0.s-" {1..10} >> system_path.log
echo ${PATH} | tr : \\n >> system_path.log


# Directories and programs
wd=$(pwd)
bismark_dir="/gscratch/srlab/programs/Bismark-0.19.0"
bowtie2_dir="/gscratch/srlab/programs/bowtie2-2.3.4.1-linux-x86_64/"
samtools="/gscratch/srlab/programs/samtools-1.9/samtools"
reads_dir="/gscratch/scrubbed/samwhite/data/P_generosa/BSeq/rrbs/"

## genomes
genome_v070="/gscratch/scrubbed/samwhite/data/P_generosa/Pgenerosa_v070"
genome_v071="/gscratch/scrubbed/samwhite/data/P_generosa/Pgenerosa_v071"
genome_v073="/gscratch/scrubbed/samwhite/data/P_generosa/Pgenerosa_v073"

genome_array=(${genome_v070} ${genome_v071} ${genome_v073})

## An array of subsets of reads to use in bismark:
## 100k, 500k, 1M, 2M, 5M, 10M
subset_array=(100000 500000 1000000 2000000 5000000 10000000)

## Concatenated FastQ Files
se_reads="/gscratch/scrubbed/samwhite/data/P_generosa/BSeq/rrbs/pgen_bsseq_all_R1.fastq.gz"

## FastQ files lists
R1_list="${wd}/pgen_bsseq_all_R1.list"

# Check for existence of previous concatenation
# If they exist, delete them

for file in ${se_reads} ${R1_list}
do
  if [ -e ${file} ]; then
    rm ${file}
  fi
done

# Concatenate R1 reads and generate lists of FastQs
for fastq in ${reads_dir}*R1*.gz
do
  echo ${fastq} >> ${R1_list}
  cat ${fastq} >> ${se_reads}
done

# Run bismark using bisulftie-converted genome
for genome in ${genome_array[@]}
do
  for subset in ${subset_array[@]}
  do
    genome_version=$(echo ${genome##*/})
    mkdir subset_${genome_version}_${subset}
    cd subset_${genome_version}_${subset}
    ${bismark_dir}/bismark \
    --path_to_bowtie ${bowtie2_dir} \
    --genome ${genome} \
    --score_min L,0,-0.6 \
    -u ${subset} \
    -p 28 \
    ${se_reads} \
    2> subset_${subset}_summary.txt

    # Deduplicate bam files
    ${bismark_dir}/deduplicate_bismark \
    --bam \
    --single \
    *.bam

    # Methylation extraction

    ${bismark_dir}/bismark_methylation_extractor \
    --bedgraph \
    --counts \
    --scaffolds \
    --remove_spaces \
    --multicore 28 \
    --buffer_size 75% \
    *deduplicated.bam

    # Bismark processing report

    ${bismark_dir}/bismark2report

    #Bismark summary report

    ${bismark_dir}/bismark2summary

    # Sort files for methylkit and IGV

    find *deduplicated.bam \
    | xargs basename -s .bam \
    | xargs -I{} ${samtools} \
    sort \
    --threads 28 \
    {}.bam \
    -o {}.sorted.bam

    # Index sorted files for IGV
    # The "-@ 56" below specifies number of CPU threads to use.

    find *.sorted.bam \
    | xargs basename -s .sorted.bam \
    | xargs -I{} ${samtools} \
    index -@ 28 \
    {}.sorted.bam
    cd ${wd}
  done
done

RESULTS

Output folders:

20190224_pgenerosa_rrbs_se_bismark/

Bedgraphs for each subset:

There is a bedgraph for each genome and each subset. I will not link them all. Here’re examples for each genome with the 10,000,000 subset:

Code used to pull mapping efficiencies:

find ./20190224_pgenerosa* -name "subset*.txt" -print0 | xargs -0 grep "Mapping efficiency" | sort -h
./20190224_pgenerosa_rrbs_se_bismark/subset_Pgenerosa_v070_10000000/subset_10000000_summary.txt:Mapping efficiency:	56.2%
./20190224_pgenerosa_rrbs_se_bismark/subset_Pgenerosa_v070_1000000/subset_1000000_summary.txt:Mapping efficiency:	56.1%
./20190224_pgenerosa_rrbs_se_bismark/subset_Pgenerosa_v070_100000/subset_100000_summary.txt:Mapping efficiency:	56.3%
./20190224_pgenerosa_rrbs_se_bismark/subset_Pgenerosa_v070_2000000/subset_2000000_summary.txt:Mapping efficiency:	56.2%
./20190224_pgenerosa_rrbs_se_bismark/subset_Pgenerosa_v070_5000000/subset_5000000_summary.txt:Mapping efficiency:	56.2%
./20190224_pgenerosa_rrbs_se_bismark/subset_Pgenerosa_v070_500000/subset_500000_summary.txt:Mapping efficiency:	56.1%
./20190224_pgenerosa_rrbs_se_bismark/subset_Pgenerosa_v071_10000000/subset_10000000_summary.txt:Mapping efficiency:	53.9%
./20190224_pgenerosa_rrbs_se_bismark/subset_Pgenerosa_v071_1000000/subset_1000000_summary.txt:Mapping efficiency:	53.9%
./20190224_pgenerosa_rrbs_se_bismark/subset_Pgenerosa_v071_100000/subset_100000_summary.txt:Mapping efficiency:	54.1%
./20190224_pgenerosa_rrbs_se_bismark/subset_Pgenerosa_v071_2000000/subset_2000000_summary.txt:Mapping efficiency:	53.9%
./20190224_pgenerosa_rrbs_se_bismark/subset_Pgenerosa_v071_5000000/subset_5000000_summary.txt:Mapping efficiency:	54.0%
./20190224_pgenerosa_rrbs_se_bismark/subset_Pgenerosa_v071_500000/subset_500000_summary.txt:Mapping efficiency:	53.8%
./20190224_pgenerosa_rrbs_se_bismark/subset_Pgenerosa_v073_10000000/subset_10000000_summary.txt:Mapping efficiency:	49.7%
./20190224_pgenerosa_rrbs_se_bismark/subset_Pgenerosa_v073_1000000/subset_1000000_summary.txt:Mapping efficiency:	49.7%
./20190224_pgenerosa_rrbs_se_bismark/subset_Pgenerosa_v073_100000/subset_100000_summary.txt:Mapping efficiency:	49.8%
./20190224_pgenerosa_rrbs_se_bismark/subset_Pgenerosa_v073_2000000/subset_2000000_summary.txt:Mapping efficiency:	49.7%
./20190224_pgenerosa_rrbs_se_bismark/subset_Pgenerosa_v073_5000000/subset_5000000_summary.txt:Mapping efficiency:	49.8%
./20190224_pgenerosa_rrbs_se_bismark/subset_Pgenerosa_v073_500000/subset_500000_summary.txt:Mapping efficiency:	49.7%

Genome	Reads Subset	Mapping Efficiency (%)
v070	100000	56.3
v070	500000	56.1
v070	1000000	56.1
v070	2000000	56.2
v070	5000000	56.2
v070	10000000	56.2
v071	100000	54.1
v071	500000	53.8
v071	1000000	53.9
v071	2000000	53.9
v071	5000000	54.0
v071	10000000	53.9
v073	100000	49.8
v073	500000	49.7
v073	1000000	49.7
v073	2000000	49.7
v073	5000000	49.8
v073	10000000	49.7

Genome	Read Type (PE or SE)	Reads Subset	C methylated in CpG context (%)
v070	pe	100000	20.8
v070	pe	500000	21.0
v070	pe	1000000	21.0
v070	pe	2000000	21.0
v070	pe	5000000	21.0
v070	pe	10000000	21.0
v071	pe	100000	22.2
v071	pe	500000	22.3
v071	pe	1000000	22.3
v071	pe	2000000	22.3
v071	pe	5000000	22.4
v071	pe	10000000	22.4
v073	pe	100000	22.9
v073	pe	500000	23.0
v073	pe	1000000	23.0
v073	pe	2000000	23.0
v073	pe	5000000	23.1
v073	pe	10000000	23.1
v070	se	100000	21.5
v070	se	500000	21.7
v070	se	2000000	21.7
v070	se	5000000	21.7
v071	se	100000	22.6
v071	se	500000	22.8
v071	se	1000000	22.8
v071	se	2000000	22.8
v071	se	5000000	22.8
v071	se	10000000	22.8
v073	se	100000	23.3
v073	se	500000	23.3
v073	se	1000000	23.4
v073	se	2000000	23.4
v073	se	5000000	23.4
v073	se	10000000	23.4