#Shows the current working directory. The FASTQ and barcode files used below
#are given as bare file names (no directory part).
pwd
This command uses bash commands to count the number of lines in the FASTQ file (`wc -l`) and divides the total number of lines by 4 (there are 4 lines per read in Illumina FASTQ files).
The `echo` command is used to print the result to the screen, which gets stored in the variable `TotalSeqs`.
TotalSeqs = !echo $((`wc -l < 2112_lane1_NoIndex_L001_R1_001.fastq` / 4))
#Prints the value stored in TotalSeqs.
#Notice that this is a Python string list and is not an integer!
print TotalSeqs
#Converts the value in the TotalSeqs string list at index 0 (TotalSeqs[0]) to
#an integer value of base 10.
#This conversion will be used repeatedly throughout this notebook to allow
#mathematical calculations using the numbers generated by bash commands.
TotalSeqs = int(TotalSeqs[0], 10)
print TotalSeqs
Use `grep` and `wc -l` to count all the instances of each of the full-length TruSeq adaptor/index sequences.

The index sequence is indicated in each of the respective variable names.
Additionally, the Epigentek barcode number is indicated in the variable names (e.g. BC1 = barcode 1).
BC1_ATCACG_full = !grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGC' 2112_lane1_NoIndex_L001_R1_001.fastq \
| wc -l
#Converts the value in the BC1_ATCACG_full string list at index 0 (BC1_ATCACG_full[0]) to
#an integer value of base 10.
BC1_ATCACG_full = int(BC1_ATCACG_full[0] ,10)
print BC1_ATCACG_full
BC3_TTAGGC_full = !grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGC' 2112_lane1_NoIndex_L001_R1_001.fastq \
| wc -l
#Converts the value in the BC3_TTAGGC_full string list at index 0 (BC3_TTAGGC_full[0]) to
#an integer value of base 10.
BC3_TTAGGC_full = int(BC3_TTAGGC_full[0], 10)
print BC3_TTAGGC_full
BC4_TGACCA_full = !grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGC' 2112_lane1_NoIndex_L001_R1_001.fastq \
| wc -l
#Converts the value in the BC4_TGACCA_full string list at index 0 (BC4_TGACCA_full[0]) to
#an integer value of base 10.
BC4_TGACCA_full = int(BC4_TGACCA_full[0], 10)
print BC4_TGACCA_full
BC5_ACAGTG_full = !grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGC' 2112_lane1_NoIndex_L001_R1_001.fastq \
| wc -l
#Converts the value in the BC5_ACAGTG_full string list at index 0 (BC5_ACAGTG_full[0]) to
#an integer value of base 10.
BC5_ACAGTG_full = int(BC5_ACAGTG_full[0], 10)
print BC5_ACAGTG_full
BC6_GCCAAT_full = !grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGC' 2112_lane1_NoIndex_L001_R1_001.fastq \
| wc -l
#Converts the value in the BC6_GCCAAT_full string list at index 0 (BC6_GCCAAT_full[0]) to
#an integer value of base 10.
BC6_GCCAAT_full = int(BC6_GCCAAT_full[0], 10)
print BC6_GCCAAT_full
BC7_CAGATC_full = !grep -o 'GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGC' 2112_lane1_NoIndex_L001_R1_001.fastq \
| wc -l
#Converts the value in the BC7_CAGATC_full string list at index 0 (BC7_CAGATC_full[0]) to
#an integer value of base 10.
BC7_CAGATC_full = int(BC7_CAGATC_full[0], 10)
print BC7_CAGATC_full
#Adds all of the counts from each full-length Illumina adaptor/index sequence.
#Saves to variable "sum_full".
sum_full = BC1_ATCACG_full + BC3_TTAGGC_full + BC4_TGACCA_full + BC5_ACAGTG_full + BC6_GCCAAT_full + BC7_CAGATC_full
print sum_full
#Calculates percentage of reads having full-lenght Illumina adaptor/index sequences.
#Uses "float" to convert integer values to floating point decimals. Necessary since
#the calculation on integers would be < 1 & would result in an answer of '0'.
print ((float(sum_full)/TotalSeqs)*100)
Use `grep` and `wc -l` to count all the instances of each of the TruSeq index sequences.

The index sequence is indicated in each of the respective variable names.
Additionally, the Epigentek barcode number is indicated in the variable names (e.g. BC1 = barcode 1).
BC1_ATCACG = !grep -o 'ATCACG' 2112_lane1_NoIndex_L001_R1_001.fastq \
| wc -l
#Converts the value in the BC1_ATCACG string list at index 0 (BC1_ATCACG[0]) to
#an integer value of base 10.
BC1_ATCACG = int(BC1_ATCACG[0] ,10)
print BC1_ATCACG
BC3_TTAGGC = !grep -o 'TTAGGC' 2112_lane1_NoIndex_L001_R1_001.fastq \
| wc -l
#Converts the value in the BC3_TTAGGC string list at index 0 (BC3_TTAGGC[0]) to
#an integer value of base 10.
BC3_TTAGGC = int(BC3_TTAGGC[0] ,10)
print BC3_TTAGGC
BC4_TGACCA = !grep -o 'TGACCA' 2112_lane1_NoIndex_L001_R1_001.fastq \
| wc -l
#Converts the value in the BC4_TGACCA string list at index 0 (BC4_TGACCA[0]) to
#an integer value of base 10.
BC4_TGACCA = int(BC4_TGACCA[0] ,10)
print BC4_TGACCA
BC5_ACAGTG = !grep -o 'ACAGTG' 2112_lane1_NoIndex_L001_R1_001.fastq \
| wc -l
#Converts the value in the BC5_ACAGTG string list at index 0 (BC5_ACAGTG[0]) to
#an integer value of base 10.
BC5_ACAGTG = int(BC5_ACAGTG[0] ,10)
print BC5_ACAGTG
BC6_GCCAAT = !grep -o 'GCCAAT' 2112_lane1_NoIndex_L001_R1_001.fastq \
| wc -l
#Converts the value in the BC6_GCCAAT string list at index 0 (BC6_GCCAAT[0]) to
#an integer value of base 10.
BC6_GCCAAT = int(BC6_GCCAAT[0] ,10)
print BC6_GCCAAT
BC7_CAGATC = !grep -o 'CAGATC' 2112_lane1_NoIndex_L001_R1_001.fastq \
| wc -l
#Converts the value in the BC7_CAGATC string list at index 0 (BC7_CAGATC[0]) to
#an integer value of base 10.
BC7_CAGATC = int(BC7_CAGATC[0] ,10)
print BC7_CAGATC
#Adds all of the counts from each Illumina adaptor/index sequence.
#Saves to variable "sum_short".
sum_short = BC1_ATCACG + BC3_TTAGGC + BC4_TGACCA + BC5_ACAGTG + BC6_GCCAAT + BC7_CAGATC
print sum_short
#Calculates percentage of reads having full-lenght Illumina adaptor/index sequences.
#Uses "float" to convert integer values to floating point decimals. Necessary since
#the calculation on integers would be < 1 & would result in an answer of '0'.
print ((float(sum_short)/TotalSeqs)*100)
Use `fastx_barcode_splitter` to identify full-length TruSeq adaptor/index sequences.

#The full-length barcode file used by fastx_barcode_splitter.
!head TruSeqBarcodesLong.txt
#Gunzip the gzipped FASTQ file.
#Pipe the output of that to fastx_barcode_splitter.pl
#fastx_barcode_splitter uses a default mismatch value = 1
#Specify barcode file (--bcfile TruSeqBarcodesLong.txt)
#Specify to look for barcode at beginning of file (--bol)
#Specify output location and append a prefix to new file name (--prefix ./bol_)
#Specify new file name suffix (--suffix ".fastq")
!gunzip -c 2112_lane1_NoIndex_L001_R1_001.fastq.gz | \
fastx_barcode_splitter.pl \
--bcfile TruSeqBarcodesLong.txt \
--bol \
--prefix ./bol_long_ \
--suffix ".fastq" | \
tee bol_long_stats.txt
#Gunzip the gzipped FASTQ file.
#Pipe the output of that to fastx_barcode_splitter.pl
#fastx_barcode_splitter uses a default mismatch value = 1
#Specify barcode file (--bcfile TruSeqBarcodesLong.txt)
#Specify to look for barcode at beginning of file (--eol)
#Specify output location and append a prefix to new file name (--prefix ./eol_)
#Specify new file name suffix (--suffix ".fastq")
!gunzip -c 2112_lane1_NoIndex_L001_R1_001.fastq.gz | \
fastx_barcode_splitter.pl \
--bcfile TruSeqBarcodes.txt \
--eol \
--prefix ./eol_long_ \
--suffix ".fastq" | \
tee eol_long_stats.txt
Use `fastx_barcode_splitter` to identify TruSeq index sequences.

#The short (index sequence) barcode file used by fastx_barcode_splitter.
!head TruSeqBarcodesShort.txt
#Gunzip the gzipped FASTQ file.
#Pipe the output of that to fastx_barcode_splitter.pl
#fastx_barcode_splitter uses a default mismatch value = 1
#Specify barcode file (--bcfile TruSeqBarcodesShort.txt)
#Specify to look for barcode at beginning of file (--bol)
#Specify output location and append a prefix to new file name (--prefix ./bol_)
#Specify new file name suffix (--suffix ".fastq")
!gunzip -c 2112_lane1_NoIndex_L001_R1_001.fastq.gz | \
fastx_barcode_splitter.pl \
--bcfile TruSeqBarcodesShort.txt \
--bol \
--prefix ./bol_ \
--suffix ".fastq" | \
tee bol_short_stats.txt
#Gunzip the gzipped FASTQ file.
#Pipe the output of that to fastx_barcode_splitter.pl
#fastx_barcode_splitter uses a default mismatch value = 1
#Specify barcode file (--bcfile TruSeqBarcodesShort.txt)
#Specify to look for barcode at beginning of file (--eol)
#Specify output location and append a prefix to new file name (--prefix ./eol_)
#Specify new file name suffix (--suffix ".fastq")
!gunzip -c 2112_lane1_NoIndex_L001_R1_001.fastq.gz | \
fastx_barcode_splitter.pl \
--bcfile TruSeqBarcodesShort.txt \
--eol \
--prefix ./eol_ \
--suffix ".fastq" | \
tee eol_short_stats.txt