{ "metadata": { "name": "PhageNGS" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "#make Anaplasma BLAST database\n", "#check source fasta file for number of sequences\n", "#by counting the number of greater-than symbols ('>') in the file, since each\n", "#sequence entry is preceded by a '>'.\n", "!awk '/>/ { count++ } END { print count }' /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/dbs/AnaplasmaGBnt20140305.fasta" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "12065\r\n" ] } ], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "#make the Anaplasma BLAST database\n", "#put the \"time\" command at the beginning for fun\n", "#the \"time\" command is NOT part of the BLAST package, but is already built into the Terminal\n", "!time makeblastdb -dbtype nucl -in /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/dbs/AnaplasmaGBnt20140305.fasta -out /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/dbs/AnaplasmaGBnt20140305" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "\r\n", "Building a new DB, current time: 03/12/2014 14:49:21\r\n", "New DB name: /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/dbs/AnaplasmaGBnt20140305\r\n", "New DB title: /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/dbs/AnaplasmaGBnt20140305.fasta\r\n", "Sequence type: Nucleotide\r\n", "Keep Linkouts: T\r\n", "Keep MBits: T\r\n", "Maximum file size: 1000000000B\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: No residues given\r\n", "Ignoring sequence 'lcl|43' as it has no sequence data\r\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: No residues given\r\n", "Ignoring sequence 'lcl|44' as it has no sequence data\r\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: 
No residues given\r\n", "Ignoring sequence 'lcl|45' as it has no sequence data\r\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: No residues given\r\n", "Ignoring sequence 'lcl|46' as it has no sequence data\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: No residues given\r\n", "Ignoring sequence 'lcl|587' as it has no sequence data\r\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: No residues given\r\n", "Ignoring sequence 'lcl|588' as it has no sequence data\r\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: No residues given\r\n", "Ignoring sequence 'lcl|589' as it has no sequence data\r\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: No residues given\r\n", "Ignoring sequence 'lcl|590' as it has no sequence data\r\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: No residues given\r\n", "Ignoring sequence 'lcl|591' as it has no sequence data\r\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: No residues given\r\n", "Ignoring sequence 'lcl|592' as it has no sequence data\r\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: No residues given\r\n", "Ignoring sequence 'lcl|593' as it has no sequence data\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: No residues given\r\n", "Ignoring sequence 'lcl|10021' as it has no sequence data\r\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: No residues given\r\n", "Ignoring sequence 'lcl|10022' as it has no sequence data\r\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: No residues given\r\n", "Ignoring sequence 'lcl|10023' as it has no sequence data\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: No residues given\r\n", "Ignoring sequence 'lcl|10430' as it has no sequence data\r\n", "Error: (1431.1) FASTA-Reader: Warning: 
FASTA-Reader: No residues given\r\n", "Ignoring sequence 'lcl|10431' as it has no sequence data\r\n", "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: No residues given\r\n", "Ignoring sequence 'lcl|10432' as it has no sequence data\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Adding sequences from FASTA; added 12048 sequences in 2.09418 seconds.\r\n", "2.08user 0.02system 0:02.19elapsed 96%CPU (0avgtext+0avgdata 15324maxresident)k\r\n", "31704inputs+32024outputs (156major+5517minor)pagefaults 0swaps\r\n" ] } ], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "#check cowdria fasta file numbers\n", "!awk '/>/ { count++ } END { print count }' /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/dbs/CowdriaGBnt20140305.fasta" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "2289\r\n" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "#make cowdria BLAST database\n", "!time makeblastdb -dbtype nucl -in /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/dbs/CowdriaGBnt20140305.fasta -out /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/dbs/CowdriaGBnt20140305" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "\r\n", "Building a new DB, current time: 03/12/2014 14:53:33\r\n", "New DB name: /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/dbs/CowdriaGBnt20140305\r\n", "New DB title: /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/dbs/CowdriaGBnt20140305.fasta\r\n", "Sequence type: Nucleotide\r\n", "Keep Linkouts: T\r\n", "Keep MBits: T\r\n", "Maximum file size: 1000000000B\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: No residues given\r\n", "Ignoring sequence 'lcl|116' as it has no sequence data\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Error: (1431.1) 
FASTA-Reader: Warning: FASTA-Reader: No residues given\r\n", "Ignoring sequence 'lcl|421' as it has no sequence data\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Adding sequences from FASTA; added 2287 sequences in 0.763971 seconds.\r\n", "0.76user 0.01system 0:00.77elapsed 99%CPU (0avgtext+0avgdata 12408maxresident)k\r\n", "0inputs+13152outputs (0major+3981minor)pagefaults 0swaps\r\n" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "#check ehrlichia fasta file numbers\n", "!awk '/>/ { count++ } END { print count }' /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/dbs/EhrlichiaGBnt20140305.fasta" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "3020\r\n" ] } ], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "#make ehrlichia BLAST database\n", "!time makeblastdb -dbtype nucl -in /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/dbs/EhrlichiaGBnt20140305.fasta -out /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/dbs/EhrlichiaGBnt20140305" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\r\n", "\r\n", "Building a new DB, current time: 03/12/2014 14:56:01\r\n", "New DB name: /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/dbs/EhrlichiaGBnt20140305\r\n", "New DB title: /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/dbs/EhrlichiaGBnt20140305.fasta\r\n", "Sequence type: Nucleotide\r\n", "Keep Linkouts: T\r\n", "Keep MBits: T\r\n", "Maximum file size: 1000000000B\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: No residues given\r\n", "Ignoring sequence 'lcl|203' as it has no sequence data\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Error: (1431.1) FASTA-Reader: Warning: FASTA-Reader: No residues given\r\n", "Ignoring sequence 'lcl|206' as it has no sequence 
data\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Adding sequences from FASTA; added 3018 sequences in 1.93499 seconds.\r\n", "1.92user 0.02system 0:01.94elapsed 100%CPU (0avgtext+0avgdata 19968maxresident)k\r\n", "0inputs+35296outputs (0major+10541minor)pagefaults 0swaps\r\n" ] } ], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "#perform BLASTN of de novo assembly of all abalone seqs\n", "#against anaplasma BLAST database\n", "!time blastn -db -task blastn -query /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/queries/AllAbDenovo7118contigs.fa -outfmt \"6 stitle std\" -max_target_seqs 3 -num_threads 16 -out /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/outputs/AllAbDenovo7118contigsAnaplasmaGBntBLASTN.txt" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "USAGE\r\n", " blastn [-h] [-help] [-import_search_strategy filename]\r\n", " [-export_search_strategy filename] [-task task_name] [-db database_name]\r\n", " [-dbsize num_letters] [-gilist filename] [-seqidlist filename]\r\n", " [-negative_gilist filename] [-entrez_query entrez_query]\r\n", " [-db_soft_mask filtering_algorithm] [-db_hard_mask filtering_algorithm]\r\n", " [-subject subject_input_file] [-subject_loc range] [-query input_file]\r\n", " [-out output_file] [-evalue evalue] [-word_size int_value]\r\n", " [-gapopen open_penalty] [-gapextend extend_penalty]\r\n", " [-perc_identity float_value] [-xdrop_ungap float_value]\r\n", " [-xdrop_gap float_value] [-xdrop_gap_final float_value]\r\n", " [-searchsp int_value] [-max_hsps int_value] [-sum_statistics]\r\n", " [-penalty penalty] [-reward reward] [-no_greedy]\r\n", " [-min_raw_gapped_score int_value] [-template_type type]\r\n", " [-template_length int_value] [-dust DUST_options]\r\n", " [-filtering_db filtering_database]\r\n", " [-window_masker_taxid window_masker_taxid]\r\n", " [-window_masker_db window_masker_db] [-soft_masking 
soft_masking]\r\n", " [-ungapped] [-culling_limit int_value] [-best_hit_overhang float_value]\r\n", " [-best_hit_score_edge float_value] [-window_size int_value]\r\n", " [-off_diagonal_range int_value] [-use_index boolean] [-index_name string]\r\n", " [-lcase_masking] [-query_loc range] [-strand strand] [-parse_deflines]\r\n", " [-outfmt format] [-show_gis] [-num_descriptions int_value]\r\n", " [-num_alignments int_value] [-html] [-max_target_seqs num_sequences]\r\n", " [-num_threads int_value] [-remote] [-version]\r\n", "\r\n", "DESCRIPTION\r\n", " Nucleotide-Nucleotide BLAST 2.2.29+\r\n", "\r\n", "Use '-help' to print detailed descriptions of command line arguments\r\n", "========================================================================\r\n", "\r\n", "Error: Too many positional arguments (1), the offending value: blastn\r\n", "Command exited with non-zero status 1\r\n", "0.01user 0.01system 0:00.13elapsed 21%CPU (0avgtext+0avgdata 11124maxresident)k\r\n", "43552inputs+0outputs (219major+2645minor)pagefaults 0swaps\r\n" ] } ], "prompt_number": 8 }, { "cell_type": "raw", "metadata": {}, "source": [ "All BLASTs will be performed in the same fashion with the following options utilized:\n", "-task blastn: Tells BLAST to use BLASTn instead of the default MegaBLAST\n", "-outfmt \"6 stitle std\": Specifies the output format number 6 (which is a tab-delimited output file) and indicates that the output should add the subject title (stitle) in addition to the standard (std) BLASTn output columns.\n", "-max_target_seqs 3: Tells BLAST to match a maximum of 3 database sequences per query sequence. The default is 500!\n", "-num_threads 16: Specifies the number of processing threads to use. I just multiplied the number of CPUs I have listed in my resource monitor by four, since the processor is listed as a quad core. 
" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#forgot to specify BLAST db\n", "!time blastn -db /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/dbs/AnaplasmaGBnt20140305 -task blastn -query /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/queries/AllAbDenovo7118contigs.fa -outfmt \"6 stitle std\" -max_target_seqs 3 -num_threads 16 -out /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/outputs/AllAbDenovo7118contigsAnaplasmaGBntBLASTN.txt" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "90.64user 0.86system 0:29.70elapsed 308%CPU (0avgtext+0avgdata 67624maxresident)k\r\n", "6224inputs+7688outputs (17major+371373minor)pagefaults 0swaps\r\n" ] } ], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "#perform BLASTN of de novo assembly of all abalone seqs\n", "#against cowdria BLAST database\n", "!time blastn -db /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/dbs/CowdriaGBnt20140305 -task blastn -query /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/queries/AllAbDenovo7118contigs.fa -outfmt \"6 stitle std\" -max_target_seqs 3 -num_threads 16 -out /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/outputs/AllAbDenovo7118contigsCowdriaGBntBLASTN.txt" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "100.50user 2.37system 0:36.14elapsed 284%CPU (0avgtext+0avgdata 62192maxresident)k\r\n", "0inputs+16328outputs (0major+415944minor)pagefaults 0swaps\r\n" ] } ], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "#perform BLASTN of de novo assembly of all abalone seqs\n", "#against ehrlichia BLAST database\n", "!time blastn -db /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/dbs/EhrlichiaGBnt20140305 -task blastn -query /home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/queries/AllAbDenovo7118contigs.fa -outfmt \"6 stitle std\" -max_target_seqs 3 -num_threads 16 -out 
/home/samb/BioinformaticsTools/ncbi-blast-2.2.29+/outputs/AllAbDenovo7118contigsEhrlichiaGBntBLASTN.txt" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "136.71user 1.80system 0:43.49elapsed 318%CPU (0avgtext+0avgdata 72420maxresident)k\r\n", "0inputs+12312outputs (0major+587148minor)pagefaults 0swaps\r\n" ] } ], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }