Package Bio :: Package Sequencing :: Package Applications :: Module _bwa
[hide private]
[frames] | no frames]

Source Code for Module Bio.Sequencing.Applications._bwa

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """Command line wrapper for bwa 
  7  """ 
  8   
  9  from __future__ import print_function 
 10  from Bio._py3k import basestring 
 11   
 12  __docformat__ = "epytext en" 
 13   
 14  from Bio.Application import _Option, _Argument, _Switch, AbstractCommandline 
 15  from Bio.Application import _StaticArgument 
 16   
 17   
18 -class BwaIndexCommandline(AbstractCommandline):
19 """Command line wrapper for Burrows Wheeler Aligner (BWA) index. 20 21 Index database sequences in the FASTA format, equivalent to:: 22 23 $ bwa index [-p prefix] [-a algoType] [-c] <in.db.fasta> 24 25 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 26 27 Example: 28 29 >>> from Bio.Sequencing.Applications import BwaIndexCommandline 30 >>> reference_genome = "/path/to/reference_genome.fasta" 31 >>> index_cmd = BwaIndexCommandline(infile=reference_genome, algorithm="bwtsw") 32 >>> print(index_cmd) 33 bwa index -a bwtsw /path/to/reference_genome.fasta 34 35 You would typically run the command using index_cmd() or via the 36 Python subprocess module, as described in the Biopython tutorial. 37 38 """
39 - def __init__(self, cmd="bwa", **kwargs):
40 self.program_name = cmd 41 self.parameters = \ 42 [ 43 _StaticArgument("index"), 44 _Option(["-a", "a", "algorithm"], 45 """Algorithm for constructing BWT index. 46 47 Available options are: 48 - is: IS linear-time algorithm for constructing suffix array. 49 It requires 5.37N memory where N is the size of the database. 50 IS is moderately fast, but does not work with database larger 51 than 2GB. IS is the default algorithm due to its simplicity. 52 - bwtsw: Algorithm implemented in BWT-SW. This method works with the 53 whole human genome, but it does not work with database 54 smaller than 10MB and it is usually slower than IS.""", 55 checker_function=lambda x: x in ["is", "bwtsw"], 56 equate=False, is_required=True), 57 _Option(["-p", "p", "prefix"], 58 "Prefix of the output database [same as db filename]", 59 equate=False, is_required=False), 60 _Argument(["infile"], "Input file name", filename=True, is_required=True), 61 _Switch(["-c", "c"], 62 "Build color-space index. The input fasta should be in nucleotide space.") 63 ] 64 AbstractCommandline.__init__(self, cmd, **kwargs)
65 66
67 -class BwaAlignCommandline(AbstractCommandline):
68 """Command line wrapper for Burrows Wheeler Aligner (BWA) aln. 69 70 Run a BWA alignment, equivalent to:: 71 72 $ bwa aln [...] <in.db.fasta> <in.query.fq> > <out.sai> 73 74 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 75 76 Example: 77 78 >>> from Bio.Sequencing.Applications import BwaAlignCommandline 79 >>> reference_genome = "/path/to/reference_genome.fasta" 80 >>> read_file = "/path/to/read_1.fq" 81 >>> output_sai_file = "/path/to/read_1.sai" 82 >>> read_group="@RG\tID:foo\tSM:bar" 83 >>> align_cmd = BwaAlignCommandline(reference=reference_genome, read_file=read_file) 84 >>> print(align_cmd) 85 bwa aln /path/to/reference_genome.fasta /path/to/read_1.fq 86 87 You would typically run the command line using align_cmd(stdout=output_sai_file) 88 or via the Python subprocess module, as described in the Biopython tutorial. 89 """
90 - def __init__(self, cmd="bwa", **kwargs):
91 self.program_name = cmd 92 self.parameters = \ 93 [ 94 _StaticArgument("aln"), 95 _Argument(["reference"], "Reference file name", 96 filename=True, is_required=True), 97 _Argument(["read_file"], "Read file name", 98 filename=True, is_required=True), 99 _Option(["-n", "n"], 100 "Maximum edit distance if the value is INT, or the fraction of missing alignments given 2% uniform base error rate if FLOAT. In the latter case, the maximum edit distance is automatically chosen for different read lengths. [0.04]", 101 checker_function=lambda x: isinstance(x, (int, float)), 102 equate=False), 103 _Option(["-o", "o"], 104 "Maximum edit distance if the value is INT, or the fraction of missing alignments given 2% uniform base error rate if FLOAT. In the latter case, the maximum edit distance is automatically chosen for different read lengths. [0.04]", 105 checker_function=lambda x: isinstance(x, (int, float)), 106 equate=False), 107 _Option(["-e", "e"], 108 "Maximum number of gap extensions, -1 for k-difference mode (disallowing long gaps) [-1]", 109 checker_function=lambda x: isinstance(x, int), 110 equate=False), 111 _Option(["-d", "d"], 112 "Disallow a long deletion within INT bp towards the 3-end [16]", 113 checker_function=lambda x: isinstance(x, int), 114 equate=False), 115 _Option(["-i", "i"], 116 "Disallow an indel within INT bp towards the ends [5]", 117 checker_function=lambda x: isinstance(x, int), 118 equate=False), 119 _Option(["-l", "l"], 120 """Take the first INT subsequence as seed. 121 122 If INT is larger than the query sequence, seeding will be disabled. 123 For long reads, this option is typically ranged from 25 to 35 for 124 -k 2. [inf]""", 125 checker_function=lambda x: isinstance(x, int), 126 equate=False), 127 _Option(["-k", "k"], "Maximum edit distance in the seed [2]", 128 checker_function=lambda x: isinstance(x, int), 129 equate=False), 130 _Option(["-t", "t"], "Number of threads (multi-threading mode) [1]", 131 checker_function=lambda x: isinstance(x, int), 132 equate=False), 133 _Option(["-M", "M"], 134 "Mismatch penalty. BWA will not search for suboptimal hits with a score lower than (bestScore-misMsc). [3]", 135 checker_function=lambda x: isinstance(x, int), 136 equate=False), 137 _Option(["-O", "O"], "Gap open penalty [11]", 138 checker_function=lambda x: isinstance(x, int), 139 equate=False), 140 _Option(["-E", "E"], "Gap extension penalty [4]", 141 checker_function=lambda x: isinstance(x, int), 142 equate=False), 143 _Option(["-R", "R"], 144 """Proceed with suboptimal alignments if there are no more than INT equally best hits. 145 146 This option only affects paired-end mapping. Increasing this threshold helps 147 to improve the pairing accuracy at the cost of speed, especially for short 148 reads (~32bp).""", 149 checker_function=lambda x: isinstance(x, int), 150 equate=False), 151 _Option(["-q", "q"], 152 """Parameter for read trimming [0]. 153 154 BWA trims a read down to argmax_x{\sum_{i=x+1}^l(INT-q_i)} if q_l<INT 155 where l is the original read length.""", 156 checker_function=lambda x: isinstance(x, int), 157 equate=False), 158 _Option(["-B", "B"], 159 "Length of barcode starting from the 5-end. When INT is positive, the barcode of each read will be trimmed before mapping and will be written at the BC SAM tag. For paired-end reads, the barcode from both ends are concatenated. [0]", 160 checker_function=lambda x: isinstance(x, int), 161 equate=False), 162 _Switch(["-c", "c"], 163 "Reverse query but not complement it, which is required for alignment in the color space."), 164 _Switch(["-N", "N"], 165 "Disable iterative search. All hits with no more than maxDiff differences will be found. This mode is much slower than the default."), 166 _Switch(["-I", "I"], 167 "The input is in the Illumina 1.3+ read format (quality equals ASCII-64)."), 168 _Switch(["-b", "b"], 169 "Specify the input read sequence file is the BAM format"), 170 _Switch(["-b1", "b1"], 171 "When -b is specified, only use the first read in a read pair in mapping (skip single-end reads and the second reads)."), 172 _Switch(["-b2", "b2"], 173 "When -b is specified, only use the second read in a read pair in mapping.") 174 ] 175 AbstractCommandline.__init__(self, cmd, **kwargs)
176 177
178 -class BwaSamseCommandline(AbstractCommandline):
179 """Command line wrapper for Burrows Wheeler Aligner (BWA) samse. 180 181 Generate alignments in the SAM format given single-end reads. 182 Equvialent to:: 183 184 $ bwa samse [-n maxOcc] <in.db.fasta> <in.sai> <in.fq> > <out.sam> 185 186 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 187 188 Example: 189 190 >>> from Bio.Sequencing.Applications import BwaSamseCommandline 191 >>> reference_genome = "/path/to/reference_genome.fasta" 192 >>> read_file = "/path/to/read_1.fq" 193 >>> sai_file = "/path/to/read_1.sai" 194 >>> output_sam_file = "/path/to/read_1.sam" 195 >>> samse_cmd = BwaSamseCommandline(reference=reference_genome, 196 ... read_file=read_file, sai_file=sai_file) 197 >>> print(samse_cmd) 198 bwa samse /path/to/reference_genome.fasta /path/to/read_1.sai /path/to/read_1.fq 199 200 You would typically run the command line using samse_cmd(stdout=output_sam_file) 201 or via the Python subprocess module, as described in the Biopython tutorial. 202 """
203 - def __init__(self, cmd="bwa", **kwargs):
204 self.program_name = cmd 205 self.parameters = \ 206 [ 207 _StaticArgument("samse"), 208 _Argument(["reference"], "Reference file name", filename=True, is_required=True), 209 _Argument(["sai_file"], "Sai file name", filename=True, is_required=True), 210 _Argument(["read_file"], "Read file name", filename=True, is_required=True), 211 _Option(["-n", "n"], 212 """Maximum number of alignments to output in the XA tag for reads paired properly. 213 214 If a read has more than INT hits, the XA tag will not be written. [3]""", 215 checker_function=lambda x: isinstance(x, int), 216 equate=False), 217 _Option(["-r", "r"], 218 "Specify the read group in a format like '@RG\tID:foo\tSM:bar'. [null]", 219 checker_function=lambda x: isinstance(x, int), 220 equate=False), 221 ] 222 AbstractCommandline.__init__(self, cmd, **kwargs)
223 224
225 -class BwaSampeCommandline(AbstractCommandline):
226 """Command line wrapper for Burrows Wheeler Aligner (BWA) sampe. 227 228 Generate alignments in the SAM format given paired-end reads. 229 Equivalent to:: 230 231 $ bwa sampe [...] <in.db.fasta> <in1.sai> <in2.sai> <in1.fq> <in2.fq> > <out.sam> 232 233 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 234 235 Example: 236 237 >>> from Bio.Sequencing.Applications import BwaSampeCommandline 238 >>> reference_genome = "/path/to/reference_genome.fasta" 239 >>> read_file1 = "/path/to/read_1.fq" 240 >>> read_file2 = "/path/to/read_2.fq" 241 >>> sai_file1 = "/path/to/read_1.sai" 242 >>> sai_file2 = "/path/to/read_2.sai" 243 >>> output_sam_file = "/path/to/output.sam" 244 >>> read_group = "@RG\tID:foo\tSM:bar" 245 >>> sampe_cmd = BwaSampeCommandline(reference=reference_genome, 246 ... sai_file1=sai_file1, sai_file2=sai_file2, 247 ... read_file1=read_file1, read_file2=read_file2, 248 ... r=read_group) 249 >>> print(sampe_cmd) 250 bwa sampe /path/to/reference_genome.fasta /path/to/read_1.sai /path/to/read_2.sai /path/to/read_1.fq /path/to/read_2.fq -r @RG ID:foo SM:bar 251 252 You would typically run the command line using sampe_cmd(stdout=output_sam_file) 253 or via the Python subprocess module, as described in the Biopython tutorial. 254 """ 255 #TODO - Should the read group have a raw tab in it, or \t?
256 - def __init__(self, cmd="bwa", **kwargs):
257 self.program_name = cmd 258 self.parameters = \ 259 [ 260 _StaticArgument("sampe"), 261 _Argument(["reference"], "Reference file name", filename=True, is_required=True), 262 _Argument(["sai_file1"], "Sai file 1", filename=True, is_required=True), 263 _Argument(["sai_file2"], "Sai file 2", filename=True, is_required=True), 264 _Argument(["read_file1"], "Read file 1", filename=True, is_required=True), 265 _Argument(["read_file2"], "Read file 2", filename=True, is_required=True), 266 _Option(["-a", "a"], 267 """Maximum insert size for a read pair to be considered being mapped properly [500]. 268 269 Since 0.4.5, this option is only used when there are not enough 270 good alignments to infer the distribution of insert sizes.""", 271 checker_function=lambda x: isinstance(x, int), 272 equate=False), 273 _Option(["-o", "o"], 274 """Maximum occurrences of a read for pairing [100000]. 275 276 A read with more occurrences will be treated as a single-end read. 277 Reducing this parameter helps faster pairing.""", 278 checker_function=lambda x: isinstance(x, int), 279 equate=False), 280 _Option(["-n", "n"], 281 """Maximum number of alignments to output in the XA tag for reads paired properly [3]. 282 283 If a read has more than INT hits, the XA tag will not be written.""", 284 checker_function=lambda x: isinstance(x, int), 285 equate=False), 286 _Option(["-N", "N"], 287 """Maximum number of alignments to output in the XA tag for disconcordant read pairs (excluding singletons) [10]. 288 289 . If a read has more than INT hits, the XA tag will not be written.""", 290 checker_function=lambda x: isinstance(x, int), 291 equate=False), 292 _Option(["-r", "r"], "Specify the read group in a format like '@RG\tID:foo\tSM:bar'. [null]", 293 checker_function=lambda x: isinstance(x, basestring), 294 equate=False), 295 ] 296 AbstractCommandline.__init__(self, cmd, **kwargs)
297 298
299 -class BwaBwaswCommandline(AbstractCommandline):
300 """Command line wrapper for Burrows Wheeler Aligner (BWA) bwasw. 301 302 Align query sequences from FASTQ files. Equivalent to:: 303 304 $ bwa bwasw [...] <in.db.fasta> <in.fq> 305 306 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 307 308 Example: 309 310 >>> from Bio.Sequencing.Applications import BwaBwaswCommandline 311 >>> reference_genome = "/path/to/reference_genome.fasta" 312 >>> read_file = "/path/to/read_1.fq" 313 >>> bwasw_cmd = BwaBwaswCommandline(reference=reference_genome, read_file=read_file) 314 >>> print(bwasw_cmd) 315 bwa bwasw /path/to/reference_genome.fasta /path/to/read_1.fq 316 317 You would typically run the command line using bwasw_cmd() or via the 318 Python subprocess module, as described in the Biopython tutorial. 319 """
320 - def __init__(self, cmd="bwa", **kwargs):
321 self.program_name = cmd 322 self.parameters = \ 323 [ 324 _StaticArgument("bwasw"), 325 _Argument(["reference"], "Reference file name", filename=True, is_required=True), 326 _Argument(["read_file"], "Read file", filename=True, is_required=True), 327 _Argument(["mate_file"], "Mate file", filename=True, is_required=False), 328 _Option(["-a", "a"], 329 "Score of a match [1]", 330 checker_function=lambda x: isinstance(x, int), 331 equate=False), 332 _Option(["-b", "b"], 333 "Mismatch penalty [3]", 334 checker_function=lambda x: isinstance(x, int), 335 equate=False), 336 _Option(["-q", "q"], 337 "Gap open penalty [5]", 338 checker_function=lambda x: isinstance(x, int), 339 equate=False), 340 _Option(["-r", "r"], 341 "Gap extension penalty. The penalty for a contiguous gap of size k is q+k*r. [2]", 342 checker_function=lambda x: isinstance(x, int), 343 equate=False), 344 _Option(["-t", "t"], 345 "Number of threads in the multi-threading mode [1]", 346 checker_function=lambda x: isinstance(x, int), 347 equate=False), 348 _Option(["-w", "w"], 349 "Band width in the banded alignment [33]", 350 checker_function=lambda x: isinstance(x, int), 351 equate=False), 352 _Option(["-T", "T"], 353 "Minimum score threshold divided by a [37]", 354 checker_function=lambda x: isinstance(x, int), 355 equate=False), 356 _Option(["-c", "c"], 357 """Coefficient for threshold adjustment according to query length [5.5]. 358 359 Given an l-long query, the threshold for a hit to be retained is 360 a*max{T,c*log(l)}.""", 361 checker_function=lambda x: isinstance(x, float), 362 equate=False), 363 _Option(["-z", "z"], 364 "Z-best heuristics. Higher -z increases accuracy at the cost of speed. [1]", 365 checker_function=lambda x: isinstance(x, int), 366 equate=False), 367 _Option(["-s", "s"], 368 """Maximum SA interval size for initiating a seed [3]. 369 370 Higher -s increases accuracy at the cost of speed.""", 371 checker_function=lambda x: isinstance(x, int), 372 equate=False), 373 _Option(["-N", "N"], 374 "Minimum number of seeds supporting the resultant alignment to skip reverse alignment. [5]", 375 checker_function=lambda x: isinstance(x, int), 376 equate=False), 377 ] 378 AbstractCommandline.__init__(self, cmd, **kwargs)
379 380
381 -def _test():
382 """Run the module's doctests (PRIVATE).""" 383 print("Running modules doctests...") 384 import doctest 385 doctest.testmod() 386 print("Done")
387 388 389 if __name__ == "__main__": 390 _test() 391