Package Bio :: Package Sequencing :: Package Applications :: Module _bwa
[hide private]
[frames] | no frames]

Source Code for Module Bio.Sequencing.Applications._bwa

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """Command line wrapper for bwa 
  7  """ 
  8   
  9  from __future__ import print_function 
 10  from Bio._py3k import basestring 
 11   
 12  __docformat__ = "restructuredtext en" 
 13   
 14  from Bio.Application import _Option, _Argument, _Switch, AbstractCommandline 
 15  from Bio.Application import _StaticArgument 
 16   
 17   
18 -class BwaIndexCommandline(AbstractCommandline):
19 """Command line wrapper for Burrows Wheeler Aligner (BWA) index. 20 21 Index database sequences in the FASTA format, equivalent to:: 22 23 $ bwa index [-p prefix] [-a algoType] [-c] <in.db.fasta> 24 25 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 26 27 Example 28 ------- 29 30 >>> from Bio.Sequencing.Applications import BwaIndexCommandline 31 >>> reference_genome = "/path/to/reference_genome.fasta" 32 >>> index_cmd = BwaIndexCommandline(infile=reference_genome, algorithm="bwtsw") 33 >>> print(index_cmd) 34 bwa index -a bwtsw /path/to/reference_genome.fasta 35 36 You would typically run the command using index_cmd() or via the 37 Python subprocess module, as described in the Biopython tutorial. 38 39 """
40 - def __init__(self, cmd="bwa", **kwargs):
41 self.program_name = cmd 42 self.parameters = \ 43 [ 44 _StaticArgument("index"), 45 _Option(["-a", "a", "algorithm"], 46 """Algorithm for constructing BWT index. 47 48 Available options are: 49 - is: IS linear-time algorithm for constructing suffix array. 50 It requires 5.37N memory where N is the size of the database. 51 IS is moderately fast, but does not work with database larger 52 than 2GB. IS is the default algorithm due to its simplicity. 53 - bwtsw: Algorithm implemented in BWT-SW. This method works with the 54 whole human genome, but it does not work with database 55 smaller than 10MB and it is usually slower than IS.""", 56 checker_function=lambda x: x in ["is", "bwtsw"], 57 equate=False, is_required=True), 58 _Option(["-p", "p", "prefix"], 59 "Prefix of the output database [same as db filename]", 60 equate=False, is_required=False), 61 _Argument(["infile"], "Input file name", filename=True, is_required=True), 62 _Switch(["-c", "c"], 63 "Build color-space index. The input fasta should be in nucleotide space.") 64 ] 65 AbstractCommandline.__init__(self, cmd, **kwargs)
66 67
68 -class BwaAlignCommandline(AbstractCommandline):
69 """Command line wrapper for Burrows Wheeler Aligner (BWA) aln. 70 71 Run a BWA alignment, equivalent to:: 72 73 $ bwa aln [...] <in.db.fasta> <in.query.fq> > <out.sai> 74 75 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 76 77 Example: 78 79 >>> from Bio.Sequencing.Applications import BwaAlignCommandline 80 >>> reference_genome = "/path/to/reference_genome.fasta" 81 >>> read_file = "/path/to/read_1.fq" 82 >>> output_sai_file = "/path/to/read_1.sai" 83 >>> read_group="@RG\tID:foo\tSM:bar" 84 >>> align_cmd = BwaAlignCommandline(reference=reference_genome, read_file=read_file) 85 >>> print(align_cmd) 86 bwa aln /path/to/reference_genome.fasta /path/to/read_1.fq 87 88 You would typically run the command line using align_cmd(stdout=output_sai_file) 89 or via the Python subprocess module, as described in the Biopython tutorial. 90 """
91 - def __init__(self, cmd="bwa", **kwargs):
92 self.program_name = cmd 93 self.parameters = \ 94 [ 95 _StaticArgument("aln"), 96 _Argument(["reference"], "Reference file name", 97 filename=True, is_required=True), 98 _Argument(["read_file"], "Read file name", 99 filename=True, is_required=True), 100 _Option(["-n", "n"], 101 "Maximum edit distance if the value is INT, or the fraction of missing alignments given 2% uniform base error rate if FLOAT. In the latter case, the maximum edit distance is automatically chosen for different read lengths. [0.04]", 102 checker_function=lambda x: isinstance(x, (int, float)), 103 equate=False), 104 _Option(["-o", "o"], 105 "Maximum edit distance if the value is INT, or the fraction of missing alignments given 2% uniform base error rate if FLOAT. In the latter case, the maximum edit distance is automatically chosen for different read lengths. [0.04]", 106 checker_function=lambda x: isinstance(x, (int, float)), 107 equate=False), 108 _Option(["-e", "e"], 109 "Maximum number of gap extensions, -1 for k-difference mode (disallowing long gaps) [-1]", 110 checker_function=lambda x: isinstance(x, int), 111 equate=False), 112 _Option(["-d", "d"], 113 "Disallow a long deletion within INT bp towards the 3-end [16]", 114 checker_function=lambda x: isinstance(x, int), 115 equate=False), 116 _Option(["-i", "i"], 117 "Disallow an indel within INT bp towards the ends [5]", 118 checker_function=lambda x: isinstance(x, int), 119 equate=False), 120 _Option(["-l", "l"], 121 """Take the first INT subsequence as seed. 122 123 If INT is larger than the query sequence, seeding will be disabled. 124 For long reads, this option is typically ranged from 25 to 35 for 125 -k 2. [inf]""", 126 checker_function=lambda x: isinstance(x, int), 127 equate=False), 128 _Option(["-k", "k"], "Maximum edit distance in the seed [2]", 129 checker_function=lambda x: isinstance(x, int), 130 equate=False), 131 _Option(["-t", "t"], "Number of threads (multi-threading mode) [1]", 132 checker_function=lambda x: isinstance(x, int), 133 equate=False), 134 _Option(["-M", "M"], 135 "Mismatch penalty. BWA will not search for suboptimal hits with a score lower than (bestScore-misMsc). [3]", 136 checker_function=lambda x: isinstance(x, int), 137 equate=False), 138 _Option(["-O", "O"], "Gap open penalty [11]", 139 checker_function=lambda x: isinstance(x, int), 140 equate=False), 141 _Option(["-E", "E"], "Gap extension penalty [4]", 142 checker_function=lambda x: isinstance(x, int), 143 equate=False), 144 _Option(["-R", "R"], 145 """Proceed with suboptimal alignments if there are no more than INT equally best hits. 146 147 This option only affects paired-end mapping. Increasing this threshold helps 148 to improve the pairing accuracy at the cost of speed, especially for short 149 reads (~32bp).""", 150 checker_function=lambda x: isinstance(x, int), 151 equate=False), 152 _Option(["-q", "q"], 153 """Parameter for read trimming [0]. 154 155 BWA trims a read down to argmax_x{\sum_{i=x+1}^l(INT-q_i)} if q_l<INT 156 where l is the original read length.""", 157 checker_function=lambda x: isinstance(x, int), 158 equate=False), 159 _Option(["-B", "B"], 160 "Length of barcode starting from the 5-end. When INT is positive, the barcode of each read will be trimmed before mapping and will be written at the BC SAM tag. For paired-end reads, the barcode from both ends are concatenated. [0]", 161 checker_function=lambda x: isinstance(x, int), 162 equate=False), 163 _Switch(["-c", "c"], 164 "Reverse query but not complement it, which is required for alignment in the color space."), 165 _Switch(["-N", "N"], 166 "Disable iterative search. All hits with no more than maxDiff differences will be found. This mode is much slower than the default."), 167 _Switch(["-I", "I"], 168 "The input is in the Illumina 1.3+ read format (quality equals ASCII-64)."), 169 _Switch(["-b", "b"], 170 "Specify the input read sequence file is the BAM format"), 171 _Switch(["-b1", "b1"], 172 "When -b is specified, only use the first read in a read pair in mapping (skip single-end reads and the second reads)."), 173 _Switch(["-b2", "b2"], 174 "When -b is specified, only use the second read in a read pair in mapping.") 175 ] 176 AbstractCommandline.__init__(self, cmd, **kwargs)
177 178
179 -class BwaSamseCommandline(AbstractCommandline):
180 """Command line wrapper for Burrows Wheeler Aligner (BWA) samse. 181 182 Generate alignments in the SAM format given single-end reads. 183 Equvialent to:: 184 185 $ bwa samse [-n maxOcc] <in.db.fasta> <in.sai> <in.fq> > <out.sam> 186 187 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 188 189 Example: 190 191 >>> from Bio.Sequencing.Applications import BwaSamseCommandline 192 >>> reference_genome = "/path/to/reference_genome.fasta" 193 >>> read_file = "/path/to/read_1.fq" 194 >>> sai_file = "/path/to/read_1.sai" 195 >>> output_sam_file = "/path/to/read_1.sam" 196 >>> samse_cmd = BwaSamseCommandline(reference=reference_genome, 197 ... read_file=read_file, sai_file=sai_file) 198 >>> print(samse_cmd) 199 bwa samse /path/to/reference_genome.fasta /path/to/read_1.sai /path/to/read_1.fq 200 201 You would typically run the command line using samse_cmd(stdout=output_sam_file) 202 or via the Python subprocess module, as described in the Biopython tutorial. 203 """
204 - def __init__(self, cmd="bwa", **kwargs):
205 self.program_name = cmd 206 self.parameters = \ 207 [ 208 _StaticArgument("samse"), 209 _Argument(["reference"], "Reference file name", filename=True, is_required=True), 210 _Argument(["sai_file"], "Sai file name", filename=True, is_required=True), 211 _Argument(["read_file"], "Read file name", filename=True, is_required=True), 212 _Option(["-n", "n"], 213 """Maximum number of alignments to output in the XA tag for reads paired properly. 214 215 If a read has more than INT hits, the XA tag will not be written. [3]""", 216 checker_function=lambda x: isinstance(x, int), 217 equate=False), 218 _Option(["-r", "r"], 219 "Specify the read group in a format like '@RG\tID:foo\tSM:bar'. [null]", 220 checker_function=lambda x: isinstance(x, int), 221 equate=False), 222 ] 223 AbstractCommandline.__init__(self, cmd, **kwargs)
224 225
226 -class BwaSampeCommandline(AbstractCommandline):
227 """Command line wrapper for Burrows Wheeler Aligner (BWA) sampe. 228 229 Generate alignments in the SAM format given paired-end reads. 230 Equivalent to:: 231 232 $ bwa sampe [...] <in.db.fasta> <in1.sai> <in2.sai> <in1.fq> <in2.fq> > <out.sam> 233 234 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 235 236 Example: 237 238 >>> from Bio.Sequencing.Applications import BwaSampeCommandline 239 >>> reference_genome = "/path/to/reference_genome.fasta" 240 >>> read_file1 = "/path/to/read_1.fq" 241 >>> read_file2 = "/path/to/read_2.fq" 242 >>> sai_file1 = "/path/to/read_1.sai" 243 >>> sai_file2 = "/path/to/read_2.sai" 244 >>> output_sam_file = "/path/to/output.sam" 245 >>> read_group = "@RG\tID:foo\tSM:bar" 246 >>> sampe_cmd = BwaSampeCommandline(reference=reference_genome, 247 ... sai_file1=sai_file1, sai_file2=sai_file2, 248 ... read_file1=read_file1, read_file2=read_file2, 249 ... r=read_group) 250 >>> print(sampe_cmd) 251 bwa sampe /path/to/reference_genome.fasta /path/to/read_1.sai /path/to/read_2.sai /path/to/read_1.fq /path/to/read_2.fq -r @RG ID:foo SM:bar 252 253 You would typically run the command line using sampe_cmd(stdout=output_sam_file) 254 or via the Python subprocess module, as described in the Biopython tutorial. 255 """ 256 # TODO - Should the read group have a raw tab in it, or \t?
257 - def __init__(self, cmd="bwa", **kwargs):
258 self.program_name = cmd 259 self.parameters = \ 260 [ 261 _StaticArgument("sampe"), 262 _Argument(["reference"], "Reference file name", filename=True, is_required=True), 263 _Argument(["sai_file1"], "Sai file 1", filename=True, is_required=True), 264 _Argument(["sai_file2"], "Sai file 2", filename=True, is_required=True), 265 _Argument(["read_file1"], "Read file 1", filename=True, is_required=True), 266 _Argument(["read_file2"], "Read file 2", filename=True, is_required=True), 267 _Option(["-a", "a"], 268 """Maximum insert size for a read pair to be considered being mapped properly [500]. 269 270 Since 0.4.5, this option is only used when there are not enough 271 good alignments to infer the distribution of insert sizes.""", 272 checker_function=lambda x: isinstance(x, int), 273 equate=False), 274 _Option(["-o", "o"], 275 """Maximum occurrences of a read for pairing [100000]. 276 277 A read with more occurrences will be treated as a single-end read. 278 Reducing this parameter helps faster pairing.""", 279 checker_function=lambda x: isinstance(x, int), 280 equate=False), 281 _Option(["-n", "n"], 282 """Maximum number of alignments to output in the XA tag for reads paired properly [3]. 283 284 If a read has more than INT hits, the XA tag will not be written.""", 285 checker_function=lambda x: isinstance(x, int), 286 equate=False), 287 _Option(["-N", "N"], 288 """Maximum number of alignments to output in the XA tag for disconcordant read pairs (excluding singletons) [10]. 289 290 . If a read has more than INT hits, the XA tag will not be written.""", 291 checker_function=lambda x: isinstance(x, int), 292 equate=False), 293 _Option(["-r", "r"], "Specify the read group in a format like '@RG\tID:foo\tSM:bar'. [null]", 294 checker_function=lambda x: isinstance(x, basestring), 295 equate=False), 296 ] 297 AbstractCommandline.__init__(self, cmd, **kwargs)
298 299
300 -class BwaBwaswCommandline(AbstractCommandline):
301 """Command line wrapper for Burrows Wheeler Aligner (BWA) bwasw. 302 303 Align query sequences from FASTQ files. Equivalent to:: 304 305 $ bwa bwasw [...] <in.db.fasta> <in.fq> 306 307 See http://bio-bwa.sourceforge.net/bwa.shtml for details. 308 309 Example: 310 311 >>> from Bio.Sequencing.Applications import BwaBwaswCommandline 312 >>> reference_genome = "/path/to/reference_genome.fasta" 313 >>> read_file = "/path/to/read_1.fq" 314 >>> bwasw_cmd = BwaBwaswCommandline(reference=reference_genome, read_file=read_file) 315 >>> print(bwasw_cmd) 316 bwa bwasw /path/to/reference_genome.fasta /path/to/read_1.fq 317 318 You would typically run the command line using bwasw_cmd() or via the 319 Python subprocess module, as described in the Biopython tutorial. 320 """
321 - def __init__(self, cmd="bwa", **kwargs):
322 self.program_name = cmd 323 self.parameters = \ 324 [ 325 _StaticArgument("bwasw"), 326 _Argument(["reference"], "Reference file name", filename=True, is_required=True), 327 _Argument(["read_file"], "Read file", filename=True, is_required=True), 328 _Argument(["mate_file"], "Mate file", filename=True, is_required=False), 329 _Option(["-a", "a"], 330 "Score of a match [1]", 331 checker_function=lambda x: isinstance(x, int), 332 equate=False), 333 _Option(["-b", "b"], 334 "Mismatch penalty [3]", 335 checker_function=lambda x: isinstance(x, int), 336 equate=False), 337 _Option(["-q", "q"], 338 "Gap open penalty [5]", 339 checker_function=lambda x: isinstance(x, int), 340 equate=False), 341 _Option(["-r", "r"], 342 "Gap extension penalty. The penalty for a contiguous gap of size k is q+k*r. [2]", 343 checker_function=lambda x: isinstance(x, int), 344 equate=False), 345 _Option(["-t", "t"], 346 "Number of threads in the multi-threading mode [1]", 347 checker_function=lambda x: isinstance(x, int), 348 equate=False), 349 _Option(["-w", "w"], 350 "Band width in the banded alignment [33]", 351 checker_function=lambda x: isinstance(x, int), 352 equate=False), 353 _Option(["-T", "T"], 354 "Minimum score threshold divided by a [37]", 355 checker_function=lambda x: isinstance(x, int), 356 equate=False), 357 _Option(["-c", "c"], 358 """Coefficient for threshold adjustment according to query length [5.5]. 359 360 Given an l-long query, the threshold for a hit to be retained is 361 a*max{T,c*log(l)}.""", 362 checker_function=lambda x: isinstance(x, float), 363 equate=False), 364 _Option(["-z", "z"], 365 "Z-best heuristics. Higher -z increases accuracy at the cost of speed. [1]", 366 checker_function=lambda x: isinstance(x, int), 367 equate=False), 368 _Option(["-s", "s"], 369 """Maximum SA interval size for initiating a seed [3]. 370 371 Higher -s increases accuracy at the cost of speed.""", 372 checker_function=lambda x: isinstance(x, int), 373 equate=False), 374 _Option(["-N", "N"], 375 "Minimum number of seeds supporting the resultant alignment to skip reverse alignment. [5]", 376 checker_function=lambda x: isinstance(x, int), 377 equate=False), 378 ] 379 AbstractCommandline.__init__(self, cmd, **kwargs)
380 381
382 -def _test():
383 """Run the module's doctests (PRIVATE).""" 384 print("Running modules doctests...") 385 import doctest 386 doctest.testmod() 387 print("Done")
388 389 390 if __name__ == "__main__": 391 _test() 392