Package Bio :: Package Align :: Package Applications :: Module _Muscle
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.Applications._Muscle

  1  # Copyright 2009 by Cymon J. Cox.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Command line wrapper for the multiple alignment program MUSCLE. 
  6  """ 
  7   
  8  from __future__ import print_function 
  9   
 10  __docformat__ = "restructuredtext en"  # Don't just use plain text in epydoc API pages! 
 11   
 12  from Bio.Application import _Option, _Switch, AbstractCommandline 
 13   
 14   
15 -class MuscleCommandline(AbstractCommandline):
16 r"""Command line wrapper for the multiple alignment program MUSCLE. 17 18 http://www.drive5.com/muscle/ 19 20 Example: 21 -------- 22 23 >>> from Bio.Align.Applications import MuscleCommandline 24 >>> muscle_exe = r"C:\Program Files\Aligments\muscle3.8.31_i86win32.exe" 25 >>> in_file = r"C:\My Documents\unaligned.fasta" 26 >>> out_file = r"C:\My Documents\aligned.fasta" 27 >>> muscle_cline = MuscleCommandline(muscle_exe, input=in_file, out=out_file) 28 >>> print(muscle_cline) 29 "C:\Program Files\Aligments\muscle3.8.31_i86win32.exe" -in "C:\My Documents\unaligned.fasta" -out "C:\My Documents\aligned.fasta" 30 31 You would typically run the command line with muscle_cline() or via 32 the Python subprocess module, as described in the Biopython tutorial. 33 34 Citations: 35 ---------- 36 37 Edgar, Robert C. (2004), MUSCLE: multiple sequence alignment with high 38 accuracy and high throughput, Nucleic Acids Research 32(5), 1792-97. 39 40 Edgar, R.C. (2004) MUSCLE: a multiple sequence alignment method with 41 reduced time and space complexity. BMC Bioinformatics 5(1): 113. 42 43 Last checked against version: 3.7, briefly against 3.8 44 """
45 - def __init__(self, cmd="muscle", **kwargs):
46 CLUSTERING_ALGORITHMS = ["upgma", "upgmb", "neighborjoining"] 47 DISTANCE_MEASURES_ITER1 = ["kmer6_6", "kmer20_3", "kmer20_4", "kbit20_3", 48 "kmer4_6"] 49 DISTANCE_MEASURES_ITER2 = DISTANCE_MEASURES_ITER1 + \ 50 ["pctid_kimura", "pctid_log"] 51 OBJECTIVE_SCORES = ["sp", "ps", "dp", "xp", "spf", "spm"] 52 TREE_ROOT_METHODS = ["pseudo", "midlongestspan", "minavgleafdist"] 53 SEQUENCE_TYPES = ["protein", "nucleo", "auto"] 54 WEIGHTING_SCHEMES = ["none", "clustalw", "henikoff", "henikoffpb", "gsc", "threeway"] 55 self.parameters = \ 56 [ 57 # Can't use "in" as the final alias as this is a reserved word in python: 58 _Option(["-in", "in", "input"], 59 "Input filename", 60 filename=True, 61 equate=False), 62 _Option(["-out", "out"], 63 "Output filename", 64 filename=True, 65 equate=False), 66 _Switch(["-diags", "diags"], 67 "Find diagonals (faster for similar sequences)"), 68 _Switch(["-profile", "profile"], 69 "Perform a profile alignment"), 70 _Option(["-in1", "in1"], 71 "First input filename for profile alignment", 72 filename=True, 73 equate=False), 74 _Option(["-in2", "in2"], 75 "Second input filename for a profile alignment", 76 filename=True, 77 equate=False), 78 # anchorspacing Integer 32 Minimum spacing between 79 _Option(["-anchorspacing", "anchorspacing"], 80 "Minimum spacing between anchor columns", 81 checker_function=lambda x: isinstance(x, int), 82 equate=False), 83 # center Floating point [1] Center parameter. 84 # Should be negative. 85 _Option(["-center", "center"], 86 "Center parameter - should be negative", 87 checker_function=lambda x: isinstance(x, float), 88 equate=False), 89 # cluster1 upgma upgmb Clustering method. 90 _Option(["-cluster1", "cluster1"], 91 "Clustering method used in iteration 1", 92 checker_function=lambda x: x in CLUSTERING_ALGORITHMS, 93 equate=False), 94 # cluster2 upgmb cluster1 is used in 95 # neighborjoining iteration 1 and 2, 96 # cluster2 in later 97 # iterations. 98 _Option(["-cluster2", "cluster2"], 99 "Clustering method used in iteration 2", 100 checker_function=lambda x: x in CLUSTERING_ALGORITHMS, 101 equate=False), 102 # diaglength Integer 24 Minimum length of 103 # diagonal. 104 _Option(["-diaglength", "diaglength"], 105 "Minimum length of diagonal", 106 checker_function=lambda x: isinstance(x, int), 107 equate=True), 108 # diagmargin Integer 5 Discard this many 109 # positions at ends of 110 # diagonal. 111 _Option(["-diagmargin", "diagmargin"], 112 "Discard this many positions at ends of diagonal", 113 checker_function=lambda x: isinstance(x, int), 114 equate=False), 115 # distance1 kmer6_6 Kmer6_6 (amino) or Distance measure for 116 # kmer20_3 Kmer4_6 (nucleo) iteration 1. 117 # kmer20_4 118 # kbit20_3 119 # kmer4_6 120 _Option(["-distance1", "distance1"], 121 "Distance measure for iteration 1", 122 checker_function=lambda x: x in DISTANCE_MEASURES_ITER1, 123 equate=False), 124 # distance2 kmer6_6 pctid_kimura Distance measure for 125 # kmer20_3 iterations 2, 3 ... 126 # kmer20_4 127 # kbit20_3 128 # pctid_kimura 129 # pctid_log 130 _Option(["-distance2", "distance2"], 131 "Distance measure for iteration 2", 132 checker_function=lambda x: x in DISTANCE_MEASURES_ITER2, 133 equate=False), 134 # gapopen Floating point [1] The gap open score. 135 # Must be negative. 136 _Option(["-gapopen", "gapopen"], 137 "Gap open score - negative number", 138 checker_function=lambda x: isinstance(x, float), 139 equate=False), 140 # hydro Integer 5 Window size for 141 # determining whether a 142 # region is hydrophobic. 143 _Option(["-hydro", "hydro"], 144 "Window size for hydrophobic region", 145 checker_function=lambda x: isinstance(x, int), 146 equate=False), 147 # hydrofactor Floating point 1.2 Multiplier for gap 148 # open/close penalties in 149 # hydrophobic regions. 150 _Option(["-hydrofactor", "hydrofactor"], 151 "Multiplier for gap penalties in hydrophobic regions", 152 checker_function=lambda x: isinstance(x, float), 153 equate=False), 154 # log File name None. Log file name (delete 155 # existing file). 156 _Option(["-log", "log"], 157 "Log file name", 158 filename=True, 159 equate=False), 160 # loga File name None. Log file name (append 161 # to existing file). 162 _Option(["-loga", "loga"], 163 "Log file name (append to existing file)", 164 filename=True, 165 equate=False), 166 # maxdiagbreak Integer 1 Maximum distance 167 # between two diagonals 168 # that allows them to 169 # merge into one 170 # diagonal. 171 _Option(["-maxdiagbreak", "maxdiagbreak"], 172 "Maximum distance between two diagonals that allows " 173 "them to merge into one diagonal", 174 checker_function=lambda x: isinstance(x, int), 175 equate=False), 176 # maxhours Floating point None. Maximum time to run in 177 # hours. The actual time 178 # may exceed the 179 # requested limit by a 180 # few minutes. Decimals 181 # are allowed, so 1.5 182 # means one hour and 30 183 # minutes. 184 _Option(["-maxhours", "maxhours"], 185 "Maximum time to run in hours", 186 checker_function=lambda x: isinstance(x, float), 187 equate=False), 188 # maxiters Integer 1, 2 ... 16 Maximum number of 189 # iterations. 190 _Option(["-maxiters", "maxiters"], 191 "Maximum number of iterations", 192 checker_function=lambda x: isinstance(x, int), 193 equate=False), 194 # maxtrees Integer 1 Maximum number of new 195 # trees to build in 196 # iteration 2. 197 _Option(["-maxtrees", "maxtrees"], 198 "Maximum number of trees to build in iteration 2", 199 checker_function=lambda x: isinstance(x, int), 200 equate=False), 201 # minbestcolscore Floating point [1] Minimum score a column 202 # must have to be an 203 # anchor. 204 _Option(["-minbestcolscore", "minbestcolscore"], 205 "Minimum score a column must have to be an anchor", 206 checker_function=lambda x: isinstance(x, float), 207 equate=False), 208 # minsmoothscore Floating point [1] Minimum smoothed score 209 # a column must have to 210 # be an anchor. 211 _Option(["-minsmoothscore", "minsmoothscore"], 212 "Minimum smoothed score a column must have to " 213 "be an anchor", 214 checker_function=lambda x: isinstance(x, float), 215 equate=False), 216 # objscore sp spm Objective score used by 217 # ps tree dependent 218 # dp refinement. 219 # xp sp=sum-of-pairs score. 220 # spf spf=sum-of-pairs score 221 # spm (dimer approximation) 222 # spm=sp for < 100 seqs, 223 # otherwise spf 224 # dp=dynamic programming 225 # score. 226 # ps=average profile- 227 # sequence score. 228 # xp=cross profile score. 229 _Option(["-objscore", "objscore"], 230 "Objective score used by tree dependent refinement", 231 checker_function=lambda x: x in OBJECTIVE_SCORES, 232 equate=False), 233 # root1 pseudo pseudo Method used to root 234 _Option(["-root1", "root1"], 235 "Method used to root tree in iteration 1", 236 checker_function=lambda x: x in TREE_ROOT_METHODS, 237 equate=False), 238 # root2 midlongestspan tree; root1 is used in 239 # minavgleafdist iteration 1 and 2, 240 # root2 in later 241 # iterations. 242 _Option(["-root2", "root2"], 243 "Method used to root tree in iteration 2", 244 checker_function=lambda x: x in TREE_ROOT_METHODS, 245 equate=False), 246 # seqtype protein auto Sequence type. 247 # nucleo 248 # auto 249 _Option(["-seqtype", "seqtype"], 250 "Sequence type", 251 checker_function=lambda x: x in SEQUENCE_TYPES, 252 equate=False), 253 # smoothscoreceil Floating point [1] Maximum value of column 254 # score for smoothing 255 # purposes. 256 _Option(["-smoothscoreceil", "smoothscoreceil"], 257 "Maximum value of column score for smoothing", 258 checker_function=lambda x: isinstance(x, float), 259 equate=False), 260 # smoothwindow Integer 7 Window used for anchor 261 # column smoothing. 262 _Option(["-smoothwindow", "smoothwindow"], 263 "Window used for anchor column smoothing", 264 checker_function=lambda x: isinstance(x, int), 265 equate=False), 266 # SUEFF Floating point value 0.1 Constant used in UPGMB 267 # between 0 and 1. clustering. Determines 268 # the relative fraction 269 # of average linkage 270 # (SUEFF) vs. nearest- 271 # neighbor linkage (1 272 # SUEFF). 273 _Option(["-sueff", "sueff"], 274 "Constant used in UPGMB clustering", 275 checker_function=lambda x: isinstance(x, float), 276 equate=False), 277 # tree1 File name None Save tree produced in 278 _Option(["-tree1", "tree1"], 279 "Save Newick tree from iteration 1", 280 equate=False), 281 # tree2 first or second 282 # iteration to given file 283 # in Newick (Phylip- 284 # compatible) format. 285 _Option(["-tree2", "tree2"], 286 "Save Newick tree from iteration 2", 287 equate=False), 288 # weight1 none clustalw Sequence weighting 289 _Option(["-weight1", "weight1"], 290 "Weighting scheme used in iteration 1", 291 checker_function=lambda x: x in WEIGHTING_SCHEMES, 292 equate=False), 293 # weight2 henikoff scheme. 294 # henikoffpb weight1 is used in 295 # gsc iterations 1 and 2. 296 # clustalw weight2 is used for 297 # threeway tree-dependent 298 # refinement. 299 # none=all sequences have 300 # equal weight. 301 # henikoff=Henikoff & 302 # Henikoff weighting 303 # scheme. 304 # henikoffpb=Modified 305 # Henikoff scheme as used 306 # in PSI-BLAST. 307 # clustalw=CLUSTALW 308 # method. 309 # threeway=Gotoh three- 310 # way method. 311 _Option(["-weight2", "weight2"], 312 "Weighting scheme used in iteration 2", 313 checker_function=lambda x: x in WEIGHTING_SCHEMES, 314 equate=False), 315 # ################### FORMATS ####################################### 316 # Multiple formats can be specified on the command line 317 # If -msf appears it will be used regardless of other formats 318 # specified. If -clw appears (and not -msf), clustalw format will be 319 # used regardless of other formats specified. If both -clw and 320 # -clwstrict are specified -clwstrict will be used regardless of 321 # other formats specified. If -fasta is specified and not -msf, 322 # -clw, or clwstrict, fasta will be used. If -fasta and -html are 323 # specified -fasta will be used. Only if -html is specified alone 324 # will html be used. I kid ye not. 325 # clw no Write output in CLUSTALW format (default is 326 # FASTA). 327 _Switch(["-clw", "clw"], 328 "Write output in CLUSTALW format (with a MUSCLE header)"), 329 # clwstrict no Write output in CLUSTALW format with the 330 # "CLUSTAL W (1.81)" header rather than the 331 # MUSCLE version. This is useful when a post- 332 # processing step is picky about the file 333 # header. 334 _Switch(["-clwstrict", "clwstrict"], 335 "Write output in CLUSTALW format with version 1.81 header"), 336 # fasta yes Write output in FASTA format. Alternatives 337 # include clw, 338 # clwstrict, msf and html. 339 _Switch(["-fasta", "fasta"], 340 "Write output in FASTA format"), 341 # html no Write output in HTML format (default is 342 # FASTA). 343 _Switch(["-html", "html"], 344 "Write output in HTML format"), 345 # msf no Write output in MSF format (default is 346 # FASTA). 347 _Switch(["-msf", "msf"], 348 "Write output in MSF format"), 349 # Phylip interleaved - undocumented as of 3.7 350 _Switch(["-phyi", "phyi"], 351 "Write output in PHYLIP interleaved format"), 352 # Phylip sequential - undocumented as of 3.7 353 _Switch(["-phys", "phys"], 354 "Write output in PHYLIP sequential format"), 355 # ################# Additional specified output files ######### 356 _Option(["-phyiout", "phyiout"], 357 "Write PHYLIP interleaved output to specified filename", 358 filename=True, 359 equate=False), 360 _Option(["-physout", "physout"], "Write PHYLIP sequential format to specified filename", 361 filename=True, 362 equate=False), 363 _Option(["-htmlout", "htmlout"], "Write HTML output to specified filename", 364 filename=True, 365 equate=False), 366 _Option(["-clwout", "clwout"], 367 "Write CLUSTALW output (with MUSCLE header) to specified " 368 "filename", 369 filename=True, 370 equate=False), 371 _Option(["-clwstrictout", "clwstrictout"], 372 "Write CLUSTALW output (with version 1.81 header) to " 373 "specified filename", 374 filename=True, 375 equate=False), 376 _Option(["-msfout", "msfout"], 377 "Write MSF format output to specified filename", 378 filename=True, 379 equate=False), 380 _Option(["-fastaout", "fastaout"], 381 "Write FASTA format output to specified filename", 382 filename=True, 383 equate=False), 384 # ############# END FORMATS ################################### 385 # anchors yes Use anchor optimization in tree dependent 386 # refinement iterations. 387 _Switch(["-anchors", "anchors"], 388 "Use anchor optimisation in tree dependent " 389 "refinement iterations"), 390 # noanchors no Disable anchor optimization. Default is 391 # anchors. 392 _Switch(["-noanchors", "noanchors"], 393 "Do not use anchor optimisation in tree dependent " 394 "refinement iterations"), 395 # group yes Group similar sequences together in the 396 # output. This is the default. See also 397 # stable. 398 _Switch(["-group", "group"], 399 "Group similar sequences in output"), 400 # stable no Preserve input order of sequences in output 401 # file. Default is to group sequences by 402 # similarity (group). 403 _Switch(["-stable", "stable"], 404 "Do not group similar sequences in output (not supported in v3.8)"), 405 # ############# log-expectation profile score ###################### 406 # One of either -le, -sp, or -sv 407 # 408 # According to the doc, spn is default and the only option for 409 # nucleotides: this doesnt appear to be true. -le, -sp, and -sv can 410 # be used and produce numerically different logs (what is going on?) 411 # 412 # spn fails on proteins 413 # le maybe Use log-expectation profile score (VTML240). 414 # Alternatives are to use sp or sv. This is 415 # the default for amino acid sequences. 416 _Switch(["-le", "le"], 417 "Use log-expectation profile score (VTML240)"), 418 # sv no Use sum-of-pairs profile score (VTML240). 419 # Default is le. 420 _Switch(["-sv", "sv"], 421 "Use sum-of-pairs profile score (VTML240)"), 422 # sp no Use sum-of-pairs protein profile score 423 # (PAM200). Default is le. 424 _Switch(["-sp", "sp"], 425 "Use sum-of-pairs protein profile score (PAM200)"), 426 # spn maybe Use sum-of-pairs nucleotide profile score 427 # (BLASTZ parameters). This is the only option 428 # for nucleotides, and is therefore the 429 # default. 430 _Switch(["-spn", "spn"], 431 "Use sum-of-pairs protein nucleotide profile score"), 432 # ############# END log-expectation profile score ###################### 433 # quiet no Do not display progress messages. 434 _Switch(["-quiet", "quiet"], 435 "Use sum-of-pairs protein nucleotide profile score"), 436 # refine no Input file is already aligned, skip first 437 # two iterations and begin tree dependent 438 # refinement. 439 _Switch(["-refine", "refine"], 440 "Only do tree dependent refinement"), 441 # core yes in muscle, Do not catch exceptions. 442 # no in muscled. 443 _Switch(["-core", "core"], 444 "Catch exceptions"), 445 # nocore no in muscle, Catch exceptions and give an error message 446 # yes in muscled. if possible. 447 _Switch(["-nocore", "nocore"], 448 "Do not catch exceptions"), 449 # termgapsfull no Terminal gaps penalized with full penalty. 450 # [1] Not fully supported in this version. 451 # 452 # termgapshalf yes Terminal gaps penalized with half penalty. 453 # [1] Not fully supported in this version. 454 # 455 # termgapshalflonger no Terminal gaps penalized with half penalty if 456 # gap relative to 457 # longer sequence, otherwise with full 458 # penalty. 459 # [1] Not fully supported in this version. 460 # verbose no Write parameter settings and progress 461 # messages to log file. 462 _Switch(["-verbose", "verbose"], 463 "Write parameter settings and progress"), 464 # version no Write version string to stdout and exit. 465 _Switch(["-version", "version"], 466 "Write version string to stdout and exit"), 467 ] 468 AbstractCommandline.__init__(self, cmd, **kwargs)
469 470
471 -def _test():
472 """Run the module's doctests (PRIVATE).""" 473 print("Running MUSCLE doctests...") 474 import doctest 475 doctest.testmod() 476 print("Done")
477 478 if __name__ == "__main__": 479 _test() 480