Package Bio :: Package Align :: Package Applications :: Module _Mafft
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.Applications._Mafft

  1  # Copyright 2009 by Cymon J. Cox.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Command line wrapper for the multiple alignment programme MAFFT. 
  6  """ 
  7   
  8  from __future__ import print_function 
  9   
 10  __docformat__ = "restructuredtext en" 
 11   
 12  import os 
 13  from Bio.Application import _Option, _Switch, _Argument, AbstractCommandline 
 14   
 15   
class MafftCommandline(AbstractCommandline):
    """Command line wrapper for the multiple alignment program MAFFT.

    http://align.bmr.kyushu-u.ac.jp/mafft/software/

    Example:
    --------

    >>> from Bio.Align.Applications import MafftCommandline
    >>> mafft_exe = "/opt/local/mafft"
    >>> in_file = "../Doc/examples/opuntia.fasta"
    >>> mafft_cline = MafftCommandline(mafft_exe, input=in_file)
    >>> print(mafft_cline)
    /opt/local/mafft ../Doc/examples/opuntia.fasta

    If the mafft binary is on the path (typically the case on a Unix style
    operating system) then you don't need to supply the executable location:

    >>> from Bio.Align.Applications import MafftCommandline
    >>> in_file = "../Doc/examples/opuntia.fasta"
    >>> mafft_cline = MafftCommandline(input=in_file)
    >>> print(mafft_cline)
    mafft ../Doc/examples/opuntia.fasta

    You would typically run the command line with mafft_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.
    Note that MAFFT will write the alignment to stdout, which you may
    want to save to a file and then parse, e.g.::

        stdout, stderr = mafft_cline()
        with open("aligned.fasta", "w") as handle:
            handle.write(stdout)
        from Bio import AlignIO
        align = AlignIO.read("aligned.fasta", "fasta")

    Alternatively, to parse the output with AlignIO directly you can
    use StringIO to turn the string into a handle (on Python 3 import
    it with ``from io import StringIO`` instead)::

        stdout, stderr = mafft_cline()
        from StringIO import StringIO
        from Bio import AlignIO
        align = AlignIO.read(StringIO(stdout), "fasta")

    Citations:
    ----------

    Katoh, Toh (BMC Bioinformatics 9:212, 2008) Improved accuracy of
    multiple ncRNA alignment by incorporating structural information into
    a MAFFT-based framework (describes RNA structural alignment methods)

    Katoh, Toh (Briefings in Bioinformatics 9:286-298, 2008) Recent
    developments in the MAFFT multiple sequence alignment program
    (outlines version 6)

    Katoh, Toh (Bioinformatics 23:372-374, 2007)  Errata PartTree: an
    algorithm to build an approximate tree from a large number of
    unaligned sequences (describes the PartTree algorithm)

    Katoh, Kuma, Toh, Miyata (Nucleic Acids Res. 33:511-518, 2005) MAFFT
    version 5: improvement in accuracy of multiple sequence alignment
    (describes [ancestral versions of] the G-INS-i, L-INS-i and E-INS-i
    strategies)

    Katoh, Misawa, Kuma, Miyata (Nucleic Acids Res. 30:3059-3066, 2002)

    Last checked against version: MAFFT v6.717b (2009/12/03)
    """

    def __init__(self, cmd="mafft", **kwargs):
        """Declare the supported MAFFT parameters and initialise the wrapper.

        Arguments:
         - cmd - name of (or path to) the MAFFT executable, default "mafft".
         - kwargs - initial parameter values keyed by the Python-side
           parameter name (e.g. ``input="opuntia.fasta"``); forwarded
           unchanged to ``AbstractCommandline.__init__``.
        """
        BLOSUM_MATRICES = ["30", "45", "62", "80"]
        self.parameters = [
            # **** Algorithm ****
            # Automatically selects an appropriate strategy from L-INS-i,
            # FFT-NS-i and FFT-NS-2, according to data size.
            # Default: off (always FFT-NS-2)
            _Switch(["--auto", "auto"],
                    "Automatically select strategy. Default off."),
            # Distance is calculated based on the number of shared 6mers.
            # Default: on
            _Switch(["--6merpair", "6merpair", "sixmerpair"],
                    "Distance is calculated based on the number of shared "
                    "6mers. Default: on"),
            # All pairwise alignments are computed with the Needleman-Wunsch
            # algorithm. More accurate but slower than --6merpair. Suitable
            # for a set of globally alignable sequences. Applicable to up to
            # ~200 sequences. A combination with --maxiterate 1000 is
            # recommended (G-INS-i). Default: off (6mer distance is used)
            _Switch(["--globalpair", "globalpair"],
                    "All pairwise alignments are computed with the "
                    "Needleman-Wunsch algorithm. Default: off"),
            # All pairwise alignments are computed with the Smith-Waterman
            # algorithm. More accurate but slower than --6merpair. Suitable
            # for a set of locally alignable sequences. Applicable to up to
            # ~200 sequences. A combination with --maxiterate 1000 is
            # recommended (L-INS-i). Default: off (6mer distance is used)
            _Switch(["--localpair", "localpair"],
                    "All pairwise alignments are computed with the "
                    "Smith-Waterman algorithm. Default: off"),
            # All pairwise alignments are computed with a local algorithm
            # with the generalized affine gap cost (Altschul 1998). More
            # accurate but slower than --6merpair. Suitable when large
            # internal gaps are expected. Applicable to up to ~200 sequences.
            # A combination with --maxiterate 1000 is recommended (E-INS-i).
            # Default: off (6mer distance is used)
            _Switch(["--genafpair", "genafpair"],
                    "All pairwise alignments are computed with a local "
                    "algorithm with the generalized affine gap cost "
                    "(Altschul 1998). Default: off"),
            # All pairwise alignments are computed with FASTA (Pearson and
            # Lipman 1988). FASTA is required.
            # Default: off (6mer distance is used)
            _Switch(["--fastapair", "fastapair"],
                    "All pairwise alignments are computed with FASTA "
                    "(Pearson and Lipman 1988). Default: off"),
            # Weighting factor for the consistency term calculated from
            # pairwise alignments. Valid when either of --globalpair,
            # --localpair, --genafpair, --fastapair or --blastpair is
            # selected. Default: 2.7
            _Option(["--weighti", "weighti"],
                    "Weighting factor for the consistency term calculated "
                    "from pairwise alignments. Default: 2.7",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # Guide tree is built number times in the progressive stage.
            # Valid with 6mer distance. Default: 2
            _Option(["--retree", "retree"],
                    "Guide tree is built number times in the progressive "
                    "stage. Valid with 6mer distance. Default: 2",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            # Number cycles of iterative refinement are performed. Default: 0
            _Option(["--maxiterate", "maxiterate"],
                    "Number cycles of iterative refinement are performed. "
                    "Default: 0",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            # Use FFT approximation in group-to-group alignment. Default: on
            _Switch(["--fft", "fft"],
                    "Use FFT approximation in group-to-group alignment. "
                    "Default: on"),
            # Do not use FFT approximation in group-to-group alignment.
            # Default: off
            _Switch(["--nofft", "nofft"],
                    "Do not use FFT approximation in group-to-group "
                    "alignment. Default: off"),
            # Alignment score is not checked in the iterative refinement
            # stage. Default: off (score is checked)
            _Switch(["--noscore", "noscore"],
                    "Alignment score is not checked in the iterative "
                    "refinement stage. Default: off (score is checked)"),
            # Use the Myers-Miller (1988) algorithm. Default: automatically
            # turned on when the alignment length exceeds 10,000 (aa/nt).
            _Switch(["--memsave", "memsave"],
                    "Use the Myers-Miller (1988) algorithm. Default: "
                    "automatically turned on when the alignment length "
                    "exceeds 10,000 (aa/nt)."),
            # Use a fast tree-building method (PartTree, Katoh and Toh 2007)
            # with the 6mer distance. Recommended when a large number
            # (> ~10,000) of sequences are input. Default: off
            _Switch(["--parttree", "parttree"],
                    "Use a fast tree-building method with the 6mer "
                    "distance. Default: off"),
            # The PartTree algorithm is used with distances based on DP.
            # Slightly more accurate and slower than --parttree. Recommended
            # when a large number (> ~10,000) of sequences are input.
            # Default: off
            _Switch(["--dpparttree", "dpparttree"],
                    "The PartTree algorithm is used with distances "
                    "based on DP. Default: off"),
            # The PartTree algorithm is used with distances based on FASTA.
            # Slightly more accurate and slower than --parttree. Recommended
            # when a large number (> ~10,000) of sequences are input. FASTA
            # is required. Default: off
            _Switch(["--fastaparttree", "fastaparttree"],
                    "The PartTree algorithm is used with distances based "
                    "on FASTA. Default: off"),
            # The number of partitions in the PartTree algorithm. Default: 50
            _Option(["--partsize", "partsize"],
                    "The number of partitions in the PartTree algorithm. "
                    "Default: 50",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            # Do not make alignment larger than number sequences. Valid only
            # with the --*parttree options.
            # Default: the number of input sequences
            # NOTE: like --partsize this option carries a numeric value
            # ("--groupsize number" in the MAFFT manual), so it is declared
            # as an _Option rather than a value-less _Switch.
            _Option(["--groupsize", "groupsize"],
                    "Do not make alignment larger than number sequences. "
                    "Default: the number of input sequences",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            # Adjust direction according to the first sequence
            # Mafft V6 beta function
            _Switch(["--adjustdirection", "adjustdirection"],
                    "Adjust direction according to the first sequence. "
                    "Default off."),
            # Adjust direction according to the first sequence
            # for highly diverged data; very slow
            # Mafft V6 beta function
            _Switch(["--adjustdirectionaccurately",
                     "adjustdirectionaccurately"],
                    "Adjust direction according to the first sequence, "
                    "for highly diverged data; very slow. "
                    "Default off."),
            # **** Parameter ****
            # Gap opening penalty at group-to-group alignment. Default: 1.53
            _Option(["--op", "op"],
                    "Gap opening penalty at group-to-group alignment. "
                    "Default: 1.53",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # Offset value, which works like gap extension penalty, for
            # group-to-group alignment. Default: 0.123
            _Option(["--ep", "ep"],
                    "Offset value, which works like gap extension penalty, "
                    "for group-to-group alignment. Default: 0.123",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # Gap opening penalty at local pairwise alignment. Valid when
            # the --localpair or --genafpair option is selected.
            # Default: -2.00
            _Option(["--lop", "lop"],
                    "Gap opening penalty at local pairwise alignment. "
                    "Default: -2.00",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # Offset value at local pairwise alignment. Valid when the
            # --localpair or --genafpair option is selected. Default: 0.1
            _Option(["--lep", "lep"],
                    "Offset value at local pairwise alignment. "
                    "Default: 0.1",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # Gap extension penalty at local pairwise alignment. Valid when
            # the --localpair or --genafpair option is selected.
            # Default: -0.1
            _Option(["--lexp", "lexp"],
                    "Gap extension penalty at local pairwise alignment. "
                    "Default: -0.1",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # Gap opening penalty to skip the alignment. Valid when the
            # --genafpair option is selected. Default: -6.00
            _Option(["--LOP", "LOP"],
                    "Gap opening penalty to skip the alignment. "
                    "Default: -6.00",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # Gap extension penalty to skip the alignment. Valid when the
            # --genafpair option is selected. Default: 0.00
            _Option(["--LEXP", "LEXP"],
                    "Gap extension penalty to skip the alignment. "
                    "Default: 0.00",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),

            # BLOSUM number matrix (Henikoff and Henikoff 1992) is used.
            # number=30, 45, 62 or 80. Default: 62
            # The checker compares against strings, so pass e.g. bl="62".
            _Option(["--bl", "bl"],
                    "BLOSUM number matrix is used. Default: 62",
                    checker_function=lambda x: x in BLOSUM_MATRICES,
                    equate=False),
            # JTT PAM number (Jones et al. 1992) matrix is used. number>0.
            # Default: BLOSUM62
            _Option(["--jtt", "jtt"],
                    "JTT PAM number (Jones et al. 1992) matrix is used. "
                    "number>0. Default: BLOSUM62",
                    equate=False),
            # Transmembrane PAM number (Jones et al. 1994) matrix is used.
            # number>0. Default: BLOSUM62
            # The value is a PAM number, not a path, so (like --jtt) it is
            # not flagged as a filename.
            _Option(["--tm", "tm"],
                    "Transmembrane PAM number (Jones et al. 1994) "
                    "matrix is used. number>0. Default: BLOSUM62",
                    equate=False),
            # Use a user-defined AA scoring matrix. The format of matrixfile
            # is the same to that of BLAST. Ignored when nucleotide
            # sequences are input. Default: BLOSUM62
            _Option(["--aamatrix", "aamatrix"],
                    "Use a user-defined AA scoring matrix. "
                    "Default: BLOSUM62",
                    filename=True,
                    equate=False),
            # Incorporate the AA/nuc composition information into the
            # scoring matrix. Default: off
            _Switch(["--fmodel", "fmodel"],
                    "Incorporate the AA/nuc composition information into "
                    "the scoring matrix (True) or not (False, default)"),
            # **** Output ****
            # Name length for CLUSTAL and PHYLIP format output
            _Option(["--namelength", "namelength"],
                    """Name length in CLUSTAL and PHYLIP output.

                    MAFFT v6.847 (2011) added --namelength for use with
                    the --clustalout option for CLUSTAL output.

                    MAFFT v7.024 (2013) added support for this with the
                    --phylipout option for PHYLIP output (default 10).
                    """,
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            # Output format: clustal format. Default: off (fasta format)
            _Switch(["--clustalout", "clustalout"],
                    "Output format: clustal (True) or fasta (False, default)"),
            # Output format: phylip format.
            # Added in beta with v6.847, fixed in v6.850 (2011)
            _Switch(["--phylipout", "phylipout"],
                    "Output format: phylip (True), or fasta (False, default)"),
            # Output order: same as input. Default: on
            _Switch(["--inputorder", "inputorder"],
                    "Output order: same as input (True, default) or alignment "
                    "based (False)"),
            # Output order: aligned. Default: off (inputorder)
            _Switch(["--reorder", "reorder"],
                    "Output order: aligned (True) or in input order (False, "
                    "default)"),
            # Guide tree is output to the input.tree file. Default: off
            _Switch(["--treeout", "treeout"],
                    "Guide tree is output to the input.tree file (True) or "
                    "not (False, default)"),
            # Do not report progress. Default: off
            _Switch(["--quiet", "quiet"],
                    "Do not report progress (True) or not (False, default)."),
            # **** Input ****
            # Assume the sequences are nucleotide. Default: auto
            _Switch(["--nuc", "nuc"],
                    "Assume the sequences are nucleotide (True/False). "
                    "Default: auto"),
            # Assume the sequences are amino acid. Default: auto
            _Switch(["--amino", "amino"],
                    "Assume the sequences are amino acid (True/False). "
                    "Default: auto"),
            # MAFFT has multiple --seed commands where the unaligned input is
            # aligned to the seed alignment. There can be multiple seeds in
            # the form: "mafft --seed align1 --seed align2 [etc] input"
            # Effectively for n number of seed alignments.
            # TODO - Can we use class _ArgumentList here?
            _Option(["--seed", "seed"],
                    "Seed alignments given in alignment_n (fasta format) "
                    "are aligned with sequences in input.",
                    filename=True,
                    equate=False),
            # The input (must be FASTA format)
            _Argument(["input"],
                      "Input file name",
                      filename=True,
                      is_required=True),
            # mafft-profile takes a second alignment input as an argument:
            # mafft-profile align1 align2
            _Argument(["input1"],
                      "Second input file name for the mafft-profile command",
                      filename=True),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)


if __name__ == "__main__":
    # When executed as a script, run the doctests embedded in this module's
    # docstrings using Biopython's shared doctest helper.
    from Bio._utils import run_doctest

    run_doctest()