1
2
3
4
5 """Command line wrapper for the multiple alignment programme MAFFT.
6 """
7
8 __docformat__ = "epytext en"
9
10 import os
11 from Bio.Application import _Option, _Switch, _Argument, AbstractCommandline
12
13
15 """Command line wrapper for the multiple alignment program MAFFT.
16
17 http://align.bmr.kyushu-u.ac.jp/mafft/software/
18
19 Example:
20
21 >>> from Bio.Align.Applications import MafftCommandline
22 >>> mafft_exe = "/opt/local/mafft"
23 >>> in_file = "../Doc/examples/opuntia.fasta"
24 >>> mafft_cline = MafftCommandline(mafft_exe, input=in_file)
25 >>> print mafft_cline
26 /opt/local/mafft ../Doc/examples/opuntia.fasta
27
28 If the mafft binary is on the path (typically the case on a Unix style
29 operating system) then you don't need to supply the executable location:
30
31 >>> from Bio.Align.Applications import MafftCommandline
32 >>> in_file = "../Doc/examples/opuntia.fasta"
33 >>> mafft_cline = MafftCommandline(input=in_file)
34 >>> print mafft_cline
35 mafft ../Doc/examples/opuntia.fasta
36
37 You would typically run the command line with mafft_cline() or via
38 the Python subprocess module, as described in the Biopython tutorial.
39 Note that MAFFT will write the alignment to stdout, which you may
40 want to save to a file and then parse, e.g.
41
42 stdout, stderr = mafft_cline()
43 handle = open("aligned.fasta", "w")
44 handle.write(stdout)
45 handle.close()
46 from Bio import AlignIO
47 align = AlignIO.read("aligned.fasta", "fasta")
48
49 Alternatively, to parse the output with AlignIO directly you can
50 use StringIO to turn the string into a handle:
51
52 stdout, stderr = mafft_cline()
53 from StringIO import StringIO
54 from Bio import AlignIO
55 align = AlignIO.read(StringIO(stdout), "fasta")
56
57 Citations:
58
59 Katoh, Toh (BMC Bioinformatics 9:212, 2008) Improved accuracy of
60 multiple ncRNA alignment by incorporating structural information into
61 a MAFFT-based framework (describes RNA structural alignment methods)
62
63 Katoh, Toh (Briefings in Bioinformatics 9:286-298, 2008) Recent
64 developments in the MAFFT multiple sequence alignment program
65 (outlines version 6)
66
67 Katoh, Toh (Bioinformatics 23:372-374, 2007) Errata PartTree: an
68 algorithm to build an approximate tree from a large number of
69 unaligned sequences (describes the PartTree algorithm)
70
71 Katoh, Kuma, Toh, Miyata (Nucleic Acids Res. 33:511-518, 2005) MAFFT
72 version 5: improvement in accuracy of multiple sequence alignment
73 (describes [ancestral versions of] the G-INS-i, L-INS-i and E-INS-i
74 strategies)
75
76 Katoh, Misawa, Kuma, Miyata (Nucleic Acids Res. 30:3059-3066, 2002)
77
78 Last checked against version: MAFFT v6.717b (2009/12/03)
79 """
80 - def __init__(self, cmd="mafft", **kwargs):
81 BLOSUM_MATRICES = ["30","45","62","80"]
82 self.parameters = \
83 [
84
85
86
87 _Switch(["--auto", "auto"],
88 "Automatically select strategy. Default off."),
89
90 _Switch(["--6merpair", "6merpair", "sixmerpair"],
91 "Distance is calculated based on the number of shared "
92 "6mers. Default: on"),
93
94
95
96
97
98 _Switch(["--globalpair", "globalpair"],
99 "All pairwise alignments are computed with the "
100 "Needleman-Wunsch algorithm. Default: off"),
101
102
103
104
105
106 _Switch(["--localpair", "localpair"],
107 "All pairwise alignments are computed with the "
108 "Smith-Waterman algorithm. Default: off"),
109
110
111
112
113
114
115 _Switch(["--genafpair", "genafpair"],
116 "All pairwise alignments are computed with a local "
117 "algorithm with the generalized affine gap cost "
118 "(Altschul 1998). Default: off"),
119
120
121 _Switch(["--fastapair", "fastapair"],
122 "All pairwise alignments are computed with FASTA "
123 "(Pearson and Lipman 1988). Default: off"),
124
125
126
127 _Option(["--weighti", "weighti"],
128 "Weighting factor for the consistency term calculated "
129 "from pairwise alignments. Default: 2.7",
130 checker_function=lambda x: isinstance(x, float),
131 equate=False),
132
133
134 _Option(["--retree", "retree"],
135 "Guide tree is built number times in the progressive "
136 "stage. Valid with 6mer distance. Default: 2",
137 checker_function=lambda x: isinstance(x, int),
138 equate=False),
139
140 _Option(["--maxiterate", "maxiterate"],
141 "Number cycles of iterative refinement are performed. "
142 "Default: 0",
143 checker_function=lambda x: isinstance(x, int),
144 equate=False),
145
146 _Switch(["--fft", "fft"],
147 "Use FFT approximation in group-to-group alignment. "
148 "Default: on"),
149
150
151 _Switch(["--nofft", "nofft"],
152 "Do not use FFT approximation in group-to-group "
153 "alignment. Default: off"),
154
155
156 _Switch(["--noscore", "noscore"],
157 "Alignment score is not checked in the iterative "
158 "refinement stage. Default: off (score is checked)"),
159
160
161 _Switch(["--memsave", "memsave"],
162 "Use the Myers-Miller (1988) algorithm. Default: "
163 "automatically turned on when the alignment length "
164 "exceeds 10,000 (aa/nt)."),
165
166
167
168 _Switch(["--parttree", "parttree"],
169 "Use a fast tree-building method with the 6mer "
170 "distance. Default: off"),
171
172
173
174 _Switch(["--dpparttree", "dpparttree"],
175 "The PartTree algorithm is used with distances "
176 "based on DP. Default: off"),
177
178
179
180
181 _Switch(["--fastaparttree", "fastaparttree"],
182 "The PartTree algorithm is used with distances based "
183 "on FASTA. Default: off"),
184
185 _Option(["--partsize", "partsize"],
186 "The number of partitions in the PartTree algorithm. "
187 "Default: 50",
188 checker_function=lambda x: isinstance(x, int),
189 equate=False),
190
191
192 _Switch(["--groupsize", "groupsize"],
193 "Do not make alignment larger than number sequences. "
194 "Default: the number of input sequences"),
195
196
197 _Switch(["--adjustdirection", "adjustdirection"],
198 "Adjust direction according to the first sequence. "
199 "Default off."),
200
201
202
203 _Switch(["--adjustdirectionaccurately", "adjustdirectionaccurately"],
204 "Adjust direction according to the first sequence,"
205 "for highly diverged data; very slow"
206 "Default off."),
207
208
209 _Option(["--op", "op"],
210 "Gap opening penalty at group-to-group alignment. "
211 "Default: 1.53",
212 checker_function=lambda x: isinstance(x, float),
213 equate=False),
214
215
216 _Option(["--ep", "ep"],
217 "Offset value, which works like gap extension penalty, "
218 "for group-to- group alignment. Default: 0.123",
219 checker_function=lambda x: isinstance(x, float),
220 equate=False),
221
222
223 _Option(["--lop", "lop"],
224 "Gap opening penalty at local pairwise alignment. "
225 "Default: 0.123",
226 checker_function=lambda x: isinstance(x, float),
227 equate=False),
228
229
230 _Option(["--lep", "lep"],
231 "Offset value at local pairwise alignment. "
232 "Default: 0.1",
233 checker_function=lambda x: isinstance(x, float),
234 equate=False),
235
236
237 _Option(["--lexp", "lexp"],
238 "Gap extension penalty at local pairwise alignment. "
239 "Default: -0.1",
240 checker_function=lambda x: isinstance(x, float),
241 equate=False),
242
243
244 _Option(["--LOP", "LOP"],
245 "Gap opening penalty to skip the alignment. "
246 "Default: -6.00",
247 checker_function=lambda x: isinstance(x, float),
248 equate=False),
249
250
251 _Option(["--LEXP", "LEXP"],
252 "Gap extension penalty to skip the alignment. "
253 "Default: 0.00",
254 checker_function=lambda x: isinstance(x, float),
255 equate=False),
256
257
258
259 _Option(["--bl", "bl"],
260 "BLOSUM number matrix is used. Default: 62",
261 checker_function=lambda x: x in BLOSUM_MATRICES,
262 equate=False),
263
264
265 _Option(["--jtt", "jtt"],
266 "JTT PAM number (Jones et al. 1992) matrix is used. "
267 "number>0. Default: BLOSUM62",
268 equate=False),
269
270
271 _Option(["--tm", "tm"],
272 "Transmembrane PAM number (Jones et al. 1994) "
273 "matrix is used. number>0. Default: BLOSUM62",
274 filename=True,
275 equate=False),
276
277
278
279 _Option(["--aamatrix", "aamatrix"],
280 "Use a user-defined AA scoring matrix. "
281 "Default: BLOSUM62",
282 filename=True,
283 equate=False),
284
285
286 _Switch(["--fmodel", "fmodel"],
287 "Incorporate the AA/nuc composition information into "
288 "the scoring matrix (True) or not (False, default)"),
289
290
291 _Switch(["--clustalout", "clustalout"],
292 "Output format: clustal (True) or fasta (False, default)"),
293
294 _Switch(["--inputorder", "inputorder"],
295 "Output order: same as input (True, default) or alignment "
296 "based (False)"),
297
298 _Switch(["--reorder", "reorder"],
299 "Output order: aligned (True) or in input order (False, "
300 "default)"),
301
302 _Switch(["--treeout", "treeout"],
303 "Guide tree is output to the input.tree file (True) or "
304 "not (False, default)"),
305
306 _Switch(["--quiet", "quiet"],
307 "Do not report progress (True) or not (False, default)."),
308
309
310 _Switch(["--nuc", "nuc"],
311 "Assume the sequences are nucleotide (True/False). "
312 "Default: auto"),
313
314 _Switch(["--amino", "amino"],
315 "Assume the sequences are amino acid (True/False). "
316 "Default: auto"),
317
318
319
320
321
322
323 _Option(["--seed", "seed"],
324 "Seed alignments given in alignment_n (fasta format) "
325 "are aligned with sequences in input.",
326 filename=True,
327 equate=False),
328
329
330
331
332
333
334
335
336
337 _Argument(["input"],
338 "Input file name",
339 filename=True,
340 is_required=True),
341
342
343
344 _Argument(["input1"],
345 "Second input file name for the mafft-profile command",
346 filename=True),
347 ]
348 AbstractCommandline.__init__(self, cmd, **kwargs)
349
350
351 if __name__ == "__main__":
352 from Bio._utils import run_doctest
353 run_doctest()
354