1
2
3
4
5
6 """
7 Bio.AlignIO support for the "clustal" output from CLUSTAL W and other tools.
8
9 You are expected to use this module via the Bio.AlignIO functions (or the
10 Bio.SeqIO functions if you want to work directly with the gapped sequences).
11 """
12
13 from Bio.Seq import Seq
14 from Bio.SeqRecord import SeqRecord
15 from Bio.Align import MultipleSeqAlignment
16 from Interfaces import AlignmentIterator, SequentialAlignmentWriter
17
18
20 """Clustalw alignment writer."""
22 """Use this to write (another) single alignment to an open file."""
23
24 if len(alignment) == 0:
25 raise ValueError("Must have at least one sequence")
26 if alignment.get_alignment_length() == 0:
27
28 raise ValueError("Non-empty sequences are required")
29
30
31 try:
32 version = str(alignment._version)
33 except AttributeError:
34 version = ""
35 if not version:
36 version = '1.81'
37 if version.startswith("2."):
38
39 output = "CLUSTAL %s multiple sequence alignment\n\n\n" % version
40 else:
41
42 output = "CLUSTAL X (%s) multiple sequence alignment\n\n\n" % version
43
44 cur_char = 0
45 max_length = len(alignment[0])
46
47 if max_length <= 0:
48 raise ValueError("Non-empty sequences are required")
49
50
51 while cur_char != max_length:
52
53
54 if (cur_char + 50) > max_length:
55 show_num = max_length - cur_char
56 else:
57 show_num = 50
58
59
60
61
62 for record in alignment:
63
64
65
66 line = record.id[0:30].replace(" ", "_").ljust(36)
67 line += str(record.seq[cur_char:(cur_char + show_num)])
68 output += line + "\n"
69
70
71
72 if hasattr(alignment, "_star_info") and alignment._star_info != '':
73 output += (" " * 36) + \
74 alignment._star_info[cur_char:(cur_char + show_num)] + "\n"
75
76 output += "\n"
77 cur_char += show_num
78
79
80 self.handle.write(output + "\n")
81
82
84 """Clustalw alignment iterator."""
85
87 handle = self.handle
88 try:
89
90
91 line = self._header
92 del self._header
93 except AttributeError:
94 line = handle.readline()
95 if not line:
96 raise StopIteration
97
98
99 known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE', 'MSAPROBS']
100 if line.strip().split()[0] not in known_headers:
101 raise ValueError("%s is not a known CLUSTAL header: %s" %
102 (line.strip().split()[0],
103 ", ".join(known_headers)))
104
105
106 version = None
107 for word in line.split():
108 if word[0] == '(' and word[-1] == ')':
109 word = word[1:-1]
110 if word[0] in '0123456789':
111 version = word
112 break
113
114
115 line = handle.readline()
116 while line.strip() == "":
117 line = handle.readline()
118
119
120
121
122 ids = []
123 seqs = []
124 consensus = ""
125 seq_cols = None
126
127
128 while True:
129 if line[0] != " " and line.strip() != "":
130
131 fields = line.rstrip().split()
132
133
134
135 if len(fields) < 2 or len(fields) > 3:
136 raise ValueError("Could not parse line:\n%s" % line)
137
138 ids.append(fields[0])
139 seqs.append(fields[1])
140
141
142 if seq_cols is None:
143 start = len(fields[0]) + line[len(fields[0]):].find(fields[1])
144 end = start + len(fields[1])
145 seq_cols = slice(start, end)
146 del start, end
147 assert fields[1] == line[seq_cols]
148
149 if len(fields) == 3:
150
151 try:
152 letters = int(fields[2])
153 except ValueError:
154 raise ValueError("Could not parse line, bad sequence number:\n%s" % line)
155 if len(fields[1].replace("-", "")) != letters:
156 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line)
157 elif line[0] == " ":
158
159 assert len(ids) == len(seqs)
160 assert len(ids) > 0
161 assert seq_cols is not None
162 consensus = line[seq_cols]
163 assert not line[:seq_cols.start].strip()
164 assert not line[seq_cols.stop:].strip()
165
166 line = handle.readline()
167 assert line.strip() == ""
168 break
169 else:
170
171 break
172 line = handle.readline()
173 if not line:
174 break
175
176 assert line.strip() == ""
177 assert seq_cols is not None
178
179
180 for s in seqs:
181 assert len(s) == len(seqs[0])
182 if consensus:
183 assert len(consensus) == len(seqs[0])
184
185
186 done = False
187 while not done:
188
189
190
191 while (not line) or line.strip() == "":
192 line = handle.readline()
193 if not line:
194 break
195 if not line:
196 break
197
198 if line.split(None, 1)[0] in known_headers:
199
200 done = True
201 self._header = line
202 break
203
204 for i in range(len(ids)):
205 assert line[0] != " ", "Unexpected line:\n%s" % repr(line)
206 fields = line.rstrip().split()
207
208
209
210 if len(fields) < 2 or len(fields) > 3:
211 raise ValueError("Could not parse line:\n%s" % repr(line))
212
213 if fields[0] != ids[i]:
214 raise ValueError("Identifiers out of order? Got '%s' but expected '%s'"
215 % (fields[0], ids[i]))
216
217 if fields[1] != line[seq_cols]:
218 start = len(fields[0]) + line[len(fields[0]):].find(fields[1])
219 assert start == seq_cols.start, 'Old location %s -> %i:XX' % (seq_cols, start)
220 end = start + len(fields[1])
221 seq_cols = slice(start, end)
222 del start, end
223
224
225 seqs[i] += fields[1]
226 assert len(seqs[i]) == len(seqs[0])
227
228 if len(fields) == 3:
229
230 try:
231 letters = int(fields[2])
232 except ValueError:
233 raise ValueError("Could not parse line, bad sequence number:\n%s" % line)
234 if len(seqs[i].replace("-", "")) != letters:
235 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line)
236
237
238 line = handle.readline()
239
240 if consensus:
241 assert line[0] == " "
242 assert seq_cols is not None
243 consensus += line[seq_cols]
244 assert len(consensus) == len(seqs[0])
245 assert not line[:seq_cols.start].strip()
246 assert not line[seq_cols.stop:].strip()
247
248 line = handle.readline()
249
250 assert len(ids) == len(seqs)
251 if len(seqs) == 0 or len(seqs[0]) == 0:
252 raise StopIteration
253
254 if self.records_per_alignment is not None \
255 and self.records_per_alignment != len(ids):
256 raise ValueError("Found %i records in this alignment, told to expect %i"
257 % (len(ids), self.records_per_alignment))
258
259 records = (SeqRecord(Seq(s, self.alphabet), id=i, description=i)
260 for (i, s) in zip(ids, seqs))
261 alignment = MultipleSeqAlignment(records, self.alphabet)
262
263
264 if version:
265 alignment._version = version
266 if consensus:
267 alignment_length = len(seqs[0])
268 assert len(consensus) == alignment_length, \
269 "Alignment length is %i, consensus length is %i, '%s'" \
270 % (alignment_length, len(consensus), consensus)
271 alignment._star_info = consensus
272 return alignment
273
274 if __name__ == "__main__":
275 print "Running a quick self-test"
276
277
278
279 aln_example1 = \
280 """CLUSTAL W (1.81) multiple sequence alignment
281
282
283 gi|4959044|gb|AAD34209.1|AF069 MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN 50
284 gi|671626|emb|CAA85685.1| ---------MSPQTETKASVGFKAGVKEYKLTYYTPEYETKDTDILAAFR 41
285 * *: :: :. :* : :. : . :* :: .
286
287 gi|4959044|gb|AAD34209.1|AF069 LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW 100
288 gi|671626|emb|CAA85685.1| VTPQPG-----------------VPPEEAGAAVAAESSTGT--------- 65
289 : ** **:... *.*** ..
290
291 gi|4959044|gb|AAD34209.1|AF069 LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT 150
292 gi|671626|emb|CAA85685.1| WTTVWTDGLTSLDRYKG-----RCYHIEPVPG------------------ 92
293 .:* * *: .* :* : :* .*
294
295 gi|4959044|gb|AAD34209.1|AF069 SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE 200
296 gi|671626|emb|CAA85685.1| -EKDQCICYVAYPLDLFEEGSVTNMFTSIVGNVFGFKALRALRLEDLRIP 141
297 *::. . .:: :*..* :* .* .. . : . :
298
299 gi|4959044|gb|AAD34209.1|AF069 VPTTRAQRRA 210
300 gi|671626|emb|CAA85685.1| VAYVKTFQGP 151
301 *. .:: : .
302 """
303
304
305
306
307 aln_example2 = \
308 """CLUSTAL X (1.83) multiple sequence alignment
309
310
311 V_Harveyi_PATH --MKNWIKVAVAAIA--LSAA------------------TVQAATEVKVG
312 B_subtilis_YXEM MKMKKWTVLVVAALLAVLSACG------------NGNSSSKEDDNVLHVG
313 B_subtilis_GlnH_homo_YCKK MKKALLALFMVVSIAALAACGAGNDNQSKDNAKDGDLWASIKKKGVLTVG
314 YA80_HAEIN MKKLLFTTALLTGAIAFSTF-----------SHAGEIADRVEKTKTLLVG
315 FLIY_ECOLI MKLAHLGRQALMGVMAVALVAG---MSVKSFADEG-LLNKVKERGTLLVG
316 E_coli_GlnH --MKSVLKVSLAALTLAFAVS------------------SHAADKKLVVA
317 Deinococcus_radiodurans -MKKSLLSLKLSGLLVPSVLALS--------LSACSSPSSTLNQGTLKIA
318 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG
319 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG
320 : . : :.
321
322 V_Harveyi_PATH MSGRYFPFTFVKQ--DKLQGFEVDMWDEIGKRNDYKIEYVTANFSGLFGL
323 B_subtilis_YXEM ATGQSYPFAYKEN--GKLTGFDVEVMEAVAKKIDMKLDWKLLEFSGLMGE
324 B_subtilis_GlnH_homo_YCKK TEGTYEPFTYHDKDTDKLTGYDVEVITEVAKRLGLKVDFKETQWGSMFAG
325 YA80_HAEIN TEGTYAPFTFHDK-SGKLTGFDVEVIRKVAEKLGLKVEFKETQWDAMYAG
326 FLIY_ECOLI LEGTYPPFSFQGD-DGKLTGFEVEFAQQLAKHLGVEASLKPTKWDGMLAS
327 E_coli_GlnH TDTAFVPFEFKQG--DKYVGFDVDLWAAIAKELKLDYELKPMDFSGIIPA
328 Deinococcus_radiodurans MEGTYPPFTSKNE-QGELVGFDVDIAKAVAQKLNLKPEFVLTEWSGILAG
329 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS
330 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS
331 ** .: *::::. : :. . ..:
332
333 V_Harveyi_PATH LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQI
334 B_subtilis_YXEM LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQI
335 B_subtilis_GlnH_homo_YCKK LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVV
336 YA80_HAEIN LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVI
337 FLIY_ECOLI LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQAL
338 E_coli_GlnH LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLV
339 Deinococcus_radiodurans LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEII
340 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV
341 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV
342 *.: . * . * *: :
343
344 """
345
346 from StringIO import StringIO
347
348 alignments = list(ClustalIterator(StringIO(aln_example1)))
349 assert 1 == len(alignments)
350 assert alignments[0]._version == "1.81"
351 alignment = alignments[0]
352 assert 2 == len(alignment)
353 assert alignment[0].id == "gi|4959044|gb|AAD34209.1|AF069"
354 assert alignment[1].id == "gi|671626|emb|CAA85685.1|"
355 assert str(alignment[0].seq) == \
356 "MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN" + \
357 "LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW" + \
358 "LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT" + \
359 "SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE" + \
360 "VPTTRAQRRA"
361
362 alignments = list(ClustalIterator(StringIO(aln_example2)))
363 assert 1 == len(alignments)
364 assert alignments[0]._version == "1.83"
365 alignment = alignments[0]
366 assert 9 == len(alignment)
367 assert alignment[-1].id == "HISJ_E_COLI"
368 assert str(alignment[-1].seq) == \
369 "MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG" + \
370 "TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS" + \
371 "LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV"
372
373 for alignment in ClustalIterator(StringIO(aln_example2 + aln_example1)):
374 print "Alignment with %i records of length %i" \
375 % (len(alignment),
376 alignment.get_alignment_length())
377
378 print "Checking empty file..."
379 assert 0 == len(list(ClustalIterator(StringIO(""))))
380
381 print "Checking write/read..."
382 alignments = list(ClustalIterator(StringIO(aln_example1))) \
383 + list(ClustalIterator(StringIO(aln_example2)))*2
384 handle = StringIO()
385 ClustalWriter(handle).write_file(alignments)
386 handle.seek(0)
387 for i, a in enumerate(ClustalIterator(handle)):
388 assert a.get_alignment_length() == alignments[i].get_alignment_length()
389 handle.seek(0)
390
391 print "Testing write/read when there is only one sequence..."
392 alignment = alignment[0:1]
393 handle = StringIO()
394 ClustalWriter(handle).write_file([alignment])
395 handle.seek(0)
396 for i, a in enumerate(ClustalIterator(handle)):
397 assert a.get_alignment_length() == alignment.get_alignment_length()
398 assert len(a) == 1
399
400 aln_example3 = \
401 """CLUSTAL 2.0.9 multiple sequence alignment
402
403
404 Test1seq ------------------------------------------------------------
405 AT3G20900.1-SEQ ATGAACAAAGTAGCGAGGAAGAACAAAACATCAGGTGAACAAAAAAAAAACTCAATCCAC
406 AT3G20900.1-CDS ------------------------------------------------------------
407
408
409 Test1seq -----AGTTACAATAACTGACGAAGCTAAGTAGGCTACTAATTAACGTCATCAACCTAAT
410 AT3G20900.1-SEQ ATCAAAGTTACAATAACTGACGAAGCTAAGTAGGCTAGAAATTAAAGTCATCAACCTAAT
411 AT3G20900.1-CDS ------------------------------------------------------------
412
413
414 Test1seq ACATAGCACTTAGAAAAAAGTGAAGTAAGAAAATATAAAATAATAAAAGGGTGGGTTATC
415 AT3G20900.1-SEQ ACATAGCACTTAGAAAAAAGTGAAGCAAGAAAATATAAAATAATAAAAGGGTGGGTTATC
416 AT3G20900.1-CDS ------------------------------------------------------------
417
418
419 Test1seq AATTGATAGTGTAAATCATCGTATTCCGGTGATATACCCTACCACAAAAACTCAAACCGA
420 AT3G20900.1-SEQ AATTGATAGTGTAAATCATAGTTGATTTTTGATATACCCTACCACAAAAACTCAAACCGA
421 AT3G20900.1-CDS ------------------------------------------------------------
422
423
424 Test1seq CTTGATTCAAATCATCTCAATAAATTAGCGCCAAAATAATGAAAAAAATAATAACAAACA
425 AT3G20900.1-SEQ CTTGATTCAAATCATCTCAAAAAACAAGCGCCAAAATAATGAAAAAAATAATAACAAAAA
426 AT3G20900.1-CDS ------------------------------------------------------------
427
428
429 Test1seq AAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATT
430 AT3G20900.1-SEQ CAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATT
431 AT3G20900.1-CDS ------------------------------------------------------------
432
433
434 Test1seq GTATTAACAAATCAAAGAGCTGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGAT
435 AT3G20900.1-SEQ GTATTAACAAATCAAAGAGATGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGAT
436 AT3G20900.1-CDS ------------------------------------------------------------
437
438
439 Test1seq CCTATATCAACGTAAACAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGT
440 AT3G20900.1-SEQ CCTATATCAAAAAAAAAAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGT
441 AT3G20900.1-CDS ------------------------------------------------------ATGAAC
442 *
443
444 Test1seq TCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGT
445 AT3G20900.1-SEQ GCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGT
446 AT3G20900.1-CDS AAAGTAGCGAGGAAGAACAAAACATC------AGCAAAGAAAACGATCTGTCTCCGTCGT
447 * *** ***** * * ** ****************************
448
449 Test1seq AACACACGGTCGCTAGAGAAACTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC
450 AT3G20900.1-SEQ AACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC
451 AT3G20900.1-CDS AACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC
452 ******* ** * **** ***************************************
453
454 Test1seq GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCGTGGTGACGTCAGCACCGCT
455 AT3G20900.1-SEQ GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCT
456 AT3G20900.1-CDS GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCT
457 **************************************** *******************
458
459 Test1seq GCTGGGGATGGAGAGGGAACAGAGTT-
460 AT3G20900.1-SEQ GCTGGGGATGGAGAGGGAACAGAGTAG
461 AT3G20900.1-CDS GCTGGGGATGGAGAGGGAACAGAGTAG
462 *************************
463 """
464 alignments = list(ClustalIterator(StringIO(aln_example3)))
465 assert 1 == len(alignments)
466 assert alignments[0]._version == "2.0.9"
467
468 print "The End"
469