1
2
3
4
5
6 """
7 AlignIO support for the "phylip" format used in Joe Felsenstein's PHYLIP tools.
8
9 You are expected to use this module via the Bio.AlignIO functions (or the
10 Bio.SeqIO functions if you want to work directly with the gapped sequences).
11
12 Support for "relaxed phylip" format is also provided. Relaxed phylip differs
13 from standard phylip format in the following ways:
14
15 * No whitespace is allowed in the sequence ID.
16 * No truncation is performed. Instead, sequence IDs are padded to the longest
17 ID length, rather than 10 characters. A space separates the sequence
18 identifier from the sequence.
19
20 Relaxed phylip is supported by RAxML and PHYML.
21
22 Note
23 ====
24 In TREE_PUZZLE (Schmidt et al. 2003) and PHYML (Guindon and Gascuel 2003)
25 a dot/period (".") in a sequence is interpreted as meaning the same
26 character as in the first sequence. The PHYLIP documentation from 3.3 to 3.69
27 http://evolution.genetics.washington.edu/phylip/doc/sequence.html says:
28
29 "a period was also previously allowed but it is no longer allowed,
30 because it sometimes is used in different senses in other programs"
31
32 Biopython 1.58 or later treats dots/periods in the sequence as invalid, both
33 for reading and writing. Older versions did nothing special with a dot/period.
34 """
35 import string
36
37 from Bio.Seq import Seq
38 from Bio.SeqRecord import SeqRecord
39 from Bio.Align import MultipleSeqAlignment
40 from Interfaces import AlignmentIterator, SequentialAlignmentWriter
41
42 _PHYLIP_ID_WIDTH = 10
43
44
46 """Phylip alignment writer."""
47
49 """Use this to write (another) single alignment to an open file.
50
51 This code will write interlaced alignments (when the sequences are
52 longer than 50 characters).
53
54 Note that record identifiers are strictly truncated to id_width,
55 defaulting to the value required to comply with the PHYLIP standard.
56
57 For more information on the file format, please see:
58 http://evolution.genetics.washington.edu/phylip/doc/sequence.html
59 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
60 """
61 handle = self.handle
62
63 if len(alignment) == 0:
64 raise ValueError("Must have at least one sequence")
65 length_of_seqs = alignment.get_alignment_length()
66 for record in alignment:
67 if length_of_seqs != len(record.seq):
68 raise ValueError("Sequences must all be the same length")
69 if length_of_seqs <= 0:
70 raise ValueError("Non-empty sequences are required")
71
72
73
74 names = []
75 seqs = []
76 for record in alignment:
77 """
78 Quoting the PHYLIP version 3.6 documentation:
79
80 The name should be ten characters in length, filled out to
81 the full ten characters by blanks if shorter. Any printable
82 ASCII/ISO character is allowed in the name, except for
83 parentheses ("(" and ")"), square brackets ("[" and "]"),
84 colon (":"), semicolon (";") and comma (","). If you forget
85 to extend the names to ten characters in length by blanks,
86 the program [i.e. PHYLIP] will get out of synchronization
87 with the contents of the data file, and an error message will
88 result.
89
90 Note that Tab characters count as only one character in the
91 species names. Their inclusion can cause trouble.
92 """
93 name = record.id.strip()
94
95
96 for char in "[](),":
97 name = name.replace(char, "")
98 for char in ":;":
99 name = name.replace(char, "|")
100 name = name[:id_width]
101 if name in names:
102 raise ValueError("Repeated name %r (originally %r), "
103 "possibly due to truncation"
104 % (name, record.id))
105 names.append(name)
106 sequence = str(record.seq)
107 if "." in sequence:
108
109 raise ValueError("PHYLIP format no longer allows dots in "
110 "sequence")
111 seqs.append(sequence)
112
113
114
115
116
117
118 handle.write(" %i %s\n" % (len(alignment), length_of_seqs))
119 block = 0
120 while True:
121 for name, sequence in zip(names, seqs):
122 if block == 0:
123
124
125 handle.write(name[:id_width].ljust(id_width))
126 else:
127
128 handle.write(" " * id_width)
129
130 for chunk in range(0, 5):
131 i = block*50 + chunk*10
132 seq_segment = sequence[i:i+10]
133
134
135
136 handle.write(" %s" % seq_segment)
137 if i+10 > length_of_seqs:
138 break
139 handle.write("\n")
140 block = block+1
141 if block*50 > length_of_seqs:
142 break
143 handle.write("\n")
144
145
147 """Reads a Phylip alignment file returning a MultipleSeqAlignment iterator.
148
149 Record identifiers are limited to at most 10 characters.
150
151 It only copes with interlaced phylip files! Sequential files won't work
152 where the sequences are split over multiple lines.
153
154 For more information on the file format, please see:
155 http://evolution.genetics.washington.edu/phylip/doc/sequence.html
156 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
157 """
158
159
160 id_width = _PHYLIP_ID_WIDTH
161
163 line = line.strip()
164 parts = filter(None, line.split())
165 if len(parts) != 2:
166 return False
167 try:
168 number_of_seqs = int(parts[0])
169 length_of_seqs = int(parts[1])
170 return True
171 except ValueError:
172 return False
173
175 """
176 Extracts the sequence ID from a Phylip line, returning a tuple
177 containing:
178
179 (sequence_id, sequence_residues)
180
181 The first 10 characters in the line are are the sequence id, the
182 remainder are sequence data.
183 """
184 seq_id = line[:self.id_width].strip()
185 seq = line[self.id_width:].strip().replace(' ', '')
186 return seq_id, seq
187
189 handle = self.handle
190
191 try:
192
193
194 line = self._header
195 del self._header
196 except AttributeError:
197 line = handle.readline()
198
199 if not line:
200 raise StopIteration
201 line = line.strip()
202 parts = filter(None, line.split())
203 if len(parts) != 2:
204 raise ValueError("First line should have two integers")
205 try:
206 number_of_seqs = int(parts[0])
207 length_of_seqs = int(parts[1])
208 except ValueError:
209 raise ValueError("First line should have two integers")
210
211 assert self._is_header(line)
212
213 if self.records_per_alignment is not None \
214 and self.records_per_alignment != number_of_seqs:
215 raise ValueError("Found %i records in this alignment, told to expect %i"
216 % (number_of_seqs, self.records_per_alignment))
217
218 ids = []
219 seqs = []
220
221
222
223 for i in xrange(number_of_seqs):
224 line = handle.readline().rstrip()
225 sequence_id, s = self._split_id(line)
226 ids.append(sequence_id)
227 if "." in s:
228 raise ValueError("PHYLIP format no longer allows dots in sequence")
229 seqs.append([s])
230
231
232 line = ""
233 while True:
234
235 while "" == line.strip():
236 line = handle.readline()
237 if not line:
238 break
239 if not line:
240 break
241
242 if self._is_header(line):
243
244 self._header = line
245 break
246
247
248 for i in xrange(number_of_seqs):
249 s = line.strip().replace(" ", "")
250 if "." in s:
251 raise ValueError("PHYLIP format no longer allows dots in sequence")
252 seqs[i].append(s)
253 line = handle.readline()
254 if (not line) and i+1 < number_of_seqs:
255 raise ValueError("End of file mid-block")
256 if not line:
257 break
258
259 records = (SeqRecord(Seq("".join(s), self.alphabet),
260 id=i, name=i, description=i)
261 for (i, s) in zip(ids, seqs))
262 return MultipleSeqAlignment(records, self.alphabet)
263
264
265
267 """
268 Relaxed Phylip format writer
269 """
270
272 """
273 Write a relaxed phylip alignment
274 """
275
276 for name in (s.id.strip() for s in alignment):
277 if any(c in name for c in string.whitespace):
278 raise ValueError("Whitespace not allowed in identifier: %s"
279 % name)
280
281
282
283
284
285 if len(alignment) == 0:
286 id_width = 1
287 else:
288 id_width = max((len(s.id.strip()) for s in alignment)) + 1
289 super(RelaxedPhylipWriter, self).write_alignment(alignment, id_width)
290
291
293 """
294 Relaxed Phylip format Iterator
295 """
296
298 """Returns the ID, sequence data from a line
299 Extracts the sequence ID from a Phylip line, returning a tuple
300 containing:
301
302 (sequence_id, sequence_residues)
303
304 For relaxed format - split at the first whitespace character
305 """
306 seq_id, sequence = line.split(None, 1)
307 sequence = sequence.strip().replace(" ", "")
308 return seq_id, sequence
309
310
312 """
313 Sequential Phylip format Writer
314 """
316 handle = self.handle
317
318 if len(alignment) == 0:
319 raise ValueError("Must have at least one sequence")
320 length_of_seqs = alignment.get_alignment_length()
321 for record in alignment:
322 if length_of_seqs != len(record.seq):
323 raise ValueError("Sequences must all be the same length")
324 if length_of_seqs <= 0:
325 raise ValueError("Non-empty sequences are required")
326
327
328
329 names = []
330 for record in alignment:
331 name = record.id.strip()
332
333
334 for char in "[](),":
335 name = name.replace(char, "")
336 for char in ":;":
337 name = name.replace(char, "|")
338 name = name[:id_width]
339 if name in names:
340 raise ValueError("Repeated name %r (originally %r), "
341 "possibly due to truncation"
342 % (name, record.id))
343 names.append(name)
344
345
346
347
348
349
350 handle.write(" %i %s\n" % (len(alignment), length_of_seqs))
351 for name, record in zip(names, alignment):
352 sequence = str(record.seq)
353 if "." in sequence:
354 raise ValueError("PHYLIP format no longer allows dots in "
355 "sequence")
356 handle.write(name[:id_width].ljust(id_width))
357
358
359 handle.write(sequence)
360 handle.write("\n")
361
362
364 """
365 Sequential Phylip format Iterator
366
367 The sequential format carries the same restrictions as the normal
368 interleaved one, with the difference being that the sequences are listed
369 sequentially, each sequence written in its entirety before the start of
370 the next. According to the PHYLIP documentation for input file formatting,
371 newlines and spaces may optionally be entered at any point in the sequences.
372 """
374 handle = self.handle
375
376 try:
377
378
379 line = self._header
380 del self._header
381 except AttributeError:
382 line = handle.readline()
383
384 if not line:
385 raise StopIteration
386 line = line.strip()
387 parts = filter(None, line.split())
388 if len(parts) != 2:
389 raise ValueError("First line should have two integers")
390 try:
391 number_of_seqs = int(parts[0])
392 length_of_seqs = int(parts[1])
393 except ValueError:
394 raise ValueError("First line should have two integers")
395
396 assert self._is_header(line)
397
398 if self.records_per_alignment is not None \
399 and self.records_per_alignment != number_of_seqs:
400 raise ValueError("Found %i records in this alignment, told to expect %i"
401 % (number_of_seqs, self.records_per_alignment))
402
403 ids = []
404 seqs = []
405
406
407
408 for i in xrange(number_of_seqs):
409 line = handle.readline().rstrip()
410 sequence_id, s = self._split_id(line)
411 ids.append(sequence_id)
412 while len(s) < length_of_seqs:
413
414 line = handle.readline().strip()
415 if not line:
416 break
417 if line == "":
418 continue
419 s = "".join([s, line.strip().replace(" ", "")])
420 if len(s) > length_of_seqs:
421 raise ValueError("Found a record of length %i, should be %i"
422 % (len(s), length_of_seqs))
423 if "." in s:
424 raise ValueError("PHYLIP format no longer allows dots in sequence")
425 seqs.append(s)
426 while True:
427
428 line = handle.readline()
429 if not line:
430 break
431 if self._is_header(line):
432 self._header = line
433 break
434
435 records = (SeqRecord(Seq(s, self.alphabet),
436 id=i, name=i, description=i)
437 for (i, s) in zip(ids, seqs))
438 return MultipleSeqAlignment(records, self.alphabet)
439
440
441 if __name__ == "__main__":
442 print "Running short mini-test"
443
444 phylip_text = """ 8 286
445 V_Harveyi_ --MKNWIKVA VAAIA--LSA A--------- ---------T VQAATEVKVG
446 B_subtilis MKMKKWTVLV VAALLAVLSA CG-------- ----NGNSSS KEDDNVLHVG
447 B_subtilis MKKALLALFM VVSIAALAAC GAGNDNQSKD NAKDGDLWAS IKKKGVLTVG
448 YA80_HAEIN MKKLLFTTAL LTGAIAFSTF ---------- -SHAGEIADR VEKTKTLLVG
449 FLIY_ECOLI MKLAHLGRQA LMGVMAVALV AG---MSVKS FADEG-LLNK VKERGTLLVG
450 E_coli_Gln --MKSVLKVS LAALTLAFAV S--------- ---------S HAADKKLVVA
451 Deinococcu -MKKSLLSLK LSGLLVPSVL ALS------- -LSACSSPSS TLNQGTLKIA
452 HISJ_E_COL MKKLVLSLSL VLAFSSATAA F--------- ---------- AAIPQNIRIG
453
454 MSGRYFPFTF VKQ--DKLQG FEVDMWDEIG KRNDYKIEYV TANFSGLFGL
455 ATGQSYPFAY KEN--GKLTG FDVEVMEAVA KKIDMKLDWK LLEFSGLMGE
456 TEGTYEPFTY HDKDTDKLTG YDVEVITEVA KRLGLKVDFK ETQWGSMFAG
457 TEGTYAPFTF HDK-SGKLTG FDVEVIRKVA EKLGLKVEFK ETQWDAMYAG
458 LEGTYPPFSF QGD-DGKLTG FEVEFAQQLA KHLGVEASLK PTKWDGMLAS
459 TDTAFVPFEF KQG--DKYVG FDVDLWAAIA KELKLDYELK PMDFSGIIPA
460 MEGTYPPFTS KNE-QGELVG FDVDIAKAVA QKLNLKPEFV LTEWSGILAG
461 TDPTYAPFES KNS-QGELVG FDIDLAKELC KRINTQCTFV ENPLDALIPS
462
463 LETGRIDTIS NQITMTDARK AKYLFADPYV VDG-AQITVR KGNDSIQGVE
464 LQTGKLDTIS NQVAVTDERK ETYNFTKPYA YAG-TQIVVK KDNTDIKSVD
465 LNSKRFDVVA NQVG-KTDRE DKYDFSDKYT TSR-AVVVTK KDNNDIKSEA
466 LNAKRFDVIA NQTNPSPERL KKYSFTTPYN YSG-GVIVTK SSDNSIKSFE
467 LDSKRIDVVI NQVTISDERK KKYDFSTPYT ISGIQALVKK GNEGTIKTAD
468 LQTKNVDLAL AGITITDERK KAIDFSDGYY KSG-LLVMVK ANNNDVKSVK
469 LQANKYDVIV NQVGITPERQ NSIGFSQPYA YSRPEIIVAK NNTFNPQSLA
470 LKAKKIDAIM SSLSITEKRQ QEIAFTDKLY AADSRLVVAK NSDIQP-TVE
471
472 DLAGKTVAVN LGSNFEQLLR DYDKDGKINI KTYDT--GIE HDVALGRADA
473 DLKGKTVAAV LGSNHAKNLE SKDPDKKINI KTYETQEGTL KDVAYGRVDA
474 DVKGKTSAQS LTSNYNKLAT N----AGAKV EGVEGMAQAL QMIQQARVDM
475 DLKGRKSAQS ATSNWGKDAK A----AGAQI LVVDGLAQSL ELIKQGRAEA
476 DLKGKKVGVG LGTNYEEWLR QNV--QGVDV RTYDDDPTKY QDLRVGRIDA
477 DLDGKVVAVK SGTGSVDYAK AN--IKTKDL RQFPNIDNAY MELGTNRADA
478 DLKGKRVGST LGSNYEKQLI DTG---DIKI VTYPGAPEIL ADLVAGRIDA
479 SLKGKRVGVL QGTTQETFGN EHWAPKGIEI VSYQGQDNIY SDLTAGRIDA
480
481 FIMDRLSALE -LIKKT-GLP LQLAGEPFET I-----QNAW PFVDNEKGRK
482 YVNSRTVLIA -QIKKT-GLP LKLAGDPIVY E-----QVAF PFAKDDAHDK
483 TYNDKLAVLN -YLKTSGNKN VKIAFETGEP Q-----STYF TFRKGS--GE
484 TINDKLAVLD -YFKQHPNSG LKIAYDRGDK T-----PTAF AFLQGE--DA
485 ILVDRLAALD -LVKKT-NDT LAVTGEAFSR Q-----ESGV ALRKGN--ED
486 VLHDTPNILY -FIKTAGNGQ FKAVGDSLEA Q-----QYGI AFPKGS--DE
487 AYNDRLVVNY -IINDQ-KLP VRGAGQIGDA A-----PVGI ALKKGN--SA
488 AFQDEVAASE GFLKQPVGKD YKFGGPSVKD EKLFGVGTGM GLRKED--NE
489
490 LQAEVNKALA EMRADGTVEK ISVKWFGADI TK----
491 LRKKVNKALD ELRKDGTLKK LSEKYFNEDI TVEQKH
492 VVDQVNKALK EMKEDGTLSK ISKKWFGEDV SK----
493 LITKFNQVLE ALRQDGTLKQ ISIEWFGYDI TQ----
494 LLKAVNDAIA EMQKDGTLQA LSEKWFGADV TK----
495 LRDKVNGALK TLRENGTYNE IYKKWFGTEP K-----
496 LKDQIDKALT EMRSDGTFEK ISQKWFGQDV GQP---
497 LREALNKAFA EMRADGTYEK LAKKYFDFDV YGG---
498 """
499
500 from cStringIO import StringIO
501 handle = StringIO(phylip_text)
502 count = 0
503 for alignment in PhylipIterator(handle):
504 for record in alignment:
505 count = count+1
506 print record.id
507
508 assert count == 8
509
510 expected = """mkklvlslsl vlafssataa faaipqniri gtdptyapfe sknsqgelvg
511 fdidlakelc krintqctfv enpldalips lkakkidaim sslsitekrq qeiaftdkly
512 aadsrlvvak nsdiqptves lkgkrvgvlq gttqetfgne hwapkgieiv syqgqdniys
513 dltagridaafqdevaaseg flkqpvgkdy kfggpsvkde klfgvgtgmg lrkednelre
514 alnkafaemradgtyeklak kyfdfdvygg""".replace(" ", "").replace("\n", "").upper()
515 assert str(record.seq).replace("-", "") == expected
516
517
518
519 phylip_text2 = """5 60
520 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAG
521 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGG
522 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGG
523 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGG
524 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGG
525
526 GAAATGGTCAATATTACAAGGT
527 GAAATGGTCAACATTAAAAGAT
528 GAAATCGTCAATATTAAAAGGT
529 GAAATGGTCAATCTTAAAAGGT
530 GAAATGGTCAATATTAAAAGGT"""
531
532 phylip_text3 = """5 60
533 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAGGAAATGGTCAATATTACAAGGT
534 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAACATTAAAAGAT
535 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGGGAAATCGTCAATATTAAAAGGT
536 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGGGAAATGGTCAATCTTAAAAGGT
537 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAATATTAAAAGGT"""
538
539 handle = StringIO(phylip_text2)
540 list2 = list(PhylipIterator(handle))
541 handle.close()
542 assert len(list2) == 1
543 assert len(list2[0]) == 5
544
545 handle = StringIO(phylip_text3)
546 list3 = list(PhylipIterator(handle))
547 handle.close()
548 assert len(list3) == 1
549 assert len(list3[0]) == 5
550
551 for i in range(0, 5):
552 list2[0][i].id == list3[0][i].id
553 str(list2[0][i].seq) == str(list3[0][i].seq)
554
555
556
557
558 phylip_text4 = """ 5 42
559 Turkey AAGCTNGGGC ATTTCAGGGT
560 Salmo gairAAGCCTTGGC AGTGCAGGGT
561 H. SapiensACCGGTTGGC CGTTCAGGGT
562 Chimp AAACCCTTGC CGTTACGCTT
563 Gorilla AAACCCTTGC CGGTACGCTT
564
565 GAGCCCGGGC AATACAGGGT AT
566 GAGCCGTGGC CGGGCACGGT AT
567 ACAGGTTGGC CGTTCAGGGT AA
568 AAACCGAGGC CGGGACACTC AT
569 AAACCATTGC CGGTACGCTT AA"""
570
571
572
573 phylip_text5 = """ 5 42
574 Turkey AAGCTNGGGC ATTTCAGGGT
575 GAGCCCGGGC AATACAGGGT AT
576 Salmo gairAAGCCTTGGC AGTGCAGGGT
577 GAGCCGTGGC CGGGCACGGT AT
578 H. SapiensACCGGTTGGC CGTTCAGGGT
579 ACAGGTTGGC CGTTCAGGGT AA
580 Chimp AAACCCTTGC CGTTACGCTT
581 AAACCGAGGC CGGGACACTC AT
582 Gorilla AAACCCTTGC CGGTACGCTT
583 AAACCATTGC CGGTACGCTT AA"""
584
585 phylip_text5a = """ 5 42
586 Turkey AAGCTNGGGC ATTTCAGGGT GAGCCCGGGC AATACAGGGT AT
587 Salmo gairAAGCCTTGGC AGTGCAGGGT GAGCCGTGGC CGGGCACGGT AT
588 H. SapiensACCGGTTGGC CGTTCAGGGT ACAGGTTGGC CGTTCAGGGT AA
589 Chimp AAACCCTTGC CGTTACGCTT AAACCGAGGC CGGGACACTC AT
590 Gorilla AAACCCTTGC CGGTACGCTT AAACCATTGC CGGTACGCTT AA"""
591
592 handle = StringIO(phylip_text4)
593 list4 = list(PhylipIterator(handle))
594 handle.close()
595 assert len(list4) == 1
596 assert len(list4[0]) == 5
597
598 handle = StringIO(phylip_text5)
599 try:
600 list5 = list(PhylipIterator(handle))
601 assert len(list5) == 1
602 assert len(list5[0]) == 5
603 print "That should have failed..."
604 except ValueError:
605 print "Evil multiline non-interlaced example failed as expected"
606 handle.close()
607
608 handle = StringIO(phylip_text5a)
609 list5 = list(PhylipIterator(handle))
610 handle.close()
611 assert len(list5) == 1
612 assert len(list4[0]) == 5
613
614 print "Concatenation"
615 handle = StringIO(phylip_text4 + "\n" + phylip_text4)
616 assert len(list(PhylipIterator(handle))) == 2
617
618 handle = StringIO(phylip_text3 + "\n" + phylip_text4 + "\n\n\n" + phylip_text)
619 assert len(list(PhylipIterator(handle))) == 3
620
621 print "OK"
622
623 print "Checking write/read"
624 handle = StringIO()
625 PhylipWriter(handle).write_file(list5)
626 handle.seek(0)
627 list6 = list(PhylipIterator(handle))
628 assert len(list5) == len(list6)
629 for a1, a2 in zip(list5, list6):
630 assert len(a1) == len(a2)
631 for r1, r2 in zip(a1, a2):
632 assert r1.id == r2.id
633 assert str(r1.seq) == str(r2.seq)
634 print "Done"
635