Package Bio :: Package AlignIO :: Module PhylipIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.PhylipIO

  1  # Copyright 2006-2013 by Peter Cock.  All rights reserved. 
  2  # Revisions copyright 2011 Brandon Invergo. All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """AlignIO support for "phylip" format from Joe Felsenstein's PHYLIP tools. 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions (or the 
  9  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 10   
 11  Support for "relaxed phylip" format is also provided. Relaxed phylip differs 
 12  from standard phylip format in the following ways: 
 13   
 14   * No whitespace is allowed in the sequence ID. 
 15   * No truncation is performed. Instead, sequence IDs are padded to the longest 
 16     ID length, rather than 10 characters. A space separates the sequence 
 17     identifier from the sequence. 
 18   
 19  Relaxed phylip is supported by RAxML and PHYML. 
 20   
 21  Note 
 22  ==== 
 23  In TREE_PUZZLE (Schmidt et al. 2003) and PHYML (Guindon and Gascuel 2003) 
 24  a dot/period (".") in a sequence is interpreted as meaning the same 
 25  character as in the first sequence.  The PHYLIP documentation from 3.3 to 3.69 
 26  http://evolution.genetics.washington.edu/phylip/doc/sequence.html says: 
 27   
 28     "a period was also previously allowed but it is no longer allowed, 
 29     because it sometimes is used in different senses in other programs" 
 30   
 31  Biopython 1.58 or later treats dots/periods in the sequence as invalid, both 
 32  for reading and writing. Older versions did nothing special with a dot/period. 
 33  """ 
 34  from __future__ import print_function 
 35   
 36  import string 
 37   
 38  from Bio._py3k import range 
 39   
 40  from Bio.Seq import Seq 
 41  from Bio.SeqRecord import SeqRecord 
 42  from Bio.Align import MultipleSeqAlignment 
 43  from .Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 44   
 45  _PHYLIP_ID_WIDTH = 10 
 46   
 47   
48 -class PhylipWriter(SequentialAlignmentWriter):
49 """Phylip alignment writer.""" 50
51 - def write_alignment(self, alignment, id_width=_PHYLIP_ID_WIDTH):
52 """Use this to write (another) single alignment to an open file. 53 54 This code will write interlaced alignments (when the sequences are 55 longer than 50 characters). 56 57 Note that record identifiers are strictly truncated to id_width, 58 defaulting to the value required to comply with the PHYLIP standard. 59 60 For more information on the file format, please see: 61 http://evolution.genetics.washington.edu/phylip/doc/sequence.html 62 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles 63 """ 64 handle = self.handle 65 66 if len(alignment) == 0: 67 raise ValueError("Must have at least one sequence") 68 length_of_seqs = alignment.get_alignment_length() 69 for record in alignment: 70 if length_of_seqs != len(record.seq): 71 raise ValueError("Sequences must all be the same length") 72 if length_of_seqs <= 0: 73 raise ValueError("Non-empty sequences are required") 74 75 # Check for repeated identifiers... 76 # Apply this test *after* cleaning the identifiers 77 names = [] 78 seqs = [] 79 for record in alignment: 80 """ 81 Quoting the PHYLIP version 3.6 documentation: 82 83 The name should be ten characters in length, filled out to 84 the full ten characters by blanks if shorter. Any printable 85 ASCII/ISO character is allowed in the name, except for 86 parentheses ("(" and ")"), square brackets ("[" and "]"), 87 colon (":"), semicolon (";") and comma (","). If you forget 88 to extend the names to ten characters in length by blanks, 89 the program [i.e. PHYLIP] will get out of synchronization 90 with the contents of the data file, and an error message will 91 result. 92 93 Note that Tab characters count as only one character in the 94 species names. Their inclusion can cause trouble. 95 """ 96 name = record.id.strip() 97 #Either remove the banned characters, or map them to something 98 #else like an underscore "_" or pipe "|" character... 99 for char in "[](),": 100 name = name.replace(char, "") 101 for char in ":;": 102 name = name.replace(char, "|") 103 name = name[:id_width] 104 if name in names: 105 raise ValueError("Repeated name %r (originally %r), " 106 "possibly due to truncation" 107 % (name, record.id)) 108 names.append(name) 109 sequence = str(record.seq) 110 if "." in sequence: 111 # Do this check here (once per record, not once per block) 112 raise ValueError("PHYLIP format no longer allows dots in " 113 "sequence") 114 seqs.append(sequence) 115 116 # From experimentation, the use of tabs is not understood by the 117 # EMBOSS suite. The nature of the expected white space is not 118 # defined in the PHYLIP documentation, simply "These are in free 119 # format, separated by blanks". We'll use spaces to keep EMBOSS 120 # happy. 121 handle.write(" %i %s\n" % (len(alignment), length_of_seqs)) 122 block = 0 123 while True: 124 for name, sequence in zip(names, seqs): 125 if block == 0: 126 #Write name (truncated/padded to id_width characters) 127 #Now truncate and right pad to expected length. 128 handle.write(name[:id_width].ljust(id_width)) 129 else: 130 #write indent 131 handle.write(" " * id_width) 132 #Write five chunks of ten letters per line... 133 for chunk in range(0, 5): 134 i = block*50 + chunk*10 135 seq_segment = sequence[i:i+10] 136 #TODO - Force any gaps to be '-' character? Look at the 137 #alphabet... 138 #TODO - How to cope with '?' or '.' in the sequence? 139 handle.write(" %s" % seq_segment) 140 if i+10 > length_of_seqs: 141 break 142 handle.write("\n") 143 block = block+1 144 if block*50 > length_of_seqs: 145 break 146 handle.write("\n")
147 148
149 -class PhylipIterator(AlignmentIterator):
150 """Reads a Phylip alignment file returning a MultipleSeqAlignment iterator. 151 152 Record identifiers are limited to at most 10 characters. 153 154 It only copes with interlaced phylip files! Sequential files won't work 155 where the sequences are split over multiple lines. 156 157 For more information on the file format, please see: 158 http://evolution.genetics.washington.edu/phylip/doc/sequence.html 159 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles 160 """ 161 162 # Default truncation length 163 id_width = _PHYLIP_ID_WIDTH 164
165 - def _is_header(self, line):
166 line = line.strip() 167 parts = [x for x in line.split() if x] 168 if len(parts) != 2: 169 return False # First line should have two integers 170 try: 171 number_of_seqs = int(parts[0]) 172 length_of_seqs = int(parts[1]) 173 return True 174 except ValueError: 175 return False # First line should have two integers
176
177 - def _split_id(self, line):
178 """ 179 Extracts the sequence ID from a Phylip line, returning a tuple 180 containing: 181 182 (sequence_id, sequence_residues) 183 184 The first 10 characters in the line are are the sequence id, the 185 remainder are sequence data. 186 """ 187 seq_id = line[:self.id_width].strip() 188 seq = line[self.id_width:].strip().replace(' ', '') 189 return seq_id, seq
190
191 - def __next__(self):
192 handle = self.handle 193 194 try: 195 #Header we saved from when we were parsing 196 #the previous alignment. 197 line = self._header 198 del self._header 199 except AttributeError: 200 line = handle.readline() 201 202 if not line: 203 raise StopIteration 204 line = line.strip() 205 parts = [x for x in line.split() if x] 206 if len(parts) != 2: 207 raise ValueError("First line should have two integers") 208 try: 209 number_of_seqs = int(parts[0]) 210 length_of_seqs = int(parts[1]) 211 except ValueError: 212 raise ValueError("First line should have two integers") 213 214 assert self._is_header(line) 215 216 if self.records_per_alignment is not None \ 217 and self.records_per_alignment != number_of_seqs: 218 raise ValueError("Found %i records in this alignment, told to expect %i" 219 % (number_of_seqs, self.records_per_alignment)) 220 221 ids = [] 222 seqs = [] 223 224 # By default, expects STRICT truncation / padding to 10 characters. 225 # Does not require any whitespace between name and seq. 226 for i in range(number_of_seqs): 227 line = handle.readline().rstrip() 228 sequence_id, s = self._split_id(line) 229 ids.append(sequence_id) 230 if "." in s: 231 raise ValueError("PHYLIP format no longer allows dots in sequence") 232 seqs.append([s]) 233 234 #Look for further blocks 235 line = "" 236 while True: 237 #Skip any blank lines between blocks... 238 while "" == line.strip(): 239 line = handle.readline() 240 if not line: 241 break # end of file 242 if not line: 243 break # end of file 244 245 if self._is_header(line): 246 #Looks like the start of a concatenated alignment 247 self._header = line 248 break 249 250 #print "New block..." 251 for i in range(number_of_seqs): 252 s = line.strip().replace(" ", "") 253 if "." in s: 254 raise ValueError("PHYLIP format no longer allows dots in sequence") 255 seqs[i].append(s) 256 line = handle.readline() 257 if (not line) and i+1 < number_of_seqs: 258 raise ValueError("End of file mid-block") 259 if not line: 260 break # end of file 261 262 records = (SeqRecord(Seq("".join(s), self.alphabet), 263 id=i, name=i, description=i) 264 for (i, s) in zip(ids, seqs)) 265 return MultipleSeqAlignment(records, self.alphabet)
266 267 268 # Relaxed Phylip
269 -class RelaxedPhylipWriter(PhylipWriter):
270 """ 271 Relaxed Phylip format writer 272 """ 273
274 - def write_alignment(self, alignment):
275 """ 276 Write a relaxed phylip alignment 277 """ 278 # Check inputs 279 for name in (s.id.strip() for s in alignment): 280 if any(c in name for c in string.whitespace): 281 raise ValueError("Whitespace not allowed in identifier: %s" 282 % name) 283 284 # Calculate a truncation length - maximum length of sequence ID plus a 285 # single character for padding 286 # If no sequences, set id_width to 1. super(...) call will raise a 287 # ValueError 288 if len(alignment) == 0: 289 id_width = 1 290 else: 291 id_width = max((len(s.id.strip()) for s in alignment)) + 1 292 super(RelaxedPhylipWriter, self).write_alignment(alignment, id_width)
293 294
295 -class RelaxedPhylipIterator(PhylipIterator):
296 """ 297 Relaxed Phylip format Iterator 298 """ 299
300 - def _split_id(self, line):
301 """Returns the ID, sequence data from a line 302 Extracts the sequence ID from a Phylip line, returning a tuple 303 containing: 304 305 (sequence_id, sequence_residues) 306 307 For relaxed format - split at the first whitespace character 308 """ 309 seq_id, sequence = line.split(None, 1) 310 sequence = sequence.strip().replace(" ", "") 311 return seq_id, sequence
312 313
314 -class SequentialPhylipWriter(SequentialAlignmentWriter):
315 """ 316 Sequential Phylip format Writer 317 """
318 - def write_alignment(self, alignment, id_width=_PHYLIP_ID_WIDTH):
319 handle = self.handle 320 321 if len(alignment) == 0: 322 raise ValueError("Must have at least one sequence") 323 length_of_seqs = alignment.get_alignment_length() 324 for record in alignment: 325 if length_of_seqs != len(record.seq): 326 raise ValueError("Sequences must all be the same length") 327 if length_of_seqs <= 0: 328 raise ValueError("Non-empty sequences are required") 329 330 # Check for repeated identifiers... 331 # Apply this test *after* cleaning the identifiers 332 names = [] 333 for record in alignment: 334 name = record.id.strip() 335 #Either remove the banned characters, or map them to something 336 #else like an underscore "_" or pipe "|" character... 337 for char in "[](),": 338 name = name.replace(char, "") 339 for char in ":;": 340 name = name.replace(char, "|") 341 name = name[:id_width] 342 if name in names: 343 raise ValueError("Repeated name %r (originally %r), " 344 "possibly due to truncation" 345 % (name, record.id)) 346 names.append(name) 347 348 # From experimentation, the use of tabs is not understood by the 349 # EMBOSS suite. The nature of the expected white space is not 350 # defined in the PHYLIP documentation, simply "These are in free 351 # format, separated by blanks". We'll use spaces to keep EMBOSS 352 # happy. 353 handle.write(" %i %s\n" % (len(alignment), length_of_seqs)) 354 for name, record in zip(names, alignment): 355 sequence = str(record.seq) 356 if "." in sequence: 357 raise ValueError("PHYLIP format no longer allows dots in " 358 "sequence") 359 handle.write(name[:id_width].ljust(id_width)) 360 # Write the entire sequence to one line (see sequential format 361 # notes in the SequentialPhylipIterator docstring 362 handle.write(sequence) 363 handle.write("\n")
364 365
366 -class SequentialPhylipIterator(PhylipIterator):
367 """ 368 Sequential Phylip format Iterator 369 370 The sequential format carries the same restrictions as the normal 371 interleaved one, with the difference being that the sequences are listed 372 sequentially, each sequence written in its entirety before the start of 373 the next. According to the PHYLIP documentation for input file formatting, 374 newlines and spaces may optionally be entered at any point in the sequences. 375 """
376 - def __next__(self):
377 handle = self.handle 378 379 try: 380 #Header we saved from when we were parsing 381 #the previous alignment. 382 line = self._header 383 del self._header 384 except AttributeError: 385 line = handle.readline() 386 387 if not line: 388 raise StopIteration 389 line = line.strip() 390 parts = [x for x in line.split() if x] 391 if len(parts) != 2: 392 raise ValueError("First line should have two integers") 393 try: 394 number_of_seqs = int(parts[0]) 395 length_of_seqs = int(parts[1]) 396 except ValueError: 397 raise ValueError("First line should have two integers") 398 399 assert self._is_header(line) 400 401 if self.records_per_alignment is not None \ 402 and self.records_per_alignment != number_of_seqs: 403 raise ValueError("Found %i records in this alignment, told to expect %i" 404 % (number_of_seqs, self.records_per_alignment)) 405 406 ids = [] 407 seqs = [] 408 409 # By default, expects STRICT truncation / padding to 10 characters. 410 # Does not require any whitespace between name and seq. 411 for i in range(number_of_seqs): 412 line = handle.readline().rstrip() 413 sequence_id, s = self._split_id(line) 414 ids.append(sequence_id) 415 while len(s) < length_of_seqs: 416 # The sequence may be split into multiple lines 417 line = handle.readline().strip() 418 if not line: 419 break 420 if line == "": 421 continue 422 s = "".join([s, line.strip().replace(" ", "")]) 423 if len(s) > length_of_seqs: 424 raise ValueError("Found a record of length %i, should be %i" 425 % (len(s), length_of_seqs)) 426 if "." in s: 427 raise ValueError("PHYLIP format no longer allows dots in sequence") 428 seqs.append(s) 429 while True: 430 # Find other alignments in the file 431 line = handle.readline() 432 if not line: 433 break 434 if self._is_header(line): 435 self._header = line 436 break 437 438 records = (SeqRecord(Seq(s, self.alphabet), 439 id=i, name=i, description=i) 440 for (i, s) in zip(ids, seqs)) 441 return MultipleSeqAlignment(records, self.alphabet)
442 443 444 if __name__ == "__main__": 445 print("Running short mini-test") 446 447 phylip_text = """ 8 286 448 V_Harveyi_ --MKNWIKVA VAAIA--LSA A--------- ---------T VQAATEVKVG 449 B_subtilis MKMKKWTVLV VAALLAVLSA CG-------- ----NGNSSS KEDDNVLHVG 450 B_subtilis MKKALLALFM VVSIAALAAC GAGNDNQSKD NAKDGDLWAS IKKKGVLTVG 451 YA80_HAEIN MKKLLFTTAL LTGAIAFSTF ---------- -SHAGEIADR VEKTKTLLVG 452 FLIY_ECOLI MKLAHLGRQA LMGVMAVALV AG---MSVKS FADEG-LLNK VKERGTLLVG 453 E_coli_Gln --MKSVLKVS LAALTLAFAV S--------- ---------S HAADKKLVVA 454 Deinococcu -MKKSLLSLK LSGLLVPSVL ALS------- -LSACSSPSS TLNQGTLKIA 455 HISJ_E_COL MKKLVLSLSL VLAFSSATAA F--------- ---------- AAIPQNIRIG 456 457 MSGRYFPFTF VKQ--DKLQG FEVDMWDEIG KRNDYKIEYV TANFSGLFGL 458 ATGQSYPFAY KEN--GKLTG FDVEVMEAVA KKIDMKLDWK LLEFSGLMGE 459 TEGTYEPFTY HDKDTDKLTG YDVEVITEVA KRLGLKVDFK ETQWGSMFAG 460 TEGTYAPFTF HDK-SGKLTG FDVEVIRKVA EKLGLKVEFK ETQWDAMYAG 461 LEGTYPPFSF QGD-DGKLTG FEVEFAQQLA KHLGVEASLK PTKWDGMLAS 462 TDTAFVPFEF KQG--DKYVG FDVDLWAAIA KELKLDYELK PMDFSGIIPA 463 MEGTYPPFTS KNE-QGELVG FDVDIAKAVA QKLNLKPEFV LTEWSGILAG 464 TDPTYAPFES KNS-QGELVG FDIDLAKELC KRINTQCTFV ENPLDALIPS 465 466 LETGRIDTIS NQITMTDARK AKYLFADPYV VDG-AQITVR KGNDSIQGVE 467 LQTGKLDTIS NQVAVTDERK ETYNFTKPYA YAG-TQIVVK KDNTDIKSVD 468 LNSKRFDVVA NQVG-KTDRE DKYDFSDKYT TSR-AVVVTK KDNNDIKSEA 469 LNAKRFDVIA NQTNPSPERL KKYSFTTPYN YSG-GVIVTK SSDNSIKSFE 470 LDSKRIDVVI NQVTISDERK KKYDFSTPYT ISGIQALVKK GNEGTIKTAD 471 LQTKNVDLAL AGITITDERK KAIDFSDGYY KSG-LLVMVK ANNNDVKSVK 472 LQANKYDVIV NQVGITPERQ NSIGFSQPYA YSRPEIIVAK NNTFNPQSLA 473 LKAKKIDAIM SSLSITEKRQ QEIAFTDKLY AADSRLVVAK NSDIQP-TVE 474 475 DLAGKTVAVN LGSNFEQLLR DYDKDGKINI KTYDT--GIE HDVALGRADA 476 DLKGKTVAAV LGSNHAKNLE SKDPDKKINI KTYETQEGTL KDVAYGRVDA 477 DVKGKTSAQS LTSNYNKLAT N----AGAKV EGVEGMAQAL QMIQQARVDM 478 DLKGRKSAQS ATSNWGKDAK A----AGAQI LVVDGLAQSL ELIKQGRAEA 479 DLKGKKVGVG LGTNYEEWLR QNV--QGVDV RTYDDDPTKY QDLRVGRIDA 480 DLDGKVVAVK SGTGSVDYAK AN--IKTKDL RQFPNIDNAY MELGTNRADA 481 DLKGKRVGST LGSNYEKQLI DTG---DIKI VTYPGAPEIL ADLVAGRIDA 482 SLKGKRVGVL QGTTQETFGN EHWAPKGIEI VSYQGQDNIY SDLTAGRIDA 483 484 FIMDRLSALE -LIKKT-GLP LQLAGEPFET I-----QNAW PFVDNEKGRK 485 YVNSRTVLIA -QIKKT-GLP LKLAGDPIVY E-----QVAF PFAKDDAHDK 486 TYNDKLAVLN -YLKTSGNKN VKIAFETGEP Q-----STYF TFRKGS--GE 487 TINDKLAVLD -YFKQHPNSG LKIAYDRGDK T-----PTAF AFLQGE--DA 488 ILVDRLAALD -LVKKT-NDT LAVTGEAFSR Q-----ESGV ALRKGN--ED 489 VLHDTPNILY -FIKTAGNGQ FKAVGDSLEA Q-----QYGI AFPKGS--DE 490 AYNDRLVVNY -IINDQ-KLP VRGAGQIGDA A-----PVGI ALKKGN--SA 491 AFQDEVAASE GFLKQPVGKD YKFGGPSVKD EKLFGVGTGM GLRKED--NE 492 493 LQAEVNKALA EMRADGTVEK ISVKWFGADI TK---- 494 LRKKVNKALD ELRKDGTLKK LSEKYFNEDI TVEQKH 495 VVDQVNKALK EMKEDGTLSK ISKKWFGEDV SK---- 496 LITKFNQVLE ALRQDGTLKQ ISIEWFGYDI TQ---- 497 LLKAVNDAIA EMQKDGTLQA LSEKWFGADV TK---- 498 LRDKVNGALK TLRENGTYNE IYKKWFGTEP K----- 499 LKDQIDKALT EMRSDGTFEK ISQKWFGQDV GQP--- 500 LREALNKAFA EMRADGTYEK LAKKYFDFDV YGG--- 501 """ 502 503 from Bio._py3k import StringIO 504 handle = StringIO(phylip_text) 505 count = 0 506 for alignment in PhylipIterator(handle): 507 for record in alignment: 508 count = count+1 509 print(record.id) 510 #print str(record.seq) 511 assert count == 8 512 513 expected = """mkklvlslsl vlafssataa faaipqniri gtdptyapfe sknsqgelvg 514 fdidlakelc krintqctfv enpldalips lkakkidaim sslsitekrq qeiaftdkly 515 aadsrlvvak nsdiqptves lkgkrvgvlq gttqetfgne hwapkgieiv syqgqdniys 516 dltagridaafqdevaaseg flkqpvgkdy kfggpsvkde klfgvgtgmg lrkednelre 517 alnkafaemradgtyeklak kyfdfdvygg""".replace(" ", "").replace("\n", "").upper() 518 assert str(record.seq).replace("-", "") == expected 519 520 #From here: 521 #http://atgc.lirmm.fr/phyml/usersguide.html 522 phylip_text2 = """5 60 523 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAG 524 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGG 525 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGG 526 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGG 527 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGG 528 529 GAAATGGTCAATATTACAAGGT 530 GAAATGGTCAACATTAAAAGAT 531 GAAATCGTCAATATTAAAAGGT 532 GAAATGGTCAATCTTAAAAGGT 533 GAAATGGTCAATATTAAAAGGT""" 534 535 phylip_text3 = """5 60 536 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAGGAAATGGTCAATATTACAAGGT 537 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAACATTAAAAGAT 538 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGGGAAATCGTCAATATTAAAAGGT 539 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGGGAAATGGTCAATCTTAAAAGGT 540 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAATATTAAAAGGT""" 541 542 handle = StringIO(phylip_text2) 543 list2 = list(PhylipIterator(handle)) 544 handle.close() 545 assert len(list2) == 1 546 assert len(list2[0]) == 5 547 548 handle = StringIO(phylip_text3) 549 list3 = list(PhylipIterator(handle)) 550 handle.close() 551 assert len(list3) == 1 552 assert len(list3[0]) == 5 553 554 for i in range(0, 5): 555 list2[0][i].id == list3[0][i].id 556 str(list2[0][i].seq) == str(list3[0][i].seq) 557 558 #From here: 559 #http://evolution.genetics.washington.edu/phylip/doc/sequence.html 560 #Note the lack of any white space between names 2 and 3 and their seqs. 561 phylip_text4 = """ 5 42 562 Turkey AAGCTNGGGC ATTTCAGGGT 563 Salmo gairAAGCCTTGGC AGTGCAGGGT 564 H. SapiensACCGGTTGGC CGTTCAGGGT 565 Chimp AAACCCTTGC CGTTACGCTT 566 Gorilla AAACCCTTGC CGGTACGCTT 567 568 GAGCCCGGGC AATACAGGGT AT 569 GAGCCGTGGC CGGGCACGGT AT 570 ACAGGTTGGC CGTTCAGGGT AA 571 AAACCGAGGC CGGGACACTC AT 572 AAACCATTGC CGGTACGCTT AA""" 573 574 #From here: 575 #http://evolution.genetics.washington.edu/phylip/doc/sequence.html 576 phylip_text5 = """ 5 42 577 Turkey AAGCTNGGGC ATTTCAGGGT 578 GAGCCCGGGC AATACAGGGT AT 579 Salmo gairAAGCCTTGGC AGTGCAGGGT 580 GAGCCGTGGC CGGGCACGGT AT 581 H. SapiensACCGGTTGGC CGTTCAGGGT 582 ACAGGTTGGC CGTTCAGGGT AA 583 Chimp AAACCCTTGC CGTTACGCTT 584 AAACCGAGGC CGGGACACTC AT 585 Gorilla AAACCCTTGC CGGTACGCTT 586 AAACCATTGC CGGTACGCTT AA""" 587 588 phylip_text5a = """ 5 42 589 Turkey AAGCTNGGGC ATTTCAGGGT GAGCCCGGGC AATACAGGGT AT 590 Salmo gairAAGCCTTGGC AGTGCAGGGT GAGCCGTGGC CGGGCACGGT AT 591 H. SapiensACCGGTTGGC CGTTCAGGGT ACAGGTTGGC CGTTCAGGGT AA 592 Chimp AAACCCTTGC CGTTACGCTT AAACCGAGGC CGGGACACTC AT 593 Gorilla AAACCCTTGC CGGTACGCTT AAACCATTGC CGGTACGCTT AA""" 594 595 handle = StringIO(phylip_text4) 596 list4 = list(PhylipIterator(handle)) 597 handle.close() 598 assert len(list4) == 1 599 assert len(list4[0]) == 5 600 601 handle = StringIO(phylip_text5) 602 try: 603 list5 = list(PhylipIterator(handle)) 604 assert len(list5) == 1 605 assert len(list5[0]) == 5 606 print("That should have failed...") 607 except ValueError: 608 print("Evil multiline non-interlaced example failed as expected") 609 handle.close() 610 611 handle = StringIO(phylip_text5a) 612 list5 = list(PhylipIterator(handle)) 613 handle.close() 614 assert len(list5) == 1 615 assert len(list4[0]) == 5 616 617 print("Concatenation") 618 handle = StringIO(phylip_text4 + "\n" + phylip_text4) 619 assert len(list(PhylipIterator(handle))) == 2 620 621 handle = StringIO(phylip_text3 + "\n" + phylip_text4 + "\n\n\n" + phylip_text) 622 assert len(list(PhylipIterator(handle))) == 3 623 624 print("OK") 625 626 print("Checking write/read") 627 handle = StringIO() 628 PhylipWriter(handle).write_file(list5) 629 handle.seek(0) 630 list6 = list(PhylipIterator(handle)) 631 assert len(list5) == len(list6) 632 for a1, a2 in zip(list5, list6): 633 assert len(a1) == len(a2) 634 for r1, r2 in zip(a1, a2): 635 assert r1.id == r2.id 636 assert str(r1.seq) == str(r2.seq) 637 print("Done") 638