Package Bio :: Package AlignIO :: Module FastaIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.FastaIO

  1  # Copyright 2008-2011 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Bio.AlignIO support for "fasta-m10" output from Bill Pearson's FASTA tools. 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions (or the 
  9  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 10   
 11  This module contains a parser for the pairwise alignments produced by Bill 
 12  Pearson's FASTA tools, for use from the Bio.AlignIO interface where it is 
 13  refered to as the "fasta-m10" file format (as we only support the machine 
 14  readable output format selected with the -m 10 command line option). 
 15   
 16  This module does NOT cover the generic "fasta" file format originally 
 17  developed as an input format to the FASTA tools.  The Bio.AlignIO and 
 18  Bio.SeqIO both use the Bio.SeqIO.FastaIO module to deal with these files, 
 19  which can also be used to store a multiple sequence alignments. 
 20  """ 
 21   
 22  from __future__ import print_function 
 23   
 24  from Bio.Seq import Seq 
 25  from Bio.SeqRecord import SeqRecord 
 26  from Bio.Align import MultipleSeqAlignment 
 27  from Bio.Alphabet import single_letter_alphabet, generic_dna, generic_protein 
 28  from Bio.Alphabet import Gapped 
 29   
 30  __docformat__ = "restructuredtext en" 
 31   
 32   
33 -def _extract_alignment_region(alignment_seq_with_flanking, annotation):
34 """Helper function for the main parsing code (PRIVATE). 35 36 To get the actual pairwise alignment sequences, we must first 37 translate the un-gapped sequence based coordinates into positions 38 in the gapped sequence (which may have a flanking region shown 39 using leading - characters). To date, I have never seen any 40 trailing flanking region shown in the m10 file, but the 41 following code should also cope with that. 42 43 Note that this code seems to work fine even when the "sq_offset" 44 entries are prsent as a result of using the -X command line option. 45 """ 46 align_stripped = alignment_seq_with_flanking.strip("-") 47 display_start = int(annotation['al_display_start']) 48 if int(annotation['al_start']) <= int(annotation['al_stop']): 49 start = int(annotation['al_start']) \ 50 - display_start 51 end = int(annotation['al_stop']) \ 52 - display_start + 1 53 else: 54 # FASTA has flipped this sequence... 55 start = display_start \ 56 - int(annotation['al_start']) 57 end = display_start \ 58 - int(annotation['al_stop']) + 1 59 end += align_stripped.count("-") 60 assert 0 <= start and start < end and end <= len(align_stripped), \ 61 "Problem with sequence start/stop,\n%s[%i:%i]\n%s" \ 62 % (alignment_seq_with_flanking, start, end, annotation) 63 return align_stripped[start:end]
64 65
66 -def FastaM10Iterator(handle, alphabet=single_letter_alphabet):
67 """Alignment iterator for the FASTA tool's pairwise alignment output. 68 69 This is for reading the pairwise alignments output by Bill Pearson's 70 FASTA program when called with the -m 10 command line option for machine 71 readable output. For more details about the FASTA tools, see the website 72 http://fasta.bioch.virginia.edu/ and the paper: 73 74 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 75 76 This class is intended to be used via the Bio.AlignIO.parse() function 77 by specifying the format as "fasta-m10" as shown in the following code:: 78 79 from Bio import AlignIO 80 handle = ... 81 for a in AlignIO.parse(handle, "fasta-m10"): 82 assert len(a) == 2, "Should be pairwise!" 83 print("Alignment length %i" % a.get_alignment_length()) 84 for record in a: 85 print("%s %s %s" % (record.seq, record.name, record.id)) 86 87 Note that this is not a full blown parser for all the information 88 in the FASTA output - for example, most of the header and all of the 89 footer is ignored. Also, the alignments are not batched according to 90 the input queries. 91 92 Also note that there can be up to about 30 letters of flanking region 93 included in the raw FASTA output as contextual information. This is NOT 94 part of the alignment itself, and is not included in the resulting 95 MultipleSeqAlignment objects returned. 96 """ 97 if alphabet is None: 98 alphabet = single_letter_alphabet 99 100 state_PREAMBLE = -1 101 state_NONE = 0 102 state_QUERY_HEADER = 1 103 state_ALIGN_HEADER = 2 104 state_ALIGN_QUERY = 3 105 state_ALIGN_MATCH = 4 106 state_ALIGN_CONS = 5 107 108 def build_hsp(): 109 if not query_tags and not match_tags: 110 raise ValueError("No data for query %r, match %r" 111 % (query_id, match_id)) 112 assert query_tags, query_tags 113 assert match_tags, match_tags 114 evalue = align_tags.get("fa_expect", None) 115 q = "?" # Just for printing len(q) in debug below 116 m = "?" # Just for printing len(m) in debug below 117 tool = global_tags.get("tool", "").upper() 118 try: 119 q = _extract_alignment_region(query_seq, query_tags) 120 if tool in ["TFASTX"] and len(match_seq) == len(q): 121 m = match_seq 122 # Quick hack until I can work out how -, * and / characters 123 # and the apparent mix of aa and bp coordinates works. 124 else: 125 m = _extract_alignment_region(match_seq, match_tags) 126 assert len(q) == len(m) 127 except AssertionError as err: 128 print("Darn... amino acids vs nucleotide coordinates?") 129 print(tool) 130 print(query_seq) 131 print(query_tags) 132 print("%s %i" % (q, len(q))) 133 print(match_seq) 134 print(match_tags) 135 print("%s %i" % (m, len(m))) 136 print(handle.name) 137 raise err 138 139 assert alphabet is not None 140 alignment = MultipleSeqAlignment([], alphabet) 141 142 # TODO - Introduce an annotated alignment class? 143 # For now, store the annotation a new private property: 144 alignment._annotations = {} 145 146 # Want to record both the query header tags, and the alignment tags. 147 for key, value in header_tags.items(): 148 alignment._annotations[key] = value 149 for key, value in align_tags.items(): 150 alignment._annotations[key] = value 151 152 # Query 153 # ===== 154 record = SeqRecord(Seq(q, alphabet), 155 id=query_id, 156 name="query", 157 description=query_descr, 158 annotations={"original_length": int(query_tags["sq_len"])}) 159 # TODO - handle start/end coordinates properly. Short term hack for now: 160 record._al_start = int(query_tags["al_start"]) 161 record._al_stop = int(query_tags["al_stop"]) 162 alignment.append(record) 163 164 # TODO - What if a specific alphabet has been requested? 165 # TODO - Use an IUPAC alphabet? 166 # TODO - Can FASTA output RNA? 167 if alphabet == single_letter_alphabet and "sq_type" in query_tags: 168 if query_tags["sq_type"] == "D": 169 record.seq.alphabet = generic_dna 170 elif query_tags["sq_type"] == "p": 171 record.seq.alphabet = generic_protein 172 if "-" in q: 173 if not hasattr(record.seq.alphabet, "gap_char"): 174 record.seq.alphabet = Gapped(record.seq.alphabet, "-") 175 176 # Match 177 # ===== 178 record = SeqRecord(Seq(m, alphabet), 179 id=match_id, 180 name="match", 181 description=match_descr, 182 annotations={"original_length": int(match_tags["sq_len"])}) 183 # TODO - handle start/end coordinates properly. Short term hack for now: 184 record._al_start = int(match_tags["al_start"]) 185 record._al_stop = int(match_tags["al_stop"]) 186 alignment.append(record) 187 188 # This is still a very crude way of dealing with the alphabet: 189 if alphabet == single_letter_alphabet and "sq_type" in match_tags: 190 if match_tags["sq_type"] == "D": 191 record.seq.alphabet = generic_dna 192 elif match_tags["sq_type"] == "p": 193 record.seq.alphabet = generic_protein 194 if "-" in m: 195 if not hasattr(record.seq.alphabet, "gap_char"): 196 record.seq.alphabet = Gapped(record.seq.alphabet, "-") 197 198 return alignment
199 200 state = state_PREAMBLE 201 query_id = None 202 match_id = None 203 query_descr = "" 204 match_descr = "" 205 global_tags = {} 206 header_tags = {} 207 align_tags = {} 208 query_tags = {} 209 match_tags = {} 210 query_seq = "" 211 match_seq = "" 212 cons_seq = "" 213 for line in handle: 214 if ">>>" in line and not line.startswith(">>>"): 215 if query_id and match_id: 216 # This happens on old FASTA output which lacked an end of 217 # query >>><<< marker line. 218 yield build_hsp() 219 state = state_NONE 220 query_descr = line[line.find(">>>") + 3:].strip() 221 query_id = query_descr.split(None, 1)[0] 222 match_id = None 223 header_tags = {} 224 align_tags = {} 225 query_tags = {} 226 match_tags = {} 227 query_seq = "" 228 match_seq = "" 229 cons_seq = "" 230 elif line.startswith("!! No "): 231 # e.g. 232 # !! No library sequences with E() < 0.5 233 # or on more recent versions, 234 # No sequences with E() < 0.05 235 assert state == state_NONE 236 assert not header_tags 237 assert not align_tags 238 assert not match_tags 239 assert not query_tags 240 assert match_id is None 241 assert not query_seq 242 assert not match_seq 243 assert not cons_seq 244 query_id = None 245 elif line.strip() in [">>><<<", ">>>///"]: 246 # End of query, possible end of all queries 247 if query_id and match_id: 248 yield build_hsp() 249 state = state_NONE 250 query_id = None 251 match_id = None 252 header_tags = {} 253 align_tags = {} 254 query_tags = {} 255 match_tags = {} 256 query_seq = "" 257 match_seq = "" 258 cons_seq = "" 259 elif line.startswith(">>>"): 260 # Should be start of a match! 261 assert query_id is not None 262 assert line[3:].split(", ", 1)[0] == query_id, line 263 assert match_id is None 264 assert not header_tags 265 assert not align_tags 266 assert not query_tags 267 assert not match_tags 268 assert not match_seq 269 assert not query_seq 270 assert not cons_seq 271 state = state_QUERY_HEADER 272 elif line.startswith(">>"): 273 # Should now be at start of a match alignment! 274 if query_id and match_id: 275 yield build_hsp() 276 align_tags = {} 277 query_tags = {} 278 match_tags = {} 279 query_seq = "" 280 match_seq = "" 281 cons_seq = "" 282 match_descr = line[2:].strip() 283 match_id = match_descr.split(None, 1)[0] 284 state = state_ALIGN_HEADER 285 elif line.startswith(">--"): 286 # End of one HSP 287 assert query_id and match_id, line 288 yield build_hsp() 289 # Clean up read for next HSP 290 # but reuse header_tags 291 align_tags = {} 292 query_tags = {} 293 match_tags = {} 294 query_seq = "" 295 match_seq = "" 296 cons_seq = "" 297 state = state_ALIGN_HEADER 298 elif line.startswith(">"): 299 if state == state_ALIGN_HEADER: 300 # Should be start of query alignment seq... 301 assert query_id is not None, line 302 assert match_id is not None, line 303 assert query_id.startswith(line[1:].split(None, 1)[0]), line 304 state = state_ALIGN_QUERY 305 elif state == state_ALIGN_QUERY: 306 # Should be start of match alignment seq 307 assert query_id is not None, line 308 assert match_id is not None, line 309 assert match_id.startswith(line[1:].split(None, 1)[0]), line 310 state = state_ALIGN_MATCH 311 elif state == state_NONE: 312 # Can get > as the last line of a histogram 313 pass 314 else: 315 assert False, "state %i got %r" % (state, line) 316 elif line.startswith("; al_cons"): 317 assert state == state_ALIGN_MATCH, line 318 state = state_ALIGN_CONS 319 # Next line(s) should be consensus seq... 320 elif line.startswith("; "): 321 if ": " in line: 322 key, value = [s.strip() for s in line[2:].split(": ", 1)] 323 else: 324 import warnings 325 # Seen in lalign36, specifically version 36.3.4 Apr, 2011 326 # Fixed in version 36.3.5b Oct, 2011(preload8) 327 warnings.warn("Missing colon in line: %r" % line) 328 try: 329 key, value = [s.strip() for s in line[2:].split(" ", 1)] 330 except ValueError: 331 raise ValueError("Bad line: %r" % line) 332 if state == state_QUERY_HEADER: 333 header_tags[key] = value 334 elif state == state_ALIGN_HEADER: 335 align_tags[key] = value 336 elif state == state_ALIGN_QUERY: 337 query_tags[key] = value 338 elif state == state_ALIGN_MATCH: 339 match_tags[key] = value 340 else: 341 assert False, "Unexpected state %r, %r" % (state, line) 342 elif state == state_ALIGN_QUERY: 343 query_seq += line.strip() 344 elif state == state_ALIGN_MATCH: 345 match_seq += line.strip() 346 elif state == state_ALIGN_CONS: 347 cons_seq += line.strip("\n") 348 elif state == state_PREAMBLE: 349 if line.startswith("#"): 350 global_tags["command"] = line[1:].strip() 351 elif line.startswith(" version "): 352 global_tags["version"] = line[9:].strip() 353 elif " compares a " in line: 354 global_tags["tool"] = line[:line.find(" compares a ")].strip() 355 elif " searches a " in line: 356 global_tags["tool"] = line[:line.find(" searches a ")].strip() 357 else: 358 pass 359 360 361 if __name__ == "__main__": 362 print("Running a quick self-test") 363 364 # http://emboss.sourceforge.net/docs/themes/alnformats/align.simple 365 simple_example = \ 366 """# /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 367 FASTA searches a protein or DNA sequence data bank 368 version 34.26 January 12, 2007 369 Please cite: 370 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 371 372 Query library NC_002127.faa vs NC_009649.faa library 373 searching NC_009649.faa library 374 375 1>>>gi|10955263|ref|NP_052604.1| plasmid mobilization [Escherichia coli O157:H7 s 107 aa - 107 aa 376 vs NC_009649.faa library 377 378 45119 residues in 180 sequences 379 Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 380 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 381 Lambda= 0.175043 382 383 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 384 join: 36, opt: 24, open/ext: -10/-2, width: 16 385 Scan time: 0.000 386 The best scores are: opt bits E(180) 387 gi|152973457|ref|YP_001338508.1| ATPase with chape ( 931) 71 24.9 0.58 388 gi|152973588|ref|YP_001338639.1| F pilus assembly ( 459) 63 23.1 0.99 389 390 >>>gi|10955263|ref|NP_052604.1|, 107 aa vs NC_009649.faa library 391 ; pg_name: /opt/fasta/fasta34 392 ; pg_ver: 34.26 393 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 394 ; pg_name: FASTA 395 ; pg_ver: 3.5 Sept 2006 396 ; pg_matrix: BL50 (15:-5) 397 ; pg_open-ext: -10 -2 398 ; pg_ktup: 2 399 ; pg_optcut: 24 400 ; pg_cgap: 36 401 ; mp_extrap: 60000 180 402 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 Lambda= 0.175043 403 ; mp_KS: -0.0000 (N=0) at 8159228 404 >>gi|152973457|ref|YP_001338508.1| ATPase with chaperone activity, ATP-binding subunit [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 405 ; fa_frame: f 406 ; fa_initn: 65 407 ; fa_init1: 43 408 ; fa_opt: 71 409 ; fa_z-score: 90.3 410 ; fa_bits: 24.9 411 ; fa_expect: 0.58 412 ; sw_score: 71 413 ; sw_ident: 0.250 414 ; sw_sim: 0.574 415 ; sw_overlap: 108 416 >gi|10955263| .. 417 ; sq_len: 107 418 ; sq_offset: 1 419 ; sq_type: p 420 ; al_start: 5 421 ; al_stop: 103 422 ; al_display_start: 1 423 --------------------------MTKRSGSNT-RRRAISRPVRLTAE 424 ED---QEIRKRAAECGKTVSGFLRAAALGKKVNSLTDDRVLKEVM----- 425 RLGALQKKLFIDGKRVGDREYAEVLIAITEYHRALLSRLMAD 426 >gi|152973457|ref|YP_001338508.1| .. 427 ; sq_len: 931 428 ; sq_type: p 429 ; al_start: 96 430 ; al_stop: 195 431 ; al_display_start: 66 432 SDFFRIGDDATPVAADTDDVVDASFGEPAAAGSGAPRRRGSGLASRISEQ 433 SEALLQEAAKHAAEFGRS------EVDTEHLLLALADSDVVKTILGQFKI 434 KVDDLKRQIESEAKR-GDKPF-EGEIGVSPRVKDALSRAFVASNELGHSY 435 VGPEHFLIGLAEEGEGLAANLLRRYGLTPQ 436 >>gi|152973588|ref|YP_001338639.1| F pilus assembly protein [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 437 ; fa_frame: f 438 ; fa_initn: 33 439 ; fa_init1: 33 440 ; fa_opt: 63 441 ; fa_z-score: 86.1 442 ; fa_bits: 23.1 443 ; fa_expect: 0.99 444 ; sw_score: 63 445 ; sw_ident: 0.266 446 ; sw_sim: 0.656 447 ; sw_overlap: 64 448 >gi|10955263| .. 449 ; sq_len: 107 450 ; sq_offset: 1 451 ; sq_type: p 452 ; al_start: 32 453 ; al_stop: 94 454 ; al_display_start: 2 455 TKRSGSNTRRRAISRPVRLTAEEDQEIRKRAAECGKTVSGFLRAAALGKK 456 VNSLTDDRVLKEV-MRLGALQKKLFIDGKRVGDREYAEVLIAITEYHRAL 457 LSRLMAD 458 >gi|152973588|ref|YP_001338639.1| .. 459 ; sq_len: 459 460 ; sq_type: p 461 ; al_start: 191 462 ; al_stop: 248 463 ; al_display_start: 161 464 VGGLFPRTQVAQQKVCQDIAGESNIFSDWAASRQGCTVGG--KMDSVQDK 465 ASDKDKERVMKNINIMWNALSKNRLFDG----NKELKEFIMTLTGTLIFG 466 ENSEITPLPARTTDQDLIRAMMEGGTAKIYHCNDSDKCLKVVADATVTIT 467 SNKALKSQISALLSSIQNKAVADEKLTDQE 468 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa 469 vs NC_009649.faa library 470 471 45119 residues in 180 sequences 472 Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 473 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 474 Lambda= 0.179384 475 476 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 477 join: 36, opt: 24, open/ext: -10/-2, width: 16 478 Scan time: 0.000 479 The best scores are: opt bits E(180) 480 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 22.9 0.29 481 482 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library 483 ; pg_name: /opt/fasta/fasta34 484 ; pg_ver: 34.26 485 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 486 ; pg_name: FASTA 487 ; pg_ver: 3.5 Sept 2006 488 ; pg_matrix: BL50 (15:-5) 489 ; pg_open-ext: -10 -2 490 ; pg_ktup: 2 491 ; pg_optcut: 24 492 ; pg_cgap: 36 493 ; mp_extrap: 60000 180 494 ; mp_stats: Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 Lambda= 0.179384 495 ; mp_KS: -0.0000 (N=0) at 8159228 496 >>gi|152973462|ref|YP_001338513.1| hypothetical protein KPN_pKPN3p05904 [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 497 ; fa_frame: f 498 ; fa_initn: 50 499 ; fa_init1: 50 500 ; fa_opt: 58 501 ; fa_z-score: 95.8 502 ; fa_bits: 22.9 503 ; fa_expect: 0.29 504 ; sw_score: 58 505 ; sw_ident: 0.289 506 ; sw_sim: 0.632 507 ; sw_overlap: 38 508 >gi|10955264| .. 509 ; sq_len: 126 510 ; sq_offset: 1 511 ; sq_type: p 512 ; al_start: 1 513 ; al_stop: 38 514 ; al_display_start: 1 515 ------------------------------MKKDKKYQIEAIKNKDKTLF 516 IVYATDIYSPSEFFSKIESDLKKKKSKGDVFFDLIIPNGGKKDRYVYTSF 517 NGEKFSSYTLNKVTKTDEYN 518 >gi|152973462|ref|YP_001338513.1| .. 519 ; sq_len: 101 520 ; sq_type: p 521 ; al_start: 44 522 ; al_stop: 81 523 ; al_display_start: 14 524 DALLGEIQRLRKQVHQLQLERDILTKANELIKKDLGVSFLKLKNREKTLI 525 VDALKKKYPVAELLSVLQLARSCYFYQNVCTISMRKYA 526 3>>>gi|10955265|ref|NP_052606.1| hypothetical protein pOSAK1_03 [Escherichia coli O157:H7 s 346 aa - 346 aa 527 vs NC_009649.faa library 528 529 45119 residues in 180 sequences 530 Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 531 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 532 Lambda= 0.210386 533 534 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 535 join: 37, opt: 25, open/ext: -10/-2, width: 16 536 Scan time: 0.020 537 The best scores are: opt bits E(180) 538 gi|152973545|ref|YP_001338596.1| putative plasmid ( 242) 70 27.5 0.082 539 540 >>>gi|10955265|ref|NP_052606.1|, 346 aa vs NC_009649.faa library 541 ; pg_name: /opt/fasta/fasta34 542 ; pg_ver: 34.26 543 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 544 ; pg_name: FASTA 545 ; pg_ver: 3.5 Sept 2006 546 ; pg_matrix: BL50 (15:-5) 547 ; pg_open-ext: -10 -2 548 ; pg_ktup: 2 549 ; pg_optcut: 25 550 ; pg_cgap: 37 551 ; mp_extrap: 60000 180 552 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 Lambda= 0.210386 553 ; mp_KS: -0.0000 (N=0) at 8159228 554 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 555 ; fa_frame: f 556 ; fa_initn: 52 557 ; fa_init1: 52 558 ; fa_opt: 70 559 ; fa_z-score: 105.5 560 ; fa_bits: 27.5 561 ; fa_expect: 0.082 562 ; sw_score: 70 563 ; sw_ident: 0.279 564 ; sw_sim: 0.651 565 ; sw_overlap: 43 566 >gi|10955265| .. 567 ; sq_len: 346 568 ; sq_offset: 1 569 ; sq_type: p 570 ; al_start: 197 571 ; al_stop: 238 572 ; al_display_start: 167 573 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK 574 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL 575 GEYFTENKPKYIIREIHQET 576 >gi|152973545|ref|YP_001338596.1| .. 577 ; sq_len: 242 578 ; sq_type: p 579 ; al_start: 52 580 ; al_stop: 94 581 ; al_display_start: 22 582 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD 583 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR 584 QDFAFTRKMRREARQVEQSW 585 >>><<< 586 587 588 579 residues in 3 query sequences 589 45119 residues in 180 library sequences 590 Scomplib [34.26] 591 start: Tue May 20 16:38:45 2008 done: Tue May 20 16:38:45 2008 592 Total Scan time: 0.020 Total Display time: 0.010 593 594 Function used was FASTA [version 34.26 January 12, 2007] 595 596 """ 597 598 from Bio._py3k import StringIO 599 600 alignments = list(FastaM10Iterator(StringIO(simple_example))) 601 assert len(alignments) == 4, len(alignments) 602 assert len(alignments[0]) == 2 603 for a in alignments: 604 print("Alignment %i sequences of length %i" 605 % (len(a), a.get_alignment_length())) 606 for r in a: 607 print("%s %s %i" % (r.seq, r.id, r.annotations["original_length"])) 608 # print(a.annotations) 609 print("Done") 610 611 import os 612 path = "../../Tests/Fasta/" 613 files = sorted(f for f in os.listdir(path) if os.path.splitext(f)[-1] == ".m10") 614 for filename in files: 615 if os.path.splitext(filename)[-1] == ".m10": 616 print("") 617 print(filename) 618 print("=" * len(filename)) 619 for i, a in enumerate(FastaM10Iterator(open(os.path.join(path, filename)))): 620 print("#%i, %s" % (i + 1, a)) 621 for r in a: 622 if "-" in r.seq: 623 assert r.seq.alphabet.gap_char == "-" 624 else: 625 assert not hasattr(r.seq.alphabet, "gap_char") 626