Package Bio :: Package AlignIO :: Module FastaIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.FastaIO

  1  # Copyright 2008-2011 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Bio.AlignIO support for "fasta-m10" output from Bill Pearson's FASTA tools. 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions (or the 
  9  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 10   
 11  This module contains a parser for the pairwise alignments produced by Bill 
 12  Pearson's FASTA tools, for use from the Bio.AlignIO interface where it is 
 13  refered to as the "fasta-m10" file format (as we only support the machine 
 14  readable output format selected with the -m 10 command line option). 
 15   
 16  This module does NOT cover the generic "fasta" file format originally 
 17  developed as an input format to the FASTA tools.  The Bio.AlignIO and 
 18  Bio.SeqIO both use the Bio.SeqIO.FastaIO module to deal with these files, 
 19  which can also be used to store a multiple sequence alignments. 
 20  """ 
 21   
 22  from __future__ import print_function 
 23   
 24  from Bio.Seq import Seq 
 25  from Bio.SeqRecord import SeqRecord 
 26  from Bio.Align import MultipleSeqAlignment 
 27  from Bio.Alphabet import single_letter_alphabet, generic_dna, generic_protein 
 28  from Bio.Alphabet import Gapped 
 29   
 30   
31 -def _extract_alignment_region(alignment_seq_with_flanking, annotation):
32 """Helper function for the main parsing code (PRIVATE). 33 34 To get the actual pairwise alignment sequences, we must first 35 translate the un-gapped sequence based coordinates into positions 36 in the gapped sequence (which may have a flanking region shown 37 using leading - characters). To date, I have never seen any 38 trailing flanking region shown in the m10 file, but the 39 following code should also cope with that. 40 41 Note that this code seems to work fine even when the "sq_offset" 42 entries are prsent as a result of using the -X command line option. 43 """ 44 align_stripped = alignment_seq_with_flanking.strip("-") 45 display_start = int(annotation['al_display_start']) 46 if int(annotation['al_start']) <= int(annotation['al_stop']): 47 start = int(annotation['al_start']) \ 48 - display_start 49 end = int(annotation['al_stop']) \ 50 - display_start + 1 51 else: 52 #FASTA has flipped this sequence... 53 start = display_start \ 54 - int(annotation['al_start']) 55 end = display_start \ 56 - int(annotation['al_stop']) + 1 57 end += align_stripped.count("-") 58 assert 0 <= start and start < end and end <= len(align_stripped), \ 59 "Problem with sequence start/stop,\n%s[%i:%i]\n%s" \ 60 % (alignment_seq_with_flanking, start, end, annotation) 61 return align_stripped[start:end]
62 63
64 -def FastaM10Iterator(handle, alphabet=single_letter_alphabet):
65 """Alignment iterator for the FASTA tool's pairwise alignment output. 66 67 This is for reading the pairwise alignments output by Bill Pearson's 68 FASTA program when called with the -m 10 command line option for machine 69 readable output. For more details about the FASTA tools, see the website 70 http://fasta.bioch.virginia.edu/ and the paper: 71 72 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 73 74 This class is intended to be used via the Bio.AlignIO.parse() function 75 by specifying the format as "fasta-m10" as shown in the following code: 76 77 from Bio import AlignIO 78 handle = ... 79 for a in AlignIO.parse(handle, "fasta-m10"): 80 assert len(a) == 2, "Should be pairwise!" 81 print("Alignment length %i" % a.get_alignment_length()) 82 for record in a: 83 print("%s %s %s" % (record.seq, record.name, record.id)) 84 85 Note that this is not a full blown parser for all the information 86 in the FASTA output - for example, most of the header and all of the 87 footer is ignored. Also, the alignments are not batched according to 88 the input queries. 89 90 Also note that there can be up to about 30 letters of flanking region 91 included in the raw FASTA output as contextual information. This is NOT 92 part of the alignment itself, and is not included in the resulting 93 MultipleSeqAlignment objects returned. 94 """ 95 if alphabet is None: 96 alphabet = single_letter_alphabet 97 98 state_PREAMBLE = -1 99 state_NONE = 0 100 state_QUERY_HEADER = 1 101 state_ALIGN_HEADER = 2 102 state_ALIGN_QUERY = 3 103 state_ALIGN_MATCH = 4 104 state_ALIGN_CONS = 5 105 106 def build_hsp(): 107 if not query_tags and not match_tags: 108 raise ValueError("No data for query %r, match %r" 109 % (query_id, match_id)) 110 assert query_tags, query_tags 111 assert match_tags, match_tags 112 evalue = align_tags.get("fa_expect", None) 113 q = "?" # Just for printing len(q) in debug below 114 m = "?" # Just for printing len(m) in debug below 115 tool = global_tags.get("tool", "").upper() 116 try: 117 q = _extract_alignment_region(query_seq, query_tags) 118 if tool in ["TFASTX"] and len(match_seq) == len(q): 119 m = match_seq 120 #Quick hack until I can work out how -, * and / characters 121 #and the apparent mix of aa and bp coordinates works. 122 else: 123 m = _extract_alignment_region(match_seq, match_tags) 124 assert len(q) == len(m) 125 except AssertionError as err: 126 print("Darn... amino acids vs nucleotide coordinates?") 127 print(tool) 128 print(query_seq) 129 print(query_tags) 130 print("%s %i" % (q, len(q))) 131 print(match_seq) 132 print(match_tags) 133 print("%s %i" % (m, len(m))) 134 print(handle.name) 135 raise err 136 137 assert alphabet is not None 138 alignment = MultipleSeqAlignment([], alphabet) 139 140 #TODO - Introduce an annotated alignment class? 141 #For now, store the annotation a new private property: 142 alignment._annotations = {} 143 144 #Want to record both the query header tags, and the alignment tags. 145 for key, value in header_tags.items(): 146 alignment._annotations[key] = value 147 for key, value in align_tags.items(): 148 alignment._annotations[key] = value 149 150 #Query 151 #===== 152 record = SeqRecord(Seq(q, alphabet), 153 id=query_id, 154 name="query", 155 description=query_descr, 156 annotations={"original_length": int(query_tags["sq_len"])}) 157 #TODO - handle start/end coordinates properly. Short term hack for now: 158 record._al_start = int(query_tags["al_start"]) 159 record._al_stop = int(query_tags["al_stop"]) 160 alignment.append(record) 161 162 #TODO - What if a specific alphabet has been requested? 163 #TODO - Use an IUPAC alphabet? 164 #TODO - Can FASTA output RNA? 165 if alphabet == single_letter_alphabet and "sq_type" in query_tags: 166 if query_tags["sq_type"] == "D": 167 record.seq.alphabet = generic_dna 168 elif query_tags["sq_type"] == "p": 169 record.seq.alphabet = generic_protein 170 if "-" in q: 171 if not hasattr(record.seq.alphabet, "gap_char"): 172 record.seq.alphabet = Gapped(record.seq.alphabet, "-") 173 174 #Match 175 #===== 176 record = SeqRecord(Seq(m, alphabet), 177 id=match_id, 178 name="match", 179 description=match_descr, 180 annotations={"original_length": int(match_tags["sq_len"])}) 181 #TODO - handle start/end coordinates properly. Short term hack for now: 182 record._al_start = int(match_tags["al_start"]) 183 record._al_stop = int(match_tags["al_stop"]) 184 alignment.append(record) 185 186 #This is still a very crude way of dealing with the alphabet: 187 if alphabet == single_letter_alphabet and "sq_type" in match_tags: 188 if match_tags["sq_type"] == "D": 189 record.seq.alphabet = generic_dna 190 elif match_tags["sq_type"] == "p": 191 record.seq.alphabet = generic_protein 192 if "-" in m: 193 if not hasattr(record.seq.alphabet, "gap_char"): 194 record.seq.alphabet = Gapped(record.seq.alphabet, "-") 195 196 return alignment
197 198 state = state_PREAMBLE 199 query_id = None 200 match_id = None 201 query_descr = "" 202 match_descr = "" 203 global_tags = {} 204 header_tags = {} 205 align_tags = {} 206 query_tags = {} 207 match_tags = {} 208 query_seq = "" 209 match_seq = "" 210 cons_seq = "" 211 for line in handle: 212 if ">>>" in line and not line.startswith(">>>"): 213 if query_id and match_id: 214 #This happens on old FASTA output which lacked an end of 215 #query >>><<< marker line. 216 yield build_hsp() 217 state = state_NONE 218 query_descr = line[line.find(">>>")+3:].strip() 219 query_id = query_descr.split(None, 1)[0] 220 match_id = None 221 header_tags = {} 222 align_tags = {} 223 query_tags = {} 224 match_tags = {} 225 query_seq = "" 226 match_seq = "" 227 cons_seq = "" 228 elif line.startswith("!! No "): 229 #e.g. 230 #!! No library sequences with E() < 0.5 231 #or on more recent versions, 232 #No sequences with E() < 0.05 233 assert state == state_NONE 234 assert not header_tags 235 assert not align_tags 236 assert not match_tags 237 assert not query_tags 238 assert match_id is None 239 assert not query_seq 240 assert not match_seq 241 assert not cons_seq 242 query_id = None 243 elif line.strip() in [">>><<<", ">>>///"]: 244 #End of query, possible end of all queries 245 if query_id and match_id: 246 yield build_hsp() 247 state = state_NONE 248 query_id = None 249 match_id = None 250 header_tags = {} 251 align_tags = {} 252 query_tags = {} 253 match_tags = {} 254 query_seq = "" 255 match_seq = "" 256 cons_seq = "" 257 elif line.startswith(">>>"): 258 #Should be start of a match! 259 assert query_id is not None 260 assert line[3:].split(", ", 1)[0] == query_id, line 261 assert match_id is None 262 assert not header_tags 263 assert not align_tags 264 assert not query_tags 265 assert not match_tags 266 assert not match_seq 267 assert not query_seq 268 assert not cons_seq 269 state = state_QUERY_HEADER 270 elif line.startswith(">>"): 271 #Should now be at start of a match alignment! 272 if query_id and match_id: 273 yield build_hsp() 274 align_tags = {} 275 query_tags = {} 276 match_tags = {} 277 query_seq = "" 278 match_seq = "" 279 cons_seq = "" 280 match_descr = line[2:].strip() 281 match_id = match_descr.split(None, 1)[0] 282 state = state_ALIGN_HEADER 283 elif line.startswith(">--"): 284 #End of one HSP 285 assert query_id and match_id, line 286 yield build_hsp() 287 #Clean up read for next HSP 288 #but reuse header_tags 289 align_tags = {} 290 query_tags = {} 291 match_tags = {} 292 query_seq = "" 293 match_seq = "" 294 cons_seq = "" 295 state = state_ALIGN_HEADER 296 elif line.startswith(">"): 297 if state == state_ALIGN_HEADER: 298 #Should be start of query alignment seq... 299 assert query_id is not None, line 300 assert match_id is not None, line 301 assert query_id.startswith(line[1:].split(None, 1)[0]), line 302 state = state_ALIGN_QUERY 303 elif state == state_ALIGN_QUERY: 304 #Should be start of match alignment seq 305 assert query_id is not None, line 306 assert match_id is not None, line 307 assert match_id.startswith(line[1:].split(None, 1)[0]), line 308 state = state_ALIGN_MATCH 309 elif state == state_NONE: 310 #Can get > as the last line of a histogram 311 pass 312 else: 313 assert False, "state %i got %r" % (state, line) 314 elif line.startswith("; al_cons"): 315 assert state == state_ALIGN_MATCH, line 316 state = state_ALIGN_CONS 317 #Next line(s) should be consensus seq... 318 elif line.startswith("; "): 319 if ": " in line: 320 key, value = [s.strip() for s in line[2:].split(": ", 1)] 321 else: 322 import warnings 323 #Seen in lalign36, specifically version 36.3.4 Apr, 2011 324 #Fixed in version 36.3.5b Oct, 2011(preload8) 325 warnings.warn("Missing colon in line: %r" % line) 326 try: 327 key, value = [s.strip() for s in line[2:].split(" ", 1)] 328 except ValueError: 329 raise ValueError("Bad line: %r" % line) 330 if state == state_QUERY_HEADER: 331 header_tags[key] = value 332 elif state == state_ALIGN_HEADER: 333 align_tags[key] = value 334 elif state == state_ALIGN_QUERY: 335 query_tags[key] = value 336 elif state == state_ALIGN_MATCH: 337 match_tags[key] = value 338 else: 339 assert False, "Unexpected state %r, %r" % (state, line) 340 elif state == state_ALIGN_QUERY: 341 query_seq += line.strip() 342 elif state == state_ALIGN_MATCH: 343 match_seq += line.strip() 344 elif state == state_ALIGN_CONS: 345 cons_seq += line.strip("\n") 346 elif state == state_PREAMBLE: 347 if line.startswith("#"): 348 global_tags["command"] = line[1:].strip() 349 elif line.startswith(" version "): 350 global_tags["version"] = line[9:].strip() 351 elif " compares a " in line: 352 global_tags["tool"] = line[:line.find(" compares a ")].strip() 353 elif " searches a " in line: 354 global_tags["tool"] = line[:line.find(" searches a ")].strip() 355 else: 356 pass 357 358 359 if __name__ == "__main__": 360 print("Running a quick self-test") 361 362 #http://emboss.sourceforge.net/docs/themes/alnformats/align.simple 363 simple_example = \ 364 """# /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 365 FASTA searches a protein or DNA sequence data bank 366 version 34.26 January 12, 2007 367 Please cite: 368 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 369 370 Query library NC_002127.faa vs NC_009649.faa library 371 searching NC_009649.faa library 372 373 1>>>gi|10955263|ref|NP_052604.1| plasmid mobilization [Escherichia coli O157:H7 s 107 aa - 107 aa 374 vs NC_009649.faa library 375 376 45119 residues in 180 sequences 377 Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 378 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 379 Lambda= 0.175043 380 381 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 382 join: 36, opt: 24, open/ext: -10/-2, width: 16 383 Scan time: 0.000 384 The best scores are: opt bits E(180) 385 gi|152973457|ref|YP_001338508.1| ATPase with chape ( 931) 71 24.9 0.58 386 gi|152973588|ref|YP_001338639.1| F pilus assembly ( 459) 63 23.1 0.99 387 388 >>>gi|10955263|ref|NP_052604.1|, 107 aa vs NC_009649.faa library 389 ; pg_name: /opt/fasta/fasta34 390 ; pg_ver: 34.26 391 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 392 ; pg_name: FASTA 393 ; pg_ver: 3.5 Sept 2006 394 ; pg_matrix: BL50 (15:-5) 395 ; pg_open-ext: -10 -2 396 ; pg_ktup: 2 397 ; pg_optcut: 24 398 ; pg_cgap: 36 399 ; mp_extrap: 60000 180 400 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 Lambda= 0.175043 401 ; mp_KS: -0.0000 (N=0) at 8159228 402 >>gi|152973457|ref|YP_001338508.1| ATPase with chaperone activity, ATP-binding subunit [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 403 ; fa_frame: f 404 ; fa_initn: 65 405 ; fa_init1: 43 406 ; fa_opt: 71 407 ; fa_z-score: 90.3 408 ; fa_bits: 24.9 409 ; fa_expect: 0.58 410 ; sw_score: 71 411 ; sw_ident: 0.250 412 ; sw_sim: 0.574 413 ; sw_overlap: 108 414 >gi|10955263| .. 415 ; sq_len: 107 416 ; sq_offset: 1 417 ; sq_type: p 418 ; al_start: 5 419 ; al_stop: 103 420 ; al_display_start: 1 421 --------------------------MTKRSGSNT-RRRAISRPVRLTAE 422 ED---QEIRKRAAECGKTVSGFLRAAALGKKVNSLTDDRVLKEVM----- 423 RLGALQKKLFIDGKRVGDREYAEVLIAITEYHRALLSRLMAD 424 >gi|152973457|ref|YP_001338508.1| .. 425 ; sq_len: 931 426 ; sq_type: p 427 ; al_start: 96 428 ; al_stop: 195 429 ; al_display_start: 66 430 SDFFRIGDDATPVAADTDDVVDASFGEPAAAGSGAPRRRGSGLASRISEQ 431 SEALLQEAAKHAAEFGRS------EVDTEHLLLALADSDVVKTILGQFKI 432 KVDDLKRQIESEAKR-GDKPF-EGEIGVSPRVKDALSRAFVASNELGHSY 433 VGPEHFLIGLAEEGEGLAANLLRRYGLTPQ 434 >>gi|152973588|ref|YP_001338639.1| F pilus assembly protein [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 435 ; fa_frame: f 436 ; fa_initn: 33 437 ; fa_init1: 33 438 ; fa_opt: 63 439 ; fa_z-score: 86.1 440 ; fa_bits: 23.1 441 ; fa_expect: 0.99 442 ; sw_score: 63 443 ; sw_ident: 0.266 444 ; sw_sim: 0.656 445 ; sw_overlap: 64 446 >gi|10955263| .. 447 ; sq_len: 107 448 ; sq_offset: 1 449 ; sq_type: p 450 ; al_start: 32 451 ; al_stop: 94 452 ; al_display_start: 2 453 TKRSGSNTRRRAISRPVRLTAEEDQEIRKRAAECGKTVSGFLRAAALGKK 454 VNSLTDDRVLKEV-MRLGALQKKLFIDGKRVGDREYAEVLIAITEYHRAL 455 LSRLMAD 456 >gi|152973588|ref|YP_001338639.1| .. 457 ; sq_len: 459 458 ; sq_type: p 459 ; al_start: 191 460 ; al_stop: 248 461 ; al_display_start: 161 462 VGGLFPRTQVAQQKVCQDIAGESNIFSDWAASRQGCTVGG--KMDSVQDK 463 ASDKDKERVMKNINIMWNALSKNRLFDG----NKELKEFIMTLTGTLIFG 464 ENSEITPLPARTTDQDLIRAMMEGGTAKIYHCNDSDKCLKVVADATVTIT 465 SNKALKSQISALLSSIQNKAVADEKLTDQE 466 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa 467 vs NC_009649.faa library 468 469 45119 residues in 180 sequences 470 Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 471 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 472 Lambda= 0.179384 473 474 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 475 join: 36, opt: 24, open/ext: -10/-2, width: 16 476 Scan time: 0.000 477 The best scores are: opt bits E(180) 478 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 22.9 0.29 479 480 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library 481 ; pg_name: /opt/fasta/fasta34 482 ; pg_ver: 34.26 483 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 484 ; pg_name: FASTA 485 ; pg_ver: 3.5 Sept 2006 486 ; pg_matrix: BL50 (15:-5) 487 ; pg_open-ext: -10 -2 488 ; pg_ktup: 2 489 ; pg_optcut: 24 490 ; pg_cgap: 36 491 ; mp_extrap: 60000 180 492 ; mp_stats: Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 Lambda= 0.179384 493 ; mp_KS: -0.0000 (N=0) at 8159228 494 >>gi|152973462|ref|YP_001338513.1| hypothetical protein KPN_pKPN3p05904 [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 495 ; fa_frame: f 496 ; fa_initn: 50 497 ; fa_init1: 50 498 ; fa_opt: 58 499 ; fa_z-score: 95.8 500 ; fa_bits: 22.9 501 ; fa_expect: 0.29 502 ; sw_score: 58 503 ; sw_ident: 0.289 504 ; sw_sim: 0.632 505 ; sw_overlap: 38 506 >gi|10955264| .. 507 ; sq_len: 126 508 ; sq_offset: 1 509 ; sq_type: p 510 ; al_start: 1 511 ; al_stop: 38 512 ; al_display_start: 1 513 ------------------------------MKKDKKYQIEAIKNKDKTLF 514 IVYATDIYSPSEFFSKIESDLKKKKSKGDVFFDLIIPNGGKKDRYVYTSF 515 NGEKFSSYTLNKVTKTDEYN 516 >gi|152973462|ref|YP_001338513.1| .. 517 ; sq_len: 101 518 ; sq_type: p 519 ; al_start: 44 520 ; al_stop: 81 521 ; al_display_start: 14 522 DALLGEIQRLRKQVHQLQLERDILTKANELIKKDLGVSFLKLKNREKTLI 523 VDALKKKYPVAELLSVLQLARSCYFYQNVCTISMRKYA 524 3>>>gi|10955265|ref|NP_052606.1| hypothetical protein pOSAK1_03 [Escherichia coli O157:H7 s 346 aa - 346 aa 525 vs NC_009649.faa library 526 527 45119 residues in 180 sequences 528 Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 529 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 530 Lambda= 0.210386 531 532 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 533 join: 37, opt: 25, open/ext: -10/-2, width: 16 534 Scan time: 0.020 535 The best scores are: opt bits E(180) 536 gi|152973545|ref|YP_001338596.1| putative plasmid ( 242) 70 27.5 0.082 537 538 >>>gi|10955265|ref|NP_052606.1|, 346 aa vs NC_009649.faa library 539 ; pg_name: /opt/fasta/fasta34 540 ; pg_ver: 34.26 541 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 542 ; pg_name: FASTA 543 ; pg_ver: 3.5 Sept 2006 544 ; pg_matrix: BL50 (15:-5) 545 ; pg_open-ext: -10 -2 546 ; pg_ktup: 2 547 ; pg_optcut: 25 548 ; pg_cgap: 37 549 ; mp_extrap: 60000 180 550 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 Lambda= 0.210386 551 ; mp_KS: -0.0000 (N=0) at 8159228 552 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 553 ; fa_frame: f 554 ; fa_initn: 52 555 ; fa_init1: 52 556 ; fa_opt: 70 557 ; fa_z-score: 105.5 558 ; fa_bits: 27.5 559 ; fa_expect: 0.082 560 ; sw_score: 70 561 ; sw_ident: 0.279 562 ; sw_sim: 0.651 563 ; sw_overlap: 43 564 >gi|10955265| .. 565 ; sq_len: 346 566 ; sq_offset: 1 567 ; sq_type: p 568 ; al_start: 197 569 ; al_stop: 238 570 ; al_display_start: 167 571 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK 572 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL 573 GEYFTENKPKYIIREIHQET 574 >gi|152973545|ref|YP_001338596.1| .. 575 ; sq_len: 242 576 ; sq_type: p 577 ; al_start: 52 578 ; al_stop: 94 579 ; al_display_start: 22 580 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD 581 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR 582 QDFAFTRKMRREARQVEQSW 583 >>><<< 584 585 586 579 residues in 3 query sequences 587 45119 residues in 180 library sequences 588 Scomplib [34.26] 589 start: Tue May 20 16:38:45 2008 done: Tue May 20 16:38:45 2008 590 Total Scan time: 0.020 Total Display time: 0.010 591 592 Function used was FASTA [version 34.26 January 12, 2007] 593 594 """ 595 596 from Bio._py3k import StringIO 597 598 alignments = list(FastaM10Iterator(StringIO(simple_example))) 599 assert len(alignments) == 4, len(alignments) 600 assert len(alignments[0]) == 2 601 for a in alignments: 602 print("Alignment %i sequences of length %i" \ 603 % (len(a), a.get_alignment_length())) 604 for r in a: 605 print("%s %s %i" % (r.seq, r.id, r.annotations["original_length"])) 606 #print(a.annotations) 607 print("Done") 608 609 import os 610 path = "../../Tests/Fasta/" 611 files = sorted(f for f in os.listdir(path) if os.path.splitext(f)[-1] == ".m10") 612 for filename in files: 613 if os.path.splitext(filename)[-1] == ".m10": 614 print("") 615 print(filename) 616 print("=" * len(filename)) 617 for i, a in enumerate(FastaM10Iterator(open(os.path.join(path, filename)))): 618 print("#%i, %s" % (i+1, a)) 619 for r in a: 620 if "-" in r.seq: 621 assert r.seq.alphabet.gap_char == "-" 622 else: 623 assert not hasattr(r.seq.alphabet, "gap_char") 624