Package Bio :: Package SeqIO :: Module IgIO
[hide private]
[frames] | [no frames]

Source Code for Module Bio.SeqIO.IgIO

  1  # Copyright 2008-2010 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  # 
  6  # This module is for reading and writing IntelliGenetics format files as 
  7  # SeqRecord objects.  This file format appears to be the same as the MASE 
  8  # multiple sequence alignment format. 
  9   
 10  """Bio.SeqIO support for the "ig" (IntelliGenetics or MASE) file format. 
 11   
 12  You are expected to use this module via the Bio.SeqIO functions.""" 
 13   
 14  from Bio.Alphabet import single_letter_alphabet 
 15  from Bio.Seq import Seq 
 16  from Bio.SeqRecord import SeqRecord 
 17   
 18   
def IgIterator(handle, alphabet=single_letter_alphabet):
    """Generate SeqRecord objects from an IntelliGenetics (MASE) file.

    Arguments:
     - handle   - input file, consumed one line at a time via readline()
     - alphabet - optional alphabet, passed on to each Seq object

    Any free format file header (leading lines which start with two
    semi-colons) is skipped.

    Each record opens with one or more commentary lines starting with a
    semi-colon.  The leading semi-colon and any surrounding whitespace are
    removed from each, and the lines are stored as a single string with
    embedded new line characters in the SeqRecord's annotations dictionary
    under the key 'comment'.

    The line after the commentary is the title, used (with trailing
    whitespace removed) as both the record's id and its name.  The
    sequence is everything from there up to the next line starting with
    a semi-colon, or the end of the file.  Spaces inside the sequence are
    dropped, as is a single optional trailing digit one.

    Raises ValueError if a record does not begin with a semi-colon, or if
    a digit one is still present in the sequence once the optional
    terminator has been removed.
    """
    #Discard the optional file header, i.e. any leading ";;" lines.
    #An empty string from readline() means the file has run out.
    current = handle.readline()
    while current and current.startswith(";;"):
        current = handle.readline()

    #Each pass consumes one whole record.  On entry "current" holds the
    #first line not yet processed, which must be a commentary line.
    while current:
        if not current.startswith(";"):
            raise ValueError(
                "Records should start with ';' and not:\n%s" % repr(current))

        #Gather the commentary into one newline separated string stored
        #under annotations key 'comment', agreeing with the SeqRecord
        #convention of the GenBank parser (also followed by the SwissProt
        #parser).  Both "; ..." and ";..." occur in example files, hence
        #the strip() after dropping the semi-colon.
        #TODO - Extract identifier from lines like "LOCUS\tB_SF2"?
        remarks = []
        while current.startswith(";"):
            remarks.append(current[1:].strip())
            current = handle.readline()

        #First line which is not commentary is the title line.
        heading = current.rstrip()

        #Sequence lines continue until the next record or end of file;
        #trailing whitespace and any internal spaces are discarded.
        pieces = []
        current = handle.readline()
        while current and current[0] != ";":
            pieces.append(current.rstrip().replace(" ", ""))
            current = handle.readline()
        residues = "".join(pieces)

        #A digit one is only acceptable as the optional terminator at
        #the very end of the sequence.
        if residues.endswith("1"):
            residues = residues[:-1]
        if "1" in residues:
            raise ValueError(
                "Potential terminator digit one found within sequence.")

        entry = SeqRecord(Seq(residues, alphabet),
                          id=heading, name=heading)
        entry.annotations['comment'] = "\n".join(remarks)
        yield entry

    #The loop above only finishes once readline() has returned nothing.
    assert not current

if __name__ == "__main__":
    #Quick self test: parse every ".txt" file in the IntelliGenetics test
    #directory (path is relative to the current working directory) and
    #print the id and length of each record found.
    #
    #Each print call takes a single pre-formatted string argument, which
    #gives the same output under the Python 2 print statement and the
    #Python 3 print function.
    print("Running quick self test")

    import os
    path = "../../Tests/IntelliGenetics/"
    if os.path.isdir(path):
        for filename in os.listdir(path):
            if os.path.splitext(filename)[-1] == ".txt":
                #Blank line, then the file name underlined with dashes
                print("")
                print(filename)
                print("-" * len(filename))
                handle = open(os.path.join(path, filename))
                try:
                    for record in IgIterator(handle):
                        print("%s %i" % (record.id, len(record)))
                finally:
                    #Close the file even if the parser raises ValueError
                    handle.close()
        print("Done")
    else:
        print("Could not find input files")