Package Bio :: Package SeqIO :: Module FastaIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.FastaIO

  1  # Copyright 2006-2009 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  # 
  6  # This module is for reading and writing FASTA format files as SeqRecord 
  7  # objects.  The code is partly inspired  by earlier Biopython modules, 
  8  # Bio.Fasta.* and the now deprecated Bio.SeqIO.FASTA 
  9   
 10  """Bio.SeqIO support for the "fasta" (aka FastA or Pearson) file format. 
 11   
 12  You are expected to use this module via the Bio.SeqIO functions.""" 
 13   
 14  from Bio.Alphabet import single_letter_alphabet 
 15  from Bio.Seq import Seq 
 16  from Bio.SeqRecord import SeqRecord 
 17  from Bio.SeqIO.Interfaces import SequentialSequenceWriter 
 18   
 19  #This is a generator function! 
20 -def FastaIterator(handle, alphabet = single_letter_alphabet, title2ids = None):
21 """Generator function to iterate over Fasta records (as SeqRecord objects). 22 23 handle - input file 24 alphabet - optional alphabet 25 title2ids - A function that, when given the title of the FASTA 26 file (without the beginning >), will return the id, name and 27 description (in that order) for the record as a tuple of strings. 28 29 If this is not given, then the entire title line will be used 30 as the description, and the first word as the id and name. 31 32 Note that use of title2ids matches that of Bio.Fasta.SequenceParser 33 but the defaults are slightly different. 34 """ 35 #Skip any text before the first record (e.g. blank lines, comments) 36 while True: 37 line = handle.readline() 38 if line == "" : return #Premature end of file, or just empty? 39 if line[0] == ">": 40 break 41 42 while True: 43 if line[0]!=">": 44 raise ValueError("Records in Fasta files should start with '>' character") 45 if title2ids: 46 id, name, descr = title2ids(line[1:].rstrip()) 47 else: 48 descr = line[1:].rstrip() 49 try: 50 id = descr.split()[0] 51 except IndexError: 52 assert not descr, repr(line) 53 #Should we use SeqRecord default for no ID? 54 id = "" 55 name = id 56 57 lines = [] 58 line = handle.readline() 59 while True: 60 if not line : break 61 if line[0] == ">": break 62 lines.append(line.rstrip()) 63 line = handle.readline() 64 65 #Remove trailing whitespace, and any internal spaces 66 #(and any embedded \r which are possible in mangled files 67 #when not opened in universal read lines mode) 68 result = "".join(lines).replace(" ", "").replace("\r", "") 69 70 #Return the record and then continue... 71 yield SeqRecord(Seq(result, alphabet), 72 id = id, name = name, description = descr) 73 74 if not line : return #StopIteration 75 assert False, "Should not reach this line"
76
77 -class FastaWriter(SequentialSequenceWriter):
78 """Class to write Fasta format files."""
79 - def __init__(self, handle, wrap=60, record2title=None):
80 """Create a Fasta writer. 81 82 handle - Handle to an output file, e.g. as returned 83 by open(filename, "w") 84 wrap - Optional line length used to wrap sequence lines. 85 Defaults to wrapping the sequence at 60 characters 86 Use zero (or None) for no wrapping, giving a single 87 long line for the sequence. 88 record2title - Optional function to return the text to be 89 used for the title line of each record. By default the 90 a combination of the record.id and record.description 91 is used. If the record.description starts with the 92 record.id, then just the record.description is used. 93 94 You can either use: 95 96 myWriter = FastaWriter(open(filename,"w")) 97 writer.write_file(myRecords) 98 99 Or, follow the sequential file writer system, for example: 100 101 myWriter = FastaWriter(open(filename,"w")) 102 writer.write_header() # does nothing for Fasta files 103 ... 104 Multiple calls to writer.write_record() and/or writer.write_records() 105 ... 106 writer.write_footer() # does nothing for Fasta files 107 writer.close() 108 """ 109 SequentialSequenceWriter.__init__(self, handle) 110 #self.handle = handle 111 self.wrap = None 112 if wrap: 113 if wrap < 1: 114 raise ValueError 115 self.wrap = wrap 116 self.record2title = record2title
117
118 - def write_record(self, record):
119 """Write a single Fasta record to the file.""" 120 assert self._header_written 121 assert not self._footer_written 122 self._record_written = True 123 124 if self.record2title: 125 title=self.clean(self.record2title(record)) 126 else: 127 id = self.clean(record.id) 128 description = self.clean(record.description) 129 130 #if description[:len(id)]==id: 131 if description and description.split(None,1)[0]==id: 132 #The description includes the id at the start 133 title = description 134 elif description: 135 title = "%s %s" % (id, description) 136 else: 137 title = id 138 139 assert "\n" not in title 140 assert "\r" not in title 141 self.handle.write(">%s\n" % title) 142 143 data = self._get_seq_string(record) #Catches sequence being None 144 145 assert "\n" not in data 146 assert "\r" not in data 147 148 if self.wrap: 149 for i in range(0, len(data), self.wrap): 150 self.handle.write(data[i:i+self.wrap] + "\n") 151 else: 152 self.handle.write(data + "\n")
153 154 if __name__ == "__main__": 155 print "Running quick self test" 156 157 import os 158 from Bio.Alphabet import generic_protein, generic_nucleotide 159 160 #Download the files from here: 161 #ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Nanoarchaeum_equitans 162 fna_filename = "NC_005213.fna" 163 faa_filename = "NC_005213.faa" 164
165 - def genbank_name_function(text):
166 text, descr = text.split(None,1) 167 id = text.split("|")[3] 168 name = id.split(".",1)[0] 169 return id, name, descr
170 183 184 if os.path.isfile(fna_filename): 185 print "--------" 186 print "FastaIterator (single sequence)" 187 iterator = FastaIterator(open(fna_filename, "r"), alphabet=generic_nucleotide, title2ids=genbank_name_function) 188 count=0 189 for record in iterator: 190 count=count+1 191 print_record(record) 192 assert count == 1 193 print str(record.__class__) 194 195 if os.path.isfile(faa_filename): 196 print "--------" 197 print "FastaIterator (multiple sequences)" 198 iterator = FastaIterator(open(faa_filename, "r"), alphabet=generic_protein, title2ids=genbank_name_function) 199 count=0 200 for record in iterator: 201 count=count+1 202 print_record(record) 203 break 204 assert count>0 205 print str(record.__class__) 206 207 from cStringIO import StringIO 208 print "--------" 209 print "FastaIterator (empty input file)" 210 #Just to make sure no errors happen 211 iterator = FastaIterator(StringIO("")) 212 count = 0 213 for record in iterator: 214 count = count+1 215 assert count==0 216 217 print "Done" 218