Package Bio :: Package SeqIO :: Module FastaIO
[hide private]
[frames | no frames]

Source Code for Module Bio.SeqIO.FastaIO

  1  # Copyright 2006-2009 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  # 
  6  # This module is for reading and writing FASTA format files as SeqRecord 
  7  # objects.  The code is partly inspired  by earlier Biopython modules, 
  8  # Bio.Fasta.* and the now deprecated Bio.SeqIO.FASTA 
  9   
 10  """Bio.SeqIO support for the "fasta" (aka FastA or Pearson) file format. 
 11   
 12  You are expected to use this module via the Bio.SeqIO functions.""" 
 13   
 14  from __future__ import print_function 
 15   
 16  from Bio.Alphabet import single_letter_alphabet 
 17  from Bio.Seq import Seq 
 18  from Bio.SeqRecord import SeqRecord 
 19  from Bio.SeqIO.Interfaces import SequentialSequenceWriter 
 20   
 21   
def SimpleFastaParser(handle):
    """Generator function to iterate over Fasta records (as string tuples).

    For each record a tuple of two strings is returned, the FASTA title
    line (without the leading '>' character and without trailing
    whitespace), and the sequence. The sequence lines are joined into a
    single string, with the trailing whitespace of each line, any spaces
    and any carriage returns removed. The title line is not divided up
    into an identifier (the first word) and comment or description.

    Arguments:
     - handle - input file handle; only its readline() method is used,
       and the end of the input is detected by readline() returning an
       empty string.

    Any text before the first line starting with '>' (e.g. blank lines
    or comments) is skipped, so an input without any '>' line simply
    yields nothing.

    >>> for values in SimpleFastaParser(open("Fasta/dups.fasta")):
    ...     print(values)
    ('alpha', 'ACGTA')
    ('beta', 'CGTC')
    ('gamma', 'CCGCC')
    ('alpha (again - this is a duplicate entry to test the indexing code)', 'ACGTA')
    ('delta', 'CGCGC')

    """
    # Skip any text before the first record (e.g. blank lines, comments)
    line = handle.readline()
    while line != "" and line[0] != ">":
        line = handle.readline()

    # At this point line is either "" (end of input, possibly an empty
    # file) or the '>' title line of a record, and the inner loop below
    # re-establishes exactly that state after every record. Hence no
    # further "does it start with '>'" check is needed.
    while line:
        title = line[1:].rstrip()
        lines = []
        line = handle.readline()
        while line and line[0] != ">":
            lines.append(line.rstrip())
            line = handle.readline()

        # Remove trailing whitespace, and any internal spaces
        # (and any embedded \r which are possible in mangled files
        # when not opened in universal read lines mode)
        yield title, "".join(lines).replace(" ", "").replace("\r", "")
71 72
def FastaIterator(handle, alphabet=single_letter_alphabet, title2ids=None):
    """Generator function to iterate over Fasta records (as SeqRecord objects).

    Arguments:
     - handle - input file
     - alphabet - optional alphabet
     - title2ids - A function that, when given the title of the FASTA
       file (without the beginning >), will return the id, name and
       description (in that order) for the record as a tuple of strings.
       If this is not given, then the entire title line will be used
       as the description, and the first word as the id and name.

    By default this will act like calling Bio.SeqIO.parse(handle, "fasta")
    with no custom handling of the title lines:

    >>> for record in FastaIterator(open("Fasta/dups.fasta")):
    ...     print(record.id)
    alpha
    beta
    gamma
    alpha
    delta

    However, you can supply a title2ids function to alter this:

    >>> def take_upper(title):
    ...     return title.split(None, 1)[0].upper(), "", title
    >>> for record in FastaIterator(open("Fasta/dups.fasta"), title2ids=take_upper):
    ...     print(record.id)
    ALPHA
    BETA
    GAMMA
    ALPHA
    DELTA

    """
    if not title2ids:
        # Default handling: first word of the title is both id and name,
        # and the whole title line is the description.
        for title, sequence in SimpleFastaParser(handle):
            words = title.split(None, 1)
            if words:
                first_word = words[0]
            else:
                # Nothing to split means the title must be empty.
                assert not title, repr(title)
                # Should we use SeqRecord default for no ID?
                first_word = ""
            yield SeqRecord(Seq(sequence, alphabet),
                            id=first_word, name=first_word,
                            description=title)
        return

    # Custom handling: the caller's function maps each title line to
    # the (id, name, description) triple.
    for title, sequence in SimpleFastaParser(handle):
        record_id, record_name, record_descr = title2ids(title)
        yield SeqRecord(Seq(sequence, alphabet),
                        id=record_id, name=record_name,
                        description=record_descr)
124 125
class FastaWriter(SequentialSequenceWriter):
    """Class to write Fasta format files."""

    def __init__(self, handle, wrap=60, record2title=None):
        """Create a Fasta writer.

        Arguments:
         - handle - Handle to an output file, e.g. as returned
           by open(filename, "w")
         - wrap - Optional line length used to wrap sequence lines.
           Defaults to wrapping the sequence at 60 characters
           Use zero (or None) for no wrapping, giving a single
           long line for the sequence.
         - record2title - Optional function to return the text to be
           used for the title line of each record.  By default
           a combination of the record.id and record.description
           is used.  If the record.description starts with the
           record.id, then just the record.description is used.

        Raises ValueError if wrap is non-zero but less than one.

        You can either use::

            writer = FastaWriter(open(filename, "w"))
            writer.write_file(myRecords)

        Or, follow the sequential file writer system, for example::

            writer = FastaWriter(open(filename, "w"))
            writer.write_header() # does nothing for Fasta files
            ...
            Multiple calls to writer.write_record() and/or writer.write_records()
            ...
            writer.write_footer() # does nothing for Fasta files
        """
        SequentialSequenceWriter.__init__(self, handle)
        # self.wrap stays None when wrapping is disabled (wrap of 0 or None)
        self.wrap = None
        if wrap:
            if wrap < 1:
                raise ValueError("wrap must be a positive line length, or "
                                 "zero/None for no wrapping (got %r)"
                                 % (wrap,))
            self.wrap = wrap
        self.record2title = record2title

    def write_record(self, record):
        """Write a single Fasta record to the file.

        The title line is taken from the record2title function if one was
        given, otherwise it is built from record.id and record.description.
        The sequence follows, wrapped at self.wrap characters per line, or
        on a single line when wrapping is disabled.
        """
        assert self._header_written
        assert not self._footer_written
        self._record_written = True

        if self.record2title:
            title = self.clean(self.record2title(record))
        else:
            record_id = self.clean(record.id)
            description = self.clean(record.description)

            if description and description.split(None, 1)[0] == record_id:
                # The description includes the id at the start
                title = description
            elif description:
                title = "%s %s" % (record_id, description)
            else:
                title = record_id

        # A line break inside the title would corrupt the record structure
        assert "\n" not in title
        assert "\r" not in title
        self.handle.write(">%s\n" % title)

        data = self._get_seq_string(record)  # Catches sequence being None

        assert "\n" not in data
        assert "\r" not in data

        if self.wrap:
            for i in range(0, len(data), self.wrap):
                self.handle.write(data[i:i + self.wrap] + "\n")
        else:
            self.handle.write(data + "\n")
if __name__ == "__main__":
    # Quick manual self test, run only when this module is executed as a
    # script. The file based checks are skipped when the example files
    # are not present in the current directory.
    print("Running quick self test")

    import os
    from Bio.Alphabet import generic_protein, generic_nucleotide

    # Download the files from here:
    # ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Nanoarchaeum_equitans
    fna_filename = "NC_005213.fna"
    faa_filename = "NC_005213.faa"

    def genbank_name_function(text):
        """Split a FASTA title line into an (id, name, description) tuple.

        The first whitespace separated word is split on "|" and its fourth
        field is used as the id; the name is the id up to its first ".";
        the description is the remainder of the title after the first word.
        Presumably intended for NCBI style titles of the form
        ``gi|<number>|ref|<accession.version>|`` -- confirm against the
        example files.

        Raises ValueError if the title has no whitespace separated
        description, and IndexError if the first word has fewer than
        four "|" separated fields.
        """
        text, descr = text.split(None, 1)
        id = text.split("|")[3]
        name = id.split(".", 1)[0]
        return id, name, descr

    # NOTE(review): print_record() is called below, but its definition is
    # not visible in this listing (which jumps from source line 218 to
    # line 232) -- confirm that the helper is defined at this point.

    if os.path.isfile(fna_filename):
        print("--------")
        print("FastaIterator (single sequence)")
        iterator = FastaIterator(open(fna_filename, "r"), alphabet=generic_nucleotide, title2ids=genbank_name_function)
        count = 0
        for record in iterator:
            count += 1
            print_record(record)
        # The nucleotide file is expected to hold exactly one record
        assert count == 1
        print(str(record.__class__))

    if os.path.isfile(faa_filename):
        print("--------")
        print("FastaIterator (multiple sequences)")
        iterator = FastaIterator(open(faa_filename, "r"), alphabet=generic_protein, title2ids=genbank_name_function)
        count = 0
        for record in iterator:
            count += 1
            print_record(record)
            # Only the first record is printed
            break
        assert count > 0
        print(str(record.__class__))

    from Bio._py3k import StringIO
    print("--------")
    print("FastaIterator (empty input file)")
    # Just to make sure no errors happen
    iterator = FastaIterator(StringIO(""))
    count = 0
    for record in iterator:
        count += 1
    # An empty handle must give no records at all
    assert count == 0

    print("Done")