1
2
3
4
5
6
7
8
9
10 """Bio.SeqIO support for the "fasta" (aka FastA or Pearson) file format.
11
12 You are expected to use this module via the Bio.SeqIO functions."""
13
14 from Bio.Alphabet import single_letter_alphabet
15 from Bio.Seq import Seq
16 from Bio.SeqRecord import SeqRecord
17 from Bio.SeqIO.Interfaces import SequentialSequenceWriter
18
19
21 """Generator function to iterate over Fasta records (as SeqRecord objects).
22
23 handle - input file
24 alphabet - optional alphabet
25 title2ids - A function that, when given the title of the FASTA
26 file (without the beginning >), will return the id, name and
27 description (in that order) for the record as a tuple of strings.
28
29 If this is not given, then the entire title line will be used
30 as the description, and the first word as the id and name.
31
32 Note that use of title2ids matches that of Bio.Fasta.SequenceParser
33 but the defaults are slightly different.
34 """
35
36 while True:
37 line = handle.readline()
38 if line == "" : return
39 if line[0] == ">":
40 break
41
42 while True:
43 if line[0]!=">":
44 raise ValueError("Records in Fasta files should start with '>' character")
45 if title2ids:
46 id, name, descr = title2ids(line[1:].rstrip())
47 else:
48 descr = line[1:].rstrip()
49 try:
50 id = descr.split()[0]
51 except IndexError:
52 assert not descr, repr(line)
53
54 id = ""
55 name = id
56
57 lines = []
58 line = handle.readline()
59 while True:
60 if not line : break
61 if line[0] == ">": break
62 lines.append(line.rstrip())
63 line = handle.readline()
64
65
66
67
68 result = "".join(lines).replace(" ", "").replace("\r", "")
69
70
71 yield SeqRecord(Seq(result, alphabet),
72 id = id, name = name, description = descr)
73
74 if not line : return
75 assert False, "Should not reach this line"
76
78 """Class to write Fasta format files."""
79 - def __init__(self, handle, wrap=60, record2title=None):
80 """Create a Fasta writer.
81
82 handle - Handle to an output file, e.g. as returned
83 by open(filename, "w")
84 wrap - Optional line length used to wrap sequence lines.
85 Defaults to wrapping the sequence at 60 characters
86 Use zero (or None) for no wrapping, giving a single
87 long line for the sequence.
88 record2title - Optional function to return the text to be
89 used for the title line of each record. By default the
90 a combination of the record.id and record.description
91 is used. If the record.description starts with the
92 record.id, then just the record.description is used.
93
94 You can either use:
95
96 myWriter = FastaWriter(open(filename,"w"))
97 writer.write_file(myRecords)
98
99 Or, follow the sequential file writer system, for example:
100
101 myWriter = FastaWriter(open(filename,"w"))
102 writer.write_header() # does nothing for Fasta files
103 ...
104 Multiple calls to writer.write_record() and/or writer.write_records()
105 ...
106 writer.write_footer() # does nothing for Fasta files
107 writer.close()
108 """
109 SequentialSequenceWriter.__init__(self, handle)
110
111 self.wrap = None
112 if wrap:
113 if wrap < 1:
114 raise ValueError
115 self.wrap = wrap
116 self.record2title = record2title
117
153
154 if __name__ == "__main__":
155 print "Running quick self test"
156
157 import os
158 from Bio.Alphabet import generic_protein, generic_nucleotide
159
160
161
162 fna_filename = "NC_005213.fna"
163 faa_filename = "NC_005213.faa"
164
170
172
173
174 print "ID:" + record.id
175 print "Name:" + record.name
176 print "Descr:" + record.description
177 print record.seq
178 for feature in record.annotations:
179 print '/%s=%s' % (feature, record.annotations[feature])
180 if record.dbxrefs:
181 print "Database cross references:"
182 for x in record.dbxrefs : print " - %s" % x
183
184 if os.path.isfile(fna_filename):
185 print "--------"
186 print "FastaIterator (single sequence)"
187 iterator = FastaIterator(open(fna_filename, "r"), alphabet=generic_nucleotide, title2ids=genbank_name_function)
188 count=0
189 for record in iterator:
190 count=count+1
191 print_record(record)
192 assert count == 1
193 print str(record.__class__)
194
195 if os.path.isfile(faa_filename):
196 print "--------"
197 print "FastaIterator (multiple sequences)"
198 iterator = FastaIterator(open(faa_filename, "r"), alphabet=generic_protein, title2ids=genbank_name_function)
199 count=0
200 for record in iterator:
201 count=count+1
202 print_record(record)
203 break
204 assert count>0
205 print str(record.__class__)
206
207 from cStringIO import StringIO
208 print "--------"
209 print "FastaIterator (empty input file)"
210
211 iterator = FastaIterator(StringIO(""))
212 count = 0
213 for record in iterator:
214 count = count+1
215 assert count==0
216
217 print "Done"
218