1
2
3
4
5
6
7
8
9
10 """Bio.SeqIO support for the "fasta" (aka FastA or Pearson) file format.
11
12 You are expected to use this module via the Bio.SeqIO functions."""
13
14 from Bio.Alphabet import single_letter_alphabet
15 from Bio.Seq import Seq
16 from Bio.SeqRecord import SeqRecord
17 from Bio.SeqIO.Interfaces import SequentialSequenceWriter
18
19
21 """Generator function to iterator over Fasta records (as string tuples).
22
23 For each record a tuple of two strings is returned, the FASTA title
24 line (without the leading '>' character), and the sequence (with any
25 whitespace removed). The title line is not divided up into an
26 identifier (the first word) and comment or description.
27
28 >>> for values in SimpleFastaParser(open("Fasta/dups.fasta")):
29 ... print values
30 ('alpha', 'ACGTA')
31 ('beta', 'CGTC')
32 ('gamma', 'CCGCC')
33 ('alpha (again - this is a duplicate entry to test the indexing code)', 'ACGTA')
34 ('delta', 'CGCGC')
35
36 """
37
38 while True:
39 line = handle.readline()
40 if line == "":
41 return
42 if line[0] == ">":
43 break
44
45 while True:
46 if line[0] != ">":
47 raise ValueError(
48 "Records in Fasta files should start with '>' character")
49 title = line[1:].rstrip()
50 lines = []
51 line = handle.readline()
52 while True:
53 if not line:
54 break
55 if line[0] == ">":
56 break
57 lines.append(line.rstrip())
58 line = handle.readline()
59
60
61
62
63 yield title, "".join(lines).replace(" ", "").replace("\r", "")
64
65 if not line:
66 return
67
68 assert False, "Should not reach this line"
69
70
72 """Generator function to iterate over Fasta records (as SeqRecord objects).
73
74 handle - input file
75 alphabet - optional alphabet
76 title2ids - A function that, when given the title of the FASTA
77 file (without the beginning >), will return the id, name and
78 description (in that order) for the record as a tuple of strings.
79
80 If this is not given, then the entire title line will be used
81 as the description, and the first word as the id and name.
82
83 By default this will act like calling Bio.SeqIO.parse(handle, "fasta")
84 with no custom handling of the title lines:
85
86 >>> for record in FastaIterator(open("Fasta/dups.fasta")):
87 ... print record.id
88 alpha
89 beta
90 gamma
91 alpha
92 delta
93
94 However, you can supply a title2ids function to alter this:
95
96 >>> def take_upper(title):
97 ... return title.split(None,1)[0].upper(), "", title
98 >>> for record in FastaIterator(open("Fasta/dups.fasta"), title2ids=take_upper):
99 ... print record.id
100 ALPHA
101 BETA
102 GAMMA
103 ALPHA
104 DELTA
105
106 """
107 if title2ids:
108 for title, sequence in SimpleFastaParser(handle):
109 id, name, descr = title2ids(title)
110 yield SeqRecord(Seq(sequence, alphabet),
111 id=id, name=name, description=descr)
112 else:
113 for title, sequence in SimpleFastaParser(handle):
114 try:
115 first_word = title.split(None, 1)[0]
116 except IndexError:
117 assert not title, repr(title)
118
119 first_word = ""
120 yield SeqRecord(Seq(sequence, alphabet),
121 id=first_word, name=first_word, description=title)
122
123
125 """Class to write Fasta format files."""
126 - def __init__(self, handle, wrap=60, record2title=None):
127 """Create a Fasta writer.
128
129 handle - Handle to an output file, e.g. as returned
130 by open(filename, "w")
131 wrap - Optional line length used to wrap sequence lines.
132 Defaults to wrapping the sequence at 60 characters
133 Use zero (or None) for no wrapping, giving a single
134 long line for the sequence.
135 record2title - Optional function to return the text to be
136 used for the title line of each record. By default the
137 a combination of the record.id and record.description
138 is used. If the record.description starts with the
139 record.id, then just the record.description is used.
140
141 You can either use:
142
143 myWriter = FastaWriter(open(filename,"w"))
144 writer.write_file(myRecords)
145
146 Or, follow the sequential file writer system, for example:
147
148 myWriter = FastaWriter(open(filename,"w"))
149 writer.write_header() # does nothing for Fasta files
150 ...
151 Multiple calls to writer.write_record() and/or writer.write_records()
152 ...
153 writer.write_footer() # does nothing for Fasta files
154 writer.close()
155 """
156 SequentialSequenceWriter.__init__(self, handle)
157
158 self.wrap = None
159 if wrap:
160 if wrap < 1:
161 raise ValueError
162 self.wrap = wrap
163 self.record2title = record2title
164
200
201 if __name__ == "__main__":
202 print "Running quick self test"
203
204 import os
205 from Bio.Alphabet import generic_protein, generic_nucleotide
206
207
208
209 fna_filename = "NC_005213.fna"
210 faa_filename = "NC_005213.faa"
211
217
219
220
221 print "ID:" + record.id
222 print "Name:" + record.name
223 print "Descr:" + record.description
224 print record.seq
225 for feature in record.annotations:
226 print '/%s=%s' % (feature, record.annotations[feature])
227 if record.dbxrefs:
228 print "Database cross references:"
229 for x in record.dbxrefs:
230 print " - %s" % x
231
232 if os.path.isfile(fna_filename):
233 print "--------"
234 print "FastaIterator (single sequence)"
235 iterator = FastaIterator(open(fna_filename, "r"), alphabet=generic_nucleotide, title2ids=genbank_name_function)
236 count = 0
237 for record in iterator:
238 count += 1
239 print_record(record)
240 assert count == 1
241 print str(record.__class__)
242
243 if os.path.isfile(faa_filename):
244 print "--------"
245 print "FastaIterator (multiple sequences)"
246 iterator = FastaIterator(open(faa_filename, "r"), alphabet=generic_protein, title2ids=genbank_name_function)
247 count = 0
248 for record in iterator:
249 count += 1
250 print_record(record)
251 break
252 assert count > 0
253 print str(record.__class__)
254
255 from cStringIO import StringIO
256 print "--------"
257 print "FastaIterator (empty input file)"
258
259 iterator = FastaIterator(StringIO(""))
260 count = 0
261 for record in iterator:
262 count += 1
263 assert count == 0
264
265 print "Done"
266