1
2
3
4
5 """
6 Parser for ACE files output by PHRAP.
7
8 Written by Frank Kauff (fkauff@duke.edu) and
9 Cymon J. Cox (cymon@duke.edu)
10
11 Usage:
12
13 There are two ways of reading an ace file:
14 1) The function 'read' reads the whole file at once;
15 2) The function 'parse' reads the file contig after contig.
16
17 1) Parse whole ace file at once:
18
19 from Bio.Sequencing import Ace
20 acefilerecord=Ace.read(open('my_ace_file.ace'))
21
22 This gives you:
23 acefilerecord.ncontigs (the number of contigs in the ace file)
24 acefilerecord.nreads (the number of reads in the ace file)
25 acefilerecord.contigs[] (one instance of the Contig class for each contig)
26
27 The Contig class holds the info of the CO tag, CT and WA tags, and all the reads used
28 for this contig in a list of instances of the Read class, e.g.:
29
30 contig3=acefilerecord.contigs[2]
31 read4=contig3.reads[3]
32 RD_of_read4=read4.rd
33 DS_of_read4=read4.ds
34
35 CT, WA, RT tags from the end of the file can appear anywhere are automatically
36 sorted into the right place.
37
38 see _RecordConsumer for details.
39
40 2) Or you can iterate over the contigs of an ace file one by one in the ususal way:
41
42 from Bio.Sequencing import Ace
43 contigs=Ace.parse(open('my_ace_file.ace'))
44 for contig in contigs:
45 print contig.name
46 ...
47
48 Please note that for memory efficiency, when using the iterator approach, only one
49 contig is kept in memory at once. However, there can be a footer to the ACE file
50 containing WA, CT, RT or WR tags which contain additional meta-data on the contigs.
51 Because the parser doesn't see this data until the final record, it cannot be added to
52 the appropriate records. Instead these tags will be returned with the last contig record.
53 Thus an ace file does not entirerly suit the concept of iterating. If WA, CT, RT, WR tags
54 are needed, the 'read' function rather than the 'parse' function might be more appropriate.
55 """
56
57
59 """RD (reads), store a read with its name, sequence etc.
60
61 The location and strand each read is mapped to is held in the AF lines.
62 """
64 self.name = ''
65 self.padded_bases = None
66 self.info_items = None
67 self.read_tags = None
68 self.sequence = ''
69
70
72 """QA (read quality), including which part if any was used as the consensus."""
74 self.qual_clipping_start = None
75 self.qual_clipping_end = None
76 self.align_clipping_start = None
77 self.align_clipping_end = None
78 if line:
79 header = map(eval, line.split()[1:])
80 self.qual_clipping_start = header[0]
81 self.qual_clipping_end = header[1]
82 self.align_clipping_start = header[2]
83 self.align_clipping_end = header[3]
84
85
87 """DS lines, include file name of a read's chromatogram file."""
89 self.chromat_file = ''
90 self.phd_file = ''
91 self.time = ''
92 self.chem = ''
93 self.dye = ''
94 self.template = ''
95 self.direction = ''
96 if line:
97 tags = ['CHROMAT_FILE', 'PHD_FILE', 'TIME', 'CHEM', 'DYE', 'TEMPLATE', 'DIRECTION']
98 poss = map(line.find, tags)
99 tagpos = dict(zip(poss, tags))
100 if -1 in tagpos:
101 del tagpos[-1]
102 ps = tagpos.keys()
103 ps.sort()
104 for (p1, p2) in zip(ps, ps[1:]+[len(line)+1]):
105 setattr(self, tagpos[p1].lower(), line[p1+len(tagpos[p1])+1:p2].strip())
106
107
109 """AF lines, define the location of the read within the contig.
110
111 Note attribute coru is short for complemented (C) or uncomplemented (U),
112 since the strand information is stored in an ACE file using either the
113 C or U character.
114 """
116 self.name = ''
117 self.coru = None
118 self.padded_start = None
119 if line:
120 header = line.split()
121 self.name = header[1]
122 self.coru = header[2]
123 self.padded_start = int(header[3])
124
125
127 """BS (base segment), which read was chosen as the consensus at each position."""
129 self.name = ''
130 self.padded_start = None
131 self.padded_end = None
132 if line:
133 header = line.split()
134 self.padded_start = int(header[1])
135 self.padded_end = int(header[2])
136 self.name = header[3]
137
138
140 """RT (transient read tags), generated by crossmatch and phrap."""
142 self.name = ''
143 self.tag_type = ''
144 self.program = ''
145 self.padded_start = None
146 self.padded_end = None
147 self.date = ''
148 self.comment = []
149 if line:
150 header = line.split()
151 self.name = header[0]
152 self.tag_type = header[1]
153 self.program = header[2]
154 self.padded_start = int(header[3])
155 self.padded_end = int(header[4])
156 self.date = header[5]
157
158
160 """CT (consensus tags)."""
162 self.name = ''
163 self.tag_type = ''
164 self.program = ''
165 self.padded_start = None
166 self.padded_end = None
167 self.date = ''
168 self.notrans = ''
169 self.info = []
170 self.comment = []
171 if line:
172 header = line.split()
173 self.name = header[0]
174 self.tag_type = header[1]
175 self.program = header[2]
176 self.padded_start = int(header[3])
177 self.padded_end = int(header[4])
178 self.date = header[5]
179 if len(header) == 7:
180 self.notrans = header[6]
181
182
184 """WA (whole assembly tag), holds the assembly program name, version, etc."""
186 self.tag_type = ''
187 self.program = ''
188 self.date = ''
189 self.info = []
190 if line:
191 header = line.split()
192 self.tag_type = header[0]
193 self.program = header[1]
194 self.date = header[2]
195
196
210
211
213 """Holds information about a read supporting an ACE contig."""
227
228
230 """Holds information about a contig from an ACE record."""
232 self.name = ''
233 self.nbases = None
234 self.nreads = None
235 self.nsegments = None
236 self.uorc = None
237 self.sequence = ""
238 self.quality = []
239 self.af = []
240 self.bs = []
241 self.reads = []
242 self.ct = None
243 self.wa = None
244 if line:
245 header = line.split()
246 self.name = header[1]
247 self.nbases = int(header[2])
248 self.nreads = int(header[3])
249 self.nsegments = int(header[4])
250 self.uorc = header[5]
251
252
254 """parse(handle)
255
256 where handle is a file-like object.
257
258 This function returns an iterator that allows you to iterate
259 over the ACE file record by record:
260
261 records = parse(handle)
262 for record in records:
263 # do something with the record
264
265 where each record is a Contig object.
266 """
267
268 handle = iter(handle)
269
270 line = ""
271 while True:
272
273 try:
274 while True:
275 if line.startswith('CO'):
276 break
277 line = handle.next()
278 except StopIteration:
279 return
280
281 record = Contig(line)
282
283 for line in handle:
284 line = line.strip()
285 if not line:
286 break
287 record.sequence += line
288
289 for line in handle:
290 if line.strip():
291 break
292 if not line.startswith("BQ"):
293 raise ValueError("Failed to find BQ line")
294
295 for line in handle:
296 if not line.strip():
297 break
298 record.quality.extend(map(int, line.split()))
299
300 for line in handle:
301 if line.strip():
302 break
303
304 while True:
305 if not line.startswith("AF "):
306 break
307 record.af.append(af(line))
308 try:
309 line = handle.next()
310 except StopIteration:
311 raise ValueError("Unexpected end of AF block")
312
313 while True:
314 if line.strip():
315 break
316 try:
317 line = handle.next()
318 except StopIteration:
319 raise ValueError("Unexpected end of file")
320
321 while True:
322 if not line.startswith("BS "):
323 break
324 record.bs.append(bs(line))
325 try:
326 line = handle.next()
327 except StopIteration:
328 raise ValueError("Failed to find end of BS block")
329
330
331
332
333
334
335
336 while True:
337
338
339 try:
340 while True:
341
342 if line.startswith("RD "):
343 break
344 line = handle.next()
345 except StopIteration:
346 raise ValueError("Failed to find RD line")
347
348 record.reads.append(Reads(line))
349
350 for line in handle:
351 line = line.strip()
352 if not line:
353 break
354 record.reads[-1].rd.sequence += line
355
356 for line in handle:
357 if line.strip():
358 break
359 if not line.startswith("QA "):
360 raise ValueError("Failed to find QA line")
361 record.reads[-1].qa = qa(line)
362
363
364 for line in handle:
365 if line.strip():
366 break
367 else:
368 break
369
370 if line.startswith("DS "):
371 record.reads[-1].ds = ds(line)
372 line = ""
373
374
375 while True:
376
377 try:
378 while True:
379 if line.strip():
380 break
381 line = handle.next()
382 except StopIteration:
383
384 break
385 if line.startswith("RT{"):
386
387
388
389 if record.reads[-1].rt is None:
390 record.reads[-1].rt = []
391 for line in handle:
392 line = line.strip()
393
394 if line.startswith("COMMENT{"):
395 if line[8:].strip():
396
397 record.reads[-1].rt[-1].comment.append(line[8:])
398 for line in handle:
399 line = line.strip()
400 if line.endswith("C}"):
401 break
402 record.reads[-1].rt[-1].comment.append(line)
403 elif line == '}':
404 break
405 else:
406 record.reads[-1].rt.append(rt(line))
407 line = ""
408 elif line.startswith("WR{"):
409 if record.reads[-1].wr is None:
410 record.reads[-1].wr = []
411 for line in handle:
412 line = line.strip()
413 if line == '}':
414 break
415 record.reads[-1].wr.append(wr(line))
416 line = ""
417 elif line.startswith("WA{"):
418 if record.wa is None:
419 record.wa = []
420 try:
421 line = handle.next()
422 except StopIteration:
423 raise ValueError("Failed to read WA block")
424 record.wa.append(wa(line))
425 for line in handle:
426 line = line.strip()
427 if line == '}':
428 break
429 record.wa[-1].info.append(line)
430 line = ""
431 elif line.startswith("CT{"):
432 if record.ct is None:
433 record.ct = []
434 try:
435 line = handle.next()
436 except StopIteration:
437 raise ValueError("Failed to read CT block")
438 record.ct.append(ct(line))
439 for line in handle:
440 line = line.strip()
441 if line == "COMMENT{":
442 for line in handle:
443 line = line.strip()
444 if line.endswith("C}"):
445 break
446 record.ct[-1].comment.append(line)
447 elif line == '}':
448 break
449 else:
450 record.ct[-1].info.append(line)
451 line = ""
452 else:
453 break
454
455 if not line.startswith('RD'):
456 break
457
458 yield record
459
460
462 """Holds data of an ACE file.
463 """
465 self.ncontigs = None
466 self.nreads = None
467 self.contigs = []
468 self.wa = None
469
471 """Sorts wr, rt and ct tags into the appropriate contig / read instance, if possible."""
472
473 ct = []
474 rt = []
475 wr = []
476
477 for i in range(len(self.contigs)):
478 c = self.contigs[i]
479 if c.wa:
480 if not self.wa:
481 self.wa = []
482 self.wa.extend(c.wa)
483 if c.ct:
484 newcts = [ct_tag for ct_tag in c.ct if ct_tag.name != c.name]
485 for x in newcts:
486 self.contigs[i].ct.remove(x)
487 ct.extend(newcts)
488 for j in range(len(c.reads)):
489 r = c.reads[j]
490 if r.rt:
491 newrts = [rt_tag for rt_tag in r.rt if rt_tag.name != r.rd.name]
492 for x in newrts:
493 self.contigs[i].reads[j].rt.remove(x)
494 rt.extend(newrts)
495 if r.wr:
496 newwrs = [wr_tag for wr_tag in r.wr if wr_tag.name != r.rd.name]
497 for x in newwrs:
498 self.contigs[i].reads[j].wr.remove(x)
499 wr.extend(newwrs)
500
501 for i in range(len(self.contigs)):
502 c = self.contigs[i]
503 for ct_tag in ct:
504 if ct_tag.name == c.name:
505 if self.contigs[i].ct is None:
506 self.contigs[i].ct = []
507 self.contigs[i].ct.append(ct_tag)
508 if rt or wr:
509 for j in range(len(c.reads)):
510 r = c.reads[j]
511 for rt_tag in rt:
512 if rt_tag.name == r.rd.name:
513 if self.contigs[i].reads[j].rt is None:
514 self.contigs[i].reads[j].rt = []
515 self.contigs[i].reads[j].rt.append(rt_tag)
516 for wr_tag in wr:
517 if wr_tag.name == r.rd.name:
518 if self.contigs[i].reads[j].wr is None:
519 self.contigs[i].reads[j].wr = []
520 self.contigs[i].reads[j].wr.append(wr_tag)
521
522
524 """Parses the full ACE file in list of contigs.
525
526 """
527
528 handle = iter(handle)
529
530 record = ACEFileRecord()
531
532 try:
533 line = handle.next()
534 except StopIteration:
535 raise ValueError("Premature end of file")
536
537
538 if not line.startswith('AS'):
539 raise ValueError("File does not start with 'AS'.")
540
541 words = line.split()
542 record.ncontigs, record.nreads = map(int, words[1:3])
543
544
545 record.contigs = list(parse(handle))
546
547
548
549
550
551
552 record.sort()
553 return record
554