1
2
3
4
5 """
6 Parser for ACE files output by PHRAP.
7
8 Written by Frank Kauff (fkauff@duke.edu) and
9 Cymon J. Cox (cymon@duke.edu)
10
11 Usage:
12
13 There are two ways of reading an ace file:
14 1) The function 'read' reads the whole file at once;
15 2) The function 'parse' reads the file contig after contig.
16
17 1) Parse whole ace file at once:
18
19 from Bio.Sequencing import Ace
20 acefilerecord=Ace.read(open('my_ace_file.ace'))
21
22 This gives you:
23 acefilerecord.ncontigs (the number of contigs in the ace file)
24 acefilerecord.nreads (the number of reads in the ace file)
25 acefilerecord.contigs[] (one instance of the Contig class for each contig)
26
27 The Contig class holds the info of the CO tag, CT and WA tags, and all the reads used
28 for this contig in a list of instances of the Read class, e.g.:
29
30 contig3=acefilerecord.contigs[2]
31 read4=contig3.reads[3]
32 RD_of_read4=read4.rd
33 DS_of_read4=read4.ds
34
35 CT, WA, RT tags from the end of the file can appear anywhere are automatically
36 sorted into the right place.
37
38 see _RecordConsumer for details.
39
40 2) Or you can iterate over the contigs of an ace file one by one in the ususal way:
41
42 from Bio.Sequencing import Ace
43 contigs=Ace.parse(open('my_ace_file.ace'))
44 for contig in contigs:
45 print contig.name
46 ...
47
48 Please note that for memory efficiency, when using the iterator approach, only one
49 contig is kept in memory at once. However, there can be a footer to the ACE file
50 containing WA, CT, RT or WR tags which contain additional meta-data on the contigs.
51 Because the parser doesn't see this data until the final record, it cannot be added to
52 the appropriate records. Instead these tags will be returned with the last contig record.
53 Thus an ace file does not entirerly suit the concept of iterating. If WA, CT, RT, WR tags
54 are needed, the 'read' function rather than the 'parse' function might be more appropriate.
55 """
56
57
59 """RD (reads), store a read with its name, sequence etc.
60
61 The location and strand each read is mapped to is held in the AF lines.
62 """
64 self.name=''
65 self.padded_bases=None
66 self.info_items=None
67 self.read_tags=None
68 self.sequence=''
69
71 """QA (read quality), including which part if any was used as the consensus."""
73 self.qual_clipping_start=None
74 self.qual_clipping_end=None
75 self.align_clipping_start=None
76 self.align_clipping_end=None
77 if line:
78 header=map(eval,line.split()[1:])
79 self.qual_clipping_start=header[0]
80 self.qual_clipping_end=header[1]
81 self.align_clipping_start=header[2]
82 self.align_clipping_end=header[3]
83
85 """DS lines, include file name of a read's chromatogram file."""
87 self.chromat_file=''
88 self.phd_file=''
89 self.time=''
90 self.chem=''
91 self.dye=''
92 self.template=''
93 self.direction=''
94 if line:
95 tags=['CHROMAT_FILE','PHD_FILE','TIME','CHEM','DYE','TEMPLATE','DIRECTION']
96 poss=map(line.find,tags)
97 tagpos=dict(zip(poss,tags))
98 if -1 in tagpos:
99 del tagpos[-1]
100 ps=tagpos.keys()
101 ps.sort()
102 for (p1,p2) in zip(ps,ps[1:]+[len(line)+1]):
103 setattr(self,tagpos[p1].lower(),line[p1+len(tagpos[p1])+1:p2].strip())
104
105
107 """AF lines, define the location of the read within the contig.
108
109 Note attribute coru is short for complemented (C) or uncomplemented (U),
110 since the strand information is stored in an ACE file using either the
111 C or U character.
112 """
114 self.name=''
115 self.coru=None
116 self.padded_start=None
117 if line:
118 header = line.split()
119 self.name = header[1]
120 self.coru = header[2]
121 self.padded_start = int(header[3])
122
124 """"BS (base segment), which read was chosen as the consensus at each position."""
126 self.name=''
127 self.padded_start=None
128 self.padded_end=None
129 if line:
130 header = line.split()
131 self.padded_start = int(header[1])
132 self.padded_end = int(header[2])
133 self.name = header[3]
134
136 """RT (transient read tags), generated by crossmatch and phrap."""
138 self.name=''
139 self.tag_type=''
140 self.program=''
141 self.padded_start=None
142 self.padded_end=None
143 self.date=''
144 self.comment=[]
145 if line:
146 header=line.split()
147 self.name=header[0]
148 self.tag_type=header[1]
149 self.program=header[2]
150 self.padded_start=int(header[3])
151 self.padded_end=int(header[4])
152 self.date=header[5]
153
155 """CT (consensus tags)."""
157 self.name=''
158 self.tag_type=''
159 self.program=''
160 self.padded_start=None
161 self.padded_end=None
162 self.date=''
163 self.notrans=''
164 self.info=[]
165 self.comment=[]
166 if line:
167 header=line.split()
168 self.name = header[0]
169 self.tag_type = header[1]
170 self.program = header[2]
171 self.padded_start = int(header[3])
172 self.padded_end = int(header[4])
173 self.date = header[5]
174 if len(header)==7:
175 self.notrans = header[6]
176
178 """WA (whole assembly tag), holds the assembly program name, version, etc."""
180 self.tag_type=''
181 self.program=''
182 self.date=''
183 self.info=[]
184 if line:
185 header = line.split()
186 self.tag_type = header[0]
187 self.program = header[1]
188 self.date = header[2]
189
203
205 """Holds information about a read supporting an ACE contig."""
219
221 """Holds information about a contig from an ACE record."""
223 self.name = ''
224 self.nbases=None
225 self.nreads=None
226 self.nsegments=None
227 self.uorc=None
228 self.sequence=""
229 self.quality=[]
230 self.af=[]
231 self.bs=[]
232 self.reads=[]
233 self.ct=None
234 self.wa=None
235 if line:
236 header = line.split()
237 self.name = header[1]
238 self.nbases = int(header[2])
239 self.nreads = int(header[3])
240 self.nsegments = int(header[4])
241 self.uorc = header[5]
242
244 """parse(handle)
245
246 where handle is a file-like object.
247
248 This function returns an iterator that allows you to iterate
249 over the ACE file record by record:
250
251 records = parse(handle)
252 for record in records:
253 # do something with the record
254
255 where each record is a Contig object.
256 """
257
258 handle = iter(handle)
259
260 line = ""
261 while True:
262
263 try:
264 while True:
265 if line.startswith('CO'):
266 break
267 line = handle.next()
268 except StopIteration:
269 return
270
271 record = Contig(line)
272
273 for line in handle:
274 line = line.strip()
275 if not line:
276 break
277 record.sequence+=line
278
279 for line in handle:
280 if line.strip():
281 break
282 if not line.startswith("BQ"):
283 raise ValueError("Failed to find BQ line")
284
285 for line in handle:
286 if not line.strip():
287 break
288 record.quality.extend(map(int,line.split()))
289
290 for line in handle:
291 if line.strip():
292 break
293
294 while True:
295 if not line.startswith("AF "):
296 break
297 record.af.append(af(line))
298 try:
299 line = handle.next()
300 except StopIteration:
301 raise ValueError("Unexpected end of AF block")
302
303 while True:
304 if line.strip():
305 break
306 try:
307 line = handle.next()
308 except StopIteration:
309 raise ValueError("Unexpected end of file")
310
311 while True:
312 if not line.startswith("BS "):
313 break
314 record.bs.append(bs(line))
315 try:
316 line = handle.next()
317 except StopIteration:
318 raise ValueError("Failed to find end of BS block")
319
320
321
322
323
324
325
326 while True:
327
328
329 try:
330 while True:
331
332 if line.startswith("RD "):
333 break
334 line = handle.next()
335 except StopIteration:
336 raise ValueError("Failed to find RD line")
337
338 record.reads.append(Reads(line))
339
340 for line in handle:
341 line = line.strip()
342 if not line:
343 break
344 record.reads[-1].rd.sequence+=line
345
346 for line in handle:
347 if line.strip():
348 break
349 if not line.startswith("QA "):
350 raise ValueError("Failed to find QA line")
351 record.reads[-1].qa = qa(line)
352
353
354 for line in handle:
355 if line.strip():
356 break
357 else:
358 break
359
360 if line.startswith("DS "):
361 record.reads[-1].ds = ds(line)
362 line = ""
363
364
365 while True:
366
367 try:
368 while True:
369 if line.strip():
370 break
371 line = handle.next()
372 except StopIteration:
373
374 break
375 if line.startswith("RT{"):
376
377
378
379 if record.reads[-1].rt is None:
380 record.reads[-1].rt=[]
381 for line in handle:
382 line=line.strip()
383
384 if line.startswith("COMMENT{"):
385 if line[8:].strip():
386
387 record.reads[-1].rt[-1].comment.append(line[8:])
388 for line in handle:
389 line = line.strip()
390 if line.endswith("C}"):
391 break
392 record.reads[-1].rt[-1].comment.append(line)
393 elif line=='}':
394 break
395 else:
396 record.reads[-1].rt.append(rt(line))
397 line = ""
398 elif line.startswith("WR{"):
399 if record.reads[-1].wr is None:
400 record.reads[-1].wr=[]
401 for line in handle:
402 line=line.strip()
403 if line=='}': break
404 record.reads[-1].wr.append(wr(line))
405 line = ""
406 elif line.startswith("WA{"):
407 if record.wa is None:
408 record.wa=[]
409 try:
410 line = handle.next()
411 except StopIteration:
412 raise ValueError("Failed to read WA block")
413 record.wa.append(wa(line))
414 for line in handle:
415 line=line.strip()
416 if line=='}': break
417 record.wa[-1].info.append(line)
418 line = ""
419 elif line.startswith("CT{"):
420 if record.ct is None:
421 record.ct=[]
422 try:
423 line = handle.next()
424 except StopIteration:
425 raise ValueError("Failed to read CT block")
426 record.ct.append(ct(line))
427 for line in handle:
428 line=line.strip()
429 if line=="COMMENT{":
430 for line in handle:
431 line = line.strip()
432 if line.endswith("C}"):
433 break
434 record.ct[-1].comment.append(line)
435 elif line=='}':
436 break
437 else:
438 record.ct[-1].info.append(line)
439 line = ""
440 else:
441 break
442
443 if not line.startswith('RD'):
444 break
445
446 yield record
447
449 """Holds data of an ACE file.
450 """
452 self.ncontigs=None
453 self.nreads=None
454 self.contigs=[]
455 self.wa=None
456
458 """Sorts wr, rt and ct tags into the appropriate contig / read instance, if possible. """
459
460 ct=[]
461 rt=[]
462 wr=[]
463
464 for i in range(len(self.contigs)):
465 c = self.contigs[i]
466 if c.wa:
467 if not self.wa:
468 self.wa=[]
469 self.wa.extend(c.wa)
470 if c.ct:
471 newcts=[ct_tag for ct_tag in c.ct if ct_tag.name!=c.name]
472 for x in newcts:
473 self.contigs[i].ct.remove(x)
474 ct.extend(newcts)
475 for j in range(len(c.reads)):
476 r = c.reads[j]
477 if r.rt:
478 newrts=[rt_tag for rt_tag in r.rt if rt_tag.name!=r.rd.name]
479 for x in newrts:
480 self.contigs[i].reads[j].rt.remove(x)
481 rt.extend(newrts)
482 if r.wr:
483 newwrs=[wr_tag for wr_tag in r.wr if wr_tag.name!=r.rd.name]
484 for x in newwrs:
485 self.contigs[i].reads[j].wr.remove(x)
486 wr.extend(newwrs)
487
488 for i in range(len(self.contigs)):
489 c = self.contigs[i]
490 for ct_tag in ct:
491 if ct_tag.name==c.name:
492 if self.contigs[i].ct is None:
493 self.contigs[i].ct=[]
494 self.contigs[i].ct.append(ct_tag)
495 if rt or wr:
496 for j in range(len(c.reads)):
497 r = c.reads[j]
498 for rt_tag in rt:
499 if rt_tag.name==r.rd.name:
500 if self.contigs[i].reads[j].rt is None:
501 self.contigs[i].reads[j].rt=[]
502 self.contigs[i].reads[j].rt.append(rt_tag)
503 for wr_tag in wr:
504 if wr_tag.name==r.rd.name:
505 if self.contigs[i].reads[j].wr is None:
506 self.contigs[i].reads[j].wr=[]
507 self.contigs[i].reads[j].wr.append(wr_tag)
508
510 """Parses the full ACE file in list of contigs.
511
512 """
513
514 handle = iter(handle)
515
516 record=ACEFileRecord()
517
518 try:
519 line = handle.next()
520 except StopIteration:
521 raise ValueError("Premature end of file")
522
523
524 if not line.startswith('AS'):
525 raise ValueError("File does not start with 'AS'.")
526
527 words = line.split()
528 record.ncontigs, record.nreads = map(int, words[1:3])
529
530
531 record.contigs = list(parse(handle))
532
533
534
535
536
537
538 record.sort()
539 return record
540