Package Bio :: Package Sequencing :: Module Ace
[hide private]
[frames] | no frames]

Source Code for Module Bio.Sequencing.Ace

  1  # Copyright 2004 by Frank Kauff and Cymon J. Cox.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """ 
  6  Parser for ACE files output by PHRAP. 
  7   
  8  Written by Frank Kauff (fkauff@duke.edu) and 
  9  Cymon J. Cox (cymon@duke.edu) 
 10   
 11  Usage: 
 12   
 13  There are two ways of reading an ace file: 
 14  1) The function 'read' reads the whole file at once; 
 15  2) The function 'parse' reads the file contig after contig. 
 16       
 17  1) Parse whole ace file at once: 
 18   
 19          from Bio.Sequencing import Ace 
 20          acefilerecord=Ace.read(open('my_ace_file.ace')) 
 21   
 22  This gives you: 
 23          acefilerecord.ncontigs (the number of contigs in the ace file) 
 24          acefilerecord.nreads (the number of reads in the ace file) 
 25          acefilerecord.contigs[] (one instance of the Contig class for each contig) 
 26   
 27  The Contig class holds the info of the CO tag, CT and WA tags, and all the reads used 
 28  for this contig in a list of instances of the Read class, e.g.: 
 29   
 30          contig3=acefilerecord.contigs[2] 
 31          read4=contig3.reads[3] 
 32          RD_of_read4=read4.rd 
 33          DS_of_read4=read4.ds 
 34   
 35  CT, WA, RT tags from the end of the file can appear anywhere are automatically 
 36  sorted into the right place. 
 37   
 38  see _RecordConsumer for details. 
 39   
 40  2) Or you can iterate over the contigs of an ace file one by one in the ususal way: 
 41   
 42          from Bio.Sequencing import Ace 
 43          contigs=Ace.parse(open('my_ace_file.ace')) 
 44          for contig in contigs: 
 45              print contig.name 
 46              ... 
 47   
 48  Please note that for memory efficiency, when using the iterator approach, only one 
 49  contig is kept in memory at once.  However, there can be a footer to the ACE file 
 50  containing WA, CT, RT or WR tags which contain additional meta-data on the contigs. 
 51  Because the parser doesn't see this data until the final record, it cannot be added to 
 52  the appropriate records.  Instead these tags will be returned with the last contig record. 
 53  Thus an ace file does not entirerly suit the concept of iterating. If WA, CT, RT, WR tags 
 54  are needed, the 'read' function rather than the 'parse' function might be more appropriate. 
 55  """ 
 56   
 57   
58 -class rd(object):
59 """RD (reads), store a read with its name, sequence etc. 60 61 The location and strand each read is mapped to is held in the AF lines. 62 """
63 - def __init__(self):
64 self.name='' 65 self.padded_bases=None 66 self.info_items=None 67 self.read_tags=None 68 self.sequence=''
69
70 -class qa(object):
71 """QA (read quality), including which part if any was used as the consensus."""
72 - def __init__(self, line=None):
73 self.qual_clipping_start=None 74 self.qual_clipping_end=None 75 self.align_clipping_start=None 76 self.align_clipping_end=None 77 if line: 78 header=map(eval,line.split()[1:]) 79 self.qual_clipping_start=header[0] 80 self.qual_clipping_end=header[1] 81 self.align_clipping_start=header[2] 82 self.align_clipping_end=header[3]
83
84 -class ds(object):
85 """DS lines, include file name of a read's chromatogram file."""
86 - def __init__(self, line=None):
87 self.chromat_file='' 88 self.phd_file='' 89 self.time='' 90 self.chem='' 91 self.dye='' 92 self.template='' 93 self.direction='' 94 if line: 95 tags=['CHROMAT_FILE','PHD_FILE','TIME','CHEM','DYE','TEMPLATE','DIRECTION'] 96 poss=map(line.find,tags) 97 tagpos=dict(zip(poss,tags)) 98 if -1 in tagpos: 99 del tagpos[-1] 100 ps=tagpos.keys() 101 ps.sort() 102 for (p1,p2) in zip(ps,ps[1:]+[len(line)+1]): 103 setattr(self,tagpos[p1].lower(),line[p1+len(tagpos[p1])+1:p2].strip())
104 105
106 -class af(object):
107 """AF lines, define the location of the read within the contig. 108 109 Note attribute coru is short for complemented (C) or uncomplemented (U), 110 since the strand information is stored in an ACE file using either the 111 C or U character. 112 """
113 - def __init__(self, line=None):
114 self.name='' 115 self.coru=None 116 self.padded_start=None 117 if line: 118 header = line.split() 119 self.name = header[1] 120 self.coru = header[2] 121 self.padded_start = int(header[3])
122
123 -class bs(object):
124 """"BS (base segment), which read was chosen as the consensus at each position."""
125 - def __init__(self, line=None):
126 self.name='' 127 self.padded_start=None 128 self.padded_end=None 129 if line: 130 header = line.split() 131 self.padded_start = int(header[1]) 132 self.padded_end = int(header[2]) 133 self.name = header[3]
134
135 -class rt(object):
136 """RT (transient read tags), generated by crossmatch and phrap."""
137 - def __init__(self, line=None):
138 self.name='' 139 self.tag_type='' 140 self.program='' 141 self.padded_start=None 142 self.padded_end=None 143 self.date='' 144 self.comment=[] 145 if line: 146 header=line.split() 147 self.name=header[0] 148 self.tag_type=header[1] 149 self.program=header[2] 150 self.padded_start=int(header[3]) 151 self.padded_end=int(header[4]) 152 self.date=header[5]
153
154 -class ct(object):
155 """CT (consensus tags)."""
156 - def __init__(self, line=None):
157 self.name='' 158 self.tag_type='' 159 self.program='' 160 self.padded_start=None 161 self.padded_end=None 162 self.date='' 163 self.notrans='' 164 self.info=[] 165 self.comment=[] 166 if line: 167 header=line.split() 168 self.name = header[0] 169 self.tag_type = header[1] 170 self.program = header[2] 171 self.padded_start = int(header[3]) 172 self.padded_end = int(header[4]) 173 self.date = header[5] 174 if len(header)==7: 175 self.notrans = header[6]
176
177 -class wa(object):
178 """WA (whole assembly tag), holds the assembly program name, version, etc."""
179 - def __init__(self, line=None):
180 self.tag_type='' 181 self.program='' 182 self.date='' 183 self.info=[] 184 if line: 185 header = line.split() 186 self.tag_type = header[0] 187 self.program = header[1] 188 self.date = header[2]
189
190 -class wr(object):
191 """WR lines."""
192 - def __init__(self, line=None):
193 self.name='' 194 self.aligned='' 195 self.program='' 196 self.date=[] 197 if line: 198 header = line.split() 199 self.name = header[0] 200 self.aligned = header[1] 201 self.program = header[2] 202 self.date = header[3]
203
204 -class Reads(object):
205 """Holds information about a read supporting an ACE contig."""
206 - def __init__(self, line=None):
207 self.rd=None # one per read 208 self.qa=None # one per read 209 self.ds=None # none or one per read 210 self.rt=None # none or many per read 211 self.wr=None # none or many per read 212 if line: 213 self.rd = rd() 214 header = line.split() 215 self.rd.name = header[1] 216 self.rd.padded_bases = int(header[2]) 217 self.rd.info_items = int(header[3]) 218 self.rd.read_tags = int(header[4])
219
220 -class Contig(object):
221 """Holds information about a contig from an ACE record."""
222 - def __init__(self, line=None):
223 self.name = '' 224 self.nbases=None 225 self.nreads=None 226 self.nsegments=None 227 self.uorc=None 228 self.sequence="" 229 self.quality=[] 230 self.af=[] 231 self.bs=[] 232 self.reads=[] 233 self.ct=None # none or many 234 self.wa=None # none or many 235 if line: 236 header = line.split() 237 self.name = header[1] 238 self.nbases = int(header[2]) 239 self.nreads = int(header[3]) 240 self.nsegments = int(header[4]) 241 self.uorc = header[5]
242
243 -def parse(handle):
244 """parse(handle) 245 246 where handle is a file-like object. 247 248 This function returns an iterator that allows you to iterate 249 over the ACE file record by record: 250 251 records = parse(handle) 252 for record in records: 253 # do something with the record 254 255 where each record is a Contig object. 256 """ 257 258 handle = iter(handle) 259 260 line = "" 261 while True: 262 # at beginning, skip the AS and look for first CO command 263 try: 264 while True: 265 if line.startswith('CO'): 266 break 267 line = handle.next() 268 except StopIteration: 269 return 270 271 record = Contig(line) 272 273 for line in handle: 274 line = line.strip() 275 if not line: 276 break 277 record.sequence+=line 278 279 for line in handle: 280 if line.strip(): 281 break 282 if not line.startswith("BQ"): 283 raise ValueError("Failed to find BQ line") 284 285 for line in handle: 286 if not line.strip(): 287 break 288 record.quality.extend(map(int,line.split())) 289 290 for line in handle: 291 if line.strip(): 292 break 293 294 while True: 295 if not line.startswith("AF "): 296 break 297 record.af.append(af(line)) 298 try: 299 line = handle.next() 300 except StopIteration: 301 raise ValueError("Unexpected end of AF block") 302 303 while True: 304 if line.strip(): 305 break 306 try: 307 line = handle.next() 308 except StopIteration: 309 raise ValueError("Unexpected end of file") 310 311 while True: 312 if not line.startswith("BS "): 313 break 314 record.bs.append(bs(line)) 315 try: 316 line = handle.next() 317 except StopIteration: 318 raise ValueError("Failed to find end of BS block") 319 320 # now read all the read data 321 # it starts with a 'RD', and then a mandatory QA 322 # then follows an optional DS 323 # CT,RT,WA,WR may or may not be there in unlimited quantity. They might refer to the actual read or contig, 324 # or, if encountered at the end of file, to any previous read or contig. the sort() method deals 325 # with that later. 326 while True: 327 328 # each read must have a rd and qa 329 try: 330 while True: 331 # If I've met the condition, then stop reading the line. 332 if line.startswith("RD "): 333 break 334 line = handle.next() 335 except StopIteration: 336 raise ValueError("Failed to find RD line") 337 338 record.reads.append(Reads(line)) 339 340 for line in handle: 341 line = line.strip() 342 if not line: 343 break 344 record.reads[-1].rd.sequence+=line 345 346 for line in handle: 347 if line.strip(): 348 break 349 if not line.startswith("QA "): 350 raise ValueError("Failed to find QA line") 351 record.reads[-1].qa = qa(line) 352 353 # now one ds can follow 354 for line in handle: 355 if line.strip(): 356 break 357 else: 358 break 359 360 if line.startswith("DS "): 361 record.reads[-1].ds = ds(line) 362 line = "" 363 # the file could just end, or there's some more stuff. In ace files, anything can happen. 364 # the following tags are interspersed between reads and can appear multiple times. 365 while True: 366 # something left 367 try: 368 while True: 369 if line.strip(): 370 break 371 line = handle.next() 372 except StopIteration: 373 # file ends here 374 break 375 if line.startswith("RT{"): 376 # now if we're at the end of the file, this rt could 377 # belong to a previous read, not the actual one. 378 # we store it here were it appears, the user can sort later. 379 if record.reads[-1].rt is None: 380 record.reads[-1].rt=[] 381 for line in handle: 382 line=line.strip() 383 #if line=="COMMENT{": 384 if line.startswith("COMMENT{"): 385 if line[8:].strip(): 386 #MIRA 3.0.5 would miss the new line out :( 387 record.reads[-1].rt[-1].comment.append(line[8:]) 388 for line in handle: 389 line = line.strip() 390 if line.endswith("C}"): 391 break 392 record.reads[-1].rt[-1].comment.append(line) 393 elif line=='}': 394 break 395 else: 396 record.reads[-1].rt.append(rt(line)) 397 line = "" 398 elif line.startswith("WR{"): 399 if record.reads[-1].wr is None: 400 record.reads[-1].wr=[] 401 for line in handle: 402 line=line.strip() 403 if line=='}': break 404 record.reads[-1].wr.append(wr(line)) 405 line = "" 406 elif line.startswith("WA{"): 407 if record.wa is None: 408 record.wa=[] 409 try: 410 line = handle.next() 411 except StopIteration: 412 raise ValueError("Failed to read WA block") 413 record.wa.append(wa(line)) 414 for line in handle: 415 line=line.strip() 416 if line=='}': break 417 record.wa[-1].info.append(line) 418 line = "" 419 elif line.startswith("CT{"): 420 if record.ct is None: 421 record.ct=[] 422 try: 423 line = handle.next() 424 except StopIteration: 425 raise ValueError("Failed to read CT block") 426 record.ct.append(ct(line)) 427 for line in handle: 428 line=line.strip() 429 if line=="COMMENT{": 430 for line in handle: 431 line = line.strip() 432 if line.endswith("C}"): 433 break 434 record.ct[-1].comment.append(line) 435 elif line=='}': 436 break 437 else: 438 record.ct[-1].info.append(line) 439 line = "" 440 else: 441 break 442 443 if not line.startswith('RD'): # another read? 444 break 445 446 yield record
447
448 -class ACEFileRecord(object):
449 """Holds data of an ACE file. 450 """
451 - def __init__(self):
452 self.ncontigs=None 453 self.nreads=None 454 self.contigs=[] 455 self.wa=None # none or many
456
457 - def sort(self):
458 """Sorts wr, rt and ct tags into the appropriate contig / read instance, if possible. """ 459 460 ct=[] 461 rt=[] 462 wr=[] 463 # search for tags that aren't in the right position 464 for i in range(len(self.contigs)): 465 c = self.contigs[i] 466 if c.wa: 467 if not self.wa: 468 self.wa=[] 469 self.wa.extend(c.wa) 470 if c.ct: 471 newcts=[ct_tag for ct_tag in c.ct if ct_tag.name!=c.name] 472 for x in newcts: 473 self.contigs[i].ct.remove(x) 474 ct.extend(newcts) 475 for j in range(len(c.reads)): 476 r = c.reads[j] 477 if r.rt: 478 newrts=[rt_tag for rt_tag in r.rt if rt_tag.name!=r.rd.name] 479 for x in newrts: 480 self.contigs[i].reads[j].rt.remove(x) 481 rt.extend(newrts) 482 if r.wr: 483 newwrs=[wr_tag for wr_tag in r.wr if wr_tag.name!=r.rd.name] 484 for x in newwrs: 485 self.contigs[i].reads[j].wr.remove(x) 486 wr.extend(newwrs) 487 # now sort them into their proper place 488 for i in range(len(self.contigs)): 489 c = self.contigs[i] 490 for ct_tag in ct: 491 if ct_tag.name==c.name: 492 if self.contigs[i].ct is None: 493 self.contigs[i].ct=[] 494 self.contigs[i].ct.append(ct_tag) 495 if rt or wr: 496 for j in range(len(c.reads)): 497 r = c.reads[j] 498 for rt_tag in rt: 499 if rt_tag.name==r.rd.name: 500 if self.contigs[i].reads[j].rt is None: 501 self.contigs[i].reads[j].rt=[] 502 self.contigs[i].reads[j].rt.append(rt_tag) 503 for wr_tag in wr: 504 if wr_tag.name==r.rd.name: 505 if self.contigs[i].reads[j].wr is None: 506 self.contigs[i].reads[j].wr=[] 507 self.contigs[i].reads[j].wr.append(wr_tag)
508
509 -def read(handle):
510 """Parses the full ACE file in list of contigs. 511 512 """ 513 514 handle = iter(handle) 515 516 record=ACEFileRecord() 517 518 try: 519 line = handle.next() 520 except StopIteration: 521 raise ValueError("Premature end of file") 522 523 # check if the file starts correctly 524 if not line.startswith('AS'): 525 raise ValueError("File does not start with 'AS'.") 526 527 words = line.split() 528 record.ncontigs, record.nreads = map(int, words[1:3]) 529 530 # now read all the records 531 record.contigs = list(parse(handle)) 532 # wa, ct, rt rags are usually at the end of the file, but not necessarily (correct?). 533 # If the iterator is used, the tags are returned with the contig or the read after which they appear, 534 # if all tags are at the end, they are read with the last contig. The concept of an 535 # iterator leaves no other choice. But if the user uses the ACEParser, we can check 536 # them and put them into the appropriate contig/read instance. 537 # Conclusion: An ACE file is not a filetype for which iteration is 100% suitable... 538 record.sort() 539 return record
540