Package Bio :: Package Sequencing :: Module Phd
[hide private]
[frames | no frames]

Source Code for Module Bio.Sequencing.Phd

  1  # Copyright 2004 by Cymon J. Cox and Frank Kauff.  All rights reserved. 
  2  # Copyright 2008 by Michiel de Hoon.  All rights reserved. 
  3  # Revisions copyright 2009 by Cymon J. Cox.  All rights reserved. 
  4  # Revisions copyright 2009 by Peter Cock.  All rights reserved. 
  5  # 
  6  # This code is part of the Biopython distribution and governed by its 
  7  # license.  Please see the LICENSE file that should have been included 
  8  # as part of this package. 
  9  """ 
 10  Parser for PHD files output by PHRED and used by PHRAP and CONSED. 
 11   
 12  This module can be used directly which will return Record objects 
 13  which should contain all the original data in the file. 
 14   
 15  Alternatively, using Bio.SeqIO with the "phd" format will call this module 
 16  internally.  This will give SeqRecord objects for each contig sequence. 
 17  """ 
 18   
 19  from Bio import Seq 
 20  from Bio.Alphabet import generic_dna 
 21   
 22  __docformat__ = "restructuredtext en" 
 23   
 # Comment keywords of a PHD file.  Record.__init__ creates one entry per
 # keyword (lower-cased) in Record.comments, initialised to None.
 # NOTE(review): read() also stores 'basecaller_version' and
 # 'trace_processor_version', which are not listed here, so those two keys
 # exist in Record.comments only when the file actually contains them.
 24  CKEYWORDS = ['CHROMAT_FILE', 'ABI_THUMBPRINT', 'PHRED_VERSION', 'CALL_METHOD', 
 25          'QUALITY_LEVELS', 'TIME', 'TRACE_ARRAY_MIN_INDEX', 'TRACE_ARRAY_MAX_INDEX', 
 26          'TRIM', 'TRACE_PEAK_AREA_RATIO', 'CHEM', 'DYE'] 
 27   
 28   
class Record(object):
    """Container for the data of a single PHD record.

    Attributes:
     - file_name   - text following BEGIN_SEQUENCE ('' until filled in).
     - comments    - dict keyed by the lower-cased CKEYWORDS entries; every
       value starts out as None.
     - sites       - list that receives one entry per line of the DNA block.
     - seq         - the full sequence ('' until filled in).
     - seq_trimmed - the trimmed sequence ('' until filled in).
    """

    def __init__(self):
        """Create an empty record with all known comment keywords unset."""
        # dict.fromkeys() maps every key to None, which marks the comment
        # as "not present in the file".
        self.comments = dict.fromkeys(kw.lower() for kw in CKEYWORDS)
        self.file_name = ''
        self.seq = ''
        self.seq_trimmed = ''
        self.sites = []
# Comment keywords (lower case) whose values are stored as plain text.
_TEXT_COMMENTS = ('chromat_file',
                  'phred_version',
                  'call_method',
                  'chem',
                  'dye',
                  'time',
                  'basecaller_version',
                  'trace_processor_version')

# Comment keywords (lower case) whose values are converted with int().
_INT_COMMENTS = ('abi_thumbprint',
                 'quality_levels',
                 'trace_array_min_index',
                 'trace_array_max_index')


def _skip_to_line(handle, prefix):
    """Consume lines up to and including the first one starting with prefix.

    Returns that line, or None if the handle is exhausted first.
    """
    for line in handle:
        if line.startswith(prefix):
            return line
    return None


def _read_comments(handle):
    """Parse the lines of a comment block, consuming the END_COMMENT line.

    Returns a dict mapping lower-cased keywords to their converted values.
    Raises ValueError if a line has no colon, a value cannot be converted,
    or END_COMMENT is never found.
    """
    comments = {}
    for line in handle:
        line = line.strip()
        if not line:
            continue
        if line == "END_COMMENT":
            return comments
        if ":" not in line:
            # Without this check the unpacking below fails with an opaque
            # "values to unpack" message that does not identify the line.
            raise ValueError("Comment line is not of the form "
                             "'KEYWORD: value': %r" % line)
        keyword, value = line.split(":", 1)
        keyword = keyword.lower()
        value = value.strip()
        if keyword in _TEXT_COMMENTS:
            comments[keyword] = value
        elif keyword in _INT_COMMENTS:
            comments[keyword] = int(value)
        elif keyword == 'trace_peak_area_ratio':
            comments[keyword] = float(value)
        elif keyword == 'trim':
            first, last, prob = value.split()
            comments[keyword] = (int(first), int(last), float(prob))
        # Any other keyword is deliberately ignored.
    raise ValueError("Failed to find END_COMMENT line")


def _read_sites(handle):
    """Parse the lines of a DNA block, consuming the END_DNA line.

    Returns a list with one tuple of strings per line.
    Raises ValueError for a malformed line or if END_DNA is never found.
    """
    sites = []
    for line in handle:
        if line.startswith('END_DNA'):
            return sites
        # Line is: "site quality peak_location"
        # Peak location is optional according to
        # David Gordon (the Consed author)
        parts = line.split()
        if len(parts) not in (2, 3):
            raise ValueError("DNA line must contain a base and quality "
                             "score, and optionally a peak location.")
        sites.append(tuple(parts))
    # Report the truncated DNA block itself rather than letting the caller
    # complain about a missing END_SEQUENCE line.
    raise ValueError("Failed to find END_DNA line")


def read(handle):
    """Read the next PHD record from the handle, returning a Record object.

    Lines are consumed from the handle up to and including the END_SEQUENCE
    line of the first record found.

    The handle must be an iterator over lines, such as an open file.  The
    record is read in several passes which each continue where the previous
    one stopped; a plain list would restart every pass at its first line, so
    wrap it with iter() first.

    Returns None if no BEGIN_SEQUENCE line is found.

    Raises ValueError if a BEGIN_COMMENT, END_COMMENT, BEGIN_DNA, END_DNA or
    END_SEQUENCE line is missing, or if a comment or DNA line is malformed.
    """
    header = _skip_to_line(handle, "BEGIN_SEQUENCE")
    if header is None:
        return None  # No record found
    # Everything after "BEGIN_SEQUENCE " (15 characters) is the file name.
    file_name = header[15:].rstrip()

    if _skip_to_line(handle, "BEGIN_COMMENT") is None:
        raise ValueError("Failed to find BEGIN_COMMENT line")
    comments = _read_comments(handle)

    if _skip_to_line(handle, 'BEGIN_DNA') is None:
        raise ValueError("Failed to find BEGIN_DNA line")
    sites = _read_sites(handle)

    if _skip_to_line(handle, "END_SEQUENCE") is None:
        raise ValueError("Failed to find END_SEQUENCE line")

    # The record is only built once the whole text has been parsed.
    record = Record()
    record.file_name = file_name
    record.comments.update(comments)
    record.sites.extend(sites)
    # The first field of every DNA line is the base call.
    record.seq = Seq.Seq(''.join(site[0] for site in sites), generic_dna)
    if record.comments['trim'] is not None:
        first, last = record.comments['trim'][:2]
        record.seq_trimmed = record.seq[first:last]

    return record
def parse(handle):
    """Iterate over a file, yielding each PHD record as a Record object.

    The data is read line by line from the handle.  The handle can be a list
    of lines, an open file, or similar; the only requirement is that we can
    iterate over the handle to retrieve lines from it.

    Iteration stops when read() finds no further record.  Any ValueError
    raised by read() for a malformed record propagates to the caller.

    Typical usage::

        records = parse(handle)
        for record in records:
            # do something with the record object
    """
    # read() scans the handle with several consecutive loops, and is called
    # once per record.  That only works if all of them share one iterator:
    # looping over a list directly would restart at its first line each time.
    # For an object that is already an iterator, iter() returns it unchanged.
    handle = iter(handle)
    while True:
        record = read(handle)
        if record is None:
            return
        yield record