Package Bio :: Package SeqIO :: Module PhdIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.PhdIO

  1  # Copyright 2008-2010 by Peter Cock.  All rights reserved. 
  2  # Revisions copyright 2009 by Cymon J. Cox.  All rights reserved. 
  3  # 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """Bio.SeqIO support for the "phd" file format. 
  9   
 10  PHD files are output by PHRED and used by PHRAP and CONSED. 
 11   
 12  You are expected to use this module via the Bio.SeqIO functions, under the 
 13  format name "phd". See also the underlying Bio.Sequencing.Phd module. 
 14   
 15  For example, using Bio.SeqIO we can read in one of the example PHRED files 
 16  from the Biopython unit tests: 
 17   
 18      >>> from Bio import SeqIO 
 19      >>> for record in SeqIO.parse(open("Phd/phd1"), "phd"): 
 20      ...     print record.id 
 21      ...     print record.seq[:10], "..." 
 22      ...     print record.letter_annotations["phred_quality"][:10], "..." 
 23      34_222_(80-A03-19).b.ab1 
 24      ctccgtcgga ... 
 25      [9, 9, 10, 19, 22, 37, 28, 28, 24, 22] ... 
 26      425_103_(81-A03-19).g.ab1 
 27      cgggatccca ... 
 28      [14, 17, 22, 10, 10, 10, 15, 8, 8, 9] ... 
 29      425_7_(71-A03-19).b.ab1 
 30      acataaatca ... 
 31      [10, 10, 10, 10, 8, 8, 6, 6, 6, 6] ... 
 32   
 33  Since PHRED files contain quality scores, you can save them as FASTQ or as 
 34  QUAL files, for example using Bio.SeqIO.write(...), or simply with the format 
 35  method of the SeqRecord object: 
 36   
 37      >>> print record[:50].format("fastq") 
 38      @425_7_(71-A03-19).b.ab1 
 39      acataaatcaaattactnaccaacacacaaaccngtctcgcgtagtggag 
 40      + 
 41      ++++))'''')(''')$!$''')''''(+.''$!$))))+)))''''''' 
 42      <BLANKLINE> 
 43   
 44  Or, 
 45   
 46      >>> print record[:50].format("qual") 
 47      >425_7_(71-A03-19).b.ab1 
 48      10 10 10 10 8 8 6 6 6 6 8 7 6 6 6 8 3 0 3 6 6 6 8 6 6 6 6 7 
 49      10 13 6 6 3 0 3 8 8 8 8 10 8 8 8 6 6 6 6 6 6 6 
 50      <BLANKLINE> 
 51   
 52  Note these examples only show the first 50 bases to keep the output short. 
 53  """ 
 54   
 55  from Bio.SeqRecord import SeqRecord 
 56  from Bio.Sequencing import Phd 
 57  from Bio.SeqIO.Interfaces import SequentialSequenceWriter 
 58  from Bio.SeqIO import QualityIO 
 59   
 60   
61 -def PhdIterator(handle):
62 """Returns SeqRecord objects from a PHD file. 63 64 This uses the Bio.Sequencing.Phd module to do the hard work. 65 """ 66 phd_records = Phd.parse(handle) 67 for phd_record in phd_records: 68 #Convert the PHY record into a SeqRecord... 69 #The "filename" can contain spaces, e.g. 'HWI-EAS94_4_1_1_602_99 1' 70 #from unit test example file phd_solexa. 71 #This will cause problems if used as the record identifier 72 #(e.g. output for FASTQ format). 73 name = phd_record.file_name.split(None, 1)[0] 74 seq_record = SeqRecord(phd_record.seq, 75 id=name, name=name, 76 description=phd_record.file_name) 77 #Just re-use the comments dictionary as the SeqRecord's annotations 78 seq_record.annotations = phd_record.comments 79 #And store the qualities and peak locations as per-letter-annotation 80 seq_record.letter_annotations["phred_quality"] = \ 81 [int(site[1]) for site in phd_record.sites] 82 try: 83 seq_record.letter_annotations["peak_location"] = \ 84 [int(site[2]) for site in phd_record.sites] 85 except IndexError: 86 # peak locations are not always there according to 87 # David Gordon (the Consed author) 88 pass 89 yield seq_record
90 #All done 91 92
93 -class PhdWriter(SequentialSequenceWriter):
94 """Class to write Phd format files""" 95
96 - def __init__(self, handle):
98
99 - def write_record(self, record):
100 """Write a single Phd record to the file.""" 101 assert record.seq, "No sequence present in SeqRecord" 102 # This method returns the 'phred_quality' scores or converted 103 # 'solexa_quality' scores if present, else raises a value error 104 phred_qualities = QualityIO._get_phred_quality(record) 105 peak_locations = record.letter_annotations.get("peak_location", None) 106 assert len(record.seq) == len(phred_qualities), "Number of " + \ 107 "phd quality scores does not match length of sequence" 108 if peak_locations: 109 assert len(record.seq) == len(peak_locations), "Number " + \ 110 "of peak location scores does not match length of sequence" 111 if None in phred_qualities: 112 raise ValueError("A quality value of None was found") 113 if record.description.startswith("%s " % record.id): 114 title = record.description 115 else: 116 title = "%s %s" % (record.id, record.description) 117 self.handle.write("BEGIN_SEQUENCE %s\nBEGIN_COMMENT\n" 118 % self.clean(title)) 119 for annot in [k.lower() for k in Phd.CKEYWORDS]: 120 value = None 121 if annot == "trim": 122 if record.annotations.get("trim", None): 123 value = "%s %s %.4f" % record.annotations["trim"] 124 elif annot == "trace_peak_area_ratio": 125 if record.annotations.get("trace_peak_area_ratio", None): 126 value = "%.4f" % record.annotations[ 127 "trace_peak_area_ratio"] 128 else: 129 value = record.annotations.get(annot, None) 130 if value or value == 0: 131 self.handle.write("%s: %s\n" % (annot.upper(), value)) 132 133 self.handle.write("END_COMMENT\nBEGIN_DNA\n") 134 for i, site in enumerate(record.seq): 135 if peak_locations: 136 self.handle.write("%s %i %i\n" % ( 137 site, 138 round(phred_qualities[i]), 139 peak_locations[i]) 140 ) 141 else: 142 self.handle.write("%s %i\n" % ( 143 site, 144 round(phred_qualities[i])) 145 ) 146 147 self.handle.write("END_DNA\nEND_SEQUENCE\n")
148 149 150 if __name__ == "__main__": 151 from Bio._utils import run_doctest 152 run_doctest() 153