1
2
3
4
5
6
7
8 """Bio.SeqIO support for the "phd" file format.
9
10 PHD files are output by PHRED and used by PHRAP and CONSED.
11
12 You are expected to use this module via the Bio.SeqIO functions, under the
13 format name "phd". See also the underlying Bio.Sequencing.Phd module.
14
15 For example, using Bio.SeqIO we can read in one of the example PHRED files
16 from the Biopython unit tests:
17
18 >>> from Bio import SeqIO
19 >>> for record in SeqIO.parse(open("Phd/phd1"), "phd"):
20 ... print record.id
21 ... print record.seq[:10], "..."
22 ... print record.letter_annotations["phred_quality"][:10], "..."
23 34_222_(80-A03-19).b.ab1
24 ctccgtcgga ...
25 [9, 9, 10, 19, 22, 37, 28, 28, 24, 22] ...
26 425_103_(81-A03-19).g.ab1
27 cgggatccca ...
28 [14, 17, 22, 10, 10, 10, 15, 8, 8, 9] ...
29 425_7_(71-A03-19).b.ab1
30 acataaatca ...
31 [10, 10, 10, 10, 8, 8, 6, 6, 6, 6] ...
32
33 Since PHRED files contain quality scores, you can save them as FASTQ or as
34 QUAL files, for example using Bio.SeqIO.write(...), or simply with the format
35 method of the SeqRecord object:
36
37 >>> print record[:50].format("fastq")
38 @425_7_(71-A03-19).b.ab1
39 acataaatcaaattactnaccaacacacaaaccngtctcgcgtagtggag
40 +
41 ++++))'''')(''')$!$''')''''(+.''$!$))))+)))'''''''
42 <BLANKLINE>
43
44 Or,
45
46 >>> print record[:50].format("qual")
47 >425_7_(71-A03-19).b.ab1
48 10 10 10 10 8 8 6 6 6 6 8 7 6 6 6 8 3 0 3 6 6 6 8 6 6 6 6 7
49 10 13 6 6 3 0 3 8 8 8 8 10 8 8 8 6 6 6 6 6 6 6
50 <BLANKLINE>
51
52 Note these examples only show the first 50 bases to keep the output short.
53 """
54
55 from Bio.SeqRecord import SeqRecord
56 from Bio.Sequencing import Phd
57 from Bio.SeqIO.Interfaces import SequentialSequenceWriter
58 from Bio.SeqIO import QualityIO
59
60
62 """Returns SeqRecord objects from a PHD file.
63
64 This uses the Bio.Sequencing.Phd module to do the hard work.
65 """
66 phd_records = Phd.parse(handle)
67 for phd_record in phd_records:
68
69
70
71
72
73 name = phd_record.file_name.split(None, 1)[0]
74 seq_record = SeqRecord(phd_record.seq,
75 id=name, name=name,
76 description=phd_record.file_name)
77
78 seq_record.annotations = phd_record.comments
79
80 seq_record.letter_annotations["phred_quality"] = \
81 [int(site[1]) for site in phd_record.sites]
82 try:
83 seq_record.letter_annotations["peak_location"] = \
84 [int(site[2]) for site in phd_record.sites]
85 except IndexError:
86
87
88 pass
89 yield seq_record
90
91
92
94 """Class to write Phd format files"""
95
98
100 """Write a single Phd record to the file."""
101 assert record.seq, "No sequence present in SeqRecord"
102
103
104 phred_qualities = QualityIO._get_phred_quality(record)
105 peak_locations = record.letter_annotations.get("peak_location", None)
106 assert len(record.seq) == len(phred_qualities), "Number of " + \
107 "phd quality scores does not match length of sequence"
108 if peak_locations:
109 assert len(record.seq) == len(peak_locations), "Number " + \
110 "of peak location scores does not match length of sequence"
111 if None in phred_qualities:
112 raise ValueError("A quality value of None was found")
113 if record.description.startswith("%s " % record.id):
114 title = record.description
115 else:
116 title = "%s %s" % (record.id, record.description)
117 self.handle.write("BEGIN_SEQUENCE %s\nBEGIN_COMMENT\n"
118 % self.clean(title))
119 for annot in [k.lower() for k in Phd.CKEYWORDS]:
120 value = None
121 if annot == "trim":
122 if record.annotations.get("trim", None):
123 value = "%s %s %.4f" % record.annotations["trim"]
124 elif annot == "trace_peak_area_ratio":
125 if record.annotations.get("trace_peak_area_ratio", None):
126 value = "%.4f" % record.annotations[
127 "trace_peak_area_ratio"]
128 else:
129 value = record.annotations.get(annot, None)
130 if value or value == 0:
131 self.handle.write("%s: %s\n" % (annot.upper(), value))
132
133 self.handle.write("END_COMMENT\nBEGIN_DNA\n")
134 for i, site in enumerate(record.seq):
135 if peak_locations:
136 self.handle.write("%s %i %i\n" % (
137 site,
138 round(phred_qualities[i]),
139 peak_locations[i])
140 )
141 else:
142 self.handle.write("%s %i\n" % (
143 site,
144 round(phred_qualities[i]))
145 )
146
147 self.handle.write("END_DNA\nEND_SEQUENCE\n")
148
149
150 if __name__ == "__main__":
151 from Bio._utils import run_doctest
152 run_doctest()
153