1
2
3
4
5
6 """Find and deal with signatures in biological sequence data.
7
8 In addition to representing sequences according to motifs (see Motif.py
9 for more information), we can also use Signatures, which are conserved
10 regions that are not necessarily consecutive. This may be useful in
11 the case of very diverged sequences, where signatures may pick out
12 important conservation that can't be found by motifs (hopefully!).
13 """
14
15 from Bio.Alphabet import _verify_alphabet
16 from Bio.Seq import Seq
17
18
19 from .Pattern import PatternRepository
20
21
23 """Find Signatures in a group of sequence records.
24
25 In this simple implementation, signatures are just defined as a
26 two motifs separated by a gap. We need something a lot smarter than
27 this to find more complicated signatures.
28 """
30 """Initialize a finder to get signatures.
31
32 Arguments:
33
34 o alphabet_strict - Specify whether signatures should be required
35 to have all letters in the signature be consistent with the
36 alphabet of the original sequence. This requires that all Seqs
37 used have a consistent alphabet. This helps protect against getting
38 useless signatures full of ambiguity signals.
39 """
40 self._alphabet_strict = alphabet_strict
41
42 - def find(self, seq_records, signature_size, max_gap):
43 """Find all signatures in a group of sequences.
44
45 Arguments:
46
47 o seq_records - A list of SeqRecord objects we'll use the sequences
48 from to find signatures.
49
50 o signature_size - The size of each half of a signature (ie. if this
51 is set at 3, then the signature could be AGC-----GAC)
52
53 o max_gap - The maximum gap size between two parts of a signature.
54 """
55 sig_info = self._get_signature_dict(seq_records, signature_size,
56 max_gap)
57
58 return PatternRepository(sig_info)
59
61 """Return a dictionary with all signatures and their counts.
62
63 This internal function does all of the hard work for the
64 find_signatures function.
65 """
66 if self._alphabet_strict:
67 alphabet = seq_records[0].seq.alphabet
68 else:
69 alphabet = None
70
71
72 all_sigs = {}
73 for seq_record in seq_records:
74
75 if alphabet is not None:
76 assert seq_record.seq.alphabet == alphabet, \
77 "Working with alphabet %s and got %s" % \
78 (alphabet, seq_record.seq.alphabet)
79
80
81 largest_sig_size = sig_size * 2 + max_gap
82 for start in range(len(seq_record.seq) - (largest_sig_size - 1)):
83
84 first_sig = str(seq_record.seq[start:start + sig_size])
85
86
87 for second in range(start + 1, (start + 1) + max_gap):
88 second_sig = str(seq_record.seq[second: second + sig_size])
89
90
91
92 if alphabet is not None:
93 first_seq = Seq(first_sig, alphabet)
94 second_seq = Seq(second_sig, alphabet)
95 if _verify_alphabet(first_seq) \
96 and _verify_alphabet(second_seq):
97 all_sigs = self._add_sig(all_sigs,
98 (first_sig, second_sig))
99
100
101 else:
102 all_sigs = self._add_sig(all_sigs,
103 (first_sig, second_sig))
104
105 return all_sigs
106
107 - def _add_sig(self, sig_dict, sig_to_add):
108 """Add a signature to the given dictionary.
109 """
110
111 if sig_to_add in sig_dict:
112 sig_dict[sig_to_add] += 1
113
114 else:
115 sig_dict[sig_to_add] = 1
116
117 return sig_dict
118
119
121 """Convert a Sequence into its signature representatives.
122
123 This takes a sequence and a set of signatures, and converts the
124 sequence into a list of numbers representing the relative amounts
125 each signature is seen in the sequence. This allows a sequence to
126 serve as input into a neural network.
127 """
128 - def __init__(self, signatures, max_gap):
129 """Initialize with the signatures to look for.
130
131 Arguments:
132
133 o signatures - A complete list of signatures, in order, that
134 are to be searched for in the sequences. The signatures should
135 be represented as a tuple of (first part of the signature,
136 second_part of the signature) -- ('GATC', 'GATC').
137
138 o max_gap - The maximum gap we can have between the two
139 elements of the signature.
140 """
141 self._signatures = signatures
142 self._max_gap = max_gap
143
144
145
146 if len(self._signatures) > 0:
147 first_sig_size = len(self._signatures[0][0])
148 second_sig_size = len(self._signatures[0][1])
149
150 assert first_sig_size == second_sig_size, \
151 "Ends of the signature do not match: %s" \
152 % self._signatures[0]
153
154 for sig in self._signatures:
155 assert len(sig[0]) == first_sig_size, \
156 "Got first part of signature %s, expected size %s" % \
157 (sig[0], first_sig_size)
158 assert len(sig[1]) == second_sig_size, \
159 "Got second part of signature %s, expected size %s" % \
160 (sig[1], second_sig_size)
161
163 """Convert a sequence into a representation of its signatures.
164
165 Arguments:
166
167 o sequence - A Seq object we are going to convert into a set of
168 signatures.
169
170 Returns:
171 A list of relative signature representations. Each item in the
172 list corresponds to the signature passed in to the initializer and
173 is the number of times that the signature was found, divided by the
174 total number of signatures found in the sequence.
175 """
176
177
178 if len(self._signatures) == 0:
179 return []
180
181
182 sequence_sigs = {}
183 for sig in self._signatures:
184 sequence_sigs[sig] = 0
185
186
187 all_first_sigs = []
188 for sig_start, sig_end in self._signatures:
189 all_first_sigs.append(sig_start)
190
191
192 sig_size = len(self._signatures[0][0])
193 smallest_sig_size = sig_size * 2
194
195 for start in range(len(sequence) - (smallest_sig_size - 1)):
196
197
198 first_sig = str(sequence[start:start + sig_size])
199 if first_sig in all_first_sigs:
200 for second in range(start + sig_size,
201 (start + sig_size + 1) + self._max_gap):
202 second_sig = str(sequence[second:second + sig_size])
203
204
205 if (first_sig, second_sig) in sequence_sigs:
206 sequence_sigs[(first_sig, second_sig)] += 1
207
208
209 min_count = min(sequence_sigs.values())
210 max_count = max(sequence_sigs.values())
211
212
213
214 if max_count > 0:
215 for sig in sequence_sigs:
216 sequence_sigs[sig] = (float(sequence_sigs[sig] - min_count)
217 / float(max_count))
218
219
220 sig_amounts = []
221 for sig in self._signatures:
222 sig_amounts.append(sequence_sigs[sig])
223
224 return sig_amounts
225