1 """Deal with Motifs or Signatures allowing ambiguity in the sequences.
2
3 This class contains Schema which deal with Motifs and Signatures at
4 a higher level, by introducing `don't care` (ambiguity) symbols into
5 the sequences. For instance, you could combine the following Motifs:
6
7 'GATC', 'GATG', 'GATG', 'GATT'
8
9 as all falling under a schema like 'GAT*', where the star indicates a
10 character can be anything. This helps us condense a whole ton of
11 motifs or signatures.
12 """
13
14 import random
15 import re
16
17
18 from Bio import Alphabet
19 from Bio.Seq import MutableSeq
20
21
22 from Pattern import PatternRepository
23
24
25 from Bio.GA import Organism
26 from Bio.GA.Evolver import GenerationEvolver
27 from Bio.GA.Mutation.Simple import SinglePositionMutation
28 from Bio.GA.Crossover.Point import SinglePointCrossover
29 from Bio.GA.Repair.Stabilizing import AmbiguousRepair
30 from Bio.GA.Selection.Tournament import TournamentSelection
31 from Bio.GA.Selection.Diversity import DiversitySelection
32
33
35 """Deal with motifs that have ambiguity characters in it.
36
37 This motif class allows specific ambiguity characters and tries to
38 speed up finding motifs using regular expressions.
39
40 This is likely to be a replacement for the Schema representation,
41 since it allows multiple ambiguity characters to be used.
42 """
44 """Initialize with ambiguity information.
45
46 Arguments:
47
48 o ambiguity_info - A dictionary which maps letters in the motifs to
49 the ambiguous characters which they might represent. For example,
50 {'R' : 'AG'} specifies that Rs in the motif can match an A or a G.
51 All letters in the motif must be represented in the ambiguity_info
52 dictionary.
53 """
54 self._ambiguity_info = ambiguity_info
55
56
57 self._motif_cache = {}
58
60 """Encode the passed motif as a regular expression pattern object.
61
62 Arguments:
63
64 o motif - The motif we want to encode. This should be a string.
65
66 Returns:
67 A compiled regular expression pattern object that can be used
68 for searching strings.
69 """
70 regexp_string = ""
71
72 for motif_letter in motif:
73 try:
74 letter_matches = self._ambiguity_info[motif_letter]
75 except KeyError:
76 raise KeyError("No match information for letter %s"
77 % motif_letter)
78
79 if len(letter_matches) > 1:
80 regexp_match = "[" + letter_matches + "]"
81 elif len(letter_matches) == 1:
82 regexp_match = letter_matches
83 else:
84 raise ValueError("Unexpected match information %s"
85 % letter_matches)
86
87 regexp_string += regexp_match
88
89 return re.compile(regexp_string)
90
92 """Return the location of ambiguous items in the motif.
93
94 This just checks through the motif and compares each letter
95 against the ambiguity information. If a letter stands for multiple
96 items, it is ambiguous.
97 """
98 ambig_positions = []
99 for motif_letter_pos in range(len(motif)):
100 motif_letter = motif[motif_letter_pos]
101 try:
102 letter_matches = self._ambiguity_info[motif_letter]
103 except KeyError:
104 raise KeyError("No match information for letter %s"
105 % motif_letter)
106
107 if len(letter_matches) > 1:
108 ambig_positions.append(motif_letter_pos)
109
110 return ambig_positions
111
113 """Return the number of ambiguous letters in a given motif.
114 """
115 ambig_positions = self.find_ambiguous(motif)
116 return len(ambig_positions)
117
119 """Return all non-overlapping motif matches in the query string.
120
121 This utilizes the regular expression findall function, and will
122 return a list of all non-overlapping occurances in query that
123 match the ambiguous motif.
124 """
125 try:
126 motif_pattern = self._motif_cache[motif]
127 except KeyError:
128 motif_pattern = self.encode_motif(motif)
129 self._motif_cache[motif] = motif_pattern
130
131 return motif_pattern.findall(query)
132
134 """Find the number of non-overlapping times motif occurs in query.
135 """
136 all_matches = self.find_matches(motif, query)
137 return len(all_matches)
138
140 """Return a listing of all unambiguous letters allowed in motifs.
141 """
142 all_letters = sorted(self._ambiguity_info)
143 unambig_letters = []
144
145 for letter in all_letters:
146 possible_matches = self._ambiguity_info[letter]
147 if len(possible_matches) == 1:
148 unambig_letters.append(letter)
149
150 return unambig_letters
151
152
153
154
155
156
158 """Alphabet of a simple Schema for DNA sequences.
159
160 This defines a simple alphabet for DNA sequences that has a single
161 character which can match any other character.
162
163 o G,A,T,C - The standard unambiguous DNA alphabet.
164
165 o * - Any letter
166 """
167 letters = ["G", "A", "T", "C", "*"]
168
169 alphabet_matches = {"G": "G",
170 "A": "A",
171 "T": "T",
172 "C": "C",
173 "*": "GATC"}
174
175
176
177
179 """Find schemas using a genetic algorithm approach.
180
181 This approach to finding schema uses Genetic Algorithms to evolve
182 a set of schema and find the best schema for a specific set of
183 records.
184
185 The 'default' finder searches for ambiguous DNA elements. This
186 can be overridden easily by creating a GeneticAlgorithmFinder
187 with a different alphabet.
188 """
190 """Initialize a finder to get schemas using Genetic Algorithms.
191
192 Arguments:
193
194 o alphabet -- The alphabet which specifies the contents of the
195 schemas we'll be generating. This alphabet must contain the
196 attribute 'alphabet_matches', which is a dictionary specifying
197 the potential ambiguities of each letter in the alphabet. These
198 ambiguities will be used in building up the schema.
199 """
200 self.alphabet = alphabet
201
202 self.initial_population = 500
203 self.min_generations = 10
204
205 self._set_up_genetic_algorithm()
206
226
228 """Find the given number of unique schemas using a genetic algorithm
229
230 Arguments:
231
232 o fitness - A callable object (ie. function) which will evaluate
233 the fitness of a motif.
234
235 o num_schemas - The number of unique schemas with good fitness
236 that we want to generate.
237 """
238 start_population = \
239 Organism.function_population(self.motif_generator.random_motif,
240 self.initial_population,
241 fitness)
242 finisher = SimpleFinisher(num_schemas, self.min_generations)
243
244
245 evolver = GenerationEvolver(start_population, self.selector)
246 evolved_pop = evolver.evolve(finisher.is_finished)
247
248
249 schema_info = {}
250 for org in evolved_pop:
251
252
253 seq_genome = org.genome.toseq()
254 schema_info[str(seq_genome)] = org.fitness
255
256 return PatternRepository(schema_info)
257
258
259
260
262 """Calculate fitness for schemas that differentiate between sequences.
263 """
264 - def __init__(self, positive_seqs, negative_seqs, schema_evaluator):
265 """Initialize with different sequences to evaluate
266
267 Arguments:
268
269 o positive_seq - A list of SeqRecord objects which are the 'positive'
270 sequences -- the ones we want to select for.
271
272 o negative_seq - A list of SeqRecord objects which are the 'negative'
273 sequences that we want to avoid selecting.
274
275 o schema_evaluator - An Schema class which can be used to
276 evaluate find motif matches in sequences.
277 """
278 self._pos_seqs = positive_seqs
279 self._neg_seqs = negative_seqs
280 self._schema_eval = schema_evaluator
281
283 """Calculate the fitness for a given schema.
284
285 Fitness is specified by the number of occurances of the schema in
286 the positive sequences minus the number of occurances in the
287 negative examples.
288
289 This fitness is then modified by multiplying by the length of the
290 schema and then dividing by the number of ambiguous characters in
291 the schema. This helps select for schema which are longer and have
292 less redundancy.
293 """
294
295 seq_motif = genome.toseq()
296 motif = str(seq_motif)
297
298
299 num_pos = 0
300 for seq_record in self._pos_seqs:
301 cur_counts = self._schema_eval.num_matches(motif,
302 str(seq_record.seq))
303 num_pos += cur_counts
304
305
306 num_neg = 0
307 for seq_record in self._neg_seqs:
308 cur_counts = self._schema_eval.num_matches(motif,
309 str(seq_record.seq))
310
311 num_neg += cur_counts
312
313 num_ambiguous = self._schema_eval.num_ambiguous(motif)
314
315 num_ambiguous = pow(2.0, num_ambiguous)
316
317 num_ambiguous += 1
318
319 motif_size = len(motif)
320 motif_size = motif_size * 4.0
321
322 discerning_power = num_pos - num_neg
323
324 diff = (discerning_power * motif_size) / float(num_ambiguous)
325 return diff
326
327
329 """Calculate a fitness giving weight to schemas that match many times.
330
331 This fitness function tries to maximize schemas which are found many
332 times in a group of sequences.
333 """
334 - def __init__(self, seq_records, schema_evaluator):
335 """Initialize with sequences to evaluate.
336
337 Arguments:
338
339 o seq_records -- A set of SeqRecord objects which we use to
340 calculate the fitness.
341
342 o schema_evaluator - An Schema class which can be used to
343 evaluate find motif matches in sequences.
344 """
345 self._records = seq_records
346 self._evaluator = schema_evaluator
347
349 """Calculate the fitness of a genome based on schema matches.
350
351 This bases the fitness of a genome completely on the number of times
352 it matches in the set of seq_records. Matching more times gives a
353 better fitness
354 """
355
356 seq_motif = genome.toseq()
357 motif = str(seq_motif)
358
359
360 num_times = 0
361 for seq_record in self._records:
362 cur_counts = self._evaluator.num_matches(motif,
363 str(seq_record.seq))
364 num_times += cur_counts
365
366 return num_times
367
368
369
371 """Generate a random motif within given parameters.
372 """
373 - def __init__(self, alphabet, min_size=12, max_size=17):
374 """Initialize with the motif parameters.
375
376 Arguments:
377
378 o alphabet - An alphabet specifying what letters can be inserted in
379 a motif.
380
381 o min_size, max_size - Specify the range of sizes for motifs.
382 """
383 self._alphabet = alphabet
384 self._min_size = min_size
385 self._max_size = max_size
386
388 """Create a random motif within the given parameters.
389
390 This returns a single motif string with letters from the given
391 alphabet. The size of the motif will be randomly chosen between
392 max_size and min_size.
393 """
394 motif_size = random.randrange(self._min_size, self._max_size)
395
396 motif = ""
397 for letter_num in range(motif_size):
398 cur_letter = random.choice(self._alphabet.letters)
399 motif += cur_letter
400
401 return MutableSeq(motif, self._alphabet)
402
403
405 """Determine when we are done evolving motifs.
406
407 This takes the very simple approach of halting evolution when the
408 GA has proceeded for a specified number of generations and has
409 a given number of unique schema with positive fitness.
410 """
411 - def __init__(self, num_schemas, min_generations=100):
412 """Initialize the finisher with its parameters.
413
414 Arguments:
415
416 o num_schemas -- the number of useful (positive fitness) schemas
417 we want to generation
418
419 o min_generations -- The minimum number of generations to allow
420 the GA to proceed.
421 """
422 self.num_generations = 0
423
424 self.num_schemas = num_schemas
425 self.min_generations = min_generations
426
428 """Determine when we can stop evolving the population.
429 """
430 self.num_generations += 1
431
432
433 if self.num_generations >= self.min_generations:
434 all_seqs = []
435 for org in organisms:
436 if org.fitness > 0:
437 if org.genome not in all_seqs:
438 all_seqs.append(org.genome)
439
440 if len(all_seqs) >= self.num_schemas:
441 return 1
442
443 return 0
444
445
446
448 """Find schema in a set of sequences using a genetic algorithm approach.
449
450 Finding good schemas is very difficult because it takes forever to
451 enumerate all of the potential schemas. This finder using a genetic
452 algorithm approach to evolve good schema which match many times in
453 a set of sequences.
454
455 The default implementation of the finder is ready to find schemas
456 in a set of DNA sequences, but the finder can be customized to deal
457 with any type of data.
458 """
465
466 - def find(self, seq_records):
467 """Find well-represented schemas in the given set of SeqRecords.
468 """
469 fitness_evaluator = MostCountSchemaFitness(seq_records,
470 self.evaluator)
471
472 return self._finder.find_schemas(fitness_evaluator.calculate_fitness,
473 self.num_schemas)
474
476 """Find schemas which differentiate between the two sets of SeqRecords.
477 """
478 fitness_evaluator = DifferentialSchemaFitness(first_records,
479 second_records,
480 self.evaluator)
481
482 return self._finder.find_schemas(fitness_evaluator.calculate_fitness,
483 self.num_schemas)
484
485
487 """Convert a sequence into a representation of ambiguous motifs (schemas).
488
489 This takes a sequence, and returns the number of times specified
490 motifs are found in the sequence. This lets you represent a sequence
491 as just a count of (possibly ambiguous) motifs.
492 """
493 - def __init__(self, schemas, ambiguous_converter):
494 """Initialize the coder to convert sequences
495
496 Arguments:
497
498 o schema - A list of all of the schemas we want to search for
499 in input sequences.
500
501 o ambiguous_converter - An Schema class which can be
502 used to convert motifs into regular expressions for searching.
503 """
504 self._schemas = schemas
505 self._converter = ambiguous_converter
506
508 """Represent the given input sequence as a bunch of motif counts.
509
510 Arguments:
511
512 o sequence - A Bio.Seq object we are going to represent as schemas.
513
514 This takes the sequence, searches for the motifs within it, and then
515 returns counts specifying the relative number of times each motifs
516 was found. The frequencies are in the order the original motifs were
517 passed into the initializer.
518 """
519 schema_counts = []
520
521 for schema in self._schemas:
522 num_counts = self._converter.num_matches(schema, str(sequence))
523 schema_counts.append(num_counts)
524
525
526 min_count = 0
527 max_count = max(schema_counts)
528
529
530
531 if max_count > 0:
532 for count_num in range(len(schema_counts)):
533 schema_counts[count_num] = (float(schema_counts[count_num]) -
534 float(min_count)) / float(max_count)
535
536 return schema_counts
537
538
540 """Determine whether or not the given pattern matches the schema.
541
542 Arguments:
543
544 o pattern - A string representing the pattern we want to check for
545 matching. This pattern can contain ambiguity characters (which are
546 assumed to be the same as those in the schema).
547
548 o schema - A string schema with ambiguity characters.
549
550 o ambiguity_character - The character used for ambiguity in the schema.
551 """
552 if len(pattern) != len(schema):
553 return 0
554
555
556
557 for pos in range(len(pattern)):
558 if schema[pos] != ambiguity_character and \
559 pattern[pos] != ambiguity_character and \
560 pattern[pos] != schema[pos]:
561
562 return 0
563
564 return 1
565
566
568 """Generate Schema from inputs of Motifs or Signatures.
569 """
570 - def __init__(self, ambiguity_symbol='*'):
571 """Initialize the SchemaFactory
572
573 Arguments:
574
575 o ambiguity_symbol -- The symbol to use when specifying that
576 a position is arbitrary.
577 """
578 self._ambiguity_symbol = ambiguity_symbol
579
580 - def from_motifs(self, motif_repository, motif_percent, num_ambiguous):
581 """Generate schema from a list of motifs.
582
583 Arguments:
584
585 o motif_repository - A MotifRepository class that has all of the
586 motifs we want to convert to Schema.
587
588 o motif_percent - The percentage of motifs in the motif bank which
589 should be matches. We'll try to create schema that match this
590 percentage of motifs.
591
592 o num_ambiguous - The number of ambiguous characters to include
593 in each schema. The positions of these ambiguous characters will
594 be randomly selected.
595 """
596
597 all_motifs = motif_repository.get_top_percentage(motif_percent)
598
599
600 schema_info = {}
601
602
603 total_count = self._get_num_motifs(motif_repository, all_motifs)
604 matched_count = 0
605 assert total_count > 0, "Expected to have motifs to match"
606 while (float(matched_count) / float(total_count)) < motif_percent:
607 new_schema, matching_motifs = \
608 self._get_unique_schema(schema_info.keys(),
609 all_motifs, num_ambiguous)
610
611
612
613 schema_counts = 0
614 for motif in matching_motifs:
615
616 schema_counts += motif_repository.count(motif)
617
618
619
620 all_motifs.remove(motif)
621
622
623 schema_info[new_schema] = schema_counts
624
625 matched_count += schema_counts
626
627
628
629 return PatternRepository(schema_info)
630
632 """Return the number of motif counts for the list of motifs.
633 """
634 motif_count = 0
635 for motif in motif_list:
636 motif_count += repository.count(motif)
637
638 return motif_count
639
641 """Retrieve a unique schema from a motif.
642
643 We don't want to end up with schema that match the same thing,
644 since this could lead to ambiguous results, and be messy. This
645 tries to create schema, and checks that they do not match any
646 currently existing schema.
647 """
648
649
650
651 num_tries = 0
652
653 while 1:
654
655 cur_motif = random.choice(motif_list)
656
657 num_tries += 1
658
659 new_schema, matching_motifs = \
660 self._schema_from_motif(cur_motif, motif_list,
661 num_ambiguous)
662
663 has_match = 0
664 for old_schema in cur_schemas:
665 if matches_schema(new_schema, old_schema,
666 self._ambiguity_symbol):
667 has_match = 1
668
669
670
671 if not(has_match):
672 break
673
674
675 assert num_tries < 150, \
676 "Could not generate schema in %s tries from %s with %s" \
677 % (num_tries, motif_list, cur_schemas)
678
679 return new_schema, matching_motifs
680
682 """Create a schema from a given starting motif.
683
684 Arguments:
685
686 o motif - A motif with the pattern we will start from.
687
688 o motif_list - The total motifs we have.to match to.
689
690 o num_ambiguous - The number of ambiguous characters that should
691 be present in the schema.
692
693 Returns:
694
695 o A string representing the newly generated schema.
696
697 o A list of all of the motifs in motif_list that match the schema.
698 """
699 assert motif in motif_list, \
700 "Expected starting motif present in remaining motifs."
701
702
703
704 new_schema_list = list(motif)
705 for add_ambiguous in range(num_ambiguous):
706
707 while 1:
708 ambig_pos = random.choice(range(len(new_schema_list)))
709
710
711
712 if new_schema_list[ambig_pos] != self._ambiguity_symbol:
713 new_schema_list[ambig_pos] = self._ambiguity_symbol
714 break
715
716
717 new_schema = ''.join(new_schema_list)
718
719
720 matched_motifs = []
721 for motif in motif_list:
722 if matches_schema(motif, new_schema, self._ambiguity_symbol):
723 matched_motifs.append(motif)
724
725 return new_schema, matched_motifs
726
728 raise NotImplementedError("Still need to code this.")
729