Package Bio :: Package NeuralNetwork :: Package Gene :: Module Schema
[hide private]
[frames] | no frames]

Source Code for Module Bio.NeuralNetwork.Gene.Schema

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """Deal with Motifs or Signatures allowing ambiguity in the sequences. 
  7   
  8  This class contains Schema which deal with Motifs and Signatures at 
  9  a higher level, by introducing \`don't care\` (ambiguity) symbols into 
 10  the sequences. For instance, you could combine the following Motifs: 
 11   
 12  'GATC', 'GATG', 'GATG', 'GATT' 
 13   
 14  as all falling under a schema like 'GAT*', where the star indicates a 
 15  character can be anything. This helps us condense a whole ton of 
 16  motifs or signatures. 
 17  """ 
 18  # standard modules 
 19  from __future__ import print_function 
 20   
 21  import random 
 22  import re 
 23   
 24  from Bio._py3k import range 
 25   
 26  from Bio import Alphabet 
 27  from Bio.Seq import MutableSeq 
 28   
 29  # neural network libraries 
 30  from .Pattern import PatternRepository 
 31   
 32  # genetic algorithm libraries 
 33  from Bio.GA import Organism 
 34  from Bio.GA.Evolver import GenerationEvolver 
 35  from Bio.GA.Mutation.Simple import SinglePositionMutation 
 36  from Bio.GA.Crossover.Point import SinglePointCrossover 
 37  from Bio.GA.Repair.Stabilizing import AmbiguousRepair 
 38  from Bio.GA.Selection.Tournament import TournamentSelection 
 39  from Bio.GA.Selection.Diversity import DiversitySelection 
 40   
 41  __docformat__ = "restructuredtext en" 
 42   
43 -class Schema(object):
44 """Deal with motifs that have ambiguity characters in it. 45 46 This motif class allows specific ambiguity characters and tries to 47 speed up finding motifs using regular expressions. 48 49 This is likely to be a replacement for the Schema representation, 50 since it allows multiple ambiguity characters to be used. 51 """
52 - def __init__(self, ambiguity_info):
53 """Initialize with ambiguity information. 54 55 Arguments: 56 57 o ambiguity_info - A dictionary which maps letters in the motifs to 58 the ambiguous characters which they might represent. For example, 59 {'R' : 'AG'} specifies that Rs in the motif can match an A or a G. 60 All letters in the motif must be represented in the ambiguity_info 61 dictionary. 62 """ 63 self._ambiguity_info = ambiguity_info 64 65 # a cache of all encoded motifs 66 self._motif_cache = {}
67
68 - def encode_motif(self, motif):
69 """Encode the passed motif as a regular expression pattern object. 70 71 Arguments: 72 73 o motif - The motif we want to encode. This should be a string. 74 75 Returns: 76 A compiled regular expression pattern object that can be used 77 for searching strings. 78 """ 79 regexp_string = "" 80 81 for motif_letter in motif: 82 try: 83 letter_matches = self._ambiguity_info[motif_letter] 84 except KeyError: 85 raise KeyError("No match information for letter %s" 86 % motif_letter) 87 88 if len(letter_matches) > 1: 89 regexp_match = "[" + letter_matches + "]" 90 elif len(letter_matches) == 1: 91 regexp_match = letter_matches 92 else: 93 raise ValueError("Unexpected match information %s" 94 % letter_matches) 95 96 regexp_string += regexp_match 97 98 return re.compile(regexp_string)
99
100 - def find_ambiguous(self, motif):
101 """Return the location of ambiguous items in the motif. 102 103 This just checks through the motif and compares each letter 104 against the ambiguity information. If a letter stands for multiple 105 items, it is ambiguous. 106 """ 107 ambig_positions = [] 108 for motif_letter_pos in range(len(motif)): 109 motif_letter = motif[motif_letter_pos] 110 try: 111 letter_matches = self._ambiguity_info[motif_letter] 112 except KeyError: 113 raise KeyError("No match information for letter %s" 114 % motif_letter) 115 116 if len(letter_matches) > 1: 117 ambig_positions.append(motif_letter_pos) 118 119 return ambig_positions
120
121 - def num_ambiguous(self, motif):
122 """Return the number of ambiguous letters in a given motif. 123 """ 124 ambig_positions = self.find_ambiguous(motif) 125 return len(ambig_positions)
126
127 - def find_matches(self, motif, query):
128 """Return all non-overlapping motif matches in the query string. 129 130 This utilizes the regular expression findall function, and will 131 return a list of all non-overlapping occurances in query that 132 match the ambiguous motif. 133 """ 134 try: 135 motif_pattern = self._motif_cache[motif] 136 except KeyError: 137 motif_pattern = self.encode_motif(motif) 138 self._motif_cache[motif] = motif_pattern 139 140 return motif_pattern.findall(query)
141
142 - def num_matches(self, motif, query):
143 """Find the number of non-overlapping times motif occurs in query. 144 """ 145 all_matches = self.find_matches(motif, query) 146 return len(all_matches)
147
148 - def all_unambiguous(self):
149 """Return a listing of all unambiguous letters allowed in motifs. 150 """ 151 all_letters = sorted(self._ambiguity_info) 152 unambig_letters = [] 153 154 for letter in all_letters: 155 possible_matches = self._ambiguity_info[letter] 156 if len(possible_matches) == 1: 157 unambig_letters.append(letter) 158 159 return unambig_letters
160 161 # --- helper classes and functions for the default SchemaFinder 162 163 # -- Alphabets 164 165
166 -class SchemaDNAAlphabet(Alphabet.Alphabet):
167 """Alphabet of a simple Schema for DNA sequences. 168 169 This defines a simple alphabet for DNA sequences that has a single 170 character which can match any other character. 171 172 o G,A,T,C - The standard unambiguous DNA alphabet. 173 174 o * - Any letter 175 """ 176 letters = ["G", "A", "T", "C", "*"] 177 178 alphabet_matches = {"G": "G", 179 "A": "A", 180 "T": "T", 181 "C": "C", 182 "*": "GATC"}
183 184 # -- GA schema finder 185 186
187 -class GeneticAlgorithmFinder(object):
188 """Find schemas using a genetic algorithm approach. 189 190 This approach to finding schema uses Genetic Algorithms to evolve 191 a set of schema and find the best schema for a specific set of 192 records. 193 194 The 'default' finder searches for ambiguous DNA elements. This 195 can be overridden easily by creating a GeneticAlgorithmFinder 196 with a different alphabet. 197 """
198 - def __init__(self, alphabet=SchemaDNAAlphabet()):
199 """Initialize a finder to get schemas using Genetic Algorithms. 200 201 Arguments: 202 203 o alphabet -- The alphabet which specifies the contents of the 204 schemas we'll be generating. This alphabet must contain the 205 attribute 'alphabet_matches', which is a dictionary specifying 206 the potential ambiguities of each letter in the alphabet. These 207 ambiguities will be used in building up the schema. 208 """ 209 self.alphabet = alphabet 210 211 self.initial_population = 500 212 self.min_generations = 10 213 214 self._set_up_genetic_algorithm()
215
217 """Overrideable function to set up the genetic algorithm parameters. 218 219 This functions sole job is to set up the different genetic 220 algorithm functionality. Since this can be quite complicated, this 221 allows cusotmizablity of all of the parameters. If you want to 222 customize specially, you can inherit from this class and override 223 this function. 224 """ 225 self.motif_generator = RandomMotifGenerator(self.alphabet) 226 227 self.mutator = SinglePositionMutation(mutation_rate=0.1) 228 self.crossover = SinglePointCrossover(crossover_prob=0.25) 229 self.repair = AmbiguousRepair(Schema(self.alphabet.alphabet_matches), 230 4) 231 self.base_selector = TournamentSelection(self.mutator, self.crossover, 232 self.repair, 2) 233 self.selector = DiversitySelection(self.base_selector, 234 self.motif_generator.random_motif)
235
236 - def find_schemas(self, fitness, num_schemas):
237 """Find the given number of unique schemas using a genetic algorithm 238 239 Arguments: 240 241 o fitness - A callable object (ie. function) which will evaluate 242 the fitness of a motif. 243 244 o num_schemas - The number of unique schemas with good fitness 245 that we want to generate. 246 """ 247 start_population = \ 248 Organism.function_population(self.motif_generator.random_motif, 249 self.initial_population, 250 fitness) 251 finisher = SimpleFinisher(num_schemas, self.min_generations) 252 253 # set up the evolver and do the evolution 254 evolver = GenerationEvolver(start_population, self.selector) 255 evolved_pop = evolver.evolve(finisher.is_finished) 256 257 # convert the evolved population into a PatternRepository 258 schema_info = {} 259 for org in evolved_pop: 260 # convert the Genome from a MutableSeq to a Seq so that 261 # the schemas are just strings (and not array("c")s) 262 seq_genome = org.genome.toseq() 263 schema_info[str(seq_genome)] = org.fitness 264 265 return PatternRepository(schema_info)
266 267 # -- fitness classes 268 269
270 -class DifferentialSchemaFitness(object):
271 """Calculate fitness for schemas that differentiate between sequences. 272 """
273 - def __init__(self, positive_seqs, negative_seqs, schema_evaluator):
274 """Initialize with different sequences to evaluate 275 276 Arguments: 277 278 o positive_seq - A list of SeqRecord objects which are the 'positive' 279 sequences -- the ones we want to select for. 280 281 o negative_seq - A list of SeqRecord objects which are the 'negative' 282 sequences that we want to avoid selecting. 283 284 o schema_evaluator - An Schema class which can be used to 285 evaluate find motif matches in sequences. 286 """ 287 self._pos_seqs = positive_seqs 288 self._neg_seqs = negative_seqs 289 self._schema_eval = schema_evaluator
290
291 - def calculate_fitness(self, genome):
292 """Calculate the fitness for a given schema. 293 294 Fitness is specified by the number of occurances of the schema in 295 the positive sequences minus the number of occurances in the 296 negative examples. 297 298 This fitness is then modified by multiplying by the length of the 299 schema and then dividing by the number of ambiguous characters in 300 the schema. This helps select for schema which are longer and have 301 less redundancy. 302 """ 303 # convert the genome into a string 304 seq_motif = genome.toseq() 305 motif = str(seq_motif) 306 307 # get the counts in the positive examples 308 num_pos = 0 309 for seq_record in self._pos_seqs: 310 cur_counts = self._schema_eval.num_matches(motif, 311 str(seq_record.seq)) 312 num_pos += cur_counts 313 314 # get the counts in the negative examples 315 num_neg = 0 316 for seq_record in self._neg_seqs: 317 cur_counts = self._schema_eval.num_matches(motif, 318 str(seq_record.seq)) 319 320 num_neg += cur_counts 321 322 num_ambiguous = self._schema_eval.num_ambiguous(motif) 323 # weight the ambiguous stuff more highly 324 num_ambiguous = pow(2.0, num_ambiguous) 325 # increment num ambiguous to prevent division by zero errors. 326 num_ambiguous += 1 327 328 motif_size = len(motif) 329 motif_size = motif_size * 4.0 330 331 discerning_power = num_pos - num_neg 332 333 diff = (discerning_power * motif_size) / float(num_ambiguous) 334 return diff
335 336
337 -class MostCountSchemaFitness(object):
338 """Calculate a fitness giving weight to schemas that match many times. 339 340 This fitness function tries to maximize schemas which are found many 341 times in a group of sequences. 342 """
343 - def __init__(self, seq_records, schema_evaluator):
344 """Initialize with sequences to evaluate. 345 346 Arguments: 347 348 o seq_records -- A set of SeqRecord objects which we use to 349 calculate the fitness. 350 351 o schema_evaluator - An Schema class which can be used to 352 evaluate find motif matches in sequences. 353 """ 354 self._records = seq_records 355 self._evaluator = schema_evaluator
356
357 - def calculate_fitness(self, genome):
358 """Calculate the fitness of a genome based on schema matches. 359 360 This bases the fitness of a genome completely on the number of times 361 it matches in the set of seq_records. Matching more times gives a 362 better fitness 363 """ 364 # convert the genome into a string 365 seq_motif = genome.toseq() 366 motif = str(seq_motif) 367 368 # find the number of times the genome matches 369 num_times = 0 370 for seq_record in self._records: 371 cur_counts = self._evaluator.num_matches(motif, 372 str(seq_record.seq)) 373 num_times += cur_counts 374 375 return num_times
376 377 378 # -- Helper classes
379 -class RandomMotifGenerator(object):
380 """Generate a random motif within given parameters. 381 """
382 - def __init__(self, alphabet, min_size=12, max_size=17):
383 """Initialize with the motif parameters. 384 385 Arguments: 386 387 o alphabet - An alphabet specifying what letters can be inserted in 388 a motif. 389 390 o min_size, max_size - Specify the range of sizes for motifs. 391 """ 392 self._alphabet = alphabet 393 self._min_size = min_size 394 self._max_size = max_size
395
396 - def random_motif(self):
397 """Create a random motif within the given parameters. 398 399 This returns a single motif string with letters from the given 400 alphabet. The size of the motif will be randomly chosen between 401 max_size and min_size. 402 """ 403 motif_size = random.randrange(self._min_size, self._max_size) 404 405 motif = "" 406 for letter_num in range(motif_size): 407 cur_letter = random.choice(self._alphabet.letters) 408 motif += cur_letter 409 410 return MutableSeq(motif, self._alphabet)
411 412
413 -class SimpleFinisher(object):
414 """Determine when we are done evolving motifs. 415 416 This takes the very simple approach of halting evolution when the 417 GA has proceeded for a specified number of generations and has 418 a given number of unique schema with positive fitness. 419 """
420 - def __init__(self, num_schemas, min_generations=100):
421 """Initialize the finisher with its parameters. 422 423 Arguments: 424 425 o num_schemas -- the number of useful (positive fitness) schemas 426 we want to generation 427 428 o min_generations -- The minimum number of generations to allow 429 the GA to proceed. 430 """ 431 self.num_generations = 0 432 433 self.num_schemas = num_schemas 434 self.min_generations = min_generations
435
436 - def is_finished(self, organisms):
437 """Determine when we can stop evolving the population. 438 """ 439 self.num_generations += 1 440 # print "generation %s" % self.num_generations 441 442 if self.num_generations >= self.min_generations: 443 all_seqs = [] 444 for org in organisms: 445 if org.fitness > 0: 446 if org.genome not in all_seqs: 447 all_seqs.append(org.genome) 448 449 if len(all_seqs) >= self.num_schemas: 450 return 1 451 452 return 0
453 # --- 454 455
456 -class SchemaFinder(object):
457 """Find schema in a set of sequences using a genetic algorithm approach. 458 459 Finding good schemas is very difficult because it takes forever to 460 enumerate all of the potential schemas. This finder using a genetic 461 algorithm approach to evolve good schema which match many times in 462 a set of sequences. 463 464 The default implementation of the finder is ready to find schemas 465 in a set of DNA sequences, but the finder can be customized to deal 466 with any type of data. 467 """
468 - def __init__(self, num_schemas=100, 469 schema_finder=GeneticAlgorithmFinder()):
470 self.num_schemas = num_schemas 471 self._finder = schema_finder 472 473 self.evaluator = Schema(self._finder.alphabet.alphabet_matches)
474
475 - def find(self, seq_records):
476 """Find well-represented schemas in the given set of SeqRecords. 477 """ 478 fitness_evaluator = MostCountSchemaFitness(seq_records, 479 self.evaluator) 480 481 return self._finder.find_schemas(fitness_evaluator.calculate_fitness, 482 self.num_schemas)
483
484 - def find_differences(self, first_records, second_records):
485 """Find schemas which differentiate between the two sets of SeqRecords. 486 """ 487 fitness_evaluator = DifferentialSchemaFitness(first_records, 488 second_records, 489 self.evaluator) 490 491 return self._finder.find_schemas(fitness_evaluator.calculate_fitness, 492 self.num_schemas)
493 494
495 -class SchemaCoder(object):
496 """Convert a sequence into a representation of ambiguous motifs (schemas). 497 498 This takes a sequence, and returns the number of times specified 499 motifs are found in the sequence. This lets you represent a sequence 500 as just a count of (possibly ambiguous) motifs. 501 """
502 - def __init__(self, schemas, ambiguous_converter):
503 """Initialize the coder to convert sequences 504 505 Arguments: 506 507 o schema - A list of all of the schemas we want to search for 508 in input sequences. 509 510 o ambiguous_converter - An Schema class which can be 511 used to convert motifs into regular expressions for searching. 512 """ 513 self._schemas = schemas 514 self._converter = ambiguous_converter
515
516 - def representation(self, sequence):
517 """Represent the given input sequence as a bunch of motif counts. 518 519 Arguments: 520 521 o sequence - A Bio.Seq object we are going to represent as schemas. 522 523 This takes the sequence, searches for the motifs within it, and then 524 returns counts specifying the relative number of times each motifs 525 was found. The frequencies are in the order the original motifs were 526 passed into the initializer. 527 """ 528 schema_counts = [] 529 530 for schema in self._schemas: 531 num_counts = self._converter.num_matches(schema, str(sequence)) 532 schema_counts.append(num_counts) 533 534 # normalize the counts to go between zero and one 535 min_count = 0 536 max_count = max(schema_counts) 537 538 # only normalize if we've actually found something, otherwise 539 # we'll just return 0 for everything 540 if max_count > 0: 541 for count_num in range(len(schema_counts)): 542 schema_counts[count_num] = (float(schema_counts[count_num]) - 543 float(min_count)) / float(max_count) 544 545 return schema_counts
546 547
548 -def matches_schema(pattern, schema, ambiguity_character='*'):
549 """Determine whether or not the given pattern matches the schema. 550 551 Arguments: 552 553 o pattern - A string representing the pattern we want to check for 554 matching. This pattern can contain ambiguity characters (which are 555 assumed to be the same as those in the schema). 556 557 o schema - A string schema with ambiguity characters. 558 559 o ambiguity_character - The character used for ambiguity in the schema. 560 """ 561 if len(pattern) != len(schema): 562 return 0 563 564 # check each position, and return a non match if the schema and pattern 565 # are non ambiguous and don't match 566 for pos in range(len(pattern)): 567 if schema[pos] != ambiguity_character and \ 568 pattern[pos] != ambiguity_character and \ 569 pattern[pos] != schema[pos]: 570 571 return 0 572 573 return 1
574 575
576 -class SchemaFactory(object):
577 """Generate Schema from inputs of Motifs or Signatures. 578 """
579 - def __init__(self, ambiguity_symbol='*'):
580 """Initialize the SchemaFactory 581 582 Arguments: 583 584 o ambiguity_symbol -- The symbol to use when specifying that 585 a position is arbitrary. 586 """ 587 self._ambiguity_symbol = ambiguity_symbol
588
589 - def from_motifs(self, motif_repository, motif_percent, num_ambiguous):
590 """Generate schema from a list of motifs. 591 592 Arguments: 593 594 o motif_repository - A MotifRepository class that has all of the 595 motifs we want to convert to Schema. 596 597 o motif_percent - The percentage of motifs in the motif bank which 598 should be matches. We'll try to create schema that match this 599 percentage of motifs. 600 601 o num_ambiguous - The number of ambiguous characters to include 602 in each schema. The positions of these ambiguous characters will 603 be randomly selected. 604 """ 605 # get all of the motifs we can deal with 606 all_motifs = motif_repository.get_top_percentage(motif_percent) 607 608 # start building up schemas 609 schema_info = {} 610 # continue until we've built schema matching the desired percentage 611 # of motifs 612 total_count = self._get_num_motifs(motif_repository, all_motifs) 613 matched_count = 0 614 assert total_count > 0, "Expected to have motifs to match" 615 while (float(matched_count) / float(total_count)) < motif_percent: 616 new_schema, matching_motifs = \ 617 self._get_unique_schema(list(schema_info.keys()), 618 all_motifs, num_ambiguous) 619 620 # get the number of counts for the new schema and clean up 621 # the motif list 622 schema_counts = 0 623 for motif in matching_motifs: 624 # get the counts for the motif 625 schema_counts += motif_repository.count(motif) 626 627 # remove the motif from the motif list since it is already 628 # represented by this schema 629 all_motifs.remove(motif) 630 631 # all the schema info 632 schema_info[new_schema] = schema_counts 633 634 matched_count += schema_counts 635 636 # print "percentage:", float(matched_count) / float(total_count) 637 638 return PatternRepository(schema_info)
639
640 - def _get_num_motifs(self, repository, motif_list):
641 """Return the number of motif counts for the list of motifs. 642 """ 643 motif_count = 0 644 for motif in motif_list: 645 motif_count += repository.count(motif) 646 647 return motif_count
648
649 - def _get_unique_schema(self, cur_schemas, motif_list, num_ambiguous):
650 """Retrieve a unique schema from a motif. 651 652 We don't want to end up with schema that match the same thing, 653 since this could lead to ambiguous results, and be messy. This 654 tries to create schema, and checks that they do not match any 655 currently existing schema. 656 """ 657 # create a schema starting with a random motif 658 # we'll keep doing this until we get a completely new schema that 659 # doesn't match any old schema 660 num_tries = 0 661 662 while True: 663 # pick a motif to work from and make a schema from it 664 cur_motif = random.choice(motif_list) 665 666 num_tries += 1 667 668 new_schema, matching_motifs = \ 669 self._schema_from_motif(cur_motif, motif_list, 670 num_ambiguous) 671 672 has_match = 0 673 for old_schema in cur_schemas: 674 if matches_schema(new_schema, old_schema, 675 self._ambiguity_symbol): 676 has_match = 1 677 678 # if the schema doesn't match any other schema we've got 679 # a good one 680 if not(has_match): 681 break 682 683 # check for big loops in which we can't find a new schema 684 assert num_tries < 150, \ 685 "Could not generate schema in %s tries from %s with %s" \ 686 % (num_tries, motif_list, cur_schemas) 687 688 return new_schema, matching_motifs
689
690 - def _schema_from_motif(self, motif, motif_list, num_ambiguous):
691 """Create a schema from a given starting motif. 692 693 Arguments: 694 695 o motif - A motif with the pattern we will start from. 696 697 o motif_list - The total motifs we have.to match to. 698 699 o num_ambiguous - The number of ambiguous characters that should 700 be present in the schema. 701 702 Returns: 703 704 o A string representing the newly generated schema. 705 706 o A list of all of the motifs in motif_list that match the schema. 707 """ 708 assert motif in motif_list, \ 709 "Expected starting motif present in remaining motifs." 710 711 # convert random positions in the motif to ambiguous characters 712 # convert the motif into a list of characters so we can manipulate it 713 new_schema_list = list(motif) 714 for add_ambiguous in range(num_ambiguous): 715 # add an ambiguous position in a new place in the motif 716 while True: 717 ambig_pos = random.choice(list(range(len(new_schema_list)))) 718 719 # only add a position if it isn't already ambiguous 720 # otherwise, we'll try again 721 if new_schema_list[ambig_pos] != self._ambiguity_symbol: 722 new_schema_list[ambig_pos] = self._ambiguity_symbol 723 break 724 725 # convert the schema back to a string 726 new_schema = ''.join(new_schema_list) 727 728 # get the motifs that the schema matches 729 matched_motifs = [] 730 for motif in motif_list: 731 if matches_schema(motif, new_schema, self._ambiguity_symbol): 732 matched_motifs.append(motif) 733 734 return new_schema, matched_motifs
735
736 - def from_signatures(self, signature_repository, num_ambiguous):
737 raise NotImplementedError("Still need to code this.")
738