Package Bio :: Package NeuralNetwork :: Package Gene :: Module Schema
[hide private]
[frames] | no frames]

Source Code for Module Bio.NeuralNetwork.Gene.Schema

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """Deal with Motifs or Signatures allowing ambiguity in the sequences. 
  7   
  8  This class contains Schema which deal with Motifs and Signatures at 
  9  a higher level, by introducing `don't care` (ambiguity) symbols into 
 10  the sequences. For instance, you could combine the following Motifs: 
 11   
 12  'GATC', 'GATG', 'GATG', 'GATT' 
 13   
 14  as all falling under a schema like 'GAT*', where the star indicates a 
 15  character can be anything. This helps us condense a whole ton of 
 16  motifs or signatures. 
 17  """ 
 18  # standard modules 
 19  from __future__ import print_function 
 20   
 21  import random 
 22  import re 
 23   
 24  from Bio._py3k import range 
 25   
 26  from Bio import Alphabet 
 27  from Bio.Seq import MutableSeq 
 28   
 29  # neural network libraries 
 30  from .Pattern import PatternRepository 
 31   
 32  # genetic algorithm libraries 
 33  from Bio.GA import Organism 
 34  from Bio.GA.Evolver import GenerationEvolver 
 35  from Bio.GA.Mutation.Simple import SinglePositionMutation 
 36  from Bio.GA.Crossover.Point import SinglePointCrossover 
 37  from Bio.GA.Repair.Stabilizing import AmbiguousRepair 
 38  from Bio.GA.Selection.Tournament import TournamentSelection 
 39  from Bio.GA.Selection.Diversity import DiversitySelection 
 40   
 41   
42 -class Schema(object):
43 """Deal with motifs that have ambiguity characters in it. 44 45 This motif class allows specific ambiguity characters and tries to 46 speed up finding motifs using regular expressions. 47 48 This is likely to be a replacement for the Schema representation, 49 since it allows multiple ambiguity characters to be used. 50 """
51 - def __init__(self, ambiguity_info):
52 """Initialize with ambiguity information. 53 54 Arguments: 55 56 o ambiguity_info - A dictionary which maps letters in the motifs to 57 the ambiguous characters which they might represent. For example, 58 {'R' : 'AG'} specifies that Rs in the motif can match an A or a G. 59 All letters in the motif must be represented in the ambiguity_info 60 dictionary. 61 """ 62 self._ambiguity_info = ambiguity_info 63 64 # a cache of all encoded motifs 65 self._motif_cache = {}
66
67 - def encode_motif(self, motif):
68 """Encode the passed motif as a regular expression pattern object. 69 70 Arguments: 71 72 o motif - The motif we want to encode. This should be a string. 73 74 Returns: 75 A compiled regular expression pattern object that can be used 76 for searching strings. 77 """ 78 regexp_string = "" 79 80 for motif_letter in motif: 81 try: 82 letter_matches = self._ambiguity_info[motif_letter] 83 except KeyError: 84 raise KeyError("No match information for letter %s" 85 % motif_letter) 86 87 if len(letter_matches) > 1: 88 regexp_match = "[" + letter_matches + "]" 89 elif len(letter_matches) == 1: 90 regexp_match = letter_matches 91 else: 92 raise ValueError("Unexpected match information %s" 93 % letter_matches) 94 95 regexp_string += regexp_match 96 97 return re.compile(regexp_string)
98
99 - def find_ambiguous(self, motif):
100 """Return the location of ambiguous items in the motif. 101 102 This just checks through the motif and compares each letter 103 against the ambiguity information. If a letter stands for multiple 104 items, it is ambiguous. 105 """ 106 ambig_positions = [] 107 for motif_letter_pos in range(len(motif)): 108 motif_letter = motif[motif_letter_pos] 109 try: 110 letter_matches = self._ambiguity_info[motif_letter] 111 except KeyError: 112 raise KeyError("No match information for letter %s" 113 % motif_letter) 114 115 if len(letter_matches) > 1: 116 ambig_positions.append(motif_letter_pos) 117 118 return ambig_positions
119
120 - def num_ambiguous(self, motif):
121 """Return the number of ambiguous letters in a given motif. 122 """ 123 ambig_positions = self.find_ambiguous(motif) 124 return len(ambig_positions)
125
126 - def find_matches(self, motif, query):
127 """Return all non-overlapping motif matches in the query string. 128 129 This utilizes the regular expression findall function, and will 130 return a list of all non-overlapping occurances in query that 131 match the ambiguous motif. 132 """ 133 try: 134 motif_pattern = self._motif_cache[motif] 135 except KeyError: 136 motif_pattern = self.encode_motif(motif) 137 self._motif_cache[motif] = motif_pattern 138 139 return motif_pattern.findall(query)
140
141 - def num_matches(self, motif, query):
142 """Find the number of non-overlapping times motif occurs in query. 143 """ 144 all_matches = self.find_matches(motif, query) 145 return len(all_matches)
146
147 - def all_unambiguous(self):
148 """Return a listing of all unambiguous letters allowed in motifs. 149 """ 150 all_letters = sorted(self._ambiguity_info) 151 unambig_letters = [] 152 153 for letter in all_letters: 154 possible_matches = self._ambiguity_info[letter] 155 if len(possible_matches) == 1: 156 unambig_letters.append(letter) 157 158 return unambig_letters
159 160 # --- helper classes and functions for the default SchemaFinder 161 162 # -- Alphabets 163 164
165 -class SchemaDNAAlphabet(Alphabet.Alphabet):
166 """Alphabet of a simple Schema for DNA sequences. 167 168 This defines a simple alphabet for DNA sequences that has a single 169 character which can match any other character. 170 171 o G,A,T,C - The standard unambiguous DNA alphabet. 172 173 o * - Any letter 174 """ 175 letters = ["G", "A", "T", "C", "*"] 176 177 alphabet_matches = {"G": "G", 178 "A": "A", 179 "T": "T", 180 "C": "C", 181 "*": "GATC"}
182 183 # -- GA schema finder 184 185
186 -class GeneticAlgorithmFinder(object):
187 """Find schemas using a genetic algorithm approach. 188 189 This approach to finding schema uses Genetic Algorithms to evolve 190 a set of schema and find the best schema for a specific set of 191 records. 192 193 The 'default' finder searches for ambiguous DNA elements. This 194 can be overridden easily by creating a GeneticAlgorithmFinder 195 with a different alphabet. 196 """
197 - def __init__(self, alphabet=SchemaDNAAlphabet()):
198 """Initialize a finder to get schemas using Genetic Algorithms. 199 200 Arguments: 201 202 o alphabet -- The alphabet which specifies the contents of the 203 schemas we'll be generating. This alphabet must contain the 204 attribute 'alphabet_matches', which is a dictionary specifying 205 the potential ambiguities of each letter in the alphabet. These 206 ambiguities will be used in building up the schema. 207 """ 208 self.alphabet = alphabet 209 210 self.initial_population = 500 211 self.min_generations = 10 212 213 self._set_up_genetic_algorithm()
214
216 """Overrideable function to set up the genetic algorithm parameters. 217 218 This functions sole job is to set up the different genetic 219 algorithm functionality. Since this can be quite complicated, this 220 allows cusotmizablity of all of the parameters. If you want to 221 customize specially, you can inherit from this class and override 222 this function. 223 """ 224 self.motif_generator = RandomMotifGenerator(self.alphabet) 225 226 self.mutator = SinglePositionMutation(mutation_rate=0.1) 227 self.crossover = SinglePointCrossover(crossover_prob=0.25) 228 self.repair = AmbiguousRepair(Schema(self.alphabet.alphabet_matches), 229 4) 230 self.base_selector = TournamentSelection(self.mutator, self.crossover, 231 self.repair, 2) 232 self.selector = DiversitySelection(self.base_selector, 233 self.motif_generator.random_motif)
234
235 - def find_schemas(self, fitness, num_schemas):
236 """Find the given number of unique schemas using a genetic algorithm 237 238 Arguments: 239 240 o fitness - A callable object (ie. function) which will evaluate 241 the fitness of a motif. 242 243 o num_schemas - The number of unique schemas with good fitness 244 that we want to generate. 245 """ 246 start_population = \ 247 Organism.function_population(self.motif_generator.random_motif, 248 self.initial_population, 249 fitness) 250 finisher = SimpleFinisher(num_schemas, self.min_generations) 251 252 # set up the evolver and do the evolution 253 evolver = GenerationEvolver(start_population, self.selector) 254 evolved_pop = evolver.evolve(finisher.is_finished) 255 256 # convert the evolved population into a PatternRepository 257 schema_info = {} 258 for org in evolved_pop: 259 # convert the Genome from a MutableSeq to a Seq so that 260 # the schemas are just strings (and not array("c")s) 261 seq_genome = org.genome.toseq() 262 schema_info[str(seq_genome)] = org.fitness 263 264 return PatternRepository(schema_info)
265 266 # -- fitness classes 267 268
269 -class DifferentialSchemaFitness(object):
270 """Calculate fitness for schemas that differentiate between sequences. 271 """
272 - def __init__(self, positive_seqs, negative_seqs, schema_evaluator):
273 """Initialize with different sequences to evaluate 274 275 Arguments: 276 277 o positive_seq - A list of SeqRecord objects which are the 'positive' 278 sequences -- the ones we want to select for. 279 280 o negative_seq - A list of SeqRecord objects which are the 'negative' 281 sequences that we want to avoid selecting. 282 283 o schema_evaluator - An Schema class which can be used to 284 evaluate find motif matches in sequences. 285 """ 286 self._pos_seqs = positive_seqs 287 self._neg_seqs = negative_seqs 288 self._schema_eval = schema_evaluator
289
290 - def calculate_fitness(self, genome):
291 """Calculate the fitness for a given schema. 292 293 Fitness is specified by the number of occurances of the schema in 294 the positive sequences minus the number of occurances in the 295 negative examples. 296 297 This fitness is then modified by multiplying by the length of the 298 schema and then dividing by the number of ambiguous characters in 299 the schema. This helps select for schema which are longer and have 300 less redundancy. 301 """ 302 # convert the genome into a string 303 seq_motif = genome.toseq() 304 motif = str(seq_motif) 305 306 # get the counts in the positive examples 307 num_pos = 0 308 for seq_record in self._pos_seqs: 309 cur_counts = self._schema_eval.num_matches(motif, 310 str(seq_record.seq)) 311 num_pos += cur_counts 312 313 # get the counts in the negative examples 314 num_neg = 0 315 for seq_record in self._neg_seqs: 316 cur_counts = self._schema_eval.num_matches(motif, 317 str(seq_record.seq)) 318 319 num_neg += cur_counts 320 321 num_ambiguous = self._schema_eval.num_ambiguous(motif) 322 # weight the ambiguous stuff more highly 323 num_ambiguous = pow(2.0, num_ambiguous) 324 # increment num ambiguous to prevent division by zero errors. 325 num_ambiguous += 1 326 327 motif_size = len(motif) 328 motif_size = motif_size * 4.0 329 330 discerning_power = num_pos - num_neg 331 332 diff = (discerning_power * motif_size) / float(num_ambiguous) 333 return diff
334 335
336 -class MostCountSchemaFitness(object):
337 """Calculate a fitness giving weight to schemas that match many times. 338 339 This fitness function tries to maximize schemas which are found many 340 times in a group of sequences. 341 """
342 - def __init__(self, seq_records, schema_evaluator):
343 """Initialize with sequences to evaluate. 344 345 Arguments: 346 347 o seq_records -- A set of SeqRecord objects which we use to 348 calculate the fitness. 349 350 o schema_evaluator - An Schema class which can be used to 351 evaluate find motif matches in sequences. 352 """ 353 self._records = seq_records 354 self._evaluator = schema_evaluator
355
356 - def calculate_fitness(self, genome):
357 """Calculate the fitness of a genome based on schema matches. 358 359 This bases the fitness of a genome completely on the number of times 360 it matches in the set of seq_records. Matching more times gives a 361 better fitness 362 """ 363 # convert the genome into a string 364 seq_motif = genome.toseq() 365 motif = str(seq_motif) 366 367 # find the number of times the genome matches 368 num_times = 0 369 for seq_record in self._records: 370 cur_counts = self._evaluator.num_matches(motif, 371 str(seq_record.seq)) 372 num_times += cur_counts 373 374 return num_times
375 376 377 # -- Helper classes
378 -class RandomMotifGenerator(object):
379 """Generate a random motif within given parameters. 380 """
381 - def __init__(self, alphabet, min_size=12, max_size=17):
382 """Initialize with the motif parameters. 383 384 Arguments: 385 386 o alphabet - An alphabet specifying what letters can be inserted in 387 a motif. 388 389 o min_size, max_size - Specify the range of sizes for motifs. 390 """ 391 self._alphabet = alphabet 392 self._min_size = min_size 393 self._max_size = max_size
394
395 - def random_motif(self):
396 """Create a random motif within the given parameters. 397 398 This returns a single motif string with letters from the given 399 alphabet. The size of the motif will be randomly chosen between 400 max_size and min_size. 401 """ 402 motif_size = random.randrange(self._min_size, self._max_size) 403 404 motif = "" 405 for letter_num in range(motif_size): 406 cur_letter = random.choice(self._alphabet.letters) 407 motif += cur_letter 408 409 return MutableSeq(motif, self._alphabet)
410 411
412 -class SimpleFinisher(object):
413 """Determine when we are done evolving motifs. 414 415 This takes the very simple approach of halting evolution when the 416 GA has proceeded for a specified number of generations and has 417 a given number of unique schema with positive fitness. 418 """
419 - def __init__(self, num_schemas, min_generations=100):
420 """Initialize the finisher with its parameters. 421 422 Arguments: 423 424 o num_schemas -- the number of useful (positive fitness) schemas 425 we want to generation 426 427 o min_generations -- The minimum number of generations to allow 428 the GA to proceed. 429 """ 430 self.num_generations = 0 431 432 self.num_schemas = num_schemas 433 self.min_generations = min_generations
434
435 - def is_finished(self, organisms):
436 """Determine when we can stop evolving the population. 437 """ 438 self.num_generations += 1 439 # print "generation %s" % self.num_generations 440 441 if self.num_generations >= self.min_generations: 442 all_seqs = [] 443 for org in organisms: 444 if org.fitness > 0: 445 if org.genome not in all_seqs: 446 all_seqs.append(org.genome) 447 448 if len(all_seqs) >= self.num_schemas: 449 return 1 450 451 return 0
452 # --- 453 454
455 -class SchemaFinder(object):
456 """Find schema in a set of sequences using a genetic algorithm approach. 457 458 Finding good schemas is very difficult because it takes forever to 459 enumerate all of the potential schemas. This finder using a genetic 460 algorithm approach to evolve good schema which match many times in 461 a set of sequences. 462 463 The default implementation of the finder is ready to find schemas 464 in a set of DNA sequences, but the finder can be customized to deal 465 with any type of data. 466 """
467 - def __init__(self, num_schemas=100, 468 schema_finder=GeneticAlgorithmFinder()):
469 self.num_schemas = num_schemas 470 self._finder = schema_finder 471 472 self.evaluator = Schema(self._finder.alphabet.alphabet_matches)
473
474 - def find(self, seq_records):
475 """Find well-represented schemas in the given set of SeqRecords. 476 """ 477 fitness_evaluator = MostCountSchemaFitness(seq_records, 478 self.evaluator) 479 480 return self._finder.find_schemas(fitness_evaluator.calculate_fitness, 481 self.num_schemas)
482
483 - def find_differences(self, first_records, second_records):
484 """Find schemas which differentiate between the two sets of SeqRecords. 485 """ 486 fitness_evaluator = DifferentialSchemaFitness(first_records, 487 second_records, 488 self.evaluator) 489 490 return self._finder.find_schemas(fitness_evaluator.calculate_fitness, 491 self.num_schemas)
492 493
494 -class SchemaCoder(object):
495 """Convert a sequence into a representation of ambiguous motifs (schemas). 496 497 This takes a sequence, and returns the number of times specified 498 motifs are found in the sequence. This lets you represent a sequence 499 as just a count of (possibly ambiguous) motifs. 500 """
501 - def __init__(self, schemas, ambiguous_converter):
502 """Initialize the coder to convert sequences 503 504 Arguments: 505 506 o schema - A list of all of the schemas we want to search for 507 in input sequences. 508 509 o ambiguous_converter - An Schema class which can be 510 used to convert motifs into regular expressions for searching. 511 """ 512 self._schemas = schemas 513 self._converter = ambiguous_converter
514
515 - def representation(self, sequence):
516 """Represent the given input sequence as a bunch of motif counts. 517 518 Arguments: 519 520 o sequence - A Bio.Seq object we are going to represent as schemas. 521 522 This takes the sequence, searches for the motifs within it, and then 523 returns counts specifying the relative number of times each motifs 524 was found. The frequencies are in the order the original motifs were 525 passed into the initializer. 526 """ 527 schema_counts = [] 528 529 for schema in self._schemas: 530 num_counts = self._converter.num_matches(schema, str(sequence)) 531 schema_counts.append(num_counts) 532 533 # normalize the counts to go between zero and one 534 min_count = 0 535 max_count = max(schema_counts) 536 537 # only normalize if we've actually found something, otherwise 538 # we'll just return 0 for everything 539 if max_count > 0: 540 for count_num in range(len(schema_counts)): 541 schema_counts[count_num] = (float(schema_counts[count_num]) - 542 float(min_count)) / float(max_count) 543 544 return schema_counts
545 546
547 -def matches_schema(pattern, schema, ambiguity_character='*'):
548 """Determine whether or not the given pattern matches the schema. 549 550 Arguments: 551 552 o pattern - A string representing the pattern we want to check for 553 matching. This pattern can contain ambiguity characters (which are 554 assumed to be the same as those in the schema). 555 556 o schema - A string schema with ambiguity characters. 557 558 o ambiguity_character - The character used for ambiguity in the schema. 559 """ 560 if len(pattern) != len(schema): 561 return 0 562 563 # check each position, and return a non match if the schema and pattern 564 # are non ambiguous and don't match 565 for pos in range(len(pattern)): 566 if schema[pos] != ambiguity_character and \ 567 pattern[pos] != ambiguity_character and \ 568 pattern[pos] != schema[pos]: 569 570 return 0 571 572 return 1
573 574
575 -class SchemaFactory(object):
576 """Generate Schema from inputs of Motifs or Signatures. 577 """
578 - def __init__(self, ambiguity_symbol='*'):
579 """Initialize the SchemaFactory 580 581 Arguments: 582 583 o ambiguity_symbol -- The symbol to use when specifying that 584 a position is arbitrary. 585 """ 586 self._ambiguity_symbol = ambiguity_symbol
587
588 - def from_motifs(self, motif_repository, motif_percent, num_ambiguous):
589 """Generate schema from a list of motifs. 590 591 Arguments: 592 593 o motif_repository - A MotifRepository class that has all of the 594 motifs we want to convert to Schema. 595 596 o motif_percent - The percentage of motifs in the motif bank which 597 should be matches. We'll try to create schema that match this 598 percentage of motifs. 599 600 o num_ambiguous - The number of ambiguous characters to include 601 in each schema. The positions of these ambiguous characters will 602 be randomly selected. 603 """ 604 # get all of the motifs we can deal with 605 all_motifs = motif_repository.get_top_percentage(motif_percent) 606 607 # start building up schemas 608 schema_info = {} 609 # continue until we've built schema matching the desired percentage 610 # of motifs 611 total_count = self._get_num_motifs(motif_repository, all_motifs) 612 matched_count = 0 613 assert total_count > 0, "Expected to have motifs to match" 614 while (float(matched_count) / float(total_count)) < motif_percent: 615 new_schema, matching_motifs = \ 616 self._get_unique_schema(list(schema_info.keys()), 617 all_motifs, num_ambiguous) 618 619 # get the number of counts for the new schema and clean up 620 # the motif list 621 schema_counts = 0 622 for motif in matching_motifs: 623 # get the counts for the motif 624 schema_counts += motif_repository.count(motif) 625 626 # remove the motif from the motif list since it is already 627 # represented by this schema 628 all_motifs.remove(motif) 629 630 # all the schema info 631 schema_info[new_schema] = schema_counts 632 633 matched_count += schema_counts 634 635 # print "percentage:", float(matched_count) / float(total_count) 636 637 return PatternRepository(schema_info)
638
639 - def _get_num_motifs(self, repository, motif_list):
640 """Return the number of motif counts for the list of motifs. 641 """ 642 motif_count = 0 643 for motif in motif_list: 644 motif_count += repository.count(motif) 645 646 return motif_count
647
648 - def _get_unique_schema(self, cur_schemas, motif_list, num_ambiguous):
649 """Retrieve a unique schema from a motif. 650 651 We don't want to end up with schema that match the same thing, 652 since this could lead to ambiguous results, and be messy. This 653 tries to create schema, and checks that they do not match any 654 currently existing schema. 655 """ 656 # create a schema starting with a random motif 657 # we'll keep doing this until we get a completely new schema that 658 # doesn't match any old schema 659 num_tries = 0 660 661 while True: 662 # pick a motif to work from and make a schema from it 663 cur_motif = random.choice(motif_list) 664 665 num_tries += 1 666 667 new_schema, matching_motifs = \ 668 self._schema_from_motif(cur_motif, motif_list, 669 num_ambiguous) 670 671 has_match = 0 672 for old_schema in cur_schemas: 673 if matches_schema(new_schema, old_schema, 674 self._ambiguity_symbol): 675 has_match = 1 676 677 # if the schema doesn't match any other schema we've got 678 # a good one 679 if not(has_match): 680 break 681 682 # check for big loops in which we can't find a new schema 683 assert num_tries < 150, \ 684 "Could not generate schema in %s tries from %s with %s" \ 685 % (num_tries, motif_list, cur_schemas) 686 687 return new_schema, matching_motifs
688
689 - def _schema_from_motif(self, motif, motif_list, num_ambiguous):
690 """Create a schema from a given starting motif. 691 692 Arguments: 693 694 o motif - A motif with the pattern we will start from. 695 696 o motif_list - The total motifs we have.to match to. 697 698 o num_ambiguous - The number of ambiguous characters that should 699 be present in the schema. 700 701 Returns: 702 703 o A string representing the newly generated schema. 704 705 o A list of all of the motifs in motif_list that match the schema. 706 """ 707 assert motif in motif_list, \ 708 "Expected starting motif present in remaining motifs." 709 710 # convert random positions in the motif to ambiguous characters 711 # convert the motif into a list of characters so we can manipulate it 712 new_schema_list = list(motif) 713 for add_ambiguous in range(num_ambiguous): 714 # add an ambiguous position in a new place in the motif 715 while True: 716 ambig_pos = random.choice(list(range(len(new_schema_list)))) 717 718 # only add a position if it isn't already ambiguous 719 # otherwise, we'll try again 720 if new_schema_list[ambig_pos] != self._ambiguity_symbol: 721 new_schema_list[ambig_pos] = self._ambiguity_symbol 722 break 723 724 # convert the schema back to a string 725 new_schema = ''.join(new_schema_list) 726 727 # get the motifs that the schema matches 728 matched_motifs = [] 729 for motif in motif_list: 730 if matches_schema(motif, new_schema, self._ambiguity_symbol): 731 matched_motifs.append(motif) 732 733 return new_schema, matched_motifs
734
735 - def from_signatures(self, signature_repository, num_ambiguous):
736 raise NotImplementedError("Still need to code this.")
737