Package Bio :: Package NeuralNetwork :: Package Gene :: Module Schema
[hide private]
[frames] | no frames]

Source Code for Module Bio.NeuralNetwork.Gene.Schema

  1  """Deal with Motifs or Signatures allowing ambiguity in the sequences. 
  2   
  3  This class contains Schema which deal with Motifs and Signatures at 
  4  a higher level, by introducing `don't care` (ambiguity) symbols into 
  5  the sequences. For instance, you could combine the following Motifs: 
  6   
  7  'GATC', 'GATG', 'GATG', 'GATT' 
  8   
  9  as all falling under a schema like 'GAT*', where the star indicates a 
 10  character can be anything. This helps us condense a whole ton of 
 11  motifs or signatures. 
 12  """ 
 13  # standard modules 
 14  import random 
 15  import re 
 16   
 17  # biopython 
 18  from Bio import Alphabet 
 19  from Bio.Seq import MutableSeq 
 20   
 21  # neural network libraries 
 22  from Pattern import PatternRepository 
 23   
 24  # genetic algorithm libraries 
 25  from Bio.GA import Organism 
 26  from Bio.GA.Evolver import GenerationEvolver 
 27  from Bio.GA.Mutation.Simple import SinglePositionMutation 
 28  from Bio.GA.Crossover.Point import SinglePointCrossover 
 29  from Bio.GA.Repair.Stabilizing import AmbiguousRepair 
 30  from Bio.GA.Selection.Tournament import TournamentSelection 
 31  from Bio.GA.Selection.Diversity import DiversitySelection 
 32   
 33   
34 -class Schema(object):
35 """Deal with motifs that have ambiguity characters in it. 36 37 This motif class allows specific ambiguity characters and tries to 38 speed up finding motifs using regular expressions. 39 40 This is likely to be a replacement for the Schema representation, 41 since it allows multiple ambiguity characters to be used. 42 """
43 - def __init__(self, ambiguity_info):
44 """Initialize with ambiguity information. 45 46 Arguments: 47 48 o ambiguity_info - A dictionary which maps letters in the motifs to 49 the ambiguous characters which they might represent. For example, 50 {'R' : 'AG'} specifies that Rs in the motif can match an A or a G. 51 All letters in the motif must be represented in the ambiguity_info 52 dictionary. 53 """ 54 self._ambiguity_info = ambiguity_info 55 56 # a cache of all encoded motifs 57 self._motif_cache = {}
58
59 - def encode_motif(self, motif):
60 """Encode the passed motif as a regular expression pattern object. 61 62 Arguments: 63 64 o motif - The motif we want to encode. This should be a string. 65 66 Returns: 67 A compiled regular expression pattern object that can be used 68 for searching strings. 69 """ 70 regexp_string = "" 71 72 for motif_letter in motif: 73 try: 74 letter_matches = self._ambiguity_info[motif_letter] 75 except KeyError: 76 raise KeyError("No match information for letter %s" 77 % motif_letter) 78 79 if len(letter_matches) > 1: 80 regexp_match = "[" + letter_matches + "]" 81 elif len(letter_matches) == 1: 82 regexp_match = letter_matches 83 else: 84 raise ValueError("Unexpected match information %s" 85 % letter_matches) 86 87 regexp_string += regexp_match 88 89 return re.compile(regexp_string)
90
91 - def find_ambiguous(self, motif):
92 """Return the location of ambiguous items in the motif. 93 94 This just checks through the motif and compares each letter 95 against the ambiguity information. If a letter stands for multiple 96 items, it is ambiguous. 97 """ 98 ambig_positions = [] 99 for motif_letter_pos in range(len(motif)): 100 motif_letter = motif[motif_letter_pos] 101 try: 102 letter_matches = self._ambiguity_info[motif_letter] 103 except KeyError: 104 raise KeyError("No match information for letter %s" 105 % motif_letter) 106 107 if len(letter_matches) > 1: 108 ambig_positions.append(motif_letter_pos) 109 110 return ambig_positions
111
112 - def num_ambiguous(self, motif):
113 """Return the number of ambiguous letters in a given motif. 114 """ 115 ambig_positions = self.find_ambiguous(motif) 116 return len(ambig_positions)
117
118 - def find_matches(self, motif, query):
119 """Return all non-overlapping motif matches in the query string. 120 121 This utilizes the regular expression findall function, and will 122 return a list of all non-overlapping occurances in query that 123 match the ambiguous motif. 124 """ 125 try: 126 motif_pattern = self._motif_cache[motif] 127 except KeyError: 128 motif_pattern = self.encode_motif(motif) 129 self._motif_cache[motif] = motif_pattern 130 131 return motif_pattern.findall(query)
132
133 - def num_matches(self, motif, query):
134 """Find the number of non-overlapping times motif occurs in query. 135 """ 136 all_matches = self.find_matches(motif, query) 137 return len(all_matches)
138
139 - def all_unambiguous(self):
140 """Return a listing of all unambiguous letters allowed in motifs. 141 """ 142 all_letters = sorted(self._ambiguity_info) 143 unambig_letters = [] 144 145 for letter in all_letters: 146 possible_matches = self._ambiguity_info[letter] 147 if len(possible_matches) == 1: 148 unambig_letters.append(letter) 149 150 return unambig_letters
151 152 # --- helper classes and functions for the default SchemaFinder 153 154 # -- Alphabets 155 156
157 -class SchemaDNAAlphabet(Alphabet.Alphabet):
158 """Alphabet of a simple Schema for DNA sequences. 159 160 This defines a simple alphabet for DNA sequences that has a single 161 character which can match any other character. 162 163 o G,A,T,C - The standard unambiguous DNA alphabet. 164 165 o * - Any letter 166 """ 167 letters = ["G", "A", "T", "C", "*"] 168 169 alphabet_matches = {"G": "G", 170 "A": "A", 171 "T": "T", 172 "C": "C", 173 "*": "GATC"}
174 175 # -- GA schema finder 176 177
178 -class GeneticAlgorithmFinder(object):
179 """Find schemas using a genetic algorithm approach. 180 181 This approach to finding schema uses Genetic Algorithms to evolve 182 a set of schema and find the best schema for a specific set of 183 records. 184 185 The 'default' finder searches for ambiguous DNA elements. This 186 can be overridden easily by creating a GeneticAlgorithmFinder 187 with a different alphabet. 188 """
189 - def __init__(self, alphabet=SchemaDNAAlphabet()):
190 """Initialize a finder to get schemas using Genetic Algorithms. 191 192 Arguments: 193 194 o alphabet -- The alphabet which specifies the contents of the 195 schemas we'll be generating. This alphabet must contain the 196 attribute 'alphabet_matches', which is a dictionary specifying 197 the potential ambiguities of each letter in the alphabet. These 198 ambiguities will be used in building up the schema. 199 """ 200 self.alphabet = alphabet 201 202 self.initial_population = 500 203 self.min_generations = 10 204 205 self._set_up_genetic_algorithm()
206
208 """Overrideable function to set up the genetic algorithm parameters. 209 210 This functions sole job is to set up the different genetic 211 algorithm functionality. Since this can be quite complicated, this 212 allows cusotmizablity of all of the parameters. If you want to 213 customize specially, you can inherit from this class and override 214 this function. 215 """ 216 self.motif_generator = RandomMotifGenerator(self.alphabet) 217 218 self.mutator = SinglePositionMutation(mutation_rate=0.1) 219 self.crossover = SinglePointCrossover(crossover_prob=0.25) 220 self.repair = AmbiguousRepair(Schema(self.alphabet.alphabet_matches), 221 4) 222 self.base_selector = TournamentSelection(self.mutator, self.crossover, 223 self.repair, 2) 224 self.selector = DiversitySelection(self.base_selector, 225 self.motif_generator.random_motif)
226
227 - def find_schemas(self, fitness, num_schemas):
228 """Find the given number of unique schemas using a genetic algorithm 229 230 Arguments: 231 232 o fitness - A callable object (ie. function) which will evaluate 233 the fitness of a motif. 234 235 o num_schemas - The number of unique schemas with good fitness 236 that we want to generate. 237 """ 238 start_population = \ 239 Organism.function_population(self.motif_generator.random_motif, 240 self.initial_population, 241 fitness) 242 finisher = SimpleFinisher(num_schemas, self.min_generations) 243 244 # set up the evolver and do the evolution 245 evolver = GenerationEvolver(start_population, self.selector) 246 evolved_pop = evolver.evolve(finisher.is_finished) 247 248 # convert the evolved population into a PatternRepository 249 schema_info = {} 250 for org in evolved_pop: 251 # convert the Genome from a MutableSeq to a Seq so that 252 # the schemas are just strings (and not array("c")s) 253 seq_genome = org.genome.toseq() 254 schema_info[str(seq_genome)] = org.fitness 255 256 return PatternRepository(schema_info)
257 258 # -- fitness classes 259 260
261 -class DifferentialSchemaFitness(object):
262 """Calculate fitness for schemas that differentiate between sequences. 263 """
264 - def __init__(self, positive_seqs, negative_seqs, schema_evaluator):
265 """Initialize with different sequences to evaluate 266 267 Arguments: 268 269 o positive_seq - A list of SeqRecord objects which are the 'positive' 270 sequences -- the ones we want to select for. 271 272 o negative_seq - A list of SeqRecord objects which are the 'negative' 273 sequences that we want to avoid selecting. 274 275 o schema_evaluator - An Schema class which can be used to 276 evaluate find motif matches in sequences. 277 """ 278 self._pos_seqs = positive_seqs 279 self._neg_seqs = negative_seqs 280 self._schema_eval = schema_evaluator
281
282 - def calculate_fitness(self, genome):
283 """Calculate the fitness for a given schema. 284 285 Fitness is specified by the number of occurances of the schema in 286 the positive sequences minus the number of occurances in the 287 negative examples. 288 289 This fitness is then modified by multiplying by the length of the 290 schema and then dividing by the number of ambiguous characters in 291 the schema. This helps select for schema which are longer and have 292 less redundancy. 293 """ 294 # convert the genome into a string 295 seq_motif = genome.toseq() 296 motif = str(seq_motif) 297 298 # get the counts in the positive examples 299 num_pos = 0 300 for seq_record in self._pos_seqs: 301 cur_counts = self._schema_eval.num_matches(motif, 302 str(seq_record.seq)) 303 num_pos += cur_counts 304 305 # get the counts in the negative examples 306 num_neg = 0 307 for seq_record in self._neg_seqs: 308 cur_counts = self._schema_eval.num_matches(motif, 309 str(seq_record.seq)) 310 311 num_neg += cur_counts 312 313 num_ambiguous = self._schema_eval.num_ambiguous(motif) 314 # weight the ambiguous stuff more highly 315 num_ambiguous = pow(2.0, num_ambiguous) 316 # increment num ambiguous to prevent division by zero errors. 317 num_ambiguous += 1 318 319 motif_size = len(motif) 320 motif_size = motif_size * 4.0 321 322 discerning_power = num_pos - num_neg 323 324 diff = (discerning_power * motif_size) / float(num_ambiguous) 325 return diff
326 327
328 -class MostCountSchemaFitness(object):
329 """Calculate a fitness giving weight to schemas that match many times. 330 331 This fitness function tries to maximize schemas which are found many 332 times in a group of sequences. 333 """
334 - def __init__(self, seq_records, schema_evaluator):
335 """Initialize with sequences to evaluate. 336 337 Arguments: 338 339 o seq_records -- A set of SeqRecord objects which we use to 340 calculate the fitness. 341 342 o schema_evaluator - An Schema class which can be used to 343 evaluate find motif matches in sequences. 344 """ 345 self._records = seq_records 346 self._evaluator = schema_evaluator
347
348 - def calculate_fitness(self, genome):
349 """Calculate the fitness of a genome based on schema matches. 350 351 This bases the fitness of a genome completely on the number of times 352 it matches in the set of seq_records. Matching more times gives a 353 better fitness 354 """ 355 # convert the genome into a string 356 seq_motif = genome.toseq() 357 motif = str(seq_motif) 358 359 # find the number of times the genome matches 360 num_times = 0 361 for seq_record in self._records: 362 cur_counts = self._evaluator.num_matches(motif, 363 str(seq_record.seq)) 364 num_times += cur_counts 365 366 return num_times
367 368 369 # -- Helper classes
370 -class RandomMotifGenerator(object):
371 """Generate a random motif within given parameters. 372 """
373 - def __init__(self, alphabet, min_size=12, max_size=17):
374 """Initialize with the motif parameters. 375 376 Arguments: 377 378 o alphabet - An alphabet specifying what letters can be inserted in 379 a motif. 380 381 o min_size, max_size - Specify the range of sizes for motifs. 382 """ 383 self._alphabet = alphabet 384 self._min_size = min_size 385 self._max_size = max_size
386
387 - def random_motif(self):
388 """Create a random motif within the given parameters. 389 390 This returns a single motif string with letters from the given 391 alphabet. The size of the motif will be randomly chosen between 392 max_size and min_size. 393 """ 394 motif_size = random.randrange(self._min_size, self._max_size) 395 396 motif = "" 397 for letter_num in range(motif_size): 398 cur_letter = random.choice(self._alphabet.letters) 399 motif += cur_letter 400 401 return MutableSeq(motif, self._alphabet)
402 403
404 -class SimpleFinisher(object):
405 """Determine when we are done evolving motifs. 406 407 This takes the very simple approach of halting evolution when the 408 GA has proceeded for a specified number of generations and has 409 a given number of unique schema with positive fitness. 410 """
411 - def __init__(self, num_schemas, min_generations=100):
412 """Initialize the finisher with its parameters. 413 414 Arguments: 415 416 o num_schemas -- the number of useful (positive fitness) schemas 417 we want to generation 418 419 o min_generations -- The minimum number of generations to allow 420 the GA to proceed. 421 """ 422 self.num_generations = 0 423 424 self.num_schemas = num_schemas 425 self.min_generations = min_generations
426
427 - def is_finished(self, organisms):
428 """Determine when we can stop evolving the population. 429 """ 430 self.num_generations += 1 431 # print "generation %s" % self.num_generations 432 433 if self.num_generations >= self.min_generations: 434 all_seqs = [] 435 for org in organisms: 436 if org.fitness > 0: 437 if org.genome not in all_seqs: 438 all_seqs.append(org.genome) 439 440 if len(all_seqs) >= self.num_schemas: 441 return 1 442 443 return 0
444 # --- 445 446
447 -class SchemaFinder(object):
448 """Find schema in a set of sequences using a genetic algorithm approach. 449 450 Finding good schemas is very difficult because it takes forever to 451 enumerate all of the potential schemas. This finder using a genetic 452 algorithm approach to evolve good schema which match many times in 453 a set of sequences. 454 455 The default implementation of the finder is ready to find schemas 456 in a set of DNA sequences, but the finder can be customized to deal 457 with any type of data. 458 """
459 - def __init__(self, num_schemas=100, 460 schema_finder=GeneticAlgorithmFinder()):
461 self.num_schemas = num_schemas 462 self._finder = schema_finder 463 464 self.evaluator = Schema(self._finder.alphabet.alphabet_matches)
465
466 - def find(self, seq_records):
467 """Find well-represented schemas in the given set of SeqRecords. 468 """ 469 fitness_evaluator = MostCountSchemaFitness(seq_records, 470 self.evaluator) 471 472 return self._finder.find_schemas(fitness_evaluator.calculate_fitness, 473 self.num_schemas)
474
475 - def find_differences(self, first_records, second_records):
476 """Find schemas which differentiate between the two sets of SeqRecords. 477 """ 478 fitness_evaluator = DifferentialSchemaFitness(first_records, 479 second_records, 480 self.evaluator) 481 482 return self._finder.find_schemas(fitness_evaluator.calculate_fitness, 483 self.num_schemas)
484 485
486 -class SchemaCoder(object):
487 """Convert a sequence into a representation of ambiguous motifs (schemas). 488 489 This takes a sequence, and returns the number of times specified 490 motifs are found in the sequence. This lets you represent a sequence 491 as just a count of (possibly ambiguous) motifs. 492 """
493 - def __init__(self, schemas, ambiguous_converter):
494 """Initialize the coder to convert sequences 495 496 Arguments: 497 498 o schema - A list of all of the schemas we want to search for 499 in input sequences. 500 501 o ambiguous_converter - An Schema class which can be 502 used to convert motifs into regular expressions for searching. 503 """ 504 self._schemas = schemas 505 self._converter = ambiguous_converter
506
507 - def representation(self, sequence):
508 """Represent the given input sequence as a bunch of motif counts. 509 510 Arguments: 511 512 o sequence - A Bio.Seq object we are going to represent as schemas. 513 514 This takes the sequence, searches for the motifs within it, and then 515 returns counts specifying the relative number of times each motifs 516 was found. The frequencies are in the order the original motifs were 517 passed into the initializer. 518 """ 519 schema_counts = [] 520 521 for schema in self._schemas: 522 num_counts = self._converter.num_matches(schema, str(sequence)) 523 schema_counts.append(num_counts) 524 525 # normalize the counts to go between zero and one 526 min_count = 0 527 max_count = max(schema_counts) 528 529 # only normalize if we've actually found something, otherwise 530 # we'll just return 0 for everything 531 if max_count > 0: 532 for count_num in range(len(schema_counts)): 533 schema_counts[count_num] = (float(schema_counts[count_num]) - 534 float(min_count)) / float(max_count) 535 536 return schema_counts
537 538
539 -def matches_schema(pattern, schema, ambiguity_character='*'):
540 """Determine whether or not the given pattern matches the schema. 541 542 Arguments: 543 544 o pattern - A string representing the pattern we want to check for 545 matching. This pattern can contain ambiguity characters (which are 546 assumed to be the same as those in the schema). 547 548 o schema - A string schema with ambiguity characters. 549 550 o ambiguity_character - The character used for ambiguity in the schema. 551 """ 552 if len(pattern) != len(schema): 553 return 0 554 555 # check each position, and return a non match if the schema and pattern 556 # are non ambiguous and don't match 557 for pos in range(len(pattern)): 558 if schema[pos] != ambiguity_character and \ 559 pattern[pos] != ambiguity_character and \ 560 pattern[pos] != schema[pos]: 561 562 return 0 563 564 return 1
565 566
567 -class SchemaFactory(object):
568 """Generate Schema from inputs of Motifs or Signatures. 569 """
570 - def __init__(self, ambiguity_symbol='*'):
571 """Initialize the SchemaFactory 572 573 Arguments: 574 575 o ambiguity_symbol -- The symbol to use when specifying that 576 a position is arbitrary. 577 """ 578 self._ambiguity_symbol = ambiguity_symbol
579
580 - def from_motifs(self, motif_repository, motif_percent, num_ambiguous):
581 """Generate schema from a list of motifs. 582 583 Arguments: 584 585 o motif_repository - A MotifRepository class that has all of the 586 motifs we want to convert to Schema. 587 588 o motif_percent - The percentage of motifs in the motif bank which 589 should be matches. We'll try to create schema that match this 590 percentage of motifs. 591 592 o num_ambiguous - The number of ambiguous characters to include 593 in each schema. The positions of these ambiguous characters will 594 be randomly selected. 595 """ 596 # get all of the motifs we can deal with 597 all_motifs = motif_repository.get_top_percentage(motif_percent) 598 599 # start building up schemas 600 schema_info = {} 601 # continue until we've built schema matching the desired percentage 602 # of motifs 603 total_count = self._get_num_motifs(motif_repository, all_motifs) 604 matched_count = 0 605 assert total_count > 0, "Expected to have motifs to match" 606 while (float(matched_count) / float(total_count)) < motif_percent: 607 new_schema, matching_motifs = \ 608 self._get_unique_schema(schema_info.keys(), 609 all_motifs, num_ambiguous) 610 611 # get the number of counts for the new schema and clean up 612 # the motif list 613 schema_counts = 0 614 for motif in matching_motifs: 615 # get the counts for the motif 616 schema_counts += motif_repository.count(motif) 617 618 # remove the motif from the motif list since it is already 619 # represented by this schema 620 all_motifs.remove(motif) 621 622 # all the schema info 623 schema_info[new_schema] = schema_counts 624 625 matched_count += schema_counts 626 627 # print "percentage:", float(matched_count) / float(total_count) 628 629 return PatternRepository(schema_info)
630
631 - def _get_num_motifs(self, repository, motif_list):
632 """Return the number of motif counts for the list of motifs. 633 """ 634 motif_count = 0 635 for motif in motif_list: 636 motif_count += repository.count(motif) 637 638 return motif_count
639
640 - def _get_unique_schema(self, cur_schemas, motif_list, num_ambiguous):
641 """Retrieve a unique schema from a motif. 642 643 We don't want to end up with schema that match the same thing, 644 since this could lead to ambiguous results, and be messy. This 645 tries to create schema, and checks that they do not match any 646 currently existing schema. 647 """ 648 # create a schema starting with a random motif 649 # we'll keep doing this until we get a completely new schema that 650 # doesn't match any old schema 651 num_tries = 0 652 653 while 1: 654 # pick a motif to work from and make a schema from it 655 cur_motif = random.choice(motif_list) 656 657 num_tries += 1 658 659 new_schema, matching_motifs = \ 660 self._schema_from_motif(cur_motif, motif_list, 661 num_ambiguous) 662 663 has_match = 0 664 for old_schema in cur_schemas: 665 if matches_schema(new_schema, old_schema, 666 self._ambiguity_symbol): 667 has_match = 1 668 669 # if the schema doesn't match any other schema we've got 670 # a good one 671 if not(has_match): 672 break 673 674 # check for big loops in which we can't find a new schema 675 assert num_tries < 150, \ 676 "Could not generate schema in %s tries from %s with %s" \ 677 % (num_tries, motif_list, cur_schemas) 678 679 return new_schema, matching_motifs
680
681 - def _schema_from_motif(self, motif, motif_list, num_ambiguous):
682 """Create a schema from a given starting motif. 683 684 Arguments: 685 686 o motif - A motif with the pattern we will start from. 687 688 o motif_list - The total motifs we have.to match to. 689 690 o num_ambiguous - The number of ambiguous characters that should 691 be present in the schema. 692 693 Returns: 694 695 o A string representing the newly generated schema. 696 697 o A list of all of the motifs in motif_list that match the schema. 698 """ 699 assert motif in motif_list, \ 700 "Expected starting motif present in remaining motifs." 701 702 # convert random positions in the motif to ambiguous characters 703 # convert the motif into a list of characters so we can manipulate it 704 new_schema_list = list(motif) 705 for add_ambiguous in range(num_ambiguous): 706 # add an ambiguous position in a new place in the motif 707 while 1: 708 ambig_pos = random.choice(range(len(new_schema_list))) 709 710 # only add a position if it isn't already ambiguous 711 # otherwise, we'll try again 712 if new_schema_list[ambig_pos] != self._ambiguity_symbol: 713 new_schema_list[ambig_pos] = self._ambiguity_symbol 714 break 715 716 # convert the schema back to a string 717 new_schema = ''.join(new_schema_list) 718 719 # get the motifs that the schema matches 720 matched_motifs = [] 721 for motif in motif_list: 722 if matches_schema(motif, new_schema, self._ambiguity_symbol): 723 matched_motifs.append(motif) 724 725 return new_schema, matched_motifs
726
727 - def from_signatures(self, signature_repository, num_ambiguous):
728 raise NotImplementedError("Still need to code this.")
729