1 """Deal with Motifs or Signatures allowing ambiguity in the sequences.
2
3 This class contains Schema which deal with Motifs and Signatures at
4 a higher level, by introducing `don't care` (ambiguity) symbols into
5 the sequences. For instance, you could combine the following Motifs:
6
7 'GATC', 'GATG', 'GATG', 'GATT'
8
9 as all falling under a schema like 'GAT*', where the star indicates a
10 character can be anything. This helps us condense a whole ton of
11 motifs or signatures.
12 """
13
14 import random
15 import string
16 import re
17
18
19 from Bio import Alphabet
20 from Bio.Seq import MutableSeq
21
22
23 from Pattern import PatternRepository
24
25
26 from Bio.GA import Organism
27 from Bio.GA.Evolver import GenerationEvolver
28 from Bio.GA.Mutation.Simple import SinglePositionMutation
29 from Bio.GA.Crossover.Point import SinglePointCrossover
30 from Bio.GA.Repair.Stabilizing import AmbiguousRepair
31 from Bio.GA.Selection.Tournament import TournamentSelection
32 from Bio.GA.Selection.Diversity import DiversitySelection
33
35 """Deal with motifs that have ambiguity characters in it.
36
37 This motif class allows specific ambiguity characters and tries to
38 speed up finding motifs using regular expressions.
39
40 This is likely to be a replacement for the Schema representation,
41 since it allows multiple ambiguity characters to be used.
42 """
44 """Initialize with ambiguity information.
45
46 Arguments:
47
48 o ambiguity_info - A dictionary which maps letters in the motifs to
49 the ambiguous characters which they might represent. For example,
50 {'R' : 'AG'} specifies that Rs in the motif can match a A or a G.
51 All letters in the motif must be represented in the ambiguity_info
52 dictionary.
53 """
54 self._ambiguity_info = ambiguity_info
55
56
57 self._motif_cache = {}
58
60 """Encode the passed motif as a regular expression pattern object.
61
62 Arguments:
63
64 o motif - The motif we want to encode. This should be a string.
65
66 Returns:
67 A compiled regular expression pattern object that can be used
68 for searching strings.
69 """
70 regexp_string = ""
71
72 for motif_letter in motif:
73 try:
74 letter_matches = self._ambiguity_info[motif_letter]
75 except KeyError:
76 raise KeyError("No match information for letter %s"
77 % motif_letter)
78
79 if len(letter_matches) > 1:
80 regexp_match = "[" + letter_matches + "]"
81 elif len(letter_matches) == 1:
82 regexp_match = letter_matches
83 else:
84 raise ValueError("Unexpected match information %s"
85 % letter_matches)
86
87 regexp_string += regexp_match
88
89 return re.compile(regexp_string)
90
92 """Return the location of ambiguous items in the motif.
93
94 This just checks through the motif and compares each letter
95 against the ambiguity information. If a letter stands for multiple
96 items, it is ambiguous.
97 """
98 ambig_positions = []
99 for motif_letter_pos in range(len(motif)):
100 motif_letter = motif[motif_letter_pos]
101 try:
102 letter_matches = self._ambiguity_info[motif_letter]
103 except KeyError:
104 raise KeyError("No match information for letter %s"
105 % motif_letter)
106
107 if len(letter_matches) > 1:
108 ambig_positions.append(motif_letter_pos)
109
110 return ambig_positions
111
113 """Return the number of ambiguous letters in a given motif.
114 """
115 ambig_positions = self.find_ambiguous(motif)
116 return len(ambig_positions)
117
119 """Return all non-overlapping motif matches in the query string.
120
121 This utilizes the regular expression findall function, and will
122 return a list of all non-overlapping occurances in query that
123 match the ambiguous motif.
124 """
125 try:
126 motif_pattern = self._motif_cache[motif]
127 except KeyError:
128 motif_pattern = self.encode_motif(motif)
129 self._motif_cache[motif] = motif_pattern
130
131 return motif_pattern.findall(query)
132
134 """Find the number of non-overlapping times motif occurs in query.
135 """
136 all_matches = self.find_matches(motif, query)
137 return len(all_matches)
138
140 """Return a listing of all unambiguous letters allowed in motifs.
141 """
142 all_letters = self._ambiguity_info.keys()
143 all_letters.sort()
144
145 unambig_letters = []
146
147 for letter in all_letters:
148 possible_matches = self._ambiguity_info[letter]
149 if len(possible_matches) == 1:
150 unambig_letters.append(letter)
151
152 return unambig_letters
153
154
155
156
157
159 """Alphabet of a simple Schema for DNA sequences.
160
161 This defines a simple alphabet for DNA sequences that has a single
162 character which can match any other character.
163
164 o G,A,T,C - The standard unambiguous DNA alphabet.
165
166 o * - Any letter
167 """
168 letters = ["G", "A", "T", "C", "*"]
169
170 alphabet_matches = {"G" : "G",
171 "A" : "A",
172 "T" : "T",
173 "C" : "C",
174 "*" : "GATC"}
175
176
177
179 """Find schemas using a genetic algorithm approach.
180
181 This approach to finding schema uses Genetic Algorithms to evolve
182 a set of schema and find the best schema for a specific set of
183 records.
184
185 The 'default' finder searches for ambiguous DNA elements. This
186 can be overridden easily by creating a GeneticAlgorithmFinder
187 with a different alphabet.
188 """
190 """Initialize a finder to get schemas using Genetic Algorithms.
191
192 Arguments:
193
194 o alphabet -- The alphabet which specifies the contents of the
195 schemas we'll be generating. This alphabet must contain the
196 attribute 'alphabet_matches', which is a dictionary specifying
197 the potential ambiguities of each letter in the alphabet. These
198 ambiguities will be used in building up the schema.
199 """
200 self.alphabet = alphabet
201
202 self.initial_population = 500
203 self.min_generations = 10
204
205 self._set_up_genetic_algorithm()
206
226
228 """Find the given number of unique schemas using a genetic algorithm
229
230 Arguments:
231
232 o fitness - A callable object (ie. function) which will evaluate
233 the fitness of a motif.
234
235 o num_schemas - The number of unique schemas with good fitness
236 that we want to generate.
237 """
238 start_population = \
239 Organism.function_population(self.motif_generator.random_motif,
240 self.initial_population,
241 fitness)
242 finisher = SimpleFinisher(num_schemas, self.min_generations)
243
244
245 evolver = GenerationEvolver(start_population, self.selector)
246 evolved_pop = evolver.evolve(finisher.is_finished)
247
248
249 schema_info = {}
250 for org in evolved_pop:
251
252
253 seq_genome = org.genome.toseq()
254 schema_info[seq_genome.data] = org.fitness
255
256 return PatternRepository(schema_info)
257
258
259
261 """Calculate fitness for schemas that differentiate between sequences.
262 """
263 - def __init__(self, positive_seqs, negative_seqs, schema_evaluator):
264 """Initialize with different sequences to evaluate
265
266 Arguments:
267
268 o positive_seq - A list of SeqRecord objects which are the 'positive'
269 sequences -- the ones we want to select for.
270
271 o negative_seq - A list of SeqRecord objects which are the 'negative'
272 sequences that we want to avoid selecting.
273
274 o schema_evaluator - An Schema class which can be used to
275 evaluate find motif matches in sequences.
276 """
277 self._pos_seqs = positive_seqs
278 self._neg_seqs = negative_seqs
279 self._schema_eval = schema_evaluator
280
282 """Calculate the fitness for a given schema.
283
284 Fitness is specified by the number of occurances of the schema in
285 the positive sequences minus the number of occurances in the
286 negative examples.
287
288 This fitness is then modified by multiplying by the length of the
289 schema and then dividing by the number of ambiguous characters in
290 the schema. This helps select for schema which are longer and have
291 less redundancy.
292 """
293
294 seq_motif = genome.toseq()
295 motif = seq_motif.data
296
297
298 num_pos = 0
299 for seq_record in self._pos_seqs:
300 cur_counts = self._schema_eval.num_matches(motif,
301 seq_record.seq.data)
302 num_pos += cur_counts
303
304
305 num_neg = 0
306 for seq_record in self._neg_seqs:
307 cur_counts = self._schema_eval.num_matches(motif,
308 seq_record.seq.data)
309
310 num_neg += cur_counts
311
312 num_ambiguous = self._schema_eval.num_ambiguous(motif)
313
314 num_ambiguous = pow(2.0, num_ambiguous)
315
316 num_ambiguous += 1
317
318 motif_size = len(motif)
319 motif_size = motif_size * 4.0
320
321 discerning_power = num_pos - num_neg
322
323 diff = (discerning_power * motif_size) / float(num_ambiguous)
324 return diff
325
327 """Calculate a fitness giving weight to schemas that match many times.
328
329 This fitness function tries to maximize schemas which are found many
330 times in a group of sequences.
331 """
332 - def __init__(self, seq_records, schema_evaluator):
333 """Initialize with sequences to evaluate.
334
335 Arguments:
336
337 o seq_records -- A set of SeqRecord objects which we use to
338 calculate the fitness.
339
340 o schema_evaluator - An Schema class which can be used to
341 evaluate find motif matches in sequences.
342 """
343 self._records = seq_records
344 self._evaluator = schema_evaluator
345
347 """Calculate the fitness of a genome based on schema matches.
348
349 This bases the fitness of a genome completely on the number of times
350 it matches in the set of seq_records. Matching more times gives a
351 better fitness
352 """
353
354 seq_motif = genome.toseq()
355 motif = seq_motif.data
356
357
358 num_times = 0
359 for seq_record in self._records:
360 cur_counts = self._evaluator.num_matches(motif,
361 seq_record.seq.data)
362 num_times += cur_counts
363
364 return num_times
365
366
368 """Generate a random motif within given parameters.
369 """
370 - def __init__(self, alphabet, min_size = 12, max_size = 17):
371 """Initialize with the motif parameters.
372
373 Arguments:
374
375 o alphabet - An alphabet specifying what letters can be inserted in
376 a motif.
377
378 o min_size, max_size - Specify the range of sizes for motifs.
379 """
380 self._alphabet = alphabet
381 self._min_size = min_size
382 self._max_size = max_size
383
385 """Create a random motif within the given parameters.
386
387 This returns a single motif string with letters from the given
388 alphabet. The size of the motif will be randomly chosen between
389 max_size and min_size.
390 """
391 motif_size = random.randrange(self._min_size, self._max_size)
392
393 motif = ""
394 for letter_num in range(motif_size):
395 cur_letter = random.choice(self._alphabet.letters)
396 motif += cur_letter
397
398 return MutableSeq(motif, self._alphabet)
399
401 """Determine when we are done evolving motifs.
402
403 This takes the very simple approach of halting evolution when the
404 GA has proceeded for a specified number of generations and has
405 a given number of unique schema with positive fitness.
406 """
407 - def __init__(self, num_schemas, min_generations = 100):
408 """Initialize the finisher with its parameters.
409
410 Arguments:
411
412 o num_schemas -- the number of useful (positive fitness) schemas
413 we want to generation
414
415 o min_generations -- The minimum number of generations to allow
416 the GA to proceed.
417 """
418 self.num_generations = 0
419
420 self.num_schemas = num_schemas
421 self.min_generations = min_generations
422
424 """Determine when we can stop evolving the population.
425 """
426 self.num_generations += 1
427
428
429 if self.num_generations >= self.min_generations:
430 all_seqs = []
431 for org in organisms:
432 if org.fitness > 0:
433 if org.genome not in all_seqs:
434 all_seqs.append(org.genome)
435
436 if len(all_seqs) >= self.num_schemas:
437 return 1
438
439 return 0
440
441
443 """Find schema in a set of sequences using a genetic algorithm approach.
444
445 Finding good schemas is very difficult because it takes forever to
446 enumerate all of the potential schemas. This finder using a genetic
447 algorithm approach to evolve good schema which match many times in
448 a set of sequences.
449
450 The default implementation of the finder is ready to find schemas
451 in a set of DNA sequences, but the finder can be customized to deal
452 with any type of data.
453 """
460
461 - def find(self, seq_records):
462 """Find well-represented schemas in the given set of SeqRecords.
463 """
464 fitness_evaluator = MostCountSchemaFitness(seq_records,
465 self.evaluator)
466
467 return self._finder.find_schemas(fitness_evaluator.calculate_fitness,
468 self.num_schemas)
469
471 """Find schemas which differentiate between the two sets of SeqRecords.
472 """
473 fitness_evaluator = DifferentialSchemaFitness(first_records,
474 second_records,
475 self.evaluator)
476
477 return self._finder.find_schemas(fitness_evaluator.calculate_fitness,
478 self.num_schemas)
479
481 """Convert a sequence into a representation of ambiguous motifs (schemas).
482
483 This takes a sequence, and returns the number of times specified
484 motifs are found in the sequence. This lets you represent a sequence
485 as just a count of (possibly ambiguous) motifs.
486 """
487 - def __init__(self, schemas, ambiguous_converter):
488 """Initialize the coder to convert sequences
489
490 Arguments:
491
492 o schema - A list of all of the schemas we want to search for
493 in input sequences.
494
495 o ambiguous_converter - An Schema class which can be
496 used to convert motifs into regular expressions for searching.
497 """
498 self._schemas = schemas
499 self._converter = ambiguous_converter
500
502 """Represent the given input sequence as a bunch of motif counts.
503
504 Arguments:
505
506 o sequence - A Bio.Seq object we are going to represent as schemas.
507
508 This takes the sequence, searches for the motifs within it, and then
509 returns counts specifying the relative number of times each motifs
510 was found. The frequencies are in the order the original motifs were
511 passed into the initializer.
512 """
513 schema_counts = []
514
515 for schema in self._schemas:
516 num_counts = self._converter.num_matches(schema, sequence.data)
517 schema_counts.append(num_counts)
518
519
520 min_count = 0
521 max_count = max(schema_counts)
522
523
524
525 if max_count > 0:
526 for count_num in range(len(schema_counts)):
527 schema_counts[count_num] = (float(schema_counts[count_num]) -
528 float(min_count)) / float(max_count)
529
530 return schema_counts
531
533 """Determine whether or not the given pattern matches the schema.
534
535 Arguments:
536
537 o pattern - A string representing the pattern we want to check for
538 matching. This pattern can contain ambiguity characters (which are
539 assumed to be the same as those in the schema).
540
541 o schema - A string schema with ambiguity characters.
542
543 o ambiguity_character - The character used for ambiguity in the schema.
544 """
545 if len(pattern) != len(schema):
546 return 0
547
548
549
550 for pos in range(len(pattern)):
551 if (schema[pos] != ambiguity_character and
552 pattern[pos] != ambiguity_character and
553 pattern[pos] != schema[pos]):
554
555 return 0
556
557 return 1
558
560 """Generate Schema from inputs of Motifs or Signatures.
561 """
562 - def __init__(self, ambiguity_symbol = '*'):
563 """Initialize the SchemaFactory
564
565 Arguments:
566
567 o ambiguity_symbol -- The symbol to use when specifying that
568 a position is arbitrary.
569 """
570 self._ambiguity_symbol = ambiguity_symbol
571
572 - def from_motifs(self, motif_repository, motif_percent, num_ambiguous):
573 """Generate schema from a list of motifs.
574
575 Arguments:
576
577 o motif_repository - A MotifRepository class that has all of the
578 motifs we want to convert to Schema.
579
580 o motif_percent - The percentage of motifs in the motif bank which
581 should be matches. We'll try to create schema that match this
582 percentage of motifs.
583
584 o num_ambiguous - The number of ambiguous characters to include
585 in each schema. The positions of these ambiguous characters will
586 be randomly selected.
587 """
588
589 all_motifs = motif_repository.get_top_percentage(motif_percent)
590
591
592 schema_info = {}
593
594
595 total_count = self._get_num_motifs(motif_repository, all_motifs)
596 matched_count = 0
597 assert total_count > 0, "Expected to have motifs to match"
598 while (float(matched_count) / float(total_count)) < motif_percent:
599
600 new_schema, matching_motifs = \
601 self._get_unique_schema(schema_info.keys(),
602 all_motifs, num_ambiguous)
603
604
605
606 schema_counts = 0
607 for motif in matching_motifs:
608
609 schema_counts += motif_repository.count(motif)
610
611
612
613 all_motifs.remove(motif)
614
615
616
617 schema_info[new_schema] = schema_counts
618
619 matched_count += schema_counts
620
621
622
623 return PatternRepository(schema_info)
624
626 """Return the number of motif counts for the list of motifs.
627 """
628 motif_count = 0
629 for motif in motif_list:
630 motif_count += repository.count(motif)
631
632 return motif_count
633
635 """Retrieve a unique schema from a motif.
636
637 We don't want to end up with schema that match the same thing,
638 since this could lead to ambiguous results, and be messy. This
639 tries to create schema, and checks that they do not match any
640 currently existing schema.
641 """
642
643
644
645 num_tries = 0
646
647 while 1:
648
649 cur_motif = random.choice(motif_list)
650
651 num_tries += 1
652
653 new_schema, matching_motifs = \
654 self._schema_from_motif(cur_motif, motif_list,
655 num_ambiguous)
656
657 has_match = 0
658 for old_schema in cur_schemas:
659 if matches_schema(new_schema, old_schema,
660 self._ambiguity_symbol):
661 has_match = 1
662
663
664
665 if not(has_match):
666 break
667
668
669 assert num_tries < 150, \
670 "Could not generate schema in %s tries from %s with %s" \
671 % (num_tries, motif_list, cur_schemas)
672
673 return new_schema, matching_motifs
674
676 """Create a schema from a given starting motif.
677
678 Arguments:
679
680 o motif - A motif with the pattern we will start from.
681
682 o motif_list - The total motifs we have.to match to.
683
684 o num_ambiguous - The number of ambiguous characters that should
685 be present in the schema.
686
687 Returns:
688
689 o A string representing the newly generated schema.
690
691 o A list of all of the motifs in motif_list that match the schema.
692 """
693 assert motif in motif_list, \
694 "Expected starting motif present in remaining motifs."
695
696
697
698 new_schema_list = list(motif)
699 for add_ambiguous in range(num_ambiguous):
700
701 while 1:
702 ambig_pos = random.choice(range(len(new_schema_list)))
703
704
705
706 if new_schema_list[ambig_pos] != self._ambiguity_symbol:
707 new_schema_list[ambig_pos] = self._ambiguity_symbol
708 break
709
710
711 new_schema = string.join(new_schema_list, '')
712
713
714 matched_motifs = []
715 for motif in motif_list:
716 if matches_schema(motif, new_schema, self._ambiguity_symbol):
717 matched_motifs.append(motif)
718
719 return new_schema, matched_motifs
720
722 raise NotImplementedError("Still need to code this.")
723