1 """
2 Contains classes to deal with generic sequence alignment stuff not
3 specific to a particular program or format.
4
5 classes:
6 o Alignment
7 """
8
9
10 import string
11
12
13 from Bio.Seq import Seq
14 from Bio.SeqRecord import SeqRecord
15 from Bio import Alphabet
16 from Bio.Alphabet import IUPAC
17
19 """Represent a set of alignments.
20
21 This is a base class to represent alignments, which can be subclassed
22 to deal with an alignment in a specific format.
23 """
25 """Initialize a new Alignment object.
26
27 Arguments:
28 o alphabet - The alphabet to use for the sequence objects that are
29 created. This alphabet must be a gapped type.
30 """
31 self._alphabet = alphabet
32
33 self._records = []
34
36 """Return all of the sequences involved in the alignment.
37
38 The return value is a list of SeqRecord objects.
39 """
40 return self._records
41
43 """Iterate over alignment rows as SeqRecord objects
44
45 e.g.
46
47 for record in align :
48 print record.id
49 print record.seq
50 """
51 return iter(self._records)
52
54 """Retrieve a sequence by the number of the sequence in the consensus.
55
56 Returns:
57 o A Seq object for the requested sequence.
58
59 Raises:
60 o IndexError - If the specified number is out of range.
61 """
62 return self._records[number].seq
63
65 """Return the maximum length of the alignment.
66
67 All objects in the alignment should (hopefully) have the same
68 length. This function will go through and find this length
69 by finding the maximum length of sequences in the alignment.
70 """
71 max_length = 0
72
73 for record in self._records:
74 if len(record.seq) > max_length:
75 max_length = len(record.seq)
76
77 return max_length
78
79 - def add_sequence(self, descriptor, sequence, start = None, end = None,
80 weight = 1.0):
81 """Add a sequence to the alignment.
82
83 This doesn't do any kind of alignment, it just adds in the sequence
84 object, which is assumed to be prealigned with the existing
85 sequences.
86
87 Arguments:
88 o descriptor - The descriptive id of the sequence being added.
89 This will be used as the resulting SeqRecord's
90 .id property (and, for historical compatibility,
91 also the .description property)
92 o sequence - A string with sequence info.
93 o start - You can explicitly set the start point of the sequence.
94 This is useful (at least) for BLAST alignments, which can just
95 be partial alignments of sequences.
96 o end - Specify the end of the sequence, which is important
97 for the same reason as the start.
98 o weight - The weight to place on the sequence in the alignment.
99 By default, all sequences have the same weight. (0.0 => no weight,
100 1.0 => highest weight)
101 """
102 new_seq = Seq(sequence, self._alphabet)
103
104
105
106
107
108
109 new_record = SeqRecord(new_seq,
110 id = descriptor,
111 description = descriptor)
112
113
114
115
116
117
118
119
120 if start:
121 new_record.annotations['start'] = start
122 if end:
123 new_record.annotations['end'] = end
124
125
126 new_record.annotations['weight'] = weight
127
128 self._records.append(new_record)
129
131 """Returns a string containing a given column"""
132 col_str = ''
133 assert col >= 0 and col <= self.get_alignment_length()
134 for rec in self._records:
135 col_str += rec.seq[col]
136 return col_str
137
138 if __name__ == "__main__" :
139 print "Mini self test..."
140
141 raw_data = ["ACGATCAGCTAGCT", "CCGATCAGCTAGCT", "ACGATGAGCTAGCT"]
142 a = Alignment(Alphabet.generic_dna)
143 a.add_sequence("Alpha", raw_data[0], weight=2)
144 a.add_sequence("Beta", raw_data[1])
145 a.add_sequence("Gamma", raw_data[2])
146
147
148 for rec in a :
149 assert isinstance(rec, SeqRecord)
150 for r,rec in enumerate(a) :
151 assert isinstance(rec, SeqRecord)
152 assert raw_data[r] == rec.seq.tostring()
153 if r==0 : assert rec.annotations['weight']==2
154 print "Alignment iteraction as SeqRecord OK"
155