1 """
2 Contains classes to deal with generic sequence alignment stuff not
3 specific to a particular program or format.
4
5 classes:
6 o Alignment
7 """
8
9
10 import string
11
12
13 from Bio.Seq import Seq
14 from Bio.SeqRecord import SeqRecord
15 from Bio import Alphabet
16 from Bio.Alphabet import IUPAC
17
19 """Represent a set of alignments.
20
21 This is a base class to represent alignments, which can be subclassed
22 to deal with an alignment in a specific format.
23 """
25 """Initialize a new Alignment object.
26
27 Arguments:
28 o alphabet - The alphabet to use for the sequence objects that are
29 created. This alphabet must be a gapped type.
30 """
31 self._alphabet = alphabet
32
33 self._records = []
34
36 """Returns a truncated string representation of a SeqRecord.
37
38 This is a PRIVATE function used by the __str__ method.
39 """
40 if len(record.seq) <= 50 :
41 return "%s %s" % (record.seq, record.id)
42 else :
43 return "%s...%s %s" \
44 % (record.seq[:44], record.seq[-3:], record.id)
45
47 """Returns a multi-line string summary of the alignment.
48
49 This output is intended to be readable, but large alignments are
50 shown truncated. A maximum of 20 rows (sequences) and 50 columns
51 are shown, with the record identifiers. This should fit nicely on a
52 single screen. e.g.
53
54 DNAAlphabet() alignment with 3 rows and 14 columns
55 ACGATCAGCTAGCT Alpha
56 CCGATCAGCTAGCT Beta
57 ACGATGAGCTAGCT Gamma
58 """
59 rows = len(self._records)
60 lines = ["%s alignment with %i rows and %i columns" \
61 % (str(self._alphabet), rows, self.get_alignment_length())]
62 if rows <= 20 :
63 lines.extend([self._str_line(rec) for rec in self._records])
64 else :
65 lines.extend([self._str_line(rec) for rec in self._records[:18]])
66 lines.append("...")
67 lines.append(self._str_line(self._records[-1]))
68 return "\n".join(lines)
69
71 """Returns a representation of the object for debugging.
72
73 The representation cannot be used with eval() to recreate the object,
74 which is usually possible with simple python ojects. For example:
75
76 <Bio.Align.Generic.Alignment instance (2 records of length 14,
77 SingleLetterAlphabet()) at a3c184c>
78
79 The hex string is the memory address of the object, see help(id).
80 This provides a simple way to visually distinguish alignments of
81 the same size.
82 """
83 return "<%s instance (%i records of length %i, %s) at %x>" % \
84 (self.__class__, len(self._records),
85 self.get_alignment_length(), repr(self._alphabet), id(self))
86
87
88
89
90
92 """Return all of the sequences involved in the alignment.
93
94 The return value is a list of SeqRecord objects.
95 """
96 return self._records
97
99 """Iterate over alignment rows as SeqRecord objects
100
101 e.g.
102
103 for record in align :
104 print record.id
105 print record.seq
106 """
107 return iter(self._records)
108
110 """Retrieve a sequence by row number.
111
112 Returns:
113 o A Seq object for the requested sequence.
114
115 Raises:
116 o IndexError - If the specified number is out of range.
117 """
118 return self._records[number].seq
119
121 """Return the maximum length of the alignment.
122
123 All objects in the alignment should (hopefully) have the same
124 length. This function will go through and find this length
125 by finding the maximum length of sequences in the alignment.
126 """
127 max_length = 0
128
129 for record in self._records:
130 if len(record.seq) > max_length:
131 max_length = len(record.seq)
132
133 return max_length
134
135 - def add_sequence(self, descriptor, sequence, start = None, end = None,
136 weight = 1.0):
137 """Add a sequence to the alignment.
138
139 This doesn't do any kind of alignment, it just adds in the sequence
140 object, which is assumed to be prealigned with the existing
141 sequences.
142
143 Arguments:
144 o descriptor - The descriptive id of the sequence being added.
145 This will be used as the resulting SeqRecord's
146 .id property (and, for historical compatibility,
147 also the .description property)
148 o sequence - A string with sequence info.
149 o start - You can explicitly set the start point of the sequence.
150 This is useful (at least) for BLAST alignments, which can just
151 be partial alignments of sequences.
152 o end - Specify the end of the sequence, which is important
153 for the same reason as the start.
154 o weight - The weight to place on the sequence in the alignment.
155 By default, all sequences have the same weight. (0.0 => no weight,
156 1.0 => highest weight)
157 """
158 new_seq = Seq(sequence, self._alphabet)
159
160
161
162
163
164
165 new_record = SeqRecord(new_seq,
166 id = descriptor,
167 description = descriptor)
168
169
170
171
172
173
174
175
176 if start:
177 new_record.annotations['start'] = start
178 if end:
179 new_record.annotations['end'] = end
180
181
182 new_record.annotations['weight'] = weight
183
184 self._records.append(new_record)
185
187 """Returns a string containing a given column."""
188 col_str = ''
189 assert col >= 0 and col <= self.get_alignment_length()
190 for rec in self._records:
191 col_str += rec.seq[col]
192 return col_str
193
194 if __name__ == "__main__" :
195 print "Mini self test..."
196
197 raw_data = ["ACGATCAGCTAGCT", "CCGATCAGCTAGCT", "ACGATGAGCTAGCT"]
198 a = Alignment(Alphabet.generic_dna)
199 a.add_sequence("Alpha", raw_data[0], weight=2)
200 a.add_sequence("Beta", raw_data[1])
201 a.add_sequence("Gamma", raw_data[2])
202
203 print
204 print "str(a):"
205 print str(a)
206 print
207 print "repr(a):"
208 print repr(a)
209 print
210
211
212 for rec in a :
213 assert isinstance(rec, SeqRecord)
214 for r,rec in enumerate(a) :
215 assert isinstance(rec, SeqRecord)
216 assert raw_data[r] == rec.seq.tostring()
217 if r==0 : assert rec.annotations['weight']==2
218 print "Alignment iteraction as SeqRecord OK"
219