1
2
3
4
5
6
7
8 """ASTRAL RAF (Rapid Access Format) Sequence Maps.
9
10 The ASTRAL RAF Sequence Maps record the relationship between the PDB SEQRES
11 records (representing the sequence of the molecule used in an experiment) to
12 the ATOM records (representing the atoms experimentally observed).
13
14 This data is derived from the Protein Data Bank CIF files. Known errors in the
15 CIF files are corrected manually, with the original PDB file serving as the
16 final arbiter in case of discrepancies.
17
18 Residues are referenced by residue ID. This consists of a the PDB residue
19 sequence number (upto 4 digits) and an optional PDB insertion code (an
20 ascii alphabetic character, a-z, A-Z). e.g. "1", "10A", "1010b", "-1"
21
22 See "ASTRAL RAF Sequence Maps":http://astral.stanford.edu/raf.html
23
24 to_one_letter_code -- A mapping from the 3-letter amino acid codes found
25 in PDB files to 1-letter codes. The 3-letter codes
26 include chemically modified residues.
27 """
28
29 from copy import copy
30 from types import *
31
32 from Residues import Residues
33 from FileIndex import FileIndex
34
35
36
37 to_one_letter_code= {
38 'ALA':'A', 'VAL':'V', 'PHE':'F', 'PRO':'P', 'MET':'M',
39 'ILE':'I', 'LEU':'L', 'ASP':'D', 'GLU':'E', 'LYS':'K',
40 'ARG':'R', 'SER':'S', 'THR':'T', 'TYR':'Y', 'HIS':'H',
41 'CYS':'C', 'ASN':'N', 'GLN':'Q', 'TRP':'W', 'GLY':'G',
42 '2AS':'D', '3AH':'H', '5HP':'E', 'ACL':'R', 'AIB':'A',
43 'ALM':'A', 'ALO':'T', 'ALY':'K', 'ARM':'R', 'ASA':'D',
44 'ASB':'D', 'ASK':'D', 'ASL':'D', 'ASQ':'D', 'AYA':'A',
45 'BCS':'C', 'BHD':'D', 'BMT':'T', 'BNN':'A', 'BUC':'C',
46 'BUG':'L', 'C5C':'C', 'C6C':'C', 'CCS':'C', 'CEA':'C',
47 'CHG':'A', 'CLE':'L', 'CME':'C', 'CSD':'A', 'CSO':'C',
48 'CSP':'C', 'CSS':'C', 'CSW':'C', 'CXM':'M', 'CY1':'C',
49 'CY3':'C', 'CYG':'C', 'CYM':'C', 'CYQ':'C', 'DAH':'F',
50 'DAL':'A', 'DAR':'R', 'DAS':'D', 'DCY':'C', 'DGL':'E',
51 'DGN':'Q', 'DHA':'A', 'DHI':'H', 'DIL':'I', 'DIV':'V',
52 'DLE':'L', 'DLY':'K', 'DNP':'A', 'DPN':'F', 'DPR':'P',
53 'DSN':'S', 'DSP':'D', 'DTH':'T', 'DTR':'W', 'DTY':'Y',
54 'DVA':'V', 'EFC':'C', 'FLA':'A', 'FME':'M', 'GGL':'E',
55 'GLZ':'G', 'GMA':'E', 'GSC':'G', 'HAC':'A', 'HAR':'R',
56 'HIC':'H', 'HIP':'H', 'HMR':'R', 'HPQ':'F', 'HTR':'W',
57 'HYP':'P', 'IIL':'I', 'IYR':'Y', 'KCX':'K', 'LLP':'K',
58 'LLY':'K', 'LTR':'W', 'LYM':'K', 'LYZ':'K', 'MAA':'A',
59 'MEN':'N', 'MHS':'H', 'MIS':'S', 'MLE':'L', 'MPQ':'G',
60 'MSA':'G', 'MSE':'M', 'MVA':'V', 'NEM':'H', 'NEP':'H',
61 'NLE':'L', 'NLN':'L', 'NLP':'L', 'NMC':'G', 'OAS':'S',
62 'OCS':'C', 'OMT':'M', 'PAQ':'Y', 'PCA':'E', 'PEC':'C',
63 'PHI':'F', 'PHL':'F', 'PR3':'C', 'PRR':'A', 'PTR':'Y',
64 'SAC':'S', 'SAR':'G', 'SCH':'C', 'SCS':'C', 'SCY':'C',
65 'SEL':'S', 'SEP':'S', 'SET':'S', 'SHC':'C', 'SHR':'K',
66 'SOC':'C', 'STY':'Y', 'SVA':'S', 'TIH':'A', 'TPL':'W',
67 'TPO':'T', 'TPQ':'A', 'TRG':'K', 'TRO':'W', 'TYB':'Y',
68 'TYQ':'Y', 'TYS':'Y', 'TYY':'Y', 'AGM':'R', 'GL3':'G',
69 'SMC':'C', 'ASX':'B', 'CGU':'E', 'CSX':'C', 'GLX':'Z',
70 'UNK':'X'
71 }
72
73
75 """Convert RAF one-letter amino acid codes into IUPAC standard codes.
76
77 Letters are uppercased, and "." ("Unknown") is converted to "X".
78 """
79 if one_letter_code == '.' :
80 return 'X'
81 else :
82 return one_letter_code.upper()
83
84
86 """An RAF file index.
87
88 The RAF file itself is about 50 MB. This index provides rapid, random
89 access of RAF records without having to load the entire file into memory.
90
91 The index key is a concatenation of the PDB ID and chain ID. e.g
92 "2drcA", "155c_". RAF uses an underscore to indicate blank
93 chain IDs.
94 """
95
97
98
99 iterator_gen = Iterator
100 key_gen = lambda rec : rec[0:5]
101 FileIndex.__init__(self, raf_filename, iterator_gen, key_gen)
102
103
104 self.iterator_gen = lambda fh : Iterator(fh, Parser())
105
106
108 """Get the sequence map for a collection of residues.
109
110 residues -- A Residues instance, or a string that can be converted into
111 a Residues instance.
112 """
113 if type(residues) == StringType :
114 residues = Residues(residues)
115
116 pdbid = residues.pdbid
117 frags = residues.fragments
118 if not frags: frags =(('_','',''),)
119
120 seqMap = None
121 for frag in frags :
122 chainid = frag[0]
123 if chainid=='' or chainid=='-' or chainid==' ' or chainid=='_':
124 chainid = '_'
125 id = pdbid + chainid
126
127
128 sm = self[id]
129
130
131 start = 0
132 end = len(sm.res)
133 if frag[1] : start = int(sm.index(frag[1], chainid))
134 if frag[2] : end = int(sm.index(frag[2], chainid)+1)
135
136 sm = sm[start:end]
137
138 if seqMap == None :
139 seqMap = sm
140 else :
141 seqMap += sm
142
143 return seqMap
144
145
146
148 """An ASTRAL RAF (Rapid Access Format) Sequence Map.
149
150 This is a list like object; You can find the location of particular residues
151 with index(), slice this SeqMap into fragments, and glue fragments back
152 together with extend().
153
154 pdbid -- The PDB 4 character ID
155
156 pdb_datestamp -- From the PDB file
157
158 version -- The RAF format version. e.g. 0.01
159
160 flags -- RAF flags. (See release notes for more information.)
161
162 res -- A list of Res objects, one for each residue in this sequence map
163 """
164
166 self.pdbid = ''
167 self.pdb_datestamp = ''
168 self.version = ''
169 self.flags = ''
170 self.res = []
171
172 - def index(self, resid, chainid="_") :
173 for i in range(0, len(self.res)) :
174 if self.res[i].resid == resid and self.res[i].chainid == chainid :
175 return i
176 raise KeyError, "No such residue "+chainid+resid
177
179 s = copy(self)
180 s.res = s.res[i:j]
181 return s
182
184 """Append another Res object onto the list of residue mappings."""
185 self.res.append(res)
186
188 """Append another SeqMap onto the end of self.
189
190 Both SeqMaps must have the same PDB ID, PDB datestamp and
191 RAF version. The RAF flags are erased if they are inconsistent. This
192 may happen when fragments are taken from different chains.
193 """
194 if not isinstance(other, SeqMap):
195 raise TypeError, "Can only extend a SeqMap with a SeqMap."
196 if self.pdbid != other.pdbid :
197 raise TypeError, "Cannot add fragments from different proteins"
198 if self.version != other.version :
199 raise TypeError, "Incompatible rafs"
200 if self.pdb_datestamp != other.pdb_datestamp :
201 raise TypeError, "Different pdb dates!"
202 if self.flags != other.flags :
203 self.flags = ''
204 self.res += other.res
205
207 self.extend(other)
208 return self
209
214
215 - def getAtoms(self, pdb_handle, out_handle) :
216 """Extract all relevant ATOM and HETATOM records from a PDB file.
217
218 The PDB file is scanned for ATOM and HETATOM records. If the
219 chain ID, residue ID (seqNum and iCode), and residue type match
220 a residue in this sequence map, then the record is echoed to the
221 output handle.
222
223 This is typically used to find the coordinates of a domain, or other
224 residue subset.
225
226 pdb_handle -- A handle to the relevant PDB file.
227
228 out_handle -- All output is written to this file like object.
229 """
230
231
232
233 resSet = {}
234 for r in self.res :
235 if r.atom=='X' :
236 continue
237 chainid = r.chainid
238 if chainid == '_':
239 chainid = ' '
240 resid = r.resid
241 resSet[(chainid,resid)] = r
242
243 resFound = {}
244 for line in pdb_handle.xreadlines() :
245 if line.startswith("ATOM ") or line.startswith("HETATM") :
246 chainid = line[21:22]
247 resid = line[22:27].strip()
248 key = (chainid, resid)
249 if resSet.has_key(key):
250 res = resSet[key]
251 atom_aa = res.atom
252 resName = line[17:20]
253 if to_one_letter_code.has_key(resName) :
254 if to_one_letter_code[resName] == atom_aa :
255 out_handle.write(line)
256 resFound[key] = res
257
258 if len(resSet) != len(resFound) :
259
260
261
262
263 raise RuntimeError, 'I could not find at least one ATOM or ' \
264 +'HETATM record for each and every residue in this sequence map.'
265
266
267
269 """ A single residue mapping from a RAF record.
270
271 chainid -- A single character chain ID.
272
273 resid -- The residue ID.
274
275 atom -- amino acid one-letter code from ATOM records.
276
277 seqres -- amino acid one-letter code from SEQRES records.
278 """
280 self.chainid = ''
281 self.resid = ''
282 self.atom = ''
283 self.seqres = ''
284
285
286
288 """Iterates over a RAF file.
289 """
290 - def __init__(self, handle, parser=None):
291 """Create an object that iterates over a RAF file.
292
293 handle -- file-like object.
294
295 parser -- an optional Parser object to change the results into
296 another form. If set to None, then the raw contents
297 of the file will be returned.
298
299 """
300 if type(handle) is not FileType and type(handle) is not InstanceType:
301 raise TypeError, "I expected a file handle or file-like object"
302 self._handle = handle
303 self._parser = parser
304
306 """Retrieve the next record."""
307 while 1:
308 line = self._handle.readline()
309 if not line: return None
310 if line[0] !='#': break
311 if self._parser is not None :
312 return self._parser.parse(line)
313 return line
314
316 return iter(self.next, None)
317
318
320 """Parses a RAF record into a SeqMap object.
321 """
323 """Returns a SeqMap """
324 header_len = 38
325
326 seqMap = SeqMap()
327 line = line.rstrip()
328
329 if len(line)<header_len:
330 raise SyntaxError, "Incomplete header: "+rafLine
331
332 seqMap.pdbid = line[0:4]
333 chainid = line[4:5]
334
335 seqMap.version = line[6:10]
336
337
338 if(seqMap.version != "0.01" and seqMap.version !="0.02") :
339 raise SyntaxError, "Incompatible RAF version: "+seqMap.version
340
341 seqMap.pdb_datestamp = line[14:20]
342 seqMap.flags = line[21:27]
343
344 for i in range(header_len, len(line), 7) :
345 f = line[i : i+7]
346 if len(f)!=7:
347 raise SyntaxError, "Corrupt Field: ("+f+")"
348 r = Res()
349 r.chainid = chainid
350 r.resid = f[0:5].strip()
351 r.atom = normalize_letters(f[5:6])
352 r.seqres = normalize_letters(f[6:7])
353
354 seqMap.res.append(r)
355
356 return seqMap
357