Package Bio :: Package SCOP :: Module Raf
[hide private]
[frames] | no frames]

Source Code for Module Bio.SCOP.Raf

  1  # Copyright 2001 by Gavin E. Crooks.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  # Gavin E. Crooks 2001-10-10 
  7   
  8  """ASTRAL RAF (Rapid Access Format) Sequence Maps. 
  9   
 10  The ASTRAL RAF Sequence Maps record the relationship between the PDB SEQRES 
 11  records (representing the sequence of the molecule used in an experiment) to  
 12  the ATOM records (representing the atoms experimentally observed).  
 13   
 14  This data is derived from the Protein Data Bank CIF files. Known errors in the 
 15  CIF files are corrected manually, with the original PDB file serving as the 
 16  final arbiter in case of discrepancies.  
 17   
 18  Residues are referenced by residue ID. This consists of a the PDB residue 
 19  sequence number (upto 4 digits) and an optional PDB  insertion code (an 
 20  ascii alphabetic character, a-z, A-Z). e.g. "1", "10A", "1010b", "-1" 
 21   
 22  See "ASTRAL RAF Sequence Maps":http://astral.stanford.edu/raf.html 
 23   
 24  to_one_letter_code -- A mapping from the 3-letter amino acid codes found 
 25                          in PDB files to 1-letter codes.  The 3-letter codes 
 26                          include chemically modified residues. 
 27  """ 
 28   
 29  from copy import copy  
 30  from types import * 
 31   
 32  from Residues import Residues 
 33  from FileIndex import FileIndex 
 34   
 35  # This table is taken from the RAF release notes, and includes the 
 36  # undocumented mapping "UNK" -> "X" 
 37  to_one_letter_code= { 
 38      'ALA':'A', 'VAL':'V', 'PHE':'F', 'PRO':'P', 'MET':'M', 
 39      'ILE':'I', 'LEU':'L', 'ASP':'D', 'GLU':'E', 'LYS':'K', 
 40      'ARG':'R', 'SER':'S', 'THR':'T', 'TYR':'Y', 'HIS':'H', 
 41      'CYS':'C', 'ASN':'N', 'GLN':'Q', 'TRP':'W', 'GLY':'G', 
 42      '2AS':'D', '3AH':'H', '5HP':'E', 'ACL':'R', 'AIB':'A', 
 43      'ALM':'A', 'ALO':'T', 'ALY':'K', 'ARM':'R', 'ASA':'D', 
 44      'ASB':'D', 'ASK':'D', 'ASL':'D', 'ASQ':'D', 'AYA':'A', 
 45      'BCS':'C', 'BHD':'D', 'BMT':'T', 'BNN':'A', 'BUC':'C', 
 46      'BUG':'L', 'C5C':'C', 'C6C':'C', 'CCS':'C', 'CEA':'C', 
 47      'CHG':'A', 'CLE':'L', 'CME':'C', 'CSD':'A', 'CSO':'C', 
 48      'CSP':'C', 'CSS':'C', 'CSW':'C', 'CXM':'M', 'CY1':'C', 
 49      'CY3':'C', 'CYG':'C', 'CYM':'C', 'CYQ':'C', 'DAH':'F', 
 50      'DAL':'A', 'DAR':'R', 'DAS':'D', 'DCY':'C', 'DGL':'E', 
 51      'DGN':'Q', 'DHA':'A', 'DHI':'H', 'DIL':'I', 'DIV':'V', 
 52      'DLE':'L', 'DLY':'K', 'DNP':'A', 'DPN':'F', 'DPR':'P', 
 53      'DSN':'S', 'DSP':'D', 'DTH':'T', 'DTR':'W', 'DTY':'Y', 
 54      'DVA':'V', 'EFC':'C', 'FLA':'A', 'FME':'M', 'GGL':'E', 
 55      'GLZ':'G', 'GMA':'E', 'GSC':'G', 'HAC':'A', 'HAR':'R', 
 56      'HIC':'H', 'HIP':'H', 'HMR':'R', 'HPQ':'F', 'HTR':'W', 
 57      'HYP':'P', 'IIL':'I', 'IYR':'Y', 'KCX':'K', 'LLP':'K', 
 58      'LLY':'K', 'LTR':'W', 'LYM':'K', 'LYZ':'K', 'MAA':'A', 
 59      'MEN':'N', 'MHS':'H', 'MIS':'S', 'MLE':'L', 'MPQ':'G', 
 60      'MSA':'G', 'MSE':'M', 'MVA':'V', 'NEM':'H', 'NEP':'H', 
 61      'NLE':'L', 'NLN':'L', 'NLP':'L', 'NMC':'G', 'OAS':'S', 
 62      'OCS':'C', 'OMT':'M', 'PAQ':'Y', 'PCA':'E', 'PEC':'C', 
 63      'PHI':'F', 'PHL':'F', 'PR3':'C', 'PRR':'A', 'PTR':'Y', 
 64      'SAC':'S', 'SAR':'G', 'SCH':'C', 'SCS':'C', 'SCY':'C', 
 65      'SEL':'S', 'SEP':'S', 'SET':'S', 'SHC':'C', 'SHR':'K', 
 66      'SOC':'C', 'STY':'Y', 'SVA':'S', 'TIH':'A', 'TPL':'W', 
 67      'TPO':'T', 'TPQ':'A', 'TRG':'K', 'TRO':'W', 'TYB':'Y', 
 68      'TYQ':'Y', 'TYS':'Y', 'TYY':'Y', 'AGM':'R', 'GL3':'G', 
 69      'SMC':'C', 'ASX':'B', 'CGU':'E', 'CSX':'C', 'GLX':'Z', 
 70      'UNK':'X' 
 71      } 
 72   
 73   
74 -def normalize_letters(one_letter_code) :
75 """Convert RAF one-letter amino acid codes into IUPAC standard codes. 76 77 Letters are uppercased, and "." ("Unknown") is converted to "X". 78 """ 79 if one_letter_code == '.' : 80 return 'X' 81 else : 82 return one_letter_code.upper()
83 84
85 -class SeqMapIndex(FileIndex) :
86 """An RAF file index. 87 88 The RAF file itself is about 50 MB. This index provides rapid, random 89 access of RAF records without having to load the entire file into memory. 90 91 The index key is a concatenation of the PDB ID and chain ID. e.g 92 "2drcA", "155c_". RAF uses an underscore to indicate blank 93 chain IDs. 94 """ 95
96 - def __init__(self, raf_filename) :
97 #We take a shortcut when making the index to avoid having to parse 98 #every line 99 iterator_gen = Iterator 100 key_gen = lambda rec : rec[0:5] 101 FileIndex.__init__(self, raf_filename, iterator_gen, key_gen) 102 103 #Replace Iterator with Parsing version 104 self.iterator_gen = lambda fh : Iterator(fh, Parser())
105 106
107 - def getSeqMap(self, residues) :
108 """Get the sequence map for a collection of residues. 109 110 residues -- A Residues instance, or a string that can be converted into 111 a Residues instance. 112 """ 113 if type(residues) == StringType : 114 residues = Residues(residues) 115 116 pdbid = residues.pdbid 117 frags = residues.fragments 118 if not frags: frags =(('_','',''),) # All residues of unnamed chain 119 120 seqMap = None 121 for frag in frags : 122 chainid = frag[0] 123 if chainid=='' or chainid=='-' or chainid==' ' or chainid=='_': 124 chainid = '_' 125 id = pdbid + chainid 126 127 128 sm = self[id] 129 130 #Cut out fragment of interest 131 start = 0 132 end = len(sm.res) 133 if frag[1] : start = int(sm.index(frag[1], chainid)) 134 if frag[2] : end = int(sm.index(frag[2], chainid)+1) 135 136 sm = sm[start:end] 137 138 if seqMap == None : 139 seqMap = sm 140 else : 141 seqMap += sm 142 143 return seqMap
144 145 146
147 -class SeqMap :
148 """An ASTRAL RAF (Rapid Access Format) Sequence Map. 149 150 This is a list like object; You can find the location of particular residues 151 with index(), slice this SeqMap into fragments, and glue fragments back 152 together with extend(). 153 154 pdbid -- The PDB 4 character ID 155 156 pdb_datestamp -- From the PDB file 157 158 version -- The RAF format version. e.g. 0.01 159 160 flags -- RAF flags. (See release notes for more information.) 161 162 res -- A list of Res objects, one for each residue in this sequence map 163 """ 164
165 - def __init__(self) :
166 self.pdbid = '' 167 self.pdb_datestamp = '' 168 self.version = '' 169 self.flags = '' 170 self.res = []
171
172 - def index(self, resid, chainid="_") :
173 for i in range(0, len(self.res)) : 174 if self.res[i].resid == resid and self.res[i].chainid == chainid : 175 return i 176 raise KeyError, "No such residue "+chainid+resid
177
178 - def __getslice__(self, i, j) :
179 s = copy(self) 180 s.res = s.res[i:j] 181 return s
182
183 - def append(self, res) :
184 """Append another Res object onto the list of residue mappings.""" 185 self.res.append(res)
186
187 - def extend(self, other) :
188 """Append another SeqMap onto the end of self. 189 190 Both SeqMaps must have the same PDB ID, PDB datestamp and 191 RAF version. The RAF flags are erased if they are inconsistent. This 192 may happen when fragments are taken from different chains. 193 """ 194 if not isinstance(other, SeqMap): 195 raise TypeError, "Can only extend a SeqMap with a SeqMap." 196 if self.pdbid != other.pdbid : 197 raise TypeError, "Cannot add fragments from different proteins" 198 if self.version != other.version : 199 raise TypeError, "Incompatible rafs" 200 if self.pdb_datestamp != other.pdb_datestamp : 201 raise TypeError, "Different pdb dates!" 202 if self.flags != other.flags : 203 self.flags = '' 204 self.res += other.res
205
206 - def __iadd__(self, other) :
207 self.extend(other) 208 return self
209
210 - def __add__(self, other) :
211 s = copy(self) 212 s.extend(other) 213 return s
214
215 - def getAtoms(self, pdb_handle, out_handle) :
216 """Extract all relevant ATOM and HETATOM records from a PDB file. 217 218 The PDB file is scanned for ATOM and HETATOM records. If the 219 chain ID, residue ID (seqNum and iCode), and residue type match 220 a residue in this sequence map, then the record is echoed to the 221 output handle. 222 223 This is typically used to find the coordinates of a domain, or other 224 residue subset. 225 226 pdb_handle -- A handle to the relevant PDB file. 227 228 out_handle -- All output is written to this file like object. 229 """ 230 #This code should be refactored when (if?) biopython gets a PDB parser 231 232 #The set of residues that I have to find records for. 233 resSet = {} 234 for r in self.res : 235 if r.atom=='X' : #Unknown residue type 236 continue 237 chainid = r.chainid 238 if chainid == '_': 239 chainid = ' ' 240 resid = r.resid 241 resSet[(chainid,resid)] = r 242 243 resFound = {} 244 for line in pdb_handle.xreadlines() : 245 if line.startswith("ATOM ") or line.startswith("HETATM") : 246 chainid = line[21:22] 247 resid = line[22:27].strip() 248 key = (chainid, resid) 249 if resSet.has_key(key): 250 res = resSet[key] 251 atom_aa = res.atom 252 resName = line[17:20] 253 if to_one_letter_code.has_key(resName) : 254 if to_one_letter_code[resName] == atom_aa : 255 out_handle.write(line) 256 resFound[key] = res 257 258 if len(resSet) != len(resFound) : 259 #for k in resFound.keys(): 260 # del resSet[k] 261 #print resSet 262 263 raise RuntimeError, 'I could not find at least one ATOM or ' \ 264 +'HETATM record for each and every residue in this sequence map.'
265 266 267
268 -class Res :
269 """ A single residue mapping from a RAF record. 270 271 chainid -- A single character chain ID. 272 273 resid -- The residue ID. 274 275 atom -- amino acid one-letter code from ATOM records. 276 277 seqres -- amino acid one-letter code from SEQRES records. 278 """
279 - def __init__(self) :
280 self.chainid = '' 281 self.resid = '' 282 self.atom = '' 283 self.seqres = ''
284 285 286
287 -class Iterator:
288 """Iterates over a RAF file. 289 """
290 - def __init__(self, handle, parser=None):
291 """Create an object that iterates over a RAF file. 292 293 handle -- file-like object. 294 295 parser -- an optional Parser object to change the results into 296 another form. If set to None, then the raw contents 297 of the file will be returned. 298 299 """ 300 if type(handle) is not FileType and type(handle) is not InstanceType: 301 raise TypeError, "I expected a file handle or file-like object" 302 self._handle = handle 303 self._parser = parser
304
305 - def next(self):
306 """Retrieve the next record.""" 307 while 1: 308 line = self._handle.readline() 309 if not line: return None 310 if line[0] !='#': break # Not a comment line 311 if self._parser is not None : 312 return self._parser.parse(line) 313 return line
314
315 - def __iter__(self):
316 return iter(self.next, None)
317 318
319 -class Parser:
320 """Parses a RAF record into a SeqMap object. 321 """
322 - def parse(self, line):
323 """Returns a SeqMap """ 324 header_len = 38 325 326 seqMap = SeqMap() 327 line = line.rstrip() # no trailing whitespace 328 329 if len(line)<header_len: 330 raise SyntaxError, "Incomplete header: "+rafLine 331 332 seqMap.pdbid = line[0:4] 333 chainid = line[4:5] 334 335 seqMap.version = line[6:10] 336 337 #Raf format versions 0.01 and 0.02 are identical for practical purposes 338 if(seqMap.version != "0.01" and seqMap.version !="0.02") : 339 raise SyntaxError, "Incompatible RAF version: "+seqMap.version 340 341 seqMap.pdb_datestamp = line[14:20] 342 seqMap.flags = line[21:27] 343 344 for i in range(header_len, len(line), 7) : 345 f = line[i : i+7] 346 if len(f)!=7: 347 raise SyntaxError, "Corrupt Field: ("+f+")" 348 r = Res() 349 r.chainid = chainid 350 r.resid = f[0:5].strip() 351 r.atom = normalize_letters(f[5:6]) 352 r.seqres = normalize_letters(f[6:7]) 353 354 seqMap.res.append(r) 355 356 return seqMap
357