Package Bio :: Package PDB :: Module PDBList'
[hide private]
[frames] | no frames]

Source Code for Module Bio.PDB.PDBList'

  1  #!/usr/bin/env python 
  2   
  3  # 
  4  # PDBList.py 
  5  # 
  6  # A tool for tracking changes in the PDB Protein Structure Database. 
  7  # 
  8  # Version 2.0 
  9  # 
 10  # (c) 2003 Kristian Rother 
 11  # This work was supported by the German Ministry of Education 
 12  # and Research (BMBF). Project http://www.bcbio.de 
 13  #  
 14  # Contact the author 
 15  #    homepage : http://www.rubor.de/bioinf 
 16  #    email    : krother@genesilico.pl 
 17  # 
 18  # 
 19  # This Code is released under the conditions of the Biopython license. 
 20  # It may be distributed freely with respect to the original author. 
 21  # Any maintainer of the BioPython code may change this notice 
 22  # when appropriate. 
 23  # 
 24  # Last modified on Fri, Oct 24th 2006, Warszawa 
 25  # 
 26  # Removed the 'write' option from the retrieve_pdb_file method: it is not used. 
 27  # Also added a 'dir' option (pdb file is put in this directory if given), 
 28  # and an 'exist' option (test if the file is already there). This method 
 29  # now returns the name of the downloaded uncompressed file. 
 30  # 
 31  # -Thomas, 1/06/04 
 32  # 
 33  # 
 34  # Including bugfixes from Sunjoong Lee (9/2006) 
 35  # 
 36   
 37  __doc__="Access the PDB over the internet (for example to download structures)." 
 38   
 39  import urllib,string,re,os,sys 
 40   
class PDBList:
    """
    This class provides quick access to the structure lists on the
    PDB server or its mirrors. The structure lists contain
    four-letter PDB codes, indicating that structures are
    new, have been modified or are obsolete. The lists are released
    on a weekly basis.

    It also provides a function to retrieve PDB files from the server.
    To use it properly, prepare a directory /pdb or the like,
    where PDB files are stored.

    If You want to use this module from inside a proxy, add
    the proxy variable to Your environment, e.g. in Unix
    export HTTP_PROXY='http://realproxy.charite.de:888'
    (This can also be added to ~/.bashrc)
    """

    PDB_REF = """
    The Protein Data Bank: a computer-based archival file for macromolecular structures.
    F.C.Bernstein, T.F.Koetzle, G.J.B.Williams, E.F.Meyer Jr, M.D.Brice, J.R.Rodgers, O.Kennard, T.Shimanouchi, M.Tasumi
    J. Mol. Biol. 112 pp. 535-542 (1977)
    http://www.pdb.org/.
    """

    alternative_download_url = "http://www.rcsb.org/pdb/files/"
    # just append PDB code to this, and then it works.

    def __init__(self, server='ftp://ftp.rcsb.org', pdb=None, obsolete_pdb=None):
        """Initialize the class with the default server or a custom one.

        @param server: base URL of the remote PDB server
        @param pdb: root of the local PDB file tree; None (the default)
            means the working directory at the time of the call
        @param obsolete_pdb: root of the local tree for obsolete entries;
            defaults to the 'obsolete' subdirectory of the local tree,
            which is created if it does not exist yet
        """
        # remote pdb server
        self.pdb_server = server

        # local pdb file tree; the working directory is looked up here and
        # not in the signature, where it would be frozen at import time
        if pdb is None:
            pdb = os.getcwd()
        self.local_pdb = pdb

        # local file tree for obsolete pdb files
        if obsolete_pdb:
            self.obsolete_pdb = obsolete_pdb
        else:
            self.obsolete_pdb = os.path.join(self.local_pdb, 'obsolete')
            if not os.access(self.obsolete_pdb, os.F_OK):
                os.makedirs(self.obsolete_pdb)

        # variables for command-line options
        self.overwrite = 0
        self.flat_tree = 0

    def _urlopen(self, url):
        """Open url with the urlopen of the running Python version."""
        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            from urllib import urlopen  # Python 2
        return urlopen(url)

    def _read_lines(self, url):
        """Fetch url and return its content as a list of text lines.

        The handle is always closed. Lines that arrive as bytes are decoded
        so that callers can compare them with ordinary string literals.
        """
        handle = self._urlopen(url)
        try:
            raw_lines = handle.readlines()
        finally:
            handle.close()
        lines = []
        for line in raw_lines:
            if not isinstance(line, str):
                line = line.decode('utf-8', 'replace')
            lines.append(line)
        return lines

    def get_status_list(self, url):
        """Retrieves a list of pdb codes in the weekly pdb status file
        from the given URL. Used by get_recent_changes.

        Typical contents of the list files parsed by this method;
        -rw-r--r--   1 rcsb     rcsb       330156 Oct 14  2003 pdb1cyq.ent
        -rw-r--r--   1 rcsb     rcsb       333639 Oct 14  2003 pdb1cz0.ent

        @return: the four-letter codes, in file order
        @rtype: list of strings
        """
        codes = []
        for line in self._read_lines(url):
            fields = line.split()
            # blank lines have no last column and are skipped
            if fields and fields[-1][-4:] == '.ent':
                # 'pdb1cyq.ent' -> '1cyq'
                codes.append(fields[-1][3:7])
        return codes

    def get_recent_changes(self):
        """Returns three lists of the newest weekly files (added,mod,obsolete).

        Reads the directories with changed entries from the PDB server and
        returns a list of three lists of pdb codes: the new, modified and
        obsolete entries from the most recent status directory. The directory
        with the largest numerical name is used.
        Returns None if no such directory is listed or if retrieving one of
        the three lists fails.

        Contents of the data/status dir (20031013 would be used);
        drwxrwxr-x   2 1002     sysadmin     512 Oct  6 18:28 20031006
        drwxrwxr-x   2 1002     sysadmin     512 Oct 14 02:14 20031013
        -rw-r--r--   1 1002     sysadmin    1327 Mar 12  2001 README
        """
        status_url = self.pdb_server + '/pub/pdb/data/status/'
        listing = self._read_lines(status_url)

        # the last column of every listing line is the entry name;
        # the dated directories are the purely numerical ones
        dated = []
        for line in listing:
            fields = line.split()
            if fields and fields[-1].isdigit():
                dated.append(fields[-1])
        if not dated:
            return None
        recent = sorted(dated, key=int)[-1]

        path = status_url + '%s/' % recent
        try:
            # retrieve the lists
            added = self.get_status_list(path + 'added.pdb')
            modified = self.get_status_list(path + 'modified.pdb')
            obsolete = self.get_status_list(path + 'obsolete.pdb')
        except Exception:
            # best effort by contract, but no longer swallows
            # KeyboardInterrupt / SystemExit
            return None
        return [added, modified, obsolete]

    def get_all_entries(self):
        """Retrieves a big file containing all the
        PDB entries and some annotation to them.
        Returns a list of PDB codes in the index file.
        """
        print("retrieving index file. Takes about 5 MB.")
        lines = self._read_lines(
            self.pdb_server + '/pub/pdb/derived_data/index/entries.idx')
        # skip the two header lines and extract the four-letter codes
        return [line[:4] for line in lines[2:] if len(line) > 4]

    def get_all_obsolete(self):
        """Returns a list of all obsolete entries ever in the PDB.

        Returns a list of all obsolete pdb codes (lower case) that have
        ever been in the PDB.

        Gets and parses the file from the PDB server in the format
        (the first pdb_code column is the one used).
         LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
        OBSLTE     30-SEP-03 1Q1D      1QZR
        OBSLTE     26-SEP-03 1DYV      1UN2
        """
        lines = self._read_lines(
            self.pdb_server + '/pub/pdb/data/status/obsolete.dat')
        # the code is read from the fixed columns 21-24 of each OBSLTE record
        return [line[21:25].lower() for line in lines
                if line[:6] == 'OBSLTE']

    def retrieve_pdb_file(self, pdb_code, obsolete=0, compression='.Z',
                          uncompress="gunzip", pdir=None):
        """Retrieves a PDB structure file from the PDB server and
        stores it in a local file tree.
        The name of the local, uncompressed file is returned.
        If obsolete is 1, the file will be by default saved in a special file tree.
        The compression should be '.Z' or '.gz'. 'uncompress' is
        the command called to uncompress the files; it is split on
        whitespace and run directly (not through a shell) with the
        downloaded file name appended as last argument.

        If the uncompressed file already exists and self.overwrite is not
        set, nothing is downloaded.

        @param pdir: put the file in this directory (default: create a PDB-style directory tree)
        @type pdir: string

        @return: filename
        @rtype: string
        """
        # get the structure
        code = pdb_code.lower()
        archive_name = "pdb%s.ent%s" % (code, compression)
        if obsolete:
            remote_dir = '/pub/pdb/data/structures/obsolete/pdb/'
        else:
            remote_dir = '/pub/pdb/data/structures/divided/pdb/'
        url = self.pdb_server + remote_dir + '%s/%s' % (code[1:3], archive_name)

        # in which dir to put the pdb file?
        if pdir is None:
            if obsolete:
                path = self.obsolete_pdb
            else:
                path = self.local_pdb
            if not self.flat_tree:
                # Put in PDB style directory tree
                path = os.path.join(path, code[1:3])
        else:
            # Put in specified directory
            path = pdir

        if not os.access(path, os.F_OK):
            os.makedirs(path)

        filename = os.path.join(path, archive_name)
        # the final uncompressed file
        final_file = os.path.join(path, "pdb%s.ent" % code)

        # check whether the file exists
        if not self.overwrite and os.path.exists(final_file):
            print("file exists, not retrieved %s" % final_file)
            return final_file

        # Retrieve the file
        print('retrieving %s' % url)
        remote = self._urlopen(url)
        try:
            data = remote.read()
        finally:
            remote.close()
        local = open(filename, 'wb')
        try:
            local.write(data)
        finally:
            local.close()

        # Only reachable with self.overwrite set: remove the stale copy,
        # because gunzip refuses to replace an existing target file.
        if filename != final_file and os.path.exists(final_file):
            os.remove(final_file)

        # uncompress the file; an argument list avoids the shell, so paths
        # containing spaces or shell metacharacters are handled safely
        import subprocess
        try:
            status = subprocess.call(uncompress.split() + [filename])
        except OSError:
            # the uncompress command could not be started at all
            status = -1
        if status != 0:
            print("could not uncompress %s with '%s'" % (filename, uncompress))

        return final_file

    def update_pdb(self):
        """
        I guess this is the 'most wanted' function from this module.
        It gets the weekly lists of new and modified pdb entries and
        automatically downloads the according PDB files.
        Local copies of entries that became obsolete are moved to the
        obsolete file tree.
        You can call this module as a weekly cronjob.
        """
        changes = self.get_recent_changes()
        if changes is None:
            print("could not retrieve the lists of recent changes")
            return
        new, modified, obsolete = changes

        for pdb_code in new + modified:
            try:
                print('retrieving %s' % pdb_code)
                self.retrieve_pdb_file(pdb_code)
            except Exception:
                print('error %s' % pdb_code)
                # you can insert here some more log notes that
                # something has gone wrong.

        # move the obsolete files to a special folder
        import shutil
        for pdb_code in obsolete:
            name = 'pdb%s.ent' % pdb_code
            if self.flat_tree:
                old_file = os.path.join(self.local_pdb, name)
                new_file = os.path.join(self.obsolete_pdb, name)
            else:
                old_file = os.path.join(self.local_pdb, pdb_code[1:3], name)
                new_file = os.path.join(self.obsolete_pdb, pdb_code[1:3], name)
            if not os.path.isfile(old_file):
                # the entry was never in the local copy; nothing to move
                continue
            target_dir = os.path.dirname(new_file)
            if not os.path.isdir(target_dir):
                os.makedirs(target_dir)
            shutil.move(old_file, new_file)

    def download_entire_pdb(self, listfile=None):
        """Retrieves all PDB entries not present in the local PDB copy.
        Writes a list file containing all PDB codes (optional, if listfile is given).
        """
        entries = self.get_all_entries()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code)

        # write the list
        if listfile:
            out = open(listfile, 'w')
            try:
                out.writelines([code + '\n' for code in entries])
            finally:
                out.close()

    def download_obsolete_entries(self, listfile=None):
        """Retrieves all obsolete PDB entries not present in the local obsolete PDB copy.
        Writes a list file containing all PDB codes (optional, if listfile is given).
        """
        entries = self.get_all_obsolete()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code, obsolete=1)

        # write the list
        if listfile:
            out = open(listfile, 'w')
            try:
                out.writelines([code + '\n' for code in entries])
            finally:
                out.close()

    #
    # this is actually easter egg code not used by any of the methods
    # maybe someone will find it useful.
    #
    def get_seqres_file(self, savefile='pdb_seqres.txt'):
        """Retrieves a (big) file containing all the sequences
        of PDB entries and writes it to a file."""
        print("retrieving sequence file. Takes about 15 MB.")
        lines = self._read_lines(
            self.pdb_server + '/pub/pdb/derived_data/pdb_seqres.txt')
        out = open(savefile, 'w')
        try:
            out.writelines(lines)
        finally:
            out.close()
if __name__ == '__main__':
    # Command-line front end: always prints the usage text, then runs the
    # command given as first argument on the tree given as second argument.
    doc = """PDBList.py
    (c) Kristian Rother 2003, Contributed to BioPython

    Usage:
    PDBList.py update <pdb_path> [options]   - write weekly PDB updates to
                                               local pdb tree.
    PDBList.py all    <pdb_path> [options]   - write all PDB entries to
                                               local pdb tree.
    PDBList.py obsol  <pdb_path> [options]   - write all obsolete PDB
                                               entries to local pdb tree.
    PDBList.py <PDB-ID> <pdb_path> [options] - retrieve single structure

    Options:
       -d   A single directory will be used as <pdb_path>, not a tree.
       -o   Overwrite existing structure files.
    """
    print(doc)

    if len(sys.argv) > 2:
        pdb_path = sys.argv[2]
        pl = PDBList(pdb=pdb_path)
        # everything after <pdb_path> is an option flag
        for option in sys.argv[3:]:
            if option == '-d':
                pl.flat_tree = 1
            elif option == '-o':
                pl.overwrite = 1
    else:
        # no path given: work flat in the current directory
        pdb_path = os.getcwd()
        pl = PDBList()
        pl.flat_tree = 1

    if len(sys.argv) > 1:
        command = sys.argv[1]
        if command == 'update':
            # update PDB
            print("updating local PDB at " + pdb_path)
            pl.update_pdb()

        elif command == 'all':
            # get the entire PDB
            pl.download_entire_pdb()

        elif command == 'obsol':
            # get all obsolete entries; the only parameter of
            # download_obsolete_entries is an optional list *file*, so the
            # pdb directory must not be passed here (the tree is already
            # configured through PDBList(pdb=pdb_path))
            pl.download_obsolete_entries()

        elif re.search(r'^\d...$', command):
            # get single PDB entry (a digit followed by three characters)
            pl.retrieve_pdb_file(command, pdir=pdb_path)