Package Bio :: Package SCOP
[hide private]
[frames] | no frames]

Source Code for Package Bio.SCOP

  1  # Copyright 2001 by Gavin E. Crooks.  All rights reserved. 
  2  # Modifications Copyright 2004/2005 James Casbon. All rights Reserved. 
  3  # 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  # 
  8  # Changes made by James Casbon: 
  9  # - New Astral class 
 10  # - SQL functionality for both Scop and Astral classes 
 11  # - All sunids are int not strings 
 12  # 
 13  # Code written by Jeffrey Chang to access SCOP over the internet, which 
 14  # was previously in Bio.WWW.SCOP, has now been merged into this module. 
 15   
 16  """ SCOP: Structural Classification of Proteins. 
 17   
 18  The SCOP database aims to provide a manually constructed classification of 
 19  all know protein structures into a hierarchy, the main levels of which 
 20  are family, superfamily and fold. 
 21   
 22  * "SCOP":http://scop.mrc-lmb.cam.ac.uk/scop/ 
 23   
 24  * "Introduction":http://scop.mrc-lmb.cam.ac.uk/scop/intro.html 
 25   
 26  * "SCOP parsable files":http://scop.mrc-lmb.cam.ac.uk/scop/parse/ 
 27   
 28  The Scop object in this module represents the entire SCOP classification. It 
 29  can be built from the three SCOP parsable files, modified is so desired, and 
 30  converted back to the same file formats. A single SCOP domain (represented 
 31  by the Domain class) can be obtained from Scop using the domain's SCOP 
 32  identifier (sid). 
 33   
 34   
 35  nodeCodeDict  -- A mapping between known 2 letter node codes and a longer 
 36                    description. The known node types are 'cl' (class), 'cf' 
 37                    (fold), 'sf' (superfamily), 'fa' (family), 'dm' (domain),  
 38                    'sp' (species), 'px' (domain). Additional node types may 
 39                    be added in the future. 
 40   
 41  This module also provides code to access SCOP over the WWW. 
 42   
 43  Functions: 
 44  search        -- Access the main CGI script. 
 45  _open         -- Internally used function. 
 46   
 47  """ 
 48   
 49  from types import * 
 50  import os, string 
 51   
 52  import Des 
 53  import Cla 
 54  import Hie 
 55  from Residues import * 
 56  from Bio import SeqIO 
 57  from Bio.Alphabet import IUPAC 
 58  from Bio.Seq import Seq 
 59   
 60   
 61  nodeCodeDict = { 'cl':'class', 'cf':'fold', 'sf':'superfamily', 
 62                   'fa':'family', 'dm':'protein', 'sp':'species', 'px':'domain'} 
 63   
 64   
 65  _nodetype_to_code= { 'class': 'cl', 'fold': 'cf', 'superfamily': 'sf', 
 66                       'family': 'fa', 'protein': 'dm', 'species': 'sp', 'domain': 'px'} 
 67   
 68  nodeCodeOrder = [ 'ro', 'cl', 'cf', 'sf', 'fa', 'dm', 'sp', 'px' ]  
 69   
 70  astralBibIds = [10,20,25,30,35,40,50,70,90,95,100] 
 71   
 72  astralEvs = [10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 1e-4, 1e-5, 1e-10, 1e-15, 
 73               1e-20, 1e-25, 1e-50] 
 74   
 75  astralEv_to_file = { 10: 'e+1', 5: 'e+0,7', 1: 'e+0', 0.5: 'e-0,3', 0.1: 'e-1', 
 76                       0.05: 'e-1,3', 0.01: 'e-2', 0.005: 'e-2,3', 0.001: 'e-3', 
 77                       1e-4: 'e-4',  1e-5: 'e-5', 1e-10: 'e-10', 1e-15: 'e-15', 
 78                       1e-20: 'e-20', 1e-25: 'e-25', 1e-50: 'e-50' } 
 79   
 80  astralEv_to_sql = { 10: 'e1', 5: 'e0_7', 1: 'e0', 0.5: 'e_0_3', 0.1: 'e_1', 
 81                       0.05: 'e_1_3', 0.01: 'e_2', 0.005: 'e_2_3', 0.001: 'e_3', 
 82                       1e-4: 'e_4',  1e-5: 'e_5', 1e-10: 'e_10', 1e-15: 'e_15', 
 83                       1e-20: 'e_20', 1e-25: 'e_25', 1e-50: 'e_50' } 
 84   
 85   
86 -def cmp_sccs(sccs1, sccs2) :
87 """Order SCOP concise classification strings (sccs). 88 89 a.4.5.1 < a.4.5.11 < b.1.1.1 90 91 A sccs (e.g. a.4.5.11) compactly represents a domain's classification. 92 The letter represents the class, and the numbers are the fold, 93 superfamily, and family, respectively. 94 95 """ 96 97 s1 = sccs1.split(".") 98 s2 = sccs2.split(".") 99 100 if s1[0] != s2[0]: return cmp(s1[0], s2[0]) 101 102 s1 = map(int, s1[1:]) 103 s2 = map(int, s2[1:]) 104 105 return cmp(s1,s2)
106 107 108 _domain_re = re.compile(r">?([\w_\.]*)\s+([\w\.]*)\s+\(([^)]*)\) (.*)") 109
110 -def parse_domain(str) :
111 """Convert an ASTRAL header string into a Scop domain. 112 113 An ASTRAL (http://astral.stanford.edu/) header contains a concise 114 description of a SCOP domain. A very similar format is used when a 115 Domain object is converted into a string. The Domain returned by this 116 method contains most of the SCOP information, but it will not be located 117 within the SCOP hierarchy (i.e. The parent node will be None). The 118 description is composed of the SCOP protein and species descriptions. 119 120 A typical ASTRAL header looks like -- 121 >d1tpt_1 a.46.2.1 (1-70) Thymidine phosphorylase {Escherichia coli} 122 """ 123 124 m = _domain_re.match(str) 125 if (not m) : raise ValueError, "Domain: "+ str 126 127 dom = Domain() 128 dom.sid = m.group(1) 129 dom.sccs = m.group(2) 130 dom.residues = Residues(m.group(3)) 131 if not dom.residues.pdbid : 132 dom.residues.pdbid= dom.sid[1:5] 133 dom.description = m.group(4).strip() 134 135 return dom
136
137 -def _open_scop_file(scop_dir_path, version, filetype) :
138 filename = "dir.%s.scop.txt_%s" % (filetype,version) 139 handle = open(os.path.join( scop_dir_path, filename)) 140 return handle
141 142
143 -class Scop:
144 """The entire SCOP hierarchy. 145 146 root -- The root node of the hierarchy 147 """
148 - def __init__(self, cla_handle=None, des_handle=None, hie_handle=None, 149 dir_path=None, db_handle=None, version=None):
150 """Build the SCOP hierarchy from the SCOP parsable files, or a sql backend. 151 152 If no file handles are given, then a Scop object with a single 153 empty root node is returned. 154 155 If a directory and version are given (with dir_path=.., version=...) or 156 file handles for each file, the whole scop tree will be built in memory. 157 158 If a MySQLdb database handle is given, the tree will be built as needed, 159 minimising construction times. To build the SQL database to the methods 160 write_xxx_sql to create the tables. 161 162 """ 163 self._sidDict = {} 164 self._sunidDict = {} 165 166 if cla_handle==des_handle==hie_handle==dir_path==db_handle==None: return 167 168 if dir_path is None and db_handle is None: 169 if cla_handle == None or des_handle==None or hie_handle==None: 170 raise RuntimeError,"Need CLA, DES and HIE files to build SCOP" 171 172 sunidDict = {} 173 174 self.db_handle = db_handle 175 try: 176 177 if db_handle: 178 # do nothing if we have a db handle, we'll do it all on the fly 179 pass 180 181 else: 182 # open SCOP parseable files 183 if dir_path: 184 if not version : 185 raise RuntimeError, "Need SCOP version to find parsable files in directory" 186 if cla_handle or des_handle or hie_handle: 187 raise RuntimeError,"Cannot specify SCOP directory and specific files" 188 189 cla_handle = _open_scop_file( dir_path, version, 'cla') 190 des_handle = _open_scop_file( dir_path, version, 'des') 191 hie_handle = _open_scop_file( dir_path, version, 'hie') 192 193 root = Node() 194 domains = [] 195 root.sunid=0 196 root.type='ro' 197 sunidDict[root.sunid] = root 198 self.root = root 199 root.description = 'SCOP Root' 200 201 # Build the rest of the nodes using the DES file 202 records = Des.parse(des_handle) 203 for record in records: 204 if record.nodetype =='px' : 205 n = Domain() 206 n.sid = record.name 207 domains.append(n) 208 else : 209 n = Node() 210 n.sunid = record.sunid 211 n.type = record.nodetype 212 n.sccs = record.sccs 213 n.description = record.description 214 215 sunidDict[n.sunid] = n 216 217 # Glue all of the Nodes together using the HIE file 218 records = Hie.parse(hie_handle) 219 for record in records: 220 if not sunidDict.has_key(record.sunid) : 221 print record.sunid 222 223 n = sunidDict[record.sunid] 224 225 if record.parent != '' : # Not root node 226 227 if not sunidDict.has_key(record.parent): 228 raise ValueError, "Incomplete data?" 229 230 n.parent = sunidDict[record.parent] 231 232 for c in record.children: 233 if not sunidDict.has_key(c) : 234 raise ValueError, "Incomplete data?" 235 n.children.append(sunidDict[c]) 236 237 238 # Fill in the gaps with information from the CLA file 239 sidDict = {} 240 records = Cla.parse(cla_handle) 241 for record in records: 242 n = sunidDict[record.sunid] 243 assert n.sccs == record.sccs 244 assert n.sid == record.sid 245 n.residues = record.residues 246 sidDict[n.sid] = n 247 248 # Clean up 249 self._sunidDict = sunidDict 250 self._sidDict = sidDict 251 self._domains = tuple(domains) 252 253 finally: 254 if dir_path : 255 # If we opened the files, we close the files 256 if cla_handle : cla_handle.close() 257 if des_handle : des_handle.close() 258 if hie_handle : hie_handle.close()
259 260
261 - def getRoot(self):
262 return self.getNodeBySunid(0)
263 264
265 - def getDomainBySid(self, sid) :
266 """Return a domain from its sid""" 267 if self._sidDict.has_key(sid): 268 return self._sidDict[sid] 269 if self.db_handle: 270 self.getDomainFromSQL(sid=sid) 271 if self._sidDict.has_key(sid): 272 return self._sidDict[sid] 273 else: 274 return None
275 276
277 - def getNodeBySunid(self, sunid) :
278 """Return a node from its sunid""" 279 if self._sunidDict.has_key(sunid): 280 return self._sunidDict[sunid] 281 if self.db_handle: 282 self.getDomainFromSQL(sunid=sunid) 283 if self._sunidDict.has_key(sunid): 284 return self._sunidDict[sunid] 285 else: 286 return None
287 288
289 - def getDomains(self) :
290 """Returns an ordered tuple of all SCOP Domains""" 291 if self.db_handle: 292 return self.getRoot().getDescendents('px') 293 else: 294 return self._domains
295 296 297
298 - def write_hie(self, handle) :
299 """Build an HIE SCOP parsable file from this object""" 300 nodes = self._sunidDict.values() 301 # We order nodes to ease comparison with original file 302 nodes.sort(lambda n1,n2: cmp(n1.sunid, n2.sunid)) 303 304 for n in nodes : 305 handle.write(str(n.toHieRecord()))
306 307
308 - def write_des(self, handle) :
309 """Build a DES SCOP parsable file from this object""" 310 nodes = self._sunidDict.values() 311 # Origional SCOP file is not ordered? 312 nodes.sort(lambda n1,n2: cmp(n1.sunid, n2.sunid)) 313 314 for n in nodes : 315 if n != self.root : 316 handle.write(str(n.toDesRecord()))
317 318
319 - def write_cla(self, handle) :
320 """Build a CLA SCOP parsable file from this object""" 321 nodes = self._sidDict.values() 322 # We order nodes to ease comparison with original file 323 nodes.sort(lambda n1,n2: cmp(n1.sunid, n2.sunid)) 324 325 for n in nodes : 326 handle.write(str(n.toClaRecord()))
327 328
329 - def getDomainFromSQL(self, sunid=None, sid=None):
330 """Load a node from the SQL backend using sunid or sid""" 331 if sunid==sid==None: return None 332 333 cur = self.db_handle.cursor() 334 335 if sid: 336 cur.execute("SELECT sunid FROM cla WHERE sid=%s", sid) 337 res = cur.fetchone() 338 if res is None: 339 return None 340 sunid = res[0] 341 342 cur.execute("SELECT * FROM des WHERE sunid=%s", sunid) 343 data = cur.fetchone() 344 345 if data is not None: 346 n = None 347 348 #determine if Node or Domain 349 if data[1] != "px": 350 n = Node(scop=self) 351 352 cur.execute("SELECT child FROM hie WHERE parent=%s", sunid) 353 children = [] 354 for c in cur.fetchall(): 355 children.append(c[0]) 356 n.children = children 357 358 359 else: 360 n = Domain(scop=self) 361 cur.execute("select sid, residues, pdbid from cla where sunid=%s", 362 sunid) 363 364 [n.sid,n.residues,pdbid] = cur.fetchone() 365 n.residues = Residues(n.residues) 366 n.residues.pdbid=pdbid 367 self._sidDict[n.sid] = n 368 369 [n.sunid,n.type,n.sccs,n.description] = data 370 371 if data[1] != 'ro': 372 cur.execute("SELECT parent FROM hie WHERE child=%s", sunid) 373 n.parent = cur.fetchone()[0] 374 375 n.sunid = int(n.sunid) 376 377 self._sunidDict[n.sunid] = n
378 379
380 - def getAscendentFromSQL(self, node, type):
381 """Get ascendents using SQL backend""" 382 if nodeCodeOrder.index(type) >= nodeCodeOrder.index(node.type): return None 383 384 cur = self.db_handle.cursor() 385 cur.execute("SELECT "+type+" from cla WHERE "+node.type+"=%s", (node.sunid)) 386 result = cur.fetchone() 387 if result is not None: 388 return self.getNodeBySunid(result[0]) 389 else: 390 return None
391 392
393 - def getDescendentsFromSQL(self, node, type):
394 """Get descendents of a node using the database backend. This avoids 395 repeated iteration of SQL calls and is therefore much quicker than 396 repeatedly calling node.getChildren(). 397 """ 398 if nodeCodeOrder.index(type) <= nodeCodeOrder.index(node.type): return [] 399 400 des_list = [] 401 402 403 # SQL cla table knows nothing about 'ro' 404 if node.type == 'ro': 405 for c in node.getChildren(): 406 for d in self.getDescendentsFromSQL(c,type): 407 des_list.append(d) 408 return des_list 409 410 cur = self.db_handle.cursor() 411 412 413 if type != 'px': 414 cur.execute("SELECT DISTINCT des.sunid,des.type,des.sccs,description FROM \ 415 cla,des WHERE cla."+node.type+"=%s AND cla."+type+"=des.sunid", (node.sunid)) 416 data = cur.fetchall() 417 for d in data: 418 if not self._sunidDict.has_key(int(d[0])): 419 n = Node(scop=self) 420 [n.sunid,n.type,n.sccs,n.description] = d 421 n.sunid=int(n.sunid) 422 self._sunidDict[n.sunid] = n 423 424 cur.execute("SELECT parent FROM hie WHERE child=%s", n.sunid) 425 n.parent = cur.fetchone()[0] 426 427 cur.execute("SELECT child FROM hie WHERE parent=%s", n.sunid) 428 children = [] 429 for c in cur.fetchall(): 430 children.append(c[0]) 431 n.children = children 432 433 434 des_list.append( self._sunidDict[int(d[0])] ) 435 436 else: 437 cur.execute("SELECT cla.sunid,sid,pdbid,residues,cla.sccs,type,description,sp\ 438 FROM cla,des where cla.sunid=des.sunid and cla."+node.type+"=%s", 439 node.sunid) 440 441 data = cur.fetchall() 442 for d in data: 443 if not self._sunidDict.has_key(int(d[0])): 444 n = Domain(scop=self) 445 #[n.sunid, n.sid, n.pdbid, n.residues, n.sccs, n.type, 446 #n.description,n.parent] = data 447 [n.sunid,n.sid, pdbid,n.residues,n.sccs,n.type,n.description, 448 n.parent] = d[0:8] 449 n.residues = Residues(n.residues) 450 n.residues.pdbid = pdbid 451 n.sunid = int(n.sunid) 452 self._sunidDict[n.sunid] = n 453 self._sidDict[n.sid] = n 454 455 456 des_list.append( self._sunidDict[int(d[0])] ) 457 458 return des_list
459 460 461 462
463 - def write_hie_sql(self, handle):
464 """Write HIE data to SQL database""" 465 cur = handle.cursor() 466 467 cur.execute("DROP TABLE IF EXISTS hie") 468 cur.execute("CREATE TABLE hie (parent INT, child INT, PRIMARY KEY (child),\ 469 INDEX (parent) )") 470 471 for p in self._sunidDict.values(): 472 for c in p.children: 473 cur.execute("INSERT INTO hie VALUES (%s,%s)" % (p.sunid, c.sunid))
474 475
476 - def write_cla_sql(self, handle):
477 """Write CLA data to SQL database""" 478 cur = handle.cursor() 479 480 cur.execute("DROP TABLE IF EXISTS cla") 481 cur.execute("CREATE TABLE cla (sunid INT, sid CHAR(8), pdbid CHAR(4),\ 482 residues VARCHAR(50), sccs CHAR(10), cl INT, cf INT, sf INT, fa INT,\ 483 dm INT, sp INT, px INT, PRIMARY KEY (sunid), INDEX (SID) )") 484 485 for n in self._sidDict.values(): 486 c = n.toClaRecord() 487 cur.execute( "INSERT INTO cla VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", 488 (n.sunid, n.sid, c.residues.pdbid, c.residues, n.sccs, 489 n.getAscendent('cl').sunid, n.getAscendent('cf').sunid, 490 n.getAscendent('sf').sunid, n.getAscendent('fa').sunid, 491 n.getAscendent('dm').sunid, n.getAscendent('sp').sunid, 492 n.sunid ))
493 494
495 - def write_des_sql(self, handle):
496 """Write DES data to SQL database""" 497 cur = handle.cursor() 498 499 cur.execute("DROP TABLE IF EXISTS des") 500 cur.execute("CREATE TABLE des (sunid INT, type CHAR(2), sccs CHAR(10),\ 501 description VARCHAR(255),\ 502 PRIMARY KEY (sunid) )") 503 504 for n in self._sunidDict.values(): 505 cur.execute( "INSERT INTO des VALUES (%s,%s,%s,%s)", 506 ( n.sunid, n.type, n.sccs, n.description ) )
507 508 509
510 -class Node :
511 """ A node in the Scop hierarchy 512 513 sunid -- SCOP unique identifiers. e.g. '14986' 514 515 parent -- The parent node 516 517 children -- A list of child nodes 518 519 sccs -- SCOP concise classification string. e.g. 'a.1.1.2' 520 521 type -- A 2 letter node type code. e.g. 'px' for domains 522 523 description -- 524 525 """
526 - def __init__(self, scop=None) :
527 """Create a Node in the scop hierarchy. If a Scop instance is provided to the 528 constructor, this will be used to lookup related references using the SQL 529 methods. If no instance is provided, it is assumed the whole tree exists 530 and is connected.""" 531 self.sunid='' 532 self.parent = None 533 self.children=[] 534 self.sccs = '' 535 self.type ='' 536 self.description ='' 537 self.scop=scop
538
539 - def __str__(self) :
540 s = [] 541 s.append(str(self.sunid)) 542 s.append(self.sccs) 543 s.append(self.type) 544 s.append(self.description) 545 546 return " ".join(s)
547
548 - def toHieRecord(self):
549 """Return an Hie.Record""" 550 rec = Hie.Record() 551 rec.sunid = str(self.sunid) 552 if self.getParent() : #Not root node 553 rec.parent = str(self.getParent().sunid) 554 else: 555 rec.parent = '-' 556 for c in self.getChildren() : 557 rec.children.append(str(c.sunid)) 558 return rec
559
560 - def toDesRecord(self):
561 """Return a Des.Record""" 562 rec = Des.Record() 563 rec.sunid = str(self.sunid) 564 rec.nodetype = self.type 565 rec.sccs = self.sccs 566 rec.description = self.description 567 return rec
568
569 - def getChildren(self):
570 """Return a list of children of this Node""" 571 if self.scop is None: 572 return self.children 573 else: 574 return map ( self.scop.getNodeBySunid, self.children )
575
576 - def getParent(self):
577 """Return the parent of this Node""" 578 if self.scop is None: 579 return self.parent 580 else: 581 return self.scop.getNodeBySunid( self.parent )
582
583 - def getDescendents( self, node_type) :
584 """ Return a list of all decendent nodes of the given type. Node type can a 585 two letter code or longer description. e.g. 'fa' or 'family' 586 """ 587 if _nodetype_to_code.has_key(node_type): 588 node_type = _nodetype_to_code[node_type] 589 590 nodes = [self] 591 if self.scop: 592 return self.scop.getDescendentsFromSQL(self,node_type) 593 while nodes[0].type != node_type: 594 if nodes[0].type == 'px' : return [] # Fell of the bottom of the hierarchy 595 child_list = [] 596 for n in nodes: 597 for child in n.getChildren(): 598 child_list.append( child ) 599 nodes = child_list 600 601 return nodes
602 603
604 - def getAscendent( self, node_type) :
605 """ Return the ancenstor node of the given type, or None.Node type can a 606 two letter code or longer description. e.g. 'fa' or 'family'""" 607 if _nodetype_to_code.has_key(node_type): 608 node_type = _nodetype_to_code[node_type] 609 610 if self.scop: 611 return self.scop.getAscendentFromSQL(self,node_type) 612 else: 613 n = self 614 if n.type == node_type: return None 615 while n.type != node_type: 616 if n.type == 'ro': return None # Fell of the top of the hierarchy 617 n = n.getParent() 618 619 return n
620 621 622
623 -class Domain(Node) :
624 """ A SCOP domain. A leaf node in the Scop hierarchy. 625 626 sid -- The SCOP domain identifier. e.g. 'd5hbib_' 627 628 residues -- A Residue object. It defines the collection 629 of PDB atoms that make up this domain. 630 """
631 - def __init__(self,scop=None) :
632 Node.__init__(self,scop=scop) 633 self.sid = '' 634 self.residues = None
635
636 - def __str__(self) :
637 s = [] 638 s.append(self.sid) 639 s.append(self.sccs) 640 s.append("("+str(self.residues)+")") 641 642 if not self.getParent() : 643 s.append(self.description) 644 else : 645 sp = self.getParent() 646 dm = sp.getParent() 647 s.append(dm.description) 648 s.append("{"+sp.description+"}") 649 650 return " ".join(s)
651
652 - def toDesRecord(self):
653 """Return a Des.Record""" 654 rec = Node.toDesRecord(self) 655 rec.name = self.sid 656 return rec
657
658 - def toClaRecord(self) :
659 """Return a Cla.Record""" 660 rec = Cla.Record() 661 rec.sid = self.sid 662 rec.residues = self.residues 663 rec.sccs = self.sccs 664 rec.sunid = self.sunid 665 666 n = self 667 while n.sunid != 0: #Not root node 668 rec.hierarchy.append( (n.type, str(n.sunid)) ) 669 n = n.getParent() 670 671 rec.hierarchy.reverse() 672 673 return rec
674
675 -class Astral:
676 """Abstraction of the ASTRAL database, which has sequences for all the SCOP domains, 677 as well as clusterings by percent id or evalue. 678 """ 679
680 - def __init__( self, dir_path=None, version=None, scop=None, ind_file=None, 681 astral_file=None, db_handle=None):
682 """ 683 Initialise the astral database. 684 685 You must provide either a directory of SCOP files: 686 687 dir_path - string, the path to location of the scopseq-x.xx directory 688 (not the directory itself), and 689 version -a version number. 690 691 or, a FASTA file: 692 693 astral_file - string, a path to a fasta file (which will be loaded in memory) 694 695 or, a MYSQL database: 696 697 db_handle - a database handle for a MYSQL database containing a table 698 'astral' with the astral data in it. This can be created 699 using writeToSQL. 700 701 Note that the ind_file argument is deprecated. 702 """ 703 if ind_file : 704 raise RuntimeError, "The ind_file (index file) argument is deprecated" 705 706 if astral_file==dir_path==db_handle==None: 707 raise RunTimeError,"Need either file handle, or (dir_path + version)\ 708 or database handle to construct Astral" 709 if not scop: 710 raise RuntimeError, "Must provide a Scop instance to construct" 711 712 self.scop = scop 713 self.db_handle = db_handle 714 715 716 if not astral_file and not db_handle: 717 if dir_path == None or version == None: 718 raise RuntimeError, "must provide dir_path and version" 719 720 self.version = version 721 self.path = os.path.join( dir_path, "scopseq-%s" % version) 722 astral_file = "astral-scopdom-seqres-all-%s.fa" % self.version 723 astral_file = os.path.join (self.path, astral_file) 724 725 if astral_file: 726 #Build a dictionary of SeqRecord objects in the FASTA file, IN MEMORY 727 self.fasta_dict = SeqIO.to_dict(SeqIO.parse(open(astral_file), "fasta")) 728 729 self.astral_file = astral_file 730 self.EvDatasets = {} 731 self.EvDatahash = {} 732 self.IdDatasets = {} 733 self.IdDatahash = {}
734 735
736 - def domainsClusteredByEv(self,id):
737 """get domains clustered by evalue""" 738 if not self.EvDatasets.has_key(id): 739 if self.db_handle: 740 self.EvDatasets[id] = self.getAstralDomainsFromSQL(astralEv_to_sql[id]) 741 742 else: 743 if not self.path: 744 raise RuntimeError, "No scopseq directory specified" 745 746 file_prefix = "astral-scopdom-seqres-sel-gs" 747 filename = "%s-e100m-%s-%s.id" % (file_prefix, astralEv_to_file[id] , 748 self.version) 749 filename = os.path.join(self.path,filename) 750 self.EvDatasets[id] = self.getAstralDomainsFromFile(filename) 751 return self.EvDatasets[id]
752 753
754 - def domainsClusteredById(self,id):
755 """get domains clustered by percent id""" 756 if not self.IdDatasets.has_key(id): 757 if self.db_handle: 758 self.IdDatasets[id] = self.getAstralDomainsFromSQL("id"+str(id)) 759 760 else: 761 if not self.path: 762 raise RuntimeError, "No scopseq directory specified" 763 764 file_prefix = "astral-scopdom-seqres-sel-gs" 765 filename = "%s-bib-%s-%s.id" % (file_prefix, id, self.version) 766 filename = os.path.join(self.path,filename) 767 self.IdDatasets[id] = self.getAstralDomainsFromFile(filename) 768 return self.IdDatasets[id]
769 770
771 - def getAstralDomainsFromFile(self,filename=None,file_handle=None):
772 """Get the scop domains from a file containing a list of sids""" 773 if file_handle == filename == none: 774 raise RuntimeError, "You must provide a filename or handle" 775 if not file_handle: 776 file_handle = open(filename) 777 doms = [] 778 while 1: 779 line = file_handle.readline() 780 if not line: 781 break 782 line = line.rstrip() 783 doms.append(line) 784 if filename: 785 file_handle.close() 786 787 doms = filter( lambda a: a[0]=='d', doms ) 788 doms = map( self.scop.getDomainBySid, doms ) 789 return doms
790
791 - def getAstralDomainsFromSQL(self, column):
792 """Load a set of astral domains from a column in the astral table of a MYSQL 793 database (which can be created with writeToSQL(...)""" 794 cur = self.db_handle.cursor() 795 cur.execute("SELECT sid FROM astral WHERE "+column+"=1") 796 data = cur.fetchall() 797 data = map( lambda x: self.scop.getDomainBySid(x[0]), data) 798 799 return data
800 801
802 - def getSeqBySid(self,domain):
803 """get the seq record of a given domain from its sid""" 804 if self.db_handle is None: 805 return self.fasta_dict[domain].seq 806 807 else: 808 cur = self.db_handle.cursor() 809 cur.execute("SELECT seq FROM astral WHERE sid=%s", domain) 810 return Seq(cur.fetchone()[0])
811
812 - def getSeq(self,domain):
813 """Return seq associated with domain""" 814 return self.getSeqBySid(domain.sid)
815 816
817 - def hashedDomainsById(self,id):
818 """Get domains clustered by sequence identity in a dict""" 819 if not self.IdDatahash.has_key(id): 820 self.IdDatahash[id] = {} 821 for d in self.domainsClusteredById(id): 822 self.IdDatahash[id][d] = 1 823 return self.IdDatahash[id]
824
825 - def hashedDomainsByEv(self,id):
826 """Get domains clustered by evalue in a dict""" 827 if not self.EvDatahash.has_key(id): 828 self.EvDatahash[id] = {} 829 for d in self.domainsClusteredByEv(id): 830 self.EvDatahash[id][d] = 1 831 return self.EvDatahash[id]
832 833
834 - def isDomainInId(self,dom,id):
835 """Returns true if the domain is in the astral clusters for percent ID""" 836 return self.hashedDomainsById(id).has_key(dom)
837
838 - def isDomainInEv(self,dom,id):
839 """Returns true if the domain is in the ASTRAL clusters for evalues""" 840 return self.hashedDomainsByEv(id).has_key(dom)
841 842
843 - def writeToSQL(self, db_handle):
844 """Write the ASTRAL database to a MYSQL database""" 845 cur = db_handle.cursor() 846 847 cur.execute("DROP TABLE IF EXISTS astral") 848 cur.execute("CREATE TABLE astral (sid CHAR(8), seq TEXT, PRIMARY KEY (sid))") 849 850 for dom in self.fasta_dict.keys(): 851 cur.execute( "INSERT INTO astral (sid,seq) values (%s,%s)", 852 (dom, self.fasta_dict[dom].seq.data)) 853 854 855 for i in astralBibIds: 856 cur.execute("ALTER TABLE astral ADD (id"+str(i)+" TINYINT)") 857 858 for d in self.domainsClusteredById(i): 859 cur.execute("UPDATE astral SET id"+str(i)+"=1 WHERE sid=%s", 860 d.sid) 861 862 for ev in astralEvs: 863 cur.execute("ALTER TABLE astral ADD ("+astralEv_to_sql[ev]+" TINYINT)") 864 865 for d in self.domainsClusteredByEv(ev): 866 867 cur.execute("UPDATE astral SET "+astralEv_to_sql[ev]+"=1 WHERE sid=%s", 868 d.sid)
869
870 -def search(pdb=None, key=None, sid=None, disp=None, dir=None, loc=None, 871 cgi='http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi', **keywds):
872 """search(pdb=None, key=None, sid=None, disp=None, dir=None, loc=None, 873 cgi='http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi', **keywds) 874 875 Access search.cgi and return a handle to the results. See the 876 online help file for an explanation of the parameters: 877 http://scop.mrc-lmb.cam.ac.uk/scop/help.html 878 879 Raises an IOError if there's a network error. 880 881 """ 882 params = {'pdb' : pdb, 'key' : key, 'sid' : sid, 'disp' : disp, 883 'dir' : dir, 'loc' : loc} 884 variables = {} 885 for k in params.keys(): 886 if params[k] is not None: 887 variables[k] = params[k] 888 variables.update(keywds) 889 return _open(cgi, variables)
890
891 -def _open(cgi, params={}, get=1):
892 """_open(cgi, params={}, get=1) -> UndoHandle 893 894 Open a handle to SCOP. cgi is the URL for the cgi script to access. 895 params is a dictionary with the options to pass to it. get is a boolean 896 that describes whether a GET should be used. Does some 897 simple error checking, and will raise an IOError if it encounters one. 898 899 """ 900 import urllib 901 from Bio import File 902 # Open a handle to SCOP. 903 options = urllib.urlencode(params) 904 if get: # do a GET 905 fullcgi = cgi 906 if options: 907 fullcgi = "%s?%s" % (cgi, options) 908 handle = urllib.urlopen(fullcgi) 909 else: # do a POST 910 handle = urllib.urlopen(cgi, options) 911 912 # Wrap the handle inside an UndoHandle. 913 uhandle = File.UndoHandle(handle) 914 # Should I check for 404? timeout? etc? 915 return uhandle
916