Package Bio :: Package SCOP
[hide private]
[frames] | no frames]

Source Code for Package Bio.SCOP

  1  # Copyright 2001 by Gavin E. Crooks.  All rights reserved. 
  2  # Modifications Copyright 2004/2005 James Casbon. All rights Reserved. 
  3  # 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  # 
  8  # Changes made by James Casbon: 
  9  # - New Astral class 
 10  # - SQL functionality for both Scop and Astral classes 
 11  # - All sunids are int not strings 
 12   
 13   
 14   
 15  """ SCOP: Structural Classification of Proteins. 
 16   
 17  The SCOP database aims to provide a manually constructed classification of 
 18  all know protein structures into a hierarchy, the main levels of which 
 19  are family, superfamily and fold. 
 20   
 21  * "SCOP":http://scop.mrc-lmb.cam.ac.uk/scop/ 
 22   
 23  * "Introduction":http://scop.mrc-lmb.cam.ac.uk/scop/intro.html 
 24   
 25  * "SCOP parsable files":http://scop.mrc-lmb.cam.ac.uk/scop/parse/ 
 26   
 27  The Scop object in this module represents the entire SCOP classification. It 
 28  can be built from the three SCOP parsable files, modified is so desired, and 
 29  converted back to the same file formats. A single SCOP domain (represented 
 30  by the Domain class) can be obtained from Scop using the domain's SCOP 
 31  identifier (sid). 
 32   
 33   
 34  nodeCodeDict  -- A mapping between known 2 letter node codes and a longer 
 35                    description. The known node types are 'cl' (class), 'cf' 
 36                    (fold), 'sf' (superfamily), 'fa' (family), 'dm' (domain),  
 37                    'sp' (species), 'px' (domain). Additional node types may 
 38                    be added in the future. 
 39  """ 
 40  from types import * 
 41  import os, string 
 42   
 43  import Des 
 44  import Cla 
 45  import Hie 
 46  from Residues import * 
 47  from Bio import SeqIO 
 48  from Bio.Alphabet import IUPAC 
 49  from Bio.Seq import Seq 
 50   
 51   
 52  nodeCodeDict = { 'cl':'class', 'cf':'fold', 'sf':'superfamily', 
 53                   'fa':'family', 'dm':'protein', 'sp':'species', 'px':'domain'} 
 54   
 55   
 56  _nodetype_to_code= { 'class': 'cl', 'fold': 'cf', 'superfamily': 'sf', 
 57                       'family': 'fa', 'protein': 'dm', 'species': 'sp', 'domain': 'px'} 
 58   
 59  nodeCodeOrder = [ 'ro', 'cl', 'cf', 'sf', 'fa', 'dm', 'sp', 'px' ]  
 60   
 61  astralBibIds = [10,20,25,30,35,40,50,70,90,95,100] 
 62   
 63  astralEvs = [10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 1e-4, 1e-5, 1e-10, 1e-15, 
 64               1e-20, 1e-25, 1e-50] 
 65   
 66  astralEv_to_file = { 10: 'e+1', 5: 'e+0,7', 1: 'e+0', 0.5: 'e-0,3', 0.1: 'e-1', 
 67                       0.05: 'e-1,3', 0.01: 'e-2', 0.005: 'e-2,3', 0.001: 'e-3', 
 68                       1e-4: 'e-4',  1e-5: 'e-5', 1e-10: 'e-10', 1e-15: 'e-15', 
 69                       1e-20: 'e-20', 1e-25: 'e-25', 1e-50: 'e-50' } 
 70   
 71  astralEv_to_sql = { 10: 'e1', 5: 'e0_7', 1: 'e0', 0.5: 'e_0_3', 0.1: 'e_1', 
 72                       0.05: 'e_1_3', 0.01: 'e_2', 0.005: 'e_2_3', 0.001: 'e_3', 
 73                       1e-4: 'e_4',  1e-5: 'e_5', 1e-10: 'e_10', 1e-15: 'e_15', 
 74                       1e-20: 'e_20', 1e-25: 'e_25', 1e-50: 'e_50' } 
 75   
 76   
77 -def cmp_sccs(sccs1, sccs2) :
78 """Order SCOP concise classification strings (sccs). 79 80 a.4.5.1 < a.4.5.11 < b.1.1.1 81 82 A sccs (e.g. a.4.5.11) compactly represents a domain's classification. 83 The letter represents the class, and the numbers are the fold, 84 superfamily, and family, respectively. 85 86 """ 87 88 s1 = sccs1.split(".") 89 s2 = sccs2.split(".") 90 91 if s1[0] != s2[0]: return cmp(s1[0], s2[0]) 92 93 s1 = map(int, s1[1:]) 94 s2 = map(int, s2[1:]) 95 96 return cmp(s1,s2)
97 98 99 _domain_re = re.compile(r">?([\w_\.]*)\s+([\w\.]*)\s+\(([^)]*)\) (.*)") 100
101 -def parse_domain(str) :
102 """Convert an ASTRAL header string into a Scop domain. 103 104 An ASTRAL (http://astral.stanford.edu/) header contains a concise 105 description of a SCOP domain. A very similar format is used when a 106 Domain object is converted into a string. The Domain returned by this 107 method contains most of the SCOP information, but it will not be located 108 within the SCOP hierarchy (i.e. The parent node will be None). The 109 description is composed of the SCOP protein and species descriptions. 110 111 A typical ASTRAL header looks like -- 112 >d1tpt_1 a.46.2.1 (1-70) Thymidine phosphorylase {Escherichia coli} 113 """ 114 115 m = _domain_re.match(str) 116 if (not m) : raise SyntaxError, "Domain: "+ str 117 118 dom = Domain() 119 dom.sid = m.group(1) 120 dom.sccs = m.group(2) 121 dom.residues = Residues(m.group(3)) 122 if not dom.residues.pdbid : 123 dom.residues.pdbid= dom.sid[1:5] 124 dom.description = m.group(4).strip() 125 126 return dom
127
128 -def _open_scop_file(scop_dir_path, version, filetype) :
129 filename = "dir.%s.scop.txt_%s" % (filetype,version) 130 handle = open(os.path.join( scop_dir_path, filename)) 131 return handle
132 133
134 -class Scop:
135 """The entire SCOP hierarchy. 136 137 root -- The root node of the hierarchy 138 """
139 - def __init__(self, cla_handle=None, des_handle=None, hie_handle=None, 140 dir_path=None, db_handle=None, version=None):
141 """Build the SCOP hierarchy from the SCOP parsable files, or a sql backend. 142 143 If no file handles are given, then a Scop object with a single 144 empty root node is returned. 145 146 If a directory and version are given (with dir_path=.., version=...) or 147 file handles for each file, the whole scop tree will be built in memory. 148 149 If a MySQLdb database handle is given, the tree will be built as needed, 150 minimising construction times. To build the SQL database to the methods 151 write_xxx_sql to create the tables. 152 153 """ 154 self._sidDict = {} 155 self._sunidDict = {} 156 157 if cla_handle==des_handle==hie_handle==dir_path==db_handle==None: return 158 159 if dir_path is None and db_handle is None: 160 if cla_handle == None or des_handle==None or hie_handle==None: 161 raise RuntimeError,"Need CLA, DES and HIE files to build SCOP" 162 163 sunidDict = {} 164 165 self.db_handle = db_handle 166 try: 167 168 if db_handle: 169 # do nothing if we have a db handle, we'll do it all on the fly 170 pass 171 172 else: 173 # open SCOP parseable files 174 if dir_path: 175 if not version : 176 raise RuntimeError, "Need SCOP version to find parsable files in directory" 177 if cla_handle or des_handle or hie_handle: 178 raise RuntimeError,"Cannot specify SCOP directory and specific files" 179 180 cla_handle = _open_scop_file( dir_path, version, 'cla') 181 des_handle = _open_scop_file( dir_path, version, 'des') 182 hie_handle = _open_scop_file( dir_path, version, 'hie') 183 184 root = Node() 185 domains = [] 186 root.sunid=0 187 root.type='ro' 188 sunidDict[root.sunid] = root 189 self.root = root 190 root.description = 'SCOP Root' 191 192 # Build the rest of the nodes using the DES file 193 i = Des.Iterator(des_handle, Des.Parser()) 194 while 1 : 195 rec = i.next() 196 if rec is None : break 197 if rec.nodetype =='px' : 198 n = Domain() 199 n.sid = rec.name 200 domains.append(n) 201 else : 202 n = Node() 203 n.sunid = rec.sunid 204 n.type = rec.nodetype 205 n.sccs = rec.sccs 206 n.description = rec.description 207 208 sunidDict[n.sunid] = n 209 210 # Glue all of the Nodes together using the HIE file 211 i = Hie.Iterator(hie_handle, Hie.Parser()) 212 while 1 : 213 rec = i.next() 214 if rec is None : break 215 if not sunidDict.has_key(rec.sunid) : 216 print rec.sunid 217 218 n = sunidDict[rec.sunid] 219 220 if rec.parent != '' : # Not root node 221 222 if not sunidDict.has_key(rec.parent): 223 raise SyntaxError, "Incomplete data?" 224 225 n.parent = sunidDict[rec.parent] 226 227 for c in rec.children: 228 if not sunidDict.has_key(c) : 229 raise SyntaxError, "Incomplete data?" 230 n.children.append(sunidDict[c]) 231 232 233 # Fill in the gaps with information from the CLA file 234 sidDict = {} 235 i = Cla.Iterator(cla_handle, Cla.Parser()) 236 while 1 : 237 rec = i.next() 238 if rec is None : break 239 n = sunidDict[rec.sunid] 240 assert n.sccs == rec.sccs 241 assert n.sid == rec.sid 242 n.residues = rec.residues 243 sidDict[n.sid] = n 244 245 # Clean up 246 self._sunidDict = sunidDict 247 self._sidDict = sidDict 248 self._domains = tuple(domains) 249 250 finally: 251 if dir_path : 252 # If we opened the files, we close the files 253 if cla_handle : cla_handle.close() 254 if des_handle : des_handle.close() 255 if hie_handle : hie_handle.close()
256 257
258 - def getRoot(self):
259 return self.getNodeBySunid(0)
260 261
262 - def getDomainBySid(self, sid) :
263 """Return a domain from its sid""" 264 if self._sidDict.has_key(sid): 265 return self._sidDict[sid] 266 if self.db_handle: 267 self.getDomainFromSQL(sid=sid) 268 if self._sidDict.has_key(sid): 269 return self._sidDict[sid] 270 else: 271 return None
272 273
274 - def getNodeBySunid(self, sunid) :
275 """Return a node from its sunid""" 276 if self._sunidDict.has_key(sunid): 277 return self._sunidDict[sunid] 278 if self.db_handle: 279 self.getDomainFromSQL(sunid=sunid) 280 if self._sunidDict.has_key(sunid): 281 return self._sunidDict[sunid] 282 else: 283 return None
284 285
286 - def getDomains(self) :
287 """Returns an ordered tuple of all SCOP Domains""" 288 if self.db_handle: 289 return self.getRoot().getDescendents('px') 290 else: 291 return self._domains
292 293 294
295 - def write_hie(self, handle) :
296 """Build an HIE SCOP parsable file from this object""" 297 nodes = self._sunidDict.values() 298 # We order nodes to ease comparison with original file 299 nodes.sort(lambda n1,n2: cmp(n1.sunid, n2.sunid)) 300 301 for n in nodes : 302 handle.write(str(n.toHieRecord()))
303 304
305 - def write_des(self, handle) :
306 """Build a DES SCOP parsable file from this object""" 307 nodes = self._sunidDict.values() 308 # Origional SCOP file is not ordered? 309 nodes.sort(lambda n1,n2: cmp(n1.sunid, n2.sunid)) 310 311 for n in nodes : 312 if n != self.root : 313 handle.write(str(n.toDesRecord()))
314 315
316 - def write_cla(self, handle) :
317 """Build a CLA SCOP parsable file from this object""" 318 nodes = self._sidDict.values() 319 # We order nodes to ease comparison with original file 320 nodes.sort(lambda n1,n2: cmp(n1.sunid, n2.sunid)) 321 322 for n in nodes : 323 handle.write(str(n.toClaRecord()))
324 325
326 - def getDomainFromSQL(self, sunid=None, sid=None):
327 """Load a node from the SQL backend using sunid or sid""" 328 if sunid==sid==None: return None 329 330 cur = self.db_handle.cursor() 331 332 if sid: 333 cur.execute("SELECT sunid FROM cla WHERE sid=%s", sid) 334 res = cur.fetchone() 335 if res is None: 336 return None 337 sunid = res[0] 338 339 cur.execute("SELECT * FROM des WHERE sunid=%s", sunid) 340 data = cur.fetchone() 341 342 if data is not None: 343 n = None 344 345 #determine if Node or Domain 346 if data[1] != "px": 347 n = Node(scop=self) 348 349 cur.execute("SELECT child FROM hie WHERE parent=%s", sunid) 350 children = [] 351 for c in cur.fetchall(): 352 children.append(c[0]) 353 n.children = children 354 355 356 else: 357 n = Domain(scop=self) 358 cur.execute("select sid, residues, pdbid from cla where sunid=%s", 359 sunid) 360 361 [n.sid,n.residues,pdbid] = cur.fetchone() 362 n.residues = Residues(n.residues) 363 n.residues.pdbid=pdbid 364 self._sidDict[n.sid] = n 365 366 [n.sunid,n.type,n.sccs,n.description] = data 367 368 if data[1] != 'ro': 369 cur.execute("SELECT parent FROM hie WHERE child=%s", sunid) 370 n.parent = cur.fetchone()[0] 371 372 n.sunid = int(n.sunid) 373 374 self._sunidDict[n.sunid] = n
375 376
377 - def getAscendentFromSQL(self, node, type):
378 """Get ascendents using SQL backend""" 379 if nodeCodeOrder.index(type) >= nodeCodeOrder.index(node.type): return None 380 381 cur = self.db_handle.cursor() 382 cur.execute("SELECT "+type+" from cla WHERE "+node.type+"=%s", (node.sunid)) 383 result = cur.fetchone() 384 if result is not None: 385 return self.getNodeBySunid(result[0]) 386 else: 387 return None
388 389
390 - def getDescendentsFromSQL(self, node, type):
391 """Get descendents of a node using the database backend. This avoids 392 repeated iteration of SQL calls and is therefore much quicker than 393 repeatedly calling node.getChildren(). 394 """ 395 if nodeCodeOrder.index(type) <= nodeCodeOrder.index(node.type): return [] 396 397 des_list = [] 398 399 400 # SQL cla table knows nothing about 'ro' 401 if node.type == 'ro': 402 for c in node.getChildren(): 403 for d in self.getDescendentsFromSQL(c,type): 404 des_list.append(d) 405 return des_list 406 407 cur = self.db_handle.cursor() 408 409 410 if type != 'px': 411 cur.execute("SELECT DISTINCT des.sunid,des.type,des.sccs,description FROM \ 412 cla,des WHERE cla."+node.type+"=%s AND cla."+type+"=des.sunid", (node.sunid)) 413 data = cur.fetchall() 414 for d in data: 415 if not self._sunidDict.has_key(int(d[0])): 416 n = Node(scop=self) 417 [n.sunid,n.type,n.sccs,n.description] = d 418 n.sunid=int(n.sunid) 419 self._sunidDict[n.sunid] = n 420 421 cur.execute("SELECT parent FROM hie WHERE child=%s", n.sunid) 422 n.parent = cur.fetchone()[0] 423 424 cur.execute("SELECT child FROM hie WHERE parent=%s", n.sunid) 425 children = [] 426 for c in cur.fetchall(): 427 children.append(c[0]) 428 n.children = children 429 430 431 des_list.append( self._sunidDict[int(d[0])] ) 432 433 else: 434 cur.execute("SELECT cla.sunid,sid,pdbid,residues,cla.sccs,type,description,sp\ 435 FROM cla,des where cla.sunid=des.sunid and cla."+node.type+"=%s", 436 node.sunid) 437 438 data = cur.fetchall() 439 for d in data: 440 if not self._sunidDict.has_key(int(d[0])): 441 n = Domain(scop=self) 442 #[n.sunid, n.sid, n.pdbid, n.residues, n.sccs, n.type, 443 #n.description,n.parent] = data 444 [n.sunid,n.sid, pdbid,n.residues,n.sccs,n.type,n.description, 445 n.parent] = d[0:8] 446 n.residues = Residues(n.residues) 447 n.residues.pdbid = pdbid 448 n.sunid = int(n.sunid) 449 self._sunidDict[n.sunid] = n 450 self._sidDict[n.sid] = n 451 452 453 des_list.append( self._sunidDict[int(d[0])] ) 454 455 return des_list
456 457 458 459
460 - def write_hie_sql(self, handle):
461 """Write HIE data to SQL database""" 462 cur = handle.cursor() 463 464 cur.execute("DROP TABLE IF EXISTS hie") 465 cur.execute("CREATE TABLE hie (parent INT, child INT, PRIMARY KEY (child),\ 466 INDEX (parent) )") 467 468 for p in self._sunidDict.values(): 469 for c in p.children: 470 cur.execute("INSERT INTO hie VALUES (%s,%s)" % (p.sunid, c.sunid))
471 472
473 - def write_cla_sql(self, handle):
474 """Write CLA data to SQL database""" 475 cur = handle.cursor() 476 477 cur.execute("DROP TABLE IF EXISTS cla") 478 cur.execute("CREATE TABLE cla (sunid INT, sid CHAR(8), pdbid CHAR(4),\ 479 residues VARCHAR(50), sccs CHAR(10), cl INT, cf INT, sf INT, fa INT,\ 480 dm INT, sp INT, px INT, PRIMARY KEY (sunid), INDEX (SID) )") 481 482 for n in self._sidDict.values(): 483 c = n.toClaRecord() 484 cur.execute( "INSERT INTO cla VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", 485 (n.sunid, n.sid, c.residues.pdbid, c.residues, n.sccs, 486 n.getAscendent('cl').sunid, n.getAscendent('cf').sunid, 487 n.getAscendent('sf').sunid, n.getAscendent('fa').sunid, 488 n.getAscendent('dm').sunid, n.getAscendent('sp').sunid, 489 n.sunid ))
490 491
492 - def write_des_sql(self, handle):
493 """Write DES data to SQL database""" 494 cur = handle.cursor() 495 496 cur.execute("DROP TABLE IF EXISTS des") 497 cur.execute("CREATE TABLE des (sunid INT, type CHAR(2), sccs CHAR(10),\ 498 description VARCHAR(255),\ 499 PRIMARY KEY (sunid) )") 500 501 for n in self._sunidDict.values(): 502 cur.execute( "INSERT INTO des VALUES (%s,%s,%s,%s)", 503 ( n.sunid, n.type, n.sccs, n.description ) )
504 505 506
507 -class Node :
508 """ A node in the Scop hierarchy 509 510 sunid -- SCOP unique identifiers. e.g. '14986' 511 512 parent -- The parent node 513 514 children -- A list of child nodes 515 516 sccs -- SCOP concise classification string. e.g. 'a.1.1.2' 517 518 type -- A 2 letter node type code. e.g. 'px' for domains 519 520 description -- 521 522 """
523 - def __init__(self, scop=None) :
524 """Create a Node in the scop hierarchy. If a Scop instance is provided to the 525 constructor, this will be used to lookup related references using the SQL 526 methods. If no instance is provided, it is assumed the whole tree exists 527 and is connected.""" 528 self.sunid='' 529 self.parent = None 530 self.children=[] 531 self.sccs = '' 532 self.type ='' 533 self.description ='' 534 self.scop=scop
535
536 - def __str__(self) :
537 s = [] 538 s.append(str(self.sunid)) 539 s.append(self.sccs) 540 s.append(self.type) 541 s.append(self.description) 542 543 return " ".join(s)
544
545 - def toHieRecord(self):
546 """Return an Hie.Record""" 547 rec = Hie.Record() 548 rec.sunid = str(self.sunid) 549 if self.getParent() : #Not root node 550 rec.parent = str(self.getParent().sunid) 551 else: 552 rec.parent = '-' 553 for c in self.getChildren() : 554 rec.children.append(str(c.sunid)) 555 return rec
556
557 - def toDesRecord(self):
558 """Return a Des.Record""" 559 rec = Des.Record() 560 rec.sunid = str(self.sunid) 561 rec.nodetype = self.type 562 rec.sccs = self.sccs 563 rec.description = self.description 564 return rec
565
566 - def getChildren(self):
567 """Return a list of children of this Node""" 568 if self.scop is None: 569 return self.children 570 else: 571 return map ( self.scop.getNodeBySunid, self.children )
572
573 - def getParent(self):
574 """Return the parent of this Node""" 575 if self.scop is None: 576 return self.parent 577 else: 578 return self.scop.getNodeBySunid( self.parent )
579
580 - def getDescendents( self, node_type) :
581 """ Return a list of all decendent nodes of the given type. Node type can a 582 two letter code or longer description. e.g. 'fa' or 'family' 583 """ 584 if _nodetype_to_code.has_key(node_type): 585 node_type = _nodetype_to_code[node_type] 586 587 nodes = [self] 588 if self.scop: 589 return self.scop.getDescendentsFromSQL(self,node_type) 590 while nodes[0].type != node_type: 591 if nodes[0].type == 'px' : return [] # Fell of the bottom of the hierarchy 592 child_list = [] 593 for n in nodes: 594 for child in n.getChildren(): 595 child_list.append( child ) 596 nodes = child_list 597 598 return nodes
599 600
601 - def getAscendent( self, node_type) :
602 """ Return the ancenstor node of the given type, or None.Node type can a 603 two letter code or longer description. e.g. 'fa' or 'family'""" 604 if _nodetype_to_code.has_key(node_type): 605 node_type = _nodetype_to_code[node_type] 606 607 if self.scop: 608 return self.scop.getAscendentFromSQL(self,node_type) 609 else: 610 n = self 611 if n.type == node_type: return None 612 while n.type != node_type: 613 if n.type == 'ro': return None # Fell of the top of the hierarchy 614 n = n.getParent() 615 616 return n
617 618 619
620 -class Domain(Node) :
621 """ A SCOP domain. A leaf node in the Scop hierarchy. 622 623 sid -- The SCOP domain identifier. e.g. 'd5hbib_' 624 625 residues -- A Residue object. It defines the collection 626 of PDB atoms that make up this domain. 627 """
628 - def __init__(self,scop=None) :
629 Node.__init__(self,scop=scop) 630 self.sid = '' 631 self.residues = None
632
633 - def __str__(self) :
634 s = [] 635 s.append(self.sid) 636 s.append(self.sccs) 637 s.append("("+str(self.residues)+")") 638 639 if not self.getParent() : 640 s.append(self.description) 641 else : 642 sp = self.getParent() 643 dm = sp.getParent() 644 s.append(dm.description) 645 s.append("{"+sp.description+"}") 646 647 return " ".join(s)
648
649 - def toDesRecord(self):
650 """Return a Des.Record""" 651 rec = Node.toDesRecord(self) 652 rec.name = self.sid 653 return rec
654
655 - def toClaRecord(self) :
656 """Return a Cla.Record""" 657 rec = Cla.Record() 658 rec.sid = self.sid 659 rec.residues = self.residues 660 rec.sccs = self.sccs 661 rec.sunid = self.sunid 662 663 n = self 664 while n.sunid != 0: #Not root node 665 rec.hierarchy.append( (n.type, str(n.sunid)) ) 666 n = n.getParent() 667 668 rec.hierarchy.reverse() 669 670 return rec
671
672 -class Astral:
673 """Abstraction of the ASTRAL database, which has sequences for all the SCOP domains, 674 as well as clusterings by percent id or evalue. 675 """ 676
677 - def __init__( self, dir_path=None, version=None, scop=None, ind_file=None, 678 astral_file=None, db_handle=None):
679 """ 680 Initialise the astral database. 681 682 You must provide either a directory of SCOP files: 683 684 dir_path - string, the path to location of the scopseq-x.xx directory 685 (not the directory itself), and 686 version -a version number. 687 688 or, a FASTA file: 689 690 astral_file - string, a path to a fasta file (which will be loaded in memory) 691 692 or, a MYSQL database: 693 694 db_handle - a database handle for a MYSQL database containing a table 695 'astral' with the astral data in it. This can be created 696 using writeToSQL. 697 698 Note that the ind_file argument is deprecated. 699 """ 700 if ind_file : 701 raise RuntimeError, "The ind_file (index file) argument is deprecated" 702 703 if astral_file==dir_path==db_handle==None: 704 raise RunTimeError,"Need either file handle, or (dir_path + version)\ 705 or database handle to construct Astral" 706 if not scop: 707 raise RuntimeError, "Must provide a Scop instance to construct" 708 709 self.scop = scop 710 self.db_handle = db_handle 711 712 713 if not astral_file and not db_handle: 714 if dir_path == None or version == None: 715 raise RuntimeError, "must provide dir_path and version" 716 717 self.version = version 718 self.path = os.path.join( dir_path, "scopseq-%s" % version) 719 astral_file = "astral-scopdom-seqres-all-%s.fa" % self.version 720 astral_file = os.path.join (self.path, astral_file) 721 722 if astral_file: 723 #Build a dictionary of SeqRecord objects in the FASTA file, IN MEMORY 724 self.fasta_dict = SeqIO.to_dict(SeqIO.parse(open(astral_file), "fasta")) 725 726 self.astral_file = astral_file 727 self.EvDatasets = {} 728 self.EvDatahash = {} 729 self.IdDatasets = {} 730 self.IdDatahash = {}
731 732
733 - def domainsClusteredByEv(self,id):
734 """get domains clustered by evalue""" 735 if not self.EvDatasets.has_key(id): 736 if self.db_handle: 737 self.EvDatasets[id] = self.getAstralDomainsFromSQL(astralEv_to_sql[id]) 738 739 else: 740 if not self.path: 741 raise RuntimeError, "No scopseq directory specified" 742 743 file_prefix = "astral-scopdom-seqres-sel-gs" 744 filename = "%s-e100m-%s-%s.id" % (file_prefix, astralEv_to_file[id] , 745 self.version) 746 filename = os.path.join(self.path,filename) 747 self.EvDatasets[id] = self.getAstralDomainsFromFile(filename) 748 return self.EvDatasets[id]
749 750
751 - def domainsClusteredById(self,id):
752 """get domains clustered by percent id""" 753 if not self.IdDatasets.has_key(id): 754 if self.db_handle: 755 self.IdDatasets[id] = self.getAstralDomainsFromSQL("id"+str(id)) 756 757 else: 758 if not self.path: 759 raise RuntimeError, "No scopseq directory specified" 760 761 file_prefix = "astral-scopdom-seqres-sel-gs" 762 filename = "%s-bib-%s-%s.id" % (file_prefix, id, self.version) 763 filename = os.path.join(self.path,filename) 764 self.IdDatasets[id] = self.getAstralDomainsFromFile(filename) 765 return self.IdDatasets[id]
766 767
768 - def getAstralDomainsFromFile(self,filename=None,file_handle=None):
769 """Get the scop domains from a file containing a list of sids""" 770 if file_handle == filename == none: 771 raise RuntimeError, "You must provide a filename or handle" 772 if not file_handle: 773 file_handle = open(filename) 774 doms = [] 775 while 1: 776 line = file_handle.readline() 777 if not line: 778 break 779 line = line.rstrip() 780 doms.append(line) 781 if filename: 782 file_handle.close() 783 784 doms = filter( lambda a: a[0]=='d', doms ) 785 doms = map( self.scop.getDomainBySid, doms ) 786 return doms
787
788 - def getAstralDomainsFromSQL(self, column):
789 """Load a set of astral domains from a column in the astral table of a MYSQL 790 database (which can be created with writeToSQL(...)""" 791 cur = self.db_handle.cursor() 792 cur.execute("SELECT sid FROM astral WHERE "+column+"=1") 793 data = cur.fetchall() 794 data = map( lambda x: self.scop.getDomainBySid(x[0]), data) 795 796 return data
797 798
799 - def getSeqBySid(self,domain):
800 """get the seq record of a given domain from its sid""" 801 if self.db_handle is None: 802 return self.fasta_dict[domain].seq 803 804 else: 805 cur = self.db_handle.cursor() 806 cur.execute("SELECT seq FROM astral WHERE sid=%s", domain) 807 return Seq(cur.fetchone()[0])
808
809 - def getSeq(self,domain):
810 """Return seq associated with domain""" 811 return self.getSeqBySid(domain.sid)
812 813
814 - def hashedDomainsById(self,id):
815 """Get domains clustered by sequence identity in a dict""" 816 if not self.IdDatahash.has_key(id): 817 self.IdDatahash[id] = {} 818 for d in self.domainsClusteredById(id): 819 self.IdDatahash[id][d] = 1 820 return self.IdDatahash[id]
821
822 - def hashedDomainsByEv(self,id):
823 """Get domains clustered by evalue in a dict""" 824 if not self.EvDatahash.has_key(id): 825 self.EvDatahash[id] = {} 826 for d in self.domainsClusteredByEv(id): 827 self.EvDatahash[id][d] = 1 828 return self.EvDatahash[id]
829 830
831 - def isDomainInId(self,dom,id):
832 """Returns true if the domain is in the astral clusters for percent ID""" 833 return self.hashedDomainsById(id).has_key(dom)
834
835 - def isDomainInEv(self,dom,id):
836 """Returns true if the domain is in the ASTRAL clusters for evalues""" 837 return self.hashedDomainsByEv(id).has_key(dom)
838 839
840 - def writeToSQL(self, db_handle):
841 """Write the ASTRAL database to a MYSQL database""" 842 cur = db_handle.cursor() 843 844 cur.execute("DROP TABLE IF EXISTS astral") 845 cur.execute("CREATE TABLE astral (sid CHAR(8), seq TEXT, PRIMARY KEY (sid))") 846 847 for dom in self.fasta_dict.keys(): 848 cur.execute( "INSERT INTO astral (sid,seq) values (%s,%s)", 849 (dom, self.fasta_dict[dom].seq.data)) 850 851 852 for i in astralBibIds: 853 cur.execute("ALTER TABLE astral ADD (id"+str(i)+" TINYINT)") 854 855 for d in self.domainsClusteredById(i): 856 cur.execute("UPDATE astral SET id"+str(i)+"=1 WHERE sid=%s", 857 d.sid) 858 859 for ev in astralEvs: 860 cur.execute("ALTER TABLE astral ADD ("+astralEv_to_sql[ev]+" TINYINT)") 861 862 for d in self.domainsClusteredByEv(ev): 863 864 cur.execute("UPDATE astral SET "+astralEv_to_sql[ev]+"=1 WHERE sid=%s", 865 d.sid)
866