Package Bio :: Package SCOP
[hide private]
[frames] | no frames]

Source Code for Package Bio.SCOP

  1  # Copyright 2001 by Gavin E. Crooks.  All rights reserved. 
  2  # Modifications Copyright 2004/2005 James Casbon. All rights Reserved. 
  3  # Modifications Copyright 2010 Jeffrey Finkelstein. All rights reserved. 
  4  # 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  # 
  9  # Changes made by James Casbon: 
 10  # - New Astral class 
 11  # - SQL functionality for both Scop and Astral classes 
 12  # - All sunids are int not strings 
 13  # 
 14  # Code written by Jeffrey Chang to access SCOP over the internet, which 
 15  # was previously in Bio.WWW.SCOP, has now been merged into this module. 
 16   
 17   
 18  """ SCOP: Structural Classification of Proteins. 
 19   
 20  The SCOP database aims to provide a manually constructed classification of 
 21  all know protein structures into a hierarchy, the main levels of which 
 22  are family, superfamily and fold. 
 23   
 24  * "SCOP":http://scop.mrc-lmb.cam.ac.uk/scop/ 
 25   
 26  * "Introduction":http://scop.mrc-lmb.cam.ac.uk/scop/intro.html 
 27   
 28  * "SCOP parsable files":http://scop.mrc-lmb.cam.ac.uk/scop/parse/ 
 29   
 30  The Scop object in this module represents the entire SCOP classification. It 
 31  can be built from the three SCOP parsable files, modified is so desired, and 
 32  converted back to the same file formats. A single SCOP domain (represented 
 33  by the Domain class) can be obtained from Scop using the domain's SCOP 
 34  identifier (sid). 
 35   
 36   
 37  nodeCodeDict  -- A mapping between known 2 letter node codes and a longer 
 38                    description. The known node types are 'cl' (class), 'cf' 
 39                    (fold), 'sf' (superfamily), 'fa' (family), 'dm' (domain), 
 40                    'sp' (species), 'px' (domain). Additional node types may 
 41                    be added in the future. 
 42   
 43  This module also provides code to access SCOP over the WWW. 
 44   
 45  Functions: 
 46  search        -- Access the main CGI script. 
 47  _open         -- Internally used function. 
 48   
 49  """ 
 50   
 51  import os 
 52  import re 
 53   
 54  import Des 
 55  import Cla 
 56  import Hie 
 57  import Residues 
 58  from Bio import SeqIO 
 59  from Bio.Seq import Seq 
 60   
 61   
 62  nodeCodeDict = { 'cl':'class', 'cf':'fold', 'sf':'superfamily', 
 63                   'fa':'family', 'dm':'protein', 'sp':'species', 'px':'domain'} 
 64   
 65   
 66  _nodetype_to_code= { 'class': 'cl', 'fold': 'cf', 'superfamily': 'sf', 
 67                       'family': 'fa', 'protein': 'dm', 'species': 'sp', 'domain': 'px'} 
 68   
 69  nodeCodeOrder = [ 'ro', 'cl', 'cf', 'sf', 'fa', 'dm', 'sp', 'px' ] 
 70   
 71  astralBibIds = [10,20,25,30,35,40,50,70,90,95,100] 
 72   
 73  astralEvs = [10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 1e-4, 1e-5, 1e-10, 1e-15, 
 74               1e-20, 1e-25, 1e-50] 
 75   
 76  astralEv_to_file = { 10: 'e+1', 5: 'e+0,7', 1: 'e+0', 0.5: 'e-0,3', 0.1: 'e-1', 
 77                       0.05: 'e-1,3', 0.01: 'e-2', 0.005: 'e-2,3', 0.001: 'e-3', 
 78                       1e-4: 'e-4',  1e-5: 'e-5', 1e-10: 'e-10', 1e-15: 'e-15', 
 79                       1e-20: 'e-20', 1e-25: 'e-25', 1e-50: 'e-50' } 
 80   
 81  astralEv_to_sql = { 10: 'e1', 5: 'e0_7', 1: 'e0', 0.5: 'e_0_3', 0.1: 'e_1', 
 82                       0.05: 'e_1_3', 0.01: 'e_2', 0.005: 'e_2_3', 0.001: 'e_3', 
 83                       1e-4: 'e_4',  1e-5: 'e_5', 1e-10: 'e_10', 1e-15: 'e_15', 
 84                       1e-20: 'e_20', 1e-25: 'e_25', 1e-50: 'e_50' } 
 85   
 86  try: 
 87      #See if the cmp function exists (will on Python 2) 
 88      _cmp = cmp 
 89  except NameError: 
90 - def _cmp(a,b):
91 """Implementation of cmp(x,y) for Python 3 (PRIVATE). 92 93 Based on Python 3 docs which say if you really need the cmp() 94 functionality, you could use the expression (a > b) - (a < b) 95 as the equivalent for cmp(a, b) 96 """ 97 return (a > b) - (a < b)
98 99
100 -def cmp_sccs(sccs1, sccs2):
101 """Order SCOP concise classification strings (sccs). 102 103 a.4.5.1 < a.4.5.11 < b.1.1.1 104 105 A sccs (e.g. a.4.5.11) compactly represents a domain's classification. 106 The letter represents the class, and the numbers are the fold, 107 superfamily, and family, respectively. 108 109 """ 110 111 s1 = sccs1.split(".") 112 s2 = sccs2.split(".") 113 114 if s1[0] != s2[0]: 115 return _cmp(s1[0], s2[0]) 116 117 s1 = list(map(int, s1[1:])) 118 s2 = list(map(int, s2[1:])) 119 120 return _cmp(s1,s2)
121 122 123 _domain_re = re.compile(r">?([\w_\.]*)\s+([\w\.]*)\s+\(([^)]*)\) (.*)") 124 125
126 -def parse_domain(str):
127 """Convert an ASTRAL header string into a Scop domain. 128 129 An ASTRAL (http://astral.stanford.edu/) header contains a concise 130 description of a SCOP domain. A very similar format is used when a 131 Domain object is converted into a string. The Domain returned by this 132 method contains most of the SCOP information, but it will not be located 133 within the SCOP hierarchy (i.e. The parent node will be None). The 134 description is composed of the SCOP protein and species descriptions. 135 136 A typical ASTRAL header looks like -- 137 >d1tpt_1 a.46.2.1 (1-70) Thymidine phosphorylase {Escherichia coli} 138 """ 139 140 m = _domain_re.match(str) 141 if (not m): 142 raise ValueError("Domain: "+ str) 143 144 dom = Domain() 145 dom.sid = m.group(1) 146 dom.sccs = m.group(2) 147 dom.residues = Residues.Residues(m.group(3)) 148 if not dom.residues.pdbid: 149 dom.residues.pdbid= dom.sid[1:5] 150 dom.description = m.group(4).strip() 151 152 return dom
153 154
155 -def _open_scop_file(scop_dir_path, version, filetype):
156 filename = "dir.%s.scop.txt_%s" % (filetype,version) 157 handle = open(os.path.join( scop_dir_path, filename)) 158 return handle
159 160
161 -class Scop(object):
162 """The entire SCOP hierarchy. 163 164 root -- The root node of the hierarchy 165 """
166 - def __init__(self, cla_handle=None, des_handle=None, hie_handle=None, 167 dir_path=None, db_handle=None, version=None):
168 """Build the SCOP hierarchy from the SCOP parsable files, or a sql backend. 169 170 If no file handles are given, then a Scop object with a single 171 empty root node is returned. 172 173 If a directory and version are given (with dir_path=.., version=...) or 174 file handles for each file, the whole scop tree will be built in memory. 175 176 If a MySQLdb database handle is given, the tree will be built as needed, 177 minimising construction times. To build the SQL database to the methods 178 write_xxx_sql to create the tables. 179 180 """ 181 self._sidDict = {} 182 self._sunidDict = {} 183 184 if all(h is None for h in [cla_handle, des_handle, hie_handle, dir_path, db_handle]): 185 return 186 187 if dir_path is None and db_handle is None: 188 if cla_handle is None or des_handle is None or hie_handle is None: 189 raise RuntimeError("Need CLA, DES and HIE files to build SCOP") 190 191 sunidDict = {} 192 193 self.db_handle = db_handle 194 try: 195 if db_handle: 196 # do nothing if we have a db handle, we'll do it all on the fly 197 pass 198 else: 199 # open SCOP parseable files 200 if dir_path: 201 if not version: 202 raise RuntimeError("Need SCOP version to find parsable files in directory") 203 if cla_handle or des_handle or hie_handle: 204 raise RuntimeError("Cannot specify SCOP directory and specific files") 205 206 cla_handle = _open_scop_file( dir_path, version, 'cla') 207 des_handle = _open_scop_file( dir_path, version, 'des') 208 hie_handle = _open_scop_file( dir_path, version, 'hie') 209 210 root = Node() 211 domains = [] 212 root.sunid=0 213 root.type='ro' 214 sunidDict[root.sunid] = root 215 self.root = root 216 root.description = 'SCOP Root' 217 218 # Build the rest of the nodes using the DES file 219 records = Des.parse(des_handle) 220 for record in records: 221 if record.nodetype =='px': 222 n = Domain() 223 n.sid = record.name 224 domains.append(n) 225 else : 226 n = Node() 227 n.sunid = record.sunid 228 n.type = record.nodetype 229 n.sccs = record.sccs 230 n.description = record.description 231 232 sunidDict[n.sunid] = n 233 234 # Glue all of the Nodes together using the HIE file 235 records = Hie.parse(hie_handle) 236 for record in records: 237 if record.sunid not in sunidDict: 238 print record.sunid 239 240 n = sunidDict[record.sunid] 241 242 if record.parent != '' : # Not root node 243 244 if record.parent not in sunidDict: 245 raise ValueError("Incomplete data?") 246 247 n.parent = sunidDict[record.parent] 248 249 for c in record.children: 250 if c not in sunidDict: 251 raise ValueError("Incomplete data?") 252 n.children.append(sunidDict[c]) 253 254 # Fill in the gaps with information from the CLA file 255 sidDict = {} 256 records = Cla.parse(cla_handle) 257 for record in records: 258 n = sunidDict[record.sunid] 259 assert n.sccs == record.sccs 260 assert n.sid == record.sid 261 n.residues = record.residues 262 sidDict[n.sid] = n 263 264 # Clean up 265 self._sunidDict = sunidDict 266 self._sidDict = sidDict 267 self._domains = tuple(domains) 268 269 finally: 270 if dir_path: 271 # If we opened the files, we close the files 272 if cla_handle: 273 cla_handle.close() 274 if des_handle: 275 des_handle.close() 276 if hie_handle: 277 hie_handle.close()
278
279 - def getRoot(self):
280 return self.getNodeBySunid(0)
281
282 - def getDomainBySid(self, sid):
283 """Return a domain from its sid""" 284 if sid in self._sidDict: 285 return self._sidDict[sid] 286 if self.db_handle: 287 self.getDomainFromSQL(sid=sid) 288 if sid in self._sidDict: 289 return self._sidDict[sid] 290 else: 291 return None
292
293 - def getNodeBySunid(self, sunid):
294 """Return a node from its sunid""" 295 if sunid in self._sunidDict: 296 return self._sunidDict[sunid] 297 if self.db_handle: 298 self.getDomainFromSQL(sunid=sunid) 299 if sunid in self._sunidDict: 300 return self._sunidDict[sunid] 301 else: 302 return None
303
304 - def getDomains(self):
305 """Returns an ordered tuple of all SCOP Domains""" 306 if self.db_handle: 307 return self.getRoot().getDescendents('px') 308 else: 309 return self._domains
310
311 - def write_hie(self, handle):
312 """Build an HIE SCOP parsable file from this object""" 313 nodes = self._sunidDict.values() 314 # We order nodes to ease comparison with original file 315 nodes.sort(key = lambda n: n.sunid) 316 for n in nodes: 317 handle.write(str(n.toHieRecord()))
318
319 - def write_des(self, handle):
320 """Build a DES SCOP parsable file from this object""" 321 nodes = self._sunidDict.values() 322 # Origional SCOP file is not ordered? 323 nodes.sort(key = lambda n: n.sunid) 324 for n in nodes: 325 if n != self.root: 326 handle.write(str(n.toDesRecord()))
327
328 - def write_cla(self, handle):
329 """Build a CLA SCOP parsable file from this object""" 330 nodes = self._sidDict.values() 331 # We order nodes to ease comparison with original file 332 nodes.sort(key = lambda n: n.sunid) 333 for n in nodes: 334 handle.write(str(n.toClaRecord()))
335
336 - def getDomainFromSQL(self, sunid=None, sid=None):
337 """Load a node from the SQL backend using sunid or sid""" 338 if sunid is None and sid is None: 339 return None 340 341 cur = self.db_handle.cursor() 342 343 if sid: 344 cur.execute("SELECT sunid FROM cla WHERE sid=%s", sid) 345 res = cur.fetchone() 346 if res is None: 347 return None 348 sunid = res[0] 349 350 cur.execute("SELECT * FROM des WHERE sunid=%s", sunid) 351 data = cur.fetchone() 352 353 if data is not None: 354 n = None 355 356 #determine if Node or Domain 357 if data[1] != "px": 358 n = Node(scop=self) 359 360 cur.execute("SELECT child FROM hie WHERE parent=%s", sunid) 361 children = [] 362 for c in cur.fetchall(): 363 children.append(c[0]) 364 n.children = children 365 else: 366 n = Domain(scop=self) 367 cur.execute("select sid, residues, pdbid from cla where sunid=%s", 368 sunid) 369 370 [n.sid,n.residues,pdbid] = cur.fetchone() 371 n.residues = Residues.Residues(n.residues) 372 n.residues.pdbid=pdbid 373 self._sidDict[n.sid] = n 374 375 [n.sunid,n.type,n.sccs,n.description] = data 376 377 if data[1] != 'ro': 378 cur.execute("SELECT parent FROM hie WHERE child=%s", sunid) 379 n.parent = cur.fetchone()[0] 380 381 n.sunid = int(n.sunid) 382 383 self._sunidDict[n.sunid] = n
384
385 - def getAscendentFromSQL(self, node, type):
386 """Get ascendents using SQL backend""" 387 if nodeCodeOrder.index(type) >= nodeCodeOrder.index(node.type): 388 return None 389 390 cur = self.db_handle.cursor() 391 cur.execute("SELECT "+type+" from cla WHERE "+node.type+"=%s", (node.sunid)) 392 result = cur.fetchone() 393 if result is not None: 394 return self.getNodeBySunid(result[0]) 395 else: 396 return None
397
398 - def getDescendentsFromSQL(self, node, type):
399 """Get descendents of a node using the database backend. This avoids 400 repeated iteration of SQL calls and is therefore much quicker than 401 repeatedly calling node.getChildren(). 402 """ 403 if nodeCodeOrder.index(type) <= nodeCodeOrder.index(node.type): 404 return [] 405 406 des_list = [] 407 408 # SQL cla table knows nothing about 'ro' 409 if node.type == 'ro': 410 for c in node.getChildren(): 411 for d in self.getDescendentsFromSQL(c,type): 412 des_list.append(d) 413 return des_list 414 415 cur = self.db_handle.cursor() 416 417 if type != 'px': 418 cur.execute("SELECT DISTINCT des.sunid,des.type,des.sccs,description FROM \ 419 cla,des WHERE cla."+node.type+"=%s AND cla."+type+"=des.sunid", (node.sunid)) 420 data = cur.fetchall() 421 for d in data: 422 if int(d[0]) not in self._sunidDict: 423 n = Node(scop=self) 424 [n.sunid,n.type,n.sccs,n.description] = d 425 n.sunid=int(n.sunid) 426 self._sunidDict[n.sunid] = n 427 428 cur.execute("SELECT parent FROM hie WHERE child=%s", n.sunid) 429 n.parent = cur.fetchone()[0] 430 431 cur.execute("SELECT child FROM hie WHERE parent=%s", n.sunid) 432 children = [] 433 for c in cur.fetchall(): 434 children.append(c[0]) 435 n.children = children 436 437 des_list.append( self._sunidDict[int(d[0])] ) 438 439 else: 440 cur.execute("SELECT cla.sunid,sid,pdbid,residues,cla.sccs,type,description,sp\ 441 FROM cla,des where cla.sunid=des.sunid and cla."+node.type+"=%s", 442 node.sunid) 443 444 data = cur.fetchall() 445 for d in data: 446 if int(d[0]) not in self._sunidDict: 447 n = Domain(scop=self) 448 #[n.sunid, n.sid, n.pdbid, n.residues, n.sccs, n.type, 449 #n.description,n.parent] = data 450 [n.sunid,n.sid, pdbid,n.residues,n.sccs,n.type,n.description, 451 n.parent] = d[0:8] 452 n.residues = Residues.Residues(n.residues) 453 n.residues.pdbid = pdbid 454 n.sunid = int(n.sunid) 455 self._sunidDict[n.sunid] = n 456 self._sidDict[n.sid] = n 457 458 des_list.append( self._sunidDict[int(d[0])] ) 459 460 return des_list
461
462 - def write_hie_sql(self, handle):
463 """Write HIE data to SQL database""" 464 cur = handle.cursor() 465 466 cur.execute("DROP TABLE IF EXISTS hie") 467 cur.execute("CREATE TABLE hie (parent INT, child INT, PRIMARY KEY (child),\ 468 INDEX (parent) )") 469 470 for p in self._sunidDict.itervalues(): 471 for c in p.children: 472 cur.execute("INSERT INTO hie VALUES (%s,%s)" % (p.sunid, c.sunid))
473
474 - def write_cla_sql(self, handle):
475 """Write CLA data to SQL database""" 476 cur = handle.cursor() 477 478 cur.execute("DROP TABLE IF EXISTS cla") 479 cur.execute("CREATE TABLE cla (sunid INT, sid CHAR(8), pdbid CHAR(4),\ 480 residues VARCHAR(50), sccs CHAR(10), cl INT, cf INT, sf INT, fa INT,\ 481 dm INT, sp INT, px INT, PRIMARY KEY (sunid), INDEX (SID) )") 482 483 for n in self._sidDict.itervalues(): 484 c = n.toClaRecord() 485 cur.execute( "INSERT INTO cla VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", 486 (n.sunid, n.sid, c.residues.pdbid, c.residues, n.sccs, 487 n.getAscendent('cl').sunid, n.getAscendent('cf').sunid, 488 n.getAscendent('sf').sunid, n.getAscendent('fa').sunid, 489 n.getAscendent('dm').sunid, n.getAscendent('sp').sunid, 490 n.sunid ))
491
492 - def write_des_sql(self, handle):
493 """Write DES data to SQL database""" 494 cur = handle.cursor() 495 496 cur.execute("DROP TABLE IF EXISTS des") 497 cur.execute("CREATE TABLE des (sunid INT, type CHAR(2), sccs CHAR(10),\ 498 description VARCHAR(255),\ 499 PRIMARY KEY (sunid) )") 500 501 for n in self._sunidDict.itervalues(): 502 cur.execute( "INSERT INTO des VALUES (%s,%s,%s,%s)", 503 ( n.sunid, n.type, n.sccs, n.description ) )
504 505
506 -class Node(object):
507 """ A node in the Scop hierarchy 508 509 sunid -- SCOP unique identifiers. e.g. '14986' 510 511 parent -- The parent node 512 513 children -- A list of child nodes 514 515 sccs -- SCOP concise classification string. e.g. 'a.1.1.2' 516 517 type -- A 2 letter node type code. e.g. 'px' for domains 518 519 description -- 520 521 """
522 - def __init__(self, scop=None):
523 """Create a Node in the scop hierarchy. If a Scop instance is provided to the 524 constructor, this will be used to lookup related references using the SQL 525 methods. If no instance is provided, it is assumed the whole tree exists 526 and is connected.""" 527 self.sunid='' 528 self.parent = None 529 self.children=[] 530 self.sccs = '' 531 self.type ='' 532 self.description ='' 533 self.scop=scop
534
535 - def __str__(self):
536 s = [] 537 s.append(str(self.sunid)) 538 s.append(self.sccs) 539 s.append(self.type) 540 s.append(self.description) 541 542 return " ".join(s)
543
544 - def toHieRecord(self):
545 """Return an Hie.Record""" 546 rec = Hie.Record() 547 rec.sunid = str(self.sunid) 548 if self.getParent(): # Not root node 549 rec.parent = str(self.getParent().sunid) 550 else: 551 rec.parent = '-' 552 for c in self.getChildren(): 553 rec.children.append(str(c.sunid)) 554 return rec
555
556 - def toDesRecord(self):
557 """Return a Des.Record""" 558 rec = Des.Record() 559 rec.sunid = str(self.sunid) 560 rec.nodetype = self.type 561 rec.sccs = self.sccs 562 rec.description = self.description 563 return rec
564
565 - def getChildren(self):
566 """Return a list of children of this Node""" 567 if self.scop is None: 568 return self.children 569 else: 570 return map( self.scop.getNodeBySunid, self.children )
571
572 - def getParent(self):
573 """Return the parent of this Node""" 574 if self.scop is None: 575 return self.parent 576 else: 577 return self.scop.getNodeBySunid( self.parent )
578
579 - def getDescendents( self, node_type):
580 """ Return a list of all decendent nodes of the given type. Node type can a 581 two letter code or longer description. e.g. 'fa' or 'family' 582 """ 583 if node_type in _nodetype_to_code: 584 node_type = _nodetype_to_code[node_type] 585 586 nodes = [self] 587 if self.scop: 588 return self.scop.getDescendentsFromSQL(self,node_type) 589 while nodes[0].type != node_type: 590 if nodes[0].type == 'px': 591 return [] # Fell of the bottom of the hierarchy 592 child_list = [] 593 for n in nodes: 594 for child in n.getChildren(): 595 child_list.append( child ) 596 nodes = child_list 597 598 return nodes
599
600 - def getAscendent( self, node_type):
601 """ Return the ancenstor node of the given type, or None.Node type can a 602 two letter code or longer description. e.g. 'fa' or 'family'""" 603 if node_type in _nodetype_to_code: 604 node_type = _nodetype_to_code[node_type] 605 606 if self.scop: 607 return self.scop.getAscendentFromSQL(self,node_type) 608 else: 609 n = self 610 if n.type == node_type: 611 return None 612 613 while n.type != node_type: 614 if n.type == 'ro': 615 return None # Fell of the top of the hierarchy 616 n = n.getParent() 617 618 return n
619 620
621 -class Domain(Node):
622 """ A SCOP domain. A leaf node in the Scop hierarchy. 623 624 sid -- The SCOP domain identifier. e.g. 'd5hbib_' 625 626 residues -- A Residue object. It defines the collection 627 of PDB atoms that make up this domain. 628 """
629 - def __init__(self,scop=None):
630 Node.__init__(self,scop=scop) 631 self.sid = '' 632 self.residues = None
633
634 - def __str__(self):
635 s = [] 636 s.append(self.sid) 637 s.append(self.sccs) 638 s.append("("+str(self.residues)+")") 639 640 if not self.getParent(): 641 s.append(self.description) 642 else: 643 sp = self.getParent() 644 dm = sp.getParent() 645 s.append(dm.description) 646 s.append("{"+sp.description+"}") 647 648 return " ".join(s)
649
650 - def toDesRecord(self):
651 """Return a Des.Record""" 652 rec = Node.toDesRecord(self) 653 rec.name = self.sid 654 return rec
655
656 - def toClaRecord(self):
657 """Return a Cla.Record""" 658 rec = Cla.Record() 659 rec.sid = self.sid 660 rec.residues = self.residues 661 rec.sccs = self.sccs 662 rec.sunid = self.sunid 663 664 n = self 665 while n.sunid != 0: # Not root node 666 rec.hierarchy[n.type] = str(n.sunid) 667 n = n.getParent() 668 669 # Order does not matter in the hierarchy field. For more info, see 670 # http://scop.mrc-lmb.cam.ac.uk/scop/release-notes.html 671 #rec.hierarchy.reverse() 672 673 return rec
674 675
676 -class Astral(object):
677 """Abstraction of the ASTRAL database, which has sequences for all the SCOP domains, 678 as well as clusterings by percent id or evalue. 679 """ 680
681 - def __init__( self, dir_path=None, version=None, scop=None, 682 astral_file=None, db_handle=None):
683 """ 684 Initialise the astral database. 685 686 You must provide either a directory of SCOP files: 687 688 dir_path - string, the path to location of the scopseq-x.xx directory 689 (not the directory itself), and 690 version -a version number. 691 692 or, a FASTA file: 693 694 astral_file - string, a path to a fasta file (which will be loaded in memory) 695 696 or, a MYSQL database: 697 698 db_handle - a database handle for a MYSQL database containing a table 699 'astral' with the astral data in it. This can be created 700 using writeToSQL. 701 """ 702 703 if astral_file is None and dir_path is None and db_handle is None: 704 raise RuntimeError("Need either file handle, or (dir_path + " 705 + "version) or database handle to construct Astral") 706 if not scop: 707 raise RuntimeError("Must provide a Scop instance to construct") 708 709 self.scop = scop 710 self.db_handle = db_handle 711 712 if not astral_file and not db_handle: 713 if dir_path is None or version is None: 714 raise RuntimeError("must provide dir_path and version") 715 716 self.version = version 717 self.path = os.path.join( dir_path, "scopseq-%s" % version) 718 astral_file = "astral-scopdom-seqres-all-%s.fa" % self.version 719 astral_file = os.path.join(self.path, astral_file) 720 721 if astral_file: 722 #Build a dictionary of SeqRecord objects in the FASTA file, IN MEMORY 723 self.fasta_dict = SeqIO.to_dict(SeqIO.parse(open(astral_file), "fasta")) 724 725 self.astral_file = astral_file 726 self.EvDatasets = {} 727 self.EvDatahash = {} 728 self.IdDatasets = {} 729 self.IdDatahash = {}
730
731 - def domainsClusteredByEv(self,id):
732 """get domains clustered by evalue""" 733 if id not in self.EvDatasets: 734 if self.db_handle: 735 self.EvDatasets[id] = self.getAstralDomainsFromSQL(astralEv_to_sql[id]) 736 737 else: 738 if not self.path: 739 raise RuntimeError("No scopseq directory specified") 740 741 file_prefix = "astral-scopdom-seqres-sel-gs" 742 filename = "%s-e100m-%s-%s.id" % (file_prefix, astralEv_to_file[id] , 743 self.version) 744 filename = os.path.join(self.path,filename) 745 self.EvDatasets[id] = self.getAstralDomainsFromFile(filename) 746 return self.EvDatasets[id]
747
748 - def domainsClusteredById(self,id):
749 """get domains clustered by percent id""" 750 if id not in self.IdDatasets: 751 if self.db_handle: 752 self.IdDatasets[id] = self.getAstralDomainsFromSQL("id"+str(id)) 753 else: 754 if not self.path: 755 raise RuntimeError("No scopseq directory specified") 756 757 file_prefix = "astral-scopdom-seqres-sel-gs" 758 filename = "%s-bib-%s-%s.id" % (file_prefix, id, self.version) 759 filename = os.path.join(self.path,filename) 760 self.IdDatasets[id] = self.getAstralDomainsFromFile(filename) 761 return self.IdDatasets[id]
762
763 - def getAstralDomainsFromFile(self,filename=None,file_handle=None):
764 """Get the scop domains from a file containing a list of sids""" 765 if file_handle is None and filename is None: 766 raise RuntimeError("You must provide a filename or handle") 767 if not file_handle: 768 file_handle = open(filename) 769 doms = [] 770 while 1: 771 line = file_handle.readline() 772 if not line: 773 break 774 line = line.rstrip() 775 doms.append(line) 776 if filename: 777 file_handle.close() 778 779 doms = filter( lambda a: a[0]=='d', doms ) 780 doms = map( self.scop.getDomainBySid, doms ) 781 return doms
782
783 - def getAstralDomainsFromSQL(self, column):
784 """Load a set of astral domains from a column in the astral table of a MYSQL 785 database (which can be created with writeToSQL(...)""" 786 cur = self.db_handle.cursor() 787 cur.execute("SELECT sid FROM astral WHERE "+column+"=1") 788 data = cur.fetchall() 789 data = map( lambda x: self.scop.getDomainBySid(x[0]), data) 790 791 return data
792
793 - def getSeqBySid(self,domain):
794 """get the seq record of a given domain from its sid""" 795 if self.db_handle is None: 796 return self.fasta_dict[domain].seq 797 else: 798 cur = self.db_handle.cursor() 799 cur.execute("SELECT seq FROM astral WHERE sid=%s", domain) 800 return Seq(cur.fetchone()[0])
801
802 - def getSeq(self,domain):
803 """Return seq associated with domain""" 804 return self.getSeqBySid(domain.sid)
805
806 - def hashedDomainsById(self,id):
807 """Get domains clustered by sequence identity in a dict""" 808 if id not in self.IdDatahash: 809 self.IdDatahash[id] = {} 810 for d in self.domainsClusteredById(id): 811 self.IdDatahash[id][d] = 1 812 return self.IdDatahash[id]
813
814 - def hashedDomainsByEv(self,id):
815 """Get domains clustered by evalue in a dict""" 816 if id not in self.EvDatahash: 817 self.EvDatahash[id] = {} 818 for d in self.domainsClusteredByEv(id): 819 self.EvDatahash[id][d] = 1 820 return self.EvDatahash[id]
821
822 - def isDomainInId(self,dom,id):
823 """Returns true if the domain is in the astral clusters for percent ID""" 824 return dom in self.hashedDomainsById(id)
825
826 - def isDomainInEv(self,dom,id):
827 """Returns true if the domain is in the ASTRAL clusters for evalues""" 828 return dom in self.hashedDomainsByEv(id)
829
830 - def writeToSQL(self, db_handle):
831 """Write the ASTRAL database to a MYSQL database""" 832 cur = db_handle.cursor() 833 834 cur.execute("DROP TABLE IF EXISTS astral") 835 cur.execute("CREATE TABLE astral (sid CHAR(8), seq TEXT, PRIMARY KEY (sid))") 836 837 for dom in self.fasta_dict: 838 cur.execute("INSERT INTO astral (sid,seq) values (%s,%s)", 839 (dom, self.fasta_dict[dom].seq.data)) 840 841 for i in astralBibIds: 842 cur.execute("ALTER TABLE astral ADD (id"+str(i)+" TINYINT)") 843 844 for d in self.domainsClusteredById(i): 845 cur.execute("UPDATE astral SET id"+str(i)+"=1 WHERE sid=%s", 846 d.sid) 847 848 for ev in astralEvs: 849 cur.execute("ALTER TABLE astral ADD ("+astralEv_to_sql[ev]+" TINYINT)") 850 851 for d in self.domainsClusteredByEv(ev): 852 cur.execute("UPDATE astral SET "+astralEv_to_sql[ev]+"=1 WHERE sid=%s", 853 d.sid)
854 855
856 -def search(pdb=None, key=None, sid=None, disp=None, dir=None, loc=None, 857 cgi='http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi', **keywds):
858 """search(pdb=None, key=None, sid=None, disp=None, dir=None, loc=None, 859 cgi='http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi', **keywds) 860 861 Access search.cgi and return a handle to the results. See the 862 online help file for an explanation of the parameters: 863 http://scop.mrc-lmb.cam.ac.uk/scop/help.html 864 865 Raises an IOError if there's a network error. 866 867 """ 868 params = {'pdb' : pdb, 'key' : key, 'sid' : sid, 'disp' : disp, 869 'dir' : dir, 'loc' : loc} 870 variables = {} 871 for k, v in params.iteritems(): 872 if v is not None: 873 variables[k] = v 874 variables.update(keywds) 875 return _open(cgi, variables)
876 877
878 -def _open(cgi, params={}, get=1):
879 """_open(cgi, params={}, get=1) -> UndoHandle 880 881 Open a handle to SCOP. cgi is the URL for the cgi script to access. 882 params is a dictionary with the options to pass to it. get is a boolean 883 that describes whether a GET should be used. Does some 884 simple error checking, and will raise an IOError if it encounters one. 885 886 """ 887 import urllib 888 import urllib2 889 # Open a handle to SCOP. 890 options = urllib.urlencode(params) 891 try: 892 if get: # do a GET 893 if options: 894 cgi += "?" + options 895 handle = urllib2.urlopen(cgi) 896 else: # do a POST 897 handle = urllib2.urlopen(cgi, data=options) 898 except urllib2.HTTPError, exception: 899 raise exception 900 return handle
901