Package Bio :: Package SCOP
[hide private]
[frames] | no frames]

Source Code for Package Bio.SCOP

  1  # Copyright 2001 by Gavin E. Crooks.  All rights reserved. 
  2  # Modifications Copyright 2004/2005 James Casbon. All rights Reserved. 
  3  # Modifications Copyright 2010 Jeffrey Finkelstein. All rights reserved. 
  4  # 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  # 
  9  # Changes made by James Casbon: 
 10  # - New Astral class 
 11  # - SQL functionality for both Scop and Astral classes 
 12  # - All sunids are int not strings 
 13  # 
 14  # Code written by Jeffrey Chang to access SCOP over the internet, which 
 15  # was previously in Bio.WWW.SCOP, has now been merged into this module. 
 16   
 17   
 18  """ SCOP: Structural Classification of Proteins. 
 19   
 20  The SCOP database aims to provide a manually constructed classification of 
 21  all know protein structures into a hierarchy, the main levels of which 
 22  are family, superfamily and fold. 
 23   
 24  * "SCOP":http://scop.mrc-lmb.cam.ac.uk/scop/ 
 25   
 26  * "Introduction":http://scop.mrc-lmb.cam.ac.uk/scop/intro.html 
 27   
 28  * "SCOP parsable files":http://scop.mrc-lmb.cam.ac.uk/scop/parse/ 
 29   
 30  The Scop object in this module represents the entire SCOP classification. It 
 31  can be built from the three SCOP parsable files, modified is so desired, and 
 32  converted back to the same file formats. A single SCOP domain (represented 
 33  by the Domain class) can be obtained from Scop using the domain's SCOP 
 34  identifier (sid). 
 35   
 36   
 37  nodeCodeDict  -- A mapping between known 2 letter node codes and a longer 
 38                    description. The known node types are 'cl' (class), 'cf' 
 39                    (fold), 'sf' (superfamily), 'fa' (family), 'dm' (domain), 
 40                    'sp' (species), 'px' (domain). Additional node types may 
 41                    be added in the future. 
 42   
 43  This module also provides code to access SCOP over the WWW. 
 44   
 45  Functions: 
 46  search        -- Access the main CGI script. 
 47  _open         -- Internally used function. 
 48   
 49  """ 
 50   
 51  from __future__ import print_function 
 52   
 53  import os 
 54  import re 
 55   
 56  from . import Des 
 57  from . import Cla 
 58  from . import Hie 
 59  from . import Residues 
 60  from Bio import SeqIO 
 61  from Bio.Seq import Seq 
 62   
 63  __docformat__ = "restructuredtext en" 
 64   
 65  nodeCodeDict = {'cl': 'class', 'cf': 'fold', 'sf': 'superfamily', 
 66                  'fa': 'family', 'dm': 'protein', 'sp': 'species', 'px': 'domain'} 
 67   
 68   
 69  _nodetype_to_code= {'class': 'cl', 'fold': 'cf', 'superfamily': 'sf', 
 70                      'family': 'fa', 'protein': 'dm', 'species': 'sp', 'domain': 'px'} 
 71   
 72  nodeCodeOrder = ['ro', 'cl', 'cf', 'sf', 'fa', 'dm', 'sp', 'px'] 
 73   
 74  astralBibIds = [10, 20, 25, 30, 35, 40, 50, 70, 90, 95, 100] 
 75   
 76  astralEvs = [10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 1e-4, 1e-5, 1e-10, 1e-15, 
 77               1e-20, 1e-25, 1e-50] 
 78   
 79  astralEv_to_file = {10: 'e+1', 5: 'e+0,7', 1: 'e+0', 0.5: 'e-0,3', 0.1: 'e-1', 
 80                      0.05: 'e-1,3', 0.01: 'e-2', 0.005: 'e-2,3', 0.001: 'e-3', 
 81                      1e-4: 'e-4', 1e-5: 'e-5', 1e-10: 'e-10', 1e-15: 'e-15', 
 82                      1e-20: 'e-20', 1e-25: 'e-25', 1e-50: 'e-50'} 
 83   
 84  astralEv_to_sql = {10: 'e1', 5: 'e0_7', 1: 'e0', 0.5: 'e_0_3', 0.1: 'e_1', 
 85                     0.05: 'e_1_3', 0.01: 'e_2', 0.005: 'e_2_3', 0.001: 'e_3', 
 86                     1e-4: 'e_4', 1e-5: 'e_5', 1e-10: 'e_10', 1e-15: 'e_15', 
 87                     1e-20: 'e_20', 1e-25: 'e_25', 1e-50: 'e_50'} 
 88   
 89  try: 
 90      # See if the cmp function exists (will on Python 2) 
 91      _cmp = cmp 
 92  except NameError: 
93 - def _cmp(a, b):
94 """Implementation of cmp(x,y) for Python 3 (PRIVATE). 95 96 Based on Python 3 docs which say if you really need the cmp() 97 functionality, you could use the expression (a > b) - (a < b) 98 as the equivalent for cmp(a, b) 99 """ 100 return (a > b) - (a < b)
101 102
103 -def cmp_sccs(sccs1, sccs2):
104 """Order SCOP concise classification strings (sccs). 105 106 a.4.5.1 < a.4.5.11 < b.1.1.1 107 108 A sccs (e.g. a.4.5.11) compactly represents a domain's classification. 109 The letter represents the class, and the numbers are the fold, 110 superfamily, and family, respectively. 111 112 """ 113 114 s1 = sccs1.split(".") 115 s2 = sccs2.split(".") 116 117 if s1[0] != s2[0]: 118 return _cmp(s1[0], s2[0]) 119 120 s1 = [int(x) for x in s1[1:]] 121 s2 = [int(x) for x in s2[1:]] 122 123 return _cmp(s1, s2)
124 125 126 _domain_re = re.compile(r">?([\w_\.]*)\s+([\w\.]*)\s+\(([^)]*)\) (.*)") 127 128
129 -def parse_domain(str):
130 """Convert an ASTRAL header string into a Scop domain. 131 132 An ASTRAL (http://astral.stanford.edu/) header contains a concise 133 description of a SCOP domain. A very similar format is used when a 134 Domain object is converted into a string. The Domain returned by this 135 method contains most of the SCOP information, but it will not be located 136 within the SCOP hierarchy (i.e. The parent node will be None). The 137 description is composed of the SCOP protein and species descriptions. 138 139 A typical ASTRAL header looks like -- 140 >d1tpt_1 a.46.2.1 (1-70) Thymidine phosphorylase {Escherichia coli} 141 """ 142 143 m = _domain_re.match(str) 144 if (not m): 145 raise ValueError("Domain: "+ str) 146 147 dom = Domain() 148 dom.sid = m.group(1) 149 dom.sccs = m.group(2) 150 dom.residues = Residues.Residues(m.group(3)) 151 if not dom.residues.pdbid: 152 dom.residues.pdbid= dom.sid[1:5] 153 dom.description = m.group(4).strip() 154 155 return dom
156 157
158 -def _open_scop_file(scop_dir_path, version, filetype):
159 filename = "dir.%s.scop.txt_%s" % (filetype, version) 160 handle = open(os.path.join(scop_dir_path, filename)) 161 return handle
162 163
164 -class Scop(object):
165 """The entire SCOP hierarchy. 166 167 root -- The root node of the hierarchy 168 """
169 - def __init__(self, cla_handle=None, des_handle=None, hie_handle=None, 170 dir_path=None, db_handle=None, version=None):
171 """Build the SCOP hierarchy from the SCOP parsable files, or a sql backend. 172 173 If no file handles are given, then a Scop object with a single 174 empty root node is returned. 175 176 If a directory and version are given (with dir_path=.., version=...) or 177 file handles for each file, the whole scop tree will be built in memory. 178 179 If a MySQLdb database handle is given, the tree will be built as needed, 180 minimising construction times. To build the SQL database to the methods 181 write_xxx_sql to create the tables. 182 183 """ 184 self._sidDict = {} 185 self._sunidDict = {} 186 187 if all(h is None for h in [cla_handle, des_handle, hie_handle, dir_path, db_handle]): 188 return 189 190 if dir_path is None and db_handle is None: 191 if cla_handle is None or des_handle is None or hie_handle is None: 192 raise RuntimeError("Need CLA, DES and HIE files to build SCOP") 193 194 sunidDict = {} 195 196 self.db_handle = db_handle 197 try: 198 if db_handle: 199 # do nothing if we have a db handle, we'll do it all on the fly 200 pass 201 else: 202 # open SCOP parseable files 203 if dir_path: 204 if not version: 205 raise RuntimeError("Need SCOP version to find parsable files in directory") 206 if cla_handle or des_handle or hie_handle: 207 raise RuntimeError("Cannot specify SCOP directory and specific files") 208 209 cla_handle = _open_scop_file(dir_path, version, 'cla') 210 des_handle = _open_scop_file(dir_path, version, 'des') 211 hie_handle = _open_scop_file(dir_path, version, 'hie') 212 213 root = Node() 214 domains = [] 215 root.sunid = 0 216 root.type = 'ro' 217 sunidDict[root.sunid] = root 218 self.root = root 219 root.description = 'SCOP Root' 220 221 # Build the rest of the nodes using the DES file 222 records = Des.parse(des_handle) 223 for record in records: 224 if record.nodetype == 'px': 225 n = Domain() 226 n.sid = record.name 227 domains.append(n) 228 else: 229 n = Node() 230 n.sunid = record.sunid 231 n.type = record.nodetype 232 n.sccs = record.sccs 233 n.description = record.description 234 235 sunidDict[n.sunid] = n 236 237 # Glue all of the Nodes together using the HIE file 238 records = Hie.parse(hie_handle) 239 for record in records: 240 if record.sunid not in sunidDict: 241 print(record.sunid) 242 243 n = sunidDict[record.sunid] 244 245 if record.parent != '': # Not root node 246 247 if record.parent not in sunidDict: 248 raise ValueError("Incomplete data?") 249 250 n.parent = sunidDict[record.parent] 251 252 for c in record.children: 253 if c not in sunidDict: 254 raise ValueError("Incomplete data?") 255 n.children.append(sunidDict[c]) 256 257 # Fill in the gaps with information from the CLA file 258 sidDict = {} 259 records = Cla.parse(cla_handle) 260 for record in records: 261 n = sunidDict[record.sunid] 262 assert n.sccs == record.sccs 263 assert n.sid == record.sid 264 n.residues = record.residues 265 sidDict[n.sid] = n 266 267 # Clean up 268 self._sunidDict = sunidDict 269 self._sidDict = sidDict 270 self._domains = tuple(domains) 271 272 finally: 273 if dir_path: 274 # If we opened the files, we close the files 275 if cla_handle: 276 cla_handle.close() 277 if des_handle: 278 des_handle.close() 279 if hie_handle: 280 hie_handle.close()
281
282 - def getRoot(self):
283 return self.getNodeBySunid(0)
284
285 - def getDomainBySid(self, sid):
286 """Return a domain from its sid""" 287 if sid in self._sidDict: 288 return self._sidDict[sid] 289 if self.db_handle: 290 self.getDomainFromSQL(sid=sid) 291 if sid in self._sidDict: 292 return self._sidDict[sid] 293 else: 294 return None
295
296 - def getNodeBySunid(self, sunid):
297 """Return a node from its sunid""" 298 if sunid in self._sunidDict: 299 return self._sunidDict[sunid] 300 if self.db_handle: 301 self.getDomainFromSQL(sunid=sunid) 302 if sunid in self._sunidDict: 303 return self._sunidDict[sunid] 304 else: 305 return None
306
307 - def getDomains(self):
308 """Returns an ordered tuple of all SCOP Domains""" 309 if self.db_handle: 310 return self.getRoot().getDescendents('px') 311 else: 312 return self._domains
313
314 - def write_hie(self, handle):
315 """Build an HIE SCOP parsable file from this object""" 316 # We order nodes to ease comparison with original file 317 for n in sorted(self._sunidDict.values(), key=lambda n: n.sunid): 318 handle.write(str(n.toHieRecord()))
319
320 - def write_des(self, handle):
321 """Build a DES SCOP parsable file from this object""" 322 # Origional SCOP file is not ordered? 323 for n in sorted(self._sunidDict.values(), key=lambda n: n.sunid): 324 if n != self.root: 325 handle.write(str(n.toDesRecord()))
326
327 - def write_cla(self, handle):
328 """Build a CLA SCOP parsable file from this object""" 329 # We order nodes to ease comparison with original file 330 for n in sorted(self._sidDict.values(), key=lambda n: n.sunid): 331 handle.write(str(n.toClaRecord()))
332
333 - def getDomainFromSQL(self, sunid=None, sid=None):
334 """Load a node from the SQL backend using sunid or sid""" 335 if sunid is None and sid is None: 336 return None 337 338 cur = self.db_handle.cursor() 339 340 if sid: 341 cur.execute("SELECT sunid FROM cla WHERE sid=%s", sid) 342 res = cur.fetchone() 343 if res is None: 344 return None 345 sunid = res[0] 346 347 cur.execute("SELECT * FROM des WHERE sunid=%s", sunid) 348 data = cur.fetchone() 349 350 if data is not None: 351 n = None 352 353 # determine if Node or Domain 354 if data[1] != "px": 355 n = Node(scop=self) 356 357 cur.execute("SELECT child FROM hie WHERE parent=%s", sunid) 358 children = [] 359 for c in cur.fetchall(): 360 children.append(c[0]) 361 n.children = children 362 else: 363 n = Domain(scop=self) 364 cur.execute("select sid, residues, pdbid from cla where sunid=%s", 365 sunid) 366 367 [n.sid, n.residues, pdbid] = cur.fetchone() 368 n.residues = Residues.Residues(n.residues) 369 n.residues.pdbid=pdbid 370 self._sidDict[n.sid] = n 371 372 [n.sunid, n.type, n.sccs, n.description] = data 373 374 if data[1] != 'ro': 375 cur.execute("SELECT parent FROM hie WHERE child=%s", sunid) 376 n.parent = cur.fetchone()[0] 377 378 n.sunid = int(n.sunid) 379 380 self._sunidDict[n.sunid] = n
381
382 - def getAscendentFromSQL(self, node, type):
383 """Get ascendents using SQL backend""" 384 if nodeCodeOrder.index(type) >= nodeCodeOrder.index(node.type): 385 return None 386 387 cur = self.db_handle.cursor() 388 cur.execute("SELECT "+type+" from cla WHERE "+node.type+"=%s", (node.sunid)) 389 result = cur.fetchone() 390 if result is not None: 391 return self.getNodeBySunid(result[0]) 392 else: 393 return None
394
395 - def getDescendentsFromSQL(self, node, type):
396 """Get descendents of a node using the database backend. This avoids 397 repeated iteration of SQL calls and is therefore much quicker than 398 repeatedly calling node.getChildren(). 399 """ 400 if nodeCodeOrder.index(type) <= nodeCodeOrder.index(node.type): 401 return [] 402 403 des_list = [] 404 405 # SQL cla table knows nothing about 'ro' 406 if node.type == 'ro': 407 for c in node.getChildren(): 408 for d in self.getDescendentsFromSQL(c, type): 409 des_list.append(d) 410 return des_list 411 412 cur = self.db_handle.cursor() 413 414 if type != 'px': 415 cur.execute("SELECT DISTINCT des.sunid,des.type,des.sccs,description FROM \ 416 cla,des WHERE cla."+node.type+"=%s AND cla."+type+"=des.sunid", (node.sunid)) 417 data = cur.fetchall() 418 for d in data: 419 if int(d[0]) not in self._sunidDict: 420 n = Node(scop=self) 421 [n.sunid, n.type, n.sccs, n.description] = d 422 n.sunid=int(n.sunid) 423 self._sunidDict[n.sunid] = n 424 425 cur.execute("SELECT parent FROM hie WHERE child=%s", n.sunid) 426 n.parent = cur.fetchone()[0] 427 428 cur.execute("SELECT child FROM hie WHERE parent=%s", n.sunid) 429 children = [] 430 for c in cur.fetchall(): 431 children.append(c[0]) 432 n.children = children 433 434 des_list.append(self._sunidDict[int(d[0])]) 435 436 else: 437 cur.execute("SELECT cla.sunid,sid,pdbid,residues,cla.sccs,type,description,sp\ 438 FROM cla,des where cla.sunid=des.sunid and cla."+node.type+"=%s", 439 node.sunid) 440 441 data = cur.fetchall() 442 for d in data: 443 if int(d[0]) not in self._sunidDict: 444 n = Domain(scop=self) 445 # [n.sunid, n.sid, n.pdbid, n.residues, n.sccs, n.type, 446 # n.description,n.parent] = data 447 [n.sunid, n.sid, pdbid, n.residues, n.sccs, n.type, n.description, 448 n.parent] = d[0:8] 449 n.residues = Residues.Residues(n.residues) 450 n.residues.pdbid = pdbid 451 n.sunid = int(n.sunid) 452 self._sunidDict[n.sunid] = n 453 self._sidDict[n.sid] = n 454 455 des_list.append(self._sunidDict[int(d[0])]) 456 457 return des_list
458
459 - def write_hie_sql(self, handle):
460 """Write HIE data to SQL database""" 461 cur = handle.cursor() 462 463 cur.execute("DROP TABLE IF EXISTS hie") 464 cur.execute("CREATE TABLE hie (parent INT, child INT, PRIMARY KEY (child),\ 465 INDEX (parent) )") 466 467 for p in self._sunidDict.values(): 468 for c in p.children: 469 cur.execute("INSERT INTO hie VALUES (%s,%s)" % (p.sunid, c.sunid))
470
471 - def write_cla_sql(self, handle):
472 """Write CLA data to SQL database""" 473 cur = handle.cursor() 474 475 cur.execute("DROP TABLE IF EXISTS cla") 476 cur.execute("CREATE TABLE cla (sunid INT, sid CHAR(8), pdbid CHAR(4),\ 477 residues VARCHAR(50), sccs CHAR(10), cl INT, cf INT, sf INT, fa INT,\ 478 dm INT, sp INT, px INT, PRIMARY KEY (sunid), INDEX (SID) )") 479 480 for n in self._sidDict.values(): 481 c = n.toClaRecord() 482 cur.execute("INSERT INTO cla VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", 483 (n.sunid, n.sid, c.residues.pdbid, c.residues, n.sccs, 484 n.getAscendent('cl').sunid, n.getAscendent('cf').sunid, 485 n.getAscendent('sf').sunid, n.getAscendent('fa').sunid, 486 n.getAscendent('dm').sunid, n.getAscendent('sp').sunid, 487 n.sunid))
488
489 - def write_des_sql(self, handle):
490 """Write DES data to SQL database""" 491 cur = handle.cursor() 492 493 cur.execute("DROP TABLE IF EXISTS des") 494 cur.execute("CREATE TABLE des (sunid INT, type CHAR(2), sccs CHAR(10),\ 495 description VARCHAR(255),\ 496 PRIMARY KEY (sunid) )") 497 498 for n in self._sunidDict.values(): 499 cur.execute("INSERT INTO des VALUES (%s,%s,%s,%s)", 500 (n.sunid, n.type, n.sccs, n.description))
501 502
503 -class Node(object):
504 """ A node in the Scop hierarchy 505 506 sunid -- SCOP unique identifiers. e.g. '14986' 507 508 parent -- The parent node 509 510 children -- A list of child nodes 511 512 sccs -- SCOP concise classification string. e.g. 'a.1.1.2' 513 514 type -- A 2 letter node type code. e.g. 'px' for domains 515 516 description -- 517 518 """
519 - def __init__(self, scop=None):
520 """Create a Node in the scop hierarchy. If a Scop instance is provided to the 521 constructor, this will be used to lookup related references using the SQL 522 methods. If no instance is provided, it is assumed the whole tree exists 523 and is connected.""" 524 self.sunid = '' 525 self.parent = None 526 self.children = [] 527 self.sccs = '' 528 self.type = '' 529 self.description = '' 530 self.scop = scop
531
532 - def __str__(self):
533 s = [] 534 s.append(str(self.sunid)) 535 s.append(self.sccs) 536 s.append(self.type) 537 s.append(self.description) 538 539 return " ".join(s)
540
541 - def toHieRecord(self):
542 """Return an Hie.Record""" 543 rec = Hie.Record() 544 rec.sunid = str(self.sunid) 545 if self.getParent(): # Not root node 546 rec.parent = str(self.getParent().sunid) 547 else: 548 rec.parent = '-' 549 for c in self.getChildren(): 550 rec.children.append(str(c.sunid)) 551 return rec
552
553 - def toDesRecord(self):
554 """Return a Des.Record""" 555 rec = Des.Record() 556 rec.sunid = str(self.sunid) 557 rec.nodetype = self.type 558 rec.sccs = self.sccs 559 rec.description = self.description 560 return rec
561
562 - def getChildren(self):
563 """Return a list of children of this Node""" 564 if self.scop is None: 565 return self.children 566 else: 567 return [self.scop.getNodeBySunid(x) for x in self.children]
568
569 - def getParent(self):
570 """Return the parent of this Node""" 571 if self.scop is None: 572 return self.parent 573 else: 574 return self.scop.getNodeBySunid(self.parent)
575
576 - def getDescendents(self, node_type):
577 """ Return a list of all decendent nodes of the given type. Node type can a 578 two letter code or longer description. e.g. 'fa' or 'family' 579 """ 580 if node_type in _nodetype_to_code: 581 node_type = _nodetype_to_code[node_type] 582 583 nodes = [self] 584 if self.scop: 585 return self.scop.getDescendentsFromSQL(self, node_type) 586 while nodes[0].type != node_type: 587 if nodes[0].type == 'px': 588 return [] # Fell of the bottom of the hierarchy 589 child_list = [] 590 for n in nodes: 591 for child in n.getChildren(): 592 child_list.append(child) 593 nodes = child_list 594 595 return nodes
596
597 - def getAscendent(self, node_type):
598 """ Return the ancenstor node of the given type, or None.Node type can a 599 two letter code or longer description. e.g. 'fa' or 'family'""" 600 if node_type in _nodetype_to_code: 601 node_type = _nodetype_to_code[node_type] 602 603 if self.scop: 604 return self.scop.getAscendentFromSQL(self, node_type) 605 else: 606 n = self 607 if n.type == node_type: 608 return None 609 610 while n.type != node_type: 611 if n.type == 'ro': 612 return None # Fell of the top of the hierarchy 613 n = n.getParent() 614 615 return n
616 617
618 -class Domain(Node):
619 """ A SCOP domain. A leaf node in the Scop hierarchy. 620 621 - sid -- The SCOP domain identifier. e.g. ``"d5hbib_"`` 622 623 - residues -- A Residue object. It defines the collection 624 of PDB atoms that make up this domain. 625 """
626 - def __init__(self, scop=None):
627 Node.__init__(self, scop=scop) 628 self.sid = '' 629 self.residues = None
630
631 - def __str__(self):
632 s = [] 633 s.append(self.sid) 634 s.append(self.sccs) 635 s.append("("+str(self.residues)+")") 636 637 if not self.getParent(): 638 s.append(self.description) 639 else: 640 sp = self.getParent() 641 dm = sp.getParent() 642 s.append(dm.description) 643 s.append("{"+sp.description+"}") 644 645 return " ".join(s)
646
647 - def toDesRecord(self):
648 """Return a Des.Record""" 649 rec = Node.toDesRecord(self) 650 rec.name = self.sid 651 return rec
652
653 - def toClaRecord(self):
654 """Return a Cla.Record""" 655 rec = Cla.Record() 656 rec.sid = self.sid 657 rec.residues = self.residues 658 rec.sccs = self.sccs 659 rec.sunid = self.sunid 660 661 n = self 662 while n.sunid != 0: # Not root node 663 rec.hierarchy[n.type] = str(n.sunid) 664 n = n.getParent() 665 666 # Order does not matter in the hierarchy field. For more info, see 667 # http://scop.mrc-lmb.cam.ac.uk/scop/release-notes.html 668 # rec.hierarchy.reverse() 669 670 return rec
671 672
673 -class Astral(object):
674 """Abstraction of the ASTRAL database, which has sequences for all the SCOP domains, 675 as well as clusterings by percent id or evalue. 676 """ 677
678 - def __init__(self, dir_path=None, version=None, scop=None, 679 astral_file=None, db_handle=None):
680 """ 681 Initialise the astral database. 682 683 You must provide either a directory of SCOP files: 684 685 - dir_path - string, the path to location of the scopseq-x.xx directory 686 (not the directory itself), and 687 - version -a version number. 688 689 or, a FASTA file: 690 691 - astral_file - string, a path to a fasta file (which will be loaded in memory) 692 693 or, a MYSQL database: 694 695 - db_handle - a database handle for a MYSQL database containing a table 696 'astral' with the astral data in it. This can be created 697 using writeToSQL. 698 """ 699 700 if astral_file is None and dir_path is None and db_handle is None: 701 raise RuntimeError("Need either file handle, or (dir_path + " 702 + "version) or database handle to construct Astral") 703 if not scop: 704 raise RuntimeError("Must provide a Scop instance to construct") 705 706 self.scop = scop 707 self.db_handle = db_handle 708 709 if not astral_file and not db_handle: 710 if dir_path is None or version is None: 711 raise RuntimeError("must provide dir_path and version") 712 713 self.version = version 714 self.path = os.path.join(dir_path, "scopseq-%s" % version) 715 astral_file = "astral-scopdom-seqres-all-%s.fa" % self.version 716 astral_file = os.path.join(self.path, astral_file) 717 718 if astral_file: 719 # Build a dictionary of SeqRecord objects in the FASTA file, IN MEMORY 720 self.fasta_dict = SeqIO.to_dict(SeqIO.parse(astral_file, "fasta")) 721 722 self.astral_file = astral_file 723 self.EvDatasets = {} 724 self.EvDatahash = {} 725 self.IdDatasets = {} 726 self.IdDatahash = {}
727
728 - def domainsClusteredByEv(self, id):
729 """get domains clustered by evalue""" 730 if id not in self.EvDatasets: 731 if self.db_handle: 732 self.EvDatasets[id] = self.getAstralDomainsFromSQL(astralEv_to_sql[id]) 733 734 else: 735 if not self.path: 736 raise RuntimeError("No scopseq directory specified") 737 738 file_prefix = "astral-scopdom-seqres-sel-gs" 739 filename = "%s-e100m-%s-%s.id" % (file_prefix, astralEv_to_file[id], 740 self.version) 741 filename = os.path.join(self.path, filename) 742 self.EvDatasets[id] = self.getAstralDomainsFromFile(filename) 743 return self.EvDatasets[id]
744
745 - def domainsClusteredById(self, id):
746 """get domains clustered by percent id""" 747 if id not in self.IdDatasets: 748 if self.db_handle: 749 self.IdDatasets[id] = self.getAstralDomainsFromSQL("id"+str(id)) 750 else: 751 if not self.path: 752 raise RuntimeError("No scopseq directory specified") 753 754 file_prefix = "astral-scopdom-seqres-sel-gs" 755 filename = "%s-bib-%s-%s.id" % (file_prefix, id, self.version) 756 filename = os.path.join(self.path, filename) 757 self.IdDatasets[id] = self.getAstralDomainsFromFile(filename) 758 return self.IdDatasets[id]
759
760 - def getAstralDomainsFromFile(self, filename=None, file_handle=None):
761 """Get the scop domains from a file containing a list of sids""" 762 if file_handle is None and filename is None: 763 raise RuntimeError("You must provide a filename or handle") 764 if not file_handle: 765 file_handle = open(filename) 766 doms = [] 767 while True: 768 line = file_handle.readline() 769 if not line: 770 break 771 line = line.rstrip() 772 doms.append(line) 773 if filename: 774 file_handle.close() 775 776 doms = [a for a in doms if a[0]=='d'] 777 doms = [self.scop.getDomainBySid(x) for x in doms] 778 return doms
779
780 - def getAstralDomainsFromSQL(self, column):
781 """Load a set of astral domains from a column in the astral table of a MYSQL 782 database (which can be created with writeToSQL(...)""" 783 cur = self.db_handle.cursor() 784 cur.execute("SELECT sid FROM astral WHERE "+column+"=1") 785 data = cur.fetchall() 786 data = [self.scop.getDomainBySid(x[0]) for x in data] 787 788 return data
789
790 - def getSeqBySid(self, domain):
791 """get the seq record of a given domain from its sid""" 792 if self.db_handle is None: 793 return self.fasta_dict[domain].seq 794 else: 795 cur = self.db_handle.cursor() 796 cur.execute("SELECT seq FROM astral WHERE sid=%s", domain) 797 return Seq(cur.fetchone()[0])
798
799 - def getSeq(self, domain):
800 """Return seq associated with domain""" 801 return self.getSeqBySid(domain.sid)
802
803 - def hashedDomainsById(self, id):
804 """Get domains clustered by sequence identity in a dict""" 805 if id not in self.IdDatahash: 806 self.IdDatahash[id] = {} 807 for d in self.domainsClusteredById(id): 808 self.IdDatahash[id][d] = 1 809 return self.IdDatahash[id]
810
811 - def hashedDomainsByEv(self, id):
812 """Get domains clustered by evalue in a dict""" 813 if id not in self.EvDatahash: 814 self.EvDatahash[id] = {} 815 for d in self.domainsClusteredByEv(id): 816 self.EvDatahash[id][d] = 1 817 return self.EvDatahash[id]
818
819 - def isDomainInId(self, dom, id):
820 """Returns true if the domain is in the astral clusters for percent ID""" 821 return dom in self.hashedDomainsById(id)
822
823 - def isDomainInEv(self, dom, id):
824 """Returns true if the domain is in the ASTRAL clusters for evalues""" 825 return dom in self.hashedDomainsByEv(id)
826
827 - def writeToSQL(self, db_handle):
828 """Write the ASTRAL database to a MYSQL database""" 829 cur = db_handle.cursor() 830 831 cur.execute("DROP TABLE IF EXISTS astral") 832 cur.execute("CREATE TABLE astral (sid CHAR(8), seq TEXT, PRIMARY KEY (sid))") 833 834 for dom in self.fasta_dict: 835 cur.execute("INSERT INTO astral (sid,seq) values (%s,%s)", 836 (dom, self.fasta_dict[dom].seq.data)) 837 838 for i in astralBibIds: 839 cur.execute("ALTER TABLE astral ADD (id"+str(i)+" TINYINT)") 840 841 for d in self.domainsClusteredById(i): 842 cur.execute("UPDATE astral SET id"+str(i)+"=1 WHERE sid=%s", 843 d.sid) 844 845 for ev in astralEvs: 846 cur.execute("ALTER TABLE astral ADD ("+astralEv_to_sql[ev]+" TINYINT)") 847 848 for d in self.domainsClusteredByEv(ev): 849 cur.execute("UPDATE astral SET "+astralEv_to_sql[ev]+"=1 WHERE sid=%s", 850 d.sid)
851 852
853 -def search(pdb=None, key=None, sid=None, disp=None, dir=None, loc=None, 854 cgi='http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi', **keywds):
855 """search(pdb=None, key=None, sid=None, disp=None, dir=None, loc=None, 856 cgi='http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi', **keywds) 857 858 Access search.cgi and return a handle to the results. See the 859 online help file for an explanation of the parameters: 860 http://scop.mrc-lmb.cam.ac.uk/scop/help.html 861 862 Raises an IOError if there's a network error. 863 864 """ 865 params = {'pdb': pdb, 'key': key, 'sid': sid, 'disp': disp, 866 'dir': dir, 'loc': loc} 867 variables = {} 868 for k, v in params.items(): 869 if v is not None: 870 variables[k] = v 871 variables.update(keywds) 872 return _open(cgi, variables)
873 874
875 -def _open(cgi, params={}, get=1):
876 """_open(cgi, params={}, get=1) -> UndoHandle 877 878 Open a handle to SCOP. cgi is the URL for the cgi script to access. 879 params is a dictionary with the options to pass to it. get is a boolean 880 that describes whether a GET should be used. Does some 881 simple error checking, and will raise an IOError if it encounters one. 882 883 """ 884 from Bio._py3k import urlopen, urlencode 885 886 # Open a handle to SCOP. 887 options = urlencode(params) 888 if get: # do a GET 889 if options: 890 cgi += "?" + options 891 handle = urlopen(cgi) 892 else: # do a POST 893 handle = urlopen(cgi, data=options) 894 return handle
895