Package Bio :: Package SearchIO :: Package _model :: Module hsp
[hide private]
[frames] | no frames]

Source Code for Module Bio.SearchIO._model.hsp

   1  # Copyright 2012 by Wibowo Arindrarto.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """Bio.SearchIO objects to model high scoring regions between query and hit.""" 
   7   
   8  from __future__ import print_function 
   9  from Bio._py3k import basestring 
  10   
  11  import warnings 
  12  from operator import ge, le 
  13   
  14  from Bio import BiopythonWarning 
  15  from Bio.Align import MultipleSeqAlignment 
  16  from Bio.Alphabet import single_letter_alphabet 
  17  from Bio.Seq import Seq 
  18  from Bio.SeqRecord import SeqRecord 
  19   
  20  from Bio._utils import getattr_str, trim_str 
  21  from Bio.SearchIO._utils import singleitem, allitems, fullcascade, \ 
  22          fragcascade 
  23   
  24  from ._base import _BaseHSP 
  25   
  26   
  27  __docformat__ = "restructuredtext en" 
  28   
  29   
30 -class HSP(_BaseHSP):
31 32 """Class representing high-scoring region(s) between query and hit. 33 34 HSP (high-scoring pair) objects are contained by Hit objects (see Hit). 35 In most cases, HSP objects store the bulk of the statistics and results 36 (e.g. e-value, bitscores, query sequence, etc.) produced by a search 37 program. 38 39 Depending on the search output file format, a given HSP will contain one 40 or more HSPFragment object(s). Examples of search programs that produce HSP 41 with one HSPFragments are BLAST, HMMER, and FASTA. Other programs such as 42 BLAT or Exonerate may produce HSPs containing more than one HSPFragment. 43 However, their native terminologies may differ: in BLAT these fragments 44 are called 'blocks' while in Exonerate they are called exons or NER. 45 46 Here are examples from each type of HSP. The first one comes from a BLAST 47 search:: 48 49 >>> from Bio import SearchIO 50 >>> blast_qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml')) 51 >>> blast_hsp = blast_qresult[1][0] # the first HSP from the second hit 52 >>> blast_hsp 53 HSP(hit_id='gi|301171311|ref|NR_035856.1|', query_id='33211', 1 fragments) 54 >>> print(blast_hsp) 55 Query: 33211 mir_1 56 Hit: gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA mir-520b ... 57 Query range: [1:61] (1) 58 Hit range: [0:60] (1) 59 Quick stats: evalue 1.7e-22; bitscore 109.49 60 Fragments: 1 (60 columns) 61 Query - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 62 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 63 Hit - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 64 65 For HSPs with a single HSPFragment, you can invoke ``print`` on it and see the 66 underlying sequence alignment, if it exists. This is not the case for HSPs 67 with more than one HSPFragment. Below is an example, using an HSP from a 68 BLAT search. Invoking ``print`` on these HSPs will instead show a table of the 69 HSPFragment objects it contains:: 70 71 >>> blat_qresult = SearchIO.read('Blat/mirna.pslx', 'blat-psl', pslx=True) 72 >>> blat_hsp = blat_qresult[1][0] # the first HSP from the second hit 73 >>> blat_hsp 74 HSP(hit_id='chr11', query_id='blat_1', 2 fragments) 75 >>> print(blat_hsp) 76 Query: blat_1 <unknown description> 77 Hit: chr11 <unknown description> 78 Query range: [42:67] (-1) 79 Hit range: [59018929:59018955] (1) 80 Quick stats: evalue ?; bitscore ? 81 Fragments: --- -------------- ---------------------- ---------------------- 82 # Span Query range Hit range 83 --- -------------- ---------------------- ---------------------- 84 0 6 [61:67] [59018929:59018935] 85 1 16 [42:58] [59018939:59018955] 86 87 Notice that in HSPs with more than one HSPFragments, the HSP's ``query_range`` 88 ``hit_range`` properties encompasses all fragments it contains. 89 90 You can check whether an HSP has more than one HSPFragments or not using the 91 ``is_fragmented`` property:: 92 93 >>> blast_hsp.is_fragmented 94 False 95 >>> blat_hsp.is_fragmented 96 True 97 98 Since HSP objects are also containers similar to Python lists, you can 99 access a single fragment in an HSP using its integer index:: 100 101 >>> blat_fragment = blat_hsp[0] 102 >>> print(blat_fragment) 103 Query: blat_1 <unknown description> 104 Hit: chr11 <unknown description> 105 Query range: [61:67] (-1) 106 Hit range: [59018929:59018935] (1) 107 Fragments: 1 (6 columns) 108 Query - tatagt 109 Hit - tatagt 110 111 This applies to HSPs objects with a single fragment as well:: 112 113 >>> blast_fragment = blast_hsp[0] 114 >>> print(blast_fragment) 115 Query: 33211 mir_1 116 Hit: gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA mir-520b ... 117 Query range: [1:61] (1) 118 Hit range: [0:60] (1) 119 Fragments: 1 (60 columns) 120 Query - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 121 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 122 Hit - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 123 124 Regardless of the search output file format, HSP objects provide the 125 properties listed below. These properties always return values in a list, 126 due to the HSP object itself being a list-like container. However, for 127 HSP objects with a single HSPFragment, shortcut properties that fetches 128 the item from the list are also provided. 129 130 +----------------------+---------------------+-----------------------------+ 131 | Property | Shortcut | Value | 132 +======================+=====================+=============================+ 133 | aln_all | aln | HSP alignments as | 134 | | | MultipleSeqAlignment object | 135 +----------------------+---------------------+-----------------------------+ 136 | aln_annotation_all | aln_annotation | dictionary of annotation(s) | 137 | | | of all fragments' alignments| 138 +----------------------+---------------------+-----------------------------+ 139 | fragments | fragment | HSPFragment objects | 140 +----------------------+---------------------+-----------------------------+ 141 | hit_all | hit | hit sequence as SeqRecord | 142 | | | objects | 143 +----------------------+---------------------+-----------------------------+ 144 | hit_features_all | hit_features | SeqFeatures of all hit | 145 | | | fragments | 146 +----------------------+---------------------+-----------------------------+ 147 | hit_start_all | hit_start* | start coordinates of the | 148 | | | hit fragments | 149 +----------------------+---------------------+-----------------------------+ 150 | hit_end_all | hit_end* | end coordinates of the hit | 151 | | | fragments | 152 +----------------------+---------------------+-----------------------------+ 153 | hit_span_all | hit_span* | sizes of each hit fragments | 154 +----------------------+---------------------+-----------------------------+ 155 | hit_strand_all | hit_strand | strand orientations of the | 156 | | | hit fragments | 157 +----------------------+---------------------+-----------------------------+ 158 | hit_frame_all | hit_frame | reading frames of the hit | 159 | | | fragments | 160 +----------------------+---------------------+-----------------------------+ 161 | hit_range_all | hit_range | tuples of start and end | 162 | | | coordinates of each hit | 163 | | | fragment | 164 +----------------------+---------------------+-----------------------------+ 165 | query_all | query | query sequence as SeqRecord | 166 | | | object | 167 +----------------------+---------------------+-----------------------------+ 168 | query_features_all | query_features | SeqFeatures of all query | 169 | | | fragments | 170 +----------------------+---------------------+-----------------------------+ 171 | query_start_all | query_start* | start coordinates of the | 172 | | | fragments | 173 +----------------------+---------------------+-----------------------------+ 174 | query_end_all | query_end* | end coordinates of the | 175 | | | query fragments | 176 +----------------------+---------------------+-----------------------------+ 177 | query_span_all | query_span* | sizes of each query | 178 | | | fragments | 179 +----------------------+---------------------+-----------------------------+ 180 | query_strand_all | query_strand | strand orientations of the | 181 | | | query fragments | 182 +----------------------+---------------------+-----------------------------+ 183 | query_frame_all | query_frame | reading frames of the query | 184 | | | fragments | 185 +----------------------+---------------------+-----------------------------+ 186 | query_range_all | query_range | tuples of start and end | 187 | | | coordinates of each query | 188 | | | fragment | 189 +----------------------+---------------------+-----------------------------+ 190 191 For all types of HSP objects, the property will return the values in a list. 192 Shorcuts are only applicable for HSPs with one fragment. Except the ones 193 noted, if they are used on an HSP with more than one fragments, an exception 194 will be raised. 195 196 For properties that may be used in HSPs with multiple or single fragments 197 (``*_start``, ``*_end``, and ``*_span`` properties), their interpretation depends 198 on how many fragment the HSP has: 199 200 +------------+---------------------------------------------------+ 201 | Property | Value | 202 +============+===================================================+ 203 | hit_start | smallest coordinate value of all hit fragments | 204 +------------+---------------------------------------------------+ 205 | hit_end | largest coordinate value of all hit fragments | 206 +------------+---------------------------------------------------+ 207 | hit_span | difference between ``hit_start`` and ``hit_end`` | 208 +------------+---------------------------------------------------+ 209 | query_start| smallest coordinate value of all query fragments | 210 +------------+---------------------------------------------------+ 211 | query_end | largest coordinate value of all query fragments | 212 +------------+---------------------------------------------------+ 213 | query_span | difference between ``query_start`` and | 214 | | ``query_end`` | 215 +------------+---------------------------------------------------+ 216 217 In addition to the objects listed above, HSP objects also provide the 218 following properties: 219 220 +--------------------+------------------------------------------------------+ 221 | Property | Value | 222 +====================+======================================================+ 223 | aln_span | total number of residues in all HSPFragment objects | 224 +--------------------+------------------------------------------------------+ 225 | alphabet | alphabet used in hit and query SeqRecord objects | 226 +--------------------+------------------------------------------------------+ 227 | is_fragmented | boolean, whether there are multiple fragments or not | 228 +--------------------+------------------------------------------------------+ 229 | hit_id | ID of the hit sequence | 230 +--------------------+------------------------------------------------------+ 231 | hit_description | description of the hit sequence | 232 +--------------------+------------------------------------------------------+ 233 | hit_inter_ranges | list of hit sequence coordinates of the regions | 234 | | between fragments | 235 +--------------------+------------------------------------------------------+ 236 | hit_inter_spans | list of lengths of the regions between hit fragments | 237 +--------------------+------------------------------------------------------+ 238 | query_id | ID of the query sequence | 239 +--------------------+------------------------------------------------------+ 240 | query_description | description of the query sequence | 241 +--------------------+------------------------------------------------------+ 242 | query_inter_ranges | list of query sequence coordinates of the regions | 243 | | between fragments | 244 +--------------------+------------------------------------------------------+ 245 | query_inter_spans | list of lengths of the regions between query | 246 | | fragments | 247 +--------------------+------------------------------------------------------+ 248 249 .. [1] may be used in HSPs with multiple fragments 250 251 """ 252 # attributes we don't want to transfer when creating a new Hit class 253 # from this one 254 _NON_STICKY_ATTRS = ('_items', ) 255
256 - def __init__(self, fragments=[]):
257 """Initializes an HSP object. 258 259 :param fragments: fragments contained in the HSP object 260 :type fragments: iterable yielding HSPFragment 261 262 HSP objects must be initialized with a list containing at least one 263 HSPFragment object. If multiple HSPFragment objects are used for 264 initialization, they must all have the same ``query_id``, 265 ``query_description``, ``hit_id``, ``hit_description``, and alphabet 266 properties. 267 268 """ 269 if not fragments: 270 raise ValueError("HSP objects must have at least one HSPFragment " 271 "object.") 272 # check that all fragments contain the same IDs, descriptions, alphabet 273 for attr in ('query_id', 'query_description', 'hit_id', 274 'hit_description', 'alphabet'): 275 if len(set(getattr(frag, attr) for frag in fragments)) != 1: 276 raise ValueError("HSP object can not contain fragments with " 277 "more than one %s." % attr) 278 279 self._items = [] 280 for fragment in fragments: 281 self._validate_fragment(fragment) 282 self._items.append(fragment)
283
284 - def __repr__(self):
285 return "%s(hit_id=%r, query_id=%r, %r fragments)" % \ 286 (self.__class__.__name__, self.hit_id, self.query_id, len(self))
287
288 - def __iter__(self):
289 return iter(self._items)
290
291 - def __contains__(self, fragment):
292 return fragment in self._items
293
294 - def __len__(self):
295 return len(self._items)
296 297 # Python 3:
298 - def __bool__(self):
299 return bool(self._items)
300 301 # Python 2: 302 __nonzero__= __bool__ 303
304 - def __str__(self):
305 306 lines = [] 307 # set hsp info line 308 statline = [] 309 # evalue 310 evalue = getattr_str(self, 'evalue', fmt='%.2g') 311 statline.append('evalue ' + evalue) 312 # bitscore 313 bitscore = getattr_str(self, 'bitscore', fmt='%.2f') 314 statline.append('bitscore ' + bitscore) 315 lines.append('Quick stats: ' + '; '.join(statline)) 316 317 if len(self.fragments) == 1: 318 return '\n'.join([self._str_hsp_header(), '\n'.join(lines), 319 self.fragments[0]._str_aln()]) 320 else: 321 lines.append(' Fragments: %s %s %s %s' % 322 ('-'*3, '-'*14, '-'*22, '-'*22)) 323 pattern = '%16s %14s %22s %22s' 324 lines.append(pattern % ('#', 'Span', 'Query range', 'Hit range')) 325 lines.append(pattern % ('-'*3, '-'*14, '-'*22, '-'*22)) 326 for idx, block in enumerate(self.fragments): 327 # set hsp line and table 328 # alignment span 329 aln_span = getattr_str(block, 'aln_span') 330 # query region 331 query_start = getattr_str(block, 'query_start') 332 query_end = getattr_str(block, 'query_end') 333 query_range = '[%s:%s]' % (query_start, query_end) 334 # max column length is 20 335 query_range = trim_str(query_range, 22, '~]') 336 # hit region 337 hit_start = getattr_str(block, 'hit_start') 338 hit_end = getattr_str(block, 'hit_end') 339 hit_range = '[%s:%s]' % (hit_start, hit_end) 340 hit_range = trim_str(hit_range, 22, '~]') 341 # append the hsp row 342 lines.append(pattern % (str(idx), aln_span, query_range, hit_range)) 343 344 return self._str_hsp_header() + '\n' + '\n'.join(lines)
345
346 - def __getitem__(self, idx):
347 # if key is slice, return a new HSP instance 348 if isinstance(idx, slice): 349 obj = self.__class__(self._items[idx]) 350 self._transfer_attrs(obj) 351 return obj 352 return self._items[idx]
353
354 - def __setitem__(self, idx, fragments):
355 # handle case if hsps is a list of hsp 356 if isinstance(fragments, (list, tuple)): 357 for fragment in fragments: 358 self._validate_fragment(fragment) 359 else: 360 self._validate_fragment(fragments) 361 362 self._items[idx] = fragments
363
364 - def __delitem__(self, idx):
365 # note that this may result in an empty HSP object, which should be 366 # invalid 367 del self._items[idx]
368
369 - def _validate_fragment(self, fragment):
370 if not isinstance(fragment, HSPFragment): 371 raise TypeError("HSP objects can only contain HSPFragment " 372 "objects.") 373 # HACK: to make validation during __init__ work 374 if self._items: 375 if fragment.hit_id != self.hit_id: 376 raise ValueError("Expected HSPFragment with hit ID %r, " 377 "found %r instead." % (self.id, fragment.hit_id)) 378 379 if fragment.hit_description != self.hit_description: 380 raise ValueError("Expected HSPFragment with hit " 381 "description %r, found %r instead." % 382 (self.description, fragment.hit_description)) 383 384 if fragment.query_id != self.query_id: 385 raise ValueError("Expected HSPFragment with query ID %r, " 386 "found %r instead." % (self.query_id, 387 fragment.query_id)) 388 389 if fragment.query_description != self.query_description: 390 raise ValueError("Expected HSP with query description %r, " 391 "found %r instead." % (self.query_description, 392 fragment.query_description))
393
394 - def _aln_span_get(self):
395 # length of all alignments 396 # alignment span can be its own attribute, or computed from 397 # query / hit length 398 return sum(frg.aln_span for frg in self.fragments)
399 400 aln_span = property(fget=_aln_span_get, 401 doc="""Total number of columns in all HSPFragment objects.""") 402 403 # coordinate properties #
404 - def _get_coords(self, seq_type, coord_type):
405 assert seq_type in ('hit', 'query') 406 assert coord_type in ('start', 'end') 407 coord_name = '%s_%s' % (seq_type, coord_type) 408 coords = [getattr(frag, coord_name) for frag in self.fragments] 409 if None in coords: 410 warnings.warn("'None' exist in %s coordinates; ignored" % 411 (coord_name), BiopythonWarning) 412 return coords
413
414 - def _hit_start_get(self):
415 return min(self._get_coords('hit', 'start'))
416 417 hit_start = property(fget=_hit_start_get, 418 doc="""Smallest coordinate value of all hit fragments""") 419
420 - def _query_start_get(self):
421 return min(self._get_coords('query', 'start'))
422 423 query_start = property(fget=_query_start_get, 424 doc="""Smallest coordinate value of all query fragments""") 425
426 - def _hit_end_get(self):
427 return max(self._get_coords('hit', 'end'))
428 429 hit_end = property(fget=_hit_end_get, 430 doc="""Largest coordinate value of all hit fragments""") 431
432 - def _query_end_get(self):
433 return max(self._get_coords('query', 'end'))
434 435 query_end = property(fget=_query_end_get, 436 doc="""Largest coordinate value of all hit fragments""") 437 438 # coordinate-dependent properties #
439 - def _hit_span_get(self):
440 try: 441 return self.hit_end - self.hit_start 442 except TypeError: # triggered if any of the coordinates are None 443 return None
444 445 hit_span = property(fget=_hit_span_get, 446 doc="""The number of hit residues covered by the HSP.""") 447
448 - def _query_span_get(self):
449 try: 450 return self.query_end - self.query_start 451 except TypeError: # triggered if any of the coordinates are None 452 return None
453 454 query_span = property(fget=_query_span_get, 455 doc="""The number of query residues covered by the HSP.""") 456
457 - def _hit_range_get(self):
458 return (self.hit_start, self.hit_end)
459 460 hit_range = property(fget=_hit_range_get, 461 doc="""Tuple of HSP hit start and end coordinates.""") 462
463 - def _query_range_get(self):
464 return (self.query_start, self.query_end)
465 466 query_range = property(fget=_query_range_get, 467 doc="""Tuple of HSP query start and end coordinates.""") 468
469 - def _inter_ranges_get(self, seq_type):
470 # this property assumes that there are no mixed strands in a hit/query 471 assert seq_type in ('query', 'hit') 472 strand = getattr(self, '%s_strand_all' % seq_type)[0] 473 coords = getattr(self, '%s_range_all' % seq_type) 474 # determine function used to set inter range 475 # start and end coordinates, given two pairs 476 # of fragment start and end coordinates 477 if strand == -1: 478 startfunc, endfunc = min, max 479 else: 480 startfunc, endfunc = max, min 481 inter_coords = [] 482 for idx, coord in enumerate(coords[:-1]): 483 start = startfunc(coords[idx]) 484 end = endfunc(coords[idx+1]) 485 inter_coords.append((min(start, end), max(start, end))) 486 487 return inter_coords
488
489 - def _hit_inter_ranges_get(self):
490 return self._inter_ranges_get('hit')
491 492 hit_inter_ranges = property(fget=_hit_inter_ranges_get, 493 doc="""Hit sequence coordinates of the regions between fragments""") 494
495 - def _query_inter_ranges_get(self):
496 return self._inter_ranges_get('query')
497 498 query_inter_ranges = property(fget=_query_inter_ranges_get, 499 doc="""Query sequence coordinates of the regions between fragments""") 500
501 - def _inter_spans_get(self, seq_type):
502 assert seq_type in ('query', 'hit') 503 attr_name = '%s_inter_ranges' % seq_type 504 return [coord[1] - coord[0] for coord in getattr(self, attr_name)]
505
506 - def _hit_inter_spans_get(self):
507 return self._inter_spans_get('hit')
508 509 hit_inter_spans = property(fget=_hit_inter_spans_get, 510 doc="""Lengths of regions between hit fragments""") 511
512 - def _query_inter_spans_get(self):
513 return self._inter_spans_get('query')
514 515 query_inter_spans = property(fget=_query_inter_spans_get, 516 doc="""Lengths of regions between query fragments""") 517 518 # shortcuts for fragments' properties # 519 520 # bool check if there's more than one fragments 521 is_fragmented = property(lambda self: len(self) > 1, 522 doc="""Whether the HSP has more than one HSPFragment objects""") 523 524 # first item properties with setters 525 hit_description = fullcascade('hit_description', 526 doc="""Description of the hit sequence""") 527 528 query_description = fullcascade('query_description', 529 doc="""Description of the query sequence""") 530 531 hit_id = fullcascade('hit_id', 532 doc="""ID of the hit sequence""") 533 534 query_id = fullcascade('query_id', 535 doc="""ID of the query sequence""") 536 537 alphabet = fullcascade('alphabet', 538 doc="""Alphabet used in hit and query SeqRecord objects""") 539 540 # properties for single-fragment HSPs 541 fragment = singleitem( 542 doc="""HSPFragment object, first fragment""") 543 544 hit = singleitem('hit', 545 doc="""Hit sequence as a SeqRecord object, first fragment""") 546 547 query = singleitem('query', 548 doc="""Query sequence as a SeqRecord object, first fragment""") 549 550 aln = singleitem('aln', 551 doc="""Alignment of the first fragment as a MultipleSeqAlignment object""") 552 553 aln_annotation = singleitem('aln_annotation', 554 doc="""Dictionary of annotation(s) of the first fragment's alignment""") 555 556 hit_features = singleitem('hit_features', 557 doc="""Hit sequence features, first fragment""") 558 559 query_features = singleitem('query_features', 560 doc="""Query sequence features, first fragment""") 561 562 hit_strand = singleitem('hit_strand', 563 doc="""Hit strand orientation, first fragment""") 564 565 query_strand = singleitem('query_strand', 566 doc="""Query strand orientation, first fragment""") 567 568 hit_frame = singleitem('hit_frame', 569 doc="""Hit sequence reading frame, first fragment""") 570 571 query_frame = singleitem('query_frame', 572 doc="""Query sequence reading frame, first fragment""") 573 574 # properties for multi-fragment HSPs 575 fragments = allitems(doc="""List of all HSPFragment objects""") 576 577 hit_all = allitems('hit', 578 doc="""List of all fragments' hit sequences as SeqRecord objects""") 579 580 query_all = allitems('query', 581 doc="""List of all fragments' query sequences as SeqRecord objects""") 582 583 aln_all = allitems('aln', 584 doc="""List of all fragments' alignments as MultipleSeqAlignment objects""") 585 586 aln_annotation_all = allitems('aln_annotation', 587 doc="""Dictionary of annotation(s) of all fragments' alignments""") 588 589 hit_features_all = allitems('hit_features', 590 doc="""List of all hit sequence features""") 591 592 query_features_all = allitems('query_features', 593 doc="""List of all query sequence features""") 594 595 hit_strand_all = allitems('hit_strand', 596 doc="""List of all fragments' hit sequence strands""") 597 598 query_strand_all = allitems('query_strand', 599 doc="""List of all fragments' query sequence strands""") 600 601 hit_frame_all = allitems('hit_frame', 602 doc="""List of all fragments' hit sequence reading frames""") 603 604 query_frame_all = allitems('query_frame', 605 doc="""List of all fragments' query sequence reading frames""") 606 607 hit_start_all = allitems('hit_start', 608 doc="""List of all fragments' hit start coordinates""") 609 610 query_start_all = allitems('query_start', 611 doc="""List of all fragments' query start coordinates""") 612 613 hit_end_all = allitems('hit_end', 614 doc="""List of all fragments' hit end coordinates""") 615 616 query_end_all = allitems('query_end', 617 doc="""List of all fragments' query end coordinates""") 618 619 hit_span_all = allitems('hit_span', 620 doc="""List of all fragments' hit sequence size""") 621 622 query_span_all = allitems('query_span', 623 doc="""List of all fragments' query sequence size""") 624 625 hit_range_all = allitems('hit_range', 626 doc="""List of all fragments' hit start and end coordinates""") 627 628 query_range_all = allitems('query_range', 629 doc="""List of all fragments' query start and end coordinates""")
630 631
632 -class HSPFragment(_BaseHSP):
633 634 """Class representing a contiguous alignment of hit-query sequence. 635 636 HSPFragment forms the core of any parsed search output file. Depending on 637 the search output file format, it may contain the actual query and/or hit 638 sequences that produces the search hits. These sequences are stored as 639 SeqRecord objects (see SeqRecord): 640 641 >>> from Bio import SearchIO 642 >>> qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml')) 643 >>> fragment = qresult[0][0][0] # first hit, first hsp, first fragment 644 >>> print(fragment) 645 Query: 33211 mir_1 646 Hit: gi|262205317|ref|NR_030195.1| Homo sapiens microRNA 520b (MIR520... 647 Query range: [0:61] (1) 648 Hit range: [0:61] (1) 649 Fragments: 1 (61 columns) 650 Query - CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 651 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 652 Hit - CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 653 654 # the query sequence is a SeqRecord object 655 >>> fragment.query.__class__ 656 <class 'Bio.SeqRecord.SeqRecord'> 657 >>> print(fragment.query) 658 ID: 33211 659 Name: aligned query sequence 660 Description: mir_1 661 Number of features: 0 662 Seq('CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTT...GGG', DNAAlphabet()) 663 664 # the hit sequence is a SeqRecord object as well 665 >>> fragment.hit.__class__ 666 <class 'Bio.SeqRecord.SeqRecord'> 667 >>> print(fragment.hit) 668 ID: gi|262205317|ref|NR_030195.1| 669 Name: aligned hit sequence 670 Description: Homo sapiens microRNA 520b (MIR520B), microRNA 671 Number of features: 0 672 Seq('CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTT...GGG', DNAAlphabet()) 673 674 # when both query and hit are present, we get a MultipleSeqAlignment object 675 >>> fragment.aln.__class__ 676 <class 'Bio.Align.MultipleSeqAlignment'> 677 >>> print(fragment.aln) 678 DNAAlphabet() alignment with 2 rows and 61 columns 679 CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAG...GGG 33211 680 CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAG...GGG gi|262205317|ref|NR_030195.1| 681 682 """ 683
684 - def __init__(self, hit_id='<unknown id>', query_id='<unknown id>', 685 hit=None, query=None, alphabet=single_letter_alphabet):
686 687 self._alphabet = alphabet 688 self.aln_annotation = {} 689 690 self._hit_id = hit_id 691 self._query_id = query_id 692 693 for seq_type in ('query', 'hit'): 694 # query or hit attributes default attributes 695 setattr(self, '_%s_description' % seq_type, '<unknown description>') 696 setattr(self, '_%s_features' % seq_type, []) 697 # query or hit attributes whose default attribute is None 698 for attr in ('strand', 'frame', 'start', 'end'): 699 setattr(self, '%s_%s' % (seq_type, attr), None) 700 # self.query or self.hit 701 if eval(seq_type): 702 setattr(self, seq_type, eval(seq_type)) 703 else: 704 setattr(self, seq_type, None)
705
706 - def __repr__(self):
707 info = "hit_id=%r, query_id=%r" % (self.hit_id, self.query_id) 708 try: 709 info += ", %i columns" % len(self) 710 except AttributeError: 711 pass 712 return "%s(%s)" % (self.__class__.__name__, info)
713
714 - def __len__(self):
715 return self.aln_span
716
717 - def __str__(self):
718 return self._str_hsp_header() + '\n' + self._str_aln()
719
720 - def __getitem__(self, idx):
721 if self.aln is not None: 722 obj = self.__class__( 723 hit_id=self.hit_id, query_id=self.query_id, 724 alphabet=self.alphabet) 725 # transfer query and hit attributes 726 # let SeqRecord handle feature slicing, then retrieve the sliced 727 # features into the sliced HSPFragment 728 if self.query is not None: 729 obj.query = self.query[idx] 730 obj.query_features = obj.query.features 731 if self.hit is not None: 732 obj.hit = self.hit[idx] 733 obj.hit_features = obj.hit.features 734 # description, strand, frame 735 for attr in ('description', 'strand', 'frame'): 736 for seq_type in ('hit', 'query'): 737 attr_name = '%s_%s' % (seq_type, attr) 738 self_val = getattr(self, attr_name) 739 setattr(obj, attr_name, self_val) 740 # alignment annotation should be transferred, since we can compute 741 # the resulting annotation 742 obj.aln_annotation = {} 743 for key, value in self.aln_annotation.items(): 744 assert len(value[idx]) == len(obj) 745 obj.aln_annotation[key] = value[idx] 746 return obj 747 else: 748 raise TypeError("Slicing for HSP objects without " 749 "alignment is not supported.")
750
751 - def _str_aln(self):
752 lines = [] 753 # alignment length 754 aln_span = getattr_str(self, 'aln_span') 755 lines.append(' Fragments: 1 (%s columns)' % aln_span) 756 # sequences 757 if self.query is not None and self.hit is not None: 758 try: 759 qseq = str(self.query.seq) 760 except AttributeError: # query is None 761 qseq = '?' 762 try: 763 hseq = str(self.hit.seq) 764 except AttributeError: # hit is None 765 hseq = '?' 766 767 # similarity line 768 simil = '' 769 if 'similarity' in self.aln_annotation and \ 770 isinstance(self.aln_annotation.get('similarity'), basestring): 771 simil = self.aln_annotation['similarity'] 772 773 if self.aln_span <= 67: 774 lines.append("%10s - %s" % ('Query', qseq)) 775 if simil: 776 lines.append(" %s" % simil) 777 lines.append("%10s - %s" % ('Hit', hseq)) 778 else: 779 # adjust continuation character length, so we don't display 780 # the same residues twice 781 if self.aln_span - 66 > 3: 782 cont = '~' * 3 783 else: 784 cont = '~' * (self.aln_span - 66) 785 lines.append("%10s - %s%s%s" % ('Query', 786 qseq[:59], cont, qseq[-5:])) 787 if simil: 788 lines.append(" %s%s%s" % 789 (simil[:59], cont, simil[-5:])) 790 lines.append("%10s - %s%s%s" % ('Hit', 791 hseq[:59], cont, hseq[-5:])) 792 793 return '\n'.join(lines)
794 795 # sequence properties #
796 - def _set_seq(self, seq, seq_type):
797 """Checks the given sequence for attribute setting 798 799 :param seq: sequence to check 800 :type seq: string or SeqRecord 801 :param seq_type: sequence type 802 :type seq_type: string, choice of 'hit' or 'query' 803 804 """ 805 assert seq_type in ('hit', 'query') 806 if seq is None: 807 return seq # return immediately if seq is None 808 else: 809 if not isinstance(seq, (basestring, SeqRecord)): 810 raise TypeError("%s sequence must be a string or a SeqRecord" 811 " object." % seq_type) 812 # check length if the opposite sequence is not None 813 opp_type = 'hit' if seq_type == 'query' else 'query' 814 opp_seq = getattr(self, '_%s' % opp_type, None) 815 if opp_seq is not None: 816 if len(seq) != len(opp_seq): 817 raise ValueError("Sequence lengths do not match. Expected: " 818 "%r (%s); found: %r (%s)." % (len(opp_seq), opp_type, 819 len(seq), seq_type)) 820 821 seq_id = getattr(self, '%s_id' % seq_type) 822 seq_desc = getattr(self, '%s_description' % seq_type) 823 seq_feats = getattr(self, '%s_features' % seq_type) 824 seq_name = 'aligned %s sequence' % seq_type 825 826 if isinstance(seq, SeqRecord): 827 seq.id = seq_id 828 seq.description = seq_desc 829 seq.name = seq_name 830 seq.features = seq_feats 831 seq.seq.alphabet = self.alphabet 832 elif isinstance(seq, basestring): 833 seq = SeqRecord(Seq(seq, self.alphabet), id=seq_id, name=seq_name, 834 description=seq_desc, features=seq_feats) 835 836 return seq
837
838 - def _hit_get(self):
839 return self._hit
840
841 - def _hit_set(self, value):
842 self._hit = self._set_seq(value, 'hit')
843 844 hit = property(fget=_hit_get, fset=_hit_set, 845 doc="""Hit sequence as a SeqRecord object, defaults to None""") 846
847 - def _query_get(self):
848 return self._query
849
850 - def _query_set(self, value):
851 self._query = self._set_seq(value, 'query')
852 853 query = property(fget=_query_get, fset=_query_set, 854 doc="""Query sequence as a SeqRecord object, defaults to None""") 855
856 - def _aln_get(self):
857 if self.query is None and self.hit is None: 858 return None 859 elif self.hit is None: 860 return MultipleSeqAlignment([self.query], self.alphabet) 861 elif self.query is None: 862 return MultipleSeqAlignment([self.hit], self.alphabet) 863 else: 864 return MultipleSeqAlignment([self.query, self.hit], self.alphabet)
865 866 aln = property(fget=_aln_get, 867 doc="""Query-hit alignment as a MultipleSeqAlignment object, 868 defaults to None""") 869
870 - def _alphabet_get(self):
871 return self._alphabet
872
873 - def _alphabet_set(self, value):
874 self._alphabet = value 875 try: 876 self.query.seq.alphabet = value 877 except AttributeError: 878 pass 879 try: 880 self.hit.seq.alphabet = value 881 except AttributeError: 882 pass
883 884 alphabet = property(fget=_alphabet_get, fset=_alphabet_set, 885 doc="""Alphabet object used in the fragment's sequences and alignment, 886 defaults to single_letter_alphabet""") 887
888 - def _aln_span_get(self):
889 # length of alignment (gaps included) 890 # alignment span can be its own attribute, or computed from 891 # query / hit length 892 if not hasattr(self, '_aln_span'): 893 if self.query is not None: 894 self._aln_span = len(self.query) 895 elif self.hit is not None: 896 self._aln_span = len(self.hit) 897 898 return self._aln_span
899
900 - def _aln_span_set(self, value):
901 self._aln_span = value
902 903 aln_span = property(fget=_aln_span_get, fset=_aln_span_set, 904 doc="""The number of alignment columns covered by the fragment""") 905 906 # id, description, and features properties # 907 hit_description = fragcascade('description', 'hit', 908 doc="""Hit sequence description""") 909 910 query_description = fragcascade('description', 'query', 911 doc="""Query sequence description""") 912 913 hit_id = fragcascade('id', 'hit', 914 doc="""Hit sequence ID""") 915 916 query_id = fragcascade('id', 'query', 917 doc="""Query sequence ID""") 918 919 hit_features = fragcascade('features', 'hit', 920 doc="""Hit sequence features""") 921 922 query_features = fragcascade('features', 'query', 923 doc="""Query sequence features""") 924 925 # strand properties #
926 - def _prep_strand(self, strand):
927 # follow SeqFeature's convention 928 if strand not in (-1, 0, 1, None): 929 raise ValueError("Strand should be -1, 0, 1, or None; not %r" % 930 strand) 931 return strand
932
933 - def _get_strand(self, seq_type):
934 assert seq_type in ('hit', 'query') 935 strand = getattr(self, '_%s_strand' % seq_type) 936 937 if strand is None: 938 # try to compute strand from frame 939 frame = getattr(self, '%s_frame' % seq_type) 940 if frame is not None: 941 try: 942 strand = frame // abs(frame) 943 except ZeroDivisionError: 944 strand = 0 945 setattr(self, '%s_strand' % seq_type, strand) 946 947 return strand
948
949 - def _hit_strand_get(self):
950 return self._get_strand('hit')
951
952 - def _hit_strand_set(self, value):
953 self._hit_strand = self._prep_strand(value)
954 955 hit_strand = property(fget=_hit_strand_get, fset=_hit_strand_set, 956 doc="""Hit sequence strand, defaults to None""") 957
958 - def _query_strand_get(self):
959 return self._get_strand('query')
960
961 - def _query_strand_set(self, value):
962 self._query_strand = self._prep_strand(value)
963 964 query_strand = property(fget=_query_strand_get, fset=_query_strand_set, 965 doc="""Query sequence strand, defaults to None""") 966 967 # frame properties #
968 - def _prep_frame(self, frame):
969 if frame not in (-3, -2, -1, 0, 1, 2, 3, None): 970 raise ValueError("Strand should be an integer between -3 and 3, " 971 "or None; not %r" % frame) 972 return frame
973
974 - def _hit_frame_get(self):
975 return self._hit_frame
976
977 - def _hit_frame_set(self, value):
978 self._hit_frame = self._prep_frame(value)
979 980 hit_frame = property(fget=_hit_frame_get, fset=_hit_frame_set, 981 doc="""Hit sequence reading frame, defaults to None""") 982
983 - def _query_frame_get(self):
984 return self._query_frame
985
986 - def _query_frame_set(self, value):
987 self._query_frame = self._prep_frame(value)
988 989 query_frame = property(fget=_query_frame_get, fset=_query_frame_set, 990 doc="""Query sequence reading frame, defaults to None""") 991 992 # coordinate properties #
993 - def _prep_coord(self, coord, opp_coord_name, op):
994 # coord must either be None or int 995 if coord is None: 996 return coord 997 assert isinstance(coord, int) 998 # try to get opposite coordinate, if it's not present, return 999 try: 1000 opp_coord = getattr(self, opp_coord_name) 1001 except AttributeError: 1002 return coord 1003 # if opposite coordinate is None, return 1004 if opp_coord is None: 1005 return coord 1006 # otherwise compare it to coord ('>=' or '<=') 1007 else: 1008 assert op(coord, opp_coord) 1009 return coord
1010
1011 - def _hit_start_get(self):
1012 return self._hit_start
1013
1014 - def _hit_start_set(self, value):
1015 self._hit_start = self._prep_coord(value, 'hit_end', le)
1016 1017 hit_start = property(fget=_hit_start_get, fset=_hit_start_set, 1018 doc="""Hit sequence start coordinate, defaults to None""") 1019
1020 - def _query_start_get(self):
1021 return self._query_start
1022
1023 - def _query_start_set(self, value):
1024 self._query_start = self._prep_coord(value, 'query_end', le)
1025 1026 query_start = property(fget=_query_start_get, fset=_query_start_set, 1027 doc="""Query sequence start coordinate, defaults to None""") 1028
1029 - def _hit_end_get(self):
1030 return self._hit_end
1031
1032 - def _hit_end_set(self, value):
1033 self._hit_end = self._prep_coord(value, 'hit_start', ge)
1034 1035 hit_end = property(fget=_hit_end_get, fset=_hit_end_set, 1036 doc="""Hit sequence start coordinate, defaults to None""") 1037
1038 - def _query_end_get(self):
1039 return self._query_end
1040
1041 - def _query_end_set(self, value):
1042 self._query_end = self._prep_coord(value, 'query_start', ge)
1043 1044 query_end = property(fget=_query_end_get, fset=_query_end_set, 1045 doc="""Query sequence end coordinate, defaults to None""") 1046 1047 # coordinate-dependent properties #
1048 - def _hit_span_get(self):
1049 try: 1050 return self.hit_end - self.hit_start 1051 except TypeError: # triggered if any of the coordinates are None 1052 return None
1053 1054 hit_span = property(fget=_hit_span_get, 1055 doc="""The number of residues covered by the hit sequence""") 1056
1057 - def _query_span_get(self):
1058 try: 1059 return self.query_end - self.query_start 1060 except TypeError: # triggered if any of the coordinates are None 1061 return None
1062 1063 query_span = property(fget=_query_span_get, 1064 doc="""The number of residues covered by the query sequence""") 1065
1066 - def _hit_range_get(self):
1067 return (self.hit_start, self.hit_end)
1068 1069 hit_range = property(fget=_hit_range_get, 1070 doc="""Tuple of hit start and end coordinates""") 1071
1072 - def _query_range_get(self):
1073 return (self.query_start, self.query_end)
1074 1075 query_range = property(fget=_query_range_get, 1076 doc="""Tuple of query start and end coordinates""")
1077 1078 1079 # if not used as a module, run the doctest 1080 if __name__ == "__main__": 1081 from Bio._utils import run_doctest 1082 run_doctest() 1083