1
2
3
4
5
6 """Bio.SearchIO object to model search results from a single query."""
7
8 from copy import deepcopy
9 from itertools import chain
10
11 from Bio._py3k import OrderedDict
12 from Bio._utils import trim_str
13 from Bio.SearchIO._utils import partialcascade
14
15 from _base import _BaseSearchObject
16 from hit import Hit
20
21 """Class representing search results from a single query.
22
23 QueryResult is the container object that stores all search hits from a
24 single search query. It is the top-level object returned by SearchIO's two
25 main functions, `read` and `parse`. Depending on the search results and
26 search output format, a QueryResult object will contain zero or more Hit
27 objects (see Hit).
28
29 You can take a quick look at a QueryResult's contents and attributes by
30 invoking `print` on it:
31
32 >>> from Bio import SearchIO
33 >>> qresult = SearchIO.parse('Blast/mirna.xml', 'blast-xml').next()
34 >>> print qresult
35 Program: blastn (2.2.27+)
36 Query: 33211 (61)
37 mir_1
38 Target: refseq_rna
39 Hits: ---- ----- ----------------------------------------------------------
40 # # HSP ID + description
41 ---- ----- ----------------------------------------------------------
42 0 1 gi|262205317|ref|NR_030195.1| Homo sapiens microRNA 52...
43 1 1 gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA...
44 2 1 gi|270133242|ref|NR_032573.1| Macaca mulatta microRNA ...
45 3 2 gi|301171322|ref|NR_035857.1| Pan troglodytes microRNA...
46 4 1 gi|301171267|ref|NR_035851.1| Pan troglodytes microRNA...
47 5 2 gi|262205330|ref|NR_030198.1| Homo sapiens microRNA 52...
48 6 1 gi|262205302|ref|NR_030191.1| Homo sapiens microRNA 51...
49 7 1 gi|301171259|ref|NR_035850.1| Pan troglodytes microRNA...
50 8 1 gi|262205451|ref|NR_030222.1| Homo sapiens microRNA 51...
51 9 2 gi|301171447|ref|NR_035871.1| Pan troglodytes microRNA...
52 10 1 gi|301171276|ref|NR_035852.1| Pan troglodytes microRNA...
53 11 1 gi|262205290|ref|NR_030188.1| Homo sapiens microRNA 51...
54 ...
55
56 If you just want to know how many hits a QueryResult has, you can invoke
57 `len` on it. Alternatively, you can simply type its name in the interpreter:
58
59 >>> len(qresult)
60 100
61 >>> qresult
62 QueryResult(id='33211', 100 hits)
63
64 QueryResult behaves like a hybrid of Python's built-in list and dictionary.
65 You can retrieve its items (Hit objects) using the integer index of the
66 item, just like regular Python lists:
67
68 >>> first_hit = qresult[0]
69 >>> first_hit
70 Hit(id='gi|262205317|ref|NR_030195.1|', query_id='33211', 1 hsps)
71
72 You can slice QueryResult objects as well. Slicing will return a new
73 QueryResult object containing only the sliced hits:
74
75 >>> sliced_qresult = qresult[:3] # slice the first three hits
76 >>> len(qresult)
77 100
78 >>> len(sliced_qresult)
79 3
80 >>> print sliced_qresult
81 Program: blastn (2.2.27+)
82 Query: 33211 (61)
83 mir_1
84 Target: refseq_rna
85 Hits: ---- ----- ----------------------------------------------------------
86 # # HSP ID + description
87 ---- ----- ----------------------------------------------------------
88 0 1 gi|262205317|ref|NR_030195.1| Homo sapiens microRNA 52...
89 1 1 gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA...
90 2 1 gi|270133242|ref|NR_032573.1| Macaca mulatta microRNA ...
91
92 Like Python dictionaries, you can also retrieve hits using the hit's ID.
93 This is useful for retrieving hits that you know should exist in a given
94 search:
95
96 >>> hit = qresult['gi|262205317|ref|NR_030195.1|']
97 >>> hit
98 Hit(id='gi|262205317|ref|NR_030195.1|', query_id='33211', 1 hsps)
99
100 You can also replace a Hit in QueryResult with another Hit using either the
101 integer index or hit key string. Note that the replacing object must be a
102 Hit that has the same `query_id` property as the QueryResult object.
103
104 If you're not sure whether a QueryResult contains a particular hit, you can
105 use the hit ID to check for membership first:
106
107 >>> 'gi|262205317|ref|NR_030195.1|' in qresult
108 True
109 >>> 'gi|262380031|ref|NR_023426.1|' in qresult
110 False
111
112 Or, if you just want to know the rank / position of a given hit, you can
113 use the hit ID as an argument for the `index` method. Note that the values
114 returned will be zero-based. So zero (0) means the hit is the first in the
115 QueryResult, three (3) means the hit is the fourth item, and so on. If the
116 hit does not exist in the QueryResult, a `ValueError` will be raised.
117
118 >>> qresult.index('gi|262205317|ref|NR_030195.1|')
119 0
120 >>> qresult.index('gi|262205330|ref|NR_030198.1|')
121 5
122 >>> qresult.index('gi|262380031|ref|NR_023426.1|')
123 Traceback (most recent call last):
124 ...
125 ValueError: ...
126
127 To ease working with a large number of hits, QueryResult has several
128 `filter` and `map` methods, analogous to Python's built-in functions with
129 the same names. There are `filter` and `map` methods available for
130 operations over either Hit objects or HSP objects. As an example, here we are
131 using the `hit_map` method to rename all hit IDs within a QueryResult:
132
133 >>> def renamer(hit):
134 ... hit.id = hit.id.split('|')[3]
135 ... return hit
136 >>> mapped_qresult = qresult.hit_map(renamer)
137 >>> print mapped_qresult
138 Program: blastn (2.2.27+)
139 Query: 33211 (61)
140 mir_1
141 Target: refseq_rna
142 Hits: ---- ----- ----------------------------------------------------------
143 # # HSP ID + description
144 ---- ----- ----------------------------------------------------------
145 0 1 NR_030195.1 Homo sapiens microRNA 520b (MIR520B), micr...
146 1 1 NR_035856.1 Pan troglodytes microRNA mir-520b (MIR520B...
147 2 1 NR_032573.1 Macaca mulatta microRNA mir-519a (MIR519A)...
148 ...
149
150 The principle for the other `map` and `filter` methods is similar: they accept
151 a function, apply it, and return a new QueryResult object.
152
153 There are also other methods useful for working with list-like objects:
154 `append`, `pop`, and `sort`. More details and examples are available in
155 their respective documentations.
156
157 Finally, just like Python lists and dictionaries, QueryResult objects are
158 iterable. Iteration over QueryResults will yield Hit objects:
159
160 >>> for hit in qresult[:4]: # iterate over the first four items
161 ... hit
162 ...
163 Hit(id='gi|262205317|ref|NR_030195.1|', query_id='33211', 1 hsps)
164 Hit(id='gi|301171311|ref|NR_035856.1|', query_id='33211', 1 hsps)
165 Hit(id='gi|270133242|ref|NR_032573.1|', query_id='33211', 1 hsps)
166 Hit(id='gi|301171322|ref|NR_035857.1|', query_id='33211', 2 hsps)
167
168 If you need access to all the hits in a QueryResult object, you can get
169 them in a list using the `hits` property. Similarly, access to all hit IDs is
170 available through the `hit_keys` property.
171
172 >>> qresult.hits
173 [Hit(id='gi|262205317|ref|NR_030195.1|', query_id='33211', 1 hsps), ...]
174 >>> qresult.hit_keys
175 ['gi|262205317|ref|NR_030195.1|', 'gi|301171311|ref|NR_035856.1|', ...]
176
177 """
178
179
180
181 _NON_STICKY_ATTRS = ('_items',)
182
def __init__(self, id='<unknown id>', hits=(),
             hit_key_function=lambda hit: hit.id):
    """Initialize a QueryResult object.

    Arguments:
    id -- String of query sequence ID.
    hits -- Iterable yielding Hit objects; defaults to no hits.
    hit_key_function -- Function to define hit keys, defaults to a function
                        that returns Hit object IDs.

    Raises ValueError if `id` is None. Any error raised by `append` while
    adding the given hits propagates unchanged.

    """
    if id is None:
        raise ValueError("Query ID string is required for QueryResult "
                         "creation")

    self._id = id
    self._hit_key_function = hit_key_function
    # insertion order of this mapping is the order of the hits
    self._items = OrderedDict()
    self._description = '<unknown description>'
    # default values for the search-level attributes
    self.program = '<unknown program>'
    self.target = '<unknown target>'
    self.version = '<unknown version>'

    # add the hits through append() so each one is keyed and checked the
    # same way as hits appended later
    for hit in hits:
        self.append(hit)
210
211
212 if hasattr(OrderedDict, 'iteritems'):
213
216
217 @property
219 """Hit objects contained in the QueryResult."""
220 return self._items.values()
221
222 @property
224 """Hit IDs of the Hit objects contained in the QueryResult."""
225 return self._items.keys()
226
227 @property
229 """List of tuples of Hit IDs and Hit objects."""
230 return self._items.items()
231
233 """Returns an iterator over the Hit objects."""
234 for hit in self._items.itervalues():
235 yield hit
236
238 """Returns an iterator over the ID of the Hit objects."""
239 for hit_id in self._items.iterkeys():
240 yield hit_id
241
243 """Returns an iterator yielding tuples of Hit ID and Hit objects."""
244 for item in self._items.iteritems():
245 yield item
246
247 else:
248
250 return iter(self.hits)
251
252 @property
254 """Hit objects contained in the QueryResult."""
255 return list(self._items.values())
256
257 @property
259 """Hit IDs of the Hit objects contained in the QueryResult."""
260 return list(self._items.keys())
261
262 @property
264 """List of tuples of Hit IDs and Hit objects."""
265 return list(self._items.items())
266
268 """Returns an iterator over the Hit objects."""
269 for hit in self._items.values():
270 yield hit
271
273 """Returns an iterator over the ID of the Hit objects."""
274 for hit_id in self._items.keys():
275 yield hit_id
276
278 """Returns an iterator yielding tuples of Hit ID and Hit objects."""
279 for item in self._items.items():
280 yield item
281
283 if isinstance(hit_key, Hit):
284 return self._hit_key_function(hit_key) in self._items
285 return hit_key in self._items
286
288 return len(self._items)
289
291 return bool(self._items)
292
294 return "QueryResult(id=%r, %r hits)" % (self.id, len(self))
295
297 lines = []
298
299
300 lines.append('Program: %s (%s)' % (self.program, self.version))
301
302
303 qid_line = ' Query: %s' % self.id
304 if hasattr(self, 'seq_len'):
305 qid_line += ' (%i)' % self.seq_len
306 if self.description:
307 qid_line += trim_str('\n %s' % self.description, 80, '...')
308 lines.append(qid_line)
309
310
311 lines.append(' Target: %s' % self.target)
312
313
314 if not self.hits:
315 lines.append(' Hits: 0')
316 else:
317 lines.append(' Hits: %s %s %s' % ('-'*4, '-'*5, '-'*58))
318 pattern = '%13s %5s %56s'
319 lines.append(pattern % ('#', '# HSP',
320 'ID + description'.ljust(58)))
321 lines.append(pattern % ('-'*4, '-'*5, '-'*58))
322 for idx, hit in enumerate(self.hits):
323 if idx < 30:
324 hid_line = '%s %s' % (hit.id, hit.description)
325 if len(hid_line) > 58:
326 hid_line = hid_line[:55] + '...'
327 lines.append(pattern % (idx, str(len(hit)),
328 hid_line.ljust(58)))
329 elif idx > len(self.hits) - 4:
330 hid_line = '%s %s' % (hit.id, hit.description)
331 if len(hid_line) > 58:
332 hid_line = hid_line[:55] + '...'
333 lines.append(pattern % (idx, str(len(hit)),
334 hid_line.ljust(58)))
335 elif idx == 30:
336 lines.append('%14s' % '~~~')
337
338 return '\n'.join(lines)
339
341
342 if isinstance(hit_key, slice):
343
344
345 hits = list(self.hits)[hit_key]
346 obj = self.__class__(self.id, hits, self._hit_key_function)
347 self._transfer_attrs(obj)
348 return obj
349
350
351 elif isinstance(hit_key, int):
352 return list(self._items.values())[hit_key]
353
354
355 return self._items[hit_key]
356
358
359 if not isinstance(hit_key, basestring):
360 raise TypeError("QueryResult object keys must be a string.")
361
362 if not isinstance(hit, Hit):
363 raise TypeError("QueryResult objects can only contain Hit objects.")
364
365 if hit.query_id != self.id:
366 raise ValueError("Expected Hit with query ID '%s', found '%s' "
367 "instead." % (self.id, hit.query_id))
368
369 self._items[hit_key] = hit
370
386
387
388 id = partialcascade('_id', 'query_id', """QueryResult ID string""")
389 description = partialcascade('_description', 'query_description',
390 """QueryResult description""")
391
392 @property
394 """HSP objects contained in the QueryResult."""
395 return [hsp for hsp in chain(*self.hits)]
396
397 @property
399 """HSPFragment objects contained in the QueryResult."""
400 return [frag for frag in chain(*self.hsps)]
401
402
404 """Adds a Hit object to the end of QueryResult. If the QueryResult
405 already has a Hit with the same ID, append the new Hit's HSPs into
406 the existing Hit.
407
408 Arguments:
409 hit -- Hit object to absorb.
410
411 This method is used for file formats that may output the same Hit in
412 separate places, such as BLAT or Exonerate. In both formats, Hits
413 with different strands are put in different places. However, SearchIO
414 considers them to be the same as a Hit object should be all database
415 entries with the same ID, regardless of strand orientation.
416
417 """
418 try:
419 self.append(hit)
420 except ValueError:
421 assert hit.id in self
422 for hsp in hit:
423 self[hit.id].append(hsp)
424
426 """Adds a Hit object to the end of QueryResult.
427
428 Parameters
429 hit -- Hit object to append.
430
431 Any Hit object appended must have the same `query_id` property as the
432 QueryResult's `id` property. If the hit key already exists, a
433 `ValueError` will be raised.
434
435 """
436
437 if self._hit_key_function is not None:
438 hit_key = self._hit_key_function(hit)
439 else:
440 hit_key = hit.id
441
442 if hit_key not in self:
443 self[hit_key] = hit
444 else:
445 raise ValueError("Hit '%s' already present in this QueryResult." %
446 hit_key)
447
449 """Creates a new QueryResult object whose Hit objects pass the filter
450 function.
451
452 Arguments:
453 func -- Callback function that accepts a Hit object as its parameter,
454 does a boolean check, and returns True or False
455
456 Here is an example of using `hit_filter` to select Hits whose
457 description begins with the string 'Homo sapiens', case sensitive:
458
459 >>> from Bio import SearchIO
460 >>> qresult = SearchIO.parse('Blast/mirna.xml', 'blast-xml').next()
461 >>> def desc_filter(hit):
462 ... return hit.description.startswith('Homo sapiens')
463 ...
464 >>> len(qresult)
465 100
466 >>> filtered = qresult.hit_filter(desc_filter)
467 >>> len(filtered)
468 39
469 >>> print filtered[:4]
470 Program: blastn (2.2.27+)
471 Query: 33211 (61)
472 mir_1
473 Target: refseq_rna
474 Hits: ---- ----- ----------------------------------------------------------
475 # # HSP ID + description
476 ---- ----- ----------------------------------------------------------
477 0 1 gi|262205317|ref|NR_030195.1| Homo sapiens microRNA 52...
478 1 2 gi|262205330|ref|NR_030198.1| Homo sapiens microRNA 52...
479 2 1 gi|262205302|ref|NR_030191.1| Homo sapiens microRNA 51...
480 3 1 gi|262205451|ref|NR_030222.1| Homo sapiens microRNA 51...
481
482 Note that instance attributes (other than the hits) from the unfiltered
483 QueryResult are retained in the filtered object.
484
485 >>> qresult.program == filtered.program
486 True
487 >>> qresult.target == filtered.target
488 True
489
490 """
491 hits = filter(func, self.hits)
492 obj = self.__class__(self.id, hits, self._hit_key_function)
493 self._transfer_attrs(obj)
494 return obj
495
497 """Creates a new QueryResult object, mapping the given function to its
498 Hits.
499
500 Arguments:
501 func -- Callback function that accepts a Hit object as its parameter and
502 also returns a Hit object.
503
504 Here is an example of using `hit_map` with a function that discards all
505 HSPs in a Hit except for the first one:
506
507 >>> from Bio import SearchIO
508 >>> qresult = SearchIO.parse('Blast/mirna.xml', 'blast-xml').next()
509 >>> print qresult[:8]
510 Program: blastn (2.2.27+)
511 Query: 33211 (61)
512 mir_1
513 Target: refseq_rna
514 Hits: ---- ----- ----------------------------------------------------------
515 # # HSP ID + description
516 ---- ----- ----------------------------------------------------------
517 0 1 gi|262205317|ref|NR_030195.1| Homo sapiens microRNA 52...
518 1 1 gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA...
519 2 1 gi|270133242|ref|NR_032573.1| Macaca mulatta microRNA ...
520 3 2 gi|301171322|ref|NR_035857.1| Pan troglodytes microRNA...
521 4 1 gi|301171267|ref|NR_035851.1| Pan troglodytes microRNA...
522 5 2 gi|262205330|ref|NR_030198.1| Homo sapiens microRNA 52...
523 6 1 gi|262205302|ref|NR_030191.1| Homo sapiens microRNA 51...
524 7 1 gi|301171259|ref|NR_035850.1| Pan troglodytes microRNA...
525
526 >>> top_hsp = lambda hit: hit[:1]
527 >>> mapped_qresult = qresult.hit_map(top_hsp)
528 >>> print mapped_qresult[:8]
529 Program: blastn (2.2.27+)
530 Query: 33211 (61)
531 mir_1
532 Target: refseq_rna
533 Hits: ---- ----- ----------------------------------------------------------
534 # # HSP ID + description
535 ---- ----- ----------------------------------------------------------
536 0 1 gi|262205317|ref|NR_030195.1| Homo sapiens microRNA 52...
537 1 1 gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA...
538 2 1 gi|270133242|ref|NR_032573.1| Macaca mulatta microRNA ...
539 3 1 gi|301171322|ref|NR_035857.1| Pan troglodytes microRNA...
540 4 1 gi|301171267|ref|NR_035851.1| Pan troglodytes microRNA...
541 5 1 gi|262205330|ref|NR_030198.1| Homo sapiens microRNA 52...
542 6 1 gi|262205302|ref|NR_030191.1| Homo sapiens microRNA 51...
543 7 1 gi|301171259|ref|NR_035850.1| Pan troglodytes microRNA...
544
545 """
546 hits = [deepcopy(hit) for hit in self.hits]
547 if func is not None:
548 hits = map(func, hits)
549 obj = self.__class__(self.id, hits, self._hit_key_function)
550 self._transfer_attrs(obj)
551 return obj
552
554 """Creates a new QueryResult object whose HSP objects pass the filter
555 function.
556
557 `hsp_filter` is the same as `hit_filter`, except that it filters
558 directly on each HSP object in every Hit. If the filtering removes
559 all HSP objects in a given Hit, the entire Hit will be discarded. This
560 will result in the QueryResult having fewer Hits after filtering.
561
562 """
563 hits = filter(None, (hit.filter(func) for hit in self.hits))
564 obj = self.__class__(self.id, hits, self._hit_key_function)
565 self._transfer_attrs(obj)
566 return obj
567
569 """Creates a new QueryResult object, mapping the given function to its
570 HSPs.
571
572 `hsp_map` is the same as `hit_map`, except that it applies the given
573 function to all HSP objects in every Hit, instead of the Hit objects.
574
575 """
576 hits = filter(None, (hit.map(func) for hit in list(self.hits)[:]))
577 obj = self.__class__(self.id, hits, self._hit_key_function)
578 self._transfer_attrs(obj)
579 return obj
580
581
582
583
584 __marker = object()
585
587 """Removes the specified hit key and return the Hit object.
588
589 Arguments:
590 hit_key -- Integer index or string of hit key that points to a Hit
591 object.
592 default -- Value that will be returned if the Hit object with the
593 specified index or hit key is not found.
594
595 By default, `pop` will remove and return the last Hit object in the
596 QueryResult object. To remove specific Hit objects, you can use its
597 integer index or hit key.
598
599 >>> from Bio import SearchIO
600 >>> qresult = SearchIO.parse('Blast/mirna.xml', 'blast-xml').next()
601 >>> len(qresult)
602 100
603 >>> for hit in qresult[:5]:
604 ... print hit.id
605 ...
606 gi|262205317|ref|NR_030195.1|
607 gi|301171311|ref|NR_035856.1|
608 gi|270133242|ref|NR_032573.1|
609 gi|301171322|ref|NR_035857.1|
610 gi|301171267|ref|NR_035851.1|
611
612 # remove the last hit
613 >>> qresult.pop()
614 Hit(id='gi|397513516|ref|XM_003827011.1|', query_id='33211', 1 hsps)
615
616 # remove the first hit
617 >>> qresult.pop(0)
618 Hit(id='gi|262205317|ref|NR_030195.1|', query_id='33211', 1 hsps)
619
620 # remove hit with the given ID
621 >>> qresult.pop('gi|301171322|ref|NR_035857.1|')
622 Hit(id='gi|301171322|ref|NR_035857.1|', query_id='33211', 2 hsps)
623
624 """
625
626
627 if isinstance(hit_key, int):
628
629 if not self:
630 raise IndexError("pop from empty list")
631 hit_key = list(self.hit_keys)[hit_key]
632
633 try:
634 return self._items.pop(hit_key)
635 except KeyError:
636
637 if default is self.__marker:
638 raise KeyError(hit_key)
639
640 return default
641
def index(self, hit_key):
    """Return the zero-based index of a given hit key.

    Arguments:
    hit_key -- Hit key string to look up, or a Hit object whose key
               should be looked up.

    This method is useful for finding out the integer index (usually
    correlated with search rank) of a given hit key.

    >>> from Bio import SearchIO
    >>> qresult = SearchIO.parse('Blast/mirna.xml', 'blast-xml').next()
    >>> qresult.index('gi|301171259|ref|NR_035850.1|')
    7

    Raises ValueError if the key is not present in the QueryResult.

    """
    if isinstance(hit_key, Hit):
        # Derive the key the same way the membership check and `append`
        # do, so a custom hit key function is honoured; fall back to the
        # Hit ID when no key function is set.
        key_func = self._hit_key_function
        if key_func is None:
            hit_key = hit_key.id
        else:
            hit_key = key_func(hit_key)
    return list(self.hit_keys).index(hit_key)
660
def sort(self, key=None, reverse=False, in_place=True):
    """Sort the Hit objects.

    Arguments:
    key -- Function used to sort the Hit objects. If None, the current
           hit order is kept (or simply reversed when `reverse` is True).
    reverse -- Boolean, whether to reverse the sorting or not.
    in_place -- Boolean, whether to perform sorting in place (in the same
                object) or not (creating a new object).

    `sort` defaults to sorting in-place, to mimic Python's `list.sort`
    method. If you set the `in_place` argument to False, it will
    return a new, sorted QueryResult object and keep the initial one
    unsorted.

    Returns None when sorting in place, otherwise the new QueryResult.

    """
    if key is None:
        # no sort key: keep the existing order, optionally reversed
        sorted_hits = list(self.hits)
        if reverse:
            sorted_hits.reverse()
    else:
        sorted_hits = sorted(self.hits, key=key, reverse=reverse)

    if not in_place:
        obj = self.__class__(self.id, sorted_hits, self._hit_key_function)
        self._transfer_attrs(obj)
        return obj

    # Rebuild the ordered mapping in the new order. Fall back to the Hit
    # ID when no key function is set, as `append` does.
    key_func = self._hit_key_function
    new_hits = OrderedDict()
    for hit in sorted_hits:
        hit_key = hit.id if key_func is None else key_func(hit)
        new_hits[hit_key] = hit
    self._items = new_hits
698
699
700
if __name__ == "__main__":
    # Only when executed as a script: hand off to Biopython's doctest
    # helper (presumably runs the doctests in this module -- confirm
    # against Bio._utils).
    from Bio import _utils
    _utils.run_doctest()
704