Package Bio :: Module SeqFeature
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqFeature

   1  # Copyright 2000-2003 Jeff Chang. 
   2  # Copyright 2001-2008 Brad Chapman. 
   3  # Copyright 2005-2015 by Peter Cock. 
   4  # Copyright 2006-2009 Michiel de Hoon. 
   5  # All rights reserved. 
   6  # This code is part of the Biopython distribution and governed by its 
   7  # license.  Please see the LICENSE file that should have been included 
   8  # as part of this package. 
   9  """Represent a Sequence Feature holding info about a part of a sequence. 
  10   
  11  This is heavily modeled after the Biocorba SeqFeature objects, and 
  12  may be pretty biased towards GenBank stuff since I'm writing it 
  13  for the GenBank parser output... 
  14   
  15  What's here: 
  16   
  17  Base class to hold a Feature 
  18  ---------------------------- 
  19   
  20  classes: 
  21   
  22      - SeqFeature 
  23   
  24  Hold information about a Reference 
  25  ---------------------------------- 
  26   
  27  This is an attempt to create a General class to hold Reference type 
  28  information. 
  29   
  30  classes: 
  31   
  32      - Reference 
  33   
  34  Specify locations of a feature on a Sequence 
  35  -------------------------------------------- 
  36   
  37  This aims to handle, in Ewan Birney's words, 'the dreaded fuzziness issue'. 
  38  This has the advantages of allowing us to handle fuzzy stuff in case anyone 
  39  needs it, and also be compatible with BioPerl etc and BioSQL. 
  40   
  41  classes: 
  42   
  43      - FeatureLocation - Specify the start and end location of a feature. 
  44      - CompoundLocation - Collection of FeatureLocation objects (for joins etc). 
  45   
  46      - ExactPosition - Specify the position as being exact. 
  47      - WithinPosition - Specify a position occurring within some range. 
  48      - BetweenPosition - Specify a position occurring between a range (OBSOLETE?). 
  49      - BeforePosition - Specify the position as being found before some base. 
  50      - AfterPosition - Specify the position as being found after some base. 
  51      - OneOfPosition - Specify a position where the location can be multiple positions. 
  52      - UnknownPosition - Represents missing information like '?' in UniProt. 
  53  """ 
  54   
  55  from __future__ import print_function 
  56   
  57  from Bio.Seq import MutableSeq, reverse_complement 
  58   
  59  __docformat__ = "restructuredtext en" 
class SeqFeature(object):
    """Represent a Sequence Feature on an object.

    Attributes:

     - location - the location of the feature on the sequence
       (FeatureLocation or CompoundLocation)
     - type - the specified type of the feature (ie. CDS, exon, repeat...)
     - location_operator - a string specifying how this SeqFeature may
       be related to others. For example, in the example GenBank feature
       shown below, the location_operator would be "join". This is a proxy
       for feature.location.operator and only applies to compound locations.
     - strand - A value specifying on which strand (of a DNA sequence, for
       instance) the feature deals with. 1 indicates the plus strand, -1
       indicates the minus strand, 0 indicates stranded but unknown (? in GFF3),
       while the default of None indicates that strand doesn't apply (dot in GFF3,
       e.g. features on proteins). Note this is a shortcut for accessing the
       strand property of the feature's location.
     - id - A string identifier for the feature.
     - ref - A reference to another sequence. This could be an accession
       number for some different sequence. Note this is a shortcut for the
       reference property of the feature's location.
     - ref_db - A different database for the reference accession number.
       Note this is a shortcut for the reference property of the location
     - qualifiers - A dictionary of qualifiers on the feature. These are
       analogous to the qualifiers from a GenBank feature table. The keys of
       the dictionary are qualifier names, the values are the qualifier
       values.
     - sub_features - Obsolete list of additional SeqFeatures which was
       used for holding compound locations (e.g. joins in GenBank/EMBL).
       This is now superseded by a CompoundLocation as the location,
       and should not be used (DEPRECATED).
    """

    def __init__(self, location=None, type='', location_operator='',
                 strand=None, id="<unknown id>",
                 qualifiers=None, sub_features=None,
                 ref=None, ref_db=None):
        """Initialize a SeqFeature on a Sequence.

        location can either be a FeatureLocation (with strand argument also
        given if required), a CompoundLocation, or None. Anything else
        raises a TypeError.

        e.g. With no strand, on the forward strand, and on the reverse strand:

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> f1 = SeqFeature(FeatureLocation(5, 10), type="domain")
        >>> f1.strand == f1.location.strand == None
        True
        >>> f2 = SeqFeature(FeatureLocation(7, 110, strand=1), type="CDS")
        >>> f2.strand == f2.location.strand == +1
        True
        >>> f3 = SeqFeature(FeatureLocation(9, 108, strand=-1), type="CDS")
        >>> f3.strand == f3.location.strand == -1
        True

        An invalid strand will trigger an exception:

        >>> f4 = SeqFeature(FeatureLocation(50, 60), strand=2)
        Traceback (most recent call last):
           ...
        ValueError: Strand should be +1, -1, 0 or None, not 2

        Similarly if set via the FeatureLocation directly:

        >>> loc4 = FeatureLocation(50, 60, strand=2)
        Traceback (most recent call last):
           ...
        ValueError: Strand should be +1, -1, 0 or None, not 2

        For exact start/end positions, an integer can be used (as shown above)
        as shorthand for the ExactPosition object. For non-exact locations, the
        FeatureLocation must be specified via the appropriate position objects.

        Note that the strand, ref and ref_db arguments to the SeqFeature are
        now obsolete and will be deprecated in a future release (which will
        give warning messages) and later removed. Set them via the location
        object instead.

        Note that location_operator and sub_features arguments can no longer
        be used, instead do this via the CompoundLocation object.
        """
        if location is not None and \
                not isinstance(location, (FeatureLocation, CompoundLocation)):
            raise TypeError(
                "FeatureLocation, CompoundLocation (or None) required for the location")
        self.location = location
        self.type = type
        # The assignments below go through the properties defined further
        # down, which forward the value to self.location.
        if location_operator:
            # TODO - Deprecation warning
            self.location_operator = location_operator
        if strand is not None:
            # TODO - Deprecation warning
            self.strand = strand
        self.id = id
        if qualifiers is None:
            qualifiers = {}
        self.qualifiers = qualifiers
        if sub_features is None:
            sub_features = []
        else:
            import warnings
            from Bio import BiopythonDeprecationWarning
            warnings.warn("Rather than sub_features, use a CompoundFeatureLocation",
                          BiopythonDeprecationWarning)
        # Stored on the private name so the deprecated property setter
        # does not emit a second warning.
        self._sub_features = sub_features
        if ref is not None:
            # TODO - Deprecation warning
            self.ref = ref
        if ref_db is not None:
            # TODO - Deprecation warning
            self.ref_db = ref_db

    def _get_sub_features(self):
        """Return the obsolete sub-feature list, warning if it is non-empty."""
        if self._sub_features:
            import warnings
            from Bio import BiopythonDeprecationWarning
            warnings.warn("Rather using f.sub_features, f.location should be a CompoundFeatureLocation",
                          BiopythonDeprecationWarning)
        return self._sub_features

    def _set_sub_features(self, value):
        """Store the obsolete sub-feature list, warning if value is truthy."""
        if value:
            import warnings
            from Bio import BiopythonDeprecationWarning
            warnings.warn("Rather than f.sub_features, use a CompoundFeatureLocation for f.location",
                          BiopythonDeprecationWarning)
        self._sub_features = value

    sub_features = property(fget=_get_sub_features, fset=_set_sub_features,
                            doc="Obsolete representation of compound locations (DEPRECATED).")

    def _get_strand(self):
        """Return the strand of the feature's location."""
        return self.location.strand

    def _set_strand(self, value):
        """Set the strand on the feature's location.

        With no location, setting None is silently accepted and any other
        value raises ValueError.
        """
        try:
            self.location.strand = value
        except AttributeError:
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set strand without a location.")
            else:
                raise

    strand = property(fget=_get_strand, fset=_set_strand,
                      doc="""Feature's strand

                          This is a shortcut for feature.location.strand
                          """)

    def _get_ref(self):
        """Return the location's ref, or None if it has none."""
        try:
            return self.location.ref
        except AttributeError:
            return None

    def _set_ref(self, value):
        """Set the ref on the feature's location.

        With no location, setting None is silently accepted and any other
        value raises ValueError.
        """
        try:
            self.location.ref = value
        except AttributeError:
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set ref without a location.")
            else:
                raise

    ref = property(fget=_get_ref, fset=_set_ref,
                   doc="""Feature location reference (e.g. accession).

                       This is a shortcut for feature.location.ref
                       """)

    def _get_ref_db(self):
        """Return the location's ref_db, or None if it has none."""
        try:
            return self.location.ref_db
        except AttributeError:
            return None

    def _set_ref_db(self, value):
        """Set the ref_db on the feature's location.

        Mirrors _set_ref and _set_strand: with no location, setting None is
        silently accepted and any other value raises ValueError (rather than
        leaking an AttributeError from the None location).
        """
        try:
            self.location.ref_db = value
        except AttributeError:
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set ref_db without a location.")
            else:
                raise

    ref_db = property(fget=_get_ref_db, fset=_set_ref_db,
                      doc="""Feature location reference's database.

                          This is a shortcut for feature.location.ref_db
                          """)

    def _get_location_operator(self):
        """Return the location's operator, or None if it has none."""
        try:
            return self.location.operator
        except AttributeError:
            return None

    def _set_location_operator(self, value):
        """Set the operator of a compound location.

        Falsy values are ignored. A truthy value raises ValueError unless
        the location is a CompoundLocation.
        """
        if value:
            if isinstance(self.location, CompoundLocation):
                self.location.operator = value
            elif self.location is None:
                raise ValueError(
                    "Location is None so can't set its operator (to %r)" % value)
            else:
                raise ValueError(
                    "Only CompoundLocation gets an operator (%r)" % value)

    location_operator = property(fget=_get_location_operator, fset=_set_location_operator,
                                 doc="Location operator for compound locations (e.g. join).")

    def __repr__(self):
        """A string representation of the record for debugging."""
        answer = "%s(%s" % (self.__class__.__name__, repr(self.location))
        if self.type:
            answer += ", type=%s" % repr(self.type)
        if self.location_operator:
            answer += ", location_operator=%s" % repr(self.location_operator)
        if self.id and self.id != "<unknown id>":
            answer += ", id=%s" % repr(self.id)
        if self.ref:
            answer += ", ref=%s" % repr(self.ref)
        if self.ref_db:
            answer += ", ref_db=%s" % repr(self.ref_db)
        answer += ")"
        return answer

    def __str__(self):
        """A readable summary of the feature intended to be printed to screen.
        """
        out = "type: %s\n" % self.type
        out += "location: %s\n" % self.location
        if self.id and self.id != "<unknown id>":
            out += "id: %s\n" % self.id
        out += "qualifiers:\n"
        for qual_key in sorted(self.qualifiers):
            # NOTE(review): the extracted source collapsed whitespace, so the
            # leading indent of this literal may originally have been wider
            # than one space - confirm against the upstream file.
            out += " Key: %s, Value: %s\n" % (qual_key,
                                              self.qualifiers[qual_key])
        # TODO - Remove this from __str__ since deprecated
        if self._sub_features:
            out += "Sub-Features\n"
            for sub_feature in self._sub_features:
                out += "%s\n" % sub_feature
        return out

    def _shift(self, offset):
        """Returns a copy of the feature with its location shifted (PRIVATE).

        The annotation qualifiers are copied."""
        answer = SeqFeature(location=self.location._shift(offset),
                            type=self.type,
                            location_operator=self.location_operator,
                            id=self.id,
                            qualifiers=dict(self.qualifiers.items()))
        # This is to avoid the deprecation warning:
        answer._sub_features = [f._shift(offset) for f in self._sub_features]
        return answer

    def _flip(self, length):
        """Returns a copy of the feature with its location flipped (PRIVATE).

        The argument length gives the length of the parent sequence. For
        example a location 0..20 (+1 strand) with parent length 30 becomes
        after flipping 10..30 (-1 strand). Strandless (None) or unknown
        strand (0) remain like that - just their end points are changed.

        The annotation qualifiers are copied.
        """
        answer = SeqFeature(location=self.location._flip(length),
                            type=self.type,
                            location_operator=self.location_operator,
                            id=self.id,
                            qualifiers=dict(self.qualifiers.items()))
        # This is to avoid the deprecation warning:
        answer._sub_features = [f._flip(length)
                                for f in self._sub_features[::-1]]
        return answer

    def extract(self, parent_sequence):
        """Extract feature sequence from the supplied parent sequence.

        The parent_sequence can be a Seq like object or a string, and will
        generally return an object of the same type. The exception to this is
        a MutableSeq as the parent sequence will return a Seq object.

        This should cope with complex locations including complements, joins
        and fuzzy positions. Even mixed strand features should work! This
        also covers features on protein sequences (e.g. domains), although
        here reverse strand features are not permitted.

        >>> from Bio.Seq import Seq
        >>> from Bio.Alphabet import generic_protein
        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
        >>> f = SeqFeature(FeatureLocation(8, 15), type="domain")
        >>> f.extract(seq)
        Seq('VALIVIC', ProteinAlphabet())

        If the FeatureLocation is None, e.g. when parsing invalid locus
        locations in the GenBank parser, extract() will raise a ValueError.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import SeqFeature
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
        >>> f = SeqFeature(None, type="domain")
        >>> f.extract(seq)
        Traceback (most recent call last):
           ...
        ValueError: The feature's .location is None. Check the sequence file for a valid location.

        Note - currently only sub-features of type "join" are supported.
        """
        if self.location is None:
            raise ValueError("The feature's .location is None. Check the "
                             "sequence file for a valid location.")
        return self.location.extract(parent_sequence)

    # Python 3:
    def __bool__(self):
        """Boolean value of an instance of this class (True).

        This behaviour is for backwards compatibility, since until the
        __len__ method was added, a SeqFeature always evaluated as True.

        Note that in comparison, Seq objects, strings, lists, etc, will all
        evaluate to False if they have length zero.

        WARNING: The SeqFeature may in future evaluate to False when its
        length is zero (in order to better match normal python behaviour)!
        """
        return True

    # Python 2:
    __nonzero__ = __bool__

    def __len__(self):
        """Returns the length of the region described by a feature.

        >>> from Bio.Seq import Seq
        >>> from Bio.Alphabet import generic_protein
        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
        >>> f = SeqFeature(FeatureLocation(8, 15), type="domain")
        >>> len(f)
        7
        >>> f.extract(seq)
        Seq('VALIVIC', ProteinAlphabet())
        >>> len(f.extract(seq))
        7

        This is a proxy for taking the length of the feature's location:

        >>> len(f.location)
        7

        For simple features this is the same as the region spanned (end
        position minus start position using Pythonic counting). However, for
        a compound location (e.g. a CDS as the join of several exons) the
        gaps are not counted (e.g. introns). This ensures that len(f) matches
        len(f.extract(parent_seq)), and also makes sure things work properly
        with features wrapping the origin etc.
        """
        return len(self.location)

    def __iter__(self):
        """Iterate over the parent positions within the feature.

        The iteration order is strand aware, and can be thought of as moving
        along the feature using the parent sequence coordinates:

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> f = SeqFeature(FeatureLocation(5, 10), type="domain", strand=-1)
        >>> len(f)
        5
        >>> for i in f: print(i)
        9
        8
        7
        6
        5
        >>> list(f)
        [9, 8, 7, 6, 5]

        This is a proxy for iterating over the location,

        >>> list(f.location)
        [9, 8, 7, 6, 5]
        """
        return iter(self.location)

    def __contains__(self, value):
        """Check if an integer position is within the feature.

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> f = SeqFeature(FeatureLocation(5, 10), type="domain", strand=-1)
        >>> len(f)
        5
        >>> [i for i in range(15) if i in f]
        [5, 6, 7, 8, 9]

        For example, to see which features include a SNP position, you could
        use this:

        >>> from Bio import SeqIO
        >>> record = SeqIO.read("GenBank/NC_000932.gb", "gb")
        >>> for f in record.features:
        ...     if 1750 in f:
        ...         print("%s %s" % (f.type, f.location))
        source [0:154478](+)
        gene [1716:4347](-)
        tRNA join{[4310:4347](-), [1716:1751](-)}

        Note that for a feature defined as a join of several subfeatures (e.g.
        the union of several exons) the gaps are not checked (e.g. introns).
        In this example, the tRNA location is defined in the GenBank file as
        complement(join(1717..1751,4311..4347)), so that position 1760 falls
        in the gap:

        >>> for f in record.features:
        ...     if 1760 in f:
        ...         print("%s %s" % (f.type, f.location))
        source [0:154478](+)
        gene [1716:4347](-)

        Note that additional care may be required with fuzzy locations, for
        example just before a BeforePosition:

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> from Bio.SeqFeature import BeforePosition
        >>> f = SeqFeature(FeatureLocation(BeforePosition(3), 8), type="domain")
        >>> len(f)
        5
        >>> [i for i in range(10) if i in f]
        [3, 4, 5, 6, 7]

        Note that this is a proxy for testing membership on the location.

        >>> [i for i in range(10) if i in f.location]
        [3, 4, 5, 6, 7]
        """
        return value in self.location
# --- References


# TODO -- Will this hold PubMed and Medline information decently?
class Reference(object):
    """Represent a Generic Reference object.

    Attributes:
     - location - A list of Location objects specifying regions of
       the sequence that the references correspond to. If no locations are
       specified, the entire sequence is assumed.
     - authors - A big old string, or a list split by author, of authors
       for the reference.
     - consrtm - Starts as an empty string; only shown by str() when set.
     - title - The title of the reference.
     - journal - Journal the reference was published in.
     - medline_id - A medline reference for the article.
     - pubmed_id - A pubmed reference for the article.
     - comment - A place to stick any comments about the reference.
    """

    def __init__(self):
        """Create an empty reference: no locations, all text fields ''."""
        self.location = []
        # Every remaining attribute starts out as an empty string.
        for field_name in ("authors", "consrtm", "title", "journal",
                           "medline_id", "pubmed_id", "comment"):
            setattr(self, field_name, '')

    def __str__(self):
        """Output an informative string for debugging.
        """
        pieces = ["location: %s\n" % single_location
                  for single_location in self.location]
        pieces.append("authors: %s\n" % self.authors)
        # The consortium line is the only optional one.
        if self.consrtm:
            pieces.append("consrtm: %s\n" % self.consrtm)
        pieces.append("title: %s\n" % self.title)
        pieces.append("journal: %s\n" % self.journal)
        pieces.append("medline id: %s\n" % self.medline_id)
        pieces.append("pubmed id: %s\n" % self.pubmed_id)
        pieces.append("comment: %s\n" % self.comment)
        return "".join(pieces)

    def __repr__(self):
        """Return a short debugging representation showing only the title."""
        # TODO - Update this if __init__ later accepts values
        class_name = self.__class__.__name__
        return "%s(title=%r, ...)" % (class_name, self.title)
547
548 549 # --- Handling feature locations 550 551 -class FeatureLocation(object):
552 """Specify the location of a feature along a sequence. 553 554 The FeatureLocation is used for simple continous features, which can 555 be described as running from a start position to and end position 556 (optionally with a strand and reference information). More complex 557 locations made up from several non-continuous parts (e.g. a coding 558 sequence made up of several exons) are currently described using a 559 SeqFeature with sub-features. 560 561 Note that the start and end location numbering follow Python's scheme, 562 thus a GenBank entry of 123..150 (one based counting) becomes a location 563 of [122:150] (zero based counting). 564 565 >>> from Bio.SeqFeature import FeatureLocation 566 >>> f = FeatureLocation(122, 150) 567 >>> print(f) 568 [122:150] 569 >>> print(f.start) 570 122 571 >>> print(f.end) 572 150 573 >>> print(f.strand) 574 None 575 576 Note the strand defaults to None. If you are working with nucleotide 577 sequences you'd want to be explicit if it is the forward strand: 578 579 >>> from Bio.SeqFeature import FeatureLocation 580 >>> f = FeatureLocation(122, 150, strand=+1) 581 >>> print(f) 582 [122:150](+) 583 >>> print(f.strand) 584 1 585 586 Note that for a parent sequence of length n, the FeatureLocation 587 start and end must satisfy the inequality 0 <= start <= end <= n. 588 This means even for features on the reverse strand of a nucleotide 589 sequence, we expect the 'start' coordinate to be less than the 590 'end'. 591 592 >>> from Bio.SeqFeature import FeatureLocation 593 >>> r = FeatureLocation(122, 150, strand=-1) 594 >>> print(r) 595 [122:150](-) 596 >>> print(r.start) 597 122 598 >>> print(r.end) 599 150 600 >>> print(r.strand) 601 -1 602 603 i.e. Rather than thinking of the 'start' and 'end' biologically in a 604 strand aware manor, think of them as the 'left most' or 'minimum' 605 boundary, and the 'right most' or 'maximum' boundary of the region 606 being described. 
This is particularly important with compound 607 locations describing non-continuous regions. 608 609 In the example above we have used standard exact positions, but there 610 are also specialised position objects used to represent fuzzy positions 611 as well, for example a GenBank location like complement(<123..150) 612 would use a BeforePosition object for the start. 613 """ 614
615 - def __init__(self, start, end, strand=None, ref=None, ref_db=None):
616 """Specify the start, end, strand etc of a sequence feature. 617 618 start and end arguments specify the values where the feature begins 619 and ends. These can either by any of the ``*Position`` objects that 620 inherit from AbstractPosition, or can just be integers specifying the 621 position. In the case of integers, the values are assumed to be 622 exact and are converted in ExactPosition arguments. This is meant 623 to make it easy to deal with non-fuzzy ends. 624 625 i.e. Short form: 626 627 >>> from Bio.SeqFeature import FeatureLocation 628 >>> loc = FeatureLocation(5, 10, strand=-1) 629 >>> print(loc) 630 [5:10](-) 631 632 Explicit form: 633 634 >>> from Bio.SeqFeature import FeatureLocation, ExactPosition 635 >>> loc = FeatureLocation(ExactPosition(5), ExactPosition(10), strand=-1) 636 >>> print(loc) 637 [5:10](-) 638 639 Other fuzzy positions are used similarly, 640 641 >>> from Bio.SeqFeature import FeatureLocation 642 >>> from Bio.SeqFeature import BeforePosition, AfterPosition 643 >>> loc2 = FeatureLocation(BeforePosition(5), AfterPosition(10), strand=-1) 644 >>> print(loc2) 645 [<5:>10](-) 646 647 For nucleotide features you will also want to specify the strand, 648 use 1 for the forward (plus) strand, -1 for the reverse (negative) 649 strand, 0 for stranded but strand unknown (? in GFF3), or None for 650 when the strand does not apply (dot in GFF3), e.g. features on 651 proteins. 
652 653 >>> loc = FeatureLocation(5, 10, strand=+1) 654 >>> print(loc) 655 [5:10](+) 656 >>> print(loc.strand) 657 1 658 659 Normally feature locations are given relative to the parent 660 sequence you are working with, but an explicit accession can 661 be given with the optional ref and db_ref strings: 662 663 >>> loc = FeatureLocation(105172, 108462, ref="AL391218.9", strand=1) 664 >>> print(loc) 665 AL391218.9[105172:108462](+) 666 >>> print(loc.ref) 667 AL391218.9 668 669 """ 670 # TODO - Check 0 <= start <= end (<= length of reference) 671 if isinstance(start, AbstractPosition): 672 self._start = start 673 elif isinstance(start, int) or isinstance(start, long): 674 self._start = ExactPosition(start) 675 else: 676 raise TypeError("start=%r %s" % (start, type(start))) 677 if isinstance(end, AbstractPosition): 678 self._end = end 679 elif isinstance(end, int) or isinstance(end, long): 680 self._end = ExactPosition(end) 681 else: 682 raise TypeError("end=%r %s" % (end, type(end))) 683 self.strand = strand 684 self.ref = ref 685 self.ref_db = ref_db
686
687 - def _get_strand(self):
688 return self._strand
689
690 - def _set_strand(self, value):
691 if value not in [+1, -1, 0, None]: 692 raise ValueError("Strand should be +1, -1, 0 or None, not %r" 693 % value) 694 self._strand = value
695 696 strand = property(fget=_get_strand, fset=_set_strand, 697 doc="Strand of the location (+1, -1, 0 or None).") 698
699 - def __str__(self):
700 """Returns a representation of the location (with python counting). 701 702 For the simple case this uses the python splicing syntax, [122:150] 703 (zero based counting) which GenBank would call 123..150 (one based 704 counting). 705 """ 706 answer = "[%s:%s]" % (self._start, self._end) 707 if self.ref and self.ref_db: 708 answer = "%s:%s%s" % (self.ref_db, self.ref, answer) 709 elif self.ref: 710 answer = self.ref + answer 711 # Is ref_db without ref meaningful? 712 if self.strand is None: 713 return answer 714 elif self.strand == +1: 715 return answer + "(+)" 716 elif self.strand == -1: 717 return answer + "(-)" 718 else: 719 # strand = 0, stranded but strand unknown, ? in GFF3 720 return answer + "(?)"
721
722 - def __repr__(self):
723 """A string representation of the location for debugging.""" 724 optional = "" 725 if self.strand is not None: 726 optional += ", strand=%r" % self.strand 727 if self.ref is not None: 728 optional += ", ref=%r" % self.ref 729 if self.ref_db is not None: 730 optional += ", ref_db=%r" % self.ref_db 731 return "%s(%r, %r%s)" \ 732 % (self.__class__.__name__, self.start, self.end, optional)
733
734 - def __add__(self, other):
735 """Combine location with another feature location, or shift it. 736 737 You can add two feature locations to make a join CompoundLocation: 738 739 >>> from Bio.SeqFeature import FeatureLocation 740 >>> f1 = FeatureLocation(5, 10) 741 >>> f2 = FeatureLocation(20, 30) 742 >>> combined = f1 + f2 743 >>> print(combined) 744 join{[5:10], [20:30]} 745 746 This is thus equivalent to: 747 748 >>> from Bio.SeqFeature import CompoundLocation 749 >>> join = CompoundLocation([f1, f2]) 750 >>> print(join) 751 join{[5:10], [20:30]} 752 753 You can also use sum(...) in this way: 754 755 >>> join = sum([f1, f2]) 756 >>> print(join) 757 join{[5:10], [20:30]} 758 759 Furthermore, you can combine a FeatureLocation with a CompoundLocation 760 in this way. 761 762 Separately, adding an integer will give a new FeatureLocation with 763 its start and end offset by that amount. For example: 764 765 >>> print(f1) 766 [5:10] 767 >>> print(f1 + 100) 768 [105:110] 769 >>> print(200 + f1) 770 [205:210] 771 772 This can be useful when editing annotation. 773 """ 774 if isinstance(other, FeatureLocation): 775 return CompoundLocation([self, other]) 776 elif isinstance(other, int): 777 return self._shift(other) 778 else: 779 # This will allow CompoundLocation's __radd__ to be called: 780 return NotImplemented
781
782 - def __radd__(self, other):
783 if isinstance(other, int): 784 return self._shift(other) 785 else: 786 return NotImplemented
787
788 - def __nonzero__(self):
789 """Returns True regardless of the length of the feature. 790 791 This behaviour is for backwards compatibility, since until the 792 __len__ method was added, a FeatureLocation always evaluated as True. 793 794 Note that in comparison, Seq objects, strings, lists, etc, will all 795 evaluate to False if they have length zero. 796 797 WARNING: The FeatureLocation may in future evaluate to False when its 798 length is zero (in order to better match normal python behaviour)! 799 """ 800 return True
801
802 - def __len__(self):
803 """Returns the length of the region described by the FeatureLocation. 804 805 Note that extra care may be needed for fuzzy locations, e.g. 806 807 >>> from Bio.SeqFeature import FeatureLocation 808 >>> from Bio.SeqFeature import BeforePosition, AfterPosition 809 >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10)) 810 >>> len(loc) 811 5 812 """ 813 return int(self._end) - int(self._start)
814
815 - def __contains__(self, value):
816 """Check if an integer position is within the FeatureLocation. 817 818 Note that extra care may be needed for fuzzy locations, e.g. 819 820 >>> from Bio.SeqFeature import FeatureLocation 821 >>> from Bio.SeqFeature import BeforePosition, AfterPosition 822 >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10)) 823 >>> len(loc) 824 5 825 >>> [i for i in range(15) if i in loc] 826 [5, 6, 7, 8, 9] 827 """ 828 if not isinstance(value, int): 829 raise ValueError("Currently we only support checking for integer " 830 "positions being within a FeatureLocation.") 831 if value < self._start or value >= self._end: 832 return False 833 else: 834 return True
835
836 - def __iter__(self):
837 """Iterate over the parent positions within the FeatureLocation. 838 839 >>> from Bio.SeqFeature import FeatureLocation 840 >>> from Bio.SeqFeature import BeforePosition, AfterPosition 841 >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10)) 842 >>> len(loc) 843 5 844 >>> for i in loc: print(i) 845 5 846 6 847 7 848 8 849 9 850 >>> list(loc) 851 [5, 6, 7, 8, 9] 852 >>> [i for i in range(15) if i in loc] 853 [5, 6, 7, 8, 9] 854 855 Note this is strand aware: 856 857 >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10), strand = -1) 858 >>> list(loc) 859 [9, 8, 7, 6, 5] 860 """ 861 if self.strand == -1: 862 for i in range(self._end - 1, self._start - 1, -1): 863 yield i 864 else: 865 for i in range(self._start, self._end): 866 yield i
867
868 - def _shift(self, offset):
869 """Returns a copy of the location shifted by the offset (PRIVATE).""" 870 # TODO - What if offset is a fuzzy position? 871 if self.ref or self.ref_db: 872 # TODO - Return self? 873 raise ValueError("Feature references another sequence.") 874 return FeatureLocation(start=self._start._shift(offset), 875 end=self._end._shift(offset), 876 strand=self.strand)
877
878 - def _flip(self, length):
879 """Returns a copy of the location after the parent is reversed (PRIVATE).""" 880 if self.ref or self.ref_db: 881 # TODO - Return self? 882 raise ValueError("Feature references another sequence.") 883 # Note this will flip the start and end too! 884 if self.strand == +1: 885 flip_strand = -1 886 elif self.strand == -1: 887 flip_strand = +1 888 else: 889 # 0 or None 890 flip_strand = self.strand 891 return FeatureLocation(start=self._end._flip(length), 892 end=self._start._flip(length), 893 strand=flip_strand)
894 895 @property
896 - def parts(self):
897 """Read only list of parts (always one, the Feature Location). 898 899 This is a convience property allowing you to write code handling 900 both simple FeatureLocation objects (with one part) and more complex 901 CompoundLocation objects (with multiple parts) interchangably. 902 """ 903 return [self]
904 905 @property
906 - def start(self):
907 """Start location (integer like, possibly a fuzzy position, read only).""" 908 return self._start
909 910 @property
911 - def end(self):
912 """End location (integer like, possibly a fuzzy position, read only).""" 913 return self._end
914 915 @property
916 - def nofuzzy_start(self):
917 """Start position (integer, approximated if fuzzy, read only) (OBSOLETE). 918 919 This is now an alias for int(feature.start), which should be 920 used in preference -- unless you are trying to support old 921 versions of Biopython. 922 """ 923 try: 924 return int(self._start) 925 except TypeError: 926 if isinstance(self._start, UnknownPosition): 927 return None 928 raise

@property
def nofuzzy_end(self):
    """End position as a plain integer, or None if unknown (OBSOLETE).

    Equivalent to int(feature.end), which is the preferred spelling
    unless old versions of Biopython must be supported. When the integer
    conversion raises TypeError the result is None for an
    UnknownPosition end; for anything else the TypeError propagates.
    """
    finish = self._end
    try:
        value = int(finish)
    except TypeError:
        if not isinstance(finish, UnknownPosition):
            raise
        value = None
    return value
944
def extract(self, parent_sequence):
    """Extract feature sequence from the supplied parent sequence.

    The parent is sliced from nofuzzy_start to nofuzzy_end; for a
    location on strand -1 the slice is reverse complemented before it
    is returned.

    Raises ValueError if the location has a ``ref`` or ``ref_db`` set,
    i.e. it points at another sequence.
    """
    refers_elsewhere = self.ref or self.ref_db
    if refers_elsewhere:
        # TODO - Take a dictionary as an optional argument?
        raise ValueError("Feature references another sequence.")
    source = parent_sequence
    if isinstance(source, MutableSeq):
        # This avoids complications with reverse complements
        # (the MutableSeq reverse complement acts in situ)
        source = source.toseq()
    fragment = source[self.nofuzzy_start:self.nofuzzy_end]
    if self.strand != -1:
        return fragment
    try:
        fragment = fragment.reverse_complement()
    except AttributeError:
        # No reverse_complement method, so expect a plain string and
        # fall back on the reverse_complement function instead.
        assert isinstance(fragment, str)
        fragment = reverse_complement(fragment)
    return fragment
962


class CompoundLocation(object):
    """For handling joins etc where a feature location has several parts.

    Instances carry two public attributes set by the constructor:
    ``parts`` (a list of FeatureLocation objects) and ``operator``
    (a string such as "join").
    """

    def __init__(self, parts, operator="join"):
        """Create a compound location with several parts.

        Arguments:
         - parts - iterable of FeatureLocation objects (at least two); it
           is copied into a new list, so any iterable is accepted.
         - operator - string describing how the parts are combined,
           by default "join".

        Raises ValueError if any part is not a FeatureLocation, or if
        fewer than two parts are given.

        >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation
        >>> f1 = FeatureLocation(10, 40, strand=+1)
        >>> f2 = FeatureLocation(50, 59, strand=+1)
        >>> f = CompoundLocation([f1, f2])
        >>> len(f) == len(f1) + len(f2) == 39 == len(list(f))
        True
        >>> print(f.operator)
        join
        >>> 5 in f
        False
        >>> 15 in f
        True
        >>> f.strand
        1

        Notice that the strand of the compound location is computed
        automatically - in the case of mixed strands on the sub-locations
        the overall strand is set to None.

        >>> f = CompoundLocation([FeatureLocation(3, 6, strand=+1),
        ...                       FeatureLocation(10, 13, strand=-1)])
        >>> print(f.strand)
        None
        >>> len(f)
        6
        >>> list(f)
        [3, 4, 5, 12, 11, 10]

        The example above doing list(f) iterates over the coordinates within the
        feature. This allows you to use max and min on the location, to find the
        range covered:

        >>> min(f)
        3
        >>> max(f)
        12

        More generally, you can use the compound location's start and end which
        give the full range covered, 0 <= start <= end <= full sequence length.

        >>> f.start == min(f)
        True
        >>> f.end == max(f) + 1
        True

        This is consistent with the behaviour of the simple FeatureLocation for
        a single region, where again the 'start' and 'end' do not necessarily
        give the biological start and end, but rather the 'minimal' and 'maximal'
        coordinate boundaries.

        Note that adding locations provides a more intuitive method of
        construction:

        >>> f = FeatureLocation(3, 6, strand=+1) + FeatureLocation(10, 13, strand=-1)
        >>> len(f)
        6
        >>> list(f)
        [3, 4, 5, 12, 11, 10]
        """
        self.operator = operator
        self.parts = list(parts)
        for loc in self.parts:
            if not isinstance(loc, FeatureLocation):
                raise ValueError("CompoundLocation should be given a list of "
                                 "FeatureLocation objects, not %s" % loc.__class__)
        # Count the stored list rather than the argument, which may be a
        # generator (no len) and has been consumed by list() above anyway.
        if len(self.parts) < 2:
            # Wrap in a one-tuple so the %r always formats the whole list
            raise ValueError(
                "CompoundLocation should have at least 2 parts, not %r"
                % (self.parts,))

    def __str__(self):
        """Returns a representation of the location (with python counting)."""
        return "%s{%s}" % (self.operator, ", ".join(str(loc) for loc in self.parts))

    def __repr__(self):
        """String representation of the location for debugging."""
        return "%s(%r, %r)" % (self.__class__.__name__,
                               self.parts, self.operator)

    def _get_strand(self):
        # Historically a join on the reverse strand has been represented
        # in Biopython with both the parent SeqFeature and its children
        # (the exons for a CDS) all given a strand of -1.  Likewise, for
        # a join feature on the forward strand they all have strand +1.
        # However, we must also consider evil mixed strand examples like
        # this, join(complement(69611..69724),139856..140087,140625..140650)
        if len(set(loc.strand for loc in self.parts)) == 1:
            return self.parts[0].strand
        else:
            return None  # i.e. mixed strands

    def _set_strand(self, value):
        # Should this be allowed/encouraged?
        for loc in self.parts:
            loc.strand = value

    strand = property(fget=_get_strand, fset=_set_strand,
                      doc="""Overall strand of the compound location.

        If all the parts have the same strand, that is returned. Otherwise
        for mixed strands, this returns None.

        >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation
        >>> f1 = FeatureLocation(15, 17, strand=1)
        >>> f2 = FeatureLocation(20, 30, strand=-1)
        >>> f = f1 + f2
        >>> f1.strand
        1
        >>> f2.strand
        -1
        >>> f.strand
        >>> f.strand is None
        True

        If you set the strand of a CompoundLocation, this is applied to
        all the parts - use with caution:

        >>> f.strand = 1
        >>> f1.strand
        1
        >>> f2.strand
        1
        >>> f.strand
        1

        """)

    def __add__(self, other):
        """Combine locations, or shift the location by an integer offset.

        >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation
        >>> f1 = FeatureLocation(15, 17) + FeatureLocation(20, 30)
        >>> print(f1)
        join{[15:17], [20:30]}

        You can add another FeatureLocation:

        >>> print(f1 + FeatureLocation(40, 50))
        join{[15:17], [20:30], [40:50]}
        >>> print(FeatureLocation(5, 10) + f1)
        join{[5:10], [15:17], [20:30]}

        You can also add another CompoundLocation:

        >>> f2 = FeatureLocation(40, 50) + FeatureLocation(60, 70)
        >>> print(f2)
        join{[40:50], [60:70]}
        >>> print(f1 + f2)
        join{[15:17], [20:30], [40:50], [60:70]}

        Also, as with the FeatureLocation, adding an integer shifts the
        location's co-ordinates by that offset:

        >>> print(f1 + 100)
        join{[115:117], [120:130]}
        >>> print(200 + f1)
        join{[215:217], [220:230]}
        >>> print(f1 + (-5))
        join{[10:12], [15:25]}

        Raises ValueError when adding a CompoundLocation with a different
        operator, and NotImplementedError for any other type.
        """
        if isinstance(other, FeatureLocation):
            return CompoundLocation(self.parts + [other], self.operator)
        elif isinstance(other, CompoundLocation):
            if self.operator != other.operator:
                # Handle join+order -> order as a special case?
                raise ValueError("Mixed operators %s and %s"
                                 % (self.operator, other.operator))
            return CompoundLocation(self.parts + other.parts, self.operator)
        elif isinstance(other, int):
            return self._shift(other)
        else:
            raise NotImplementedError

    def __radd__(self, other):
        """Combine locations, or shift by an integer, with self on the right.

        A FeatureLocation on the left is placed before the existing parts;
        an integer shifts the location. Any other type raises
        NotImplementedError.
        """
        if isinstance(other, FeatureLocation):
            return CompoundLocation([other] + self.parts, self.operator)
        elif isinstance(other, int):
            return self._shift(other)
        else:
            raise NotImplementedError

    def __contains__(self, value):
        """Check if an integer position is within the location."""
        return any(value in loc for loc in self.parts)

    def __nonzero__(self):
        """Returns True regardless of the length of the feature.

        This behaviour is for backwards compatibility, since until the
        __len__ method was added, a location always evaluated as True.

        Note that in comparison, Seq objects, strings, lists, etc, will all
        evaluate to False if they have length zero.

        WARNING: The location may in future evaluate to False when its
        length is zero (in order to better match normal python behaviour)!
        """
        return True

    # Python 3 looks up __bool__ (not __nonzero__) and would otherwise fall
    # back on __len__, making a zero length location evaluate as False.
    __bool__ = __nonzero__

    def __len__(self):
        """Total length, the sum of the lengths of all the parts."""
        return sum(len(loc) for loc in self.parts)

    def __iter__(self):
        """Iterate over the positions of each part in turn, in part order."""
        for loc in self.parts:
            for pos in loc:
                yield pos

    def _shift(self, offset):
        """Returns a copy of the location shifted by the offset (PRIVATE)."""
        return CompoundLocation([loc._shift(offset) for loc in self.parts],
                                self.operator)

    def _flip(self, length):
        """Returns a copy of the location after the parent is reversed (PRIVATE).

        Note that the order of the parts is NOT reversed too. Consider a CDS
        on the forward strand with exons small, medium and large (in length).
        Once we change the frame of reference to the reverse complement strand,
        the start codon is still part of the small exon, and the stop codon
        still part of the large exon - so the part order remains the same!

        Here is an artificial example, where the features map to the two upper
        case regions and the lower case runs of n are not used:

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import FeatureLocation
        >>> dna = Seq("nnnnnAGCATCCTGCTGTACnnnnnnnnGAGAMTGCCATGCCCCTGGAGTGAnnnnn")
        >>> small = FeatureLocation(5, 20, strand=1)
        >>> large = FeatureLocation(28, 52, strand=1)
        >>> location = small + large
        >>> print(small)
        [5:20](+)
        >>> print(large)
        [28:52](+)
        >>> print(location)
        join{[5:20](+), [28:52](+)}
        >>> for part in location.parts:
        ...     print(len(part))
        ...
        15
        24

        As you can see, this is a silly example where each "exon" is a word:

        >>> print(small.extract(dna).translate())
        SILLY
        >>> print(large.extract(dna).translate())
        EXAMPLE*
        >>> print(location.extract(dna).translate())
        SILLYEXAMPLE*
        >>> for part in location.parts:
        ...     print(part.extract(dna).translate())
        ...
        SILLY
        EXAMPLE*

        Now, let's look at this from the reverse strand frame of reference:

        >>> flipped_dna = dna.reverse_complement()
        >>> flipped_location = location._flip(len(dna))
        >>> print(flipped_location.extract(flipped_dna).translate())
        SILLYEXAMPLE*
        >>> for part in flipped_location.parts:
        ...     print(part.extract(flipped_dna).translate())
        ...
        SILLY
        EXAMPLE*

        The key point here is the first part of the CompoundLocation is still the
        small exon, while the second part is still the large exon:

        >>> for part in flipped_location.parts:
        ...     print(len(part))
        ...
        15
        24
        >>> print(flipped_location)
        join{[37:52](-), [5:29](-)}

        Notice the parts are not reversed. However, there was a bug here in older
        versions of Biopython which would have given join{[5:29](-), [37:52](-)}
        and the translation would have wrongly been "EXAMPLE*SILLY" instead.

        """
        return CompoundLocation([loc._flip(length) for loc in self.parts],
                                self.operator)

    @property
    def start(self):
        """Start location (integer like, possibly a fuzzy position, read only).

        This is the smallest start of any of the parts.
        """
        return min(loc.start for loc in self.parts)

    @property
    def end(self):
        """End location (integer like, possibly a fuzzy position, read only).

        This is the largest end of any of the parts.
        """
        return max(loc.end for loc in self.parts)

    @property
    def nofuzzy_start(self):
        """Start position (integer, approximated if fuzzy, read only) (OBSOLETE).

        This is an alias for int(feature.start), which should be used in
        preference -- unless you are trying to support old versions of
        Biopython.
        """
        try:
            return int(self.start)
        except TypeError:
            if isinstance(self.start, UnknownPosition):
                return None
            raise

    @property
    def nofuzzy_end(self):
        """End position (integer, approximated if fuzzy, read only) (OBSOLETE).

        This is an alias for int(feature.end), which should be used in
        preference -- unless you are trying to support old versions of
        Biopython.
        """
        try:
            return int(self.end)
        except TypeError:
            if isinstance(self.end, UnknownPosition):
                return None
            raise

    @property
    def ref(self):
        """CompoundLocation's don't have a ref (dummy method for API compatibility)."""
        return None

    @property
    def ref_db(self):
        """CompoundLocation's don't have a ref_db (dummy method for API compatibility)."""
        return None

    def extract(self, parent_sequence):
        """Extract feature sequence from the supplied parent sequence.

        Each part is extracted in turn and the pieces are concatenated
        in part order.
        """
        # This copes with mixed strand features & all on reverse:
        parts = [loc.extract(parent_sequence) for loc in self.parts]
        # We use addition rather than a join to avoid alphabet issues:
        f_seq = parts[0]
        for part in parts[1:]:
            f_seq += part
        return f_seq
1318


class AbstractPosition(object):
    """Abstract base class representing a position."""

    def __repr__(self):
        """Debugging representation: the class name followed by ``(...)``."""
        class_name = self.__class__.__name__
        return class_name + "(...)"
1326


class ExactPosition(int, AbstractPosition):
    """Specify the specific position of a boundary.

    Arguments:
     - position - The position of the boundary.
     - extension - An optional argument which must be zero since we don't
       have an extension. The argument is provided so that the same number
       of arguments can be passed to all position types.

    In this case, there is no fuzziness associated with the position.

    >>> p = ExactPosition(5)
    >>> p
    ExactPosition(5)
    >>> print(p)
    5

    >>> isinstance(p, AbstractPosition)
    True
    >>> isinstance(p, int)
    True

    Integer comparisons and operations should work as expected:

    >>> p == 5
    True
    >>> p < 6
    True
    >>> p <= 5
    True
    >>> p + 10
    15

    """

    def __new__(cls, position, extension=0):
        """Create the integer-like object; a non-zero extension is an error."""
        if extension == 0:
            return int.__new__(cls, position)
        raise AttributeError("Non-zero extension %s for exact position."
                             % extension)

    def __repr__(self):
        """String representation of the ExactPosition location for debugging."""
        class_name = self.__class__.__name__
        return "%s(%i)" % (class_name, int(self))

    @property
    def position(self):
        """Legacy attribute to get position as integer (OBSOLETE)."""
        as_integer = int(self)
        return as_integer

    @property
    def extension(self):
        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
        return 0

    def _shift(self, offset):
        # Build the result via our own class so any subclass is preserved
        same_class = self.__class__
        return same_class(int(self) + offset)

    def _flip(self, length):
        # Build the result via our own class so any subclass is preserved
        same_class = self.__class__
        return same_class(length - int(self))
1388


class UncertainPosition(ExactPosition):
    """Specify a specific position which is uncertain.

    UniProt uses this, e.g. ?222 for the uncertain position 222, and its
    XML format can mark a position explicitly as uncertain. It does not
    apply to GenBank/EMBL.

    Nothing is added to or overridden from ExactPosition.
    """
1397


class UnknownPosition(AbstractPosition):
    """Specify a specific position which is unknown (has no position).

    UniProt uses this, e.g. ? in the flat file, or unknown in the XML.
    """

    def __repr__(self):
        """String representation of the UnknownPosition location for debugging."""
        class_name = self.__class__.__name__
        return class_name + "()"

    def __hash__(self):
        """Hash exactly as None does."""
        nothing = None
        return hash(nothing)

    @property
    def position(self):
        """Legacy attribute to get position (None) (OBSOLETE)."""
        return None

    @property
    def extension(self):
        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
        return 0

    def _shift(self, offset):
        # The offset is ignored, the very same object is handed back
        unchanged = self
        return unchanged

    def _flip(self, length):
        # The length is ignored, the very same object is handed back
        unchanged = self
        return unchanged
1427


class WithinPosition(int, AbstractPosition):
    """Specify the position of a boundary within some coordinates.

    Arguments:
     - position - The default integer position
     - left - The start (left) position of the boundary
     - right - The end (right) position of the boundary

    This allows dealing with a position like ((1.4)..100). This
    indicates that the start of the sequence is somewhere between 1
    and 4. Since this is a start coordinate, it should act like
    it is at position 1 (or in Python counting, 0).

    >>> p = WithinPosition(10, 10, 13)
    >>> p
    WithinPosition(10, left=10, right=13)
    >>> print(p)
    (10.13)
    >>> int(p)
    10

    Basic integer comparisons and operations should work as though
    this were a plain integer:

    >>> p == 10
    True
    >>> p in [9, 10, 11]
    True
    >>> p < 11
    True
    >>> p + 10
    20

    >>> isinstance(p, WithinPosition)
    True
    >>> isinstance(p, AbstractPosition)
    True
    >>> isinstance(p, int)
    True

    Note this also applies for comparison to other position objects,
    where again the integer behaviour is used:

    >>> p == 10
    True
    >>> p == ExactPosition(10)
    True
    >>> p == BeforePosition(10)
    True
    >>> p == AfterPosition(10)
    True

    If this were an end point, you would want the position to be 13:

    >>> p2 = WithinPosition(13, 10, 13)
    >>> p2
    WithinPosition(13, left=10, right=13)
    >>> print(p2)
    (10.13)
    >>> int(p2)
    13
    >>> p2 == 13
    True
    >>> p2 == ExactPosition(13)
    True

    The old legacy properties of position and extension give the
    starting/lower/left position as an integer, and the distance
    to the ending/higher/right position as an integer. Note that
    the position object will act like either the left or the right
    end-point depending on how it was created:

    >>> p.position == p2.position == 10
    True
    >>> p.extension == p2.extension == 3
    True
    >>> int(p) == int(p2)
    False
    >>> p == 10
    True
    >>> p2 == 13
    True

    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, left, right):
        """Create the integer-like object and record the left/right bounds.

        The position must equal either left or right (checked with an
        assert, so AssertionError is raised otherwise).
        """
        # The % operator is required here: without it the message string
        # would be *called* and a failed check would give a TypeError.
        assert position == left or position == right, \
            "WithinPosition: %r should match left %r or right %r" \
            % (position, left, right)
        obj = int.__new__(cls, position)
        obj._left = left
        obj._right = right
        return obj

    def __getnewargs__(self):
        """Return the arguments accepted by __new__.

        Needed so copying and pickling can rebuild the object; the default
        inherited from int would only supply the integer value, leaving
        __new__ without its left and right arguments.
        """
        return (int(self), self._left, self._right)

    def __repr__(self):
        """String representation of the WithinPosition location for debugging."""
        return "%s(%i, left=%i, right=%i)" \
            % (self.__class__.__name__, int(self),
               self._left, self._right)

    def __str__(self):
        """Text form showing the range as ``(left.right)``."""
        return "(%s.%s)" % (self._left, self._right)

    @property
    def position(self):
        """Legacy attribute to get (left) position as integer (OBSOLETE)."""
        return self._left

    @property
    def extension(self):
        """Legacy attribute to get extension (from left to right) as an integer (OBSOLETE)."""
        return self._right - self._left

    def _shift(self, offset):
        # Move the default position and both bounds by the same amount
        return self.__class__(int(self) + offset,
                              self._left + offset,
                              self._right + offset)

    def _flip(self, length):
        # Left and right swap roles once measured from the other end
        return self.__class__(length - int(self),
                              length - self._right,
                              length - self._left)
1550


class BetweenPosition(int, AbstractPosition):
    """Specify the position of a boundary between two coordinates (OBSOLETE?).

    Arguments:
     - position - The default integer position
     - left - The start (left) position of the boundary
     - right - The end (right) position of the boundary

    This allows dealing with a position like 123^456. This
    indicates that the start of the sequence is somewhere between
    123 and 456. It is up to the parser to set the position argument
    to either boundary point (depending on if this is being used as
    a start or end of the feature). For example as a feature end:

    >>> p = BetweenPosition(456, 123, 456)
    >>> p
    BetweenPosition(456, left=123, right=456)
    >>> print(p)
    (123^456)
    >>> int(p)
    456

    Integer equality and comparison use the given position,

    >>> p == 456
    True
    >>> p in [455, 456, 457]
    True
    >>> p > 300
    True

    The old legacy properties of position and extension give the
    starting/lower/left position as an integer, and the distance
    to the ending/higher/right position as an integer. Note that
    the position object will act like either the left or the right
    end-point depending on how it was created:

    >>> p2 = BetweenPosition(123, left=123, right=456)
    >>> p.position == p2.position == 123
    True
    >>> p.extension
    333
    >>> p2.extension
    333
    >>> p.extension == p2.extension == 333
    True
    >>> int(p) == int(p2)
    False
    >>> p == 456
    True
    >>> p2 == 123
    True

    Note this potentially surprising behaviour:

    >>> BetweenPosition(123, left=123, right=456) == ExactPosition(123)
    True
    >>> BetweenPosition(123, left=123, right=456) == BeforePosition(123)
    True
    >>> BetweenPosition(123, left=123, right=456) == AfterPosition(123)
    True

    i.e. For equality (and sorting) the position objects behave like
    integers.
    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, left, right):
        """Create the integer-like object and record the left/right bounds.

        The position must equal either left or right (checked with an
        assert, so AssertionError is raised otherwise).
        """
        assert position == left or position == right, \
            "BetweenPosition: %r should match left %r or right %r" \
            % (position, left, right)
        obj = int.__new__(cls, position)
        obj._left = left
        obj._right = right
        return obj

    def __getnewargs__(self):
        """Return the arguments accepted by __new__.

        Needed so copying and pickling can rebuild the object; the default
        inherited from int would only supply the integer value, leaving
        __new__ without its left and right arguments.
        """
        return (int(self), self._left, self._right)

    def __repr__(self):
        """String representation of the BetweenPosition location for debugging."""
        return "%s(%i, left=%i, right=%i)" \
            % (self.__class__.__name__, int(self),
               self._left, self._right)

    def __str__(self):
        """Text form showing the range as ``(left^right)``."""
        return "(%s^%s)" % (self._left, self._right)

    @property
    def position(self):
        """Legacy attribute to get (left) position as integer (OBSOLETE)."""
        return self._left

    @property
    def extension(self):
        """Legacy attribute to get extension (from left to right) as an integer (OBSOLETE)."""
        return self._right - self._left

    def _shift(self, offset):
        # Move the default position and both bounds by the same amount
        return self.__class__(int(self) + offset,
                              self._left + offset,
                              self._right + offset)

    def _flip(self, length):
        # Left and right swap roles once measured from the other end
        return self.__class__(length - int(self),
                              length - self._right,
                              length - self._left)
1652


class BeforePosition(int, AbstractPosition):
    """Specify a position where the actual location occurs before it.

    Arguments:
     - position - The upper boundary of where the location can occur.
     - extension - An optional argument which must be zero since we don't
       have an extension. The argument is provided so that the same number
       of arguments can be passed to all position types.

    Positions like (<10..100), where the location occurs somewhere before
    position 10, are described with this class.

    >>> p = BeforePosition(5)
    >>> p
    BeforePosition(5)
    >>> print(p)
    <5
    >>> int(p)
    5
    >>> p + 10
    15

    Note this potentially surprising behaviour:

    >>> p == ExactPosition(5)
    True
    >>> p == AfterPosition(5)
    True

    Just remember that for equality and sorting the position objects act
    like integers.
    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, extension=0):
        """Create the integer-like object; a non-zero extension is an error."""
        if extension == 0:
            return int.__new__(cls, position)
        raise AttributeError("Non-zero extension %s for exact position."
                             % extension)

    @property
    def position(self):
        """Legacy attribute to get position as integer (OBSOLETE)."""
        as_integer = int(self)
        return as_integer

    @property
    def extension(self):
        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
        return 0

    def __repr__(self):
        """A string representation of the location for debugging."""
        class_name = self.__class__.__name__
        return "%s(%i)" % (class_name, int(self))

    def __str__(self):
        """Text form, the position with a ``<`` prefix."""
        return "<%s" % self.position

    def _shift(self, offset):
        # Build the result via our own class so any subclass is preserved
        same_class = self.__class__
        return same_class(int(self) + offset)

    def _flip(self, length):
        # Seen from the other end, "before" turns into "after"
        mirrored = length - int(self)
        return AfterPosition(mirrored)
1715


class AfterPosition(int, AbstractPosition):
    """Specify a position where the actual location is found after it.

    Arguments:
     - position - The lower boundary of where the location can occur.
     - extension - An optional argument which must be zero since we don't
       have an extension. The argument is provided so that the same number
       of arguments can be passed to all position types.

    Positions like (>10..100), where the location occurs somewhere after
    position 10, are described with this class.

    >>> p = AfterPosition(7)
    >>> p
    AfterPosition(7)
    >>> print(p)
    >7
    >>> int(p)
    7
    >>> p + 10
    17

    >>> isinstance(p, AfterPosition)
    True
    >>> isinstance(p, AbstractPosition)
    True
    >>> isinstance(p, int)
    True

    Note this potentially surprising behaviour:

    >>> p == ExactPosition(7)
    True
    >>> p == BeforePosition(7)
    True

    Just remember that for equality and sorting the position objects act
    like integers.
    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, extension=0):
        """Create the integer-like object; a non-zero extension is an error."""
        if extension == 0:
            return int.__new__(cls, position)
        raise AttributeError("Non-zero extension %s for exact position."
                             % extension)

    @property
    def position(self):
        """Legacy attribute to get position as integer (OBSOLETE)."""
        as_integer = int(self)
        return as_integer

    @property
    def extension(self):
        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
        return 0

    def __repr__(self):
        """A string representation of the location for debugging."""
        class_name = self.__class__.__name__
        return "%s(%i)" % (class_name, int(self))

    def __str__(self):
        """Text form, the position with a ``>`` prefix."""
        return ">%s" % self.position

    def _shift(self, offset):
        # Build the result via our own class so any subclass is preserved
        same_class = self.__class__
        return same_class(int(self) + offset)

    def _flip(self, length):
        # Seen from the other end, "after" turns into "before"
        mirrored = length - int(self)
        return BeforePosition(mirrored)
1785


class OneOfPosition(int, AbstractPosition):
    """Specify a position where the location can be multiple positions.

    This models the GenBank 'one-of(1888,1901)' function, and tries
    to make this fit within the Biopython Position models. If this was
    a start position it should act like 1888, but as an end position 1901.

    >>> p = OneOfPosition(1888, [ExactPosition(1888), ExactPosition(1901)])
    >>> p
    OneOfPosition(1888, choices=[ExactPosition(1888), ExactPosition(1901)])
    >>> int(p)
    1888

    Integer comparisons and operators act like using int(p),

    >>> p == 1888
    True
    >>> p <= 1888
    True
    >>> p > 1888
    False
    >>> p + 100
    1988

    >>> isinstance(p, OneOfPosition)
    True
    >>> isinstance(p, AbstractPosition)
    True
    >>> isinstance(p, int)
    True

    The old legacy properties of position and extension give the
    starting/lowest/left-most position as an integer, and the
    distance to the ending/highest/right-most position as an integer.
    Note that the position object will act like one of the list of
    possible locations depending on how it was created:

    >>> p2 = OneOfPosition(1901, [ExactPosition(1888), ExactPosition(1901)])
    >>> p.position == p2.position == 1888
    True
    >>> p.extension == p2.extension == 13
    True
    >>> int(p) == int(p2)
    False
    >>> p == 1888
    True
    >>> p2 == 1901
    True

    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, choices):
        """Initialize with a set of possible positions.

        Arguments:
         - position - an integer specifying the default behaviour; it
           must be one of the choices (checked with an assert, so
           AssertionError is raised otherwise).
         - choices - a list of AbstractPosition derived objects,
           specifying possible locations. It is stored as given on the
           ``position_choices`` attribute.
        """
        assert position in choices, \
            "OneOfPosition: %r should match one of %r" % (position, choices)
        obj = int.__new__(cls, position)
        obj.position_choices = choices
        return obj

    def __getnewargs__(self):
        """Return the arguments accepted by __new__.

        Needed so copying and pickling can rebuild the object; the default
        inherited from int would only supply the integer value, leaving
        __new__ without its choices argument.
        """
        return (int(self), self.position_choices)

    @property
    def position(self):
        """Legacy attribute to get (left) position as integer (OBSOLETE)."""
        return min(int(pos) for pos in self.position_choices)

    @property
    def extension(self):
        """Legacy attribute to get extension as integer (OBSOLETE)."""
        positions = [int(pos) for pos in self.position_choices]
        return max(positions) - min(positions)

    def __repr__(self):
        """String representation of the OneOfPosition location for debugging."""
        return "%s(%i, choices=%r)" % (self.__class__.__name__,
                                       int(self), self.position_choices)

    def __str__(self):
        """Text form, ``one-of(...)`` with the choices separated by commas."""
        # Joining (rather than appending "x," and trimming the final
        # character) keeps the opening bracket even with no choices.
        return "one-of(%s)" % ",".join(str(position)
                                       for position in self.position_choices)

    def _shift(self, offset):
        # Shift the default position and every choice by the same amount
        return self.__class__(int(self) + offset,
                              [p._shift(offset) for p in self.position_choices])

    def _flip(self, length):
        # The choices are flipped individually and listed in reverse order
        return self.__class__(length - int(self),
                              [p._flip(length) for p in self.position_choices[::-1]])
1882


class PositionGap(object):
    """Simple class to hold information about a gap between positions."""

    def __init__(self, gap_size):
        """Initialize with a position object containing the gap information.

        The argument is stored unchanged as the ``gap_size`` attribute.
        """
        self.gap_size = gap_size

    def __repr__(self):
        """A string representation of the position gap for debugging."""
        class_name = self.__class__.__name__
        size_text = repr(self.gap_size)
        return "%s(%s)" % (class_name, size_text)

    def __str__(self):
        """Text form, ``gap(...)`` around the gap size."""
        return "gap(%s)" % self.gap_size


# Script entry point: when this module is executed directly (rather than
# imported), hand over to Biopython's run_doctest helper -- presumably this
# runs the doctest examples embedded in the docstrings of this module.
if __name__ == "__main__":
    from Bio._utils import run_doctest
    run_doctest()