Package Bio :: Package GenBank
[hide private]
[frames] | [no frames]

Source Code for Package Bio.GenBank

   1  # Copyright 2000 by Jeffrey Chang, Brad Chapman.  All rights reserved. 
   2  # Copyright 2006-2013 by Peter Cock.  All rights reserved. 
   3  # 
   4  # This code is part of the Biopython distribution and governed by its 
   5  # license.  Please see the LICENSE file that should have been included 
   6  # as part of this package. 
   7   
   8  """Code to work with GenBank formatted files. 
   9   
  10  Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with 
  11  the "genbank" or "embl" format names to parse GenBank or EMBL files into 
  12  SeqRecord and SeqFeature objects (see the Biopython tutorial for details). 
  13   
  14  Using Bio.GenBank directly to parse GenBank files is only useful if you want 
  15  to obtain GenBank-specific Record objects, which is a much closer 
  16  representation to the raw file contents than the SeqRecord alternative from 
  17  the FeatureParser (used in Bio.SeqIO). 
  18   
  19  To use the Bio.GenBank parser, there are two helper functions: 
  20   
  21      - read                  Parse a handle containing a single GenBank record 
  22        as Bio.GenBank specific Record objects. 
  23      - parse                 Iterate over a handle containing multiple GenBank 
  24        records as Bio.GenBank specific Record objects. 
  25   
  26  The following internal classes are not intended for direct use and may 
  27  be deprecated in a future release. 
  28   
  29  Classes: 
  30   
  31      - Iterator              Iterate through a file of GenBank entries 
  32      - ErrorFeatureParser    Catch errors caused during parsing. 
  33      - FeatureParser         Parse GenBank data into SeqRecord and SeqFeature objects. 
  34      - RecordParser          Parse GenBank data into a Record object. 
  35   
  36  Exceptions: 
  37   
  38      - ParserFailureError    Exception indicating a failure in the parser (ie. 
  39        scanner or consumer) 
  40      - LocationParserError   Exception indicating a problem with the spark based 
  41        location parser. 
  42   
  43  """ 
  44  from __future__ import print_function 
  45   
  46  import re 
  47  import sys  # for checking if Python 2 
  48   
  49  # other Biopython stuff 
  50  from Bio import SeqFeature 
  51   
  52  # other Bio.GenBank stuff 
  53  from .utils import FeatureValueCleaner 
  54  from .Scanner import GenBankScanner 
  55   
  56  __docformat__ = "restructuredtext en" 
  57   
  58  # Constants used to parse GenBank header lines 
  59  GENBANK_INDENT = 12 
  60  GENBANK_SPACER = " " * GENBANK_INDENT 
  61   
  62  # Constants for parsing GenBank feature lines 
  63  FEATURE_KEY_INDENT = 5 
  64  FEATURE_QUALIFIER_INDENT = 21 
  65  FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT 
  66  FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT 
  67   
  68  # Regular expressions for location parsing 
  69  _solo_location = r"[<>]?\d+" 
  70  _pair_location = r"[<>]?\d+\.\.[<>]?\d+" 
  71  _between_location = r"\d+\^\d+" 
  72   
  73  _within_position = r"\(\d+\.\d+\)" 
  74  _re_within_position = re.compile(_within_position) 
  75  _within_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \ 
  76                     % (_within_position, _within_position) 
  77  assert _re_within_position.match("(3.9)") 
  78  assert re.compile(_within_location).match("(3.9)..10") 
  79  assert re.compile(_within_location).match("26..(30.33)") 
  80  assert re.compile(_within_location).match("(13.19)..(20.28)") 
  81   
  82  _oneof_position = r"one\-of\(\d+(,\d+)+\)" 
  83  _re_oneof_position = re.compile(_oneof_position) 
  84  _oneof_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \ 
  85                     % (_oneof_position, _oneof_position) 
  86  assert _re_oneof_position.match("one-of(6,9)") 
  87  assert re.compile(_oneof_location).match("one-of(6,9)..101") 
  88  assert re.compile(_oneof_location).match("one-of(6,9)..one-of(101,104)") 
  89  assert re.compile(_oneof_location).match("6..one-of(101,104)") 
  90   
  91  assert not _re_oneof_position.match("one-of(3)") 
  92  assert _re_oneof_position.match("one-of(3,6)") 
  93  assert _re_oneof_position.match("one-of(3,6,9)") 
  94   
  95   
  96  _simple_location = r"\d+\.\.\d+" 
  97  _re_simple_location = re.compile(r"^%s$" % _simple_location) 
  98  _re_simple_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$" 
  99                                   % (_simple_location, _simple_location)) 
 100  _complex_location = r"([a-zA-z][a-zA-Z0-9_]*(\.[a-zA-Z0-9]+)?\:)?(%s|%s|%s|%s|%s)" \ 
 101                      % (_pair_location, _solo_location, _between_location, 
 102                         _within_location, _oneof_location) 
 103  _re_complex_location = re.compile(r"^%s$" % _complex_location) 
 104  _possibly_complemented_complex_location = r"(%s|complement\(%s\))" \ 
 105                                            % (_complex_location, _complex_location) 
 106  _re_complex_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$" 
 107                                   % (_possibly_complemented_complex_location, 
 108                                      _possibly_complemented_complex_location)) 
 109   
 110   
 111  assert _re_simple_location.match("104..160") 
 112  assert not _re_simple_location.match("68451760..68452073^68452074") 
 113  assert not _re_simple_location.match("<104..>160") 
 114  assert not _re_simple_location.match("104") 
 115  assert not _re_simple_location.match("<1") 
 116  assert not _re_simple_location.match(">99999") 
 117  assert not _re_simple_location.match("join(104..160,320..390,504..579)") 
 118  assert not _re_simple_compound.match("bond(12,63)") 
 119  assert _re_simple_compound.match("join(104..160,320..390,504..579)") 
 120  assert _re_simple_compound.match("order(1..69,1308..1465)") 
 121  assert not _re_simple_compound.match("order(1..69,1308..1465,1524)") 
 122  assert not _re_simple_compound.match("join(<1..442,992..1228,1524..>1983)") 
 123  assert not _re_simple_compound.match("join(<1..181,254..336,422..497,574..>590)") 
 124  assert not _re_simple_compound.match("join(1475..1577,2841..2986,3074..3193,3314..3481,4126..>4215)") 
 125  assert not _re_simple_compound.match("test(1..69,1308..1465)") 
 126  assert not _re_simple_compound.match("complement(1..69)") 
 127  assert not _re_simple_compound.match("(1..69)") 
 128  assert _re_complex_location.match("(3.9)..10") 
 129  assert _re_complex_location.match("26..(30.33)") 
 130  assert _re_complex_location.match("(13.19)..(20.28)") 
 131  assert _re_complex_location.match("41^42")  # between 
 132  assert _re_complex_location.match("AL121804:41^42") 
 133  assert _re_complex_location.match("AL121804:41..610") 
 134  assert _re_complex_location.match("AL121804.2:41..610") 
 135  assert _re_complex_location.match("one-of(3,6)..101") 
 136  assert _re_complex_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)") 
 137  assert not _re_simple_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)") 
 138  assert _re_complex_compound.match("join(complement(69611..69724),139856..140650)") 
 139   
 140  # Trans-spliced example from NC_016406, note underscore in reference name: 
 141  assert _re_complex_location.match("NC_016402.1:6618..6676") 
 142  assert _re_complex_location.match("181647..181905") 
 143  assert _re_complex_compound.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 144  assert not _re_complex_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 145  assert not _re_simple_compound.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 146  assert not _re_complex_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 147  assert not _re_simple_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 148   
 149  _solo_bond = re.compile("bond\(%s\)" % _solo_location) 
 150  assert _solo_bond.match("bond(196)") 
 151  assert _solo_bond.search("bond(196)") 
 152  assert _solo_bond.search("join(bond(284),bond(305),bond(309),bond(305))") 
 153   
 154   
155 -def _pos(pos_str, offset=0):
156 """Build a Position object (PRIVATE). 157 158 For an end position, leave offset as zero (default): 159 160 >>> _pos("5") 161 ExactPosition(5) 162 163 For a start position, set offset to minus one (for Python counting): 164 165 >>> _pos("5", -1) 166 ExactPosition(4) 167 168 This also covers fuzzy positions: 169 170 >>> p = _pos("<5") 171 >>> p 172 BeforePosition(5) 173 >>> print(p) 174 <5 175 >>> int(p) 176 5 177 178 >>> _pos(">5") 179 AfterPosition(5) 180 181 By default assumes an end position, so note the integer behaviour: 182 183 >>> p = _pos("one-of(5,8,11)") 184 >>> p 185 OneOfPosition(11, choices=[ExactPosition(5), ExactPosition(8), ExactPosition(11)]) 186 >>> print(p) 187 one-of(5,8,11) 188 >>> int(p) 189 11 190 191 >>> _pos("(8.10)") 192 WithinPosition(10, left=8, right=10) 193 194 Fuzzy start positions: 195 196 >>> p = _pos("<5", -1) 197 >>> p 198 BeforePosition(4) 199 >>> print(p) 200 <4 201 >>> int(p) 202 4 203 204 Notice how the integer behaviour changes too! 205 206 >>> p = _pos("one-of(5,8,11)", -1) 207 >>> p 208 OneOfPosition(4, choices=[ExactPosition(4), ExactPosition(7), ExactPosition(10)]) 209 >>> print(p) 210 one-of(4,7,10) 211 >>> int(p) 212 4 213 214 """ 215 if pos_str.startswith("<"): 216 return SeqFeature.BeforePosition(int(pos_str[1:])+offset) 217 elif pos_str.startswith(">"): 218 return SeqFeature.AfterPosition(int(pos_str[1:])+offset) 219 elif _re_within_position.match(pos_str): 220 s, e = pos_str[1:-1].split(".") 221 s = int(s) + offset 222 e = int(e) + offset 223 if offset == -1: 224 default = s 225 else: 226 default = e 227 return SeqFeature.WithinPosition(default, left=s, right=e) 228 elif _re_oneof_position.match(pos_str): 229 assert pos_str.startswith("one-of(") 230 assert pos_str[-1]==")" 231 parts = [SeqFeature.ExactPosition(int(pos)+offset) 232 for pos in pos_str[7:-1].split(",")] 233 if offset == -1: 234 default = min(int(pos) for pos in parts) 235 else: 236 default = max(int(pos) for pos in parts) 237 return 
SeqFeature.OneOfPosition(default, choices=parts) 238 else: 239 return SeqFeature.ExactPosition(int(pos_str)+offset)
240 241
242 -def _loc(loc_str, expected_seq_length, strand):
243 """FeatureLocation from non-compound non-complement location (PRIVATE). 244 245 Simple examples, 246 247 >>> _loc("123..456", 1000, +1) 248 FeatureLocation(ExactPosition(122), ExactPosition(456), strand=1) 249 >>> _loc("<123..>456", 1000, strand = -1) 250 FeatureLocation(BeforePosition(122), AfterPosition(456), strand=-1) 251 252 A more complex location using within positions, 253 254 >>> _loc("(9.10)..(20.25)", 1000, 1) 255 FeatureLocation(WithinPosition(8, left=8, right=9), WithinPosition(25, left=20, right=25), strand=1) 256 257 Notice how that will act as though it has overall start 8 and end 25. 258 259 Zero length between feature, 260 261 >>> _loc("123^124", 1000, 0) 262 FeatureLocation(ExactPosition(123), ExactPosition(123), strand=0) 263 264 The expected sequence length is needed for a special case, a between 265 position at the start/end of a circular genome: 266 267 >>> _loc("1000^1", 1000, 1) 268 FeatureLocation(ExactPosition(1000), ExactPosition(1000), strand=1) 269 270 Apart from this special case, between positions P^Q must have P+1==Q, 271 272 >>> _loc("123^456", 1000, 1) 273 Traceback (most recent call last): 274 ... 275 ValueError: Invalid between location '123^456' 276 """ 277 try: 278 s, e = loc_str.split("..") 279 except ValueError: 280 assert ".." not in loc_str 281 if "^" in loc_str: 282 # A between location like "67^68" (one based counting) is a 283 # special case (note it has zero length). In python slice 284 # notation this is 67:67, a zero length slice. See Bug 2622 285 # Further more, on a circular genome of length N you can have 286 # a location N^1 meaning the junction at the origin. See Bug 3098. 287 # NOTE - We can imagine between locations like "2^4", but this 288 # is just "3". 
Similarly, "2^5" is just "3..4" 289 s, e = loc_str.split("^") 290 if int(s)+1==int(e): 291 pos = _pos(s) 292 elif int(s)==expected_seq_length and e=="1": 293 pos = _pos(s) 294 else: 295 raise ValueError("Invalid between location %s" % repr(loc_str)) 296 return SeqFeature.FeatureLocation(pos, pos, strand) 297 else: 298 # e.g. "123" 299 s = loc_str 300 e = loc_str 301 return SeqFeature.FeatureLocation(_pos(s, -1), _pos(e), strand)
302 303
304 -def _split_compound_loc(compound_loc):
305 """Split a tricky compound location string (PRIVATE). 306 307 >>> list(_split_compound_loc("123..145")) 308 ['123..145'] 309 >>> list(_split_compound_loc("123..145,200..209")) 310 ['123..145', '200..209'] 311 >>> list(_split_compound_loc("one-of(200,203)..300")) 312 ['one-of(200,203)..300'] 313 >>> list(_split_compound_loc("complement(123..145),200..209")) 314 ['complement(123..145)', '200..209'] 315 >>> list(_split_compound_loc("123..145,one-of(200,203)..209")) 316 ['123..145', 'one-of(200,203)..209'] 317 >>> list(_split_compound_loc("123..145,one-of(200,203)..one-of(209,211),300")) 318 ['123..145', 'one-of(200,203)..one-of(209,211)', '300'] 319 >>> list(_split_compound_loc("123..145,complement(one-of(200,203)..one-of(209,211)),300")) 320 ['123..145', 'complement(one-of(200,203)..one-of(209,211))', '300'] 321 >>> list(_split_compound_loc("123..145,200..one-of(209,211),300")) 322 ['123..145', '200..one-of(209,211)', '300'] 323 >>> list(_split_compound_loc("123..145,200..one-of(209,211)")) 324 ['123..145', '200..one-of(209,211)'] 325 >>> list(_split_compound_loc("complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905")) 326 ['complement(149815..150200)', 'complement(293787..295573)', 'NC_016402.1:6618..6676', '181647..181905'] 327 """ 328 if "one-of(" in compound_loc: 329 # Hard case 330 while "," in compound_loc: 331 assert compound_loc[0] != "," 332 assert compound_loc[0:2] != ".." 
333 i = compound_loc.find(",") 334 part = compound_loc[:i] 335 compound_loc = compound_loc[i:] # includes the comma 336 while part.count("(") > part.count(")"): 337 assert "one-of(" in part, (part, compound_loc) 338 i = compound_loc.find(")") 339 part += compound_loc[:i+1] 340 compound_loc = compound_loc[i+1:] 341 if compound_loc.startswith(".."): 342 i = compound_loc.find(",") 343 if i==-1: 344 part += compound_loc 345 compound_loc = "" 346 else: 347 part += compound_loc[:i] 348 compound_loc = compound_loc[i:] # includes the comma 349 while part.count("(") > part.count(")"): 350 assert part.count("one-of(") == 2 351 i = compound_loc.find(")") 352 part += compound_loc[:i+1] 353 compound_loc = compound_loc[i+1:] 354 if compound_loc.startswith(","): 355 compound_loc = compound_loc[1:] 356 assert part 357 yield part 358 if compound_loc: 359 yield compound_loc 360 else: 361 # Easy case 362 for part in compound_loc.split(","): 363 yield part
364 365
366 -class Iterator(object):
367 """Iterator interface to move over a file of GenBank entries one at a time (OBSOLETE). 368 369 This class is likely to be deprecated in a future release of Biopython. 370 Please use Bio.SeqIO.parse(..., format="gb") or Bio.GenBank.parse(...) 371 for SeqRecord and GenBank specific Record objects respectively instead. 372 """
373 - def __init__(self, handle, parser=None):
374 """Initialize the iterator. 375 376 Arguments: 377 378 - handle - A handle with GenBank entries to iterate through. 379 - parser - An optional parser to pass the entries through before 380 returning them. If None, then the raw entry will be returned. 381 """ 382 self.handle = handle 383 self._parser = parser
384
385 - def __next__(self):
386 """Return the next GenBank record from the handle. 387 388 Will return None if we ran out of records. 389 """ 390 if self._parser is None: 391 lines = [] 392 while True: 393 line = self.handle.readline() 394 if not line: 395 return None # Premature end of file? 396 lines.append(line) 397 if line.rstrip() == "//": 398 break 399 return "".join(lines) 400 try: 401 return self._parser.parse(self.handle) 402 except StopIteration: 403 return None
404 405 if sys.version_info[0] < 3:
406 - def next(self):
407 """Python 2 style alias for Python 3 style __next__ method.""" 408 return self.__next__()
409
410 - def __iter__(self):
411 return iter(self.__next__, None)
412 413
414 -class ParserFailureError(Exception):
415 """Failure caused by some kind of problem in the parser. 416 """ 417 pass
418 419
420 -class LocationParserError(Exception):
421 """Could not Properly parse out a location from a GenBank file. 422 """ 423 pass
424 425
426 -class FeatureParser(object):
427 """Parse GenBank files into Seq + Feature objects (OBSOLETE). 428 429 Direct use of this class is discouraged, and may be deprecated in 430 a future release of Biopython. 431 432 Please use Bio.SeqIO.parse(...) or Bio.SeqIO.read(...) instead. 433 """
434 - def __init__(self, debug_level=0, use_fuzziness=1, 435 feature_cleaner=FeatureValueCleaner()):
436 """Initialize a GenBank parser and Feature consumer. 437 438 Arguments: 439 440 - debug_level - An optional argument that species the amount of 441 debugging information the parser should spit out. By default we have 442 no debugging info (the fastest way to do things), but if you want 443 you can set this as high as two and see exactly where a parse fails. 444 - use_fuzziness - Specify whether or not to use fuzzy representations. 445 The default is 1 (use fuzziness). 446 - feature_cleaner - A class which will be used to clean out the 447 values of features. This class must implement the function 448 clean_value. GenBank.utils has a "standard" cleaner class, which 449 is used by default. 450 """ 451 self._scanner = GenBankScanner(debug_level) 452 self.use_fuzziness = use_fuzziness 453 self._cleaner = feature_cleaner
454
455 - def parse(self, handle):
456 """Parse the specified handle. 457 """ 458 self._consumer = _FeatureConsumer(self.use_fuzziness, 459 self._cleaner) 460 self._scanner.feed(handle, self._consumer) 461 return self._consumer.data
462 463
464 -class RecordParser(object):
465 """Parse GenBank files into Record objects (OBSOLETE). 466 467 Direct use of this class is discouraged, and may be deprecated in 468 a future release of Biopython. 469 470 Please use the Bio.GenBank.parse(...) or Bio.GenBank.read(...) functions 471 instead. 472 """
473 - def __init__(self, debug_level=0):
474 """Initialize the parser. 475 476 Arguments: 477 478 - debug_level - An optional argument that species the amount of 479 debugging information the parser should spit out. By default we have 480 no debugging info (the fastest way to do things), but if you want 481 you can set this as high as two and see exactly where a parse fails. 482 """ 483 self._scanner = GenBankScanner(debug_level)
484
485 - def parse(self, handle):
486 """Parse the specified handle into a GenBank record. 487 """ 488 self._consumer = _RecordConsumer() 489 490 self._scanner.feed(handle, self._consumer) 491 return self._consumer.data
492 493
494 -class _BaseGenBankConsumer(object):
495 """Abstract GenBank consumer providing useful general functions (PRIVATE). 496 497 This just helps to eliminate some duplication in things that most 498 GenBank consumers want to do. 499 """ 500 # Special keys in GenBank records that we should remove spaces from 501 # For instance, \translation keys have values which are proteins and 502 # should have spaces and newlines removed from them. This class 503 # attribute gives us more control over specific formatting problems. 504 remove_space_keys = ["translation"] 505
506 - def __init__(self):
507 pass
508
509 - def _unhandled(self, data):
510 pass
511
512 - def __getattr__(self, attr):
513 return self._unhandled
514
515 - def _split_keywords(self, keyword_string):
516 """Split a string of keywords into a nice clean list. 517 """ 518 # process the keywords into a python list 519 if keyword_string == "" or keyword_string == ".": 520 keywords = "" 521 elif keyword_string[-1] == '.': 522 keywords = keyword_string[:-1] 523 else: 524 keywords = keyword_string 525 keyword_list = keywords.split(';') 526 clean_keyword_list = [x.strip() for x in keyword_list] 527 return clean_keyword_list
528
529 - def _split_accessions(self, accession_string):
530 """Split a string of accession numbers into a list. 531 """ 532 # first replace all line feeds with spaces 533 # Also, EMBL style accessions are split with ';' 534 accession = accession_string.replace("\n", " ").replace(";", " ") 535 536 return [x.strip() for x in accession.split() if x.strip()]
537
538 - def _split_taxonomy(self, taxonomy_string):
539 """Split a string with taxonomy info into a list. 540 """ 541 if not taxonomy_string or taxonomy_string==".": 542 # Missing data, no taxonomy 543 return [] 544 545 if taxonomy_string[-1] == '.': 546 tax_info = taxonomy_string[:-1] 547 else: 548 tax_info = taxonomy_string 549 tax_list = tax_info.split(';') 550 new_tax_list = [] 551 for tax_item in tax_list: 552 new_items = tax_item.split("\n") 553 new_tax_list.extend(new_items) 554 while '' in new_tax_list: 555 new_tax_list.remove('') 556 clean_tax_list = [x.strip() for x in new_tax_list] 557 558 return clean_tax_list
559
560 - def _clean_location(self, location_string):
561 """Clean whitespace out of a location string. 562 563 The location parser isn't a fan of whitespace, so we clean it out 564 before feeding it into the parser. 565 """ 566 # Originally this imported string.whitespace and did a replace 567 # via a loop. It's simpler to just split on whitespace and rejoin 568 # the string - and this avoids importing string too. See Bug 2684. 569 return ''.join(location_string.split())
570
571 - def _remove_newlines(self, text):
572 """Remove any newlines in the passed text, returning the new string. 573 """ 574 # get rid of newlines in the qualifier value 575 newlines = ["\n", "\r"] 576 for ws in newlines: 577 text = text.replace(ws, "") 578 579 return text
580
581 - def _normalize_spaces(self, text):
582 """Replace multiple spaces in the passed text with single spaces. 583 """ 584 # get rid of excessive spaces 585 return ' '.join(x for x in text.split(" ") if x)
586
587 - def _remove_spaces(self, text):
588 """Remove all spaces from the passed text. 589 """ 590 return text.replace(" ", "")
591
592 - def _convert_to_python_numbers(self, start, end):
593 """Convert a start and end range to python notation. 594 595 In GenBank, starts and ends are defined in "biological" coordinates, 596 where 1 is the first base and [i, j] means to include both i and j. 597 598 In python, 0 is the first base and [i, j] means to include i, but 599 not j. 600 601 So, to convert "biological" to python coordinates, we need to 602 subtract 1 from the start, and leave the end and things should 603 be converted happily. 604 """ 605 new_start = start - 1 606 new_end = end 607 608 return new_start, new_end
609 610
611 -class _FeatureConsumer(_BaseGenBankConsumer):
612 """Create a SeqRecord object with Features to return (PRIVATE). 613 614 Attributes: 615 616 - use_fuzziness - specify whether or not to parse with fuzziness in 617 feature locations. 618 - feature_cleaner - a class that will be used to provide specialized 619 cleaning-up of feature values. 620 """
621 - def __init__(self, use_fuzziness, feature_cleaner=None):
622 from Bio.SeqRecord import SeqRecord 623 _BaseGenBankConsumer.__init__(self) 624 self.data = SeqRecord(None, id=None) 625 self.data.id = None 626 self.data.description = "" 627 628 self._use_fuzziness = use_fuzziness 629 self._feature_cleaner = feature_cleaner 630 631 self._seq_type = '' 632 self._seq_data = [] 633 self._cur_reference = None 634 self._cur_feature = None 635 self._expected_size = None
636
637 - def locus(self, locus_name):
638 """Set the locus name is set as the name of the Sequence. 639 """ 640 self.data.name = locus_name
641
642 - def size(self, content):
643 """Record the sequence length.""" 644 self._expected_size = int(content)
645
646 - def residue_type(self, type):
647 """Record the sequence type so we can choose an appropriate alphabet. 648 """ 649 self._seq_type = type.strip()
650
651 - def data_file_division(self, division):
652 self.data.annotations['data_file_division'] = division
653
654 - def date(self, submit_date):
655 self.data.annotations['date'] = submit_date
656
657 - def definition(self, definition):
658 """Set the definition as the description of the sequence. 659 """ 660 if self.data.description: 661 # Append to any existing description 662 # e.g. EMBL files with two DE lines. 663 self.data.description += " " + definition 664 else: 665 self.data.description = definition
666
667 - def accession(self, acc_num):
668 """Set the accession number as the id of the sequence. 669 670 If we have multiple accession numbers, the first one passed is 671 used. 672 """ 673 new_acc_nums = self._split_accessions(acc_num) 674 675 # Also record them ALL in the annotations 676 try: 677 # On the off chance there was more than one accession line: 678 for acc in new_acc_nums: 679 # Prevent repeat entries 680 if acc not in self.data.annotations['accessions']: 681 self.data.annotations['accessions'].append(acc) 682 except KeyError: 683 self.data.annotations['accessions'] = new_acc_nums 684 685 # if we haven't set the id information yet, add the first acc num 686 if not self.data.id: 687 if len(new_acc_nums) > 0: 688 # self.data.id = new_acc_nums[0] 689 # Use the FIRST accession as the ID, not the first on this line! 690 self.data.id = self.data.annotations['accessions'][0]
691
692 - def wgs(self, content):
693 self.data.annotations['wgs'] = content.split('-')
694
695 - def add_wgs_scafld(self, content):
696 self.data.annotations.setdefault('wgs_scafld', []).append(content.split('-'))
697
698 - def nid(self, content):
699 self.data.annotations['nid'] = content
700
701 - def pid(self, content):
702 self.data.annotations['pid'] = content
703
704 - def version(self, version_id):
705 # Want to use the versioned accession as the record.id 706 # This comes from the VERSION line in GenBank files, or the 707 # obsolete SV line in EMBL. For the new EMBL files we need 708 # both the version suffix from the ID line and the accession 709 # from the AC line. 710 if version_id.count(".")==1 and version_id.split(".")[1].isdigit(): 711 self.accession(version_id.split(".")[0]) 712 self.version_suffix(version_id.split(".")[1]) 713 elif version_id: 714 # For backwards compatibility... 715 self.data.id = version_id
716
717 - def project(self, content):
718 """Handle the information from the PROJECT line as a list of projects. 719 720 e.g.:: 721 722 PROJECT GenomeProject:28471 723 724 or:: 725 726 PROJECT GenomeProject:13543 GenomeProject:99999 727 728 This is stored as dbxrefs in the SeqRecord to be consistent with the 729 projected switch of this line to DBLINK in future GenBank versions. 730 Note the NCBI plan to replace "GenomeProject:28471" with the shorter 731 "Project:28471" as part of this transition. 732 """ 733 content = content.replace("GenomeProject:", "Project:") 734 self.data.dbxrefs.extend(p for p in content.split() if p)
735 768
769 - def version_suffix(self, version):
770 """Set the version to overwrite the id. 771 772 Since the verison provides the same information as the accession 773 number, plus some extra info, we set this as the id if we have 774 a version. 775 """ 776 # e.g. GenBank line: 777 # VERSION U49845.1 GI:1293613 778 # or the obsolete EMBL line: 779 # SV U49845.1 780 # Scanner calls consumer.version("U49845.1") 781 # which then calls consumer.version_suffix(1) 782 # 783 # e.g. EMBL new line: 784 # ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 785 # Scanner calls consumer.version_suffix(1) 786 assert version.isdigit() 787 self.data.annotations['sequence_version'] = int(version)
788
789 - def db_source(self, content):
790 self.data.annotations['db_source'] = content.rstrip()
791
792 - def gi(self, content):
793 self.data.annotations['gi'] = content
794
795 - def keywords(self, content):
796 if 'keywords' in self.data.annotations: 797 # Multi-line keywords, append to list 798 # Note EMBL states "A keyword is never split between lines." 799 self.data.annotations['keywords'].extend(self._split_keywords(content)) 800 else: 801 self.data.annotations['keywords'] = self._split_keywords(content)
802
803 - def segment(self, content):
804 self.data.annotations['segment'] = content
805
806 - def source(self, content):
807 # Note that some software (e.g. VectorNTI) may produce an empty 808 # source (rather than using a dot/period as might be expected). 809 if content == "": 810 source_info = "" 811 elif content[-1] == '.': 812 source_info = content[:-1] 813 else: 814 source_info = content 815 self.data.annotations['source'] = source_info
816
817 - def organism(self, content):
818 self.data.annotations['organism'] = content
819
820 - def taxonomy(self, content):
821 """Records (another line of) the taxonomy lineage. 822 """ 823 lineage = self._split_taxonomy(content) 824 try: 825 self.data.annotations['taxonomy'].extend(lineage) 826 except KeyError: 827 self.data.annotations['taxonomy'] = lineage
828
829 - def reference_num(self, content):
830 """Signal the beginning of a new reference object. 831 """ 832 # if we have a current reference that hasn't been added to 833 # the list of references, add it. 834 if self._cur_reference is not None: 835 self.data.annotations['references'].append(self._cur_reference) 836 else: 837 self.data.annotations['references'] = [] 838 839 self._cur_reference = SeqFeature.Reference()
840
841 - def reference_bases(self, content):
842 """Attempt to determine the sequence region the reference entails. 843 844 Possible types of information we may have to deal with: 845 846 (bases 1 to 86436) 847 (sites) 848 (bases 1 to 105654; 110423 to 111122) 849 1 (residues 1 to 182) 850 """ 851 # first remove the parentheses or other junk 852 ref_base_info = content[1:-1] 853 854 all_locations = [] 855 # parse if we've got 'bases' and 'to' 856 if 'bases' in ref_base_info and 'to' in ref_base_info: 857 # get rid of the beginning 'bases' 858 ref_base_info = ref_base_info[5:] 859 locations = self._split_reference_locations(ref_base_info) 860 all_locations.extend(locations) 861 elif 'residues' in ref_base_info and 'to' in ref_base_info: 862 residues_start = ref_base_info.find("residues") 863 # get only the information after "residues" 864 ref_base_info = ref_base_info[(residues_start + len("residues ")):] 865 locations = self._split_reference_locations(ref_base_info) 866 all_locations.extend(locations) 867 868 # make sure if we are not finding information then we have 869 # the string 'sites' or the string 'bases' 870 elif (ref_base_info == 'sites' or 871 ref_base_info.strip() == 'bases'): 872 pass 873 # otherwise raise an error 874 else: 875 raise ValueError("Could not parse base info %s in record %s" % 876 (ref_base_info, self.data.id)) 877 878 self._cur_reference.location = all_locations
879
880 - def _split_reference_locations(self, location_string):
881 """Get reference locations out of a string of reference information 882 883 The passed string should be of the form:: 884 885 1 to 20; 20 to 100 886 887 This splits the information out and returns a list of location objects 888 based on the reference locations. 889 """ 890 # split possibly multiple locations using the ';' 891 all_base_info = location_string.split(';') 892 893 new_locations = [] 894 for base_info in all_base_info: 895 start, end = base_info.split('to') 896 new_start, new_end = \ 897 self._convert_to_python_numbers(int(start.strip()), 898 int(end.strip())) 899 this_location = SeqFeature.FeatureLocation(new_start, new_end) 900 new_locations.append(this_location) 901 return new_locations
902
def authors(self, content):
    """Record (or continue across lines) the AUTHORS text of the current reference."""
    existing = self._cur_reference.authors
    if existing:
        self._cur_reference.authors = existing + ' ' + content
    else:
        self._cur_reference.authors = content
908
def consrtm(self, content):
    """Record (or continue across lines) the CONSRTM text of the current reference."""
    existing = self._cur_reference.consrtm
    if existing:
        self._cur_reference.consrtm = existing + ' ' + content
    else:
        self._cur_reference.consrtm = content
914
def title(self, content):
    """Record (or continue across lines) the TITLE of the current reference.

    Some malformed records carry a TITLE with no preceding REFERENCE
    line; warn and drop the title in that case.
    """
    if self._cur_reference is None:
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn("GenBank TITLE line without REFERENCE line.",
                      BiopythonParserWarning)
        return
    if self._cur_reference.title:
        self._cur_reference.title += ' ' + content
    else:
        self._cur_reference.title = content
925
def journal(self, content):
    """Record (or continue across lines) the JOURNAL text of the current reference."""
    existing = self._cur_reference.journal
    if existing:
        self._cur_reference.journal = existing + ' ' + content
    else:
        self._cur_reference.journal = content
931
def medline_id(self, content):
    """Store the MEDLINE identifier of the current reference."""
    self._cur_reference.medline_id = content
934
def pubmed_id(self, content):
    """Store the PUBMED identifier of the current reference."""
    self._cur_reference.pubmed_id = content
937
def remark(self, content):
    """Record (or continue across lines) a REMARK comment on the current reference."""
    existing = self._cur_reference.comment
    if existing:
        self._cur_reference.comment = existing + ' ' + content
    else:
        self._cur_reference.comment = content
944
def comment(self, content):
    """Append COMMENT lines to the record's 'comment' annotation.

    *content* is a list of lines; repeated COMMENT sections are joined
    with newlines onto any text already stored.
    """
    annotations = self.data.annotations
    joined = "\n".join(content)
    if 'comment' in annotations:
        annotations['comment'] += "\n" + joined
    else:
        annotations['comment'] = joined
950
def features_line(self, content):
    """Get ready for the feature table when we reach the FEATURES line.

    The line's own text carries no information, so simply delegate.
    """
    self.start_feature_table()
955
def start_feature_table(self):
    """Note the start of the feature table.

    Any reference still being built is flushed into the record's
    'references' annotation before feature parsing begins.
    """
    pending = self._cur_reference
    if pending is None:
        return
    self.data.annotations['references'].append(pending)
    self._cur_reference = None
963
def feature_key(self, content):
    """Begin a new SeqFeature of the given type and append it to the record."""
    feature = SeqFeature.SeqFeature()
    feature.type = content
    self._cur_feature = feature
    self.data.features.append(feature)
969
def location(self, content):
    """Parse out location information from the location string.

    This uses simple Python code with some regular expressions to do the
    parsing, and then translates the results into appropriate objects.

    Sets self._cur_feature.location (a FeatureLocation or
    CompoundLocation, or None when the line cannot be parsed, in which
    case a BiopythonParserWarning is issued).
    """
    # clean up newlines and other whitespace inside the location before
    # parsing - locations should have no whitespace whatsoever
    location_line = self._clean_location(content)

    # Older records have junk like replace(266,"c") in the
    # location line. Newer records just replace this with
    # the number 266 and have the information in a more reasonable
    # place. So we'll just grab out the number and feed this to the
    # parser. We shouldn't really be losing any info this way.
    if 'replace' in location_line:
        comma_pos = location_line.find(',')
        # [8:] skips the literal "replace(" prefix
        location_line = location_line[8:comma_pos]

    cur_feature = self._cur_feature

    # Handle top level complement here for speed
    if location_line.startswith("complement("):
        assert location_line.endswith(")")
        location_line = location_line[11:-1]
        strand = -1
    elif "PROTEIN" in self._seq_type.upper():
        # Proteins have no meaningful strand.
        strand = None
    else:
        # Assume nucleotide otherwise feature strand for
        # GenBank files with bad LOCUS lines set to None
        strand = 1

    # Special case handling of the most common cases for speed
    if _re_simple_location.match(location_line):
        # e.g. "123..456" — convert to Python 0-based half-open coordinates
        s, e = location_line.split("..")
        cur_feature.location = SeqFeature.FeatureLocation(int(s)-1,
                                                          int(e),
                                                          strand)
        return

    if _solo_bond.search(location_line):
        # e.g. bond(196)
        # e.g. join(bond(284),bond(305),bond(309),bond(305))
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn("Dropping bond qualifier in feature location", BiopythonParserWarning)
        # There ought to be a better way to do this...
        # Rewrite each bond(N) as plain N, then fall through to the
        # normal parsing below.
        for x in _solo_bond.finditer(location_line):
            x = x.group()
            location_line = location_line.replace(x, x[5:-1])

    if _re_simple_compound.match(location_line):
        # e.g. join(<123..456,480..>500)
        i = location_line.find("(")
        # cur_feature.location_operator = location_line[:i]
        # we can split on the comma because these are simple locations
        sub_features = cur_feature.sub_features
        for part in location_line[i+1:-1].split(","):
            s, e = part.split("..")
            f = SeqFeature.SeqFeature(SeqFeature.FeatureLocation(int(s)-1,
                                                                 int(e),
                                                                 strand),
                                      location_operator=cur_feature.location_operator,
                                      type=cur_feature.type)
            sub_features.append(f)
        # s = cur_feature.sub_features[0].location.start
        # e = cur_feature.sub_features[-1].location.end
        # cur_feature.location = SeqFeature.FeatureLocation(s,e, strand)
        # TODO - Remove use of sub_features
        if strand == -1:
            # Reverse-strand joins are listed right-to-left in GenBank.
            cur_feature.location = SeqFeature.CompoundLocation([f.location for f in sub_features[::-1]],
                                                               operator=location_line[:i])
        else:
            cur_feature.location = SeqFeature.CompoundLocation([f.location for f in sub_features],
                                                               operator=location_line[:i])
        return

    # Handle the general case with more complex regular expressions
    if _re_complex_location.match(location_line):
        # e.g. "AL121804.2:41..610"
        if ":" in location_line:
            location_ref, location_line = location_line.split(":")
            cur_feature.location = _loc(location_line, self._expected_size, strand)
            cur_feature.location.ref = location_ref
        else:
            cur_feature.location = _loc(location_line, self._expected_size, strand)
        return

    if _re_complex_compound.match(location_line):
        i = location_line.find("(")
        # cur_feature.location_operator = location_line[:i]
        # Can't split on the comma because of positions like one-of(1,2,3)
        sub_features = cur_feature.sub_features
        for part in _split_compound_loc(location_line[i+1:-1]):
            if part.startswith("complement("):
                assert part[-1]==")"
                part = part[11:-1]
                assert strand != -1, "Double complement?"
                part_strand = -1
            else:
                part_strand = strand
            if ":" in part:
                ref, part = part.split(":")
            else:
                ref = None
            try:
                loc = _loc(part, self._expected_size, part_strand)
            except ValueError as err:
                # NOTE(review): these debug prints go to stdout from
                # library code; a warning or richer exception message
                # would be cleaner — confirm before changing behaviour.
                print(location_line)
                print(part)
                raise err
            f = SeqFeature.SeqFeature(location=loc, ref=ref,
                                      location_operator=cur_feature.location_operator,
                                      type=cur_feature.type)
            sub_features.append(f)
        # Historically a join on the reverse strand has been represented
        # in Biopython with both the parent SeqFeature and its children
        # (the exons for a CDS) all given a strand of -1. Likewise, for
        # a join feature on the forward strand they all have strand +1.
        # However, we must also consider evil mixed strand examples like
        # this, join(complement(69611..69724),139856..140087,140625..140650)
        #
        # TODO - Remove use of sub_features
        strands = set(sf.strand for sf in sub_features)
        if len(strands)==1:
            strand = sub_features[0].strand
        else:
            strand = None  # i.e. mixed strands
        if strand == -1:
            # Reverse the backwards order used in GenBank files
            cur_feature.location = SeqFeature.CompoundLocation([f.location for f in sub_features[::-1]],
                                                               operator=location_line[:i])
        else:
            cur_feature.location = SeqFeature.CompoundLocation([f.location for f in sub_features],
                                                               operator=location_line[:i])
        return
    # Not recognised
    if "order" in location_line and "join" in location_line:
        # See Bug 3197
        msg = 'Combinations of "join" and "order" within the same ' + \
              'location (nested operators) are illegal:\n' + location_line
        raise LocationParserError(msg)
    # This used to be an error....
    cur_feature.location = None
    import warnings
    from Bio import BiopythonParserWarning
    warnings.warn(BiopythonParserWarning("Couldn't parse feature location: %r"
                                         % (location_line)))
1120
def feature_qualifier(self, key, value):
    """Store one qualifier key/value pair on the current feature.

    Can receive None, since you can have valueless keys such as /pseudo.
    """
    qualifiers = self._cur_feature.qualifiers
    if value is None:
        # Hack to try to preserve historical behaviour of /pseudo etc:
        # a valueless key maps to a single empty string, and repeats of
        # the same key are ignored.
        qualifiers.setdefault(key, [""])
        return

    value = value.replace('"', '')
    if self._feature_cleaner is not None:
        value = self._feature_cleaner.clean_value(key, value)

    # Append to an existing list for this key, or start a new one.
    qualifiers.setdefault(key, []).append(value)
1145
def feature_qualifier_name(self, content_list):
    """Use feature_qualifier instead (OBSOLETE).

    Kept only so old callers fail loudly rather than silently.
    """
    raise NotImplementedError("Use the feature_qualifier method instead.")
1149
def feature_qualifier_description(self, content):
    """Use feature_qualifier instead (OBSOLETE).

    Kept only so old callers fail loudly rather than silently.
    """
    raise NotImplementedError("Use the feature_qualifier method instead.")
1153
def contig_location(self, content):
    """Deal with CONTIG information."""
    # Historically this was stored as a SeqFeature object, but it was
    # stored under record.annotations["contig"] and not under
    # record.features with the other SeqFeature objects.
    #
    # The CONTIG location line can include additional tokens like
    # Gap(), Gap(100) or Gap(unk100) which are not used in the feature
    # location lines, so storing it using SeqFeature based location
    # objects is difficult.
    #
    # We now store this a string, which means for BioSQL we are now in
    # much better agreement with how BioPerl records the CONTIG line
    # in the database.
    #
    # NOTE - This code assumes the scanner will return all the CONTIG
    # lines already combined into one long string!
    self.data.annotations["contig"] = content
1172
def origin_name(self, content):
    """Ignore the ORIGIN line's name text (not stored by this consumer)."""
    pass
1175
def base_count(self, content):
    """Ignore the BASE COUNT line (not stored by this consumer)."""
    pass
1178
def base_number(self, content):
    """Ignore the base numbers on sequence lines (not stored by this consumer)."""
    pass
1181
def sequence(self, content):
    """Add up sequence information as we get it.

    To try and make things speedier, this puts all of the strings
    into a list of strings, and then uses string.join later to put
    them together. Supposedly, this is a big time savings
    """
    # The scanner is expected to have stripped the spaces between the
    # sequence chunks already.
    assert ' ' not in content
    self._seq_data.append(content.upper())
1191
def record_end(self, content):
    """Clean up when we've finished the record.

    Finalises the record id (appending the version number when known),
    picks an alphabet based on the declared sequence type, and builds
    the final Seq object (or an UnknownSeq when no sequence data was
    present but an expected length is known).

    Raises ValueError for an unrecognised sequence type.
    """
    from Bio import Alphabet
    from Bio.Alphabet import IUPAC
    from Bio.Seq import Seq, UnknownSeq

    # Try and append the version number to the accession for the full id
    if not self.data.id:
        assert 'accessions' not in self.data.annotations, \
            self.data.annotations['accessions']
        self.data.id = self.data.name  # Good fall back?
    elif self.data.id.count('.') == 0:
        try:
            self.data.id += '.%i' % self.data.annotations['sequence_version']
        except KeyError:
            pass

    # add the sequence information
    # first, determine the alphabet
    # we default to an generic alphabet if we don't have a
    # seq type or have strange sequence information.
    seq_alphabet = Alphabet.generic_alphabet

    # now set the sequence
    sequence = "".join(self._seq_data)

    if self._expected_size is not None \
            and len(sequence) != 0 \
            and self._expected_size != len(sequence):
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn("Expected sequence length %i, found %i (%s)."
                      % (self._expected_size, len(sequence), self.data.id),
                      BiopythonParserWarning)

    if self._seq_type:
        # mRNA is really also DNA, since it is actually cDNA
        if 'DNA' in self._seq_type.upper() or 'MRNA' in self._seq_type.upper():
            seq_alphabet = IUPAC.ambiguous_dna
        # are there ever really RNA sequences in GenBank?
        elif 'RNA' in self._seq_type.upper():
            # Even for data which was from RNA, the sequence string
            # is usually given as DNA (T not U). Bug 2408
            if "T" in sequence and "U" not in sequence:
                seq_alphabet = IUPAC.ambiguous_dna
            else:
                seq_alphabet = IUPAC.ambiguous_rna
        elif 'PROTEIN' in self._seq_type.upper() \
                or self._seq_type == "PRT":  # PRT is used in EMBL-bank for patents
            seq_alphabet = IUPAC.protein  # or extended protein?
        # work around ugly GenBank records which have circular or
        # linear but no indication of sequence type
        elif self._seq_type in ["circular", "linear", "unspecified"]:
            pass
        # we have a bug if we get here
        else:
            raise ValueError("Could not determine alphabet for seq_type %s"
                             % self._seq_type)

    # BUG FIX: this previously read self.__expected_size, which Python
    # name-mangles to a never-set attribute (the rest of this method
    # uses self._expected_size), so records with no sequence data hit
    # an AttributeError instead of getting an UnknownSeq.
    if not sequence and self._expected_size:
        self.data.seq = UnknownSeq(self._expected_size, seq_alphabet)
    else:
        self.data.seq = Seq(sequence, seq_alphabet)
1256 1257
1258 -class _RecordConsumer(_BaseGenBankConsumer):
1259 """Create a GenBank Record object from scanner generated information (PRIVATE). 1260 """
def __init__(self):
    """Initialise an empty Record and the parser's working state."""
    _BaseGenBankConsumer.__init__(self)
    from . import Record
    # The Record object being populated by the scanner callbacks.
    self.data = Record.Record()

    # Sequence chunks, joined into one string at record_end.
    self._seq_data = []
    # Reference / feature / qualifier currently being built, if any.
    self._cur_reference = None
    self._cur_feature = None
    self._cur_qualifier = None
1270
def wgs(self, content):
    """Store the WGS line, split on '-' into its parts."""
    self.data.wgs = content.split('-')
1273
def add_wgs_scafld(self, content):
    """Append a WGS_SCAFLD line, split on '-' into its parts."""
    self.data.wgs_scafld.append(content.split('-'))
1276
def locus(self, content):
    """Store the locus name from the LOCUS line."""
    self.data.locus = content
1279
def size(self, content):
    """Store the sequence length (as the raw string from the LOCUS line)."""
    self.data.size = content
1282
def residue_type(self, content):
    """Store the residue type, warning about lowercase dna/rna.

    Be lenient about parsing, but technically lowercase residue types
    are malformed.
    """
    if 'dna' in content or 'rna' in content:
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn("Invalid seq_type (%s): DNA/RNA should be uppercase." % content,
                      BiopythonParserWarning)
    self.data.residue_type = content
1291
def data_file_division(self, content):
    """Store the data file division (e.g. PLN, BCT) from the LOCUS line."""
    self.data.data_file_division = content
1294
def date(self, content):
    """Store the date string from the LOCUS line."""
    self.data.date = content
1297
def definition(self, content):
    """Store the DEFINITION line text."""
    self.data.definition = content
1300
def accession(self, content):
    """Collect ACCESSION numbers, keeping order but skipping duplicates."""
    seen = self.data.accession
    for acc in self._split_accessions(content):
        if acc not in seen:
            seen.append(acc)
1305
def nid(self, content):
    """Store the (obsolete) NID line identifier."""
    self.data.nid = content
1308
def pid(self, content):
    """Store the (obsolete) PID line identifier."""
    self.data.pid = content
1311
def version(self, content):
    """Store the VERSION line's versioned accession."""
    self.data.version = content
1314
def db_source(self, content):
    """Store the DBSOURCE line, stripping trailing whitespace."""
    self.data.db_source = content.rstrip()
1317
def gi(self, content):
    """Store the GI number from the VERSION line."""
    self.data.gi = content
1320
def keywords(self, content):
    """Store the KEYWORDS line, split into individual keywords."""
    self.data.keywords = self._split_keywords(content)
1323
def project(self, content):
    """Add the PROJECT line's whitespace-separated identifiers to the record."""
    self.data.projects.extend(filter(None, content.split()))
1326 1329
def segment(self, content):
    """Store the SEGMENT line text."""
    self.data.segment = content
1332
def source(self, content):
    """Store the SOURCE line text."""
    self.data.source = content
1335
def organism(self, content):
    """Store the ORGANISM name."""
    self.data.organism = content
1338
def taxonomy(self, content):
    """Store the taxonomy lineage, split into its individual levels."""
    self.data.taxonomy = self._split_taxonomy(content)
1341
def reference_num(self, content):
    """Grab the reference number and signal the start of a new reference.

    Any reference already being built is flushed onto the record first.
    """
    if self._cur_reference is not None:
        self.data.references.append(self._cur_reference)

    from . import Record
    reference = Record.Reference()
    reference.number = content
    self._cur_reference = reference
1352
def reference_bases(self, content):
    """Store the raw base span text (e.g. '(bases 1 to 86436)') of the reference."""
    self._cur_reference.bases = content
1355
def authors(self, content):
    """Store the AUTHORS text of the current reference."""
    self._cur_reference.authors = content
1358
def consrtm(self, content):
    """Store the CONSRTM (consortium) text of the current reference."""
    self._cur_reference.consrtm = content
1361
def title(self, content):
    """Store the reference TITLE, tolerating a missing REFERENCE line."""
    if self._cur_reference is None:
        # Some malformed records have a TITLE with no REFERENCE line.
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn("GenBank TITLE line without REFERENCE line.",
                      BiopythonParserWarning)
        return
    self._cur_reference.title = content
1370
def journal(self, content):
    """Store the JOURNAL text of the current reference."""
    self._cur_reference.journal = content
1373
def medline_id(self, content):
    """Store the MEDLINE identifier of the current reference."""
    self._cur_reference.medline_id = content
1376
def pubmed_id(self, content):
    """Store the PUBMED identifier of the current reference."""
    self._cur_reference.pubmed_id = content
1379
def remark(self, content):
    """Store the REMARK text of the current reference."""
    self._cur_reference.remark = content
1382
def comment(self, content):
    """Append the COMMENT lines (a list of strings) to the record's comment."""
    self.data.comment += "\n".join(content)
1385
def primary_ref_line(self, content):
    """Data for the PRIMARY line"""
    self.data.primary.append(content)
1389
def primary(self, content):
    """Ignore the PRIMARY header line itself (rows are handled by primary_ref_line)."""
    pass
1392
def features_line(self, content):
    """Get ready for the feature table when we reach the FEATURES line.

    The line's own text carries no information, so simply delegate.
    """
    self.start_feature_table()
1397
def start_feature_table(self):
    """Signal the start of the feature table.

    Flushes the reference still being built (if any) onto the record.
    """
    # we need to add on the last reference
    if self._cur_reference is not None:
        self.data.references.append(self._cur_reference)
        # BUG FIX: clear the in-progress reference (as the matching
        # _FeatureConsumer method does) so a repeated call cannot
        # append the same reference twice.
        self._cur_reference = None
1404
def feature_key(self, content):
    """Grab the key of the feature and signal the start of a new feature.

    Any feature already in progress is flushed onto the record first.
    """
    self._add_feature()

    from . import Record
    feature = Record.Feature()
    feature.key = content
    self._cur_feature = feature
1414
1415 - def _add_feature(self):
1416 """Utility function to add a feature to the Record. 1417 1418 This does all of the appropriate checking to make sure we haven't 1419 left any info behind, and that we are only adding info if it 1420 exists. 1421 """ 1422 if self._cur_feature is not None: 1423 # if we have a left over qualifier, add it to the qualifiers 1424 # on the current feature 1425 if self._cur_qualifier is not None: 1426 self._cur_feature.qualifiers.append(self._cur_qualifier) 1427 1428 self._cur_qualifier = None 1429 self.data.features.append(self._cur_feature)
1430
def location(self, content):
    """Store the feature location as a whitespace-cleaned string."""
    self._cur_feature.location = self._clean_location(content)
1433
def feature_qualifier(self, key, value):
    """Add a qualifier, delegating to the older name/description callbacks.

    *value* may be None for valueless qualifiers such as /pseudo.
    """
    self.feature_qualifier_name([key])
    if value is not None:
        self.feature_qualifier_description(value)
1438
def feature_qualifier_name(self, content_list):
    """Deal with qualifier names

    We receive a list of keys, since you can have valueless keys such as
    /pseudo which would be passed in with the next key (since no other
    tags separate them in the file)
    """
    from . import Record
    for key in content_list:
        # the record parser keeps the /s -- add them if we don't have 'em
        if not key.startswith("/"):
            key = "/%s" % key
        # flush any qualifier already in progress before starting this one
        if self._cur_qualifier is not None:
            self._cur_feature.qualifiers.append(self._cur_qualifier)

        qualifier = Record.Qualifier()
        qualifier.key = key
        self._cur_qualifier = qualifier
1457
def feature_qualifier_description(self, content):
    """Attach the value text to the qualifier started by feature_qualifier_name."""
    qualifier = self._cur_qualifier
    # if we have info then the qualifier key should have a ='s
    if '=' not in qualifier.key:
        qualifier.key = "%s=" % qualifier.key
    cleaned = self._remove_newlines(content)
    # remove all spaces from the value if it is a type where spaces
    # are not important
    for remove_space_key in self.__class__.remove_space_keys:
        if remove_space_key in qualifier.key:
            cleaned = self._remove_spaces(cleaned)
    qualifier.value = self._normalize_spaces(cleaned)
1469
def base_count(self, content):
    """Store the BASE COUNT line text."""
    self.data.base_counts = content
1472
def origin_name(self, content):
    """Store the ORIGIN line text."""
    self.data.origin = content
1475
def contig_location(self, content):
    """Signal that we have contig information to add to the record.

    Stored as a whitespace-cleaned location string.
    """
    self.data.contig = self._clean_location(content)
1480
def sequence(self, content):
    """Add sequence information to a list of sequence strings.

    This removes spaces in the data and uppercases the sequence, and
    then adds it to a list of sequences. Later on we'll join this
    list together to make the final sequence. This is faster than
    adding on the new string every time.
    """
    # The scanner is expected to have stripped the spaces already.
    assert ' ' not in content
    self._seq_data.append(content.upper())
1491
def record_end(self, content):
    """Signal the end of the record and do any necessary clean-up.

    Joins the accumulated sequence chunks into the final sequence
    string and flushes the last feature onto the record.
    """
    self.data.sequence = "".join(self._seq_data)
    self._add_feature()
1500 1501
def parse(handle):
    """Iterate over GenBank formatted entries as Record objects.

    >>> from Bio import GenBank
    >>> with open("GenBank/NC_000932.gb") as handle:
    ...     for record in GenBank.parse(handle):
    ...         print(record.accession)
    ['NC_000932']

    To get SeqRecord objects use Bio.SeqIO.parse(..., format="gb")
    instead.
    """
    record_parser = RecordParser()
    return iter(Iterator(handle, record_parser))
1515 1516
def read(handle):
    """Read a handle containing a single GenBank entry as a Record object.

    >>> from Bio import GenBank
    >>> with open("GenBank/NC_000932.gb") as handle:
    ...     record = GenBank.read(handle)
    ...     print(record.accession)
    ['NC_000932']

    To get a SeqRecord object use Bio.SeqIO.read(..., format="gb")
    instead.
    """
    iterator = parse(handle)
    record = next(iterator, None)
    if record is None:
        raise ValueError("No records found in handle")
    # A second record means the handle was not a single-entry file.
    if next(iterator, None) is not None:
        raise ValueError("More than one record found in handle")
    return record
1543 1544
def _test():
    """Run the Bio.GenBank module's doctests.

    The doctests expect the Biopython Tests directory (with its GenBank
    sample files) to be the current directory; it is looked for in the
    two places it can live relative to where this is run from.
    """
    import doctest
    import os
    for test_dir in (os.path.join("..", "..", "Tests"), os.path.join("Tests")):
        if os.path.isdir(test_dir):
            print("Running doctests...")
            cur_dir = os.path.abspath(os.curdir)
            try:
                os.chdir(test_dir)
                doctest.testmod()
            finally:
                # Restore the working directory even if a doctest blows up
                # (the original duplicated this whole branch per directory
                # and left the cwd changed on error).
                os.chdir(cur_dir)
            del cur_dir
            print("Done")
            break
1565 1566 if __name__ == "__main__": 1567 _test() 1568