Package Bio :: Package GenBank
[hide private]
[frames] | [no frames]

Source Code for Package Bio.GenBank

   1  # Copyright 2000 by Jeffrey Chang, Brad Chapman.  All rights reserved. 
   2  # Copyright 2006-2013 by Peter Cock.  All rights reserved. 
   3  # 
   4  # This code is part of the Biopython distribution and governed by its 
   5  # license.  Please see the LICENSE file that should have been included 
   6  # as part of this package. 
   7   
   8  """Code to work with GenBank formatted files. 
   9   
  10  Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with 
  11  the "genbank" or "embl" format names to parse GenBank or EMBL files into 
  12  SeqRecord and SeqFeature objects (see the Biopython tutorial for details). 
  13   
  14  Using Bio.GenBank directly to parse GenBank files is only useful if you want 
  15  to obtain GenBank-specific Record objects, which is a much closer 
   16  representation to the raw file contents than the SeqRecord alternative from 
  17  the FeatureParser (used in Bio.SeqIO). 
  18   
  19  To use the Bio.GenBank parser, there are two helper functions: 
  20   
  21  read                  Parse a handle containing a single GenBank record 
  22                        as Bio.GenBank specific Record objects. 
  23  parse                 Iterate over a handle containing multiple GenBank 
  24                        records as Bio.GenBank specific Record objects. 
  25   
  26  The following internal classes are not intended for direct use and may 
  27  be deprecated in a future release. 
  28   
  29  Classes: 
  30  Iterator              Iterate through a file of GenBank entries 
  31  ErrorFeatureParser    Catch errors caused during parsing. 
  32  FeatureParser         Parse GenBank data in SeqRecord and SeqFeature objects. 
  33  RecordParser          Parse GenBank data into a Record object. 
  34   
  35  Exceptions: 
  36  ParserFailureError    Exception indicating a failure in the parser (ie. 
  37                        scanner or consumer) 
   38  LocationParserError   Exception indicating a problem with the spark based 
  39                        location parser. 
  40   
  41  """ 
  42  from __future__ import print_function 
  43   
  44  import re 
  45  import sys # for checking if Python 2 
  46   
  47  # other Biopython stuff 
  48  from Bio import SeqFeature 
  49   
  50  # other Bio.GenBank stuff 
  51  from .utils import FeatureValueCleaner 
  52  from .Scanner import GenBankScanner 
  53   
  54  #Constants used to parse GenBank header lines 
  55  GENBANK_INDENT = 12 
  56  GENBANK_SPACER = " " * GENBANK_INDENT 
  57   
  58  #Constants for parsing GenBank feature lines 
  59  FEATURE_KEY_INDENT = 5 
  60  FEATURE_QUALIFIER_INDENT = 21 
  61  FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT 
  62  FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT 
  63   
  64  #Regular expressions for location parsing 
  65  _solo_location = r"[<>]?\d+" 
  66  _pair_location = r"[<>]?\d+\.\.[<>]?\d+" 
  67  _between_location = r"\d+\^\d+" 
  68   
  69  _within_position = r"\(\d+\.\d+\)" 
  70  _re_within_position = re.compile(_within_position) 
  71  _within_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \ 
  72                     % (_within_position, _within_position) 
  73  assert _re_within_position.match("(3.9)") 
  74  assert re.compile(_within_location).match("(3.9)..10") 
  75  assert re.compile(_within_location).match("26..(30.33)") 
  76  assert re.compile(_within_location).match("(13.19)..(20.28)") 
  77   
  78  _oneof_position = r"one\-of\(\d+(,\d+)+\)" 
  79  _re_oneof_position = re.compile(_oneof_position) 
  80  _oneof_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \ 
  81                     % (_oneof_position, _oneof_position) 
  82  assert _re_oneof_position.match("one-of(6,9)") 
  83  assert re.compile(_oneof_location).match("one-of(6,9)..101") 
  84  assert re.compile(_oneof_location).match("one-of(6,9)..one-of(101,104)") 
  85  assert re.compile(_oneof_location).match("6..one-of(101,104)") 
  86   
  87  assert not _re_oneof_position.match("one-of(3)") 
  88  assert _re_oneof_position.match("one-of(3,6)") 
  89  assert _re_oneof_position.match("one-of(3,6,9)") 
  90   
  91   
  92  _simple_location = r"\d+\.\.\d+" 
  93  _re_simple_location = re.compile(r"^%s$" % _simple_location) 
  94  _re_simple_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$" 
  95                                   % (_simple_location, _simple_location)) 
  96  _complex_location = r"([a-zA-z][a-zA-Z0-9_]*(\.[a-zA-Z0-9]+)?\:)?(%s|%s|%s|%s|%s)" \ 
  97                      % (_pair_location, _solo_location, _between_location, 
  98                         _within_location, _oneof_location) 
  99  _re_complex_location = re.compile(r"^%s$" % _complex_location) 
 100  _possibly_complemented_complex_location = r"(%s|complement\(%s\))" \ 
 101                                            % (_complex_location, _complex_location) 
 102  _re_complex_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$" 
 103                                   % (_possibly_complemented_complex_location, 
 104                                      _possibly_complemented_complex_location)) 
 105   
 106   
 107  assert _re_simple_location.match("104..160") 
 108  assert not _re_simple_location.match("68451760..68452073^68452074") 
 109  assert not _re_simple_location.match("<104..>160") 
 110  assert not _re_simple_location.match("104") 
 111  assert not _re_simple_location.match("<1") 
 112  assert not _re_simple_location.match(">99999") 
 113  assert not _re_simple_location.match("join(104..160,320..390,504..579)") 
 114  assert not _re_simple_compound.match("bond(12,63)") 
 115  assert _re_simple_compound.match("join(104..160,320..390,504..579)") 
 116  assert _re_simple_compound.match("order(1..69,1308..1465)") 
 117  assert not _re_simple_compound.match("order(1..69,1308..1465,1524)") 
 118  assert not _re_simple_compound.match("join(<1..442,992..1228,1524..>1983)") 
 119  assert not _re_simple_compound.match("join(<1..181,254..336,422..497,574..>590)") 
 120  assert not _re_simple_compound.match("join(1475..1577,2841..2986,3074..3193,3314..3481,4126..>4215)") 
 121  assert not _re_simple_compound.match("test(1..69,1308..1465)") 
 122  assert not _re_simple_compound.match("complement(1..69)") 
 123  assert not _re_simple_compound.match("(1..69)") 
 124  assert _re_complex_location.match("(3.9)..10") 
 125  assert _re_complex_location.match("26..(30.33)") 
 126  assert _re_complex_location.match("(13.19)..(20.28)") 
 127  assert _re_complex_location.match("41^42")  # between 
 128  assert _re_complex_location.match("AL121804:41^42") 
 129  assert _re_complex_location.match("AL121804:41..610") 
 130  assert _re_complex_location.match("AL121804.2:41..610") 
 131  assert _re_complex_location.match("one-of(3,6)..101") 
 132  assert _re_complex_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)") 
 133  assert not _re_simple_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)") 
 134  assert _re_complex_compound.match("join(complement(69611..69724),139856..140650)") 
 135   
 136  #Trans-spliced example from NC_016406, note underscore in reference name: 
 137  assert _re_complex_location.match("NC_016402.1:6618..6676") 
 138  assert _re_complex_location.match("181647..181905") 
 139  assert _re_complex_compound.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 140  assert not _re_complex_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 141  assert not _re_simple_compound.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 142  assert not _re_complex_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 143  assert not _re_simple_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 144   
 145  _solo_bond = re.compile("bond\(%s\)" % _solo_location) 
 146  assert _solo_bond.match("bond(196)") 
 147  assert _solo_bond.search("bond(196)") 
 148  assert _solo_bond.search("join(bond(284),bond(305),bond(309),bond(305))") 
 149   
 150   
def _pos(pos_str, offset=0):
    """Build a Position object (PRIVATE).

    For an end position, leave offset as zero (default):

    >>> _pos("5")
    ExactPosition(5)

    For a start position, set offset to minus one (for Python counting):

    >>> _pos("5", -1)
    ExactPosition(4)

    This also covers fuzzy positions:

    >>> p = _pos("<5")
    >>> p
    BeforePosition(5)
    >>> print(p)
    <5
    >>> int(p)
    5

    >>> _pos(">5")
    AfterPosition(5)

    By default assumes an end position, so note the integer behaviour:

    >>> p = _pos("one-of(5,8,11)")
    >>> p
    OneOfPosition(11, choices=[ExactPosition(5), ExactPosition(8), ExactPosition(11)])
    >>> print(p)
    one-of(5,8,11)
    >>> int(p)
    11

    >>> _pos("(8.10)")
    WithinPosition(10, left=8, right=10)

    Fuzzy start positions:

    >>> p = _pos("<5", -1)
    >>> p
    BeforePosition(4)
    >>> print(p)
    <4
    >>> int(p)
    4

    Notice how the integer behaviour changes too!

    >>> p = _pos("one-of(5,8,11)", -1)
    >>> p
    OneOfPosition(4, choices=[ExactPosition(4), ExactPosition(7), ExactPosition(10)])
    >>> print(p)
    one-of(4,7,10)
    >>> int(p)
    4

    """
    if pos_str.startswith("<"):
        # e.g. "<5", fuzzy position before the given point
        return SeqFeature.BeforePosition(int(pos_str[1:])+offset)
    elif pos_str.startswith(">"):
        # e.g. ">5", fuzzy position after the given point
        return SeqFeature.AfterPosition(int(pos_str[1:])+offset)
    elif _re_within_position.match(pos_str):
        # e.g. "(8.10)", position somewhere within a range
        s, e = pos_str[1:-1].split(".")
        s = int(s) + offset
        e = int(e) + offset
        if offset == -1:
            # Start position: the integer value is the lower boundary
            default = s
        else:
            # End position: the integer value is the upper boundary
            default = e
        return SeqFeature.WithinPosition(default, left=s, right=e)
    elif _re_oneof_position.match(pos_str):
        # e.g. "one-of(5,8,11)", a choice between exact positions
        assert pos_str.startswith("one-of(")
        assert pos_str[-1]==")"
        parts = [SeqFeature.ExactPosition(int(pos)+offset)
                 for pos in pos_str[7:-1].split(",")]
        if offset == -1:
            # Start position: use the smallest choice as the integer value
            default = min(int(pos) for pos in parts)
        else:
            # End position: use the largest choice as the integer value
            default = max(int(pos) for pos in parts)
        return SeqFeature.OneOfPosition(default, choices=parts)
    else:
        # A plain number, e.g. "5"
        return SeqFeature.ExactPosition(int(pos_str)+offset)
236 237
def _loc(loc_str, expected_seq_length, strand):
    """FeatureLocation from non-compound non-complement location (PRIVATE).

    Simple examples,

    >>> _loc("123..456", 1000, +1)
    FeatureLocation(ExactPosition(122), ExactPosition(456), strand=1)
    >>> _loc("<123..>456", 1000, strand = -1)
    FeatureLocation(BeforePosition(122), AfterPosition(456), strand=-1)

    A more complex location using within positions,

    >>> _loc("(9.10)..(20.25)", 1000, 1)
    FeatureLocation(WithinPosition(8, left=8, right=9), WithinPosition(25, left=20, right=25), strand=1)

    Notice how that will act as though it has overall start 8 and end 25.

    Zero length between feature,

    >>> _loc("123^124", 1000, 0)
    FeatureLocation(ExactPosition(123), ExactPosition(123), strand=0)

    The expected sequence length is needed for a special case, a between
    position at the start/end of a circular genome:

    >>> _loc("1000^1", 1000, 1)
    FeatureLocation(ExactPosition(1000), ExactPosition(1000), strand=1)

    Apart from this special case, between positions P^Q must have P+1==Q,

    >>> _loc("123^456", 1000, 1)
    Traceback (most recent call last):
       ...
    ValueError: Invalid between location '123^456'
    """
    try:
        s, e = loc_str.split("..")
    except ValueError:
        # Not a "start..end" pair - either a between location or a solo one
        assert ".." not in loc_str
        if "^" in loc_str:
            #A between location like "67^68" (one based counting) is a
            #special case (note it has zero length). In python slice
            #notation this is 67:67, a zero length slice.  See Bug 2622
            #Further more, on a circular genome of length N you can have
            #a location N^1 meaning the junction at the origin. See Bug 3098.
            #NOTE - We can imagine between locations like "2^4", but this
            #is just "3".  Similarly, "2^5" is just "3..4"
            s, e = loc_str.split("^")
            if int(s)+1==int(e):
                pos = _pos(s)
            elif int(s)==expected_seq_length and e=="1":
                # Circular origin special case, e.g. "1000^1"
                pos = _pos(s)
            else:
                raise ValueError("Invalid between location %s" % repr(loc_str))
            return SeqFeature.FeatureLocation(pos, pos, strand)
        else:
            #e.g. "123"
            s = loc_str
            e = loc_str
    # Typical case, e.g. "123..456" - shift the start for Python counting
    return SeqFeature.FeatureLocation(_pos(s, -1), _pos(e), strand)
298 299
def _split_compound_loc(compound_loc):
    """Split a tricky compound location string (PRIVATE).

    >>> list(_split_compound_loc("123..145"))
    ['123..145']
    >>> list(_split_compound_loc("123..145,200..209"))
    ['123..145', '200..209']
    >>> list(_split_compound_loc("one-of(200,203)..300"))
    ['one-of(200,203)..300']
    >>> list(_split_compound_loc("complement(123..145),200..209"))
    ['complement(123..145)', '200..209']
    >>> list(_split_compound_loc("123..145,one-of(200,203)..209"))
    ['123..145', 'one-of(200,203)..209']
    >>> list(_split_compound_loc("123..145,one-of(200,203)..one-of(209,211),300"))
    ['123..145', 'one-of(200,203)..one-of(209,211)', '300']
    >>> list(_split_compound_loc("123..145,complement(one-of(200,203)..one-of(209,211)),300"))
    ['123..145', 'complement(one-of(200,203)..one-of(209,211))', '300']
    >>> list(_split_compound_loc("123..145,200..one-of(209,211),300"))
    ['123..145', '200..one-of(209,211)', '300']
    >>> list(_split_compound_loc("123..145,200..one-of(209,211)"))
    ['123..145', '200..one-of(209,211)']
    >>> list(_split_compound_loc("complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905"))
    ['complement(149815..150200)', 'complement(293787..295573)', 'NC_016402.1:6618..6676', '181647..181905']
    """
    if "one-of(" in compound_loc:
        #Hard case - the commas INSIDE a "one-of(...)" must not be
        #treated as separators between the component locations.
        while "," in compound_loc:
            assert compound_loc[0] != ","
            assert compound_loc[0:2] != ".."
            i = compound_loc.find(",")
            part = compound_loc[:i]
            compound_loc = compound_loc[i:] # includes the comma
            #If we cut inside a one-of(...), keep extending the part
            #until its parentheses balance again.
            while part.count("(") > part.count(")"):
                assert "one-of(" in part, (part, compound_loc)
                i = compound_loc.find(")")
                part += compound_loc[:i+1]
                compound_loc = compound_loc[i+1:]
            if compound_loc.startswith(".."):
                #The part so far was only the start position (e.g.
                #"one-of(1,2)..5") - pull in the end position as well.
                i = compound_loc.find(",")
                if i==-1:
                    part += compound_loc
                    compound_loc = ""
                else:
                    part += compound_loc[:i]
                    compound_loc = compound_loc[i:] # includes the comma
                #Re-balance parentheses for an end like "..one-of(9,11)"
                while part.count("(") > part.count(")"):
                    assert part.count("one-of(") == 2
                    i = compound_loc.find(")")
                    part += compound_loc[:i+1]
                    compound_loc = compound_loc[i+1:]
            if compound_loc.startswith(","):
                compound_loc = compound_loc[1:]
            assert part
            yield part
        if compound_loc:
            yield compound_loc
    else:
        #Easy case - no "one-of(...)" so every comma is a separator
        for part in compound_loc.split(","):
            yield part
360 361
class Iterator(object):
    """Iterator interface to move over a file of GenBank entries one at a time (OBSOLETE).

    This class is likely to be deprecated in a future release of Biopython.
    Please use Bio.SeqIO.parse(..., format="gb") or Bio.GenBank.parse(...)
    for SeqRecord and GenBank specific Record objects respectively instead.
    """
    def __init__(self, handle, parser = None):
        """Initialize the iterator.

        Arguments:
        o handle - A handle with GenBank entries to iterate through.
        o parser - An optional parser to pass the entries through before
        returning them. If None, then the raw entry will be returned.
        """
        self.handle = handle
        self._parser = parser

    def __next__(self):
        """Return the next GenBank record from the handle.

        Will return None if we ran out of records.
        """
        if self._parser is not None:
            # Delegate to the parser; map exhaustion to None.
            try:
                return self._parser.parse(self.handle)
            except StopIteration:
                return None
        # Raw mode: gather lines up to and including the "//" terminator.
        record_lines = []
        while True:
            current = self.handle.readline()
            if not current:
                return None  # Premature end of file?
            record_lines.append(current)
            if current.rstrip() == "//":
                return "".join(record_lines)

    if sys.version_info[0] < 3:
        def next(self):
            """Python 2 style alias for Python 3 style __next__ method."""
            return self.__next__()

    def __iter__(self):
        # Iterate by calling __next__ until it returns the None sentinel.
        return iter(self.__next__, None)
407 408
class ParserFailureError(Exception):
    """Raised when the parser (i.e. scanner or consumer) fails."""
    pass
413 414
class LocationParserError(Exception):
    """Raised when a location from a GenBank file cannot be properly parsed."""
    pass
419 420
class FeatureParser(object):
    """Parse GenBank files into Seq + Feature objects (OBSOLETE).

    Direct use of this class is discouraged, and may be deprecated in
    a future release of Biopython.

    Please use Bio.SeqIO.parse(...) or Bio.SeqIO.read(...) instead.
    """
    def __init__(self, debug_level = 0, use_fuzziness = 1,
                 feature_cleaner = FeatureValueCleaner()):
        """Initialize a GenBank parser and Feature consumer.

        Arguments:
        o debug_level - How much debugging output the parser should emit.
        Zero (the default, and fastest) means none; values up to two show
        progressively more detail about where a parse fails.
        o use_fuzziness - Whether to use fuzzy representations for feature
        locations. The default is 1 (use fuzziness).
        o feature_cleaner - An object used to clean out the values of
        features; it must implement the function clean_value. GenBank.utils
        has a "standard" cleaner class, which is used by default.
        """
        self.use_fuzziness = use_fuzziness
        self._cleaner = feature_cleaner
        self._scanner = GenBankScanner(debug_level)

    def parse(self, handle):
        """Parse the specified handle into a SeqRecord."""
        consumer = _FeatureConsumer(self.use_fuzziness, self._cleaner)
        self._consumer = consumer
        self._scanner.feed(handle, consumer)
        return consumer.data
456 457
class RecordParser(object):
    """Parse GenBank files into Record objects (OBSOLETE).

    Direct use of this class is discouraged, and may be deprecated in
    a future release of Biopython.

    Please use the Bio.GenBank.parse(...) or Bio.GenBank.read(...) functions
    instead.
    """
    def __init__(self, debug_level = 0):
        """Initialize the parser.

        Arguments:
        o debug_level - How much debugging output the parser should emit.
        Zero (the default, and fastest) means none; values up to two show
        progressively more detail about where a parse fails.
        """
        self._scanner = GenBankScanner(debug_level)

    def parse(self, handle):
        """Parse the specified handle into a GenBank Record."""
        consumer = _RecordConsumer()
        self._consumer = consumer
        self._scanner.feed(handle, consumer)
        return consumer.data
485 486
487 -class _BaseGenBankConsumer(object):
488 """Abstract GenBank consumer providing useful general functions (PRIVATE). 489 490 This just helps to eliminate some duplication in things that most 491 GenBank consumers want to do. 492 """ 493 # Special keys in GenBank records that we should remove spaces from 494 # For instance, \translation keys have values which are proteins and 495 # should have spaces and newlines removed from them. This class 496 # attribute gives us more control over specific formatting problems. 497 remove_space_keys = ["translation"] 498
499 - def __init__(self):
500 pass
501
502 - def _unhandled(self, data):
503 pass
504
505 - def __getattr__(self, attr):
506 return self._unhandled
507
508 - def _split_keywords(self, keyword_string):
509 """Split a string of keywords into a nice clean list. 510 """ 511 # process the keywords into a python list 512 if keyword_string == "" or keyword_string == ".": 513 keywords = "" 514 elif keyword_string[-1] == '.': 515 keywords = keyword_string[:-1] 516 else: 517 keywords = keyword_string 518 keyword_list = keywords.split(';') 519 clean_keyword_list = [x.strip() for x in keyword_list] 520 return clean_keyword_list
521
522 - def _split_accessions(self, accession_string):
523 """Split a string of accession numbers into a list. 524 """ 525 # first replace all line feeds with spaces 526 # Also, EMBL style accessions are split with ';' 527 accession = accession_string.replace("\n", " ").replace(";", " ") 528 529 return [x.strip() for x in accession.split() if x.strip()]
530
531 - def _split_taxonomy(self, taxonomy_string):
532 """Split a string with taxonomy info into a list. 533 """ 534 if not taxonomy_string or taxonomy_string==".": 535 #Missing data, no taxonomy 536 return [] 537 538 if taxonomy_string[-1] == '.': 539 tax_info = taxonomy_string[:-1] 540 else: 541 tax_info = taxonomy_string 542 tax_list = tax_info.split(';') 543 new_tax_list = [] 544 for tax_item in tax_list: 545 new_items = tax_item.split("\n") 546 new_tax_list.extend(new_items) 547 while '' in new_tax_list: 548 new_tax_list.remove('') 549 clean_tax_list = [x.strip() for x in new_tax_list] 550 551 return clean_tax_list
552
553 - def _clean_location(self, location_string):
554 """Clean whitespace out of a location string. 555 556 The location parser isn't a fan of whitespace, so we clean it out 557 before feeding it into the parser. 558 """ 559 #Originally this imported string.whitespace and did a replace 560 #via a loop. It's simpler to just split on whitespace and rejoin 561 #the string - and this avoids importing string too. See Bug 2684. 562 return ''.join(location_string.split())
563
564 - def _remove_newlines(self, text):
565 """Remove any newlines in the passed text, returning the new string. 566 """ 567 # get rid of newlines in the qualifier value 568 newlines = ["\n", "\r"] 569 for ws in newlines: 570 text = text.replace(ws, "") 571 572 return text
573
574 - def _normalize_spaces(self, text):
575 """Replace multiple spaces in the passed text with single spaces. 576 """ 577 # get rid of excessive spaces 578 return ' '.join(x for x in text.split(" ") if x)
579
580 - def _remove_spaces(self, text):
581 """Remove all spaces from the passed text. 582 """ 583 return text.replace(" ", "")
584
585 - def _convert_to_python_numbers(self, start, end):
586 """Convert a start and end range to python notation. 587 588 In GenBank, starts and ends are defined in "biological" coordinates, 589 where 1 is the first base and [i, j] means to include both i and j. 590 591 In python, 0 is the first base and [i, j] means to include i, but 592 not j. 593 594 So, to convert "biological" to python coordinates, we need to 595 subtract 1 from the start, and leave the end and things should 596 be converted happily. 597 """ 598 new_start = start - 1 599 new_end = end 600 601 return new_start, new_end
602 603
604 -class _FeatureConsumer(_BaseGenBankConsumer):
605 """Create a SeqRecord object with Features to return (PRIVATE). 606 607 Attributes: 608 o use_fuzziness - specify whether or not to parse with fuzziness in 609 feature locations. 610 o feature_cleaner - a class that will be used to provide specialized 611 cleaning-up of feature values. 612 """
    def __init__(self, use_fuzziness, feature_cleaner = None):
        """Initialize the consumer with an empty SeqRecord to populate.

        Arguments:
        o use_fuzziness - Specify whether or not to parse with fuzziness in
        feature locations.
        o feature_cleaner - An optional object used to clean up feature
        values, or None to leave values unmodified.
        """
        from Bio.SeqRecord import SeqRecord
        _BaseGenBankConsumer.__init__(self)
        self.data = SeqRecord(None, id = None)
        self.data.id = None
        self.data.description = ""

        self._use_fuzziness = use_fuzziness
        self._feature_cleaner = feature_cleaner

        self._seq_type = ''          # residue type string (set by residue_type())
        self._seq_data = []          # presumably collects sequence chunks - populated outside this view
        self._cur_reference = None   # the Reference currently being populated
        self._cur_feature = None     # the SeqFeature currently being populated
        self._expected_size = None   # declared sequence length (set by size())
628
    def locus(self, locus_name):
        """Use the locus name as the name of the sequence record."""
        self.data.name = locus_name
633
    def size(self, content):
        """Record the declared sequence length."""
        self._expected_size = int(content)
637
    def residue_type(self, type):
        """Record the sequence type so we can choose an appropriate alphabet.
        """
        # NOTE: the parameter name shadows the builtin 'type'; kept as-is
        # since callers may pass it by keyword.
        self._seq_type = type.strip()
642
    def data_file_division(self, division):
        """Store the data file division annotation."""
        self.data.annotations['data_file_division'] = division

    def date(self, submit_date):
        """Store the record date annotation."""
        self.data.annotations['date'] = submit_date
648
    def definition(self, definition):
        """Set the definition as the description of the sequence.
        """
        if self.data.description:
            #Append to any existing description
            #e.g. EMBL files with two DE lines.
            self.data.description += " " + definition
        else:
            self.data.description = definition
658
    def accession(self, acc_num):
        """Set the accession number as the id of the sequence.

        If we have multiple accession numbers, the first one passed is
        used.
        """
        new_acc_nums = self._split_accessions(acc_num)

        #Also record them ALL in the annotations
        try:
            #On the off chance there was more than one accession line:
            for acc in new_acc_nums:
                #Prevent repeat entries
                if acc not in self.data.annotations['accessions']:
                    self.data.annotations['accessions'].append(acc)
        except KeyError:
            #First accession line seen - start the list
            self.data.annotations['accessions'] = new_acc_nums

        # if we haven't set the id information yet, add the first acc num
        if not self.data.id:
            if len(new_acc_nums) > 0:
                #self.data.id = new_acc_nums[0]
                #Use the FIRST accession as the ID, not the first on this line!
                self.data.id = self.data.annotations['accessions'][0]
683
    def wgs(self, content):
        """Store the WGS accession range, split on '-'."""
        self.data.annotations['wgs'] = content.split('-')

    def add_wgs_scafld(self, content):
        """Append a WGS_SCAFLD accession range, split on '-'."""
        self.data.annotations.setdefault('wgs_scafld', []).append(content.split('-'))

    def nid(self, content):
        """Store the NID annotation."""
        self.data.annotations['nid'] = content

    def pid(self, content):
        """Store the PID annotation."""
        self.data.annotations['pid'] = content
695
    def version(self, version_id):
        """Use the versioned accession (e.g. "U49845.1") to set the record id."""
        #Want to use the versioned accession as the record.id
        #This comes from the VERSION line in GenBank files, or the
        #obsolete SV line in EMBL. For the new EMBL files we need
        #both the version suffix from the ID line and the accession
        #from the AC line.
        if version_id.count(".")==1 and version_id.split(".")[1].isdigit():
            self.accession(version_id.split(".")[0])
            self.version_suffix(version_id.split(".")[1])
        elif version_id:
            #For backwards compatibility...
            self.data.id = version_id
708
    def project(self, content):
        """Handle the information from the PROJECT line as a list of projects.

        e.g.
        PROJECT     GenomeProject:28471

        or:
        PROJECT     GenomeProject:13543  GenomeProject:99999

        This is stored as dbxrefs in the SeqRecord to be consistent with the
        projected switch of this line to DBLINK in future GenBank versions.
        Note the NCBI plan to replace "GenomeProject:28471" with the shorter
        "Project:28471" as part of this transition.
        """
        content = content.replace("GenomeProject:", "Project:")
        self.data.dbxrefs.extend(p for p in content.split() if p)
725 757
    def version_suffix(self, version):
        """Set the version to overwrite the id.

        Since the version provides the same information as the accession
        number, plus some extra info, we set this as the id if we have
        a version.
        """
        #e.g. GenBank line:
        #VERSION U49845.1  GI:1293613
        #or the obsolete EMBL line:
        #SV U49845.1
        #Scanner calls consumer.version("U49845.1")
        #which then calls consumer.version_suffix(1)
        #
        #e.g. EMBL new line:
        #ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
        #Scanner calls consumer.version_suffix(1)
        assert version.isdigit()
        self.data.annotations['sequence_version'] = int(version)
777
    def db_source(self, content):
        """Store the DBSOURCE annotation with trailing whitespace stripped."""
        self.data.annotations['db_source'] = content.rstrip()

    def gi(self, content):
        """Store the GI number annotation."""
        self.data.annotations['gi'] = content

    def keywords(self, content):
        """Record (more) keywords, splitting the semicolon separated text."""
        if 'keywords' in self.data.annotations:
            #Multi-line keywords, append to list
            #Note EMBL states "A keyword is never split between lines."
            self.data.annotations['keywords'].extend(self._split_keywords(content))
        else:
            self.data.annotations['keywords'] = self._split_keywords(content)

    def segment(self, content):
        """Store the SEGMENT annotation."""
        self.data.annotations['segment'] = content
794
    def source(self, content):
        """Store the source annotation, dropping any trailing full stop."""
        #Note that some software (e.g. VectorNTI) may produce an empty
        #source (rather than using a dot/period as might be expected).
        if content == "":
            source_info = ""
        elif content[-1] == '.':
            source_info = content[:-1]
        else:
            source_info = content
        self.data.annotations['source'] = source_info

    def organism(self, content):
        """Store the organism annotation."""
        self.data.annotations['organism'] = content

    def taxonomy(self, content):
        """Records (another line of) the taxonomy lineage.
        """
        lineage = self._split_taxonomy(content)
        try:
            self.data.annotations['taxonomy'].extend(lineage)
        except KeyError:
            #First taxonomy line seen - start the list
            self.data.annotations['taxonomy'] = lineage
817
    def reference_num(self, content):
        """Signal the beginning of a new reference object.
        """
        # if we have a current reference that hasn't been added to
        # the list of references, add it.
        if self._cur_reference is not None:
            self.data.annotations['references'].append(self._cur_reference)
        else:
            # First reference seen - start the list
            self.data.annotations['references'] = []

        self._cur_reference = SeqFeature.Reference()
829
    def reference_bases(self, content):
        """Attempt to determine the sequence region the reference entails.

        Possible types of information we may have to deal with:

        (bases 1 to 86436)
        (sites)
        (bases 1 to 105654; 110423 to 111122)
        1  (residues 1 to 182)
        """
        # first remove the parentheses or other junk
        ref_base_info = content[1:-1]

        all_locations = []
        # parse if we've got 'bases' and 'to'
        if 'bases' in ref_base_info and 'to' in ref_base_info:
            # get rid of the beginning 'bases'
            ref_base_info = ref_base_info[5:]
            locations = self._split_reference_locations(ref_base_info)
            all_locations.extend(locations)
        elif 'residues' in ref_base_info and 'to' in ref_base_info:
            residues_start = ref_base_info.find("residues")
            # get only the information after "residues"
            ref_base_info = ref_base_info[(residues_start + len("residues ")):]
            locations = self._split_reference_locations(ref_base_info)
            all_locations.extend(locations)

        # make sure if we are not finding information then we have
        # the string 'sites' or the string 'bases'
        elif (ref_base_info == 'sites' or
              ref_base_info.strip() == 'bases'):
            pass
        # otherwise raise an error
        else:
            raise ValueError("Could not parse base info %s in record %s" %
                             (ref_base_info, self.data.id))

        # NOTE(review): assumes a REFERENCE line was already seen, i.e.
        # self._cur_reference is not None - confirm with the scanner.
        self._cur_reference.location = all_locations
868
    def _split_reference_locations(self, location_string):
        """Get reference locations out of a string of reference information

        The passed string should be of the form:

        1 to 20; 20 to 100

        This splits the information out and returns a list of location objects
        based on the reference locations.
        """
        # split possibly multiple locations using the ';'
        all_base_info = location_string.split(';')

        new_locations = []
        for base_info in all_base_info:
            start, end = base_info.split('to')
            # Shift to zero-based half-open coordinates for FeatureLocation
            new_start, new_end = \
                self._convert_to_python_numbers(int(start.strip()),
                                                int(end.strip()))
            this_location = SeqFeature.FeatureLocation(new_start, new_end)
            new_locations.append(this_location)
        return new_locations
891
892 - def authors(self, content):
893 if self._cur_reference.authors: 894 self._cur_reference.authors += ' ' + content 895 else: 896 self._cur_reference.authors = content
897
898 - def consrtm(self, content):
899 if self._cur_reference.consrtm: 900 self._cur_reference.consrtm += ' ' + content 901 else: 902 self._cur_reference.consrtm = content
903
904 - def title(self, content):
905 if self._cur_reference is None: 906 import warnings 907 from Bio import BiopythonParserWarning 908 warnings.warn("GenBank TITLE line without REFERENCE line.", 909 BiopythonParserWarning) 910 elif self._cur_reference.title: 911 self._cur_reference.title += ' ' + content 912 else: 913 self._cur_reference.title = content
914
915 - def journal(self, content):
916 if self._cur_reference.journal: 917 self._cur_reference.journal += ' ' + content 918 else: 919 self._cur_reference.journal = content
920
921 - def medline_id(self, content):
922 self._cur_reference.medline_id = content
923
924 - def pubmed_id(self, content):
925 self._cur_reference.pubmed_id = content
926
927 - def remark(self, content):
928 """Deal with a reference comment.""" 929 if self._cur_reference.comment: 930 self._cur_reference.comment += ' ' + content 931 else: 932 self._cur_reference.comment = content
933
934 - def comment(self, content):
935 try: 936 self.data.annotations['comment'] += "\n" + "\n".join(content) 937 except KeyError: 938 self.data.annotations['comment'] = "\n".join(content)
939
940 - def features_line(self, content):
941 """Get ready for the feature table when we reach the FEATURE line. 942 """ 943 self.start_feature_table()
944
945 - def start_feature_table(self):
946 """Indicate we've got to the start of the feature table. 947 """ 948 # make sure we've added on our last reference object 949 if self._cur_reference is not None: 950 self.data.annotations['references'].append(self._cur_reference) 951 self._cur_reference = None
952
953 - def feature_key(self, content):
954 # start a new feature 955 self._cur_feature = SeqFeature.SeqFeature() 956 self._cur_feature.type = content 957 self.data.features.append(self._cur_feature)
958
    def location(self, content):
        """Parse out location information from the location string.

        This uses simple Python code with some regular expressions to do the
        parsing, and then translates the results into appropriate objects.
        """
        # clean up newlines and other whitespace inside the location before
        # parsing - locations should have no whitespace whatsoever
        location_line = self._clean_location(content)

        # Older records have junk like replace(266,"c") in the
        # location line. Newer records just replace this with
        # the number 266 and have the information in a more reasonable
        # place. So we'll just grab out the number and feed this to the
        # parser. We shouldn't really be losing any info this way.
        if 'replace' in location_line:
            comma_pos = location_line.find(',')
            # len("replace(") == 8, so this keeps just the position number
            location_line = location_line[8:comma_pos]

        cur_feature = self._cur_feature

        #Handle top level complement here for speed
        if location_line.startswith("complement("):
            assert location_line.endswith(")")
            # len("complement(") == 11; strip the wrapper
            location_line = location_line[11:-1]
            strand = -1
        elif "PROTEIN" in self._seq_type.upper():
            strand = None
        else:
            #Assume nucleotide otherwise feature strand for
            #GenBank files with bad LOCUS lines set to None
            strand = 1

        #Special case handling of the most common cases for speed
        if _re_simple_location.match(location_line):
            #e.g. "123..456"
            # GenBank positions are one-based inclusive; FeatureLocation is
            # zero-based half-open, hence int(s)-1
            s, e = location_line.split("..")
            cur_feature.location = SeqFeature.FeatureLocation(int(s)-1,
                                                              int(e),
                                                              strand)
            return

        if _solo_bond.search(location_line):
            #e.g. bond(196)
            #e.g. join(bond(284),bond(305),bond(309),bond(305))
            import warnings
            from Bio import BiopythonParserWarning
            warnings.warn("Dropping bond qualifier in feature location", BiopythonParserWarning)
            #There ought to be a better way to do this...
            # x[5:-1] strips the "bond(" prefix and ")" suffix, keeping
            # just the position
            for x in _solo_bond.finditer(location_line):
                x = x.group()
                location_line = location_line.replace(x, x[5:-1])

        if _re_simple_compound.match(location_line):
            #e.g. join(<123..456,480..>500)
            i = location_line.find("(")
            #cur_feature.location_operator = location_line[:i]
            #we can split on the comma because these are simple locations
            sub_features = cur_feature.sub_features
            for part in location_line[i+1:-1].split(","):
                s, e = part.split("..")
                f = SeqFeature.SeqFeature(SeqFeature.FeatureLocation(int(s)-1,
                                                                     int(e),
                                                                     strand),
                        location_operator=cur_feature.location_operator,
                        type=cur_feature.type)
                sub_features.append(f)
            #s = cur_feature.sub_features[0].location.start
            #e = cur_feature.sub_features[-1].location.end
            #cur_feature.location = SeqFeature.FeatureLocation(s,e, strand)
            #TODO - Remove use of sub_features
            if strand == -1:
                # GenBank lists the parts of a reverse strand join in
                # biological order, so reverse them for CompoundLocation
                cur_feature.location = SeqFeature.CompoundLocation([f.location for f in sub_features[::-1]],
                                                                   operator=location_line[:i])
            else:
                cur_feature.location = SeqFeature.CompoundLocation([f.location for f in sub_features],
                                                                   operator=location_line[:i])
            return

        #Handle the general case with more complex regular expressions
        if _re_complex_location.match(location_line):
            #e.g. "AL121804.2:41..610"
            if ":" in location_line:
                location_ref, location_line = location_line.split(":")
                cur_feature.location = _loc(location_line, self._expected_size, strand)
                cur_feature.location.ref = location_ref
            else:
                cur_feature.location = _loc(location_line, self._expected_size, strand)
            return

        if _re_complex_compound.match(location_line):
            i = location_line.find("(")
            #cur_feature.location_operator = location_line[:i]
            #Can't split on the comma because of positions like one-of(1,2,3)
            sub_features = cur_feature.sub_features
            for part in _split_compound_loc(location_line[i+1:-1]):
                if part.startswith("complement("):
                    assert part[-1]==")"
                    part = part[11:-1]
                    assert strand != -1, "Double complement?"
                    part_strand = -1
                else:
                    part_strand = strand
                if ":" in part:
                    ref, part = part.split(":")
                else:
                    ref = None
                try:
                    loc = _loc(part, self._expected_size, part_strand)
                except ValueError as err:
                    # NOTE(review): these prints look like leftover debug
                    # output before the re-raise -- candidate for removal
                    print(location_line)
                    print(part)
                    raise err
                f = SeqFeature.SeqFeature(location=loc, ref=ref,
                        location_operator=cur_feature.location_operator,
                        type=cur_feature.type)
                sub_features.append(f)
            # Historically a join on the reverse strand has been represented
            # in Biopython with both the parent SeqFeature and its children
            # (the exons for a CDS) all given a strand of -1.  Likewise, for
            # a join feature on the forward strand they all have strand +1.
            # However, we must also consider evil mixed strand examples like
            # this, join(complement(69611..69724),139856..140087,140625..140650)
            #
            # TODO - Remove use of sub_features
            strands = set(sf.strand for sf in sub_features)
            if len(strands)==1:
                strand = sub_features[0].strand
            else:
                strand = None # i.e. mixed strands
            if strand == -1:
                #Reverse the backwards order used in GenBank files
                cur_feature.location = SeqFeature.CompoundLocation([f.location for f in sub_features[::-1]],
                                                                   operator=location_line[:i])
            else:
                cur_feature.location = SeqFeature.CompoundLocation([f.location for f in sub_features],
                                                                   operator=location_line[:i])
            return
        #Not recognised
        if "order" in location_line and "join" in location_line:
            #See Bug 3197
            msg = 'Combinations of "join" and "order" within the same ' + \
                  'location (nested operators) are illegal:\n' + location_line
            raise LocationParserError(msg)
        #This used to be an error....
        cur_feature.location = None
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn(BiopythonParserWarning("Couldn't parse feature location: %r"
                                             % (location_line)))
1109
1110 - def feature_qualifier(self, key, value):
1111 """When we get a qualifier key and its value. 1112 1113 Can receive None, since you can have valueless keys such as /pseudo 1114 """ 1115 # Hack to try to preserve historical behaviour of /pseudo etc 1116 if value is None: 1117 # if the key doesn't exist yet, add an empty string 1118 if key not in self._cur_feature.qualifiers: 1119 self._cur_feature.qualifiers[key] = [""] 1120 return 1121 # otherwise just skip this key 1122 return 1123 1124 value = value.replace('"', '') 1125 if self._feature_cleaner is not None: 1126 value = self._feature_cleaner.clean_value(key, value) 1127 1128 # if the qualifier name exists, append the value 1129 if key in self._cur_feature.qualifiers: 1130 self._cur_feature.qualifiers[key].append(value) 1131 # otherwise start a new list of the key with its values 1132 else: 1133 self._cur_feature.qualifiers[key] = [value]
1134
1135 - def feature_qualifier_name(self, content_list):
1136 """Use feature_qualifier instead (OBSOLETE).""" 1137 raise NotImplementedError("Use the feature_qualifier method instead.")
1138
1139 - def feature_qualifier_description(self, content):
1140 """Use feature_qualifier instead (OBSOLETE).""" 1141 raise NotImplementedError("Use the feature_qualifier method instead.")
1142
1143 - def contig_location(self, content):
1144 """Deal with CONTIG information.""" 1145 #Historically this was stored as a SeqFeature object, but it was 1146 #stored under record.annotations["contig"] and not under 1147 #record.features with the other SeqFeature objects. 1148 # 1149 #The CONTIG location line can include additional tokens like 1150 #Gap(), Gap(100) or Gap(unk100) which are not used in the feature 1151 #location lines, so storing it using SeqFeature based location 1152 #objects is difficult. 1153 # 1154 #We now store this a string, which means for BioSQL we are now in 1155 #much better agreement with how BioPerl records the CONTIG line 1156 #in the database. 1157 # 1158 #NOTE - This code assumes the scanner will return all the CONTIG 1159 #lines already combined into one long string! 1160 self.data.annotations["contig"] = content
1161
1162 - def origin_name(self, content):
1163 pass
1164
1165 - def base_count(self, content):
1166 pass
1167
1168 - def base_number(self, content):
1169 pass
1170
1171 - def sequence(self, content):
1172 """Add up sequence information as we get it. 1173 1174 To try and make things speedier, this puts all of the strings 1175 into a list of strings, and then uses string.join later to put 1176 them together. Supposedly, this is a big time savings 1177 """ 1178 assert ' ' not in content 1179 self._seq_data.append(content.upper())
1180
1181 - def record_end(self, content):
1182 """Clean up when we've finished the record. 1183 """ 1184 from Bio import Alphabet 1185 from Bio.Alphabet import IUPAC 1186 from Bio.Seq import Seq, UnknownSeq 1187 1188 #Try and append the version number to the accession for the full id 1189 if not self.data.id: 1190 assert 'accessions' not in self.data.annotations, \ 1191 self.data.annotations['accessions'] 1192 self.data.id = self.data.name # Good fall back? 1193 elif self.data.id.count('.') == 0: 1194 try: 1195 self.data.id+='.%i' % self.data.annotations['sequence_version'] 1196 except KeyError: 1197 pass 1198 1199 # add the sequence information 1200 # first, determine the alphabet 1201 # we default to an generic alphabet if we don't have a 1202 # seq type or have strange sequence information. 1203 seq_alphabet = Alphabet.generic_alphabet 1204 1205 # now set the sequence 1206 sequence = "".join(self._seq_data) 1207 1208 if self._expected_size is not None \ 1209 and len(sequence) != 0 \ 1210 and self._expected_size != len(sequence): 1211 import warnings 1212 from Bio import BiopythonParserWarning 1213 warnings.warn("Expected sequence length %i, found %i (%s)." 1214 % (self._expected_size, len(sequence), self.data.id), 1215 BiopythonParserWarning) 1216 1217 if self._seq_type: 1218 # mRNA is really also DNA, since it is actually cDNA 1219 if 'DNA' in self._seq_type.upper() or 'MRNA' in self._seq_type.upper(): 1220 seq_alphabet = IUPAC.ambiguous_dna 1221 # are there ever really RNA sequences in GenBank? 1222 elif 'RNA' in self._seq_type.upper(): 1223 #Even for data which was from RNA, the sequence string 1224 #is usually given as DNA (T not U). Bug 2408 1225 if "T" in sequence and "U" not in sequence: 1226 seq_alphabet = IUPAC.ambiguous_dna 1227 else: 1228 seq_alphabet = IUPAC.ambiguous_rna 1229 elif 'PROTEIN' in self._seq_type.upper() \ 1230 or self._seq_type == "PRT": # PRT is used in EMBL-bank for patents 1231 seq_alphabet = IUPAC.protein # or extended protein? 
1232 # work around ugly GenBank records which have circular or 1233 # linear but no indication of sequence type 1234 elif self._seq_type in ["circular", "linear", "unspecified"]: 1235 pass 1236 # we have a bug if we get here 1237 else: 1238 raise ValueError("Could not determine alphabet for seq_type %s" 1239 % self._seq_type) 1240 1241 if not sequence and self.__expected_size: 1242 self.data.seq = UnknownSeq(self._expected_size, seq_alphabet) 1243 else: 1244 self.data.seq = Seq(sequence, seq_alphabet)
1245 1246
1247 -class _RecordConsumer(_BaseGenBankConsumer):
1248 """Create a GenBank Record object from scanner generated information (PRIVATE). 1249 """
    def __init__(self):
        """Initialize an empty Record plus the per-entry working state."""
        _BaseGenBankConsumer.__init__(self)
        from . import Record
        self.data = Record.Record()

        # Working state, filled in as the scanner feeds us events:
        self._seq_data = []        # sequence chunks, joined at record_end
        self._cur_reference = None  # Record.Reference being built
        self._cur_feature = None    # Record.Feature being built
        self._cur_qualifier = None  # Record.Qualifier being built
1259
1260 - def wgs(self, content):
1261 self.data.wgs = content.split('-')
1262
1263 - def add_wgs_scafld(self, content):
1264 self.data.wgs_scafld.append(content.split('-'))
1265
1266 - def locus(self, content):
1267 self.data.locus = content
1268
1269 - def size(self, content):
1270 self.data.size = content
1271
1272 - def residue_type(self, content):
1273 # Be lenient about parsing, but technically lowercase residue types are malformed. 1274 if 'dna' in content or 'rna' in content: 1275 import warnings 1276 from Bio import BiopythonParserWarning 1277 warnings.warn("Invalid seq_type (%s): DNA/RNA should be uppercase." % content, 1278 BiopythonParserWarning) 1279 self.data.residue_type = content
1280
1281 - def data_file_division(self, content):
1282 self.data.data_file_division = content
1283
1284 - def date(self, content):
1285 self.data.date = content
1286
1287 - def definition(self, content):
1288 self.data.definition = content
1289
1290 - def accession(self, content):
1291 for acc in self._split_accessions(content): 1292 if acc not in self.data.accession: 1293 self.data.accession.append(acc)
1294
1295 - def nid(self, content):
1296 self.data.nid = content
1297
1298 - def pid(self, content):
1299 self.data.pid = content
1300
1301 - def version(self, content):
1302 self.data.version = content
1303
1304 - def db_source(self, content):
1305 self.data.db_source = content.rstrip()
1306
1307 - def gi(self, content):
1308 self.data.gi = content
1309
1310 - def keywords(self, content):
1311 self.data.keywords = self._split_keywords(content)
1312
1313 - def project(self, content):
1314 self.data.projects.extend(p for p in content.split() if p)
1315 1318
1319 - def segment(self, content):
1320 self.data.segment = content
1321
1322 - def source(self, content):
1323 self.data.source = content
1324
1325 - def organism(self, content):
1326 self.data.organism = content
1327
1328 - def taxonomy(self, content):
1329 self.data.taxonomy = self._split_taxonomy(content)
1330
1331 - def reference_num(self, content):
1332 """Grab the reference number and signal the start of a new reference. 1333 """ 1334 # check if we have a reference to add 1335 if self._cur_reference is not None: 1336 self.data.references.append(self._cur_reference) 1337 1338 from . import Record 1339 self._cur_reference = Record.Reference() 1340 self._cur_reference.number = content
1341
1342 - def reference_bases(self, content):
1343 self._cur_reference.bases = content
1344
1345 - def authors(self, content):
1346 self._cur_reference.authors = content
1347
1348 - def consrtm(self, content):
1349 self._cur_reference.consrtm = content
1350
1351 - def title(self, content):
1352 if self._cur_reference is None: 1353 import warnings 1354 from Bio import BiopythonParserWarning 1355 warnings.warn("GenBank TITLE line without REFERENCE line.", 1356 BiopythonParserWarning) 1357 return 1358 self._cur_reference.title = content
1359
1360 - def journal(self, content):
1361 self._cur_reference.journal = content
1362
1363 - def medline_id(self, content):
1364 self._cur_reference.medline_id = content
1365
1366 - def pubmed_id(self, content):
1367 self._cur_reference.pubmed_id = content
1368
1369 - def remark(self, content):
1370 self._cur_reference.remark = content
1371
1372 - def comment(self, content):
1373 self.data.comment += "\n".join(content)
1374
1375 - def primary_ref_line(self, content):
1376 """Data for the PRIMARY line""" 1377 self.data.primary.append(content)
1378
1379 - def primary(self, content):
1380 pass
1381
1382 - def features_line(self, content):
1383 """Get ready for the feature table when we reach the FEATURE line. 1384 """ 1385 self.start_feature_table()
1386
1387 - def start_feature_table(self):
1388 """Signal the start of the feature table. 1389 """ 1390 # we need to add on the last reference 1391 if self._cur_reference is not None: 1392 self.data.references.append(self._cur_reference)
1393
1394 - def feature_key(self, content):
1395 """Grab the key of the feature and signal the start of a new feature. 1396 """ 1397 # first add on feature information if we've got any 1398 self._add_feature() 1399 1400 from . import Record 1401 self._cur_feature = Record.Feature() 1402 self._cur_feature.key = content
1403
1404 - def _add_feature(self):
1405 """Utility function to add a feature to the Record. 1406 1407 This does all of the appropriate checking to make sure we haven't 1408 left any info behind, and that we are only adding info if it 1409 exists. 1410 """ 1411 if self._cur_feature is not None: 1412 # if we have a left over qualifier, add it to the qualifiers 1413 # on the current feature 1414 if self._cur_qualifier is not None: 1415 self._cur_feature.qualifiers.append(self._cur_qualifier) 1416 1417 self._cur_qualifier = None 1418 self.data.features.append(self._cur_feature)
1419
1420 - def location(self, content):
1421 self._cur_feature.location = self._clean_location(content)
1422
1423 - def feature_qualifier(self, key, value):
1424 self.feature_qualifier_name([key]) 1425 if value is not None: 1426 self.feature_qualifier_description(value)
1427
1428 - def feature_qualifier_name(self, content_list):
1429 """Deal with qualifier names 1430 1431 We receive a list of keys, since you can have valueless keys such as 1432 /pseudo which would be passed in with the next key (since no other 1433 tags separate them in the file) 1434 """ 1435 from . import Record 1436 for content in content_list: 1437 # the record parser keeps the /s -- add them if we don't have 'em 1438 if not content.startswith("/"): 1439 content = "/%s" % content 1440 # add on a qualifier if we've got one 1441 if self._cur_qualifier is not None: 1442 self._cur_feature.qualifiers.append(self._cur_qualifier) 1443 1444 self._cur_qualifier = Record.Qualifier() 1445 self._cur_qualifier.key = content
1446
1447 - def feature_qualifier_description(self, content):
1448 # if we have info then the qualifier key should have a ='s 1449 if '=' not in self._cur_qualifier.key: 1450 self._cur_qualifier.key = "%s=" % self._cur_qualifier.key 1451 cur_content = self._remove_newlines(content) 1452 # remove all spaces from the value if it is a type where spaces 1453 # are not important 1454 for remove_space_key in self.__class__.remove_space_keys: 1455 if remove_space_key in self._cur_qualifier.key: 1456 cur_content = self._remove_spaces(cur_content) 1457 self._cur_qualifier.value = self._normalize_spaces(cur_content)
1458
1459 - def base_count(self, content):
1460 self.data.base_counts = content
1461
1462 - def origin_name(self, content):
1463 self.data.origin = content
1464
1465 - def contig_location(self, content):
1466 """Signal that we have contig information to add to the record. 1467 """ 1468 self.data.contig = self._clean_location(content)
1469
1470 - def sequence(self, content):
1471 """Add sequence information to a list of sequence strings. 1472 1473 This removes spaces in the data and uppercases the sequence, and 1474 then adds it to a list of sequences. Later on we'll join this 1475 list together to make the final sequence. This is faster than 1476 adding on the new string every time. 1477 """ 1478 assert ' ' not in content 1479 self._seq_data.append(content.upper())
1480
1481 - def record_end(self, content):
1482 """Signal the end of the record and do any necessary clean-up. 1483 """ 1484 # add together all of the sequence parts to create the 1485 # final sequence string 1486 self.data.sequence = "".join(self._seq_data) 1487 # add on the last feature 1488 self._add_feature()
1489 1490
def parse(handle):
    """Iterate over GenBank formatted entries as Record objects.

    >>> from Bio import GenBank
    >>> with open("GenBank/NC_000932.gb") as handle:
    ...     for record in GenBank.parse(handle):
    ...         print(record.accession)
    ['NC_000932']

    To get SeqRecord objects use Bio.SeqIO.parse(..., format="gb")
    instead.
    """
    record_parser = RecordParser()
    return iter(Iterator(handle, record_parser))
1504 1505
def read(handle):
    """Read a handle containing a single GenBank entry as a Record object.

    >>> from Bio import GenBank
    >>> with open("GenBank/NC_000932.gb") as handle:
    ...     record = GenBank.read(handle)
    ...     print(record.accession)
    ['NC_000932']

    To get a SeqRecord object use Bio.SeqIO.read(..., format="gb")
    instead.
    """
    iterator = parse(handle)
    # exactly one record expected: fail on zero, fail on more than one
    first = next(iterator, None)
    if first is None:
        raise ValueError("No records found in handle")
    if next(iterator, None) is not None:
        raise ValueError("More than one record found in handle")
    return first
1532 1533
1534 -def _test():
1535 """Run the Bio.GenBank module's doctests.""" 1536 import doctest 1537 import os 1538 if os.path.isdir(os.path.join("..", "..", "Tests")): 1539 print("Running doctests...") 1540 cur_dir = os.path.abspath(os.curdir) 1541 os.chdir(os.path.join("..", "..", "Tests")) 1542 doctest.testmod() 1543 os.chdir(cur_dir) 1544 del cur_dir 1545 print("Done") 1546 elif os.path.isdir(os.path.join("Tests")): 1547 print("Running doctests...") 1548 cur_dir = os.path.abspath(os.curdir) 1549 os.chdir(os.path.join("Tests")) 1550 doctest.testmod() 1551 os.chdir(cur_dir) 1552 del cur_dir 1553 print("Done")
1554 1555 if __name__ == "__main__": 1556 _test() 1557