Package Bio :: Package GenBank :: Module Record
[hide private]
[frames | no frames]

Source Code for Module Bio.GenBank.Record

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """Hold GenBank data in a straightforward format. 
  7   
  8  classes: 
  9   
 10      - Record - All of the information in a GenBank record. 
 11      - Reference - hold reference data for a record. 
 12      - Feature - Hold the information in a Feature Table. 
 13      - Qualifier - Qualifiers on a Feature. 
 14   
 15  17-MAR-2009: added support for WGS and WGS_SCAFLD lines.  Ying Huang & Iddo Friedberg 
 16  """ 
 17  # local stuff 
 18  import Bio.GenBank 
 19   
 20  __docformat__ = "restructuredtext en" 
 21   
 22   
def _wrapped_genbank(information, indent, wrap_space=1, split_char=" "):
    """Write a line of GenBank info that can wrap over multiple lines.

    This takes a line of information which can potentially wrap over
    multiple lines, and breaks it up with newlines and indentation so it
    fits properly into a GenBank record.

    Arguments:

     - information - The string holding the information we want
       wrapped in GenBank method.

     - indent - The indentation on the lines we are writing.

     - wrap_space - Whether or not to wrap only on split_char. When
       false the information is cut into fixed-width chunks instead.

     - split_char - A specific character to split the lines on. By default
       spaces are used.

    Returns the wrapped text. The first line carries no indent (the caller
    has already written the keyword column); continuation lines are
    prefixed with ``indent`` spaces and every line ends with a newline.
    Information that is empty, or consists only of split characters, is
    written as ".".

    Raises ValueError when fixed-width chunking is requested but ``indent``
    leaves no room on the line.
    """
    info_length = Record.GB_LINE_LENGTH - indent

    if not information:
        # GenBank files use "." for missing data
        return ".\n"

    if wrap_space:
        info_parts = information.split(split_char)
    else:
        if info_length <= 0:
            # a non-positive chunk width can never make progress
            raise ValueError("indent %r leaves no room for data on a "
                             "%i character line"
                             % (indent, Record.GB_LINE_LENGTH))
        info_parts = [information[pos:pos + info_length]
                      for pos in range(0, len(information), info_length)]

    # first get the information string split up by line
    output_parts = []
    cur_part = ""
    for info_part in info_parts:
        # the "+ 1" reserves room for the separator that would join the parts
        if len(cur_part) + 1 + len(info_part) > info_length:
            if cur_part:
                # a non-space separator is data, so keep it on the line end
                if split_char != " ":
                    cur_part += split_char
                output_parts.append(cur_part)
            cur_part = info_part
        elif cur_part == "":
            cur_part = info_part
        else:
            cur_part += split_char + info_part

    # add the last bit of information to the output
    if cur_part:
        output_parts.append(cur_part)

    if not output_parts:
        # only separators were supplied, so there is nothing to write;
        # treat it like missing data instead of failing on output_parts[0]
        return ".\n"

    # now format the information string for return
    return ("\n" + " " * indent).join(output_parts) + "\n"
84 85
86 -def _indent_genbank(information, indent):
87 """Write out information with the specified indent. 88 89 Unlike _wrapped_genbank, this function makes no attempt to wrap 90 lines -- it assumes that the information already has newlines in the 91 appropriate places, and will add the specified indent to the start of 92 each line. 93 """ 94 # split the info into lines based on line breaks 95 info_parts = information.split("\n") 96 97 # the first line will have no indent 98 output_info = info_parts[0] + "\n" 99 for info_part in info_parts[1:]: 100 output_info += " " * indent + info_part + "\n" 101 102 return output_info
103 104
class Record(object):
    """Hold GenBank information in a format similar to the original record.

    The Record class is meant to make data easy to get to when you are
    just interested in looking at GenBank data.

    Attributes:

     - locus - The name specified after the LOCUS keyword in the GenBank
       record. This may be the accession number, or a clone id or
       something else.
     - size - The size of the record.
     - residue_type - The type of residues making up the sequence in this
       record. Normally something like RNA, DNA or PROTEIN, but may be as
       esoteric as 'ss-RNA circular'.
     - data_file_division - The division this record is stored under in
       GenBank (ie. PLN -> plants; PRI -> humans, primates; BCT ->
       bacteria...)
     - date - The date of submission of the record, in a form like
       '28-JUL-1998'
     - accession - list of all accession numbers for the sequence.
     - nid - Nucleotide identifier number.
     - pid - Protein identifier number.
     - version - The accession number + version (ie. AB01234.2)
     - db_source - Information about the database the record came from
     - gi - The NCBI gi identifier for the record.
     - keywords - A list of keywords related to the record.
     - segment - If the record is one of a series, this is info about which
       segment this record is (something like '1 of 6').
     - source - The source of material where the sequence came from.
     - organism - The genus and species of the organism (ie. 'Homo sapiens')
     - taxonomy - A listing of the taxonomic classification of the organism,
       starting general and getting more specific.
     - references - A list of Reference objects.
     - comment - Text with any kind of comment about the record.
     - features - A listing of Features making up the feature table.
     - base_counts - A string with the counts of bases for the sequence.
     - origin - A string specifying info about the origin of the sequence.
     - sequence - A string with the sequence itself.
     - contig - A string of location information for a CONTIG in a RefSeq
       file
     - projects - The genome sequencing project numbers
       (will be replaced by the dblink cross-references in 2009).
     - dblinks - The genome sequencing project number(s) and other links.
       (will replace the project information in 2009).
    """

    # Column layout used when a record is written back out as text.
    GB_LINE_LENGTH = 79
    GB_BASE_INDENT = 12
    GB_FEATURE_INDENT = 21
    GB_INTERNAL_INDENT = 2
    GB_OTHER_INTERNAL_INDENT = 3
    GB_FEATURE_INTERNAL_INDENT = 5
    GB_SEQUENCE_INDENT = 9

    # Keyword column: left-justified and padded to the base indent.
    BASE_FORMAT = "%%-%ds" % GB_BASE_INDENT
    # Sub-keywords: a few leading spaces, then padded so that the keyword
    # column still ends at the base indent.
    INTERNAL_FORMAT = (" " * GB_INTERNAL_INDENT) + (
        "%%-%ds" % (GB_BASE_INDENT - GB_INTERNAL_INDENT))
    OTHER_INTERNAL_FORMAT = (" " * GB_OTHER_INTERNAL_INDENT) + (
        "%%-%ds" % (GB_BASE_INDENT - GB_OTHER_INTERNAL_INDENT))

    # Feature table equivalents of the two formats above.
    BASE_FEATURE_FORMAT = "%%-%ds" % GB_FEATURE_INDENT
    INTERNAL_FEATURE_FORMAT = (" " * GB_FEATURE_INTERNAL_INDENT) + (
        "%%-%ds" % (GB_FEATURE_INDENT - GB_FEATURE_INTERNAL_INDENT))
    # Right-justified base position at the start of each sequence line.
    SEQUENCE_FORMAT = "%%%ds" % GB_SEQUENCE_INDENT
169 - def __init__(self):
170 self.locus = '' 171 self.size = '' 172 self.residue_type = '' 173 self.data_file_division = '' 174 self.date = '' 175 self.definition = '' 176 self.accession = [] 177 self.nid = '' 178 self.pid = '' 179 self.version = '' 180 self.projects = [] 181 self.dblinks = [] 182 self.db_source = '' 183 self.gi = '' 184 self.keywords = [] 185 self.segment = '' 186 self.source = '' 187 self.organism = '' 188 self.taxonomy = [] 189 self.references = [] 190 self.comment = '' 191 self.features = [] 192 self.base_counts = '' 193 self.origin = '' 194 self.sequence = '' 195 self.contig = '' 196 self.primary = [] 197 self.wgs = '' 198 self.wgs_scafld = []
199
200 - def __str__(self):
201 """Provide a GenBank formatted output option for a Record. 202 203 The objective of this is to provide an easy way to read in a GenBank 204 record, modify it somehow, and then output it in 'GenBank format.' 205 We are striving to make this work so that a parsed Record that is 206 output using this function will look exactly like the original 207 record. 208 209 Much of the output is based on format description info at: 210 211 ftp://ncbi.nlm.nih.gov/genbank/gbrel.txt 212 """ 213 output = self._locus_line() 214 output += self._definition_line() 215 output += self._accession_line() 216 output += self._version_line() 217 output += self._project_line() 218 output += self._dblink_line() 219 output += self._nid_line() 220 output += self._pid_line() 221 output += self._keywords_line() 222 output += self._db_source_line() 223 output += self._segment_line() 224 output += self._source_line() 225 output += self._organism_line() 226 for reference in self.references: 227 output += str(reference) 228 output += self._comment_line() 229 output += self._features_line() 230 for feature in self.features: 231 output += str(feature) 232 output += self._base_count_line() 233 output += self._origin_line() 234 output += self._sequence_line() 235 output += self._wgs_line() 236 output += self._wgs_scafld_line() 237 output += self._contig_line() 238 output += "//" 239 return output
240
241 - def _locus_line(self):
242 """Provide the output string for the LOCUS line. 243 """ 244 output = "LOCUS" 245 output += " " * 7 # 6-12 spaces 246 output += "%-9s" % self.locus 247 output += " " # 22 space 248 output += "%7s" % self.size 249 if "PROTEIN" in self.residue_type: 250 output += " aa" 251 else: 252 output += " bp " 253 254 # treat circular types differently, since they'll have long residue 255 # types 256 if "circular" in self.residue_type: 257 output += "%17s" % self.residue_type 258 # second case: ss-DNA types of records 259 elif "-" in self.residue_type: 260 output += "%7s" % self.residue_type 261 output += " " * 10 # spaces for circular 262 else: 263 output += " " * 3 # spaces for stuff like ss- 264 output += "%-4s" % self.residue_type 265 output += " " * 10 # spaces for circular 266 267 output += " " * 2 268 output += "%3s" % self.data_file_division 269 output += " " * 7 # spaces for 56-63 270 output += "%11s" % self.date 271 output += "\n" 272 return output
273
274 - def _definition_line(self):
275 """Provide output for the DEFINITION line. 276 """ 277 output = Record.BASE_FORMAT % "DEFINITION" 278 output += _wrapped_genbank(self.definition, Record.GB_BASE_INDENT) 279 return output
280
281 - def _accession_line(self):
282 """Output for the ACCESSION line. 283 """ 284 if self.accession: 285 output = Record.BASE_FORMAT % "ACCESSION" 286 287 acc_info = "" 288 for accession in self.accession: 289 acc_info += "%s " % accession 290 # strip off an extra space at the end 291 acc_info = acc_info.rstrip() 292 output += _wrapped_genbank(acc_info, Record.GB_BASE_INDENT) 293 else: 294 output = "" 295 296 return output
297
298 - def _version_line(self):
299 """Output for the VERSION line. 300 """ 301 if self.version: 302 output = Record.BASE_FORMAT % "VERSION" 303 output += self.version 304 output += " GI:" 305 output += "%s\n" % self.gi 306 else: 307 output = "" 308 return output
309
310 - def _project_line(self):
311 output = "" 312 if len(self.projects) > 0: 313 output = Record.BASE_FORMAT % "PROJECT" 314 output += "%s\n" % " ".join(self.projects) 315 return output
316 324
325 - def _nid_line(self):
326 """Output for the NID line. Use of NID is obsolete in GenBank files. 327 """ 328 if self.nid: 329 output = Record.BASE_FORMAT % "NID" 330 output += "%s\n" % self.nid 331 else: 332 output = "" 333 return output
334
335 - def _pid_line(self):
336 """Output for PID line. Presumedly, PID usage is also obsolete. 337 """ 338 if self.pid: 339 output = Record.BASE_FORMAT % "PID" 340 output += "%s\n" % self.pid 341 else: 342 output = "" 343 return output
344
345 - def _keywords_line(self):
346 """Output for the KEYWORDS line. 347 """ 348 output = "" 349 if len(self.keywords) >= 0: 350 output += Record.BASE_FORMAT % "KEYWORDS" 351 keyword_info = "" 352 for keyword in self.keywords: 353 keyword_info += "%s; " % keyword 354 # replace the ; at the end with a period 355 keyword_info = keyword_info[:-2] 356 keyword_info += "." 357 358 output += _wrapped_genbank(keyword_info, 359 Record.GB_BASE_INDENT) 360 361 return output
362
363 - def _db_source_line(self):
364 """Output for DBSOURCE line. 365 """ 366 if self.db_source: 367 output = Record.BASE_FORMAT % "DBSOURCE" 368 output += "%s\n" % self.db_source 369 else: 370 output = "" 371 return output
372
373 - def _segment_line(self):
374 """Output for the SEGMENT line. 375 """ 376 output = "" 377 if self.segment: 378 output += Record.BASE_FORMAT % "SEGMENT" 379 output += _wrapped_genbank(self.segment, Record.GB_BASE_INDENT) 380 return output
381
382 - def _source_line(self):
383 """Output for SOURCE line on where the sample came from. 384 """ 385 output = Record.BASE_FORMAT % "SOURCE" 386 output += _wrapped_genbank(self.source, Record.GB_BASE_INDENT) 387 return output
388
389 - def _organism_line(self):
390 """Output for ORGANISM line with taxonomy info. 391 """ 392 output = Record.INTERNAL_FORMAT % "ORGANISM" 393 # Now that species names can be too long, this line can wrap (Bug 2591) 394 output += _wrapped_genbank(self.organism, Record.GB_BASE_INDENT) 395 output += " " * Record.GB_BASE_INDENT 396 taxonomy_info = "" 397 for tax in self.taxonomy: 398 taxonomy_info += "%s; " % tax 399 # replace the ; at the end with a period 400 taxonomy_info = taxonomy_info[:-2] 401 taxonomy_info += "." 402 output += _wrapped_genbank(taxonomy_info, Record.GB_BASE_INDENT) 403 404 return output
405
406 - def _comment_line(self):
407 """Output for the COMMENT lines. 408 """ 409 output = "" 410 if self.comment: 411 output += Record.BASE_FORMAT % "COMMENT" 412 output += _indent_genbank(self.comment, 413 Record.GB_BASE_INDENT) 414 return output
415
416 - def _features_line(self):
417 """Output for the FEATURES line. 418 """ 419 output = "" 420 if len(self.features) > 0: 421 output += Record.BASE_FEATURE_FORMAT % "FEATURES" 422 output += "Location/Qualifiers\n" 423 return output
424
425 - def _base_count_line(self):
426 """Output for the BASE COUNT line with base information. 427 """ 428 output = "" 429 if self.base_counts: 430 output += Record.BASE_FORMAT % "BASE COUNT " 431 # split up the base counts into their individual parts 432 count_parts = self.base_counts.split(" ") 433 while '' in count_parts: 434 count_parts.remove('') 435 # deal with the standard case, with a normal origin line 436 # like: 474 a 356 c 428 g 364 t 437 if len(count_parts) % 2 == 0: 438 while len(count_parts) > 0: 439 count_info = count_parts.pop(0) 440 count_type = count_parts.pop(0) 441 442 output += "%7s %s" % (count_info, count_type) 443 # deal with ugly ORIGIN lines like: 444 # 1311257 a2224835 c2190093 g1309889 t 445 # by just outputting the raw information 446 else: 447 output += self.base_counts 448 output += "\n" 449 return output
450
451 - def _origin_line(self):
452 """Output for the ORIGIN line 453 """ 454 output = "" 455 # only output the ORIGIN line if we have a sequence 456 if self.sequence: 457 output += Record.BASE_FORMAT % "ORIGIN" 458 if self.origin: 459 output += _wrapped_genbank(self.origin, 460 Record.GB_BASE_INDENT) 461 else: 462 output += "\n" 463 return output
464
465 - def _sequence_line(self):
466 """Output for all of the sequence. 467 """ 468 output = "" 469 if self.sequence: 470 cur_seq_pos = 0 471 while cur_seq_pos < len(self.sequence): 472 output += Record.SEQUENCE_FORMAT % str(cur_seq_pos + 1) 473 474 for section in range(6): 475 start_pos = cur_seq_pos + section * 10 476 end_pos = start_pos + 10 477 seq_section = self.sequence[start_pos:end_pos] 478 output += " %s" % seq_section.lower() 479 480 # stop looping if we are out of sequence 481 if end_pos > len(self.sequence): 482 break 483 484 output += "\n" 485 cur_seq_pos += 60 486 return output
487
488 - def _wgs_line(self):
489 output = "" 490 if self.wgs: 491 output += Record.BASE_FORMAT % "WGS" 492 output += self.wgs 493 return output
494
495 - def _wgs_scafld_line(self):
496 output = "" 497 if self.wgs_scafld: 498 output += Record.BASE_FORMAT % "WGS_SCAFLD" 499 output += self.wgs_scafld 500 return output
501
502 - def _contig_line(self):
503 """Output for CONTIG location information from RefSeq. 504 """ 505 output = "" 506 if self.contig: 507 output += Record.BASE_FORMAT % "CONTIG" 508 output += _wrapped_genbank(self.contig, 509 Record.GB_BASE_INDENT, split_char=',') 510 return output
511 512
class Reference(object):
    """Hold information from a GenBank reference.

    Attributes:

     - number - The number of the reference in the listing of references.
     - bases - The bases in the sequence the reference refers to.
     - authors - String with all of the authors.
     - consrtm - Consortium the authors belong to.
     - title - The title of the reference.
     - journal - Information about the journal where the reference appeared.
     - medline_id - The medline id for the reference.
     - pubmed_id - The pubmed_id for the reference.
     - remark - Free-form remarks about the reference.
    """

    def __init__(self):
        """Create an empty reference; every attribute starts as ""."""
        self.number = ""
        self.bases = ""
        self.authors = ""
        self.consrtm = ""
        self.title = ""
        self.journal = ""
        self.medline_id = ""
        self.pubmed_id = ""
        self.remark = ""

    def __str__(self):
        """Return the reference as GenBank formatted text."""
        return "".join([
            self._reference_line(),
            self._authors_line(),
            self._consrtm_line(),
            self._title_line(),
            self._journal_line(),
            self._medline_line(),
            self._pubmed_line(),
            self._remark_line(),
        ])

    def _reference_line(self):
        """Output for REFERENCE lines.

        The keyword is always written. The bases are only added when a
        reference number is present as well.
        """
        pieces = [Record.BASE_FORMAT % "REFERENCE"]
        if self.number:
            if self.bases:
                pieces.append("%-3s" % self.number)
                pieces.append("%s" % self.bases)
            else:
                pieces.append("%s" % self.number)
        pieces.append("\n")
        return "".join(pieces)

    def _authors_line(self):
        """Output for AUTHORS information (empty when there are none)."""
        if not self.authors:
            return ""
        keyword = Record.INTERNAL_FORMAT % "AUTHORS"
        return keyword + _wrapped_genbank(self.authors, Record.GB_BASE_INDENT)

    def _consrtm_line(self):
        """Output for CONSRTM information (empty when there is none)."""
        if not self.consrtm:
            return ""
        keyword = Record.INTERNAL_FORMAT % "CONSRTM"
        return keyword + _wrapped_genbank(self.consrtm, Record.GB_BASE_INDENT)

    def _title_line(self):
        """Output for TITLE information (empty when there is none)."""
        if not self.title:
            return ""
        keyword = Record.INTERNAL_FORMAT % "TITLE"
        return keyword + _wrapped_genbank(self.title, Record.GB_BASE_INDENT)

    def _journal_line(self):
        """Output for JOURNAL information (empty when there is none)."""
        if not self.journal:
            return ""
        keyword = Record.INTERNAL_FORMAT % "JOURNAL"
        return keyword + _wrapped_genbank(self.journal, Record.GB_BASE_INDENT)

    def _medline_line(self):
        """Output for MEDLINE information (empty when there is no id).

        The id is written as is, without wrapping.
        """
        if not self.medline_id:
            return ""
        return (Record.INTERNAL_FORMAT % "MEDLINE") + self.medline_id + "\n"

    def _pubmed_line(self):
        """Output for PUBMED information (empty when there is no id).

        The id is written as is, without wrapping. PUBMED uses the deeper
        OTHER_INTERNAL_FORMAT indent rather than INTERNAL_FORMAT.
        """
        if not self.pubmed_id:
            return ""
        return (Record.OTHER_INTERNAL_FORMAT % "PUBMED") + self.pubmed_id + "\n"

    def _remark_line(self):
        """Output for REMARK information (empty when there is none)."""
        if not self.remark:
            return ""
        keyword = Record.INTERNAL_FORMAT % "REMARK"
        return keyword + _wrapped_genbank(self.remark, Record.GB_BASE_INDENT)
627 628
class Feature(object):
    """Hold information about a Feature in the Feature Table of GenBank record.

    Attributes:

     - key - The key name of the feature (ie. source)
     - location - The string specifying the location of the feature.
     - qualifiers - A listing of Qualifier objects in the feature.
    """

    def __init__(self):
        """Create an empty feature with no qualifiers."""
        self.key = ""
        self.location = ""
        self.qualifiers = []

    def __str__(self):
        """Return the feature and its qualifiers as GenBank formatted text.

        The location is wrapped on commas. Each qualifier starts a new line
        at the feature indent and is wrapped on spaces, unless its key
        contains one of the parser's remove_space_keys, in which case it is
        cut into fixed-width chunks instead.
        """
        pieces = [
            Record.INTERNAL_FEATURE_FORMAT % self.key,
            _wrapped_genbank(self.location, Record.GB_FEATURE_INDENT,
                             split_char=','),
        ]
        for qualifier in self.qualifiers:
            pieces.append(" " * Record.GB_FEATURE_INDENT)

            # determine whether we can wrap on spaces
            keep_unbroken = any(
                no_space_key in qualifier.key
                for no_space_key in
                Bio.GenBank._BaseGenBankConsumer.remove_space_keys)
            space_wrap = 0 if keep_unbroken else 1

            pieces.append(_wrapped_genbank(qualifier.key + qualifier.value,
                                           Record.GB_FEATURE_INDENT,
                                           space_wrap))
        return "".join(pieces)
660 661
class Qualifier(object):
    """Hold information about a qualifier in a GenBank feature.

    Attributes:

     - key - The key name of the qualifier (ie. /organism=)
     - value - The value of the qualifier ("Dictyostelium discoideum").
    """

    def __init__(self):
        """Create a qualifier with an empty key and an empty value."""
        self.key = ""
        self.value = ""
673