Package Bio :: Package GenBank :: Module Record
[hide private]
[frames] | no frames]

Source Code for Module Bio.GenBank.Record

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """Hold GenBank data in a straightforward format. 
  7   
  8  classes: 
  9  o Record - All of the information in a GenBank record. 
 10  o Reference - hold reference data for a record. 
 11  o Feature - Hold the information in a Feature Table. 
 12  o Qualifier - Qualifiers on a Feature. 
 13  17-MAR-2009: added support for WGS and WGS_SCAFLD lines.  Ying Huang & Iddo Friedberg 
 14  """ 
 15  # local stuff 
 16  import Bio.GenBank 
 17   
 18   
19 -def _wrapped_genbank(information, indent, wrap_space = 1, split_char = " "):
20 """Write a line of GenBank info that can wrap over multiple lines. 21 22 This takes a line of information which can potentially wrap over 23 multiple lines, and breaks it up with carriage returns and 24 indentation so it fits properly into a GenBank record. 25 26 Arguments: 27 28 o information - The string holding the information we want 29 wrapped in GenBank method. 30 31 o indent - The indentation on the lines we are writing. 32 33 o wrap_space - Whether or not to wrap only on spaces in the 34 information. 35 36 o split_char - A specific character to split the lines on. By default 37 spaces are used. 38 """ 39 info_length = Record.GB_LINE_LENGTH - indent 40 41 if not information: 42 #GenBank files use "." for missing data 43 return ".\n" 44 45 if wrap_space: 46 info_parts = information.split(split_char) 47 else: 48 cur_pos = 0 49 info_parts = [] 50 while cur_pos < len(information): 51 info_parts.append(information[cur_pos: cur_pos + info_length]) 52 cur_pos += info_length 53 54 # first get the information string split up by line 55 output_parts = [] 56 cur_part = "" 57 for info_part in info_parts: 58 if len(cur_part) + 1 + len(info_part) > info_length: 59 if cur_part: 60 if split_char != " ": 61 cur_part += split_char 62 output_parts.append(cur_part) 63 cur_part = info_part 64 else: 65 if cur_part == "": 66 cur_part = info_part 67 else: 68 cur_part += split_char + info_part 69 70 # add the last bit of information to the output 71 if cur_part: 72 output_parts.append(cur_part) 73 74 # now format the information string for return 75 output_info = output_parts[0] + "\n" 76 for output_part in output_parts[1:]: 77 output_info += " " * indent + output_part + "\n" 78 79 return output_info
80 81
82 -def _indent_genbank(information, indent):
83 """Write out information with the specified indent. 84 85 Unlike _wrapped_genbank, this function makes no attempt to wrap 86 lines -- it assumes that the information already has newlines in the 87 appropriate places, and will add the specified indent to the start of 88 each line. 89 """ 90 # split the info into lines based on line breaks 91 info_parts = information.split("\n") 92 93 # the first line will have no indent 94 output_info = info_parts[0] + "\n" 95 for info_part in info_parts[1:]: 96 output_info += " " * indent + info_part + "\n" 97 98 return output_info
99 100
class Record(object):
    """Hold GenBank information in a format similar to the original record.

    The Record class is meant to make data easy to get to when you are
    just interested in looking at GenBank data.

    Attributes:
    o locus - The name specified after the LOCUS keyword in the GenBank
    record. This may be the accession number, or a clone id or something else.
    o size - The size of the record.
    o residue_type - The type of residues making up the sequence in this
    record. Normally something like RNA, DNA or PROTEIN, but may be as
    esoteric as 'ss-RNA circular'.
    o data_file_division - The division this record is stored under in
    GenBank (ie. PLN -> plants; PRI -> humans, primates; BCT -> bacteria...)
    o date - The date of submission of the record, in a form like '28-JUL-1998'
    o definition - The text written on the DEFINITION line.
    o accession - list of all accession numbers for the sequence.
    o nid - Nucleotide identifier number.
    o pid - Protein identifier number
    o version - The accession number + version (ie. AB01234.2)
    o projects - The genome sequencing project numbers
    (will be replaced by the dblink cross-references in 2009).
    o dblinks - The genome sequencing project number(s) and other links.
    (will replace the project information in 2009).
    o db_source - Information about the database the record came from
    o gi - The NCBI gi identifier for the record.
    o keywords - A list of keywords related to the record.
    o segment - If the record is one of a series, this is info about which
    segment this record is (something like '1 of 6').
    o source - The source of material where the sequence came from.
    o organism - The genus and species of the organism (ie. 'Homo sapiens')
    o taxonomy - A listing of the taxonomic classification of the organism,
    starting general and getting more specific.
    o references - A list of Reference objects.
    o comment - Text with any kind of comment about the record.
    o features - A listing of Features making up the feature table.
    o base_counts - A string with the counts of bases for the sequence.
    o origin - A string specifying info about the origin of the sequence.
    o sequence - A string with the sequence itself.
    o contig - A string of location information for a CONTIG in a RefSeq file
    o primary - A list, empty by default. It is not written out by __str__.
    o wgs - Information for the WGS line: either ready-made text or a
    (first, last) pair of accessions.
    o wgs_scafld - A list with one entry per WGS_SCAFLD line; each entry
    is either ready-made text or a (first, last) pair of accessions.
    """
    # constants for outputting GenBank information
    GB_LINE_LENGTH = 79
    GB_BASE_INDENT = 12
    GB_FEATURE_INDENT = 21
    GB_INTERNAL_INDENT = 2
    GB_OTHER_INTERNAL_INDENT = 3
    GB_FEATURE_INTERNAL_INDENT = 5
    GB_SEQUENCE_INDENT = 9

    # %-format templates that pad a keyword out to the column where the
    # information for that kind of line starts
    BASE_FORMAT = "%-" + str(GB_BASE_INDENT) + "s"
    INTERNAL_FORMAT = " " * GB_INTERNAL_INDENT + "%-" + \
                      str(GB_BASE_INDENT - GB_INTERNAL_INDENT) + "s"
    OTHER_INTERNAL_FORMAT = " " * GB_OTHER_INTERNAL_INDENT + "%-" + \
                            str(GB_BASE_INDENT - GB_OTHER_INTERNAL_INDENT) + \
                            "s"

    BASE_FEATURE_FORMAT = "%-" + str(GB_FEATURE_INDENT) + "s"
    INTERNAL_FEATURE_FORMAT = " " * GB_FEATURE_INTERNAL_INDENT + "%-" + \
                              str(GB_FEATURE_INDENT -
                                  GB_FEATURE_INTERNAL_INDENT) + "s"
    # sequence positions are right-aligned in front of each sequence line
    SEQUENCE_FORMAT = "%" + str(GB_SEQUENCE_INDENT) + "s"

    def __init__(self):
        """Create an empty record; every field starts out blank."""
        self.locus = ''
        self.size = ''
        self.residue_type = ''
        self.data_file_division = ''
        self.date = ''
        self.definition = ''
        self.accession = []
        self.nid = ''
        self.pid = ''
        self.version = ''
        self.projects = []
        self.dblinks = []
        self.db_source = ''
        self.gi = ''
        self.keywords = []
        self.segment = ''
        self.source = ''
        self.organism = ''
        self.taxonomy = []
        self.references = []
        self.comment = ''
        self.features = []
        self.base_counts = ''
        self.origin = ''
        self.sequence = ''
        self.contig = ''
        self.primary = []
        self.wgs = ''
        self.wgs_scafld = []

    def __str__(self):
        """Provide a GenBank formatted output option for a Record.

        The objective of this is to provide an easy way to read in a GenBank
        record, modify it somehow, and then output it in 'GenBank format.'
        We are striving to make this work so that a parsed Record that is
        output using this function will look exactly like the original
        record.

        Much of the output is based on format description info at:

        ftp://ncbi.nlm.nih.gov/genbank/gbrel.txt
        """
        # the order of the pieces is the order of the lines in the output
        pieces = [
            self._locus_line(),
            self._definition_line(),
            self._accession_line(),
            self._version_line(),
            self._project_line(),
            self._dblink_line(),
            self._nid_line(),
            self._pid_line(),
            self._keywords_line(),
            self._db_source_line(),
            self._segment_line(),
            self._source_line(),
            self._organism_line(),
        ]
        pieces.extend(str(reference) for reference in self.references)
        pieces.append(self._comment_line())
        pieces.append(self._features_line())
        pieces.extend(str(feature) for feature in self.features)
        pieces.extend([
            self._base_count_line(),
            self._origin_line(),
            self._sequence_line(),
            self._wgs_line(),
            self._wgs_scafld_line(),
            self._contig_line(),
            "//",  # end-of-record marker
        ])
        return "".join(pieces)

    def _locus_line(self):
        """Provide the output string for the LOCUS line.
        """
        output = "LOCUS"
        output += " " * 7  # 6-12 spaces
        output += "%-9s" % self.locus
        output += " "  # 22 space
        output += "%7s" % self.size
        if "PROTEIN" in self.residue_type:
            output += " aa"
        else:
            output += " bp "

        # treat circular types differently, since they'll have long residue
        # types
        if "circular" in self.residue_type:
            output += "%17s" % self.residue_type
        # second case: ss-DNA types of records
        elif "-" in self.residue_type:
            output += "%7s" % self.residue_type
            output += " " * 10  # spaces for circular
        else:
            output += " " * 3  # spaces for stuff like ss-
            output += "%-4s" % self.residue_type
            output += " " * 10  # spaces for circular

        output += " " * 2
        output += "%3s" % self.data_file_division
        output += " " * 7  # spaces for 56-63
        output += "%11s" % self.date
        output += "\n"
        return output

    def _definition_line(self):
        """Provide output for the DEFINITION line.

        The line is always written; an empty definition shows up as ".".
        """
        output = Record.BASE_FORMAT % "DEFINITION"
        output += _wrapped_genbank(self.definition, Record.GB_BASE_INDENT)
        return output

    def _accession_line(self):
        """Output for the ACCESSION line.

        All accession numbers are written space separated; nothing is
        written when there are none.
        """
        if not self.accession:
            return ""
        acc_info = " ".join("%s" % accession for accession in self.accession)
        # no stray whitespace at the end of the line
        acc_info = acc_info.rstrip()
        output = Record.BASE_FORMAT % "ACCESSION"
        output += _wrapped_genbank(acc_info, Record.GB_BASE_INDENT)
        return output

    def _version_line(self):
        """Output for the VERSION line.

        The GI number is appended only when the record has one, so a
        record without a GI does not end in a dangling "GI:".
        """
        if not self.version:
            return ""
        output = Record.BASE_FORMAT % "VERSION"
        output += self.version
        if self.gi:
            output += " GI:"
            output += "%s" % self.gi
        output += "\n"
        return output

    def _project_line(self):
        """Output for the PROJECT line (space separated project numbers).
        """
        output = ""
        if len(self.projects) > 0:
            output = Record.BASE_FORMAT % "PROJECT"
            output += "%s\n" % " ".join(self.projects)
        return output

    def _dblink_line(self):
        """Output for the DBLINK line, one cross-reference per line.

        NOTE(review): __str__ calls this method but its body was not
        visible in the reviewed source; this writes each entry of
        self.dblinks on its own line, aligned under the first -- confirm
        against the parser that fills self.dblinks.
        """
        output = ""
        if len(self.dblinks) > 0:
            output = Record.BASE_FORMAT % "DBLINK"
            dblink_info = "\n".join("%s" % dblink for dblink in self.dblinks)
            output += _indent_genbank(dblink_info, Record.GB_BASE_INDENT)
        return output

    def _nid_line(self):
        """Output for the NID line. Use of NID is obsolete in GenBank files.
        """
        if not self.nid:
            return ""
        return Record.BASE_FORMAT % "NID" + "%s\n" % self.nid

    def _pid_line(self):
        """Output for PID line. Presumedly, PID usage is also obsolete.
        """
        if not self.pid:
            return ""
        return Record.BASE_FORMAT % "PID" + "%s\n" % self.pid

    def _keywords_line(self):
        """Output for the KEYWORDS line.

        The line is always written: keywords are separated by "; " and
        closed with a period, so a record without keywords gets ".".
        """
        keyword_info = "; ".join("%s" % keyword for keyword in self.keywords)
        keyword_info += "."
        output = Record.BASE_FORMAT % "KEYWORDS"
        output += _wrapped_genbank(keyword_info, Record.GB_BASE_INDENT)
        return output

    def _db_source_line(self):
        """Output for DBSOURCE line.
        """
        if not self.db_source:
            return ""
        return Record.BASE_FORMAT % "DBSOURCE" + "%s\n" % self.db_source

    def _segment_line(self):
        """Output for the SEGMENT line.
        """
        output = ""
        if self.segment:
            output += Record.BASE_FORMAT % "SEGMENT"
            output += _wrapped_genbank(self.segment, Record.GB_BASE_INDENT)
        return output

    def _source_line(self):
        """Output for SOURCE line on where the sample came from.

        The line is always written; an empty source shows up as ".".
        """
        output = Record.BASE_FORMAT % "SOURCE"
        output += _wrapped_genbank(self.source, Record.GB_BASE_INDENT)
        return output

    def _organism_line(self):
        """Output for ORGANISM line with taxonomy info.

        The organism name is followed by the taxonomy listing, written
        as "; " separated names closed with a period.
        """
        output = Record.INTERNAL_FORMAT % "ORGANISM"
        # Now that species names can be too long, this line can wrap (Bug 2591)
        output += _wrapped_genbank(self.organism, Record.GB_BASE_INDENT)
        # the taxonomy starts on its own line, under the organism name
        output += " " * Record.GB_BASE_INDENT
        taxonomy_info = "; ".join("%s" % tax for tax in self.taxonomy)
        taxonomy_info += "."
        output += _wrapped_genbank(taxonomy_info, Record.GB_BASE_INDENT)
        return output

    def _comment_line(self):
        """Output for the COMMENT lines.

        The comment keeps its own line breaks; it is only indented.
        """
        output = ""
        if self.comment:
            output += Record.BASE_FORMAT % "COMMENT"
            output += _indent_genbank(self.comment,
                                      Record.GB_BASE_INDENT)
        return output

    def _features_line(self):
        """Output for the FEATURES line.

        Only the header of the feature table is written here, and only
        when the record has features.
        """
        output = ""
        if len(self.features) > 0:
            output += Record.BASE_FEATURE_FORMAT % "FEATURES"
            output += "Location/Qualifiers\n"
        return output

    def _base_count_line(self):
        """Output for the BASE COUNT line with base information.
        """
        if not self.base_counts:
            return ""
        output = Record.BASE_FORMAT % "BASE COUNT "
        # split up the base counts into their individual parts
        count_parts = [part for part in self.base_counts.split(" ") if part]
        if len(count_parts) % 2 == 0:
            # deal with the standard case, with a normal count line
            # like: 474 a 356 c 428 g 364 t
            # even positions hold the counts, odd positions the base letters
            for count_info, count_type in zip(count_parts[0::2],
                                              count_parts[1::2]):
                output += "%7s %s" % (count_info, count_type)
        else:
            # deal with ugly lines like:
            # 1311257 a2224835 c2190093 g1309889 t
            # by just outputting the raw information
            output += self.base_counts
        output += "\n"
        return output

    def _origin_line(self):
        """Output for the ORIGIN line
        """
        output = ""
        # only output the ORIGIN line if we have a sequence
        if self.sequence:
            output += Record.BASE_FORMAT % "ORIGIN"
            if self.origin:
                output += _wrapped_genbank(self.origin,
                                           Record.GB_BASE_INDENT)
            else:
                output += "\n"
        return output

    def _sequence_line(self):
        """Output for all of the sequence.

        The sequence is written in lower case, 60 residues per line in
        blocks of 10, each line starting with the 1-based position of
        its first residue. No empty block (and so no trailing space) is
        written when the sequence ends exactly on a block boundary.
        """
        if not self.sequence:
            return ""
        seq_length = len(self.sequence)
        lines = []
        for line_start in range(0, seq_length, 60):
            line = Record.SEQUENCE_FORMAT % str(line_start + 1)
            # stop at the end of the sequence, so the last line only gets
            # the blocks that actually hold residues
            line_end = min(line_start + 60, seq_length)
            for block_start in range(line_start, line_end, 10):
                seq_section = self.sequence[block_start:block_start + 10]
                line += " %s" % seq_section.lower()
            lines.append(line + "\n")
        return "".join(lines)

    @staticmethod
    def _accession_range(value):
        """Return value as one newline-terminated line of text.

        A list or tuple (e.g. a first/last pair) is joined with "-";
        anything else is used as the text it already is.
        """
        if isinstance(value, (list, tuple)):
            text = "-".join("%s" % item for item in value)
        else:
            text = "%s" % value
        if not text.endswith("\n"):
            text += "\n"
        return text

    def _wgs_line(self):
        """Output for the WGS line.

        Nothing is written when the record has no WGS information.
        """
        output = ""
        if self.wgs:
            output += Record.BASE_FORMAT % "WGS"
            output += self._accession_range(self.wgs)
        return output

    def _wgs_scafld_line(self):
        """Output for the WGS_SCAFLD lines, one line per entry.

        Nothing is written when the record has no WGS_SCAFLD information.
        """
        if not self.wgs_scafld:
            return ""
        entries = self.wgs_scafld
        if not isinstance(entries, (list, tuple)):
            # a single piece of ready-made text rather than a list
            entries = [entries]
        output = ""
        for entry in entries:
            output += Record.BASE_FORMAT % "WGS_SCAFLD"
            output += self._accession_range(entry)
        return output

    def _contig_line(self):
        """Output for CONTIG location information from RefSeq.

        The location is wrapped on commas rather than on spaces.
        """
        output = ""
        if self.contig:
            output += Record.BASE_FORMAT % "CONTIG"
            output += _wrapped_genbank(self.contig,
                                       Record.GB_BASE_INDENT, split_char=',')
        return output
506 507
class Reference(object):
    """One literature reference of a GenBank record.

    Attributes:
    o number - Position of this reference in the record's reference list.
    o bases - The part of the sequence this reference is about.
    o authors - All of the authors, as one string.
    o consrtm - The consortium behind the reference.
    o title - Title of the referenced work.
    o journal - Where the referenced work appeared.
    o medline_id - Medline identifier of the reference.
    o pubmed_id - PubMed identifier of the reference.
    o remark - Any free text remark on the reference.
    """

    def __init__(self):
        """Start with every field blank."""
        # where the reference sits and what it covers
        self.number = self.bases = ''
        # bibliographic details
        self.authors = self.consrtm = self.title = self.journal = ''
        # database identifiers
        self.medline_id = self.pubmed_id = ''
        self.remark = ''

    def __str__(self):
        """Return the reference as GenBank text; blank fields are left out."""
        return "".join([
            self._reference_line(),
            self._authors_line(),
            self._consrtm_line(),
            self._title_line(),
            self._journal_line(),
            self._medline_line(),
            self._pubmed_line(),
            self._remark_line(),
        ])

    def _wrapped_entry(self, keyword, text):
        """Build one wrapped sub-keyword entry, or "" when text is blank."""
        if not text:
            return ""
        heading = Record.INTERNAL_FORMAT % keyword
        return heading + _wrapped_genbank(text, Record.GB_BASE_INDENT)

    def _reference_line(self):
        """Build the REFERENCE line; it is written even without a number.
        """
        heading = Record.BASE_FORMAT % "REFERENCE"
        if not self.number:
            return heading + "\n"
        if self.bases:
            # the number is padded so the base range lines up behind it
            return heading + "%-3s" % self.number + "%s" % self.bases + "\n"
        return heading + "%s" % self.number + "\n"

    def _authors_line(self):
        """Build the AUTHORS entry.
        """
        return self._wrapped_entry("AUTHORS", self.authors)

    def _consrtm_line(self):
        """Build the CONSRTM entry.
        """
        return self._wrapped_entry("CONSRTM", self.consrtm)

    def _title_line(self):
        """Build the TITLE entry.
        """
        return self._wrapped_entry("TITLE", self.title)

    def _journal_line(self):
        """Build the JOURNAL entry.
        """
        return self._wrapped_entry("JOURNAL", self.journal)

    def _medline_line(self):
        """Build the MEDLINE entry (a single unwrapped line).
        """
        if not self.medline_id:
            return ""
        return Record.INTERNAL_FORMAT % "MEDLINE" + (self.medline_id + "\n")

    def _pubmed_line(self):
        """Build the PUBMED entry (a single unwrapped line).
        """
        if not self.pubmed_id:
            return ""
        # PUBMED is indented one column further than the other sub-keywords
        return Record.OTHER_INTERNAL_FORMAT % "PUBMED" + (self.pubmed_id + "\n")

    def _remark_line(self):
        """Build the REMARK entry.
        """
        return self._wrapped_entry("REMARK", self.remark)
621 622
class Feature(object):
    """One entry of the Feature Table of a GenBank record.

    Attributes:
    o key - The key name of the feature (ie. source)
    o location - The string specifying the location of the feature.
    o qualifiers - A list of the Qualifier objects of the feature.
    """

    def __init__(self):
        """Start with a blank key and location and no qualifiers."""
        self.key = ''
        self.location = ''
        self.qualifiers = []

    def __str__(self):
        """Return the feature as GenBank text.

        The key and its location (wrapped on commas) come first,
        followed by one wrapped entry per qualifier.
        """
        feature_indent = Record.GB_FEATURE_INDENT
        pieces = [
            Record.INTERNAL_FEATURE_FORMAT % self.key,
            _wrapped_genbank(self.location, feature_indent, split_char=','),
        ]
        padding = " " * feature_indent
        for qual in self.qualifiers:
            # qualifiers whose key contains one of these markers are cut
            # into fixed-width chunks instead of being wrapped on spaces
            no_space_keys = Bio.GenBank._BaseGenBankConsumer.remove_space_keys
            if any(marker in qual.key for marker in no_space_keys):
                wrap_on_spaces = 0
            else:
                wrap_on_spaces = 1
            pieces.append(padding)
            pieces.append(_wrapped_genbank(qual.key + qual.value,
                                           feature_indent, wrap_on_spaces))
        return "".join(pieces)
653 654
class Qualifier(object):
    """A single qualifier attached to a GenBank feature.

    Attributes:
    o key - The key name of the qualifier (ie. /organism=)
    o value - The value of the qualifier ("Dictyostelium discoideum").
    """

    def __init__(self):
        """Start with a blank key and a blank value."""
        self.key, self.value = '', ''
665