Package Bio :: Package GenBank :: Module Record
[hide private]
[frames | no frames]

Source Code for Module Bio.GenBank.Record

  1  """Hold GenBank data in a straightforward format. 
  2   
  3  classes: 
  4  o Record - All of the information in a GenBank record. 
  5  o Reference - hold reference data for a record. 
  6  o Feature - Hold the information in a Feature Table. 
  7  o Qualifier - Qualifiers on a Feature. 
  8  17-MAR-2009: added support for WGS and WGS_SCAFLD lines.  Ying Huang & Iddo Friedberg 
  9  """ 
 10  # local stuff 
 11  import Bio.GenBank 
 12   
 13   
14 -def _wrapped_genbank(information, indent, wrap_space = 1, split_char = " "):
15 """Write a line of GenBank info that can wrap over multiple lines. 16 17 This takes a line of information which can potentially wrap over 18 multiple lines, and breaks it up with carriage returns and 19 indentation so it fits properly into a GenBank record. 20 21 Arguments: 22 23 o information - The string holding the information we want 24 wrapped in GenBank method. 25 26 o indent - The indentation on the lines we are writing. 27 28 o wrap_space - Whether or not to wrap only on spaces in the 29 information. 30 31 o split_char - A specific character to split the lines on. By default 32 spaces are used. 33 """ 34 info_length = Record.GB_LINE_LENGTH - indent 35 36 if not information: 37 #GenBank files use "." for missing data 38 return ".\n" 39 40 if wrap_space: 41 info_parts = information.split(split_char) 42 else: 43 cur_pos = 0 44 info_parts = [] 45 while cur_pos < len(information): 46 info_parts.append(information[cur_pos: cur_pos + info_length]) 47 cur_pos += info_length 48 49 # first get the information string split up by line 50 output_parts = [] 51 cur_part = "" 52 for info_part in info_parts: 53 if len(cur_part) + 1 + len(info_part) > info_length: 54 if cur_part: 55 if split_char != " ": 56 cur_part += split_char 57 output_parts.append(cur_part) 58 cur_part = info_part 59 else: 60 if cur_part == "": 61 cur_part = info_part 62 else: 63 cur_part += split_char + info_part 64 65 # add the last bit of information to the output 66 if cur_part: 67 output_parts.append(cur_part) 68 69 # now format the information string for return 70 output_info = output_parts[0] + "\n" 71 for output_part in output_parts[1:]: 72 output_info += " " * indent + output_part + "\n" 73 74 return output_info
75 76
77 -def _indent_genbank(information, indent):
78 """Write out information with the specified indent. 79 80 Unlike _wrapped_genbank, this function makes no attempt to wrap 81 lines -- it assumes that the information already has newlines in the 82 appropriate places, and will add the specified indent to the start of 83 each line. 84 """ 85 # split the info into lines based on line breaks 86 info_parts = information.split("\n") 87 88 # the first line will have no indent 89 output_info = info_parts[0] + "\n" 90 for info_part in info_parts[1:]: 91 output_info += " " * indent + info_part + "\n" 92 93 return output_info
94 95
class Record(object):
    """Hold GenBank information in a format similar to the original record.

    The Record class is meant to make data easy to get to when you are
    just interested in looking at GenBank data.

    Attributes:
    o locus - The name specified after the LOCUS keyword in the GenBank
    record. This may be the accession number, or a clone id or something else.
    o size - The size of the record.
    o residue_type - The type of residues making up the sequence in this
    record. Normally something like RNA, DNA or PROTEIN, but may be as
    esoteric as 'ss-RNA circular'.
    o data_file_division - The division this record is stored under in
    GenBank (ie. PLN -> plants; PRI -> humans, primates; BCT -> bacteria...)
    o date - The date of submission of the record, in a form like '28-JUL-1998'
    o definition - The text written after the DEFINITION keyword.
    o accession - list of all accession numbers for the sequence.
    o nid - Nucleotide identifier number.
    o pid - Protein identifier number.
    o version - The accession number + version (ie. AB01234.2)
    o db_source - Information about the database the record came from
    o gi - The NCBI gi identifier for the record.
    o keywords - A list of keywords related to the record.
    o segment - If the record is one of a series, this is info about which
    segment this record is (something like '1 of 6').
    o source - The source of material where the sequence came from.
    o organism - The genus and species of the organism (ie. 'Homo sapiens')
    o taxonomy - A listing of the taxonomic classification of the organism,
    starting general and getting more specific.
    o references - A list of Reference objects.
    o comment - Text with any kind of comment about the record.
    o features - A listing of Features making up the feature table.
    o base_counts - A string with the counts of bases for the sequence.
    o origin - A string specifying info about the origin of the sequence.
    o sequence - A string with the sequence itself.
    o contig - A string of location information for a CONTIG in a RefSeq file
    o projects - The genome sequencing project numbers
    (superseded by the dblinks cross-references).
    o dblinks - The genome sequencing project number(s) and other links
    (supersedes the projects information).
    o wgs - Information written after the WGS keyword.
    o wgs_scafld - Information written after the WGS_SCAFLD keyword.
    o primary - A list; it is not used by the output code in this class.
    """
    # column layout used when writing GenBank records
    GB_LINE_LENGTH = 79
    GB_BASE_INDENT = 12
    GB_FEATURE_INDENT = 21
    GB_INTERNAL_INDENT = 2
    GB_OTHER_INTERNAL_INDENT = 3
    GB_FEATURE_INTERNAL_INDENT = 5
    GB_SEQUENCE_INDENT = 9

    # %-format templates derived from the layout above; each one pads a
    # keyword so that the text after it starts in the right column
    BASE_FORMAT = "%%-%ds" % GB_BASE_INDENT
    INTERNAL_FORMAT = "%s%%-%ds" % (
        " " * GB_INTERNAL_INDENT,
        GB_BASE_INDENT - GB_INTERNAL_INDENT)
    OTHER_INTERNAL_FORMAT = "%s%%-%ds" % (
        " " * GB_OTHER_INTERNAL_INDENT,
        GB_BASE_INDENT - GB_OTHER_INTERNAL_INDENT)

    BASE_FEATURE_FORMAT = "%%-%ds" % GB_FEATURE_INDENT
    INTERNAL_FEATURE_FORMAT = "%s%%-%ds" % (
        " " * GB_FEATURE_INTERNAL_INDENT,
        GB_FEATURE_INDENT - GB_FEATURE_INTERNAL_INDENT)
    # sequence position numbers are right-aligned
    SEQUENCE_FORMAT = "%%%ds" % GB_SEQUENCE_INDENT

159 - def __init__(self):
160 self.locus = '' 161 self.size = '' 162 self.residue_type = '' 163 self.data_file_division = '' 164 self.date = '' 165 self.definition = '' 166 self.accession = [] 167 self.nid = '' 168 self.pid = '' 169 self.version = '' 170 self.projects = [] 171 self.dblinks = [] 172 self.db_source = '' 173 self.gi = '' 174 self.keywords = [] 175 self.segment = '' 176 self.source = '' 177 self.organism = '' 178 self.taxonomy = [] 179 self.references = [] 180 self.comment = '' 181 self.features = [] 182 self.base_counts = '' 183 self.origin = '' 184 self.sequence = '' 185 self.contig = '' 186 self.primary=[] 187 self.wgs = '' 188 self.wgs_scafld = []
189
190 - def __str__(self):
191 """Provide a GenBank formatted output option for a Record. 192 193 The objective of this is to provide an easy way to read in a GenBank 194 record, modify it somehow, and then output it in 'GenBank format.' 195 We are striving to make this work so that a parsed Record that is 196 output using this function will look exactly like the original 197 record. 198 199 Much of the output is based on format description info at: 200 201 ftp://ncbi.nlm.nih.gov/genbank/gbrel.txt 202 """ 203 output = self._locus_line() 204 output += self._definition_line() 205 output += self._accession_line() 206 output += self._version_line() 207 output += self._project_line() 208 output += self._dblink_line() 209 output += self._nid_line() 210 output += self._pid_line() 211 output += self._keywords_line() 212 output += self._db_source_line() 213 output += self._segment_line() 214 output += self._source_line() 215 output += self._organism_line() 216 for reference in self.references: 217 output += str(reference) 218 output += self._comment_line() 219 output += self._features_line() 220 for feature in self.features: 221 output += str(feature) 222 output += self._base_count_line() 223 output += self._origin_line() 224 output += self._sequence_line() 225 output += self._wgs_line() 226 output += self._wgs_scafld_line() 227 output += self._contig_line() 228 output += "//" 229 return output
230
231 - def _locus_line(self):
232 """Provide the output string for the LOCUS line. 233 """ 234 output = "LOCUS" 235 output += " " * 7 # 6-12 spaces 236 output += "%-9s" % self.locus 237 output += " " # 22 space 238 output += "%7s" % self.size 239 if "PROTEIN" in self.residue_type: 240 output += " aa" 241 else: 242 output += " bp " 243 244 # treat circular types differently, since they'll have long residue 245 # types 246 if "circular" in self.residue_type: 247 output += "%17s" % self.residue_type 248 # second case: ss-DNA types of records 249 elif "-" in self.residue_type: 250 output += "%7s" % self.residue_type 251 output += " " * 10 # spaces for circular 252 else: 253 output += " " * 3 # spaces for stuff like ss- 254 output += "%-4s" % self.residue_type 255 output += " " * 10 # spaces for circular 256 257 output += " " * 2 258 output += "%3s" % self.data_file_division 259 output += " " * 7 # spaces for 56-63 260 output += "%11s" % self.date 261 output += "\n" 262 return output
263
264 - def _definition_line(self):
265 """Provide output for the DEFINITION line. 266 """ 267 output = Record.BASE_FORMAT % "DEFINITION" 268 output += _wrapped_genbank(self.definition, Record.GB_BASE_INDENT) 269 return output
270
271 - def _accession_line(self):
272 """Output for the ACCESSION line. 273 """ 274 if self.accession: 275 output = Record.BASE_FORMAT % "ACCESSION" 276 277 acc_info = "" 278 for accession in self.accession: 279 acc_info += "%s " % accession 280 # strip off an extra space at the end 281 acc_info = acc_info.rstrip() 282 output += _wrapped_genbank(acc_info, Record.GB_BASE_INDENT) 283 else: 284 output = "" 285 286 return output
287
288 - def _version_line(self):
289 """Output for the VERSION line. 290 """ 291 if self.version: 292 output = Record.BASE_FORMAT % "VERSION" 293 output += self.version 294 output += " GI:" 295 output += "%s\n" % self.gi 296 else: 297 output = "" 298 return output
299
300 - def _project_line(self):
301 output = "" 302 if len(self.projects) > 0: 303 output = Record.BASE_FORMAT % "PROJECT" 304 output += "%s\n" % " ".join(self.projects) 305 return output
306 314
315 - def _nid_line(self):
316 """Output for the NID line. Use of NID is obsolete in GenBank files. 317 """ 318 if self.nid: 319 output = Record.BASE_FORMAT % "NID" 320 output += "%s\n" % self.nid 321 else: 322 output = "" 323 return output
324
325 - def _pid_line(self):
326 """Output for PID line. Presumedly, PID usage is also obsolete. 327 """ 328 if self.pid: 329 output = Record.BASE_FORMAT % "PID" 330 output += "%s\n" % self.pid 331 else: 332 output = "" 333 return output
334
335 - def _keywords_line(self):
336 """Output for the KEYWORDS line. 337 """ 338 output = "" 339 if len(self.keywords) >= 0: 340 output += Record.BASE_FORMAT % "KEYWORDS" 341 keyword_info = "" 342 for keyword in self.keywords: 343 keyword_info += "%s; " % keyword 344 # replace the ; at the end with a period 345 keyword_info = keyword_info[:-2] 346 keyword_info += "." 347 348 output += _wrapped_genbank(keyword_info, 349 Record.GB_BASE_INDENT) 350 351 return output
352
353 - def _db_source_line(self):
354 """Output for DBSOURCE line. 355 """ 356 if self.db_source: 357 output = Record.BASE_FORMAT % "DBSOURCE" 358 output += "%s\n" % self.db_source 359 else: 360 output = "" 361 return output
362
363 - def _segment_line(self):
364 """Output for the SEGMENT line. 365 """ 366 output = "" 367 if self.segment: 368 output += Record.BASE_FORMAT % "SEGMENT" 369 output += _wrapped_genbank(self.segment, Record.GB_BASE_INDENT) 370 return output
371
372 - def _source_line(self):
373 """Output for SOURCE line on where the sample came from. 374 """ 375 output = Record.BASE_FORMAT % "SOURCE" 376 output += _wrapped_genbank(self.source, Record.GB_BASE_INDENT) 377 return output
378
379 - def _organism_line(self):
380 """Output for ORGANISM line with taxonomy info. 381 """ 382 output = Record.INTERNAL_FORMAT % "ORGANISM" 383 # Now that species names can be too long, this line can wrap (Bug 2591) 384 output += _wrapped_genbank(self.organism, Record.GB_BASE_INDENT) 385 output += " " * Record.GB_BASE_INDENT 386 taxonomy_info = "" 387 for tax in self.taxonomy: 388 taxonomy_info += "%s; " % tax 389 # replace the ; at the end with a period 390 taxonomy_info = taxonomy_info[:-2] 391 taxonomy_info += "." 392 output += _wrapped_genbank(taxonomy_info, Record.GB_BASE_INDENT) 393 394 return output
395
396 - def _comment_line(self):
397 """Output for the COMMENT lines. 398 """ 399 output = "" 400 if self.comment: 401 output += Record.BASE_FORMAT % "COMMENT" 402 output += _indent_genbank(self.comment, 403 Record.GB_BASE_INDENT) 404 return output
405
406 - def _features_line(self):
407 """Output for the FEATURES line. 408 """ 409 output = "" 410 if len(self.features) > 0: 411 output += Record.BASE_FEATURE_FORMAT % "FEATURES" 412 output += "Location/Qualifiers\n" 413 return output
414
415 - def _base_count_line(self):
416 """Output for the BASE COUNT line with base information. 417 """ 418 output = "" 419 if self.base_counts: 420 output += Record.BASE_FORMAT % "BASE COUNT " 421 # split up the base counts into their individual parts 422 count_parts = self.base_counts.split(" ") 423 while '' in count_parts: 424 count_parts.remove('') 425 # deal with the standard case, with a normal origin line 426 # like: 474 a 356 c 428 g 364 t 427 if len(count_parts) % 2 == 0: 428 while len(count_parts) > 0: 429 count_info = count_parts.pop(0) 430 count_type = count_parts.pop(0) 431 432 output += "%7s %s" % (count_info, count_type) 433 # deal with ugly ORIGIN lines like: 434 # 1311257 a2224835 c2190093 g1309889 t 435 # by just outputting the raw information 436 else: 437 output += self.base_counts 438 output += "\n" 439 return output
440
441 - def _origin_line(self):
442 """Output for the ORIGIN line 443 """ 444 output = "" 445 # only output the ORIGIN line if we have a sequence 446 if self.sequence: 447 output += Record.BASE_FORMAT % "ORIGIN" 448 if self.origin: 449 output += _wrapped_genbank(self.origin, 450 Record.GB_BASE_INDENT) 451 else: 452 output += "\n" 453 return output
454
455 - def _sequence_line(self):
456 """Output for all of the sequence. 457 """ 458 output = "" 459 if self.sequence: 460 cur_seq_pos = 0 461 while cur_seq_pos < len(self.sequence): 462 output += Record.SEQUENCE_FORMAT % str(cur_seq_pos + 1) 463 464 for section in range(6): 465 start_pos = cur_seq_pos + section * 10 466 end_pos = start_pos + 10 467 seq_section = self.sequence[start_pos:end_pos] 468 output += " %s" % seq_section.lower() 469 470 # stop looping if we are out of sequence 471 if end_pos > len(self.sequence): 472 break 473 474 output += "\n" 475 cur_seq_pos += 60 476 return output
477
478 - def _wgs_line(self):
479 output = "" 480 if self.wgs: 481 output += Record.BASE_FORMAT % "WGS" 482 output += self.wgs 483 return output
484
485 - def _wgs_scafld_line(self):
486 output = "" 487 if self.wgs_scafld: 488 output += Record.BASE_FORMAT % "WGS_SCAFLD" 489 output += self.wgs_scafld 490 return output
491
492 - def _contig_line(self):
493 """Output for CONTIG location information from RefSeq. 494 """ 495 output = "" 496 if self.contig: 497 output += Record.BASE_FORMAT % "CONTIG" 498 output += _wrapped_genbank(self.contig, 499 Record.GB_BASE_INDENT, split_char = ',') 500 return output
501 502
class Reference(object):
    """Hold information from a GenBank reference.

    Attributes:
    o number - The number of the reference in the listing of references.
    o bases - The bases in the sequence the reference refers to.
    o authors - String with all of the authors.
    o consrtm - Consortium the authors belong to.
    o title - The title of the reference.
    o journal - Information about the journal where the reference appeared.
    o medline_id - The medline id for the reference.
    o pubmed_id - The pubmed_id for the reference.
    o remark - Free-form remarks about the reference.
    """

    def __init__(self):
        """Create an empty reference; every field starts as ''."""
        self.number = ""
        self.bases = ""
        self.authors = ""
        self.consrtm = ""
        self.title = ""
        self.journal = ""
        self.medline_id = ""
        self.pubmed_id = ""
        self.remark = ""

    def __str__(self):
        """Return the reference as GenBank formatted text."""
        sections = (
            self._reference_line,
            self._authors_line,
            self._consrtm_line,
            self._title_line,
            self._journal_line,
            self._medline_line,
            self._pubmed_line,
            self._remark_line,
        )
        return "".join(section() for section in sections)

    def _wrapped_field(self, keyword, value):
        """Return a wrapped sub-keyword entry, or '' when value is empty."""
        if not value:
            return ""
        return (Record.INTERNAL_FORMAT % keyword
                + _wrapped_genbank(value, Record.GB_BASE_INDENT))

    def _reference_line(self):
        """Output for REFERENCE lines.

        The keyword is always written. The bases are only written when a
        reference number is present as well.
        """
        pieces = [Record.BASE_FORMAT % "REFERENCE"]
        if self.number and self.bases:
            pieces.append("%-3s%s" % (self.number, self.bases))
        elif self.number:
            pieces.append("%s" % self.number)
        pieces.append("\n")
        return "".join(pieces)

    def _authors_line(self):
        """Output for AUTHORS information."""
        return self._wrapped_field("AUTHORS", self.authors)

    def _consrtm_line(self):
        """Output for CONSRTM information."""
        return self._wrapped_field("CONSRTM", self.consrtm)

    def _title_line(self):
        """Output for TITLE information."""
        return self._wrapped_field("TITLE", self.title)

    def _journal_line(self):
        """Output for JOURNAL information."""
        return self._wrapped_field("JOURNAL", self.journal)

    def _medline_line(self):
        """Output for MEDLINE information ('' when there is no id)."""
        if not self.medline_id:
            return ""
        return Record.INTERNAL_FORMAT % "MEDLINE" + self.medline_id + "\n"

    def _pubmed_line(self):
        """Output for PUBMED information ('' when there is no id).

        PUBMED uses the other, one column deeper, internal indent.
        """
        if not self.pubmed_id:
            return ""
        return Record.OTHER_INTERNAL_FORMAT % "PUBMED" + self.pubmed_id + "\n"

    def _remark_line(self):
        """Output for REMARK information."""
        return self._wrapped_field("REMARK", self.remark)

616 617
class Feature(object):
    """Hold information about a Feature in the Feature Table of GenBank record.

    Attributes:
    o key - The key name of the feature (ie. source)
    o location - The string specifying the location of the feature.
    o qualifiers - A listing of Qualifier objects in the feature.
    """

    def __init__(self):
        """Create an empty feature with no qualifiers."""
        self.key = ""
        self.location = ""
        self.qualifiers = []

    def __str__(self):
        """Return the feature and its qualifiers as GenBank formatted text.

        The location is wrapped at commas. Each qualifier goes on its own
        line at the feature indent and is wrapped on spaces, unless its key
        contains one of the parser's remove_space_keys, in which case it is
        cut into fixed-width pieces instead.
        """
        feature_indent = Record.GB_FEATURE_INDENT
        pieces = [
            Record.INTERNAL_FEATURE_FORMAT % self.key,
            _wrapped_genbank(self.location, feature_indent, split_char=','),
        ]
        for qualifier in self.qualifiers:
            pieces.append(" " * feature_indent)

            # determine whether we can wrap on spaces
            no_space_keys = Bio.GenBank._BaseGenBankConsumer.remove_space_keys
            if any(no_space_key in qualifier.key
                   for no_space_key in no_space_keys):
                space_wrap = 0
            else:
                space_wrap = 1

            pieces.append(_wrapped_genbank(qualifier.key + qualifier.value,
                                           feature_indent, space_wrap))
        return "".join(pieces)

648 649
class Qualifier(object):
    """Hold information about a qualifier in a GenBank feature.

    Attributes:
    o key - The key name of the qualifier (ie. /organism=)
    o value - The value of the qualifier ("Dictyostelium discoideum").
    """

    def __init__(self):
        """Create an empty qualifier; key and value both start as ''."""
        self.key = ""
        self.value = ""
660