Package Bio :: Package UniGene
[hide private]
[frames | no frames]

Source Code for Package Bio.UniGene

  1  # Copyright 2006 by Sean Davis.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  # 
  6  # $Id: __init__.py,v 1.12 2009-04-24 12:03:45 mdehoon Exp $ 
  7  # Sean Davis <sdavis2 at mail dot nih dot gov> 
  8  # National Cancer Institute 
  9  # National Institutes of Health 
 10  # Bethesda, MD, USA 
 11  # 
 12   
 13  """Parse UniGene flat file format files such as the Hs.data file. 
 14   
 15  Here is an overview of the flat file format that this parser deals with: 
 16   
 17     Line types/qualifiers:: 
 18   
 19         ID           UniGene cluster ID 
 20         TITLE        Title for the cluster 
 21         GENE         Gene symbol 
 22         CYTOBAND     Cytological band 
 23         EXPRESS      Tissues of origin for ESTs in cluster 
 24         RESTR_EXPR   Single tissue or development stage contributes 
 25                      more than half the total EST frequency for this gene. 
 26         GNM_TERMINUS genomic confirmation of presence of a 3' terminus; 
 27                      T if a non-templated polyA tail is found among 
 28                      a cluster's sequences; else 
 29                      I if templated As are found in genomic sequence or 
 30                      S if a canonical polyA signal is found on 
 31                        the genomic sequence 
 32         GENE_ID      Entrez gene identifier associated with at least one 
 33                      sequence in this cluster; 
 34                      to be used instead of LocusLink. 
 35         LOCUSLINK    LocusLink identifier associated with at least one 
 36                      sequence in this cluster; 
 37                      deprecated in favor of GENE_ID 
 38         HOMOL        Homology; 
 39         CHROMOSOME   Chromosome.  For plants, CHROMOSOME refers to mapping 
 40                      on the Arabidopsis genome. 
 41         STS          STS 
 42              ACC=         GenBank/EMBL/DDBJ accession number of STS 
 43                           [optional field] 
 44              UNISTS=      identifier in NCBI's UNISTS database 
 45         TXMAP        Transcript map interval 
 46              MARKER=      Marker found on at least one sequence in this 
 47                           cluster 
 48              RHPANEL=     Radiation Hybrid panel used to place marker 
 49         PROTSIM      Protein Similarity data for the sequence with 
 50                      highest-scoring protein similarity in this cluster 
 51              ORG=         Organism 
 52              PROTGI=      Sequence GI of protein 
 53              PROTID=      Sequence ID of protein 
 54              PCT=         Percent alignment 
 55              ALN=         length of aligned region (aa) 
 56         SCOUNT       Number of sequences in the cluster 
 57         SEQUENCE     Sequence 
 58              ACC=         GenBank/EMBL/DDBJ accession number of sequence 
 59              NID=         Unique nucleotide sequence identifier (gi) 
 60              PID=         Unique protein sequence identifier (used for 
 61                           non-ESTs) 
 62              CLONE=       Clone identifier (used for ESTs only) 
 63              END=         End (5'/3') of clone insert read (used for 
 64                           ESTs only) 
 65              LID=         Library ID; see Hs.lib.info for library name 
 66                           and tissue 
 67              MGC=         5' CDS-completeness indicator; if present, the 
 68                           clone associated with this sequence is believed 
 69                           CDS-complete. A value greater than 511 is the gi 
 70                           of the CDS-complete mRNA matched by the EST, 
 71                           otherwise the value is an indicator of the 
 72                           reliability of the test indicating CDS 
 73                           completeness; higher values indicate more 
 74                           reliable CDS-completeness predictions. 
 75             SEQTYPE=      Description of the nucleotide sequence. 
 76                           Possible values are mRNA, EST and HTC. 
 77             TRACE=        The Trace ID of the EST sequence, as provided by 
 78                           NCBI Trace Archive 
 79  """ 
 80   
 81  __docformat__ = "restructuredtext en" 
 82   
 83   
class SequenceLine(object):
    """Store the information for one SEQUENCE line from a UniGene file.

    Initialize with the text part of the SEQUENCE line, or nothing.

    Attributes and descriptions (access as LOWER CASE):

     - ACC=      GenBank/EMBL/DDBJ accession number of sequence
     - NID=      Unique nucleotide sequence identifier (gi)
     - PID=      Unique protein sequence identifier (used for non-ESTs)
     - CLONE=    Clone identifier (used for ESTs only)
     - END=      End (5'/3') of clone insert read (used for ESTs only)
     - LID=      Library ID; see Hs.lib.info for library name and tissue
     - MGC=      5' CDS-completeness indicator; if present,
                 the clone associated with this sequence
                 is believed CDS-complete. A value greater than 511
                 is the gi of the CDS-complete mRNA matched by the EST,
                 otherwise the value is an indicator of the reliability
                 of the test indicating CDS completeness;
                 higher values indicate more reliable CDS-completeness
                 predictions.
     - SEQTYPE=  Description of the nucleotide sequence. Possible values
                 are mRNA, EST and HTC.
     - TRACE=    The Trace ID of the EST sequence, as provided by NCBI
                 Trace Archive

    Additional attributes set by this class:

     - text      The unparsed text given to the constructor
                 ('' when no text was given).
     - is_image  True when the CLONE value starts with 'IMAGE'.
     - image     The CLONE value with its first six characters removed
                 (only set when is_image is True, otherwise '').
    """

    def __init__(self, text=None):
        """Create an empty entry, or parse ``text`` if it is given."""
        self.acc = ''
        self.nid = ''
        self.lid = ''
        self.pid = ''
        self.clone = ''
        self.image = ''
        self.is_image = False
        self.end = ''
        self.mgc = ''
        self.seqtype = ''
        self.trace = ''
        # Always define ``text`` so that __repr__ also works for an
        # instance created without any text.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Set one lower-case attribute per 'KEY=value' field in ``text``.

        Fields are separated by '; '.  Raises ValueError if a field
        contains no '='.
        """
        for part in text.split('; '):
            # Split on the first '=' only, so that a value which itself
            # contains '=' is kept intact instead of failing to unpack.
            key, val = part.split("=", 1)
            if key == 'CLONE' and val[:5] == 'IMAGE':
                self.is_image = True
                # Drop the first six characters (e.g. 'IMAGE:').
                self.image = val[6:]
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return the original, unparsed text of the line."""
        return self.text
class ProtsimLine(object):
    """Store the information for one PROTSIM line from a UniGene file.

    Initialize with the text part of the PROTSIM line, or nothing.

    Attributes and descriptions (access as LOWER CASE):

     - ORG=     Organism
     - PROTGI=  Sequence GI of protein
     - PROTID=  Sequence ID of protein
     - PCT=     Percent alignment
     - ALN=     length of aligned region (aa)

    The unparsed text is kept in the ``text`` attribute ('' when no
    text was given).  All values are stored as strings.
    """

    def __init__(self, text=None):
        """Create an empty entry, or parse ``text`` if it is given."""
        self.org = ''
        self.protgi = ''
        self.protid = ''
        self.pct = ''
        self.aln = ''
        # Always define ``text`` so that __repr__ also works for an
        # instance created without any text.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Set one lower-case attribute per 'KEY=value' field in ``text``.

        Fields are separated by '; '.  Raises ValueError if a field
        contains no '='.
        """
        for part in text.split('; '):
            # Split on the first '=' only, so that a value which itself
            # contains '=' is kept intact instead of failing to unpack.
            key, val = part.split("=", 1)
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return the original, unparsed text of the line."""
        return self.text
class STSLine(object):
    """Store the information for one STS line from a UniGene file.

    Initialize with the text part of the STS line, or nothing.

    Attributes and descriptions (access as LOWER CASE):

     - ACC=     GenBank/EMBL/DDBJ accession number of STS [optional field]
     - UNISTS=  identifier in NCBI's UNISTS database

    The unparsed text is kept in the ``text`` attribute ('' when no
    text was given).
    """

    def __init__(self, text=None):
        """Create an empty entry, or parse ``text`` if it is given."""
        self.acc = ''
        self.unists = ''
        # Always define ``text`` so that __repr__ also works for an
        # instance created without any text.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Set one lower-case attribute per 'KEY=value' field in ``text``.

        Fields are separated by whitespace.  Raises ValueError if a field
        contains no '='.
        """
        # split() without an argument collapses runs of whitespace, so
        # repeated or trailing spaces do not produce empty fields.
        for part in text.split():
            # Split on the first '=' only, so that a value which itself
            # contains '=' is kept intact instead of failing to unpack.
            key, val = part.split("=", 1)
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return the original, unparsed text of the line."""
        return self.text
class Record(object):
    """Store a UniGene record.

    A new record starts out with these attributes::

        ID            ''   ID line
        species       ''   Hs, Bt, etc.
        title         ''   TITLE line
        symbol        ''   GENE line
        cytoband      ''   CYTOBAND line
        express       []   EXPRESS line, as a list of strings
        restr_expr    ''   RESTR_EXPR line
        gnm_terminus  ''   GNM_TERMINUS line
        gene_id       ''   GENE_ID line
        locuslink     ''   LOCUSLINK line
        homol         ''   HOMOL line
        chromosome    ''   CHROMOSOME line
        protsim       []   PROTSIM entries (ProtsimLine objects)
        sequence      []   SEQUENCE entries (SequenceLine objects)
        sts           []   STS entries (STSLine objects)
        txmap         []   TXMAP entries

    Note that the parser in this module replaces ``restr_expr`` with a
    list of strings and ``homol`` with a bool when it meets those lines.
    """

    def __init__(self):
        """Create a record with every field empty."""
        # (attribute name, factory for its empty value), in the order in
        # which the attributes are created; str() gives '' and list()
        # gives a fresh [] for every instance.
        defaults = (
            ("ID", str),            # ID line
            ("species", str),       # Hs, Bt, etc.
            ("title", str),         # TITLE line
            ("symbol", str),        # GENE line
            ("cytoband", str),      # CYTOBAND line
            ("express", list),      # EXPRESS line
            ("restr_expr", str),    # RESTR_EXPR line
            ("gnm_terminus", str),  # GNM_TERMINUS line
            ("gene_id", str),       # GENE_ID line
            ("locuslink", str),     # LOCUSLINK line
            ("homol", str),         # HOMOL line
            ("chromosome", str),    # CHROMOSOME line
            ("protsim", list),      # PROTSIM entries
            ("sequence", list),     # SEQUENCE entries
            ("sts", list),          # STS entries
            ("txmap", list),        # TXMAP entries
        )
        for attribute, make_empty in defaults:
            setattr(self, attribute, make_empty())

    def __repr__(self):
        """Return '<ClassName> ID symbol' followed by the title on a new line."""
        class_name = self.__class__.__name__
        first_line = "<%s> %s %s" % (class_name, self.ID, self.symbol)
        return "%s\n%s" % (first_line, self.title)
def parse(handle):
    """Iterate over the UniGene records found in ``handle``.

    Records are read one at a time with ``_read`` and yielded as they
    are completed; iteration stops when ``_read`` returns no record.
    """
    current = _read(handle)
    while current:
        yield current
        current = _read(handle)
def read(handle):
    """Read exactly one UniGene record from ``handle`` and return it.

    Raises ValueError if the handle contains no record, or if any data
    is left in the handle after the first record.
    """
    record = _read(handle)
    if not record:
        raise ValueError("No UniGene record found")
    # We should have reached the end of the record by now
    remainder = handle.read()
    if remainder:
        raise ValueError("More than one UniGene record found")
    return record
# Everything below is private
def _read(handle):
    """Read the next UniGene record from ``handle``.

    Lines are consumed up to and including the '//' terminator line.
    The first 12 characters of each line hold the tag, the rest of the
    line holds the value.

    Returns the Record, or None if the handle is exhausted before any
    ID line is seen.

    Raises ValueError if:

     - a line appears before the first ID line,
     - a tag is not recognised,
     - a HOMOL value is neither YES nor NO,
     - the record has no SCOUNT line, or its SCOUNT value does not match
       the number of SEQUENCE lines,
     - the handle ends in the middle of a record.
    """
    UG_INDENT = 12  # width of the tag column
    record = None
    scount = None  # stays None until a SCOUNT line has been seen
    for line in handle:
        tag, value = line[:UG_INDENT].rstrip(), line[UG_INDENT:].rstrip()
        line = line.rstrip()
        if tag == "ID":
            record = Record()
            record.ID = value
            record.species = record.ID.split('.')[0]
        elif record is None:
            # Every other line needs a record to store into; fail with a
            # clear parse error rather than an AttributeError on None.
            raise ValueError("Unexpected line before ID line: %s" % line)
        elif tag == "TITLE":
            record.title = value
        elif tag == "GENE":
            record.symbol = value
        elif tag == "GENE_ID":
            record.gene_id = value
        elif tag == "LOCUSLINK":
            record.locuslink = value
        elif tag == "HOMOL":
            if value == "YES":
                record.homol = True
            elif value == "NO":
                record.homol = False
            else:
                raise ValueError("Cannot parse HOMOL line %s" % line)
        elif tag == "EXPRESS":
            record.express = [word.strip() for word in value.split("|")]
        elif tag == "RESTR_EXPR":
            record.restr_expr = [word.strip() for word in value.split("|")]
        elif tag == "GNM_TERMINUS":
            record.gnm_terminus = value
        elif tag == "CHROMOSOME":
            record.chromosome = value
        elif tag == "CYTOBAND":
            record.cytoband = value
        elif tag == "PROTSIM":
            record.protsim.append(ProtsimLine(value))
        elif tag == "SCOUNT":
            scount = int(value)
        elif tag == "SEQUENCE":
            record.sequence.append(SequenceLine(value))
        elif tag == "STS":
            record.sts.append(STSLine(value))
        elif tag == "TXMAP":
            # There is no dedicated class for TXMAP lines in this
            # module, so the text of the line is stored as it is.
            record.txmap.append(value)
        elif tag == '//':
            if scount is None:
                raise ValueError("No SCOUNT line found in record %s"
                                 % record.ID)
            if len(record.sequence) != scount:
                raise ValueError("The number of sequences specified in the record"
                                 " (%d) does not agree with the number of sequences found (%d)" % (scount, len(record.sequence)))
            return record
        else:
            raise ValueError("Unknown tag %s" % tag)
    if record:
        # An ID line was seen but the '//' terminator never arrived.
        raise ValueError("Unexpected end of stream.")
    return None