Package Bio :: Package UniGene
[hide private]
[frames | no frames]

Source Code for Package Bio.UniGene

  1  # Copyright 2006 by Sean Davis.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  # 
  6  # $Id: __init__.py,v 1.12 2009-04-24 12:03:45 mdehoon Exp $ 
  7  # Sean Davis <sdavis2 at mail dot nih dot gov> 
  8  # National Cancer Institute 
  9  # National Institutes of Health 
 10  # Bethesda, MD, USA 
 11  # 
 12   
 13  """Parse Unigene flat file format files such as the Hs.data file. 
 14   
 15  Here is an overview of the flat file format that this parser deals with: 
 16   
 17     Line types/qualifiers:: 
 18   
 19         ID           UniGene cluster ID 
 20         TITLE        Title for the cluster 
 21         GENE         Gene symbol 
 22         CYTOBAND     Cytological band 
 23         EXPRESS      Tissues of origin for ESTs in cluster 
 24         RESTR_EXPR   Single tissue or development stage contributes 
 25                      more than half the total EST frequency for this gene. 
 26         GNM_TERMINUS genomic confirmation of presence of a 3' terminus; 
 27                      T if a non-templated polyA tail is found among 
 28                      a cluster's sequences; else 
 29                      I if templated As are found in genomic sequence or 
 30                      S if a canonical polyA signal is found on 
 31                        the genomic sequence 
 32         GENE_ID      Entrez gene identifier associated with at least one 
 33                      sequence in this cluster; 
 34                      to be used instead of LocusLink. 
 35         LOCUSLINK    LocusLink identifier associated with at least one 
 36                      sequence in this cluster; 
 37                      deprecated in favor of GENE_ID 
 38         HOMOL        Homology; 
 39         CHROMOSOME   Chromosome.  For plants, CHROMOSOME refers to mapping 
 40                      on the Arabidopsis genome. 
 41         STS          STS 
 42              ACC=         GenBank/EMBL/DDBJ accession number of STS 
 43                           [optional field] 
 44              UNISTS=      identifier in NCBI's UNISTS database 
 45         TXMAP        Transcript map interval 
 46              MARKER=      Marker found on at least one sequence in this 
 47                           cluster 
 48              RHPANEL=     Radiation Hybrid panel used to place marker 
 49         PROTSIM      Protein Similarity data for the sequence with 
 50                      highest-scoring protein similarity in this cluster 
 51              ORG=         Organism 
 52              PROTGI=      Sequence GI of protein 
 53              PROTID=      Sequence ID of protein 
 54              PCT=         Percent alignment 
 55              ALN=         length of aligned region (aa) 
 56         SCOUNT       Number of sequences in the cluster 
 57         SEQUENCE     Sequence 
 58              ACC=         GenBank/EMBL/DDBJ accession number of sequence 
 59              NID=         Unique nucleotide sequence identifier (gi) 
 60              PID=         Unique protein sequence identifier (used for 
 61                           non-ESTs) 
 62              CLONE=       Clone identifier (used for ESTs only) 
 63              END=         End (5'/3') of clone insert read (used for 
 64                           ESTs only) 
 65              LID=         Library ID; see Hs.lib.info for library name 
 66                           and tissue 
 67              MGC=         5' CDS-completeness indicator; if present, the 
 68                           clone associated with this sequence is believed 
 69                           CDS-complete. A value greater than 511 is the gi 
 70                           of the CDS-complete mRNA matched by the EST, 
 71                           otherwise the value is an indicator of the 
 72                           reliability of the test indicating CDS 
 73                           completeness; higher values indicate more 
 74                           reliable CDS-completeness predictions. 
 75             SEQTYPE=      Description of the nucleotide sequence. 
 76                           Possible values are mRNA, EST and HTC. 
 77             TRACE=        The Trace ID of the EST sequence, as provided by 
 78                           NCBI Trace Archive 
 79  """ 
 80   
 81  __docformat__ = "restructuredtext en" 
 82   
class SequenceLine(object):
    """Store the information for one SEQUENCE line from a Unigene file.

    Initialize with the text part of the SEQUENCE line, or nothing.

    Attributes and descriptions (access as LOWER CASE):

     - ACC=      GenBank/EMBL/DDBJ accession number of sequence
     - NID=      Unique nucleotide sequence identifier (gi)
     - PID=      Unique protein sequence identifier (used for non-ESTs)
     - CLONE=    Clone identifier (used for ESTs only)
     - END=      End (5'/3') of clone insert read (used for ESTs only)
     - LID=      Library ID; see Hs.lib.info for library name and tissue
     - MGC=      5' CDS-completeness indicator; if present,
                 the clone associated with this sequence
                 is believed CDS-complete. A value greater than 511
                 is the gi of the CDS-complete mRNA matched by the EST,
                 otherwise the value is an indicator of the reliability
                 of the test indicating CDS completeness;
                 higher values indicate more reliable CDS-completeness
                 predictions.
     - SEQTYPE=  Description of the nucleotide sequence. Possible values
                 are mRNA, EST and HTC.
     - TRACE=    The Trace ID of the EST sequence, as provided by NCBI
                 Trace Archive

    Additional attributes set by this class:

     - text      The raw text the object was built from ('' if none given)
     - is_image  True when the CLONE value starts with 'IMAGE'
     - image     The CLONE value with its first six characters removed
                 (i.e. what follows 'IMAGE' and one separator character),
                 or '' when the clone is not an IMAGE clone
    """

    def __init__(self, text=None):
        """Create an empty entry, or parse ``text`` if it is given."""
        # Always define ``text`` so that repr() works on an empty instance.
        self.text = ''
        self.acc = ''
        self.nid = ''
        self.lid = ''
        self.pid = ''
        self.clone = ''
        self.image = ''
        self.is_image = False
        self.end = ''
        self.mgc = ''
        self.seqtype = ''
        self.trace = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Fill the attributes from '; '-separated ``KEY=value`` pairs.

        Every key is stored under its lower-cased name.  Raises ValueError
        if a part contains no '=' at all.
        """
        for part in text.split('; '):
            # Split on the first '=' only, so a value may itself contain '='.
            key, val = part.split("=", 1)
            if key == 'CLONE' and val[:5] == 'IMAGE':
                self.is_image = True
                # Skip 'IMAGE' plus the single separator character after it.
                self.image = val[6:]
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return the raw text this entry was created from."""
        return self.text
138 139
class ProtsimLine(object):
    """Store the information for one PROTSIM line from a Unigene file.

    Initialize with the text part of the PROTSIM line, or nothing.

    Attributes and descriptions (access as LOWER CASE):

     - ORG=      Organism
     - PROTGI=   Sequence GI of protein
     - PROTID=   Sequence ID of protein
     - PCT=      Percent alignment
     - ALN=      length of aligned region (aa)

    The raw text is kept in the ``text`` attribute ('' if none was given).
    All parsed values are stored as strings.
    """

    def __init__(self, text=None):
        """Create an empty entry, or parse ``text`` if it is given."""
        # Always define ``text`` so that repr() works on an empty instance.
        self.text = ''
        self.org = ''
        self.protgi = ''
        self.protid = ''
        self.pct = ''
        self.aln = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Fill the attributes from '; '-separated ``KEY=value`` pairs.

        Every key is stored under its lower-cased name.  Raises ValueError
        if a part contains no '=' at all.
        """
        for part in text.split('; '):
            # Split on the first '=' only, so a value may itself contain '='.
            key, val = part.split("=", 1)
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return the raw text this entry was created from."""
        return self.text
172 173
class STSLine(object):
    """Store the information for one STS line from a Unigene file.

    Initialize with the text part of the STS line, or nothing.

    Attributes and descriptions (access as LOWER CASE):

     - ACC=      GenBank/EMBL/DDBJ accession number of STS [optional field]
     - UNISTS=   identifier in NCBI's UNISTS database

    The raw text is kept in the ``text`` attribute ('' if none was given).
    """

    def __init__(self, text=None):
        """Create an empty entry, or parse ``text`` if it is given."""
        # Always define ``text`` so that repr() works on an empty instance.
        self.text = ''
        self.acc = ''
        self.unists = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Fill the attributes from ``KEY=value`` pairs.

        Unlike the SEQUENCE and PROTSIM lines, the pairs on an STS line are
        separated by a single space.  Every key is stored under its
        lower-cased name.  Raises ValueError if a part contains no '='.
        """
        for part in text.split(' '):
            # Split on the first '=' only, so a value may itself contain '='.
            key, val = part.split("=", 1)
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return the raw text this entry was created from."""
        return self.text
201 202
class Record(object):
    """Container for a single Unigene record.

    A freshly created record holds the following attributes::

        ID            ''   ID line
        species       ''   Hs, Bt, etc.
        title         ''   TITLE line
        symbol        ''   GENE line
        cytoband      ''   CYTOBAND line
        express       []   EXPRESS line, a list of strings
        restr_expr    ''   RESTR_EXPR line
        gnm_terminus  ''   GNM_TERMINUS line
        gene_id       ''   GENE_ID line
        locuslink     ''   LOCUSLINK line
        homol         ''   HOMOL line
        chromosome    ''   CHROMOSOME line
        protsim       []   PROTSIM entries, list of ProtsimLine
        sequence      []   SEQUENCE entries, list of SequenceLine
        sts           []   STS entries, list of STSLine
        txmap         []   TXMAP entries
    """

    def __init__(self):
        """Create an empty record: '' for scalar fields, [] for lists."""
        # (attribute name, factory) in the order the attributes are created;
        # each factory call yields a fresh '' or a fresh, unshared [].
        layout = (
            ("ID", str),
            ("species", str),
            ("title", str),
            ("symbol", str),
            ("cytoband", str),
            ("express", list),
            ("restr_expr", str),
            ("gnm_terminus", str),
            ("gene_id", str),
            ("locuslink", str),
            ("homol", str),
            ("chromosome", str),
            ("protsim", list),
            ("sequence", list),
            ("sts", list),
            ("txmap", list),
        )
        for attribute, factory in layout:
            setattr(self, attribute, factory())

    def __repr__(self):
        """Return '<ClassName> ID symbol' followed by the title on a new line."""
        shown = (self.__class__.__name__, self.ID, self.symbol, self.title)
        return "<%s> %s %s\n%s" % shown
251 252
def parse(handle):
    """Iterate over the Unigene records found in ``handle``.

    This is a generator: each step reads one record with ``_read`` and
    yields it.  Iteration stops as soon as ``_read`` returns a false value.
    Any ValueError raised by ``_read`` propagates to the caller.
    """
    record = _read(handle)
    while record:
        yield record
        # Fetch the next record only once the consumer asks for it.
        record = _read(handle)
259 260
def read(handle):
    """Read exactly one Unigene record from ``handle`` and return it.

    Raises ValueError if no record is found, or if any data is left in
    the handle after the first record.
    """
    record = _read(handle)
    if not record:
        raise ValueError("No UniGene record found")
    # We should have reached the end of the record by now
    remainder = handle.read()
    if remainder:
        raise ValueError("More than one UniGene record found")
    return record
270 271 272 # Everything below is private 273 274
def _read(handle):
    """Read the next Unigene record from ``handle``.

    Lines are consumed until the record terminator ``//`` is reached.  The
    first 12 characters of every line are taken as the tag, the rest as
    its value.

    Returns the populated Record, or None if the handle is exhausted before
    any record was started.

    Raises ValueError if:

     - a tag is not recognized,
     - the HOMOL value is neither YES nor NO,
     - the record has no SCOUNT line, or the SCOUNT value does not match
       the number of SEQUENCE lines,
     - the handle ends in the middle of a record.
    """
    UG_INDENT = 12  # width of the tag column
    record = None
    scount = None  # stays None until an SCOUNT line is seen
    for line in handle:
        tag, value = line[:UG_INDENT].rstrip(), line[UG_INDENT:].rstrip()
        line = line.rstrip()
        if tag == "ID":
            record = Record()
            record.ID = value
            # The species code is the part of the ID before the first dot.
            record.species = record.ID.split('.')[0]
        elif tag == "TITLE":
            record.title = value
        elif tag == "GENE":
            record.symbol = value
        elif tag == "GENE_ID":
            record.gene_id = value
        elif tag == "LOCUSLINK":
            record.locuslink = value
        elif tag == "HOMOL":
            if value == "YES":
                record.homol = True
            elif value == "NO":
                record.homol = False
            else:
                raise ValueError("Cannot parse HOMOL line %s" % line)
        elif tag == "EXPRESS":
            record.express = [word.strip() for word in value.split("|")]
        elif tag == "RESTR_EXPR":
            record.restr_expr = [word.strip() for word in value.split("|")]
        elif tag == "CHROMOSOME":
            record.chromosome = value
        elif tag == "CYTOBAND":
            record.cytoband = value
        elif tag == "PROTSIM":
            protsim = ProtsimLine(value)
            record.protsim.append(protsim)
        elif tag == "SCOUNT":
            scount = int(value)
        elif tag == "SEQUENCE":
            sequence = SequenceLine(value)
            record.sequence.append(sequence)
        elif tag == "STS":
            sts = STSLine(value)
            record.sts.append(sts)
        elif tag == '//':
            if scount is None:
                # Report a malformed record as a format error instead of
                # failing on an unbound local variable.
                raise ValueError("No SCOUNT line found in the record")
            if len(record.sequence) != scount:
                raise ValueError("The number of sequences specified in the record"
                                 " (%d) does not agree with the number of sequences found (%d)" % (scount, len(record.sequence)))
            return record
        else:
            # NOTE(review): GNM_TERMINUS and TXMAP are listed in the module
            # docstring and have Record attributes, but are rejected here.
            raise ValueError("Unknown tag %s" % tag)
    if record:
        raise ValueError("Unexpected end of stream.")
328