Package Bio :: Package UniProt :: Module GOA
[hide private]
[frames | no frames]

Source Code for Module Bio.UniProt.GOA

  1  #!/usr/bin/env python 
  2  # Copyright 2013 by Iddo Friedberg idoerg@gmail.com 
  3  # All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  """Parsers for the GAF, GPA and GPI formats from UniProt-GOA. 
  8   
  9  Uniprot-GOA README + GAF format description: 
 10  ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/README 
 11   
 12  GAF formats: 
 13  http://www.geneontology.org/GO.format.annotation.shtml 
 14  gp_association (GPA format) README: 
 15  ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/gp_association_readme 
 16   
 17  gp_information (GPI format) README: 
 18  ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/gp_information_readme 
 19  """ 
 20   
import copy
import itertools
import sys

from Bio._py3k import zip
 25   
 26  # GAF: GO Annotation Format 
 27  # 
 28  # GAF version 2.0 
 29   
 30  GAF20FIELDS = ['DB',  
 31          'DB_Object_ID',  
 32          'DB_Object_Symbol',  
 33          'Qualifier',  
 34          'GO_ID',  
 35          'DB:Reference',  
 36          'Evidence',  
 37          'With',  
 38          'Aspect', 
 39          'DB_Object_Name',  
 40          'Synonym',  
 41          'DB_Object_Type',  
 42          'Taxon_ID',  
 43          'Date',  
 44          'Assigned_By',  
 45          'Annotation_Extension',  
 46          'Gene_Product_Form_ID'] 
 47   
 48  # GAF version 1.0 
 49  GAF10FIELDS = ['DB',  
 50          'DB_Object_ID',  
 51          'DB_Object_Symbol',  
 52          'Qualifier',  
 53          'GO_ID',  
 54          'DB:Reference',  
 55          'Evidence',  
 56          'With',  
 57          'Aspect', 
 58          'DB_Object_Name',  
 59          'Synonym',  
 60          'DB_Object_Type',  
 61          'Taxon_ID',  
 62          'Date',  
 63          'Assigned_By']  
 64   
 65   
 66  # GPA version 1.0 
 67  GPA10FIELDS = [ 
 68        'DB', 
 69        'DB_Object_ID', 
 70        'Qualifier', 
 71        'GO_ID', 
 72        'DB:Reference', 
 73        'Evidence code', 
 74        'With', 
 75        'Interacting_taxon_ID', 
 76        'Date', 
 77        'Assigned_by', 
 78        'Annotation_Extension', 
 79        'Spliceform_ID'] 
 80   
 81  # GPA version 1.1 
 82  GPA11FIELDS = [ 
 83        'DB', 
 84        'DB_Object_ID', 
 85        'Qualifier', 
 86        'GO_ID', 
 87        'DB:Reference', 
 88        'ECO_Evidence_code', 
 89        'With', 
 90        'Interacting_taxon_ID', 
 91        'Date', 
 92        'Assigned_by', 
 93        'Annotation Extension', 
 94        'Annotation_Properties'] 
 95   
 96  # GPI version 1.0 
 97  GPI10FIELDS = [ 
 98        'DB', 
 99        'DB_subset', 
100        'DB_Object_ID', 
101        'DB_Object_Symbol', 
102        'DB_Object_Name', 
103        'DB_Object_Synonym', 
104        'DB_Object_Type', 
105        'Taxon', 
106        'Annotation_Target_Set', 
107        'Annotation_Completed', 
108        'Parent_Object_ID'] 
109   
110  # GPI version 1.1 
111  GPI11FIELDS = [ 
112        'DB_Object_ID', 
113        'DB_Object_Symbol', 
114        'DB_Object_Name', 
115        'DB_Object_Synonym', 
116        'DB_Object_Type', 
117        'Taxon', 
118        'Parent_Object_ID', 
119        'DB_Xref', 
120        'Gene_Product_Properties', 
121        'Annotation_Target_Set', 
122        'GO_Annotation_Complete'] 
123   
def _gpi10iterator(handle):
    """Read GPI 1.0 format files (PRIVATE).

    Iterates over the lines of a gp_information file in the GPI 1.0
    format and yields one dictionary per record, keyed by the names in
    GPI10FIELDS. Lines starting with '!' and lines containing no tab are
    skipped. The DB_Object_Synonym and Annotation_Target_Set columns are
    split on '|' into lists; every other value stays a string.
    """
    for line in handle:
        if line[0] == '!':
            # Header or comment line.
            continue
        columns = line.rstrip('\n').split('\t')
        if len(columns) < 2:
            # Nothing tab-separated on this line (e.g. a blank line).
            continue
        for idx in (5, 8):  # DB_Object_Synonym(s), Annotation_Target_Set
            columns[idx] = columns[idx].split('|')
        yield dict(zip(GPI10FIELDS, columns))
139
def _gpi11iterator(handle):
    """Read GPI 1.1 format files (PRIVATE).

    This iterator is used to read a gp_information.goa_uniprot
    file which is in the GPI 1.1 format. It yields one dictionary per
    record, keyed by the names in GPI11FIELDS. Lines starting with '!'
    and lines containing no tab are skipped. The DB_Object_Name,
    DB_Object_Synonym, DB_Xref and Gene_Product_Properties columns are
    split on '|' into lists; every other value stays a string.
    """
    for inline in handle:
        if inline[0] == '!':
            # Header or comment line.
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            # Nothing tab-separated on this line (e.g. a blank line).
            continue
        inrec[2] = inrec[2].split('|')  # DB_Object_Name
        inrec[3] = inrec[3].split('|')  # DB_Object_Synonym(s)
        inrec[7] = inrec[7].split('|')  # DB_Xref(s)
        inrec[8] = inrec[8].split('|')  # Properties
        yield dict(zip(GPI11FIELDS, inrec))
157
def gpi_iterator(handle):
    """Read GPI format files.

    This function should be called to read a gp_information.goa_uniprot
    file. The first line of ``handle`` is read and inspected: if it is
    exactly the header ``!gpi-version: 1.1`` the GPI 1.1 parser is
    returned, otherwise GPI 1.0 is assumed. The chosen version is
    reported on standard error.

    Arguments:
     - handle - text handle positioned at the start of the file; it must
       support ``readline()`` and iteration over lines.

    Returns an iterator that yields one dictionary per record.
    """
    first_line = handle.readline()
    if first_line.strip() == '!gpi-version: 1.1':
        sys.stderr.write("gpi 1.1\n")
        return _gpi11iterator(handle)
    sys.stderr.write("gpi 1.0\n")
    if first_line and first_line[0] != '!':
        # No version header: the line already read is not a comment, so
        # put it back in front of the stream instead of silently losing
        # what may be the first record.
        handle = itertools.chain([first_line], handle)
    return _gpi10iterator(handle)
173 174
def _gpa10iterator(handle):
    """Read GPA 1.0 format files (PRIVATE).

    Iterates over the lines of a gp_association.* file in the GPA 1.0
    format and yields one dictionary per record, keyed by the names in
    GPA10FIELDS. Not meant to be called directly; use gpa_iterator.

    Lines starting with '!' and lines containing no tab are skipped. The
    Qualifier, DB:Reference, With and Annotation_Extension columns are
    split on '|' into lists; every other value stays a string.
    """
    for line in handle:
        if line[0] == '!':
            # Header or comment line.
            continue
        columns = line.rstrip('\n').split('\t')
        if len(columns) < 2:
            # Nothing tab-separated on this line (e.g. a blank line).
            continue
        # Qualifier, DB:Reference(s), With, Annotation extension
        for idx in (2, 4, 6, 10):
            columns[idx] = columns[idx].split('|')
        yield dict(zip(GPA10FIELDS, columns))
194 195
def _gpa11iterator(handle):
    """Read GPA 1.1 format files (PRIVATE).

    Iterates over the lines of a gp_association.goa_uniprot file in the
    GPA 1.1 format and yields one dictionary per record, keyed by the
    names in GPA11FIELDS. Not meant to be called directly; use
    gpa_iterator.

    Lines starting with '!' and lines containing no tab are skipped. The
    columns at positions 2, 4, 6 and 10 (Qualifier, DB:Reference, With
    and the annotation extension) are split on '|' into lists; every
    other value stays a string.
    """
    for line in handle:
        if line[0] == '!':
            # Header or comment line.
            continue
        columns = line.rstrip('\n').split('\t')
        if len(columns) < 2:
            # Nothing tab-separated on this line (e.g. a blank line).
            continue
        # Qualifier, DB:Reference(s), With, Annotation extension
        for idx in (2, 4, 6, 10):
            columns[idx] = columns[idx].split('|')
        yield dict(zip(GPA11FIELDS, columns))
214 215
def gpa_iterator(handle):
    """Wrapper function: read GPA format files.

    This function should be called to read a gp_association file. The
    first line of ``handle`` is read and inspected: if it is exactly the
    header ``!gpa-version: 1.1`` a GPA 1.1 iterator is returned,
    otherwise GPA 1.0 is assumed. The chosen version is reported on
    standard error.

    Arguments:
     - handle - text handle positioned at the start of the file; it must
       support ``readline()`` and iteration over lines.

    Returns an iterator that yields one dictionary per record.
    """
    first_line = handle.readline()
    if first_line.strip() == '!gpa-version: 1.1':
        sys.stderr.write("gpa 1.1\n")
        return _gpa11iterator(handle)
    sys.stderr.write("gpa 1.0\n")
    if first_line and first_line[0] != '!':
        # No version header: the line already read is not a comment, so
        # put it back in front of the stream instead of silently losing
        # what may be the first record.
        handle = itertools.chain([first_line], handle)
    return _gpa10iterator(handle)
230 231
def _gaf20iterator(handle):
    """Read GAF 2.0 format files (PRIVATE).

    Yields one dictionary per record, keyed by the names in GAF20FIELDS.
    Lines starting with '!' and lines containing no tab are skipped. The
    Qualifier, DB:Reference, With, Synonym and Taxon_ID columns are split
    on '|' into lists; every other value stays a string.
    """
    for line in handle:
        if line[0] == '!':
            # Header or comment line.
            continue
        columns = line.rstrip('\n').split('\t')
        if len(columns) < 2:
            # Nothing tab-separated on this line (e.g. a blank line).
            continue
        # Qualifier, DB:reference(s), With || From, Synonym, Taxon
        for idx in (3, 5, 7, 10, 12):
            columns[idx] = columns[idx].split('|')
        yield dict(zip(GAF20FIELDS, columns))
245 246
def _gaf10iterator(handle):
    """Read GAF 1.0 format files (PRIVATE).

    Yields one dictionary per record, keyed by the names in GAF10FIELDS.
    Lines starting with '!' and lines containing no tab are skipped. The
    Qualifier, DB:Reference, With, Synonym and Taxon_ID columns are split
    on '|' into lists; every other value stays a string.
    """
    for line in handle:
        if line[0] == '!':
            # Header or comment line.
            continue
        columns = line.rstrip('\n').split('\t')
        if len(columns) < 2:
            # Nothing tab-separated on this line (e.g. a blank line).
            continue
        # Qualifier, DB:reference(s), With || From, Synonym, Taxon
        for idx in (3, 5, 7, 10, 12):
            columns[idx] = columns[idx].split('|')
        yield dict(zip(GAF10FIELDS, columns))
260 261
def _gaf10byproteiniterator(handle):
    """Read a GAF 1.0 file one gene product at a time (PRIVATE).

    Yields lists of record dictionaries (keyed by the names in
    GAF10FIELDS). Each list holds a run of consecutive records that share
    the same DB_Object_ID, including the last run in the file. Nothing is
    yielded when the input contains no records.

    Lines starting with '!' and lines containing no tab are skipped. The
    Qualifier, DB:Reference, With, Synonym and Taxon_ID columns are split
    on '|' into lists; every other value stays a string.
    """
    cur_id = None
    id_rec_list = []
    for inline in handle:
        if inline[0] == '!':
            # Header or comment line.
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            # Nothing tab-separated on this line (e.g. a blank line).
            continue
        inrec[3] = inrec[3].split('|')  # Qualifier
        inrec[5] = inrec[5].split('|')  # DB:reference(s)
        inrec[7] = inrec[7].split('|')  # With || From
        inrec[10] = inrec[10].split('|')  # Synonym
        inrec[12] = inrec[12].split('|')  # Taxon
        cur_rec = dict(zip(GAF10FIELDS, inrec))
        rec_id = cur_rec['DB_Object_ID']
        # A group is closed only once a non-empty current ID has been seen;
        # records following an empty ID therefore join the same group.
        if cur_id and rec_id != cur_id:
            yield id_rec_list
            # Rebind rather than clear, so the list just handed to the
            # caller is never modified afterwards.
            id_rec_list = []
        cur_id = rec_id
        id_rec_list.append(cur_rec)
    # The loop only emits a group when the next ID shows up, so the final
    # group has to be flushed here or it would be lost.
    if id_rec_list:
        yield id_rec_list
285 286
def _gaf20byproteiniterator(handle):
    """Read a GAF 2.0 file one gene product at a time (PRIVATE).

    Yields lists of record dictionaries (keyed by the names in
    GAF20FIELDS). Each list holds a run of consecutive records that share
    the same DB_Object_ID, including the last run in the file. Nothing is
    yielded when the input contains no records.

    Lines starting with '!' and lines containing no tab are skipped. The
    Qualifier, DB:Reference, With, Synonym and Taxon_ID columns are split
    on '|' into lists; every other value stays a string.
    """
    cur_id = None
    id_rec_list = []
    for inline in handle:
        if inline[0] == '!':
            # Header or comment line.
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            # Nothing tab-separated on this line (e.g. a blank line).
            continue
        inrec[3] = inrec[3].split('|')  # Qualifier
        inrec[5] = inrec[5].split('|')  # DB:reference(s)
        inrec[7] = inrec[7].split('|')  # With || From
        inrec[10] = inrec[10].split('|')  # Synonym
        inrec[12] = inrec[12].split('|')  # Taxon
        cur_rec = dict(zip(GAF20FIELDS, inrec))
        rec_id = cur_rec['DB_Object_ID']
        # A group is closed only once a non-empty current ID has been seen;
        # records following an empty ID therefore join the same group.
        if cur_id and rec_id != cur_id:
            yield id_rec_list
            # Rebind rather than clear, so the list just handed to the
            # caller is never modified afterwards.
            id_rec_list = []
        cur_id = rec_id
        id_rec_list.append(cur_rec)
    # The loop only emits a group when the next ID shows up, so the final
    # group has to be flushed here or it would be lost.
    if id_rec_list:
        yield id_rec_list
310 311
def gafbyproteiniterator(handle):
    """Iterate over records in a gene association file, grouped by protein.

    This function should be called to read a gene_association.goa_uniprot
    file. The first line of ``handle`` is read and inspected: a
    ``!gaf-version: 2.0``, ``2.1`` or ``2.2`` header selects the 17-column
    GAF 2.x parser, anything else selects the GAF 1.0 parser. The chosen
    parser is reported on standard error.

    Arguments:
     - handle - text handle positioned at the start of the file; it must
       support ``readline()`` and iteration over lines.

    Returns an iterator that yields lists of record dictionaries, each
    list holding consecutive records with the same DB_Object_ID.
    """
    first_line = handle.readline()
    # GAF 2.1 and 2.2 keep the 17-column layout of GAF 2.0, so they must not
    # fall through to the 15-column GAF 1.0 parser (which would drop the
    # last two columns).
    if first_line.strip() in ('!gaf-version: 2.0',
                              '!gaf-version: 2.1',
                              '!gaf-version: 2.2'):
        sys.stderr.write("gaf 2.0\n")
        return _gaf20byproteiniterator(handle)
    sys.stderr.write("gaf 1.0\n")
    if first_line and first_line[0] != '!':
        # No version header: the line already read is not a comment, so
        # put it back in front of the stream instead of silently losing
        # what may be the first record.
        handle = itertools.chain([first_line], handle)
    return _gaf10byproteiniterator(handle)
327 328
def gafiterator(handle):
    """Iterate over a GAF 1.0 or 2.x file.

    This function should be called to read a gene_association.goa_uniprot
    file. The first line of ``handle`` is read and inspected: a
    ``!gaf-version: 2.0``, ``2.1`` or ``2.2`` header selects the 17-column
    GAF 2.x parser, anything else selects the GAF 1.0 parser. The chosen
    parser is reported on standard error.

    Arguments:
     - handle - text handle positioned at the start of the file; it must
       support ``readline()`` and iteration over lines.

    Returns an iterator that yields one dictionary per record.
    """
    first_line = handle.readline()
    # GAF 2.1 and 2.2 keep the 17-column layout of GAF 2.0, so they must not
    # fall through to the 15-column GAF 1.0 parser (which would drop the
    # last two columns).
    if first_line.strip() in ('!gaf-version: 2.0',
                              '!gaf-version: 2.1',
                              '!gaf-version: 2.2'):
        sys.stderr.write("gaf 2.0\n")
        return _gaf20iterator(handle)
    sys.stderr.write("gaf 1.0\n")
    if first_line and first_line[0] != '!':
        # No version header: the line already read is not a comment, so
        # put it back in front of the stream instead of silently losing
        # what may be the first record.
        handle = itertools.chain([first_line], handle)
    return _gaf10iterator(handle)
343 344
def writerec(outrec, handle, fields=GAF20FIELDS):
    """Write a single UniProt-GOA record to an output stream.

    The record is written as one line: the values of ``fields``, in
    order, separated by tabs and terminated by a newline. A value that is
    a list is written with its items joined by '|'; an empty list gives
    an empty column.

    Arguments:
     - outrec - dictionary holding a value (string or list of strings)
       for every name in ``fields``.
     - handle - output stream with a ``write()`` method.
     - fields - column names to write, in order. The caller should know
       the format version; the default is GAF 2.0.

    Raises KeyError if ``outrec`` lacks one of ``fields``.
    """
    columns = []
    for field in fields:
        value = outrec[field]
        if isinstance(value, list):
            # Multi-valued column: items are pipe-separated on output.
            value = '|'.join(value)
        columns.append(value)
    handle.write('\t'.join(columns) + '\n')
362 363
def writebyproteinrec(outprotrec, handle, fields=GAF20FIELDS):
    """Write a list of GAF records to an output stream.

    Every record in ``outprotrec`` is passed to writerec, in order, with
    the same ``handle`` and ``fields``. The caller should know the format
    version; the default is GAF 2.0. Typically the list is one produced
    by gafbyproteiniterator, i.e. all consecutive records with the same
    DB_Object_ID.
    """
    for record in outprotrec:
        writerec(record, handle, fields=fields)
374 375
def record_has(inrec, fieldvals):
    """Tell whether a record matches any of the given field values.

    ``fieldvals`` has the form {'field_name': set([val1, val2])}. For each
    field named there, the record's value (a single string, or an
    iterable of values) is compared against the given set. Returns True
    as soon as one field shares a value with its set, False if none does.
    """
    def _as_set(value):
        # A plain string counts as one value, not as a sequence of letters.
        if isinstance(value, str):
            return set([value])
        return set(value)

    return any(_as_set(inrec[field]) & fieldvals[field]
               for field in fieldvals)
if __name__ == '__main__':
    # Example: read and filter a GAF file.
    #
    # Reads the GAF file named by the first command-line argument and
    # writes to standard output, using the GAF 1.0 columns, only the
    # S. cerevisiae records (taxon:4932), dropping every record whose
    # evidence code is IEA or EXP.
    banned = {'Evidence': set(['IEA', 'EXP'])}
    allowed = {'Taxon_ID': set(['taxon:4932'])}
    # The with-block makes sure the input file is closed when done.
    with open(sys.argv[1]) as gaf_handle:
        for inrec in gafiterator(gaf_handle):
            if record_has(inrec, allowed) and \
                    not record_has(inrec, banned):
                writerec(inrec, sys.stdout, GAF10FIELDS)