Package Bio :: Package UniProt :: Module GOA
[hide private]
[frames | no frames]

Source Code for Module Bio.UniProt.GOA

  1  #!/usr/bin/env python 
  2  # Copyright 2013 by Iddo Friedberg idoerg@gmail.com 
  3  # All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  """Parsers for the GAF, GPA and GPI formats from UniProt-GOA. 
  8   
  9  Uniprot-GOA README + GAF format description: 
 10  ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/README 
 11   
 12  GAF formats: 
 13  http://www.geneontology.org/GO.format.annotation.shtml 
 14  gp_association (GPA format) README: 
 15  ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/gp_association_readme 
 16   
 17  gp_information (GPI format) README: 
 18  ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/gp_information_readme 
 19  """ 
 20   
 21  import copy 
 22  import sys 
 23   
 24  from Bio._py3k import zip 
 25   
 26  # GAF: GO Annotation Format 
 27  # 
 28  # GAF version 2.0 
 29   
 30  GAF20FIELDS = ['DB', 
 31          'DB_Object_ID', 
 32          'DB_Object_Symbol', 
 33          'Qualifier', 
 34          'GO_ID', 
 35          'DB:Reference', 
 36          'Evidence', 
 37          'With', 
 38          'Aspect', 
 39          'DB_Object_Name', 
 40          'Synonym', 
 41          'DB_Object_Type', 
 42          'Taxon_ID', 
 43          'Date', 
 44          'Assigned_By', 
 45          'Annotation_Extension', 
 46          'Gene_Product_Form_ID'] 
 47   
 48  # GAF version 1.0 
 49  GAF10FIELDS = ['DB', 
 50          'DB_Object_ID', 
 51          'DB_Object_Symbol', 
 52          'Qualifier', 
 53          'GO_ID', 
 54          'DB:Reference', 
 55          'Evidence', 
 56          'With', 
 57          'Aspect', 
 58          'DB_Object_Name', 
 59          'Synonym', 
 60          'DB_Object_Type', 
 61          'Taxon_ID', 
 62          'Date', 
 63          'Assigned_By'] 
 64   
 65   
 66  # GPA version 1.0 
 67  GPA10FIELDS = [ 
 68        'DB', 
 69        'DB_Object_ID', 
 70        'Qualifier', 
 71        'GO_ID', 
 72        'DB:Reference', 
 73        'Evidence code', 
 74        'With', 
 75        'Interacting_taxon_ID', 
 76        'Date', 
 77        'Assigned_by', 
 78        'Annotation_Extension', 
 79        'Spliceform_ID'] 
 80   
 81  # GPA version 1.1 
 82  GPA11FIELDS = [ 
 83        'DB', 
 84        'DB_Object_ID', 
 85        'Qualifier', 
 86        'GO_ID', 
 87        'DB:Reference', 
 88        'ECO_Evidence_code', 
 89        'With', 
 90        'Interacting_taxon_ID', 
 91        'Date', 
 92        'Assigned_by', 
 93        'Annotation Extension', 
 94        'Annotation_Properties'] 
 95   
 96  # GPI version 1.0 
 97  GPI10FIELDS = [ 
 98        'DB', 
 99        'DB_subset', 
100        'DB_Object_ID', 
101        'DB_Object_Symbol', 
102        'DB_Object_Name', 
103        'DB_Object_Synonym', 
104        'DB_Object_Type', 
105        'Taxon', 
106        'Annotation_Target_Set', 
107        'Annotation_Completed', 
108        'Parent_Object_ID'] 
109   
# GPI version 1.1: column names of a gp_information file, in file order.
GPI11FIELDS = [
    "DB_Object_ID",
    "DB_Object_Symbol",
    "DB_Object_Name",
    "DB_Object_Synonym",
    "DB_Object_Type",
    "Taxon",
    "Parent_Object_ID",
    "DB_Xref",
    "Gene_Product_Properties",
    "Annotation_Target_Set",
    "GO_Annotation_Complete",
]
123   
124   
def _gpi10iterator(handle):
    """Iterate over the records of a GPI 1.0 file (PRIVATE).

    Parses a gp_information.goa_uniprot file in GPI 1.0 format.  Lines
    starting with '!' and lines without any tab are skipped.  Every
    other line yields a dictionary pairing the names in GPI10FIELDS
    with the tab-separated columns; the DB_Object_Synonym and
    Annotation_Target_Set columns are returned as lists split on '|'.
    """
    # Column positions holding '|'-separated multi-value entries:
    # 5 = DB_Object_Synonym(s), 8 = Annotation_Target_Set
    multi_valued = (5, 8)
    for line in handle:
        if line[0] == "!":
            continue
        columns = line.rstrip("\n").split("\t")
        if len(columns) == 1:
            continue
        for idx in multi_valued:
            columns[idx] = columns[idx].split("|")
        yield dict(zip(GPI10FIELDS, columns))
140 141
def _gpi11iterator(handle):
    """Iterate over the records of a GPI 1.1 file (PRIVATE).

    Parses a gp_information.goa_uniprot file in GPI 1.1 format.  Lines
    starting with '!' and lines without any tab are skipped.  Every
    other line yields a dictionary pairing the names in GPI11FIELDS
    with the tab-separated columns (pairing stops at the shorter of the
    two).  The columns at positions 2, 3, 7 and 8 are returned as lists
    split on '|'.
    """
    # Column positions holding '|'-separated multi-value entries:
    # 2 = DB_Object_Name, 3 = DB_Object_Synonym(s),
    # 7 = DB_Xref(s), 8 = Properties
    multi_valued = (2, 3, 7, 8)
    for line in handle:
        if line[0] == "!":
            continue
        columns = line.rstrip("\n").split("\t")
        if len(columns) == 1:
            continue
        for idx in multi_valued:
            columns[idx] = columns[idx].split("|")
        yield dict(zip(GPI11FIELDS, columns))
159 160
def gpi_iterator(handle):
    """Read GPI format files.

    This function should be called to read a
    gp_information.goa_uniprot file.  The first line of ``handle`` is
    inspected: if it is exactly the ``!gpi-version: 1.1`` header a
    GPI 1.1 record iterator is returned, otherwise a GPI 1.0 record
    iterator is returned.  The chosen version is reported on stderr.

    Arguments:
     - handle - an open, line-oriented text handle positioned at the
       start of the file.

    Returns an iterator yielding one dictionary per record.
    """
    import itertools

    first_line = handle.readline()
    if first_line.strip() == "!gpi-version: 1.1":
        sys.stderr.write("gpi 1.1\n")
        return _gpi11iterator(handle)
    sys.stderr.write("gpi 1.0\n")
    if first_line and not first_line.startswith("!"):
        # The file has no '!' header, so the line consumed for version
        # sniffing is a real record: hand it back to the parser rather
        # than silently dropping it.
        handle = itertools.chain([first_line], handle)
    return _gpi10iterator(handle)
176 177
def _gpa10iterator(handle):
    """Iterate over the records of a GPA 1.0 file (PRIVATE).

    Parses a gp_association.* file in GPA 1.0 format.  Do not call
    directly; use the gpa_iterator function instead.  Lines starting
    with '!' and lines without any tab are skipped.  Every other line
    yields a dictionary pairing the names in GPA10FIELDS with the
    tab-separated columns; the Qualifier, DB:Reference, With and
    Annotation_Extension columns are returned as lists split on '|'.
    """
    # Column positions holding '|'-separated multi-value entries:
    # 2 = Qualifier, 4 = DB:Reference(s), 6 = With,
    # 10 = Annotation extension
    multi_valued = (2, 4, 6, 10)
    for line in handle:
        if line[0] == "!":
            continue
        columns = line.rstrip("\n").split("\t")
        if len(columns) == 1:
            continue
        for idx in multi_valued:
            columns[idx] = columns[idx].split("|")
        yield dict(zip(GPA10FIELDS, columns))
197 198
def _gpa11iterator(handle):
    """Iterate over the records of a GPA 1.1 file (PRIVATE).

    Parses a gp_association.goa_uniprot file in GPA 1.1 format.  Do not
    call directly; use the gpa_iterator function instead.  Lines
    starting with '!' and lines without any tab are skipped.  Every
    other line yields a dictionary pairing the names in GPA11FIELDS
    with the tab-separated columns; the columns at positions 2, 4, 6
    and 10 are returned as lists split on '|'.
    """
    # Column positions holding '|'-separated multi-value entries:
    # 2 = Qualifier, 4 = DB:Reference(s), 6 = With,
    # 10 = Annotation extension
    multi_valued = (2, 4, 6, 10)
    for line in handle:
        if line[0] == "!":
            continue
        columns = line.rstrip("\n").split("\t")
        if len(columns) == 1:
            continue
        for idx in multi_valued:
            columns[idx] = columns[idx].split("|")
        yield dict(zip(GPA11FIELDS, columns))
217 218
def gpa_iterator(handle):
    """Wrapper function: read GPA format files.

    This function should be called to read a gp_association file.
    The first line of ``handle`` is inspected: if it is exactly the
    ``!gpa-version: 1.1`` header a GPA 1.1 record iterator is returned,
    otherwise a GPA 1.0 record iterator is returned.  The chosen
    version is reported on stderr.

    Arguments:
     - handle - an open, line-oriented text handle positioned at the
       start of the file.

    Returns an iterator yielding one dictionary per record.
    """
    import itertools

    first_line = handle.readline()
    if first_line.strip() == "!gpa-version: 1.1":
        sys.stderr.write("gpa 1.1\n")
        return _gpa11iterator(handle)
    sys.stderr.write("gpa 1.0\n")
    if first_line and not first_line.startswith("!"):
        # The file has no '!' header, so the line consumed for version
        # sniffing is a real record: hand it back to the parser rather
        # than silently dropping it.
        handle = itertools.chain([first_line], handle)
    return _gpa10iterator(handle)
233 234
def _gaf20iterator(handle):
    """Iterate over the records of a GAF 2.0 file (PRIVATE).

    Lines starting with '!' and lines without any tab are skipped.
    Every other line yields a dictionary pairing the names in
    GAF20FIELDS with the tab-separated columns; the Qualifier,
    DB:Reference, With, Synonym and Taxon_ID columns are returned as
    lists split on '|'.
    """
    # Column positions holding '|'-separated multi-value entries:
    # 3 = Qualifier, 5 = DB:reference(s), 7 = With || From,
    # 10 = Synonym, 12 = Taxon
    multi_valued = (3, 5, 7, 10, 12)
    for line in handle:
        if line[0] == "!":
            continue
        columns = line.rstrip("\n").split("\t")
        if len(columns) == 1:
            continue
        for idx in multi_valued:
            columns[idx] = columns[idx].split("|")
        yield dict(zip(GAF20FIELDS, columns))
248 249
def _gaf10iterator(handle):
    """Iterate over the records of a GAF 1.0 file (PRIVATE).

    Lines starting with '!' and lines without any tab are skipped.
    Every other line yields a dictionary pairing the names in
    GAF10FIELDS with the tab-separated columns (pairing stops at the
    shorter of the two); the Qualifier, DB:Reference, With, Synonym and
    Taxon_ID columns are returned as lists split on '|'.
    """
    # Column positions holding '|'-separated multi-value entries:
    # 3 = Qualifier, 5 = DB:reference(s), 7 = With || From,
    # 10 = Synonym, 12 = Taxon
    multi_valued = (3, 5, 7, 10, 12)
    for line in handle:
        if line[0] == "!":
            continue
        columns = line.rstrip("\n").split("\t")
        if len(columns) == 1:
            continue
        for idx in multi_valued:
            columns[idx] = columns[idx].split("|")
        yield dict(zip(GAF10FIELDS, columns))
263 264
def _gaf10byproteiniterator(handle):
    """Group consecutive GAF 1.0 records by DB_Object_ID (PRIVATE).

    Records are parsed with _gaf10iterator.  Each yielded item is a
    list of the record dictionaries from one run of consecutive lines
    sharing the same DB_Object_ID, in file order.  The final run is
    yielded when the input is exhausted; nothing is yielded for an
    input without records.
    """
    group = []
    group_id = None
    for record in _gaf10iterator(handle):
        record_id = record["DB_Object_ID"]
        # Testing the group for emptiness (rather than the truthiness
        # of the previous ID) keeps an empty-string ID from being
        # merged into the following run.
        if group and record_id != group_id:
            yield group
            group = []
        group_id = record_id
        group.append(record)
    # Flush the last run, which no later ID change would ever emit.
    if group:
        yield group
288 289
def _gaf20byproteiniterator(handle):
    """Group consecutive GAF 2.0 records by DB_Object_ID (PRIVATE).

    Records are parsed with _gaf20iterator.  Each yielded item is a
    list of the record dictionaries from one run of consecutive lines
    sharing the same DB_Object_ID, in file order.  The final run is
    yielded when the input is exhausted; nothing is yielded for an
    input without records.
    """
    group = []
    group_id = None
    for record in _gaf20iterator(handle):
        record_id = record["DB_Object_ID"]
        # Testing the group for emptiness (rather than the truthiness
        # of the previous ID) keeps an empty-string ID from being
        # merged into the following run.
        if group and record_id != group_id:
            yield group
            group = []
        group_id = record_id
        group.append(record)
    # Flush the last run, which no later ID change would ever emit.
    if group:
        yield group
313 314
def gafbyproteiniterator(handle):
    """Iterate over records in a gene association file, grouped by protein.

    This function should be called to read a
    gene_association.goa_uniprot file.  The first line of ``handle`` is
    inspected: if it is exactly the ``!gaf-version: 2.0`` header a
    GAF 2.0 iterator is returned, otherwise a GAF 1.0 iterator is
    returned.  The chosen version is reported on stderr.

    Arguments:
     - handle - an open, line-oriented text handle positioned at the
       start of the file.

    Returns an iterator yielding lists of record dictionaries; each
    list holds all consecutive records with the same DB_Object_ID.
    """
    import itertools

    first_line = handle.readline()
    # NOTE(review): only the exact 2.0 header selects the 2.0 parser;
    # any other header (e.g. a later 2.x version) falls through to the
    # 1.0 parser -- confirm this is intended.
    if first_line.strip() == "!gaf-version: 2.0":
        sys.stderr.write("gaf 2.0\n")
        return _gaf20byproteiniterator(handle)
    sys.stderr.write("gaf 1.0\n")
    if first_line and not first_line.startswith("!"):
        # The file has no '!' header, so the line consumed for version
        # sniffing is a real record: hand it back to the parser rather
        # than silently dropping it.
        handle = itertools.chain([first_line], handle)
    return _gaf10byproteiniterator(handle)
330 331
def gafiterator(handle):
    """Iterate over a GAF 1.0 or 2.0 file.

    This function should be called to read a
    gene_association.goa_uniprot file.  The first line of ``handle`` is
    inspected: if it is exactly the ``!gaf-version: 2.0`` header a
    GAF 2.0 iterator is returned, otherwise a GAF 1.0 iterator is
    returned.  The chosen version is reported on stderr.

    Arguments:
     - handle - an open, line-oriented text handle positioned at the
       start of the file.

    Returns an iterator yielding one dictionary per record.
    """
    import itertools

    first_line = handle.readline()
    # NOTE(review): only the exact 2.0 header selects the 2.0 parser;
    # any other header (e.g. a later 2.x version) falls through to the
    # 1.0 parser -- confirm this is intended.
    if first_line.strip() == "!gaf-version: 2.0":
        sys.stderr.write("gaf 2.0\n")
        return _gaf20iterator(handle)
    sys.stderr.write("gaf 1.0\n")
    if first_line and not first_line.startswith("!"):
        # The file has no '!' header, so the line consumed for version
        # sniffing is a real record: hand it back to the parser rather
        # than silently dropping it.
        handle = itertools.chain([first_line], handle)
    return _gaf10iterator(handle)
346 347
def writerec(outrec, handle, fields=GAF20FIELDS):
    """Write a single UniProt-GOA record to an output stream.

    The caller should know the format version; the default is GAF 2.0.

    Arguments:
     - outrec - dictionary holding one record; every name in ``fields``
       must be a key.  Values are strings, or lists of strings for
       multi-valued columns.
     - handle - output stream; only its ``write`` method is used.
     - fields - column names, in output order.

    List values are joined with '|', columns are joined with tabs, and
    the line is terminated with a newline.  An empty list is written as
    an empty column.
    """
    columns = []
    for field in fields:
        value = outrec[field]
        if isinstance(value, list):
            # Multi-valued column: joining also keeps an empty list as
            # an empty column instead of dropping the column.
            value = "|".join(value)
        columns.append(value)
    handle.write("\t".join(columns) + "\n")
365 366
def writebyproteinrec(outprotrec, handle, fields=GAF20FIELDS):
    """Write a list of GAF records to an output stream.

    The caller should know the format version; the default is GAF 2.0.
    Each record in ``outprotrec`` is written, in order, with writerec
    using the given ``fields``.  Typically the list is one produced by
    gafbyproteiniterator, i.e. all consecutive lines with the same
    DB_Object_ID.
    """
    for record in outprotrec:
        writerec(record, handle, fields=fields)
377 378
def record_has(inrec, fieldvals):
    """Check whether a record matches any of the given field values.

    ``fieldvals`` has the format {'field_name': set([val1, val2])}.
    For each listed field, the record's value (a single string, or an
    iterable of values) is compared against the wanted set.  Returns
    True as soon as one field has a value in common with its set,
    otherwise False.
    """
    for field, wanted in fieldvals.items():
        value = inrec[field]
        # A plain string is one value, not an iterable of characters.
        present = {value} if isinstance(value, str) else set(value)
        if present & wanted:
            return True
    return False
if __name__ == "__main__":
    # Example: read and filter a GAF file.
    #
    # Reads the GAF file named by the first command-line argument and
    # writes to stdout, using the GAF 1.0 columns, only the records
    # whose Taxon_ID includes taxon:4932 (S. cerevisiae) and whose
    # evidence code is neither IEA nor EXP.
    banned = {"Evidence": set(["IEA", "EXP"])}
    allowed = {"Taxon_ID": set(["taxon:4932"])}
    # The with-block closes the input file even if parsing fails.
    with open(sys.argv[1]) as gaf_handle:
        for inrec in gafiterator(gaf_handle):
            if record_has(inrec, allowed) and not record_has(inrec, banned):
                writerec(inrec, sys.stdout, GAF10FIELDS)