Package Bio :: Package SeqUtils :: Module CheckSum
[hide private]
[frames | no frames]

Source Code for Module Bio.SeqUtils.CheckSum

  1  # Copyright 2002 by Yves Bastide and Brad Chapman. 
  2  # Copyright 2007 by Sebastian Bassi 
  3  # All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """Functions to calculate assorted sequence checksums.""" 
  9   
 10  # crc32, crc64, gcg, and seguid 
 11  # crc64 is adapted from BioPerl 
 12   
 13  from binascii import crc32 as _crc32 
 14  from Bio._py3k import _as_bytes 
 15   
 16   
def crc32(seq):
    """Return the crc32 checksum for a sequence (string or Seq object).

    The argument is turned into text with ``str()``, passed through the
    imported ``_as_bytes`` helper and handed to ``binascii.crc32``.

    NOTE - On Python 2 this returns a signed int, on Python 3 it is
    unsigned.  The Python docs suggest using ``crc32(x) & 0xffffffff`` for
    a value that is consistent across versions; the mask is not applied
    here so that existing return values stay unchanged.
    TODO - Should we return crc32(x) & 0xffffffff here?
    """
    # str() accepts plain strings as well as Seq objects and does not raise
    # AttributeError for either, so a separate "plain string" fallback
    # branch is unnecessary.
    return _crc32(_as_bytes(str(seq)))
28 29
30 -def _init_table_h():
31 _table_h = [] 32 for i in range(256): 33 l = i 34 part_h = 0 35 for j in range(8): 36 rflag = l & 1 37 l >>= 1 38 if part_h & 1: 39 l |= (1L << 31) 40 part_h >>= 1L 41 if rflag: 42 part_h ^= 0xd8000000L 43 _table_h.append(part_h) 44 return _table_h
45 46 # Initialisation 47 _table_h = _init_table_h() 48 49
def crc64(s):
    """Return the crc64 checksum for a sequence (string or Seq object).

    The 64-bit state is kept as two 32-bit halves.  For every character
    the state is shifted right by one byte and the high half is XOR-ed
    with the ``_table_h`` entry selected by the low byte of the state
    combined with the character code.

    The result is the string "CRC-" followed by 16 upper-case hex digits
    (high half first, then low half).
    """
    high = low = 0
    for char in s:
        # The table index must be taken from the low half *before* shifting.
        table_index = (low ^ ord(char)) & 0xFF
        # Shift the 64-bit state right by 8: the low byte of the high half
        # becomes the top byte of the low half.
        low = (low >> 8) | ((high & 0xFF) << 24)
        high = (high >> 8) ^ _table_h[table_index]

    return "CRC-%08X%08X" % (high, low)
63 64
def gcg(seq):
    """Return the GCG checksum (int) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    returns the GCG checksum (int). Checksum used by GCG program.
    seq type = str.
    Based on BioPerl GCG_checksum. Adapted by Sebastian Bassi
    with the help of John Lenton, Pablo Ziliani, and Gabriel Genellina.
    All sequences are converted to uppercase.
    """
    # str() accepts plain strings as well as Seq objects and does not raise
    # AttributeError for either, so no fallback branch is needed.
    seq = str(seq)
    # Each character code is weighted by its 1-based position, with the
    # weight cycling through 1..57 (it restarts at 1 after every 57th
    # character).
    checksum = sum(
        (position % 57 + 1) * ord(char.upper())
        for position, char in enumerate(seq)
    )
    return checksum % 10000
87 88
def seguid(seq):
    """Return the SEGUID (string) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    returns the SEGUID string (A SEquence Globally Unique IDentifier).
    seq type = str.

    The SEGUID is computed here as the SHA-1 digest of the upper-cased
    sequence, base64 encoded, with newlines and trailing "=" padding
    removed.

    For more information about SEGUID, see:
    http://bioinformatics.anl.gov/seguid/
    DOI: 10.1002/pmic.200600032
    """
    import hashlib
    import base64
    m = hashlib.sha1()
    # str() accepts plain strings as well as Seq objects and does not raise
    # AttributeError for either, so no fallback branch is needed.
    m.update(_as_bytes(str(seq).upper()))
    # b64encode never inserts newlines, so there is nothing to strip but
    # the padding.  On Python 3 it returns bytes, which are decoded to text;
    # on Python 2 it already returns a str and is left as is.
    encoded = base64.b64encode(m.digest())
    if not isinstance(encoded, str):
        encoded = encoded.decode()
    return encoded.rstrip("=")
if __name__ == "__main__":
    # Quick self test, run only when the module is executed as a script.
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Quick self test")

    str_light_chain_one = (
        "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK"
        "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY"
        "YCSSYAGSSTLVFGGGTKLTVL"
    )

    str_light_chain_two = (
        "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK"
        "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY"
        "YCCSYAGSSTWVFGGGTKLTVL"
    )

    # The two sequences differ, yet they are expected to yield the same
    # CRC64 string ...
    assert crc64(str_light_chain_one) == crc64(str_light_chain_two)
    assert 'CRC-44CAAD88706CC153' == crc64(str_light_chain_one)

    # ... while their SEGUIDs are expected to be distinct.
    assert 'BpBeDdcNUYNsdk46JoJdw7Pd3BI' == seguid(str_light_chain_one)
    assert 'X5XEaayob1nZLOc7eVT9qyczarY' == seguid(str_light_chain_two)

    print("Done")