Package Bio :: Package SeqUtils :: Module lcc
[hide private]
[frames | no frames]

Source Code for Module Bio.SeqUtils.lcc

  1  # Copyright 2003, 2007 by Sebastian Bassi. sbassi@genesdigitales.com 
  2  # All rights reserved.  This code is part of the Biopython 
  3  # distribution and governed by its license. 
  4  # Please see the LICENSE file that should have been included as part 
  5  # of this package. 
  6   
  7  import math 
  8   
  9   
def lcc_mult(seq, wsize):
    """Local Composition Complexity (LCC) values over sliding window.

    Returns a list whose first element is a placeholder 0, followed by
    one LCC value (float) per window position along the sequence.

    seq - an unambiguous DNA sequence (a string or Seq object); the
          comparison is case-insensitive
    wsize - window size, a positive integer

    Each window value matches what lcc_simp returns for that window, but
    this version is optimized for speed: instead of recounting every
    window, it updates the base counts of the previous window with the
    one character that leaves and the one that enters.

    Characters other than A, C, G and T are not counted, but every window
    position still contributes exactly one value to the result.
    """
    l2 = math.log(2)
    tamseq = len(seq)
    try:
        # Assume it's a string.
        upper = seq.upper()
    except AttributeError:
        # Should be a Seq object then.
        upper = str(seq).upper()

    # compone[k] is the term (k/wsize) * log2(k/wsize) for a base seen k
    # times in a window; compone[0] is 0 to avoid taking the log of 0.
    compone = [0]
    for i in range(wsize):
        freq = (i + 1) / float(wsize)
        compone.append(freq * (math.log(freq) / l2))

    # Count the first window on the upper-cased sequence so that lower or
    # mixed case input is handled the same way as in the sliding loop.
    first = upper[0:wsize]
    counts = {
        'A': first.count('A'),
        'C': first.count('C'),
        'T': first.count('T'),
        'G': first.count('G'),
    }

    def window_lcc():
        # Same summation order (A, C, T, G) for every window.
        return -(compone[counts['A']] + compone[counts['C']] +
                 compone[counts['T']] + compone[counts['G']])

    # The leading 0 is kept so the returned list has the same layout
    # callers of this function have always received.
    lccsal = [0, window_lcc()]

    for x in range(tamseq - wsize):
        outgoing = upper[x]          # first character of the previous window
        incoming = upper[x + wsize]  # last character of the new window
        if outgoing == incoming:
            # Composition is unchanged, so reuse the previous value.
            lccsal.append(lccsal[-1])
            continue
        if outgoing in counts:
            counts[outgoing] -= 1
        if incoming in counts:
            counts[incoming] += 1
        lccsal.append(window_lcc())
    return lccsal
120 121
def lcc_simp(seq):
    """Local Composition Complexity (LCC) for a sequence.

    seq - an unambiguous DNA sequence (a string or Seq object); the
          comparison is case-insensitive

    Returns the Local Composition Complexity (LCC) value for the entire
    sequence: minus the sum over A, C, T and G of f * log2(f), where f
    is the count of that base divided by the sequence length. The result
    is a float, or the integer 0 when none of the four bases occurs
    (including for an empty sequence).

    Reference:
    Andrzej K Konopka (2005) Sequence Complexity and Composition
    DOI: 10.1038/npg.els.0005260
    """
    wsize = len(seq)
    try:
        # Assume it's a string.
        upper = seq.upper()
    except AttributeError:
        # Should be a Seq object then.
        upper = str(seq).upper()
    l2 = math.log(2)

    total = 0
    for base in 'ACTG':
        # Count on the upper-cased sequence so the presence test and the
        # frequency always agree, whatever the case of the input.
        count = upper.count(base)
        if not count:
            # Skip absent bases to avoid calculating the log of 0.
            continue
        freq = count / float(wsize)
        total += freq * (math.log(freq) / l2)
    return -total
164