1
2
3
4
5
6
7 import math
8
9
def lcc_mult(seq, wsize):
    """Local Composition Complexity (LCC) values over sliding window.

    Returns a list whose first element is a placeholder 0, followed by one
    LCC value per window position (the window starting at offset 0 comes
    first).  The placeholder is kept so that the layout of the returned
    list stays the same for existing callers.

    seq - an unambiguous DNA sequence (a string or Seq object); letter
          case is ignored
    wsize - window size, positive integer

    Each window value is -sum(p * log2(p)) over the bases A, C, T and G,
    where p is the number of times the base occurs in the window divided
    by wsize.  This is the same quantity lcc_simp computes for a single
    sequence, but this version is optimized for speed: when the window
    slides by one position only the terms of the base leaving and the
    base entering the window are recomputed.  Characters other than
    A, C, T and G contribute nothing to the value.

    Raises ValueError if wsize is smaller than 1.
    """
    if wsize < 1:
        raise ValueError("wsize must be a positive integer")
    l2 = math.log(2)
    tamseq = len(seq)
    try:
        # Works for plain strings and anything else offering upper().
        upper = seq.upper()
    except AttributeError:
        # Otherwise fall back to the object's string form.
        upper = str(seq).upper()

    # compone[k] is the term (k/wsize) * log2(k/wsize) contributed by a base
    # seen k times in a window; compone[0] is 0 so absent bases add nothing.
    compone = [0]
    for k in range(1, wsize + 1):
        frac = k / float(wsize)
        compone.append(frac * (math.log(frac) / l2))

    # Count the first window on the upper-cased sequence so that letter case
    # does not affect the result.
    window = upper[0:wsize]
    counts = {}
    terms = {}
    for base in "ACTG":
        counts[base] = window.count(base)
        terms[base] = compone[counts[base]]

    current = -(terms["A"] + terms["C"] + terms["T"] + terms["G"])
    lccsal = [0, current]
    for x in range(tamseq - wsize):
        leaving = upper[x]
        entering = upper[x + wsize]
        # If the same letter leaves and enters, the composition (and hence
        # the value) is unchanged.
        if leaving != entering:
            for base, step in ((leaving, -1), (entering, 1)):
                if base in counts:
                    counts[base] += step
                    terms[base] = compone[counts[base]]
            current = -(terms["A"] + terms["C"] + terms["T"] + terms["G"])
        lccsal.append(current)
    return lccsal
120
121
def lcc_simp(seq):
    """Local Composition Complexity (LCC) for a sequence.

    seq - an unambiguous DNA sequence (a string or Seq object); letter
          case is ignored

    Returns the Local Composition Complexity (LCC) value for the entire
    sequence: -sum(p * log2(p)) over the bases A, C, T and G, where p is
    the number of occurrences of the base divided by len(seq).  Bases
    that do not occur contribute nothing, so an empty sequence gives 0.

    Reference:
    Andrzej K Konopka (2005) Sequence Complexity and Composition
    DOI: 10.1038/npg.els.0005260
    """
    wsize = len(seq)
    try:
        # Works for plain strings and anything else offering upper().
        upper = seq.upper()
    except AttributeError:
        # Otherwise fall back to the object's string form.
        upper = str(seq).upper()
    l2 = math.log(2)
    terms = []
    for base in "ACTG":
        # Count on the upper-cased copy so lower-case input is handled too.
        count = upper.count(base)
        if count:
            frac = count / float(wsize)
            terms.append(frac * (math.log(frac) / l2))
        else:
            # Skipping absent bases avoids log(0) and division by zero for
            # an empty sequence.
            terms.append(0)
    return -sum(terms)
164