1
2
3
4
5
6 """
7 This class provides code to parse BIG GenePop files.
8
9 The difference between this class and the standard Bio.PopGen.GenePop.Record
10 class is that this one does not read the whole file to memory.
11 It provides an iterator interface, slower but consuming much less memory.
12 Should be used with big files (Thousands of markers and individuals).
13
14 See http://wbiomed.curtin.edu.au/genepop/ , the format is documented
15 here: http://wbiomed.curtin.edu.au/genepop/help_input.html .
16
17 Classes:
18 FileRecord Holds GenePop data.
19
20 Functions:
21
22
23 """
24 from Bio.PopGen.GenePop import get_indiv
25
26
def read(fname):
    """Parses a file containing a GenePop file.

    fname is a file name that contains a GenePop record.

    Returns a FileRecord for that file. Constructing the record opens the
    file and reads its header (comment line and loci names), so the
    returned object is ready for get_individual() calls.
    """
    return FileRecord(fname)
34
35
class FileRecord(object):
    """Holds information from a GenePop record, reading individuals lazily.

    Members:
    marker_len         The marker length (2 or 3 digit code per allele).
                       NOTE(review): documented historically, but nothing in
                       this class sets it; textual output always uses 3 digits.

    comment_line       Comment line.

    loci_list          List of loci names.

    fname              Name of the file being read.

    current_pop        Index (0 based) of the population being read.

    current_ind        Number of individuals already read in that population.

    Functions:
    get_individual     Returns the next individual of the current population.

    skip_population    Skips the current population.

    close              Closes the underlying file handle.

    skip_population skips the individuals of the current population, returns
    True if there are more populations.

    get_individual returns an individual of the current population, True
    when a new population starts, or False at the end of the file.
    Each individual is a pair composed by individual
    name and a list of alleles (2 per marker or 1 for haploid data).
    Examples
    ('Ind1', [(1,2), (3,3), (200,201)])
    ('Ind2', [(2,None), (3,3), (None,None)])
    ('Other1', [(1,1), (4,3), (200,200)])

    """

    def __init__(self, fname):
        """Opens fname and parses its header (comment line and loci names)."""
        self.comment_line = ""
        self.loci_list = []
        self.fname = fname
        self.start_read()

    def __str__(self):
        """Returns (reconstructs) a GenePop textual representation.

        This might take a lot of memory.
        Marker length will be 3.

        The file is re-read from the start; the previous reading position
        (population and individual) is restored afterwards, even if parsing
        an individual fails.
        """
        marker_len = 3
        rep = [self.comment_line + '\n']
        rep.append('\n'.join(self.loci_list) + '\n')
        current_pop = self.current_pop
        current_ind = self.current_ind
        try:
            self._handle.seek(0)
            self.skip_header()
            rep.append('Pop\n')
            while True:
                res = self.get_individual()
                if res is True:
                    rep.append('Pop\n')
                elif res is False:
                    break
                else:
                    name, markers = res
                    rep.append(
                        self._format_individual(name, markers, marker_len))
        finally:
            self.seek_position(current_pop, current_ind)
        return "".join(rep)

    @staticmethod
    def _format_individual(name, markers, marker_len=3, skip=()):
        """Returns one individual as a GenePop text line (newline included).

        name       - individual name
        markers    - list of allele tuples, None meaning missing data
        marker_len - digits per allele (left padded with zeros)
        skip       - marker positions (0 based) to leave out
        """
        parts = [name, ',']
        for marker_pos, marker in enumerate(markers):
            if marker_pos in skip:
                continue
            parts.append(' ')
            for al in marker:
                # Missing alleles are written as zeros.
                al_str = '0' if al is None else str(al)
                parts.append(al_str.rjust(marker_len, '0'))
        parts.append('\n')
        return "".join(parts)

    def close(self):
        """Closes the underlying file handle (safe to call repeatedly)."""
        handle = getattr(self, "_handle", None)
        if handle is not None:
            handle.close()

    def start_read(self):
        """Starts parsing a file containing a GenePop file.

        Reads the comment line and the loci names, leaving the handle just
        after the first POP line.

        Raises ValueError if no POP line is found.
        """
        self._handle = open(self.fname)
        self.comment_line = self._handle.readline().rstrip()
        # The first loci line may hold several names separated by spaces
        # (or comma + space); commas are simply dropped before splitting.
        sample_loci_line = self._handle.readline().rstrip().replace(',', '')
        all_loci = sample_loci_line.split(' ')
        self.loci_list.extend(all_loci)
        for line in self._handle:
            line = line.rstrip()
            if line.upper() == 'POP':
                break
            self.loci_list.append(line)
        else:
            # Do not leak the handle when the file is rejected.
            self._handle.close()
            raise ValueError('No population data found, file probably not GenePop related')

        self.current_pop = 0
        self.current_ind = 0

    def skip_header(self):
        """Skips the Header. To be done after a re-open."""
        self.current_pop = 0
        self.current_ind = 0
        for line in self._handle:
            if line.rstrip().upper() == "POP":
                return

    def seek_position(self, pop, indiv):
        """Seeks a certain position in the file.

        pop - pop position (0 is first)
        indiv - individual in pop
        """
        self._handle.seek(0)
        self.skip_header()
        while pop > 0:
            self.skip_population()
            pop -= 1
        while indiv > 0:
            self.get_individual()
            indiv -= 1

    def skip_population(self):
        """Skips the current population.

        Returns True if there is another pop, False at end of file.
        """
        for line in self._handle:
            if line.rstrip().upper() == 'POP':
                self.current_pop += 1
                self.current_ind = 0
                return True
        return False

    def get_individual(self):
        """Gets the next individual.

        Returns individual information if there are more individuals
        in the current population.
        Returns True if there are no more individuals in the current
        population, but there are more populations. Next read will
        be of the following pop.
        Returns False if at end of file.
        """
        line = next(self._handle, None)
        if line is None:
            return False
        line = line.rstrip()
        if line.upper() == 'POP':
            self.current_pop += 1
            self.current_ind = 0
            return True
        self.current_ind += 1
        indiv_name, allele_list, ignore = get_indiv(line)
        return (indiv_name, allele_list)

    def remove_population(self, pos, fname):
        """Removes a population (by position).

        pos - position
        fname - file to be created with population removed

        Populations without individuals are not written to the new file.
        """
        # A second, independent reader is used so that the reading position
        # of this record is not disturbed.
        old_rec = FileRecord(self.fname)
        try:
            with open(fname, "w") as f:
                f.write(self.comment_line + "\n")
                for locus in old_rec.loci_list:
                    f.write(locus + "\n")
                curr_pop = 0
                start_pop = True
                l_parser = old_rec.get_individual()
                while l_parser:
                    if l_parser is True:
                        # Population boundary: handled first so that an
                        # empty population never triggers a skip of the
                        # population that follows it.
                        curr_pop += 1
                        start_pop = True
                    elif curr_pop == pos:
                        # The first individual of the removed population
                        # was just consumed; skip the remaining ones.
                        if old_rec.skip_population():
                            curr_pop += 1
                            start_pop = True
                    else:
                        if start_pop:
                            f.write("POP\n")
                            start_pop = False
                        name, markers = l_parser
                        f.write(self._format_individual(name, markers))
                    l_parser = old_rec.get_individual()
        finally:
            old_rec.close()

    def _write_without_loci(self, positions, fname):
        """Writes a copy of the file to fname leaving out the given loci.

        positions is any iterable of locus positions; negative positions
        count from the end and duplicates are ignored. Raises IndexError,
        before fname is created, if a position is out of range.
        """
        old_rec = FileRecord(self.fname)
        try:
            num_loci = len(old_rec.loci_list)
            skip = set()
            for pos in positions:
                if not -num_loci <= pos < num_loci:
                    raise IndexError(
                        "locus position out of range: %r" % (pos,))
                # Normalise so header and marker columns use the same index.
                skip.add(pos % num_loci)
            with open(fname, "w") as f:
                f.write(self.comment_line + "\n")
                for idx, locus in enumerate(old_rec.loci_list):
                    if idx not in skip:
                        f.write(locus + "\n")
                f.write("POP\n")
                l_parser = old_rec.get_individual()
                while l_parser:
                    if l_parser is True:
                        f.write("POP\n")
                    else:
                        name, markers = l_parser
                        f.write(self._format_individual(name, markers,
                                                        skip=skip))
                    l_parser = old_rec.get_individual()
        finally:
            old_rec.close()

    def remove_locus_by_position(self, pos, fname):
        """Removes a locus by position.

        pos - position
        fname - file to be created with locus removed
        """
        self._write_without_loci([pos], fname)

    def remove_loci_by_position(self, positions, fname):
        """Removes a set of loci by position.

        positions - positions (left unmodified)
        fname - file to be created with locus removed
        """
        self._write_without_loci(positions, fname)

    def remove_locus_by_name(self, name, fname):
        """Removes a locus by name.

        name - name
        fname - file to be created with locus removed

        Only the first locus with that name is removed. If no locus has
        that name nothing is done (fname is not created).
        """
        for i, locus in enumerate(self.loci_list):
            if locus == name:
                self.remove_locus_by_position(i, fname)
                return

    def remove_loci_by_name(self, names, fname):
        """Removes a loci list (by name).

        names - names
        fname - file to be created with loci removed
        """
        positions = [i for i, locus in enumerate(self.loci_list)
                     if locus in names]
        self.remove_loci_by_position(positions, fname)
338
339
340