Package Bio :: Package PopGen :: Package GenePop :: Module LargeFileParser
[hide private]
[frames | no frames]

Source Code for Module Bio.PopGen.GenePop.LargeFileParser

  1  # Copyright 2010 by Tiago Antao.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  Large file parsing of Genepop files 
  8   
  9  The standard parser loads the whole file into memory. This parser 
 10  provides an iterator over data. 
 11   
 12  Classes: 
 13  Record                Holds GenePop data. 
 14   
 15  Functions: 
 16  read             Parses a GenePop record (file) into a Record object. 
 17   
 18  """ 
 19   
 20   
def get_indiv(line):
    """Parse a single individual line of a GenePop file.

    The line must contain exactly one comma: the text before it is the
    individual name (returned untouched, whitespace included) and the
    text after it is the list of genotypes, separated by spaces and/or
    tabs.

    The number of digits per allele is deduced from the first genotype
    only: a genotype that is 2 or 4 characters long means 2 digits per
    allele, anything else means 3.

    Returns a tuple ``(indiv_name, allele_list, marker_len)`` where
    ``allele_list`` holds one tuple of ints per locus: two values for
    diploid data, one value for haploid data.

    Raises ValueError if the line does not contain exactly one comma or
    if a genotype cannot be converted to integers.
    """
    indiv_name, genotype_text = line.split(',')
    # Tabs and spaces are both accepted as separators; empty tokens
    # produced by repeated separators are discarded.
    tokens = [tok for tok in genotype_text.replace('\t', ' ').split(' ')
              if tok]
    marker_len = 2 if len(tokens[0]) in (2, 4) else 3
    try:
        allele_list = [(int(tok[:marker_len]), int(tok[marker_len:]))
                       for tok in tokens]
    except ValueError:
        # Haploid data: there is no second allele, so converting the
        # (empty) remainder fails. Keep only the first allele.
        allele_list = [(int(tok[:marker_len]),) for tok in tokens]
    return indiv_name, allele_list, marker_len
37 38
def read(handle):
    """Parse the header of a GenePop file and return a streaming Record.

    handle is a file-like object (it must provide ``readline``) that
    contains a GenePop record.

    Only the header is consumed here: the comment line, the loci names,
    the first ``POP`` marker and the first individual line (needed to
    find out how many digits encode an allele). The ``POP`` marker and
    that individual line are pushed onto ``record.stack`` so that
    ``Record.data_generator`` can replay them before reading the rest
    of the handle.
    """
    record = Record(handle)
    record.comment_line = str(handle.readline()).rstrip()

    # Loci may come one per line, or all on a single line separated by
    # either a space or a comma followed by a space. Commas are simply
    # dropped from the first loci line before splitting on spaces.
    first_loci_line = str(handle.readline()).rstrip()
    record.loci_list.extend(first_loci_line.replace(',', '').split(' '))

    # Every following line is one more locus name, until the first POP
    # marker (case-insensitive) or the end of the handle.
    for raw_line in iter(handle.readline, ""):
        stripped = raw_line.rstrip()
        if stripped.upper() == "POP":
            record.stack.append("POP")
            break
        record.loci_list.append(stripped)

    # Peek at the first individual to learn the marker length, then
    # keep the line so it is not lost to the data generator.
    first_indiv_line = handle.readline().rstrip()
    _, _, record.marker_len = get_indiv(first_indiv_line)
    record.stack.append(first_indiv_line)
    return record
64 65
class Record(object):
    """Holds information from a GenePop record.

    Members:
    marker_len         The marker length (2 or 3 digit code per allele).

    comment_line       Comment line.

    loci_list          List of loci names.

    stack              Lines already consumed from the handle that
                       data_generator must replay before reading on.

    data_generator     Iterates over population data.

    The generator will only work once. If you want to read a handle
    twice you have to re-open it!

    data_generator yields either () - an empty tuple - marking a new
    population, or an individual. An individual is something like
    ('Ind1', [(1,1), (3,None), (200,201)])
    In the case above the individual is called Ind1 and
    has three diploid loci. For the second locus, one of the alleles
    is unknown (allele code 0 in the file is reported as None).

    """

    def __init__(self, handle):
        """Create an empty record bound to *handle*.

        handle is any iterable of lines (typically an open file).
        """
        self.handle = handle
        self.marker_len = 0
        self.comment_line = ""
        self.loci_list = []
        self.populations = []
        self.stack = []

    def data_generator(self):
        """Yield population markers and individuals, one at a time.

        Lines stored in ``self.stack`` are processed first, then the
        remaining lines of ``self.handle``. For each line, after
        stripping trailing whitespace:

        - a line equal to ``POP`` (case-insensitive) yields ``()``;
        - any other line is parsed with ``get_indiv`` and yields
          ``(indiv_name, clean_list)`` where ``clean_list`` holds one
          tuple per locus and allele code 0 is replaced by ``None``.

        The generator simply returns when the input is exhausted.
        """
        for source in (self.stack, self.handle):
            for line in source:
                line = line.rstrip()
                if line.upper() == 'POP':
                    yield ()
                    continue
                indiv_name, allele_list, _marker_len = get_indiv(line)
                # Allele code 0 means "missing data" and is exposed as None.
                clean_list = [
                    tuple(None if allele == 0 else allele
                          for allele in locus)
                    for locus in allele_list
                ]
                yield indiv_name, clean_list
        # No explicit ``raise StopIteration()``: since PEP 479 that is
        # turned into RuntimeError inside a generator. Falling off the
        # end is the correct way to finish.
116