Package Bio :: Package Restriction :: Module Restriction
[hide private]
[frames | no frames]

Source Code for Module Bio.Restriction.Restriction

   1  #!/usr/bin/env python 
   2  # 
   3  #      Restriction Analysis Libraries. 
   4  #      Copyright (C) 2004. Frederic Sohm. 
   5  # 
   6  # This code is part of the Biopython distribution and governed by its 
   7  # license.  Please see the LICENSE file that should have been included 
   8  # as part of this package. 
   9  # 
  10   
   11  """ Notes about the various classes of the restriction enzyme implementation. 
  12   
  13          RestrictionType is the type of all restriction enzymes. 
  14      ---------------------------------------------------------------------------- 
  15          AbstractCut implements some methods that are common to all enzymes. 
  16      ---------------------------------------------------------------------------- 
  17          NoCut, OneCut,TwoCuts   represent the number of double strand cuts 
  18                                  produced by the enzyme. 
  19                                  they correspond to the 4th field of the rebase 
  20                                  record emboss_e.NNN. 
  21                  0->NoCut    : the enzyme is not characterised. 
   22                  2->OneCut   : the enzyme produces one double strand cut. 
  23                  4->TwoCuts  : two double strand cuts. 
  24      ---------------------------------------------------------------------------- 
  25          Meth_Dep, Meth_Undep    represent the methylation susceptibility to 
  26                                  the enzyme. 
  27                                  Not implemented yet. 
  28      ---------------------------------------------------------------------------- 
  29          Palindromic,            if the site is palindromic or not. 
  30          NotPalindromic          allow some optimisations of the code. 
  31                                  No need to check the reverse strand 
  32                                  with palindromic sites. 
  33      ----------------------------------------------------------------------------                                     
  34          Unknown, Blunt,         represent the overhang. 
   35          Ov5, Ov3                Unknown is here for symmetry reasons and 
   36                                  corresponds to enzymes that are not characterised 
  37                                  in rebase. 
  38      ---------------------------------------------------------------------------- 
  39          Defined, Ambiguous,     represent the sequence of the overhang. 
  40          NotDefined              
  41                                  NotDefined is for enzymes not characterised in 
  42                                  rebase. 
  43                                   
   44                                  Defined corresponds to enzymes that display a 
  45                                  constant overhang whatever the sequence. 
  46                                  ex : EcoRI. G^AATTC -> overhang :AATT 
  47                                              CTTAA^G 
  48   
  49                                  Ambiguous : the overhang varies with the 
  50                                  sequence restricted. 
  51                                  Typically enzymes which cut outside their 
  52                                  restriction site or (but not always) 
  53                                  inside an ambiguous site. 
  54                                  ex: 
  55                                  AcuI CTGAAG(22/20)  -> overhang : NN 
  56                                  AasI GACNNN^NNNGTC  -> overhang : NN 
  57                                       CTGN^NNNNNCAG 
  58   
   59              note : these 3 classes refer to the overhang not the site. 
  60                 So the enzyme ApoI (RAATTY) is defined even if its restriction 
  61                 site is ambiguous. 
  62                                   
  63                      ApoI R^AATTY -> overhang : AATT -> Defined 
  64                           YTTAA^R 
  65                 Accordingly, blunt enzymes are always Defined even 
  66                 when they cut outside their restriction site. 
  67      ---------------------------------------------------------------------------- 
  68          Not_available,          as found in rebase file emboss_r.NNN files. 
  69          Commercially_available 
  70                                  allow the selection of the enzymes according to 
  71                                  their suppliers to reduce the quantity 
  72                                  of results. 
  73                                  Also will allow the implementation of buffer 
  74                                  compatibility tables. Not implemented yet. 
  75   
  76                                  the list of suppliers is extracted from 
  77                                  emboss_s.NNN 
  78      ---------------------------------------------------------------------------- 
  79          """ 
  80   
  81  import re 
  82  import itertools 
  83   
  84  from Bio.Seq import Seq, MutableSeq 
  85  from Bio.Alphabet import IUPAC 
  86   
  87  from Bio.Restriction.Restriction_Dictionary import rest_dict as enzymedict 
  88  from Bio.Restriction.Restriction_Dictionary import typedict 
  89  from Bio.Restriction.Restriction_Dictionary import suppliers as suppliers_dict 
  90  from Bio.Restriction.RanaConfig import * 
  91  from Bio.Restriction.PrintFormat import PrintFormat 
  92   
  93  #Used to use Bio.Restriction.DNAUtils.check_bases (and expose it under this 
  94  #namespace), but have deprecated that module. 
def _check_bases(seq_string):
    """Clean up and validate a DNA string (PRIVATE).

    White space and the digits 0-9 are stripped out and the remaining
    letters are upper-cased.  What is left must consist only of IUPAC
    (possibly ambiguous) DNA letters, i.e. ABCDGHKMNRSTVWY.

    Any other character (e.g. a symbol) raises a TypeError.

    The cleaned string is returned WITH A LEADING SPACE (!).  This is kept
    for backwards compatibility: Bio.Restriction counts bases from one
    rather than from zero, and the padding makes index 1 the first base.
    """
    # Splitting on white space and re-joining drops blanks, tabs and newlines.
    without_blanks = "".join(seq_string.split()).upper()
    # Digits typically come from numbered sequence listings; drop them.
    cleaned = "".join(ch for ch in without_blanks if ch not in "0123456789")
    # Anything left over that is not an IUPAC DNA letter is an error.
    unexpected = set(cleaned) - set("ABCDGHKMNRSTVWY")
    if unexpected:
        raise TypeError("Invalid character found in %s" % repr(cleaned))
    return " " + cleaned
# IUPAC nucleotide compatibility table: each value lists every IUPAC code
# that shares at least one concrete base with the key (e.g. 'A' is compatible
# with R = A/G, W = A/T, ... and N).
matching = {
    'A': 'ARWMHVDN',
    'C': 'CYSMHBVN',
    'G': 'GRSKBVDN',
    'T': 'TYWKHBDN',
    'R': 'ABDGHKMNSRWV',
    'Y': 'CBDHKMNSTWVY',
    'W': 'ABDHKMNRTWVY',
    'S': 'CBDGHKMNSRVY',
    'M': 'ACBDHMNSRWVY',
    'K': 'BDGHKNSRTWVY',
    'H': 'ACBDHKMNSRTWVY',
    'B': 'CBDGHKMNSRTWVY',
    'V': 'ACBDGHKMNSRWVY',
    'D': 'ABDGHKMNSRTWVY',
    'N': 'ACBDGHKMNSRTWVY',
}

# Alias of Bio.Seq.Seq exposed under the name DNA.
DNA = Seq
class FormattedSeq(object):
    """FormattedSeq(seq, [linear=True])-> new FormattedSeq.

    Translate a Bio.Seq into a formatted sequence to be used with Restriction.

    Roughly:
    white space and digits are removed, the letters are upper-cased and
    checked against the IUPAC DNA alphabet (see _check_bases), and a space
    is added in front of the sequence to get a biological index instead of
    a python index (i.e. index of the first base is 1 not 0).

    Retains information about the shape of the molecule: linear (default)
    or circular. Restriction sites are searched over the edges of circular
    sequences.

    Attributes set by the constructor:
     - data     -- the cleaned, upper-case string with its leading space.
     - lower    -- True if the original sequence was all lower case.
     - linear   -- True for a linear molecule, False for a circular one.
     - klass    -- class of the original sequence (used to build slices).
     - alphabet -- alphabet of the original sequence."""

    def __init__(self, seq, linear=True):
        """FormattedSeq(seq, [linear=True])-> new FormattedSeq.

        seq is either a Bio.Seq, Bio.MutableSeq or a FormattedSeq.
        if seq is a FormattedSeq, linear will have no effect on the
        shape of the sequence.

        Raises TypeError for any other kind of seq."""
        if isinstance(seq, FormattedSeq):
            # Copy constructor: every attribute, including the topology,
            # is taken from the source object.
            self.lower = seq.lower
            self.data = seq.data
            self.linear = seq.linear
            self.alphabet = seq.alphabet
            self.klass = seq.klass
        elif isinstance(seq, (Seq, MutableSeq)):
            stringy = seq.tostring()
            self.lower = stringy.islower()
            # Note this adds a leading space to the sequence (!)
            self.data = _check_bases(stringy)
            self.linear = linear
            self.klass = seq.__class__
            self.alphabet = seq.alphabet
        else:
            raise TypeError('expected Seq or MutableSeq, got %s' % type(seq))

    def __len__(self):
        """Number of bases; the leading padding space is not counted."""
        return len(self.data) - 1

    def __repr__(self):
        """Return 'FormattedSeq(<repr of the sequence>, linear=<bool>)'."""
        return 'FormattedSeq(%s, linear=%s)' % (repr(self[1:]),
                                                repr(self.linear))

    def __eq__(self, other):
        """FS == other -> bool.

        True if other is a FormattedSeq holding the same sequence, case,
        topology, sequence class and alphabet.

        The attributes are compared directly instead of comparing repr()
        strings: the repr of the wrapped Bio.Seq classes abbreviates long
        sequences, so two different long sequences could look identical."""
        if not isinstance(other, FormattedSeq):
            return False
        return (self.data == other.data
                and self.lower == other.lower
                and self.linear == other.linear
                and self.klass is other.klass
                # Alphabets are compared through their repr, which is how
                # they used to take part in the comparison.
                and repr(self.alphabet) == repr(other.alphabet))

    def __ne__(self, other):
        """FS != other -> bool. Strict inverse of ==.

        Needed explicitly because Python 2 does not derive != from __eq__."""
        return not self == other

    def circularise(self):
        """FS.circularise() -> circularise FS (in place)."""
        self.linear = False
        return

    def linearise(self):
        """FS.linearise() -> linearise FS (in place)."""
        self.linear = True
        return

    def to_linear(self):
        """FS.to_linear() -> new linear FS instance"""
        new = self.__class__(self)
        new.linear = True
        return new

    def to_circular(self):
        """FS.to_circular() -> new circular FS instance"""
        new = self.__class__(self)
        new.linear = False
        return new

    def is_linear(self):
        """FS.is_linear() -> bool.

        True if the sequence will be analysed as a linear sequence."""
        return self.linear

    def finditer(self, pattern, size):
        """FS.finditer(pattern, size) -> list.

        return a list of the matches of pattern in the sequence.
        the list is made of tuples (location, pattern.group).
        the latter is used with non palindromic sites.
        pattern is the regular expression pattern corresponding to the
        enzyme restriction site.
        size is the size of the restriction enzyme recognition-site."""
        if self.is_linear():
            data = self.data
        else:
            # Append the first size-1 bases so that sites spanning the
            # origin of a circular molecule are found as well.
            data = self.data + self.data[1:size]
        return [(i.start(), i.group) for i in re.finditer(pattern, data)]

    def __getitem__(self, i):
        """Return self.data[i] wrapped in the original sequence class.

        The index applies to the padded data, so index 1 is the first base.
        The original lower case is restored when applicable."""
        if self.lower:
            return self.klass((self.data[i]).lower(), self.alphabet)
        return self.klass(self.data[i], self.alphabet)
224 225
226 -class RestrictionType(type):
227 """RestrictionType. Type from which derives all enzyme classes. 228 229 Implement the operator methods.""" 230
231 - def __init__(cls, name='', bases=(), dct={}):
232 """RE(name, bases, dct) -> RestrictionType instance. 233 234 Not intended to be used in normal operation. The enzymes are 235 instantiated when importing the module. 236 237 see below.""" 238 if "-" in name : 239 raise ValueError("Problem with hyphen in %s as enzyme name" \ 240 % repr(name)) 241 super(RestrictionType, cls).__init__(cls, name, bases, dct) 242 try : 243 cls.compsite = re.compile(cls.compsite) 244 except Exception, err : 245 raise ValueError("Problem with regular expression, re.compiled(%s)" \ 246 % repr(cls.compsite))
247
248 - def __add__(cls, other):
249 """RE.__add__(other) -> RestrictionBatch(). 250 251 if other is an enzyme returns a batch of the two enzymes. 252 if other is already a RestrictionBatch add enzyme to it.""" 253 if isinstance(other, RestrictionType): 254 return RestrictionBatch([cls, other]) 255 elif isinstance(other, RestrictionBatch): 256 return other.add_nocheck(cls) 257 else: 258 raise TypeError
259
260 - def __div__(cls, other):
261 """RE.__div__(other) -> list. 262 263 RE/other 264 returns RE.search(other).""" 265 return cls.search(other)
266
267 - def __rdiv__(cls, other):
268 """RE.__rdiv__(other) -> list. 269 270 other/RE 271 returns RE.search(other).""" 272 return cls.search(other)
273
274 - def __truediv__(cls, other):
275 """RE.__truediv__(other) -> list. 276 277 RE/other 278 returns RE.search(other).""" 279 return cls.search(other)
280
281 - def __rtruediv__(cls, other):
282 """RE.__rtruediv__(other) -> list. 283 284 other/RE 285 returns RE.search(other).""" 286 return cls.search(other)
287
288 - def __floordiv__(cls, other):
289 """RE.__floordiv__(other) -> list. 290 291 RE//other 292 returns RE.catalyse(other).""" 293 return cls.catalyse(other)
294
295 - def __rfloordiv__(cls, other):
296 """RE.__rfloordiv__(other) -> list. 297 298 other//RE 299 returns RE.catalyse(other).""" 300 return cls.catalyse(other)
301
302 - def __str__(cls):
303 """RE.__str__() -> str. 304 305 return the name of the enzyme.""" 306 return cls.__name__
307
308 - def __repr__(cls):
309 """RE.__repr__() -> str. 310 311 used with eval or exec will instantiate the enzyme.""" 312 return "%s" % cls.__name__
313
314 - def __len__(cls):
315 """RE.__len__() -> int. 316 317 length of the recognition site.""" 318 return cls.size
319
320 - def __hash__(cls):
321 #Python default is to use id(...) 322 #This is consistent with the __eq__ implementation 323 return id(cls)
324
325 - def __eq__(cls, other):
326 """RE == other -> bool 327 328 True if RE and other are the same enzyme. 329 330 Specifically this checks they are the same Python object. 331 """ 332 #assert (id(cls)==id(other)) == (other is cls) == (cls is other) 333 return id(cls)==id(other)
334
335 - def __ne__(cls, other):
336 """RE != other -> bool. 337 isoschizomer strict, same recognition site, same restriction -> False 338 all the other-> True 339 340 WARNING - This is not the inverse of the __eq__ method. 341 """ 342 if not isinstance(other, RestrictionType): 343 return True 344 elif cls.