Package Bio :: Package Restriction :: Module Restriction
[hide private]
[frames] | no frames]

Source Code for Module Bio.Restriction.Restriction

   1  #!/usr/bin/env python 
   2  # 
   3  #      Restriction Analysis Libraries. 
   4  #      Copyright (C) 2004. Frederic Sohm. 
   5  # 
   6  # This code is part of the Biopython distribution and governed by its 
   7  # license.  Please see the LICENSE file that should have been included 
   8  # as part of this package. 
   9  # 
  10   
  11  """ Notes about the various classes of the restriction enzyme implementation. 
  12   
  13          RestrictionType is the type of all restriction enzymes. 
  14      ---------------------------------------------------------------------------- 
  15          AbstractCut implements some methods that are common to all enzymes. 
  16      ---------------------------------------------------------------------------- 
  17          NoCut, OneCut,TwoCuts   represent the number of double strand cuts 
  18                                  produced by the enzyme. 
  19                                  they correspond to the 4th field of the rebase 
  20                                  record emboss_e.NNN. 
  21                  0->NoCut    : the enzyme is not characterised. 
  22                  2->OneCut   : the enzyme produces one double strand cut. 
  23                  4->TwoCuts  : two double strand cuts. 
  24      ---------------------------------------------------------------------------- 
  25          Meth_Dep, Meth_Undep    represent the methylation susceptibility to 
  26                                  the enzyme. 
  27                                  Not implemented yet. 
  28      ---------------------------------------------------------------------------- 
  29          Palindromic,            if the site is palindromic or not. 
  30          NotPalindromic          allow some optimisations of the code. 
  31                                  No need to check the reverse strand 
  32                                  with palindromic sites. 
  33      ---------------------------------------------------------------------------- 
  34          Unknown, Blunt,         represent the overhang. 
  35          Ov5, Ov3                Unknown is here for symmetry reasons and 
  36                                  corresponds to enzymes that are not characterised 
  37                                  in rebase. 
  38      ---------------------------------------------------------------------------- 
  39          Defined, Ambiguous,     represent the sequence of the overhang. 
  40          NotDefined 
  41                                  NotDefined is for enzymes not characterised in 
  42                                  rebase. 
  43   
  44                                  Defined corresponds to enzymes that display a 
  45                                  constant overhang whatever the sequence. 
  46                                  ex : EcoRI. G^AATTC -> overhang :AATT 
  47                                              CTTAA^G 
  48   
  49                                  Ambiguous : the overhang varies with the 
  50                                  sequence restricted. 
  51                                  Typically enzymes which cut outside their 
  52                                  restriction site or (but not always) 
  53                                  inside an ambiguous site. 
  54                                  ex: 
  55                                  AcuI CTGAAG(22/20)  -> overhang : NN 
  56                                  AasI GACNNN^NNNGTC  -> overhang : NN 
  57                                       CTGN^NNNNNCAG 
  58   
  59              note : these 3 classes refer to the overhang, not the site. 
  60                 So the enzyme ApoI (RAATTY) is defined even if its restriction 
  61                 site is ambiguous. 
  62   
  63                      ApoI R^AATTY -> overhang : AATT -> Defined 
  64                           YTTAA^R 
  65                 Accordingly, blunt enzymes are always Defined even 
  66                 when they cut outside their restriction site. 
  67      ---------------------------------------------------------------------------- 
  68          Not_available,          as found in rebase file emboss_r.NNN files. 
  69          Commercially_available 
  70                                  allow the selection of the enzymes according to 
  71                                  their suppliers to reduce the quantity 
  72                                  of results. 
  73                                  Also will allow the implementation of buffer 
  74                                  compatibility tables. Not implemented yet. 
  75   
  76                                  the list of suppliers is extracted from 
  77                                  emboss_s.NNN 
  78      ---------------------------------------------------------------------------- 
  79          """ 
  80   
  81  from __future__ import print_function 
  82  from Bio._py3k import zip 
  83  from Bio._py3k import filter 
  84  from Bio._py3k import range 
  85   
  86  import re 
  87  import itertools 
  88   
  89  from Bio.Seq import Seq, MutableSeq 
  90  from Bio.Alphabet import IUPAC 
  91   
  92  from Bio.Restriction.Restriction_Dictionary import rest_dict as enzymedict 
  93  from Bio.Restriction.Restriction_Dictionary import typedict 
  94  from Bio.Restriction.Restriction_Dictionary import suppliers as suppliers_dict 
  95  # TODO: Consider removing this wildcard import. 
  96  from Bio.Restriction.RanaConfig import * 
  97  from Bio.Restriction.PrintFormat import PrintFormat 
# Used to use Bio.Restriction.DNAUtils.check_bases (and expose it under this
# namespace), but have deprecated that module.
def _check_bases(seq_string):
    """Check characters in a string (PRIVATE).

    Remove digits and white space present in string. Allows any valid
    ambiguous IUPAC DNA single letter codes (ABCDGHKMNRSTVWY, lower case
    letters are converted to upper case).

    Other characters (e.g. symbols) trigger a TypeError.

    Returns the cleaned string WITH A LEADING SPACE (!). This is for
    backwards compatibility, and may in part be explained by the fact that
    Bio.Restriction doesn't use zero based counting.
    """
    # Drop every kind of white space and normalise to upper case.
    seq_string = "".join(seq_string.split()).upper()
    # Strip the ASCII digits in one pass instead of ten chained replace()
    # calls.
    seq_string = "".join(c for c in seq_string if c not in "0123456789")
    # Whatever is left must consist of IUPAC DNA letters only.
    if not set(seq_string).issubset("ABCDGHKMNRSTVWY"):
        raise TypeError("Invalid character found in %s" % repr(seq_string))
    # The leading space shifts the first base to index 1.
    return " " + seq_string
# Lookup table keyed by IUPAC DNA letter; each value is a string of IUPAC
# letters.
# NOTE(review): presumably the codes that can pair with / overlap the key
# when comparing ambiguous sites -- confirm against the code that reads it.
matching = {
    'A': 'ARWMHVDN',
    'C': 'CYSMHBVN',
    'G': 'GRSKBVDN',
    'T': 'TYWKHBDN',
    'R': 'ABDGHKMNSRWV',
    'Y': 'CBDHKMNSTWVY',
    'W': 'ABDHKMNRTWVY',
    'S': 'CBDGHKMNSRVY',
    'M': 'ACBDHMNSRWVY',
    'K': 'BDGHKNSRTWVY',
    'H': 'ACBDHKMNSRTWVY',
    'B': 'CBDGHKMNSRTWVY',
    'V': 'ACBDGHKMNSRWVY',
    'D': 'ABDGHKMNSRTWVY',
    'N': 'ACBDGHKMNSRTWVY',
}

# Module-level alias: ``DNA`` is the same object as ``Seq``.
DNA = Seq
class FormattedSeq(object):
    """FormattedSeq(seq, [linear=True])-> new FormattedSeq.

    Translate a Bio.Seq into a formatted sequence to be used with Restriction.

    Roughly:
        remove anything which is not IUPAC alphabet and then add a space
        in front of the sequence to get a biological index instead of a
        python index (i.e. index of the first base is 1 not 0).

    Retains information about the shape of the molecule linear (default)
    or circular. Restriction sites are searched over the edges of circular
    sequences."""

    def __init__(self, seq, linear=True):
        """FormattedSeq(seq, [linear=True])-> new FormattedSeq.

        seq is either a Bio.Seq, Bio.MutableSeq or a FormattedSeq.
        If seq is a FormattedSeq, every attribute (including the shape) is
        copied from it and linear has no effect.

        Raises TypeError for any other type of seq, or (through
        _check_bases) when the sequence holds a non IUPAC character."""
        if isinstance(seq, (Seq, MutableSeq)):
            stringy = str(seq)
            # Remember the original case so __getitem__ can give it back.
            self.lower = stringy.islower()
            # Note this adds a leading space to the sequence (!)
            self.data = _check_bases(stringy)
            self.linear = linear
            self.klass = seq.__class__
            self.alphabet = seq.alphabet
        elif isinstance(seq, FormattedSeq):
            self.lower = seq.lower
            self.data = seq.data
            self.linear = seq.linear
            self.alphabet = seq.alphabet
            self.klass = seq.klass
        else:
            raise TypeError('expected Seq or MutableSeq, got %s' % type(seq))

    def __len__(self):
        """Return the number of bases (the leading space is not counted)."""
        return len(self.data) - 1

    def __repr__(self):
        """Return a debugging representation (sequence and shape)."""
        return 'FormattedSeq(%s, linear=%s)' % (repr(self[1:]),
                                                repr(self.linear))

    def __eq__(self, other):
        """FS == other -> bool.

        True if other is a FormattedSeq holding the same bases in the same
        case, wrapping the same sequence class with an identically
        represented alphabet, and having the same shape."""
        if not isinstance(other, FormattedSeq):
            return False
        # Compare the stored fields rather than repr(): the repr of the
        # wrapped sequence class need not show every base (Bio.Seq
        # abbreviates long sequences), so distinct sequences could
        # otherwise compare equal.
        return (self.data == other.data
                and self.lower == other.lower
                and self.linear == other.linear
                and self.klass is other.klass
                and repr(self.alphabet) == repr(other.alphabet))

    def __ne__(self, other):
        """FS != other -> bool. Always the inverse of FS == other."""
        # Needed explicitly for Python 2, which does not derive != from ==.
        return not self == other

    def circularise(self):
        """FS.circularise() -> circularise FS (in place)."""
        self.linear = False

    def linearise(self):
        """FS.linearise() -> linearise FS (in place)."""
        self.linear = True

    def to_linear(self):
        """FS.to_linear() -> new linear FS instance."""
        new = self.__class__(self)
        new.linear = True
        return new

    def to_circular(self):
        """FS.to_circular() -> new circular FS instance."""
        new = self.__class__(self)
        new.linear = False
        return new

    def is_linear(self):
        """FS.is_linear() -> bool.

        True if the sequence will be analysed as a linear sequence."""
        return self.linear

    def finditer(self, pattern, size):
        """FS.finditer(pattern, size) -> list.

        Return the list of the matches of pattern in the sequence.
        The list is made of tuples (location, group) where location is the
        start of the match (1-based, thanks to the leading space) and group
        is the bound ``group`` method of the match object (not its result).
        The latter is used with non palindromic sites.

        pattern is the regular expression pattern corresponding to the
        enzyme restriction site.
        size is the size of the restriction enzyme recognition site."""
        if self.is_linear():
            data = self.data
        else:
            # Append the first size - 1 bases so that sites spanning the
            # origin of a circular sequence are found too.
            data = self.data + self.data[1:size]
        return [(i.start(), i.group) for i in re.finditer(pattern, data)]

    def __getitem__(self, i):
        """Return FS[i] wrapped in the original sequence class.

        Indexing is done on the stored string, so index 0 is the leading
        space and the first base is at index 1. The original (lower) case
        is restored when the source sequence was entirely lower case."""
        if self.lower:
            return self.klass((self.data[i]).lower(), self.alphabet)
        return self.klass(self.data[i], self.alphabet)
class RestrictionType(type):
    """RestrictionType. Type from which derives all enzyme classes.

    Implement the operator methods."""

    def __init__(cls, name='', bases=(), dct=None):
        """RE(name, bases, dct) -> RestrictionType instance.

        Not intended to be used in normal operation. The enzymes are
        instantiated when importing the module.

        The class attribute ``compsite`` is replaced by its compiled
        regular expression. bases and dct are accepted for the metaclass
        protocol but are not used here.

        Raises ValueError if name contains a hyphen or if ``compsite``
        cannot be compiled."""
        if "-" in name:
            raise ValueError("Problem with hyphen in %s as enzyme name"
                             % repr(name))
        # 2011/11/26 - Nobody knows what this call was supposed to accomplish,
        # but all unit tests seem to pass without it.
        # super(RestrictionType, cls).__init__(cls, name, bases, dct)
        try:
            cls.compsite = re.compile(cls.compsite)
        except Exception:
            raise ValueError("Problem with regular expression, re.compiled(%s)"
                             % repr(cls.compsite))

    def __add__(cls, other):
        """RE.__add__(other) -> RestrictionBatch().

        If other is an enzyme, return a batch of the two enzymes.
        If other is already a RestrictionBatch, add the enzyme to it.
        Raises TypeError for anything else."""
        if isinstance(other, RestrictionType):
            return RestrictionBatch([cls, other])
        if isinstance(other, RestrictionBatch):
            return other.add_nocheck(cls)
        raise TypeError

    def __div__(cls, other):
        """RE.__div__(other) -> list.

        RE/other
        returns RE.search(other)."""
        return cls.search(other)

    def __rdiv__(cls, other):
        """RE.__rdiv__(other) -> list.

        other/RE
        returns RE.search(other)."""
        return cls.search(other)

    def __truediv__(cls, other):
        """RE.__truediv__(other) -> list.

        RE/other
        returns RE.search(other)."""
        return cls.search(other)

    def __rtruediv__(cls, other):
        """RE.__rtruediv__(other) -> list.

        other/RE
        returns RE.search(other)."""
        return cls.search(other)

    def __floordiv__(cls, other):
        """RE.__floordiv__(other) -> list.

        RE//other
        returns RE.catalyse(other)."""
        return cls.catalyse(other)

    def __rfloordiv__(cls, other):
        """RE.__rfloordiv__(other) -> list.

        other//RE
        returns RE.catalyse(other)."""
        return cls.catalyse(other)

    def __str__(cls):
        """RE.__str__() -> str.

        return the name of the enzyme."""
        return cls.__name__

    def __repr__(cls):
        """RE.__repr__() -> str.

        used with eval or exec will instantiate the enzyme."""
        return "%s" % cls.__name__

    def __len__(cls):
        """RE.__len__() -> int.

        length of the recognition site."""
        return cls.size

    def __hash__(cls):
        """Return id(RE), consistent with the identity based __eq__."""
        # Python default is to use id(...)
        return id(cls)

    def __eq__(cls, other):
        """RE == other -> bool

        True if RE and other are the same enzyme.

        Specifically this checks they are the same Python object.
        """
        return cls is other

    def __ne__(cls, other):
        """RE != other -> bool.

        isoschizomer strict, same recognition site, same restriction -> False
        all the other-> True

        WARNING - This is not the inverse of the __eq__ method.
        """
        if not isinstance(other, RestrictionType):
            return True
        return not (cls.charac == other.charac)

    def __rshift__(cls, other):
        """RE >> other -> bool.

        neoschizomer : same recognition site, different restriction. -> True
        all the others :                                      -> False"""
        if not isinstance(other, RestrictionType):
            return False
        return cls.site == other.site and cls.charac != other.charac

    def __mod__(cls, other):
        """a % b -> bool.

        Test compatibility of the overhang of a and b.
        True if a and b have compatible overhang.
        Raises TypeError if b is not an enzyme."""
        if not isinstance(other, RestrictionType):
            raise TypeError(
                'expected RestrictionType, got %s instead' % type(other))
        return cls._mod1(other)

    # The four ordering operators below sort on (site length, enzyme name).
    # They raise NotImplementedError (rather than returning NotImplemented)
    # for non-enzyme operands; this is kept for existing callers.

    def __ge__(cls, other):
        """a >= b -> bool.

        a is greater or equal than b if the a site is longer than b site.
        If their sites have the same length, sort by alphabetical order of
        their names.
        Raises NotImplementedError if b is not an enzyme."""
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        return (len(cls), cls.__name__) >= (len(other), other.__name__)

    def __gt__(cls, other):
        """a > b -> bool.

        sorting order:
                    1. size of the recognition site.
                    2. if equal size, alphabetical order of the names.
        Raises NotImplementedError if b is not an enzyme."""
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        return (len(cls), cls.__name__) > (len(other), other.__name__)

    def __le__(cls, other):
        """a <= b -> bool.

        sorting order:
                    1. size of the recognition site.
                    2. if equal size, alphabetical order of the names.
        Raises NotImplementedError if b is not an enzyme."""
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        return (len(cls), cls.__name__) <= (len(other), other.__name__)

    def __lt__(cls, other):
        """a < b -> bool.

        sorting order:
                    1. size of the recognition site.
                    2. if equal size, alphabetical order of the names.
        Raises NotImplementedError if b is not an enzyme."""
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        return (len(cls), cls.__name__) < (len(other), other.__name__)
442 443 -class AbstractCut(RestrictionType):
444 """Implement the methods that are common to all restriction enzymes. 445 446 All the methods are classmethod. 447 448 For internal use only. Not meant to be instantiate.""" 449 450 @classmethod
451 - def search(cls, dna, linear=True):
452 """RE.search(dna, linear=True) -> list. 453 454 return a list of all the site of RE in dna. Compensate for circular 455 sequences and so on. 456 457 dna must be a Bio.Seq.Seq instance or a Bio.Seq.MutableSeq instance. 458 459 if linear is False, the restriction sites than span over the boundaries 460 will be included. 461 462 The positions are the first base of the 3' fragment, 463 i.e. the first base after the position the enzyme will cut. """ 464 # 465 # Separating search from _search allow a (very limited) optimisation 466 # of the search when using a batch of restriction enzymes. 467 # in this case the DNA is tested once by the class which implements 468 # the batch instead of being tested by each enzyme single. 469 # see RestrictionBatch.search() for example. 470 # 471 if isinstance(dna, FormattedSeq): 472 cls.dna = dna 473 return cls._search() 474 else: 475 cls.