Package Bio :: Package SeqUtils :: Module ProtParam
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqUtils.ProtParam

  1  # Copyright Yair Benita Y.Benita@pharm.uu.nl 
  2  # Biopython (http://biopython.org) license applies 
  3   
  4  """Simple protein analysis. 
  5   
  6  Example, 
  7   
  8  X = ProteinAnalysis("MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPSEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPV") 
  9  print(X.count_amino_acids()) 
 10  print(X.get_amino_acids_percent()) 
 11  print(X.molecular_weight()) 
 12  print(X.aromaticity()) 
 13  print(X.instability_index()) 
 14  print(X.flexibility()) 
 15  print(X.isoelectric_point()) 
 16  print(X.secondary_structure_fraction()) 
 17  print(X.protein_scale(ProtParamData.kd, 9, 0.4)) 
 18  """ 
 19  #TODO - Turn that into a working doctest 
 20   
 21  from __future__ import print_function 
 22   
 23  import sys 
 24  from . import ProtParamData  # Local 
 25  from . import IsoelectricPoint  # Local 
 26  from Bio.Seq import Seq 
 27  from Bio.Alphabet import IUPAC 
 28  from Bio.Data import IUPACData 
 29   
 30   
31 -class ProteinAnalysis(object):
32 """Class containing methods for protein analysis. 33 34 The constructor takes two arguments. 35 The first is the protein sequence as a string, which is then converted to a 36 sequence object using the Bio.Seq module. This is done just to make sure 37 the sequence is a protein sequence and not anything else. 38 39 The second argument is optional. If set to True, the weight of the amino 40 acids will be calculated using their monoisotopic mass (the weight of the 41 most abundant isotopes for each element), instead of the average molecular 42 mass (the averaged weight of all stable isotopes for each element). 43 If set to false (the default value) or left out, the IUPAC average 44 molecular mass will be used for the calculation. 45 46 """
47 - def __init__(self, prot_sequence, monoisotopic=False):
48 if prot_sequence.islower(): 49 self.sequence = Seq(prot_sequence.upper(), IUPAC.protein) 50 else: 51 self.sequence = Seq(prot_sequence, IUPAC.protein) 52 self.amino_acids_content = None 53 self.amino_acids_percent = None 54 self.length = len(self.sequence) 55 self.monoisotopic = monoisotopic
56
57 - def count_amino_acids(self):
58 """Count standard amino acids, returns a dict. 59 60 Counts the number times each amino acid is in the protein 61 sequence. Returns a dictionary {AminoAcid:Number}. 62 63 The return value is cached in self.amino_acids_content. 64 It is not recalculated upon subsequent calls. 65 """ 66 if self.amino_acids_content is None: 67 prot_dic = dict((k, 0) for k in IUPACData.protein_letters) 68 for aa in prot_dic: 69 prot_dic[aa] = self.sequence.count(aa) 70 71 self.amino_acids_content = prot_dic 72 73 return self.amino_acids_content
74
75 - def get_amino_acids_percent(self):
76 """Calculate the amino acid content in percentages. 77 78 The same as count_amino_acids only returns the Number in percentage of 79 entire sequence. Returns a dictionary of {AminoAcid:percentage}. 80 81 The return value is cached in self.amino_acids_percent. 82 83 input is the dictionary self.amino_acids_content. 84 output is a dictionary with amino acids as keys. 85 """ 86 if self.amino_acids_percent is None: 87 aa_counts = self.count_amino_acids() 88 89 percentages = {} 90 for aa in aa_counts: 91 percentages[aa] = aa_counts[aa] / float(self.length) 92 93 self.amino_acids_percent = percentages 94 95 return self.amino_acids_percent
96
97 - def molecular_weight(self):
98 """Calculate MW from Protein sequence""" 99 # make local dictionary for speed 100 if self.monoisotopic: 101 water = 18.01 102 iupac_weights = IUPACData.monoisotopic_protein_weights 103 else: 104 iupac_weights = IUPACData.protein_weights 105 water = 18.02 106 107 aa_weights = {} 108 for i in iupac_weights: 109 # remove a molecule of water from the amino acid weight 110 aa_weights[i] = iupac_weights[i] - water 111 112 total_weight = water # add just one water molecule for the whole sequence 113 for aa in self.sequence: 114 total_weight += aa_weights[aa] 115 116 return total_weight
117
118 - def aromaticity(self):
119 """Calculate the aromaticity according to Lobry, 1994. 120 121 Calculates the aromaticity value of a protein according to Lobry, 1994. 122 It is simply the relative frequency of Phe+Trp+Tyr. 123 """ 124 aromatic_aas = 'YWF' 125 aa_percentages = self.get_amino_acids_percent() 126 127 aromaticity = sum(aa_percentages[aa] for aa in aromatic_aas) 128 129 return aromaticity
130
131 - def instability_index(self):
132 """Calculate the instability index according to Guruprasad et al 1990. 133 134 Implementation of the method of Guruprasad et al. 1990 to test a 135 protein for stability. Any value above 40 means the protein is unstable 136 (has a short half life). 137 138 See: Guruprasad K., Reddy B.V.B., Pandit M.W. 139 Protein Engineering 4:155-161(1990). 140 """ 141 index = ProtParamData.DIWV 142 score = 0.0 143 144 for i in range(self.length - 1): 145 this, next = self.sequence[i:i+2] 146 dipeptide_value = index[this][next] 147 score += dipeptide_value 148 149 return (10.0 / self.length) * score
150
151 - def flexibility(self):
152 """Calculate the flexibility according to Vihinen, 1994. 153 154 No argument to change window size because parameters are specific for a 155 window=9. The parameters used are optimized for determining the flexibility. 156 """ 157 flexibilities = ProtParamData.Flex 158 window_size = 9 159 weights = [0.25, 0.4375, 0.625, 0.8125, 1] 160 scores = [] 161 162 for i in range(self.length - window_size): 163 subsequence = self.sequence[i:i+window_size] 164 score = 0.0 165 166 for j in range(window_size // 2): 167 front = subsequence[j] 168 back = subsequence[window_size - j - 1] 169 score += (flexibilities[front] + flexibilities[back]) * weights[j] 170 171 middle = subsequence[window_size // 2 + 1] 172 score += flexibilities[middle] 173 174 scores.append(score / 5.25) 175 176 return scores
177
178 - def gravy(self):
179 """Calculate the gravy according to Kyte and Doolittle.""" 180 total_gravy = sum(ProtParamData.kd[aa] for aa in self.sequence) 181 182 return total_gravy / self.length
183
184 - def _weight_list(self, window, edge):
185 """Makes a list of relative weight of the 186 window edges compared to the window center. The weights are linear. 187 it actually generates half a list. For a window of size 9 and edge 0.4 188 you get a list of [0.4, 0.55, 0.7, 0.85]. 189 """ 190 unit = 2 * (1.0 - edge) / (window - 1) 191 weights = [0.0] * (window // 2) 192 193 for i in range(window // 2): 194 weights[i] = edge + unit * i 195 196 return weights
197
198 - def protein_scale(self, param_dict, window, edge=1.0):
199 """Compute a profile by any amino acid scale. 200 201 An amino acid scale is defined by a numerical value assigned to each type of 202 amino acid. The most frequently used scales are the hydrophobicity or 203 hydrophilicity scales and the secondary structure conformational parameters 204 scales, but many other scales exist which are based on different chemical and 205 physical properties of the amino acids. You can set several parameters that 206 control the computation of a scale profile, such as the window size and the 207 window edge relative weight value. 208 209 WindowSize: The window size is the length 210 of the interval to use for the profile computation. For a window size n, we 211 use the i-(n-1)/2 neighboring residues on each side to compute 212 the score for residue i. The score for residue i is the sum of the scaled values 213 for these amino acids, optionally weighted according to their position in the 214 window. 215 216 Edge: The central amino acid of the window always has a weight of 1. 217 By default, the amino acids at the remaining window positions have the same 218 weight, but you can make the residue at the center of the window have a 219 larger weight than the others by setting the edge value for the residues at 220 the beginning and end of the interval to a value between 0 and 1. For 221 instance, for Edge=0.4 and a window size of 5 the weights will be: 0.4, 0.7, 222 1.0, 0.7, 0.4. 223 224 The method returns a list of values which can be plotted to 225 view the change along a protein sequence. Many scales exist. Just add your 226 favorites to the ProtParamData modules. 227 228 Similar to expasy's ProtScale: http://www.expasy.org/cgi-bin/protscale.pl 229 """ 230 # generate the weights 231 # _weight_list returns only one tail. If the list should be [0.4,0.7,1.0,0.7,0.4] 232 # what you actually get from _weights_list is [0.4,0.7]. The correct calculation is done 233 # in the loop. 234 weights = self._weight_list(window, edge) 235 scores = [] 236 237 # the score in each Window is divided by the sum of weights 238 # (* 2 + 1) since the weight list is one sided: 239 sum_of_weights = sum(weights) * 2 + 1 240 241 for i in range(self.length - window + 1): 242 subsequence = self.sequence[i:i+window] 243 score = 0.0 244 245 for j in range(window // 2): 246 # walk from the outside of the Window towards the middle. 247 # Iddo: try/except clauses added to avoid raising an exception on a non-standard amino acid 248 try: 249 front = param_dict[subsequence[j]] 250 back = param_dict[subsequence[window - j - 1]] 251 score += weights[j] * front + weights[j] * back 252 except KeyError: 253 sys.stderr.write('warning: %s or %s is not a standard amino acid.\n' % 254 (subsequence[j], subsequence[window - j - 1])) 255 256 # Now add the middle value, which always has a weight of 1. 257 middle = subsequence[window // 2] 258 if middle in param_dict: 259 score += param_dict[middle] 260 else: 261 sys.stderr.write('warning: %s is not a standard amino acid.\n' % (middle)) 262 263 scores.append(score / sum_of_weights) 264 265 return scores
266
267 - def isoelectric_point(self):
268 """Calculate the isoelectric point. 269 270 Uses the module IsoelectricPoint to calculate the pI of a protein. 271 """ 272 aa_content = self.count_amino_acids() 273 274 ie_point = IsoelectricPoint.IsoelectricPoint(self.sequence, aa_content) 275 return ie_point.pi()
276
278 """Calculate fraction of helix, turn and sheet. 279 280 Returns a list of the fraction of amino acids which tend 281 to be in Helix, Turn or Sheet. 282 283 Amino acids in helix: V, I, Y, F, W, L. 284 Amino acids in Turn: N, P, G, S. 285 Amino acids in sheet: E, M, A, L. 286 287 Returns a tuple of three integers (Helix, Turn, Sheet). 288 """ 289 aa_percentages = self.get_amino_acids_percent() 290 291 helix = sum(aa_percentages[r] for r in 'VIYFWL') 292 turn = sum(aa_percentages[r] for r in 'NPGS') 293 sheet = sum(aa_percentages[r] for r in 'EMAL') 294 295 return helix, turn, sheet
296