Package Bio :: Package AlignIO :: Module ClustalIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.ClustalIO

  1  # Copyright 2006-2010 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """ 
  7  Bio.AlignIO support for the "clustal" output from CLUSTAL W and other tools. 
  8   
  9  You are expected to use this module via the Bio.AlignIO functions (or the 
 10  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 11  """ 
 12   
 13  from Bio.Seq import Seq 
 14  from Bio.SeqRecord import SeqRecord 
 15  from Bio.Align import MultipleSeqAlignment 
 16  from Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 17   
 18   
19 -class ClustalWriter(SequentialAlignmentWriter):
20 """Clustalw alignment writer."""
21 - def write_alignment(self, alignment):
22 """Use this to write (another) single alignment to an open file.""" 23 24 if len(alignment) == 0: 25 raise ValueError("Must have at least one sequence") 26 if alignment.get_alignment_length() == 0: 27 #This doubles as a check for an alignment object 28 raise ValueError("Non-empty sequences are required") 29 30 #Old versions of the parser in Bio.Clustalw used a ._version property, 31 try: 32 version = str(alignment._version) 33 except AttributeError: 34 version = "" 35 if not version: 36 version = '1.81' 37 if version.startswith("2."): 38 #e.g. 2.0.x 39 output = "CLUSTAL %s multiple sequence alignment\n\n\n" % version 40 else: 41 #e.g. 1.81 or 1.83 42 output = "CLUSTAL X (%s) multiple sequence alignment\n\n\n" % version 43 44 cur_char = 0 45 max_length = len(alignment[0]) 46 47 if max_length <= 0: 48 raise ValueError("Non-empty sequences are required") 49 50 # keep displaying sequences until we reach the end 51 while cur_char != max_length: 52 # calculate the number of sequences to show, which will 53 # be less if we are at the end of the sequence 54 if (cur_char + 50) > max_length: 55 show_num = max_length - cur_char 56 else: 57 show_num = 50 58 59 # go through all of the records and print out the sequences 60 # when we output, we do a nice 80 column output, although this 61 # may result in truncation of the ids. 62 for record in alignment: 63 #Make sure we don't get any spaces in the record 64 #identifier when output in the file by replacing 65 #them with underscores: 66 line = record.id[0:30].replace(" ", "_").ljust(36) 67 line += str(record.seq[cur_char:(cur_char + show_num)]) 68 output += line + "\n" 69 70 # now we need to print out the star info, if we've got it 71 # This was stored by Bio.Clustalw using a ._star_info property. 72 if hasattr(alignment, "_star_info") and alignment._star_info != '': 73 output += (" " * 36) + \ 74 alignment._star_info[cur_char:(cur_char + show_num)] + "\n" 75 76 output += "\n" 77 cur_char += show_num 78 79 # Want a trailing blank new line in case the output is concatenated 80 self.handle.write(output + "\n")
81 82
83 -class ClustalIterator(AlignmentIterator):
84 """Clustalw alignment iterator.""" 85
86 - def next(self):
87 handle = self.handle 88 try: 89 #Header we saved from when we were parsing 90 #the previous alignment. 91 line = self._header 92 del self._header 93 except AttributeError: 94 line = handle.readline() 95 if not line: 96 raise StopIteration 97 98 #Whitelisted headers we know about 99 known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE', 'MSAPROBS'] 100 if line.strip().split()[0] not in known_headers: 101 raise ValueError("%s is not a known CLUSTAL header: %s" % 102 (line.strip().split()[0], 103 ", ".join(known_headers))) 104 105 # find the clustal version in the header line 106 version = None 107 for word in line.split(): 108 if word[0] == '(' and word[-1] == ')': 109 word = word[1:-1] 110 if word[0] in '0123456789': 111 version = word 112 break 113 114 #There should be two blank lines after the header line 115 line = handle.readline() 116 while line.strip() == "": 117 line = handle.readline() 118 119 #If the alignment contains entries with the same sequence 120 #identifier (not a good idea - but seems possible), then this 121 #dictionary based parser will merge their sequences. Fix this? 122 ids = [] 123 seqs = [] 124 consensus = "" 125 seq_cols = None # Used to extract the consensus 126 127 #Use the first block to get the sequence identifiers 128 while True: 129 if line[0] != " " and line.strip() != "": 130 #Sequences identifier... 131 fields = line.rstrip().split() 132 133 #We expect there to be two fields, there can be an optional 134 #"sequence number" field containing the letter count. 135 if len(fields) < 2 or len(fields) > 3: 136 raise ValueError("Could not parse line:\n%s" % line) 137 138 ids.append(fields[0]) 139 seqs.append(fields[1]) 140 141 #Record the sequence position to get the consensus 142 if seq_cols is None: 143 start = len(fields[0]) + line[len(fields[0]):].find(fields[1]) 144 end = start + len(fields[1]) 145 seq_cols = slice(start, end) 146 del start, end 147 assert fields[1] == line[seq_cols] 148 149 if len(fields) == 3: 150 #This MAY be an old style file with a letter count... 151 try: 152 letters = int(fields[2]) 153 except ValueError: 154 raise ValueError("Could not parse line, bad sequence number:\n%s" % line) 155 if len(fields[1].replace("-", "")) != letters: 156 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line) 157 elif line[0] == " ": 158 #Sequence consensus line... 159 assert len(ids) == len(seqs) 160 assert len(ids) > 0 161 assert seq_cols is not None 162 consensus = line[seq_cols] 163 assert not line[:seq_cols.start].strip() 164 assert not line[seq_cols.stop:].strip() 165 #Check for blank line (or end of file) 166 line = handle.readline() 167 assert line.strip() == "" 168 break 169 else: 170 #No consensus 171 break 172 line = handle.readline() 173 if not line: 174 break # end of file 175 176 assert line.strip() == "" 177 assert seq_cols is not None 178 179 #Confirm all same length 180 for s in seqs: 181 assert len(s) == len(seqs[0]) 182 if consensus: 183 assert len(consensus) == len(seqs[0]) 184 185 #Loop over any remaining blocks... 186 done = False 187 while not done: 188 #There should be a blank line between each block. 189 #Also want to ignore any consensus line from the 190 #previous block. 191 while (not line) or line.strip() == "": 192 line = handle.readline() 193 if not line: 194 break # end of file 195 if not line: 196 break # end of file 197 198 if line.split(None, 1)[0] in known_headers: 199 #Found concatenated alignment. 200 done = True 201 self._header = line 202 break 203 204 for i in range(len(ids)): 205 assert line[0] != " ", "Unexpected line:\n%s" % repr(line) 206 fields = line.rstrip().split() 207 208 #We expect there to be two fields, there can be an optional 209 #"sequence number" field containing the letter count. 210 if len(fields) < 2 or len(fields) > 3: 211 raise ValueError("Could not parse line:\n%s" % repr(line)) 212 213 if fields[0] != ids[i]: 214 raise ValueError("Identifiers out of order? Got '%s' but expected '%s'" 215 % (fields[0], ids[i])) 216 217 if fields[1] != line[seq_cols]: 218 start = len(fields[0]) + line[len(fields[0]):].find(fields[1]) 219 assert start == seq_cols.start, 'Old location %s -> %i:XX' % (seq_cols, start) 220 end = start + len(fields[1]) 221 seq_cols = slice(start, end) 222 del start, end 223 224 #Append the sequence 225 seqs[i] += fields[1] 226 assert len(seqs[i]) == len(seqs[0]) 227 228 if len(fields) == 3: 229 #This MAY be an old style file with a letter count... 230 try: 231 letters = int(fields[2]) 232 except ValueError: 233 raise ValueError("Could not parse line, bad sequence number:\n%s" % line) 234 if len(seqs[i].replace("-", "")) != letters: 235 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line) 236 237 #Read in the next line 238 line = handle.readline() 239 #There should now be a consensus line 240 if consensus: 241 assert line[0] == " " 242 assert seq_cols is not None 243 consensus += line[seq_cols] 244 assert len(consensus) == len(seqs[0]) 245 assert not line[:seq_cols.start].strip() 246 assert not line[seq_cols.stop:].strip() 247 #Read in the next line 248 line = handle.readline() 249 250 assert len(ids) == len(seqs) 251 if len(seqs) == 0 or len(seqs[0]) == 0: 252 raise StopIteration 253 254 if self.records_per_alignment is not None \ 255 and self.records_per_alignment != len(ids): 256 raise ValueError("Found %i records in this alignment, told to expect %i" 257 % (len(ids), self.records_per_alignment)) 258 259 records = (SeqRecord(Seq(s, self.alphabet), id=i, description=i) 260 for (i, s) in zip(ids, seqs)) 261 alignment = MultipleSeqAlignment(records, self.alphabet) 262 #TODO - Handle alignment annotation better, for now 263 #mimic the old parser in Bio.Clustalw 264 if version: 265 alignment._version = version 266 if consensus: 267 alignment_length = len(seqs[0]) 268 assert len(consensus) == alignment_length, \ 269 "Alignment length is %i, consensus length is %i, '%s'" \ 270 % (alignment_length, len(consensus), consensus) 271 alignment._star_info = consensus 272 return alignment
273 274 if __name__ == "__main__": 275 print "Running a quick self-test" 276 277 #This is a truncated version of the example in Tests/cw02.aln 278 #Notice the inclusion of sequence numbers (right hand side) 279 aln_example1 = \ 280 """CLUSTAL W (1.81) multiple sequence alignment 281 282 283 gi|4959044|gb|AAD34209.1|AF069 MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN 50 284 gi|671626|emb|CAA85685.1| ---------MSPQTETKASVGFKAGVKEYKLTYYTPEYETKDTDILAAFR 41 285 * *: :: :. :* : :. : . :* :: . 286 287 gi|4959044|gb|AAD34209.1|AF069 LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW 100 288 gi|671626|emb|CAA85685.1| VTPQPG-----------------VPPEEAGAAVAAESSTGT--------- 65 289 : ** **:... *.*** .. 290 291 gi|4959044|gb|AAD34209.1|AF069 LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT 150 292 gi|671626|emb|CAA85685.1| WTTVWTDGLTSLDRYKG-----RCYHIEPVPG------------------ 92 293 .:* * *: .* :* : :* .* 294 295 gi|4959044|gb|AAD34209.1|AF069 SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE 200 296 gi|671626|emb|CAA85685.1| -EKDQCICYVAYPLDLFEEGSVTNMFTSIVGNVFGFKALRALRLEDLRIP 141 297 *::. . .:: :*..* :* .* .. . : . : 298 299 gi|4959044|gb|AAD34209.1|AF069 VPTTRAQRRA 210 300 gi|671626|emb|CAA85685.1| VAYVKTFQGP 151 301 *. .:: : . 302 """ 303 304 #This example is a truncated version of the dataset used here: 305 #http://virgil.ruc.dk/kurser/Sekvens/Treedraw.htm 306 #with the last record repeated twice (deliberate toture test) 307 aln_example2 = \ 308 """CLUSTAL X (1.83) multiple sequence alignment 309 310 311 V_Harveyi_PATH --MKNWIKVAVAAIA--LSAA------------------TVQAATEVKVG 312 B_subtilis_YXEM MKMKKWTVLVVAALLAVLSACG------------NGNSSSKEDDNVLHVG 313 B_subtilis_GlnH_homo_YCKK MKKALLALFMVVSIAALAACGAGNDNQSKDNAKDGDLWASIKKKGVLTVG 314 YA80_HAEIN MKKLLFTTALLTGAIAFSTF-----------SHAGEIADRVEKTKTLLVG 315 FLIY_ECOLI MKLAHLGRQALMGVMAVALVAG---MSVKSFADEG-LLNKVKERGTLLVG 316 E_coli_GlnH --MKSVLKVSLAALTLAFAVS------------------SHAADKKLVVA 317 Deinococcus_radiodurans -MKKSLLSLKLSGLLVPSVLALS--------LSACSSPSSTLNQGTLKIA 318 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG 319 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG 320 : . : :. 321 322 V_Harveyi_PATH MSGRYFPFTFVKQ--DKLQGFEVDMWDEIGKRNDYKIEYVTANFSGLFGL 323 B_subtilis_YXEM ATGQSYPFAYKEN--GKLTGFDVEVMEAVAKKIDMKLDWKLLEFSGLMGE 324 B_subtilis_GlnH_homo_YCKK TEGTYEPFTYHDKDTDKLTGYDVEVITEVAKRLGLKVDFKETQWGSMFAG 325 YA80_HAEIN TEGTYAPFTFHDK-SGKLTGFDVEVIRKVAEKLGLKVEFKETQWDAMYAG 326 FLIY_ECOLI LEGTYPPFSFQGD-DGKLTGFEVEFAQQLAKHLGVEASLKPTKWDGMLAS 327 E_coli_GlnH TDTAFVPFEFKQG--DKYVGFDVDLWAAIAKELKLDYELKPMDFSGIIPA 328 Deinococcus_radiodurans MEGTYPPFTSKNE-QGELVGFDVDIAKAVAQKLNLKPEFVLTEWSGILAG 329 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS 330 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS 331 ** .: *::::. : :. . ..: 332 333 V_Harveyi_PATH LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQI 334 B_subtilis_YXEM LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQI 335 B_subtilis_GlnH_homo_YCKK LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVV 336 YA80_HAEIN LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVI 337 FLIY_ECOLI LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQAL 338 E_coli_GlnH LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLV 339 Deinococcus_radiodurans LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEII 340 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV 341 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV 342 *.: . * . * *: : 343 344 """ 345 346 from StringIO import StringIO 347 348 alignments = list(ClustalIterator(StringIO(aln_example1))) 349 assert 1 == len(alignments) 350 assert alignments[0]._version == "1.81" 351 alignment = alignments[0] 352 assert 2 == len(alignment) 353 assert alignment[0].id == "gi|4959044|gb|AAD34209.1|AF069" 354 assert alignment[1].id == "gi|671626|emb|CAA85685.1|" 355 assert str(alignment[0].seq) == \ 356 "MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN" + \ 357 "LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW" + \ 358 "LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT" + \ 359 "SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE" + \ 360 "VPTTRAQRRA" 361 362 alignments = list(ClustalIterator(StringIO(aln_example2))) 363 assert 1 == len(alignments) 364 assert alignments[0]._version == "1.83" 365 alignment = alignments[0] 366 assert 9 == len(alignment) 367 assert alignment[-1].id == "HISJ_E_COLI" 368 assert str(alignment[-1].seq) == \ 369 "MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG" + \ 370 "TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS" + \ 371 "LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV" 372 373 for alignment in ClustalIterator(StringIO(aln_example2 + aln_example1)): 374 print "Alignment with %i records of length %i" \ 375 % (len(alignment), 376 alignment.get_alignment_length()) 377 378 print "Checking empty file..." 379 assert 0 == len(list(ClustalIterator(StringIO("")))) 380 381 print "Checking write/read..." 382 alignments = list(ClustalIterator(StringIO(aln_example1))) \ 383 + list(ClustalIterator(StringIO(aln_example2)))*2 384 handle = StringIO() 385 ClustalWriter(handle).write_file(alignments) 386 handle.seek(0) 387 for i, a in enumerate(ClustalIterator(handle)): 388 assert a.get_alignment_length() == alignments[i].get_alignment_length() 389 handle.seek(0) 390 391 print "Testing write/read when there is only one sequence..." 392 alignment = alignment[0:1] 393 handle = StringIO() 394 ClustalWriter(handle).write_file([alignment]) 395 handle.seek(0) 396 for i, a in enumerate(ClustalIterator(handle)): 397 assert a.get_alignment_length() == alignment.get_alignment_length() 398 assert len(a) == 1 399 400 aln_example3 = \ 401 """CLUSTAL 2.0.9 multiple sequence alignment 402 403 404 Test1seq ------------------------------------------------------------ 405 AT3G20900.1-SEQ ATGAACAAAGTAGCGAGGAAGAACAAAACATCAGGTGAACAAAAAAAAAACTCAATCCAC 406 AT3G20900.1-CDS ------------------------------------------------------------ 407 408 409 Test1seq -----AGTTACAATAACTGACGAAGCTAAGTAGGCTACTAATTAACGTCATCAACCTAAT 410 AT3G20900.1-SEQ ATCAAAGTTACAATAACTGACGAAGCTAAGTAGGCTAGAAATTAAAGTCATCAACCTAAT 411 AT3G20900.1-CDS ------------------------------------------------------------ 412 413 414 Test1seq ACATAGCACTTAGAAAAAAGTGAAGTAAGAAAATATAAAATAATAAAAGGGTGGGTTATC 415 AT3G20900.1-SEQ ACATAGCACTTAGAAAAAAGTGAAGCAAGAAAATATAAAATAATAAAAGGGTGGGTTATC 416 AT3G20900.1-CDS ------------------------------------------------------------ 417 418 419 Test1seq AATTGATAGTGTAAATCATCGTATTCCGGTGATATACCCTACCACAAAAACTCAAACCGA 420 AT3G20900.1-SEQ AATTGATAGTGTAAATCATAGTTGATTTTTGATATACCCTACCACAAAAACTCAAACCGA 421 AT3G20900.1-CDS ------------------------------------------------------------ 422 423 424 Test1seq CTTGATTCAAATCATCTCAATAAATTAGCGCCAAAATAATGAAAAAAATAATAACAAACA 425 AT3G20900.1-SEQ CTTGATTCAAATCATCTCAAAAAACAAGCGCCAAAATAATGAAAAAAATAATAACAAAAA 426 AT3G20900.1-CDS ------------------------------------------------------------ 427 428 429 Test1seq AAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATT 430 AT3G20900.1-SEQ CAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATT 431 AT3G20900.1-CDS ------------------------------------------------------------ 432 433 434 Test1seq GTATTAACAAATCAAAGAGCTGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGAT 435 AT3G20900.1-SEQ GTATTAACAAATCAAAGAGATGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGAT 436 AT3G20900.1-CDS ------------------------------------------------------------ 437 438 439 Test1seq CCTATATCAACGTAAACAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGT 440 AT3G20900.1-SEQ CCTATATCAAAAAAAAAAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGT 441 AT3G20900.1-CDS ------------------------------------------------------ATGAAC 442 * 443 444 Test1seq TCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGT 445 AT3G20900.1-SEQ GCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGT 446 AT3G20900.1-CDS AAAGTAGCGAGGAAGAACAAAACATC------AGCAAAGAAAACGATCTGTCTCCGTCGT 447 * *** ***** * * ** **************************** 448 449 Test1seq AACACACGGTCGCTAGAGAAACTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC 450 AT3G20900.1-SEQ AACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC 451 AT3G20900.1-CDS AACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC 452 ******* ** * **** *************************************** 453 454 Test1seq GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCGTGGTGACGTCAGCACCGCT 455 AT3G20900.1-SEQ GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCT 456 AT3G20900.1-CDS GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCT 457 **************************************** ******************* 458 459 Test1seq GCTGGGGATGGAGAGGGAACAGAGTT- 460 AT3G20900.1-SEQ GCTGGGGATGGAGAGGGAACAGAGTAG 461 AT3G20900.1-CDS GCTGGGGATGGAGAGGGAACAGAGTAG 462 ************************* 463 """ 464 alignments = list(ClustalIterator(StringIO(aln_example3))) 465 assert 1 == len(alignments) 466 assert alignments[0]._version == "2.0.9" 467 468 print "The End" 469