Intergenic regions

From Biopython
(Difference between revisions)
Jump to: navigation, search
m (Solution)
(Solution)
 
(3 intermediate revisions by 2 users not shown)
Line 4: Line 4:
 
== Solution ==
 
== Solution ==
  
The following ready-to-run script reads a genbank file, which is probably a genomic or chromosomal one. It uses the CDS feature to discover  the 5' and 3' ends of ORFs. Yes, ORFs are not exactly synonymous with genes, but this is the way we did it. Also, you may want to swap the CDS feature for the 'gene' feature, if you are also interested in RNA coding genes.  The "intergene_length" variable is a threshold on the minimal length of intergenic regions to be analyzed, and is set by default to 100. The program outputs to a file with the suffix ".ign" The program outputs the + strand or the reverse-complement based on the genbank file annotation. The output is in FASTA format, and the header includes the intergenic region coordinates, and unique ID, and whether the sequence was derived from the + or - strand.
+
The following ready-to-run script reads a genbank file, which is probably a genomic or chromosomal one. It uses the CDS feature to discover  the 5' and 3' ends of ORFs. Yes, ORFs are not exactly synonymous with genes, but this is the way we did it. Also, you may want to swap the CDS feature for the 'gene' feature, if you are also interested in RNA coding genes.  The "intergene_length" variable is a threshold on the minimal length of intergenic regions to be analyzed, and is set by default to 1. The program outputs to a file with the suffix "_ign.fasta" The program outputs the + strand or the reverse-complement based on the genbank file annotation. The output is in FASTA format, and the header includes the intergenic region coordinates, and unique ID, and whether the sequence was derived from the + or - strand.
  
 
<python>
 
<python>
Line 13: Line 13:
 
from Bio.SeqRecord import SeqRecord
 
from Bio.SeqRecord import SeqRecord
 
import os
 
import os
 +
 
# Copyright(C) 2009 Iddo Friedberg & Ian MC Fleming
 
# Copyright(C) 2009 Iddo Friedberg & Ian MC Fleming
 
# Released under Biopython license. http://www.biopython.org/DIST/LICENSE
 
# Released under Biopython license. http://www.biopython.org/DIST/LICENSE
 
# Do not remove this comment
 
# Do not remove this comment
def get_interregions(genbank_path,intergene_length=100):
+
def get_interregions(genbank_path,intergene_length=1):
 
     seq_record = SeqIO.parse(open(genbank_path), "genbank").next()
 
     seq_record = SeqIO.parse(open(genbank_path), "genbank").next()
      
+
     cds_list_plus = []
     cds_list = []
+
     cds_list_minus = []
 
     intergenic_records = []
 
     intergenic_records = []
     # Loop over the genome file, get the  
+
     # Loop over the genome file, get the CDS features on each of the strands
     for fnum, feature in enumerate(seq_record.features):
+
     for feature in seq_record.features:
         if feature.type == 'CDS':
+
         if feature.type == 'CDS':
 
             mystart = feature.location._start.position
 
             mystart = feature.location._start.position
 
             myend = feature.location._end.position
 
             myend = feature.location._end.position
             cds_list.append((mystart,myend,feature.strand))
+
             if feature.strand == -1:
     for i,pospair in enumerate(cds_list[1:]):
+
                cds_list_minus.append((mystart,myend,-1))
 +
            elif feature.strand == 1:
 +
                cds_list_plus.append((mystart,myend,1))
 +
            else:
 +
                sys.stderr.write("No strand indicated %d-%d. Assuming +\n" %
 +
                                  (mystart, myend))
 +
                cds_list_plus.append((mystart,myend,1))
 +
 
 +
     for i,pospair in enumerate(cds_list_plus[1:]):
 
         # Compare current start position to previous end position
 
         # Compare current start position to previous end position
         last_end = cds_list[i][1]
+
         last_end = cds_list_plus[i][1]
 
         this_start = pospair[0]
 
         this_start = pospair[0]
 
         strand = pospair[2]
 
         strand = pospair[2]
 
         if this_start - last_end >= intergene_length:
 
         if this_start - last_end >= intergene_length:
 
             intergene_seq = seq_record.seq[last_end:this_start]
 
             intergene_seq = seq_record.seq[last_end:this_start]
             if strand == -1:
+
             strand_string = "+"
                intergene_seq = intergene_seq.reverse_complement()
+
             intergenic_records.append(
                strand_string = "-"
+
            else:
+
                strand_string = "+"
+
             intergenic_records.append(  
+
 
                   SeqRecord(intergene_seq,id="%s-ign-%d" % (seq_record.name,i),
 
                   SeqRecord(intergene_seq,id="%s-ign-%d" % (seq_record.name,i),
 
                   description="%s %d-%d %s" % (seq_record.name, last_end+1,
 
                   description="%s %d-%d %s" % (seq_record.name, last_end+1,
 
                                                         this_start,strand_string)))
 
                                                         this_start,strand_string)))
     outpath = os.path.splitext(os.path.basename(genbank_path))[0] + ".ign"
+
    for i,pospair in enumerate(cds_list_minus[1:]):
 +
        last_end = cds_list_minus[i][1]
 +
        this_start = pospair[0]
 +
        strand = pospair[2]
 +
        if this_start - last_end >= intergene_length:
 +
            intergene_seq = seq_record.seq[last_end:this_start]
 +
            strand_string = "-"
 +
            intergenic_records.append(
 +
                  SeqRecord(intergene_seq,id="%s-ign-%d" % (seq_record.name,i),
 +
                  description="%s %d-%d %s" % (seq_record.name, last_end+1,
 +
                                                        this_start,strand_string)))
 +
     outpath = os.path.splitext(os.path.basename(genbank_path))[0] + "_ign.fasta"
 
     SeqIO.write(intergenic_records, open(outpath,"w"), "fasta")
 
     SeqIO.write(intergenic_records, open(outpath,"w"), "fasta")
  
         
 
 
if __name__ == '__main__':
 
if __name__ == '__main__':
 
     if len(sys.argv) == 2:
 
     if len(sys.argv) == 2:
Line 55: Line 70:
 
         print "Usage: get_intergenic.py gb_file [intergenic_length]"
 
         print "Usage: get_intergenic.py gb_file [intergenic_length]"
 
         sys.exit(0)
 
         sys.exit(0)
 
 
  </python>
 
  </python>
 +
 +
For a full explanation of the code, see here: [http://bytesizebio.net/index.php/2010/02/11/short-bioinformatic-hacks-reading-between-the-genes/]
  
 
== Running ==
 
== Running ==
Line 64: Line 80:
 
</bash>
 
</bash>
  
Run on the genbank file. The intergenic sequences will appear in the file mygenbank.ign in FASTA format.
+
Run on the genbank file. The intergenic sequences will appear in the file mygenbank_ign.fasta in FASTA format.
  
  
 
  [[category:Cookbook]]
 
  [[category:Cookbook]]

Latest revision as of 21:18, 23 August 2010

Problem

Extract intergenic regions from a sequence. Sometimes, reading between the genes is more interesting....

Solution

The following ready-to-run script reads a genbank file, which is probably a genomic or chromosomal one. It uses the CDS feature to discover the 5' and 3' ends of ORFs. Yes, ORFs are not exactly synonymous with genes, but this is the way we did it. Also, you may want to swap the CDS feature for the 'gene' feature, if you are also interested in RNA coding genes. The "intergene_length" variable is a threshold on the minimal length of intergenic regions to be analyzed, and is set by default to 1. The program outputs to a file with the suffix "_ign.fasta" The program outputs the + strand or the reverse-complement based on the genbank file annotation. The output is in FASTA format, and the header includes the intergenic region coordinates, and unique ID, and whether the sequence was derived from the + or - strand.

#!/usr/bin/env python
import sys
import Bio
from Bio import SeqIO, SeqFeature
from Bio.SeqRecord import SeqRecord
import os
 
# Copyright(C) 2009 Iddo Friedberg & Ian MC Fleming
# Released under Biopython license. http://www.biopython.org/DIST/LICENSE
# Do not remove this comment
def get_interregions(genbank_path,intergene_length=1):
    seq_record = SeqIO.parse(open(genbank_path), "genbank").next()
    cds_list_plus = []
    cds_list_minus = []
    intergenic_records = []
    # Loop over the genome file, get the CDS features on each of the strands
    for feature in seq_record.features:
        if feature.type == 'CDS':
            mystart = feature.location._start.position
            myend = feature.location._end.position
            if feature.strand == -1:
                cds_list_minus.append((mystart,myend,-1))
            elif feature.strand == 1:
                cds_list_plus.append((mystart,myend,1))
            else:
                sys.stderr.write("No strand indicated %d-%d. Assuming +\n" %
                                  (mystart, myend))
                cds_list_plus.append((mystart,myend,1))
 
    for i,pospair in enumerate(cds_list_plus[1:]):
        # Compare current start position to previous end position
        last_end = cds_list_plus[i][1]
        this_start = pospair[0]
        strand = pospair[2]
        if this_start - last_end >= intergene_length:
            intergene_seq = seq_record.seq[last_end:this_start]
            strand_string = "+"
            intergenic_records.append(
                  SeqRecord(intergene_seq,id="%s-ign-%d" % (seq_record.name,i),
                  description="%s %d-%d %s" % (seq_record.name, last_end+1,
                                                        this_start,strand_string)))
    for i,pospair in enumerate(cds_list_minus[1:]):
        last_end = cds_list_minus[i][1]
        this_start = pospair[0]
        strand = pospair[2]
        if this_start - last_end >= intergene_length:
            intergene_seq = seq_record.seq[last_end:this_start]
            strand_string = "-"
            intergenic_records.append(
                  SeqRecord(intergene_seq,id="%s-ign-%d" % (seq_record.name,i),
                  description="%s %d-%d %s" % (seq_record.name, last_end+1,
                                                        this_start,strand_string)))
    outpath = os.path.splitext(os.path.basename(genbank_path))[0] + "_ign.fasta"
    SeqIO.write(intergenic_records, open(outpath,"w"), "fasta")
 
if __name__ == '__main__':
    if len(sys.argv) == 2:
         get_interregions(sys.argv[1])
    elif len(sys.argv) == 3:
         get_interregions(sys.argv[1],int(sys.argv[2]))
    else:
         print "Usage: get_intergenic.py gb_file [intergenic_length]"
         sys.exit(0)

For a full explanation of the code, see here: [1]

Running

./get_intergene mygenbankfile.gb 1

Run on the genbank file. The intergenic sequences will appear in the file mygenbank_ign.fasta in FASTA format.

Personal tools
Namespaces
Variants
Actions
Navigation
Toolbox