Thursday 16 October 2014

Code to find ORF and genes in a string

#include<iostream>
#include<cstring>
#include<vector>
#include<algorithm>

using namespace std;
// this is the function for collection a raw string as a input and converting into a vector
int complement(string &seq);
int get_codon(string seq,vector<string>&ivec,int frame_no)
{
    string s1;
    //this is for forward frame code extraction
    if(frame_no ==1 ||frame_no==2 || frame_no==3 )
    {
        //this is to extract the sequence into the vector called ivec
        for(int i = frame_no;i<seq.size()-1;i++)
        {
            s1="";
            s1.push_back(seq[i-1]);
            s1.push_back(seq[i]);
            s1.push_back(seq[i+1]);
            //this is for extracting the string as codons to a string in triplets
            i = i+2;
            if(s1.size()>=3)
            {
                ivec.push_back(s1);
            }
            else
            {
                break;
            }
        }
    }
    //this is for reverse open reading frame ie -1,-2,-3
    else if (frame_no ==4 || frame_no ==5|| frame_no == 6)
    {
        complement(seq);
        reverse(seq.begin(),seq.end());
        for(int i = frame_no; i<seq.size()-1;i++)
        {
            s1="";
            s1.push_back(seq[i-1]);
            s1.push_back(seq[i]);
            s1.push_back(seq[i+1]);
            //xtraction of the codes into triplets as s1 string
            i =i+2;
            if(s1.size()>=3)
            {
                ivec.push_back(s1);
            }
            else
            {
                break;
            }
        }
    }
}
int complement(string &seq)
{
    for(int i = 0; i < seq.size();i++)
    {
        if(seq[i]=='A'||seq[i]=='a')
            seq[i]='T';
        else if(seq[i]=='T'||seq[i]=='t')
            seq[i]='A';
        else if(seq[i]=='G'||seq[i]=='g')
            seq[i]='C';
        else if(seq[i]=='C'||seq[i]=='c')
            seq[i]='G';
        else
            continue;
    }
}
// this function is for extraction of gene from given orf frame which is having start and stop flags :P

void codon_find(vector<string>&ivec,vector<string>&gvec)
{
    int start_ind = 0;
    int stop_ind =0;
    string gene;
    string start = "ATG";
    string stop1 = "TAG";
    string stop2 = "TAA";
    string stop3 = "TGA";
    for(int i = 0;i<ivec.size();i++)
    {
    //this is statement for the begining of the a gene. We are iterating through the vector in search for
    //start string and then we are adding it to the gene string ;-)
        if ((ivec[i]==start)&&(start_ind==0)&&(start_ind==0))
        {
            start_ind=1;
            gene = gene+ivec[i];
        }   
    //this condition is when the genecomes across the start codon whichc is
    // already having a start codon before
        else if((ivec[i]==start)&&(start_ind == 1)&& (stop_ind==0))
        {
            gene= "";
            gene = gene+ivec[i];
        }
    //when program comes across the terminating codon
    //Pushing all the codons which are present  in gene string to gvec vector :P
        else if ((ivec[i]==stop1)||(ivec[i]==stop2)||(ivec[i]==stop3)&&(start_ind==1)&&(stop_ind==0))
        {
            gene = gene+ivec[i];
            gvec.push_back(gene);
            gvec.push_back("\n");
            gene = "";
            start_ind = 0;
            start_ind = 0;   
       
        }
    // if we doesnt come across any of the special cases mentioned above then.......
        else if ((start_ind == 1 )&& (stop_ind == 0))
        {
            gene = gene+ivec[i];
        }
       
    }
}
int main()
{
    string seq= "GAAGTGTTTTATCTGACTTACACCCCTGAAGATGTTGAAGGGAATGTTCAGCTGGAAACTGGAGATAAAATAAACTTTGTAATTGATAACAATAAACATACTGGTGCTGTAAGTGCTCGTAATATTATGCTGTTGAAAAAGAAACAAGCTCGCTATCAGGGAGTAGTTTGTGCCATGAAAGAGGCATTTGGCTTTATTGAAAGAGGCGATATTGTAAAGGAGATATTCTTTCACTATAGTGAATTTAAAGGTGACTTAGAATCCTTACAGCCTGGAGATGACGTGGAATTCACAATCAAGGACCGAAATGGTAAAGAAGTTGCAACAGATGTCAGACTATTGCCTCAAGGAACAGTCATTTTTGAAGATATCAGCATTGAACATTTTGAAGGAACTGTAACCAAAGTTATCCCCAAAGTACCCAGTAAAAACCAGAATGACCCATTGCCAGGACGCATCAAAGTTGATTTTGTGATTCCTAAAGAACTTCCCTTTGGAGACAAAGATACAAAATCCAAGGTGACGCTGTTGGAAGGTGACCACGTTAGGTTTAATATTTCAACAGACCGTCGTGACAAATTAGAACGAGCAACCAACATAGAAGTTCTATCAAATACATTTCAGTTCACTAATGAAGCCAGAGAGATGGGTGTAATTGCTGCCATGAGAGATGGTTTTGGTTTCATCAAGTGTGTGGATCGTGATGCTCGTATGTTCTTCCACTTCAGTGAAATTCTGGATGGGAACCAGCTTCATATTGCAGATGAAGTAGAGTTTACTGTGGTTCCTGATATGCTCTCTGCCCAAAGAAATCATGCTATTAGGATTAAAAAACTTCCCAAGGGCACGGTTTCGTTCCACTCCCATTCAGATCATCGTTTTCTGGGCACTGTAGAAAAAGAGGCCACTTTTTCGAATCCTAAAACCACTAGCCCAAATAAAGGCAAAGAAAAGGAGGCTGAGGATGGCATTATTGCTTATGATGATTGTGGGGTGAAACTGACTATTGCTTTTCAAGCCAAGGATGTGGAAGGATCTACTTCTCCTCAAATAGGAGACAAGGTTGAATTTAGTATTAGTGACAAACAGAGGCCTGGACAGCAGATTGCAACTTGTGTGCGGCTCTTAGGTCGTAATTCAAACTCCAAGAGGCTCTTGGGTTATGTGGCAACTTTGAAGGATAATTTTGGATTTATTGAAACAGCCAATCATGATAAGGAAATCTTTTTCCATTACAGTGAGTTCTCTGGTGATGTTGATAGCCTGGAACTGGGGGACATGGTTGAGTACAGCTTGTCCAAAGGAAAAGGCAACAAAGTCAGTGCAGAAAAAGTGAACAAAACACACTCAGTGAATGGCATTACTGAGGAAGCTGATCCCACCATCTACTCTGGTAAAGTCATTCGCCCCTTGAGGAGTGTTGATCCAACACAGAATGAGTACCAAGGAATGATTGAGATCGTGGACGAAGGGGATATGAAAGGTGAGGTCTATCCATTTGGCATAGTTGGGATGGCCAACAAAGGGGATTGCCTACAGAAAGGGGAGAGTGTCAAGTTCCAGTTGTGTGTCCTGGGCCAAAATGCACAGACTATGGCCTACAACATCACACCCCTGCGTAGGGCTACAGTGGAGTGTGTGAAAGATCAGTTTGGCTTCATTAACTATGAAGTAGGAGATAGCAAGAAGCTCTTTTTCCACGTGAAAGAAGTTCAGGATGGCATTGAGCTACAGGCAGGAGATGAGGTGGAATTCTCAGTGATTCTTAATCAGCGCACTGGCAAGTGCAGTGCTTGTAATGTTTGGCGAGTCTGCGAGGGCCCCAAGGCTGTTGCAGCTCCACGACCTGATAGGTTGGTCAATCGCTTGAAGAATATCACCCTGGATGATGCCAGTGCTCCTCGCCTAATGGTTCTTCGTCAGCCAAGGGGACCAGATAACTCAATGGGATTTGGTGCAGAAAGAAAGATCCGTCAAGCTGGTGTCATTGACTAACCACATCCACAAAGCACATCATTAATCCACTATGATCAAGTTGGGGGGATTCTGGTGAAGGGTTCTGAATATCTCTCTCTTCATCCCTCCCAAAATCTGGAATACTTATTCTATTGAGCTATTACACCAGTTTTAACACCTTCC";
    vector<string>ivec;
    vector<string>gvec;
    //calling the function for ORF finder :P
    for(int j=1; j<=6;j++)
    {

        cout<<"ORF"<<j<<":-"<<endl;
        get_codon(seq,ivec,j);
       
        cout<<endl;
        for(int i = 0;i<ivec.size();i++)
        {
            cout<<ivec[i]<<" ";
           
        }
        cout<<endl;
    //calling the codon find function :P
        codon_find(ivec,gvec);
        vector<string>::iterator iter;
        cout << "Genes :- "<< endl;   
        for(iter= gvec.begin();iter<gvec.end();iter++)
        {
            cout<<*iter;
           
        }   
        cout<<endl;
        ivec.clear();
        gvec.clear();
       
    }
    return (0);

}

No comments:

Post a Comment