ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GAlnExtend.cpp
(Generate patch)
# Line 701 | Line 701
701   }
702  
703  
704 < GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, const char* seqa, int a_len,
705 <        GVec<uint16>* amers[], const char* seqb, int b_len) {
706 < int bimin=GMAX(0,(b_len-a_len-6));
707 < GXSeedTable gx(a_len, b_len);
708 < GXBandSet* diagstrips=new GXBandSet(a_len, b_len); //set of overlapping 3-diagonal strips
709 < for (int bi=0;bi<=b_len-6;bi++) {
704 > GXBandSet* collectSeeds_R(GList<GXSeed>& seeds, GXSeqData& sd) {
705 > int bimin=GMAX(0,(sd.blen-sd.alen-6));
706 > GXSeedTable gx(sd.alen, sd.blen);
707 > GXBandSet* diagstrips=new GXBandSet(sd.alen, sd.blen); //set of overlapping 3-diagonal strips
708 > //for (int bi=0;bi<=b_len-6;bi++) {
709 > for (int bi=sd.blen-6;bi>=0;bi--) {
710     //for each 6-mer of seqb
711 <   uint16 bv = get6mer((char*)&seqb[bi]);
712 <   GVec<uint16>* alocs = amers[bv];
711 >   uint16 bv = get6mer((char*)&(sd.bseq[bi]));
712 >   GVec<uint16>* alocs = sd.amers[bv];
713     if (alocs==NULL) continue;
714     //extend each hit
715     for (int h=0;h<alocs->Count();h++) {
# Line 720 | Line 720
720             continue;
721           for (int i=0;i<6;i++)
722              gx.x(ai+i,bi+i)=1;
723 <         //see if we can extend to the right
724 <         int aix=ai+6;
725 <         int bix=bi+6;
723 >         //see if we can extend to the left
724 >         int aix=ai-1;
725 >         int bix=bi-1;
726           int len=6;
727 <         while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
727 >         while (bix>=0 && aix>=0 && sd.aseq[aix]==sd.bseq[bix]) {
728                   gx.x(aix,bix)=1;
729 <                 aix++;bix++;
729 >                 ai=aix;
730 >                 bi=bix;
731 >                 aix--;bix--;
732                   len++;
733                   }
734 <        /*if (len>22) {
735 <                //heuristics: very likely the best we can get
734 >        if (len>sd.amlen) {
735 >                //heuristics: likely the best we can get
736                  //quick match shortcut
737                  diagstrips->qmatch=new GXSeed(ai,bi,len);
738                  return diagstrips;
739 <                } */
739 >                }
740           if (bi<bimin && len<9) continue; //skip middle seeds that are not high scoring enough
741           GXSeed* newseed=new GXSeed(ai,bi,len);
742           seeds.Add(newseed);
743           diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
744           //special last resort terminal match to be used if no better alignment is there
745 <         if (ai<2 && bi+len>b_len-2 &&
745 >         if (ai<2 && bi+len>sd.blen-2 &&
746                   (!diagstrips->tmatch || diagstrips->tmatch->len<len)) diagstrips->tmatch=newseed;
747           }
748    }
# Line 751 | Line 753
753   return diagstrips;
754   }
755  
756 < GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, const char* seqa, int a_len,
755 <        GVec<uint16>* amers[], const char* seqb, int b_len) {
756 > GXBandSet* collectSeeds_L(GList<GXSeed>& seeds, GXSeqData& sd) {
757   //overlap of left (5') end of seqb
758   //hash the last 12 bases of seqa:
759 < int bimax=GMIN((a_len+2), (b_len-6));
759 > int bimax=GMIN((sd.alen+2), (sd.blen-6));
760   //gx.init(a_maxlen, b_maxlen);
761 < GXSeedTable gx(a_len, b_len);
762 < GXBandSet* diagstrips=new GXBandSet(a_len, b_len); //set of overlapping 3-diagonal strips
763 < for (int bi=0;bi<=b_len-6;bi++) {
761 > GXSeedTable gx(sd.alen, sd.blen);
762 > GXBandSet* diagstrips=new GXBandSet(sd.alen, sd.blen); //set of overlapping 3-diagonal strips
763 > for (int bi=0;bi<=sd.blen-6;bi++) {
764     //for each 6-mer of seqb
765 <   uint16 bv = get6mer((char*)&seqb[bi]);
766 <   GVec<uint16>* alocs = amers[bv];
765 >   uint16 bv = get6mer((char*) & (sd.bseq[bi]));
766 >   GVec<uint16>* alocs = sd.amers[bv];
767     if (alocs==NULL) continue;
768     //extend each hit
769     for (int h=0;h<alocs->Count();h++) {
# Line 777 | Line 778
778           int aix=ai+6;
779           int bix=bi+6;
780           int len=6;
781 <         while (bix<b_len && aix<a_len && seqa[aix]==seqb[bix]) {
781 >         while (bix<sd.blen && aix<sd.alen && sd.aseq[aix]==sd.bseq[bix]) {
782                   gx.x(aix,bix)=1;
783                   aix++;bix++;
784                   len++;
785                   }
786 <         /* if (len>22) {
786 >         if (len>sd.amlen) {
787                  //heuristics: very likely the best we can get
788                  //quick match shortcut
789                  diagstrips->qmatch=new GXSeed(ai,bi,len);
790                  return diagstrips;
791 <                } */
791 >                }
792           if (bi>bimax && len<9) continue; //skip middle seeds that are not high scoring enough
793           GXSeed* newseed=new GXSeed(ai,bi,len);
794           seeds.Add(newseed);
795           diagstrips->addSeed(newseed);//add it to all 3 adjacent diagonals
796       //special last resort terminal match to be used if no better alignment is there
797 <     if (bi<2 && ai+len>=a_len-1 &&
797 >     if (bi<2 && ai+len>=sd.alen-1 &&
798                   (!diagstrips->tmatch || diagstrips->tmatch->len<len))
799                     diagstrips->tmatch=newseed;
800       }
# Line 875 | Line 876
876    int retscore = 0;
877    int numdiffs = 0;
878    if (trim!=NULL && trim->type==galn_TrimLeft) {
879 +    //intent: trimming the left side
880      if (editscript)
881         ed_script_rev=new GXEditScript();
882  
883      int numdiffs_l = GXGreedyExtend(s_seq, s_alnstart, q_seq, q_alnstart, true, xdrop,
884                     reward, penalty, s_ext_l, q_ext_l, *gxmem, ed_script_rev);
885      //check this extension here and bail out if it's not a good extension
886 <    if (s_alnstart+1-s_ext_l>trim->boundary) {
886 >    if (s_ext_l+(trim->seedlen>>1) < trim->safelen && s_alnstart+1-s_ext_l>trim->boundary) {
887        delete ed_script_rev;
888        if (freeAlnMem) delete gxmem;
889        return NULL;
# Line 906 | Line 908
908      //check extension here and bail out if not a good right extension
909      //assuming s_max is really at the right end of s_seq
910      if (trim!=NULL && trim->type==galn_TrimRight &&
911 <           s_alnstart+s_ext_r<trim->boundary) {
911 >        s_ext_r+(trim->seedlen>>1) < trim->safelen &&
912 >            s_alnstart+s_ext_r<trim->boundary) {
913        delete ed_script_fwd;
914        if (freeAlnMem) delete gxmem;
915        return NULL;
# Line 961 | Line 964
964       reward, penalty, xdrop, gxmem, trim, editscript);
965   }
966  
967 < GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, GVec<uint16>* amers[],
968 <                 const char* seqb, int seqb_len, CGreedyAlignData* gxmem, int min_pid) {
967 > GXAlnInfo* match_RightEnd(GXSeqData& sd, CGreedyAlignData* gxmem,
968 >                 int min_pid) {
969    bool editscript=false;
970    #ifdef GDEBUG
971     editscript=true;
972 <   GMessage("==========> matching Right (3') end : %s\n", seqa);
972 >   GMessage("==========> matching Right (3') end : %s\n", sd.aseq);
973    #endif
974 <
972 <  CAlnTrim trimInfo(galn_TrimRight, seqb, seqb_len);
974 >  CAlnTrim trimInfo(galn_TrimRight, sd.bseq, sd.blen, sd.amlen);
975    GList<GXSeed> rseeds(true,true,false);
976 <  GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, amers, seqb, seqb_len);
976 >
977 >  GXBandSet* alnbands=collectSeeds_R(rseeds, sd);
978    GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
979    //did we find a shortcut?
980    if (alnbands->qmatch) {
# Line 1005 | Line 1008
1008      GXSeed& aseed=*anchor_seeds[i];
1009      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1010      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1011 <    GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1012 <                            seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1011 >    trimInfo.seedlen=aseed.len;
1012 >    GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1013 >                            sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
1014      if (alninfo && alninfo->pid>=min_pid &&
1015 <        trimInfo.validate(seqa_len, alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1015 >        trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1016               galns.AddIfNew(alninfo, true);
1017          else delete alninfo;
1018      }
1019    if (galns.Count()==0 && alnbands->tmatch) {
1020        //last resort seed
1021        GXSeed& aseed=*alnbands->tmatch;
1022 <      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1023 <      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1024 <      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1025 <                                seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1022 >      int halfseed=aseed.len>>1;
1023 >      int a1=aseed.a_ofs+halfseed+1;
1024 >      int a2=aseed.b_ofs+halfseed+1;
1025 >      trimInfo.seedlen=aseed.len;
1026 >      GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1027 >                                sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
1028        if (alninfo && alninfo->pid>=min_pid &&
1029 <           trimInfo.validate(seqa_len, alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1029 >           trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, alninfo->ql-1))
1030                   galns.AddIfNew(alninfo, true);
1031              else delete alninfo;
1032        }
# Line 1075 | Line 1081
1081        GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1082            bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1083        if (bestaln->gapinfo!=NULL) {
1084 <        bestaln->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1084 >        bestaln->gapinfo->printAlignment(stderr, sd.aseq, sd.alen, sd.bseq, sd.blen);
1085          }
1086      #endif
1087  
# Line 1084 | Line 1090
1090    else return NULL;
1091   }
1092  
1093 < GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, GVec<uint16>* amers[],
1088 <                 const char* seqb, int seqb_len, CGreedyAlignData* gxmem, int min_pid) {
1093 > GXAlnInfo* match_LeftEnd(GXSeqData& sd, CGreedyAlignData* gxmem, int min_pid) {
1094    bool editscript=false;
1095    #ifdef GDEBUG
1096     editscript=true;
1097 <   GMessage("==========> matching Left (5') end : %s\n", seqa);
1097 >   GMessage("==========> matching Left (5') end : %s\n", sd.aseq);
1098    #endif
1099 <  CAlnTrim trimInfo(galn_TrimLeft, seqb, seqb_len);
1099 >  CAlnTrim trimInfo(galn_TrimLeft, sd.bseq, sd.blen, sd.amlen);
1100    GList<GXSeed> rseeds(true,true,false);
1101 <  GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, amers, seqb, seqb_len);
1101 >  GXBandSet* alnbands=collectSeeds_L(rseeds, sd);
1102    GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
1103    if (alnbands->qmatch) {
1104      #ifdef GDEBUG
# Line 1126 | Line 1131
1131      GXSeed& aseed=*anchor_seeds[i];
1132      int a1=aseed.a_ofs+(aseed.len>>1)+1;
1133      int a2=aseed.b_ofs+(aseed.len>>1)+1;
1134 <    GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1135 <                            seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1134 >    trimInfo.seedlen=aseed.len;
1135 >    GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1136 >                            sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
1137      if (alninfo && alninfo->pid>=min_pid
1138 <           && trimInfo.validate(seqa_len, alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr))
1138 >           && trimInfo.validate(alninfo->sl, alninfo->sr,
1139 >                    alninfo->pid, sd.alen-alninfo->qr))
1140              galns.AddIfNew(alninfo, true);
1141         else delete alninfo;
1142      }
# Line 1138 | Line 1145
1145        GXSeed& aseed=*alnbands->tmatch;
1146        int a1=aseed.a_ofs+(aseed.len>>1)+1;
1147        int a2=aseed.b_ofs+(aseed.len>>1)+1;
1148 <      GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
1149 <                              seqb, a2, seqb_len, gxmem, &trimInfo, editscript);
1148 >      trimInfo.seedlen=aseed.len;
1149 >      GXAlnInfo* alninfo=GreedyAlignRegion(sd.aseq, a1, sd.alen,
1150 >                              sd.bseq, a2, sd.blen, gxmem, &trimInfo, editscript);
1151        if (alninfo && alninfo->pid>=min_pid &&
1152 <        trimInfo.validate(seqa_len, alninfo->sl, alninfo->sr, alninfo->pid, seqa_len-alninfo->qr))
1152 >        trimInfo.validate(alninfo->sl, alninfo->sr, alninfo->pid, sd.alen-alninfo->qr))
1153           galns.Add(alninfo);
1154        }
1155    /*
# Line 1166 | Line 1174
1174        GMessage("Best alignment: a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", bestaln->ql, bestaln->qr,
1175            bestaln->sl, bestaln->sr, bestaln->score, bestaln->pid);
1176        if (bestaln->gapinfo!=NULL) {
1177 <        bestaln->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
1177 >        bestaln->gapinfo->printAlignment(stderr, sd.aseq, sd.alen,
1178 >                                          sd.bseq, sd.blen);
1179          }
1180      #endif
1181      return bestaln;

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines