/*------------------------------------------------------------- Copyright (C) 2000 Peter Clote. All Rights Reserved. Permission to use, copy, modify, and distribute this software and its documentation for NON-COMMERCIAL purposes and without fee is hereby granted provided that this copyright notice appears in all copies. THE AUTHOR AND PUBLISHER MAKE NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. THE AUTHORS AND PUBLISHER SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. -------------------------------------------------------------*/ /************************************************* Program: smithWaterman.c Peter Clote, 11 Oct 2000 Program for local sequence alignment, using the Smith-Waterman algorithm and assuming a LINEAR gap penalty. A traceback is used to determine the alignment, and is determined by choosing that direction, for which S(i-1,j-1)+sigma(a_i,b_j), S(i-1,j)+Delta and S(i,j-1)+Delta is maximum, i.e. for which _ | | H(i-1,j-1) + sigma(a_i,b_j) (diagonal) H(i,j) = MAX of | H(i-1,j) + delta (up) | H(i,j-1) + delta (left) | 0 | _ is a maximum. 1/16/12, modifications made by MMS for use in CS 560 - Enable the protein array size to be dynamic. - Replaced tabs with spaces. Additional input file notes -Skips first line of input protein files. -Assumes next line has integer with number of characters in sequence. *************************************************/ #include #include // character handling #include // def of RAND_MAX //#define N 1000 // size of protein arrays #define AA 20 // number of amino acids #define MAX2(x,y) ((x)<(y) ? (y) : (x)) #define MAX3(x,y,z) (MAX2(x,y)<(z) ? (z) : MAX2(x,y)) // function prototypes void error(char *); /** error handling */ int char2AA(char); char AA2char(int); main(int argc, char *argv[]) { // variable declarations FILE * in1, *in2, *pam; char ch; int temp; int i,j,tempi,tempj,x,y,diag,down,right,DELTA; int topskip,bottomskip; //char aout[N],bout[N]; char *aout, *bout; int Aend,Bend,Abegin,Bbegin; int max, Max, xMax, yMax; // Max is first found maximum in similarity matrix H // max is auxilliary to determine largest of // diag,down,right, xMax,yMax are h-coord of Max //short a[N], b[N]; //int h[N][N]; int numcharA, numcharB; // used to be N short *a, *b; int **h; int sim[AA][AA]; // PAM similarity matrix //short xTraceback[N][N], yTraceback[N][N]; short **xTraceback, **yTraceback; /**** Error handling for input file ****/ if (argc != 5) { fprintf(stderr,"%s protein1 protein2 PAM gapPenalty\n",argv[0]); exit(1); } /***** Initialization of input file and pair array **/ in1 = fopen(argv[1],"r"); in2 = fopen(argv[2],"r"); pam = fopen(argv[3],"r"); DELTA = atoi(argv[4]); /** read PAM250 similarity matrix **/ for (i=0;i Max){ Max=max; xMax=i; yMax=j; } } // end for loop } // output values to show traceback i=xMax; j=yMax; /* while ( i>0 && j>0 && h[i][j]>0){ printf("%d %d\n",i,j); //printf("%c %c\n",AA2char(a[i]),AA2char(b[j])); //printf("score %d\n",h[i][j]); // previous 2 lines for debugging tempi=i; tempj=j; i=xTraceback[tempi][tempj]; j=yTraceback[tempi][tempj]; } */ // initialize output arrays aout = malloc(a[0]); bout = malloc(b[0]); // reset to max point to do alignment i=xMax; j=yMax; x=y=0; topskip = bottomskip = 1; while (i>0 && j>0 && h[i][j] > 0){ if (topskip && bottomskip) { aout[x++]=AA2char(a[i]); bout[y++]=AA2char(b[j]); } else if (topskip) { aout[x++]='-'; bout[y++]=AA2char(b[j]); } else if (bottomskip) { aout[x++]=AA2char(a[i]); bout[y++]='-'; } topskip = (j>yTraceback[i][j]); bottomskip = (i>xTraceback[i][j]); tempi=i;tempj=j; i=xTraceback[tempi][tempj]; j=yTraceback[tempi][tempj]; } // print alignment printf("\n"); printf("\n"); for (i=x-1;i>=0;i--) { printf("%c",aout[i]); } printf("\n"); for (j=y-1;j>=0;j--) { printf("%c",bout[j]); } printf("\n"); printf("\n"); } void error(char * s) { fprintf(stderr,"%s\n",s); exit(1); } int char2AA(char ch){ switch (ch) { case 'C' : return 0; case 'G' : return 1; case 'P' : return 2; case 'S' : return 3; case 'A' : return 4; case 'T' : return 5; case 'D' : return 6; case 'E' : return 7; case 'N' : return 8; case 'Q' : return 9; case 'H' : return 10; case 'K' : return 11; case 'R' : return 12; case 'V' : return 13; case 'M' : return 14; case 'I' : return 15; case 'L' : return 16; case 'F' : return 17; case 'Y' : return 18; case 'W' : return 19; default : fprintf(stderr,"Nonstandard amino acid code: %d\n",ch); exit(1); } } char AA2char(int x){ switch (x) { case 0 : return 'C'; case 1 : return 'G'; case 2 : return 'P'; case 3 : return 'S'; case 4 : return 'A'; case 5 : return 'T'; case 6 : return 'D'; case 7 : return 'E'; case 8 : return 'N'; case 9 : return 'Q'; case 10: return 'H'; case 11: return 'K'; case 12: return 'R'; case 13: return 'V'; case 14: return 'M'; case 15: return 'I'; case 16: return 'L'; case 17: return 'F'; case 18: return 'Y'; case 19: return 'W'; default : fprintf(stderr,"Bad char: %d\n",x); error("Bad integer representation of AA"); } }