/*
 * Smith-Waterman local sequence alignment with a linear gap penalty.
 * Path in repository: String Algorithms/Smith-Waterman Algorithm/Program.c
 *
 * The program builds the (len1 + 1) x (len2 + 1) dynamic-programming score
 * matrix, prints the best local alignment score, and prints one optimal
 * local alignment recovered by traceback.
 */
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))
// Maximum of 3 integers (used for the match / insert / delete candidates).
// NOTE: arguments are evaluated more than once; pass side-effect-free values.
#define MAX3(a, b, c) (MAX(MAX(a, b), c))

// Release the first `rows` rows of `matrix`, then the row table itself.
// Accepts NULL so callers can use it unconditionally on cleanup paths.
static void freeMatrix(int **matrix, int rows) {
    if (matrix == NULL) {
        return;
    }
    for (int i = 0; i < rows; i++) {
        free(matrix[i]);
    }
    free(matrix);
}

// Reverse the first `len` characters of `text` in place.
// Replaces strrev(), which is not part of ISO C or POSIX.
static void reverseInPlace(char *text, size_t len) {
    if (len < 2) {
        return;
    }
    size_t lo = 0;
    size_t hi = len - 1;
    while (lo < hi) {
        char tmp = text[lo];
        text[lo++] = text[hi];
        text[hi--] = tmp;
    }
}

// Allocate a rows x cols matrix of int for the scoring matrix.
//
// Every cell is zero-initialized. Returns NULL when rows or cols is not
// positive or when any allocation fails (nothing is leaked in that case).
// The caller owns the result and must free each row and then the row table.
int **createMatrix(int rows, int cols) {
    if (rows <= 0 || cols <= 0) {
        return NULL;
    }

    int **matrix = malloc((size_t)rows * sizeof *matrix);
    if (matrix == NULL) {
        return NULL;
    }

    for (int i = 0; i < rows; i++) {
        matrix[i] = calloc((size_t)cols, sizeof *matrix[i]);
        if (matrix[i] == NULL) {
            // Only rows [0, i) were allocated; release exactly those.
            freeMatrix(matrix, i);
            return NULL;
        }
    }
    return matrix;
}

// Perform Smith-Waterman local sequence alignment of seq1 against seq2.
//
// seq1, seq2       NUL-terminated sequences to align (not modified).
// matchScore       score added when two characters are equal.
// mismatchPenalty  score added when two characters differ (usually negative).
// gapPenalty       score added for each gap character (usually negative).
//
// Prints the maximum alignment score and one optimal local alignment to
// stdout. On invalid input or allocation failure a message is written to
// stderr and nothing is printed to stdout.
void smithWaterman(const char *seq1, const char *seq2, int matchScore, int mismatchPenalty, int gapPenalty) {
    if (seq1 == NULL || seq2 == NULL) {
        fprintf(stderr, "smithWaterman: sequences must not be NULL\n");
        return;
    }

    size_t n1 = strlen(seq1);
    size_t n2 = strlen(seq2);
    // The matrix is indexed with int, and needs len + 1 rows/columns.
    if (n1 >= (size_t)INT_MAX || n2 >= (size_t)INT_MAX) {
        fprintf(stderr, "smithWaterman: sequence too long\n");
        return;
    }
    int len1 = (int)n1;
    int len2 = (int)n2;

    // Scoring matrix with an extra row and column of zeroes. Each traceback
    // step consumes at least one character of seq1 or seq2, so an alignment
    // never exceeds len1 + len2 columns.
    int **scoreMatrix = createMatrix(len1 + 1, len2 + 1);
    char *alignedSeq1 = malloc(n1 + n2 + 1);
    char *alignedSeq2 = malloc(n1 + n2 + 1);
    if (scoreMatrix == NULL || alignedSeq1 == NULL || alignedSeq2 == NULL) {
        fprintf(stderr, "smithWaterman: out of memory\n");
        goto cleanup;
    }

    // Best score seen so far and the cell where it occurs (traceback start).
    int maxScore = 0;
    int endRow = 0, endCol = 0;

    // Fill in the scoring matrix. Row 0 and column 0 stay at the zero value
    // createMatrix() gave them.
    for (int i = 1; i <= len1; i++) {
        for (int j = 1; j <= len2; j++) {
            int substitution = (seq1[i - 1] == seq2[j - 1]) ? matchScore : mismatchPenalty;
            int diag = scoreMatrix[i - 1][j - 1] + substitution;
            int up = scoreMatrix[i - 1][j] + gapPenalty;
            int left = scoreMatrix[i][j - 1] + gapPenalty;

            // Zero is a candidate so that a local alignment can restart here.
            int best = MAX3(0, diag, MAX(up, left));
            scoreMatrix[i][j] = best;

            if (best > maxScore) {
                maxScore = best;
                endRow = i;
                endCol = j;
            }
        }
    }

    // Output the maximum score
    printf("Maximum Alignment Score: %d\n", maxScore);

    // Traceback: at each cell, follow the predecessor whose score actually
    // produced the current value. Deciding by character equality alone would
    // misreport substitutions as gaps and could leave the optimal path.
    size_t alignedLen = 0;
    int row = endRow;
    int col = endCol;
    while (row > 0 && col > 0 && scoreMatrix[row][col] > 0) {
        int current = scoreMatrix[row][col];
        int substitution = (seq1[row - 1] == seq2[col - 1]) ? matchScore : mismatchPenalty;

        if (current == scoreMatrix[row - 1][col - 1] + substitution) {
            // Match or mismatch: consume one character from each sequence.
            alignedSeq1[alignedLen] = seq1[row - 1];
            alignedSeq2[alignedLen] = seq2[col - 1];
            row--;
            col--;
        } else if (current == scoreMatrix[row - 1][col] + gapPenalty) {
            // Gap in seq2.
            alignedSeq1[alignedLen] = seq1[row - 1];
            alignedSeq2[alignedLen] = '-';
            row--;
        } else {
            // Gap in seq1: the only remaining source of a positive score.
            alignedSeq1[alignedLen] = '-';
            alignedSeq2[alignedLen] = seq2[col - 1];
            col--;
        }
        alignedLen++;
    }

    // The alignment was collected end-to-start; terminate and reverse it.
    alignedSeq1[alignedLen] = '\0';
    alignedSeq2[alignedLen] = '\0';
    reverseInPlace(alignedSeq1, alignedLen);
    reverseInPlace(alignedSeq2, alignedLen);

    // Print the aligned sequences
    printf("Aligned Sequence 1: %s\n", alignedSeq1);
    printf("Aligned Sequence 2: %s\n", alignedSeq2);

cleanup:
    free(alignedSeq2);
    free(alignedSeq1);
    freeMatrix(scoreMatrix, len1 + 1);
}

// Demonstrates the algorithm on two sample sequences.
int main(void) {
    char seq1[] = "ACACACTA";
    char seq2[] = "AGCACACA";
    int matchScore = 2;
    int mismatchPenalty = -1;
    int gapPenalty = -2;

    printf("Sequence 1: %s\n", seq1);
    printf("Sequence 2: %s\n", seq2);

    // Call the Smith-Waterman function
    smithWaterman(seq1, seq2, matchScore, mismatchPenalty, gapPenalty);

    return 0;
}
Algorithm/README.md b/String Algorithms/Smith-Waterman Algorithm/README.md
new file mode 100644
index 00000000..99300f84
--- /dev/null
+++ b/String Algorithms/Smith-Waterman Algorithm/README.md
@@ -0,0 +1,68 @@
+# Smith-Waterman Algorithm
+
+## Description
+
+The Smith-Waterman algorithm is a dynamic programming algorithm used for local sequence alignment. It finds the optimal alignment between a substring of one sequence and a substring of another. Unlike global alignment algorithms, Smith-Waterman focuses on finding the most similar region between two sequences, making it ideal for biological applications such as DNA or protein sequence matching.
+
+### Problem Definition
+
+Given:
+- A sequence `seq1` of length `m`
+- A sequence `seq2` of length `n`
+
+Objective:
+- Find the best local alignment of `seq1` and `seq2`
+
+### Algorithm Overview
+
+1. **Initialization**:
+   - Create a scoring matrix with dimensions `(m+1) x (n+1)` initialized to zeros.
+
+2. **Scoring**:
+   - For each cell in the matrix, calculate the score based on the match/mismatch and gap penalties.
+   - Track the maximum score and its position.
+
+3. **Traceback**:
+   - Start from the cell with the highest score and trace back to reconstruct the optimal local alignment.
+
+### Key Features
+
+- Finds local alignments: scores are floored at zero, so an alignment may start and end anywhere within either sequence.
+- Uses a scoring system for matches, mismatches, and gaps.
+- Backtracks from the cell with the highest score to find the optimal alignment.
+- Time complexity: O(mn), where `m` and `n` are the lengths of the sequences.
+
+### Time Complexity
+
+- **Worst Case**: O(mn), where `m` is the length of `seq1` and `n` is the length of `seq2`
+
+### Space Complexity
+
+- O(mn), since a matrix of size `(m+1) x (n+1)` is required to store the scores.
+
+## Implementation
+
+The implementation in C demonstrates the Smith-Waterman algorithm for local sequence alignment. It includes:
+
+1. A function to allocate the scoring matrix.
+2. The Smith-Waterman function to fill the scoring matrix, compute the alignment score, and perform the traceback to find the aligned sequences.
+3. A demonstration of how to use the algorithm on two sample sequences.
+
+## Usage
+
+Compile the program and run it. The example in the `main` function demonstrates how the Smith-Waterman algorithm can be used to align two sequences.
+
+```bash
+gcc Program.c -o smith_waterman
+./smith_waterman
+```
+
+## Limitations
+
+- The algorithm works for short to medium-length sequences but can be memory-intensive for very long sequences due to the need for a full scoring matrix.
+- The current implementation uses a single mismatch penalty and a single (linear) gap penalty. These can be adjusted as needed for specific applications.
+
+## Extensions
+
+- The algorithm can be extended by using affine gap penalties, which distinguish between opening and extending a gap.
+- It can also be adapted for approximate matching in non-biological contexts, such as comparing textual or binary data.
\ No newline at end of file