-
Notifications
You must be signed in to change notification settings - Fork 294
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
d54ce87
commit 37388fa
Showing
2 changed files
with
179 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
#include <stdio.h> | ||
#include <string.h> | ||
#include <stdlib.h> | ||
|
||
// Larger of two values. This is a function-like macro, so an argument may be
// evaluated more than once: pass only side-effect-free expressions.
#define MAX(a, b) ((a) > (b) ? (a) : (b))

// Largest of three values (used for the match / delete / insert candidates).
// Built from nested MAX expansions, so the same evaluation caveat applies.
#define MAX3(a, b, c) MAX(MAX((a), (b)), (c))
|
||
// Allocates a rows x cols matrix of ints as an array of row pointers.
//
// The cells are NOT initialized; the caller must write every cell before
// reading it.
//
// Returns NULL when rows or cols is not positive, or when any allocation
// fails. On failure, every row allocated so far is released, so nothing leaks.
//
// Ownership: the caller frees each row with free() and then the row-pointer
// array itself.
int** createMatrix(int rows, int cols) {
    if (rows <= 0 || cols <= 0) {
        return NULL;
    }

    int** matrix = malloc((size_t)rows * sizeof *matrix);
    if (matrix == NULL) {
        return NULL;
    }

    for (int i = 0; i < rows; i++) {
        matrix[i] = malloc((size_t)cols * sizeof *matrix[i]);
        if (matrix[i] == NULL) {
            // Roll back the rows that were already allocated.
            while (i-- > 0) {
                free(matrix[i]);
            }
            free(matrix);
            return NULL;
        }
    }
    return matrix;
}
|
||
// Reverses the first `len` characters of `text` in place.
// Replaces the non-standard strrev(), which ISO C and glibc do not provide.
static void reverseInPlace(char* text, int len) {
    for (int lo = 0, hi = len - 1; lo < hi; lo++, hi--) {
        char tmp = text[lo];
        text[lo] = text[hi];
        text[hi] = tmp;
    }
}

// Frees a score matrix of `rows` rows obtained from createMatrix.
static void freeScoreMatrix(int** matrix, int rows) {
    for (int i = 0; i < rows; i++) {
        free(matrix[i]);
    }
    free(matrix);
}

// Smith-Waterman local alignment of seq1 against seq2 with a linear gap cost.
//
// Parameters:
//   seq1, seq2       NUL-terminated sequences (not modified).
//   matchScore       added when two characters are equal.
//   mismatchPenalty  added when two characters differ (normally negative).
//   gapPenalty       added for each gap column (normally negative).
//
// Prints the best local score followed by the two aligned strings, using '-'
// for gaps. If an argument is NULL or memory cannot be allocated, a message is
// written to stderr and nothing else is printed.
void smithWaterman(char* seq1, char* seq2, int matchScore, int mismatchPenalty, int gapPenalty) {
    if (seq1 == NULL || seq2 == NULL) {
        fprintf(stderr, "smithWaterman: sequences must not be NULL\n");
        return;
    }

    int len1 = (int)strlen(seq1);
    int len2 = (int)strlen(seq2);

    // One extra row and column hold the zero border of the recurrence.
    int** scoreMatrix = createMatrix(len1 + 1, len2 + 1);
    if (scoreMatrix == NULL) {
        fprintf(stderr, "smithWaterman: failed to allocate the scoring matrix\n");
        return;
    }

    // Every traceback step consumes at least one character of seq1 or seq2,
    // so an alignment has at most len1 + len2 columns (plus the terminator).
    size_t capacity = (size_t)len1 + (size_t)len2 + 1;
    char* alignedSeq1 = malloc(capacity);
    char* alignedSeq2 = malloc(capacity);
    if (alignedSeq1 == NULL || alignedSeq2 == NULL) {
        fprintf(stderr, "smithWaterman: failed to allocate the alignment buffers\n");
        free(alignedSeq1);
        free(alignedSeq2);
        freeScoreMatrix(scoreMatrix, len1 + 1);
        return;
    }

    // Best cell seen so far; the traceback starts there.
    int maxScore = 0;
    int endRow = 0, endCol = 0;

    // Fill in the scoring matrix.
    for (int i = 0; i <= len1; i++) {
        for (int j = 0; j <= len2; j++) {
            if (i == 0 || j == 0) {
                scoreMatrix[i][j] = 0;
                continue;
            }

            int substitution = (seq1[i - 1] == seq2[j - 1]) ? matchScore : mismatchPenalty;
            int diagonal = scoreMatrix[i - 1][j - 1] + substitution;
            int up = scoreMatrix[i - 1][j] + gapPenalty;
            int left = scoreMatrix[i][j - 1] + gapPenalty;

            // Local alignment: a cell never drops below zero.
            int best = 0;
            if (diagonal > best) {
                best = diagonal;
            }
            if (up > best) {
                best = up;
            }
            if (left > best) {
                best = left;
            }
            scoreMatrix[i][j] = best;

            // Strict '>' keeps the first cell that reaches the maximum.
            if (best > maxScore) {
                maxScore = best;
                endRow = i;
                endCol = j;
            }
        }
    }

    // Output the maximum score.
    printf("Maximum Alignment Score: %d\n", maxScore);

    // Traceback: at each cell, follow the predecessor that actually produced
    // its score. A positive cell always equals one of the three candidates,
    // so the final branch is the "left" move.
    int length = 0;
    while (endRow > 0 && endCol > 0 && scoreMatrix[endRow][endCol] > 0) {
        int current = scoreMatrix[endRow][endCol];
        int substitution =
            (seq1[endRow - 1] == seq2[endCol - 1]) ? matchScore : mismatchPenalty;

        if (current == scoreMatrix[endRow - 1][endCol - 1] + substitution) {
            // Match or mismatch column.
            alignedSeq1[length] = seq1[endRow - 1];
            alignedSeq2[length] = seq2[endCol - 1];
            endRow--;
            endCol--;
        } else if (current == scoreMatrix[endRow - 1][endCol] + gapPenalty) {
            // Gap in seq2.
            alignedSeq1[length] = seq1[endRow - 1];
            alignedSeq2[length] = '-';
            endRow--;
        } else {
            // Gap in seq1.
            alignedSeq1[length] = '-';
            alignedSeq2[length] = seq2[endCol - 1];
            endCol--;
        }
        length++;
    }

    // The traceback ran end-to-start, so flip both strings.
    alignedSeq1[length] = '\0';
    alignedSeq2[length] = '\0';
    reverseInPlace(alignedSeq1, length);
    reverseInPlace(alignedSeq2, length);

    // Print the aligned sequences.
    printf("Aligned Sequence 1: %s\n", alignedSeq1);
    printf("Aligned Sequence 2: %s\n", alignedSeq2);

    free(alignedSeq1);
    free(alignedSeq2);
    freeScoreMatrix(scoreMatrix, len1 + 1);
}
|
||
// Demo driver: aligns two fixed sample sequences and prints the result.
int main(void) {
    // Scoring scheme used for the demonstration.
    enum { MATCH_SCORE = 2, MISMATCH_PENALTY = -1, GAP_PENALTY = -2 };

    char first[] = "ACACACTA";
    char second[] = "AGCACACA";

    printf("Sequence 1: %s\n", first);
    printf("Sequence 2: %s\n", second);

    // Run the local alignment; it prints the score and both aligned strings.
    smithWaterman(first, second, MATCH_SCORE, MISMATCH_PENALTY, GAP_PENALTY);

    return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
# Smith-Waterman Algorithm | ||
|
||
## Description | ||
|
||
The Smith-Waterman algorithm is a dynamic programming algorithm used for local sequence alignment. It finds the highest-scoring alignment between a contiguous region (substring) of one sequence and a contiguous region of the other. Unlike global alignment algorithms, which align the sequences end to end, Smith-Waterman focuses on finding the most similar region between two sequences, making it ideal for biological applications such as DNA or protein sequence matching.
|
||
### Problem Definition | ||
|
||
Given: | ||
- A sequence `seq1` of length `m` | ||
- A sequence `seq2` of length `n` | ||
|
||
Objective: | ||
- Find the best local alignment of `seq1` and `seq2` | ||
|
||
### Algorithm Overview | ||
|
||
1. **Initialization**: | ||
- Create a scoring matrix with dimensions `(m+1) x (n+1)` initialized to zeros. | ||
|
||
2. **Scoring**: | ||
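- For each cell in the matrix, take the maximum of four candidates: the diagonal neighbour plus the match/mismatch score, the upper neighbour plus the gap penalty, the left neighbour plus the gap penalty, and zero. The zero floor is what makes the alignment local.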
- Track the maximum score and its position. | ||
|
||
3. **Traceback**: | ||
- Start from the cell with the highest score and trace back through the predecessor cells, stopping at the first cell whose score is zero, to reconstruct the optimal local alignment.
|
||
### Key Features | ||
|
||
- Finds local alignments: only the best-matching regions of the two sequences are aligned, rather than the sequences from end to end.
- Uses a scoring system for matches, mismatches, and gaps. | ||
- Backtracks from the cell with the highest score to find the optimal alignment. | ||
- Time complexity: O(mn), where `m` and `n` are the lengths of the sequences. | ||
|
||
### Time Complexity | ||
|
||
- **Worst Case**: O(mn), where `m` is the length of `seq1` and `n` is the length of `seq2` | ||
|
||
### Space Complexity | ||
|
||
- O(mn), since a matrix of size `(m+1) x (n+1)` is required to store the scores. | ||
|
||
## Implementation | ||
|
||
The implementation in C demonstrates the Smith-Waterman algorithm for local sequence alignment. It includes: | ||
|
||
1. A helper function that allocates the scoring matrix.
2. The Smith-Waterman function to compute the alignment score and perform the traceback to find the aligned sequences. | ||
3. A demonstration of how to use the algorithm on two sample sequences. | ||
|
||
## Usage | ||
|
||
Compile the program and run it. The example in the `main` function demonstrates how the Smith-Waterman algorithm can be used to align two sequences. | ||
|
||
```bash | ||
gcc smith_waterman.c -o smith_waterman | ||
./smith_waterman | ||
``` | ||
|
||
## Limitations | ||
|
||
- The algorithm works for short to medium-length sequences but can be memory-intensive for very long sequences due to the need for a full scoring matrix. | ||
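- The current implementation uses a single score for every match, a single penalty for every mismatch, and a linear gap penalty (each gap position costs the same). These can be adjusted as needed for specific applications, for example by using a substitution matrix.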
|
||
## Extensions | ||
|
||
- The algorithm can be extended by using affine gap penalties, which distinguish between opening and extending a gap. | ||
- It can also be adapted for approximate matching in non-biological contexts, such as comparing textual or binary data. |