-
-
Notifications
You must be signed in to change notification settings - Fork 46.8k
added smith waterman algorithm #9001
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
4672878
44314e4
65b95a6
8e56b7e
d8a6bcb
fc58801
0662f69
892858a
37d7fed
0e199f7
2729a57
2a3e20a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
|
||
# https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm | ||
# Score constants | ||
""" | ||
Score constants used in the Smith-Waterman algorithm. Matches are given a positive | ||
score while mismatches are given a negative score. Gaps are also penalized. | ||
""" | ||
MATCH = 1 | ||
MISMATCH = -1 | ||
GAP = -2 | ||
|
||
|
||
def score_function(a: str, b: str) -> int: | ||
BAW2501 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
""" | ||
Calculate the score for a character pair based on whether they match or mismatch. | ||
Returns 1 if the characters match, -1 if they mismatch. | ||
>>> score_function('A', 'A') | ||
1 | ||
>>> score_function('A', 'C') | ||
-1 | ||
""" | ||
if a == b: | ||
return MATCH | ||
else: | ||
return MISMATCH | ||
|
||
|
||
def smith_waterman(query: str, subject: str) -> list[list[int]]: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you add optional parameters for the score constants in this function's header as well? Users will call this function specifically to run the algorithm, and currently there's no way for the user to pass in their desired score constants into this function. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i hate repeating code, but there seems to be no way around it when it comes to default params def smith_waterman( query: str, subject: str, match: int = 1, mismatch: int = -1, gap: int = -2, ) -> list[list[int]]: def score_function( source_char: str, target_char: str, match: int = 1, mismatch: int = -1, gap: int = -2, ) -> int: either way i got about it i'll have to unpack the kwargs in score function params or the body of the function and i think the params are a better way for clarity |
||
""" | ||
Perform the Smith-Waterman local sequence alignment algorithm. | ||
Returns a 2D list representing the score matrix. Each value in the matrix | ||
corresponds to the score of the best local alignment ending at that point. | ||
>>> smith_waterman('ACAC', 'CA') | ||
[[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]] | ||
BAW2501 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
""" | ||
|
||
# Initialize score matrix | ||
m = len(query) | ||
n = len(subject) | ||
score = [[0] * (n + 1) for _ in range(m + 1)] | ||
|
||
for i in range(1, m + 1): | ||
for j in range(1, n + 1): | ||
# Calculate scores for each cell | ||
match = score[i - 1][j - 1] + score_function(query[i - 1], subject[j - 1]) | ||
delete = score[i - 1][j] + GAP | ||
insert = score[i][j - 1] + GAP | ||
|
||
# Take maximum score | ||
score[i][j] = max(0, match, delete, insert) | ||
|
||
return score | ||
|
||
|
||
def traceback(score: list[list[int]], query: str, subject: str) -> str: | ||
r""" | ||
Perform traceback to find the optimal local alignment. | ||
Starts from the highest scoring cell in the matrix and traces back recursively | ||
until a 0 score is found. Returns the alignment strings. | ||
>>> traceback([[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]], 'ACAC', 'CA') | ||
'CAC\nCA-' | ||
BAW2501 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
""" | ||
|
||
# Traceback logic to find optimal alignment | ||
i = len(query) | ||
j = len(subject) | ||
BAW2501 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
align1 = "" | ||
align2 = "" | ||
|
||
while i > 0 and j > 0: | ||
if score[i][j] == score[i - 1][j - 1] + score_function( | ||
query[i - 1], subject[j - 1] | ||
): | ||
# optimal path is a diagonal take both letters | ||
align1 = query[i - 1] + align1 | ||
align2 = subject[j - 1] + align2 | ||
i -= 1 | ||
j -= 1 | ||
elif score[i][j] == score[i - 1][j] + GAP: | ||
# optimal path is a vertical | ||
align1 = query[i - 1] + align1 | ||
align2 = "-" + align2 | ||
i -= 1 | ||
else: | ||
# optimal path is a horizontal | ||
align1 = "-" + align1 | ||
align2 = subject[j - 1] + align2 | ||
j -= 1 | ||
|
||
return f'{align1}\n{align2}' | ||
|
||
|
||
if __name__ == "__main__": | ||
query = "HEAGAWGHEE" | ||
subject = "PAWHEAE" | ||
|
||
score = smith_waterman(query, subject) | ||
print(traceback(score, query, subject)) |
Uh oh!
There was an error while loading. Please reload this page.