Skip to content

Commit

Permalink
add bioinformatics
Browse files Browse the repository at this point in the history
  • Loading branch information
ynastt committed May 19, 2024
1 parent b1c9b42 commit 9b63348
Show file tree
Hide file tree
Showing 16 changed files with 1,628 additions and 0 deletions.
63 changes: 63 additions & 0 deletions bioinformatics/affine-gap-penalty-ynastt/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
[![Review Assignment Due Date](https://classroom.github.com/assets/deadline-readme-button-24ddc0f5d75046c5622901739e7c5dd533143b0c8e959d652212380cedb1ea36.svg)](https://classroom.github.com/a/BrjuW67c)
[![Open in Visual Studio Code](https://classroom.github.com/assets/open-in-vscode-718a45dd9cf7e7f842a935f5ebbe5719a5e09af4491e668f4dbf3b35d5cca122.svg)](https://classroom.github.com/online_ide?assignment_repo_id=12868164&assignment_repo_type=AssignmentRepo)
# bioinf_assignment4
Задача - реализовать модификацию алгоритма NW с аффинным штрафом за пропуски.

Инсерция или делеция - это существенно более редкое эволюционное событие, чем замена нуклеотида. При этом за одно такое событие может быть вставлен или удален протяженный участок ДНК. Поэтому последовательные гэпы, т.е. вставки или делеции, имеет смысл штрафовать с учетом того, что это одна мутация, а не несколько.

Поэтому в отличие от базового алгоритма Нидлмана-Вунша, в котором используется фиксированное значение штрафа за гэп, здесь функция принимает вид:
```python
gap = open + (length - 1) * extend
```
где open - штраф за открытие (первый гэп), length - число последовательных гэпов в одной строке, extend - штраф за продолжение гэпа (со второго до последнего).

Пример:

пусть за совпадение нуклеодитов мы получаем +5, за открытие гэпа -10, за продолжнение -1.
Тогда оптимальным будет следующее выравнивание:
```
ACG---T
ACGGCTT
4 * (+5) + 1 * (-10) + 2 * (-1) = 8
```
но не, например, такое:
```
AC-G-T-
ACGGCTT
4 * (+5) + 3 * (-10) + 0 * (-1) = -10
```

Для решения используем матрицы, соответствующие случаям (A, A), (A, -) и (-, A).

Инициализируем:
```
M(0,0) = 0
I(0, 0) = D(0, 0) = infinity
M(i, 0) = infinity
I(i, 0) = open + (i - 1) * extend
D(i, 0) = infinity
M(0, j) = infinity
I(0, j) = infinity
D(0, j) = open + (j - 1) * extend
infinity >= 2 * open + (n + m) * extend + 1
```

Заполняем:
```
Match/Mismatch (две буквы):   
M(i, j) = max(M(i - 1, j - 1) + score(i, j),
I(i - 1, j - 1) + score(i, j),
D(i - 1, j - 1) + score(i, j))
Insertion (буква и гэп):   
I(i, j) = max(I(i, j - 1) + extend,
M(i, j - 1) + open)
Deletion (гэп и буква):   
D(i, j) = max(D(i - 1, j) + extend,
M(i - 1, j) + open)
Result(i, j) = max(M(i, j), I(i, j), D(i, j))
```

Восстанавливаем выравнивание по указателям или рассчитывая его по пути от Result[-1][-1] к Result[0][0].
Empty file.
289 changes: 289 additions & 0 deletions bioinformatics/affine-gap-penalty-ynastt/src/nw_affine_gap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,289 @@
from typing import Callable, Tuple

DEBUG = False

# modified function for printing my RES table
def print_array(matrix: list):
print('result table:')
for row in matrix:
for element in row:
print(f"{element[0]}", end="") # modified for printing RES table
print()
print()

def score_fun(a: str,
b: str,
match_score: int = 5,
mismatch_score: int = -4) -> int:
return match_score if a == b else mismatch_score

def needleman_wunsch_affine(seq1: str,
seq2: str,
score_fun: Callable = score_fun,
gap_open: int = -10,
gap_extend: int = -1) -> Tuple[str, str, int]:
'''
Inputs:
seq1 - first sequence
seq2 - second sequence
score_fun - function that takes two characters and returns score
gap_open - gap open penalty
gap_extend - gap extend penalty
Outputs:
aln1 - first aligned sequence
aln2 - second aligned sequence
score - score of the alignment
'''

#infinity = 2 * gap_open + (n + m - 2) * gap_extend + 1
# infinity >= 2 * open + (n + m) * extend + 1
infinity = float('-inf')

# 1. Initialize matrices
n = len(seq1) + 1
m = len(seq2) + 1
# for (A, A)
M = [[0 for _ in range(m)] for _ in range(n)]
# for (A, -)
I = [[0 for _ in range(m)] for _ in range(n)]
# for (-, A)
D = [[0 for _ in range(m)] for _ in range(n)]

# RES - matrix of pairs [score, flag]
# flag can be set to:
# 0 - For match/mismatch,
# 1 - For insertion
# 2 - For deletion
RES = [[[0, 2] for _ in range(m)] for _ in range(n)]

I[0][0], D[0][0] = infinity, infinity
# RES[0][0] = [0, 2]

for i in range(1, n):
M[i][0] = infinity
I[i][0] = gap_open + (i - 1) * gap_extend
D[i][0] = infinity
RES[i][0] = [I[i][0], 2]

for j in range(1, m):
M[0][j] = infinity
I[0][j] = infinity
D[0][j] = gap_open + (j - 1) * gap_extend
RES[0][j] = [D[0][j], 1]

# 2. Fill matrices
# We assume that consecutive gaps on different sequences are not allowed
for i in range(1, n):
for j in range(1, m):
cost = score_fun(seq1[i - 1], seq2[j - 1])
# Match / Mismatch (A, A)
M[i][j] = max(
M[i - 1][j - 1],
I[i - 1][j - 1],
D[i - 1][j - 1]) + cost
# Insertion (A, -)
I[i][j] = max(
I[i][j - 1] + gap_extend,
M[i][j - 1] + gap_open)
# Deletion (-, A)
D[i][j] = max(
D[i - 1][j] + gap_extend,
M[i - 1][j] + gap_open)

res_score = max(M[i][j], I[i][j], D[i][j])
if res_score == D[i][j]:
RES[i][j] = [res_score, 2]
elif res_score == I[i][j]:
RES[i][j] = [res_score, 1]
elif res_score == M[i][j]:
RES[i][j] = [res_score, 0]

# print_array(RES)

# 3. Traceback
aln1 = ''
aln2 = ''
i = len(seq1)
j = len(seq2)
while i > 0 or j > 0:
flag = RES[i][j][1]
if flag == 0: # if match/mismatch
aln1 += seq1[i - 1]
aln2 += seq2[j - 1]
i -= 1
j -= 1
elif flag == 1: # if insertion
aln1 += "-"
aln2 += seq2[j - 1]
j -= 1
elif flag == 2: # if deletion
aln1 += seq1[i - 1]
aln2 += "-"
i -= 1

# print_array(RES)

# result square of table
score = RES[n - 1][m - 1]
print(f'result score: {score[0]}')
print(f'last action: {score[1]}') # 0 - For match/mismatch, # 1 - For insertion, # 2 - For deletion
return aln1[::-1], aln2[::-1], score[0]

def main():
aln1, aln2, score = needleman_wunsch_affine("ACGT", "TAGT", gap_open=-10, gap_extend=-1)
print(f'str 1: {aln1}')
print(f'str 2: {aln2}')
print(f'score: {score}')


if __name__ == "__main__":
main()

print("\nTest 1")
aln1, aln2, score = needleman_wunsch_affine("ACGT", "ACGT")
assert len(aln1) == len(aln2)
assert aln1 == "ACGT"
assert score == 20

print("\nTest 2")
aln1, aln2, score = needleman_wunsch_affine("ACG", "ACGT")
assert len(aln1) == len(aln2)
assert aln1 == "ACG-"
assert aln2 == "ACGT"
assert score == 5

print("\nTest 3")
aln1, aln2, score = needleman_wunsch_affine("ACGT", "ACG")
assert len(aln1) == len(aln2)
assert aln1 == "ACGT"
assert aln2 == "ACG-"
assert score == 5

print("\nTest 4")
aln1, aln2, score = needleman_wunsch_affine("ACAGT", "ACGT")
assert len(aln1) == len(aln2)
assert aln1 == "ACAGT"
assert aln2 == "AC-GT"
assert score == 10

print("\nTest 5")
aln1, aln2, score = needleman_wunsch_affine("ACGT", "ACAGT")
assert len(aln1) == len(aln2)
assert aln1 == "AC-GT"
assert aln2 == "ACAGT"
assert score == 10

print("\nTest 6")
aln1, aln2, score = needleman_wunsch_affine("CAGT", "ACAGT")
assert len(aln1) == len(aln2)
assert aln1 == "-CAGT"
assert aln2 == "ACAGT"
assert score == 10

print("\nTest 7")
aln1, aln2, score = needleman_wunsch_affine("ACAGT", "CAGT")
assert len(aln1) == len(aln2)
assert aln1 == "ACAGT"
assert aln2 == "-CAGT"
assert score == 10

print("\nTest 8")
aln1, aln2, score = needleman_wunsch_affine("ACGT", "A")
assert len(aln1) == len(aln2)
assert aln1 == "ACGT"
assert aln2 == "A---"
assert score == -7

print("\nTest 9")
aln1, aln2, score = needleman_wunsch_affine("ACGT", "")
assert len(aln1) == len(aln2)
assert aln1 == "ACGT"
assert aln2 == "----"
assert score == -13

print("\nTest 10")
aln1, aln2, score = needleman_wunsch_affine("A", "ACGT")
assert len(aln1) == len(aln2)
assert aln1 == "A---"
assert aln2 == "ACGT"
assert score == -7

print("\nTest 11")
aln1, aln2, score = needleman_wunsch_affine("", "ACGT")
assert len(aln1) == len(aln2)
assert aln1 == "----"
assert aln2 == "ACGT"
assert score == -13

print("\nTest 12")
aln1, aln2, score = needleman_wunsch_affine("", "")
assert aln1 == ""
assert aln2 == ""
assert score == 0

print("\nTest 13")
aln1, aln2, score = needleman_wunsch_affine("TACGT", "ATGT")
assert len(aln1) == len(aln2)
assert aln1 == "TACGT"
assert aln2 == "-ATGT"
assert score == 1

print("\nTest 14")
aln1, aln2, score = needleman_wunsch_affine("TACGT", "ACTGT")
assert len(aln1) == len(aln2)
assert aln1 == "TAC-GT"
assert aln2 == "-ACTGT"
assert score == 0

print("\nTest 15")
aln1, aln2, score = needleman_wunsch_affine("ACGT", "TAGTA")
assert len(aln1) == len(aln2)
assert aln1 == "ACGT-"
assert aln2 == "TAGTA"
assert score == -8

print("\nTest 16")
aln1, aln2, score = needleman_wunsch_affine("TAGTA", "ACGT")
assert len(aln1) == len(aln2)
assert aln1 == "TAGTA"
assert aln2 == "ACGT-"
assert score == -8

print("\nTest 17")
aln1, aln2, score = needleman_wunsch_affine("ACGT", "TAGT", gap_open=-1, gap_extend=0)
assert len(aln1) == len(aln2)
assert aln1 == "-ACGT"
assert aln2 == "TA-GT"
assert score == 13

print("\nTest 18")
aln1, aln2, score = needleman_wunsch_affine("TAGT", "ACGT", gap_open=10, gap_extend=10)
assert len(aln1) == len(aln2)
assert len(aln1) == 8
assert score == 80

print("\nTest 19")
aln1, aln2, score = needleman_wunsch_affine("GGAGCCAAGGTGAAGTTGTAGCAGTGTGTCC",
"GACTTGTGGAACCTCTGTCCTCCGAGCTCTC", gap_open=-5, gap_extend=-5)
assert len(aln1) == len(aln2)
assert len(aln1) == 36
assert score == 8

print("\nTest 20")
aln1, aln2, score = needleman_wunsch_affine("AAAAAAATTTTTTT", "TTTTTTTAAAAAAA", gap_open=-5, gap_extend=-5)
assert len(aln1) == len(aln2)
assert len(aln1) == 21
assert score == -35

print("\nTest 21")
aln1, aln2, score = needleman_wunsch_affine("ACGGCTT", "ACGT")
assert aln1 == "ACGGCTT"
assert aln2 == "ACG---T"
assert score == 8

print("\nTest 22")
aln1, aln2, score = needleman_wunsch_affine("ACGT", "TAGT")
assert len(aln1) == len(aln2)
assert aln1 == "ACGT"
assert aln2 == "TAGT"
assert score == 2
Loading

0 comments on commit 9b63348

Please sign in to comment.