I have the following t=5
DNA strings:
DNA = '''CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA
GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG
TAGTACCGAGACCGAAAGAAGTATACAGGCGT
TAGATCAAGTTTCAGGTGCACGTCGGTGAACC
AATCCACCAGCTCCACGTGCAATGTTGGCCTA'''
k = 8
t = 5
I'm trying to find the best motifs of length k=8
from the collection of strings using a laplace transform to randomly sample chunks of length k from each of the t strings.
My helper functions are as follows:
def window(s, k):
for i in range(1 + len(s) - k):
yield s[i:i+k]
def HammingDistance(seq1, seq2):
if len(seq1) != len(seq2):
raise ValueError('Undefined for sequences of unequal length.')
return sum(ch1 != ch2 for ch1, ch2 in zip(seq1, seq2))
def score(motifs):
score = 0
for i in range(len(motifs[0])):
motif = ''.join([motifs[j][i] for j in range(len(motifs))])
score += min([HammingDistance(motif, homogeneous*len(motif)) for homogeneous in 'ACGT'])
return score
def profile(motifs):
prof = []
for i in range(len(motifs[0])):
col = ''.join([motifs[j][i] for j in range(len(motifs))])
prof.append([float(col.count(nuc))/float(len(col)) for nuc in 'ACGT'])
return prof
def profile_most_probable_kmer(dna, k, prof):
dna = dna.splitlines()
nuc_loc = {nucleotide:index for index,nucleotide in enumerate('ACGT')}
motif_matrix = []
max_prob = [-1, None]
for i in range(len(dna)):
motif_matrix.append(max_prob)
for i in range(len(dna)):
for chunk in window(dna[i],K):
current_prob = 1
for j, nuc in enumerate(chunk):
current_prob*=prof[j][nuc_loc[nuc]]
if current_prob>motif_matrix[i][0]:
motif_matrix[i] = [current_prob,chunk]
return list(list(zip(*motif_matrix))[1])
def profile_with_pseudocounts(motifs):
prof = []
for i in range(len(motifs[0])):
col = ''.join([motifs[j][i] for j in range(len(motifs))])
prof.append([float(col.count(nuc)+1)/float(len(col)+4) for nuc in 'ACGT'])
return prof
from random import randint
def SampleMotifs(Dna,k,t):
Dna = Dna.splitlines()
BestMotifs = []
for line in Dna:
position = randint(0,len(line)-k)
BestMotifs.append(line[position:position+k])
return BestMotifs
def motifs_from_profile(profile, dna, k):
return [profile_most_probable_kmer(seq,k,profile) for seq in dna]
def randomized_motif_search(dna,k,t):
from random import randint
dna = dna.splitlines()
rand_ints = [randint(0,len(dna[0])-k)) for a in range(len(dna))]
motifs = [dna[i][r:r+k] for i,r in enumerate(rand_ints)]
best_score = [score(motifs), motifs]
while True:
current_profile = profile_with_pseudocounts(motifs)
motifs = motifs_from_profile(current_profile, dna, k)
current_score = score(motifs)
if current_score < best_score[0]:
best_score = [current_score, motifs]
else:
return best_score[1]
def Laplace(dna,k,t):
i = 0
LastMotifs = randomized_motif_search(dna,k,t)
while i < 1000:
try:
BestMotifs = randomized_motif_search(dna,k,t)
if score(BestMotifs)<score(LastMotifs):
LastMotifs = BestMotifs
except:
pass
i+=1
print(*LastMotifs)
The output I should be getting for this is :
TCTCGGGG
CCAAGGTG
TACAGGCG
TTCAGGTG
TCCACGTG
I get different outputs every time which I expect with a method with a random element, but it should be converging when I iterate 1000 times and only update my best motifs if the score is lower. The fact I had to put in an error handler in the laplace since I get an index when I call randomized_motif_search(dna,k,t)
tells me that might be the source of the problem. I've spent the last two days scouring the code to make sure everything is the right shape, but the fact I'm getting the wrong answer is more than just a little vexing. Any help would be greatly appreciated.
Aucun commentaire:
Enregistrer un commentaire