-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmatchToLibrary.py
More file actions
224 lines (167 loc) · 10 KB
/
matchToLibrary.py
File metadata and controls
224 lines (167 loc) · 10 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
import nwalign3 as nw
import os, subprocess, argparse
import numpy as np
import pandas as pd
from pandarallel import pandarallel
from tqdm import tqdm
tqdm.pandas()
pandarallel.initialize(progress_bar=True)
# gapPenalty1 = 0
# gapExtension1 = 0
# gapPenalty2 = -1
# gapExtension2 = 0
scoringMatrix = "NUC.4.4"
def getScorePvalue(nwScore, m, n, k=0.0097, l=0.5735, nwScoreScale=0.2773):
"""Ge pvalue of extreme value distribution which model's likelihood of achieveng a more extreme
alignment score for two sequences of length m and n. K and l are empirically determined.
nwScoreScale is a factor applied to scores of NUC4.4 matrices (from MATLAB nwalign)"""
u = np.log(k*m*n)/l
score = nwScore*nwScoreScale
return (1 - np.exp(-np.exp(-l*(score-u))))
def calcMinQScore(libRegion):
q_scores = [ord(x)-33 for x in libRegion]
return np.min(q_scores)
def getAlignment(seq1_in, seq2_in, gapPenalty, gapExtension,scoringMatrix="NUC.4.4"):
seq1_out, seq2_out = nw.global_align(seq1_in, seq2_in, gap_open=gapPenalty, gap_extend=gapExtension, matrix=scoringMatrix)
score = nw.score_alignment(seq1_out, seq2_out, gap_open=gapPenalty, gap_extend=gapExtension, matrix=scoringMatrix)
pvalue = getScorePvalue(score, len(seq1_in), len(seq2_in))
return seq1_out, seq2_out, score, pvalue
def get_hairpin(row, fiveprime_region ='GCTGTTGAAGGCTCGCGATGCACACGCTCTGGTACAAGGAA',
threeprime_regions=['AAGGCACTGGGCAATACGAGCTCAAGCCAGTCTCGCAGTCC',
'AAGGCGACTCCACTATAGTACCGTCGTCCGGTGGAGTCTGG'], debug=False,pValueCutoff=1e-3):
'''
Align read to two flanking regions and return region in between them.
Input:
row of dataframe containing fields for `sequence` and `phred`
Output:
libregion: detected region between two flanking regions (str)
phred_libregion: phred of detected region
which_threeprime_ind: which threeprime region was aligned
If sequence fails to align: returns NaN's for all 3
'''
sequence=row['sequence']
seq1a, seq1b, score1, pvalue1 = getAlignment(sequence, fiveprime_region, 0, 0)
# seq1a, seq1b = nw.global_align(sequence, fiveprime_region, gap_open=gapPenalty1, gap_extend=gapExtension1, matrix=scoringMatrix)
# score1 = nw.score_alignment(seq1a, seq1b, gap_open=gapPenalty1, gap_extend=gapExtension1, matrix=scoringMatrix)
# pvalue1 = getScorePvalue(score1, len(fiveprime_region), len(sequence))
if pvalue1 < pValueCutoff:
# Test first threeprime_region, if pval not below cutoff, try next one
seq2a, seq2b, score2_0, pvalue2 = getAlignment(sequence, threeprime_regions[0],-1,0)
seq2a, seq2b, score2_1, pvalue2 = getAlignment(sequence, threeprime_regions[1],-1,0)
if score2_0 > score2_1:
which_threeprime_ind=0
else:
which_threeprime_ind = 1
seq2a, seq2b, score2, pvalue2 = getAlignment(sequence, threeprime_regions[which_threeprime_ind],-1,0)
# seq2a, seq2b = nw.global_align(sequence, threeprime_region[0],gap_open=gapPenalty2, gap_extend=gapExtension2, matrix=scoringMatrix)
# score2 = nw.score_alignment(seq2a, seq2b, gap_open=gapPenalty2, gap_extend=gapExtension2, matrix=scoringMatrix)
# pvalue2 = getScorePvalue(score2, len(threeprime_region[0]), len(sequence))
if pvalue2 < pValueCutoff:
lib_start = seq1b.find('-')
lib_end = seq2b.find(threeprime_regions[which_threeprime_ind][0])
if debug:
print(seq1a)
print(seq1b)
print(seq2a)
print(seq2b)
print('')
print(score1, pvalue1, score2, pvalue2)
libregion = sequence[len(fiveprime_region):lib_end]
phred_libregion = row['phred'][len(fiveprime_region):lib_end]
return libregion, phred_libregion, which_threeprime_ind
else:
return np.nan, np.nan, np.nan
else:
return np.nan, np.nan, np.nan
def getLibraryRef(ex_seq, sort_lib_seqs, beam=1000, second_try = False, exact=False,
gapPenalty1 = -1, gapExtension1 = -1, pValueCutoff = 1e-6, debug=False):
if ex_seq=='':
return ''
idx = np.searchsorted(sort_lib_seqs, ex_seq)
if idx < len(sort_lib_seqs) and sort_lib_seqs[idx] == ex_seq:
return sort_lib_seqs[idx], 0
else:
if exact:
return np.nan, np.nan # only returning if exact match
else:
min_index = max(0, idx-beam)
max_index = min(len(sort_lib_seqs), idx+beam)
scores, pvalues = [],[]
for trial_seq in sort_lib_seqs[min_index:max_index]:
seq1a, seq1b = nw.global_align(ex_seq, trial_seq, gap_open=gapPenalty1, gap_extend=gapExtension1, matrix=scoringMatrix)
score1 = nw.score_alignment(seq1a, seq1b, gap_open=gapPenalty1, gap_extend=gapExtension1, matrix=scoringMatrix)
pvalue1 = getScorePvalue(score1, len(ex_seq), len(trial_seq))
if debug:
print(seq1a)
print(seq1b)
print(score1, pvalue1)
scores.append(score1)
pvalues.append(pvalue1)
winner = np.argmax(scores)
if pvalues[winner] < pValueCutoff:
return sort_lib_seqs[min_index:max_index][winner], pvalues[winner]
#else:
# if not second_try:
# return getLibraryRef(ex_seq, sort_lib_seqs, beam=100000, second_try = True)
else:
return np.nan, np.nan # didn't find a matching sequence
if __name__=='__main__':
parser = argparse.ArgumentParser()
parser.add_argument("cpseq", help="CPseq file containing paired-end reads.")
parser.add_argument("--library", action='store', help="CSV file containing library information. Must have column called 'RefSeq'.")
parser.add_argument("--exact", action='store_true', help="Only return sequences that are exact match to library RefSeqs.")
parser.add_argument("--OligoPValue", action='store', default=1e-3, help='P-value cutoff for aligning reads to fluor and quench oligo.')
parser.add_argument("--LibPValue", action='store', default=1e-6, help='P-value cutoff for aligning libregions to library.')
parser.add_argument('-o', action='store',help='name of output CPseq without the .CPseq extension')
parser.add_argument('--clusterAnnotation', action='store', help='truncated CPseq file with only clusterID and their variant annotation')
parser.add_argument('--variantCol', action='store', default='SEQID', help='str, header name of the column for variant annotation')
args = parser.parse_args()
if args.o is None:
args.o = 'annotated_output.csv'
library = pd.read_csv(args.library)
#clean RefSeq column
library['RefSeq'] = [x.upper().replace('U','T') for x in library['RefSeq']]
# ensure ref seqs are sorted
sort_lib_seqs = list(np.sort(list(library['RefSeq'])))
print('Read library')
if os.path.exists('%s_libregion_only.json.zip' % args.o):
df = pd.read_json('%s_libregion_only.json.zip' % args.o)
else:
df = pd.read_csv(args.cpseq, names=['clusterID','sequence','phred'], delimiter='\t')
df['clusterID'] = [x.replace('@','').replace(' 1:N:0:1','') for x in df['clusterID']]
print('Read CPseq in')
print(df.head())
print("Extracting variable region from between Cy3' and quench' regions....")
df[['libRegion', 'libRegionPhred', 'whichThreePrime']] = df.progress_apply(lambda row: get_hairpin(row, pValueCutoff=args.OligoPValue), axis=1, result_type='expand')
df.to_json('%s_libregion_only.json.zip' % args.o)
df.iloc[:1000].to_csv('%s_libregion_only_test.csv' % args.o,index=False)
n_passed_alignment = len(df.loc[~df['libRegion'].isna()])
n_threeprime_0 = len(df.loc[~df['libRegion'].isna()][df.whichThreePrime==0])
n_threeprime_1 = len(df.loc[~df['libRegion'].isna()][df.whichThreePrime==1])
print("%d/%d (%.2f) passed aligning to Cy3' quench' regions" % (n_passed_alignment, len(df), 100*n_passed_alignment/len(df)))
print("%d/%d (%.2f) were 3' region 0" % (n_threeprime_0, n_passed_alignment, 100*n_threeprime_0/n_passed_alignment))
print("%d/%d (%.2f) were 3' region 1" % (n_threeprime_1, n_passed_alignment, 100*n_threeprime_1/n_passed_alignment))
df = df.loc[~df['libRegion'].isna()]
df['len_libRegion'] = df.apply(lambda row: len(row['libRegion']), axis=1)
df = df.loc[df['len_libRegion']>0]
df['minQScore'] = df.parallel_apply(lambda row: calcMinQScore(row['libRegionPhred']), axis=1)
print('Len prior to minQscore filtering: ', len(df))
df = df.loc[df['minQScore']>=20]
print('Length with minQscore>20', len(df))
uniqueLibRegions = pd.DataFrame()
uniqueLibRegions['libRegion'] = df['libRegion'].unique()
print("Found %d unique libRegions to align" % len(uniqueLibRegions))
# match each libRegion to the most likely reference sequence from the library
print("Aligning library region to reference sequences and matching to most likely ref sequence ....")
uniqueLibRegions[['RefSeq','RefSeqPValue']] = uniqueLibRegions.parallel_apply(lambda row: getLibraryRef(row['libRegion'],sort_lib_seqs, exact=args.exact,pValueCutoff=args.LibPValue),axis=1, result_type='expand')
#merge the library data to the unique_libregion data
uniqueLibRegions = uniqueLibRegions.merge(library, on='RefSeq')
#merge the unique_libregion data to the original CPseq data
df = df.merge(uniqueLibRegions, on='libRegion')
n_passed_alignment2 = len(df.loc[~df['RefSeq'].isna()])
print("%d/%d (%.2f) passed aligning to library RefSeqs" % (n_passed_alignment2, n_passed_alignment, n_passed_alignment2/n_passed_alignment))
# remove libRegions that did not align to a ref seq
df= df.loc[~df['RefSeq'].isna()]
df.to_csv(args.o + 'CPseq', sep='\t', index=False)
if args.clusterAnnotation is not None:
df[['clusterID', args.variantCol]].to_csv(args.cluster_annotation, sep='/t', index=False)