Skip to content

Commit 5b6e0cb

Browse files
committed
[issue-1] - added segment and residue score filters
1 parent ee9c514 commit 5b6e0cb

File tree

2 files changed

+34
-10
lines changed

2 files changed

+34
-10
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ python3.5 deepcoil.py [-h] -i FILE [-out_path DIR] [-pssm] [-pssm_path DIR]
4040
| **`-out_path`** | Directory where the predictions are saved. For each entry one file will be saved. |
4141
| **`-out_type`** | Output type. Either **'ascii'** (default), which will write a single file for each entry in the input, or **'h5'**, which will generate a single hdf5 file storing all predictions. |
4242
| **`-out_filename`** | Works with the **"-out_type h5"** option and specifies the hdf5 output filename. Overrides the **-out_path** if specified. |
43+
| **`-min_residue_score`** | Number from the range <0,1>. If passed, only sequences which have at least one residue with a greater score are returned. |
44+
| **`-min_segment_length`** | Number greater than 0. If passed, only sequences which have a segment of at least this many consecutive residues are returned. |
4345

4446
PSSM filenames should be based on the identifiers in the fasta file (only alphanumeric characters and '_'). For example if a fasta sequence is as follows:
4547
```

deepcoil.py

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
import argparse
2-
from Bio import SeqIO
32
import os
43
import sys
5-
import numpy as np
6-
from utils import enc_seq_onehot, enc_pssm, is_fasta, get_pssm_sequence, DeepCoil_Model, decode
7-
import keras.backend as K
4+
85
import h5py
6+
import keras.backend as K
7+
import numpy as np
8+
from Bio import SeqIO
99

10+
from utils import enc_seq_onehot, enc_pssm, is_fasta, get_pssm_sequence, DeepCoil_Model, decode, SegmentResultFilter, \
11+
ScoreResultFilter
1012

1113
# cx_freeze specific
1214
if getattr(sys, 'frozen', False):
@@ -41,13 +43,19 @@
4143
parser.add_argument('-skip_checks',
4244
action='store_true',
4345
help='Skips input verification saving some time. Use only if entirely sure or in the re-runs')
46+
parser.add_argument('-min_residue_score',
47+
default=None,
48+
help="minimum score to assign residue as part of coiled coil")
49+
parser.add_argument('-min_segment_length',
50+
default=None,
51+
help="minimum number of consecutive residues to ")
4452
args = parser.parse_args()
4553

4654
# Verify whether weights files are present
4755

4856
for i in range(1, 6):
4957
if not os.path.isfile('%s/weights/final_seq_%s.h5' % (my_loc, i)) and not os.path.isfile(
50-
'%s/weights/final_seq_pssm_%s.h5' % (my_loc, i)):
58+
'%s/weights/final_seq_pssm_%s.h5' % (my_loc, i)):
5159
print("Weight files for the DeepCoil model are not available.")
5260
exit()
5361

@@ -111,9 +119,10 @@
111119
try:
112120
parsed_pssm = np.genfromtxt(pssm_fn, skip_header=3, skip_footer=5, usecols=(i for i in range(2, 22)))
113121
if not parsed_pssm.shape[0] == len(seq):
114-
parsed_pssm = np.genfromtxt(pssm_fn, skip_header=3, skip_footer=3, usecols=(i for i in range(2, 22)))
115-
if not parsed_pssm.shape[0] == len(seq):
116-
raise ValueError
122+
parsed_pssm = np.genfromtxt(pssm_fn, skip_header=3, skip_footer=3,
123+
usecols=(i for i in range(2, 22)))
124+
if not parsed_pssm.shape[0] == len(seq):
125+
raise ValueError
117126
except ValueError:
118127
print("ERROR: Malformed PSSM file for entry %s!" % entry)
119128
exit()
@@ -156,7 +165,7 @@
156165
predictions = model.predict(enc_sequences, verbose=1)
157166
print()
158167
decoded_predictions = [decode(pred, encoded_seq) for pred, encoded_seq in
159-
zip(predictions, enc_sequences)]
168+
zip(predictions, enc_sequences)]
160169
for decoded_prediction, entry in zip(decoded_predictions, entries):
161170
if i == 1:
162171
ensemble_results[entry] = decoded_prediction
@@ -168,13 +177,26 @@
168177
for entry, seq in zip(entries, sequences):
169178
f = open('%s/%s.out' % (args.out_path, entry), 'w')
170179
final_results = np.average(ensemble_results[entry], axis=0)
180+
res_filter = None
181+
if args.min_residue_score:
182+
res_filter = ScoreResultFilter(final_results, args.min_residue_score)
183+
res_filter.write_results(entry, seq,
184+
os.path.join(args.out_path, 'residue_filter_{}'.format(args.min_residue_score)))
185+
if args.min_segment_length:
186+
seg_filter = SegmentResultFilter(final_results, args.min_segment_length, other_filter=res_filter)
187+
seg_filter.write_results(entry, seq,
188+
os.path.join(args.out_path, 'segment_filter_{}'.format(args.min_segment_length)))
171189
for aa, prob in zip(seq, final_results):
172190
f.write("%s %s\n" % (aa, "% .3f" % prob))
173191
f.close()
174192
elif args.out_type == 'h5':
175193
f = h5py.File(args.out_filename, 'w')
176194
for entry, seq in zip(entries, sequences):
177-
f.create_dataset(data=np.average(ensemble_results[entry], axis=0), name=entry)
195+
final_results = np.average(ensemble_results[entry], axis=0)
196+
can_pass = ScoreResultFilter(args.min_residue_score, final_results).is_correct and SegmentResultFilter(
197+
args.min_segment_length, final_results).is_correct
198+
if can_pass:
199+
f.create_dataset(data=final_results, axis=0, name=entry)
178200
f.close()
179201
print()
180202
print("Done!")

0 commit comments

Comments
 (0)