|
| 1 | +import os |
| 2 | +import argparse |
| 3 | +from Bio import SeqIO |
| 4 | +from deepcoil import DeepCoil |
| 5 | +from deepcoil.utils import is_fasta, sharpen_preds, plot_preds |
| 6 | + |
| 7 | + |
def main():
    """Run the DeepCoil command-line workflow.

    Parses the command-line arguments, validates the input fasta file and
    the output directory, runs the prediction and writes one tab-separated
    ``<id>.out`` file per predicted sequence into the output directory.
    With ``--plot`` an ``<id>.png`` image is additionally produced for every
    predicted sequence.

    Raises:
        SystemExit: With exit status 1 and an ``ERROR: ...`` message on
            standard error when the input file is missing or rejected by
            ``is_fasta``, when the output directory does not exist, or when
            the sequence identifiers are not unique after sanitisation.
            ``argparse`` itself also exits on invalid command-line
            arguments.
    """
    args = _build_parser().parse_args()

    # Validation failures must terminate with a non-zero status so that
    # shell pipelines and workflow managers can detect them; raising
    # SystemExit with a string prints it to stderr and exits with status 1.
    if not os.path.isfile(args.i):
        raise SystemExit('ERROR: Input file does not exist!')
    if not is_fasta(args.i):
        raise SystemExit('ERROR: Malformed fasta file. Please check input!')
    if not os.path.isdir(args.out_path):
        raise SystemExit('ERROR: Output directory does not exist!')

    # Identifiers become output file names, so they are reduced to
    # alphanumerics and underscores. Two identifiers that collapse to the
    # same sanitised key would overwrite each other in the dict, which is
    # what the length comparison below detects.
    raw_data = list(SeqIO.parse(args.i, "fasta"))
    data = {_sanitize_id(entry.id): str(entry.seq) for entry in raw_data}
    if len(data) != len(raw_data):
        raise SystemExit('ERROR: Sequence identifiers in the fasta file are not unique!')

    print("Loading DeepCoil model...")
    dc = DeepCoil(use_gpu=args.gpu, n_cpu=args.n_cpu)

    print('Predicting...')
    preds = dc.predict(data)

    print('Writing output...')

    inp_keys = set(data.keys())
    out_keys = set(preds.keys())

    # Fewer results than inputs means some sequences were not predicted.
    if len(out_keys) < len(inp_keys):
        print('WARNING: Predictions for some sequences were not calculated due to length limitations and/or other errors.'
              ' Inspect the warnings and results carefully!')

    for entry in out_keys:
        out_file = os.path.join(args.out_path, f'{entry}.out')
        _write_prediction_table(out_file, data[entry], preds[entry])

    if args.plot:
        for entry in out_keys:
            plot_preds(preds[entry],
                       out_file=os.path.join(args.out_path, f'{entry}.png'),
                       dpi=args.dpi)
    print("Done!")


def _build_parser():
    """Return the argument parser describing the DeepCoil command line."""
    parser = argparse.ArgumentParser(description='DeepCoil')
    parser.add_argument('-i',
                        help='Input file with sequence in fasta format.',
                        required=True,
                        metavar='FILE')
    parser.add_argument('-out_path',
                        help='Output directory',
                        default='.',
                        metavar='DIR')
    parser.add_argument('-n_cpu',
                        help='Number of CPUs to use in the prediction',
                        default=-1,
                        type=int,
                        metavar='NCPU')
    parser.add_argument('--gpu',
                        help='Use GPU. This option overrides -n_cpu option',
                        action='store_true')
    parser.add_argument('--plot',
                        help='Plot predictions. Images will be stored in the path defined by the -out_path',
                        action='store_true')
    parser.add_argument('--dpi',
                        help='DPI of the produced images',
                        default=300,
                        type=int,
                        metavar='DPI')
    return parser


def _sanitize_id(identifier):
    """Return *identifier* with everything but alphanumerics and ``_`` removed."""
    return ''.join(ch for ch in str(identifier) if ch.isalnum() or ch == '_')


def _write_prediction_table(out_file, sequence, pred):
    """Write the per-residue predictions of one sequence as a tab-separated file.

    Args:
        out_file: Path of the file to create (overwritten if it exists).
        sequence: The amino-acid sequence; one output row per residue.
        pred: Prediction entry for the sequence. Its ``'cc'`` item is
            written raw and after ``sharpen_preds``; columns 1 and 2 of its
            ``'hept'`` item are written as ``prob_a`` and ``prob_d``.
    """
    # Compute everything before the file is created so that a failure here
    # does not leave an empty output file behind.
    cc_pred_raw = pred['cc']
    cc_pred = sharpen_preds(cc_pred_raw)
    hept_pred = pred['hept']
    rows = zip(sequence, cc_pred, cc_pred_raw, hept_pred[:, 1], hept_pred[:, 2])

    # The context manager closes the handle even if a write raises.
    with open(out_file, 'w') as handle:
        handle.write('aa\tcc\traw_cc\tprob_a\tprob_d\n')
        for aa, cc_prob, cc_prob_raw, a_prob, d_prob in rows:
            handle.write(f'{aa}\t{float(cc_prob):.3f}\t{float(cc_prob_raw):.3f}'
                         f'\t{float(a_prob):.3f}\t{float(d_prob):.3f}\n')
| 85 | + |
0 commit comments