Skip to content

Commit 52ab15c

Browse files
committed
Add existing file
1 parent c5713e7 commit 52ab15c

File tree

11 files changed

+1110
-51
lines changed

11 files changed

+1110
-51
lines changed

MANIFEST.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
include *.txt

discere/discere.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
11
import subprocess
22
import sys
33
from process_fasta import process_fasta
4-
4+
from feature_extraction import feature_extraction
55
system = sys.platform
66

7-
def feat_ext():
    """Run ``feature_extraction.py`` in a child Python process.

    The script path is relative, so it is resolved against the current
    working directory of this process.

    Raises:
        subprocess.CalledProcessError: if the child exits with a non-zero
            status, so the caller's error handling can report the failure.
    """
    print("Extracting features....")
    # Use the interpreter that is running this module instead of guessing an
    # executable name per platform.  The previous non-Linux branch also
    # launched process_fasta.py, which never extracted any features.
    subprocess.run([sys.executable, 'feature_extraction.py'], check=True)
7+
# def feat_ext():
8+
# print("Extracting features....")
9+
# if system == 'linux':
10+
# subprocess.run(['python3','feature_extraction.py'])
11+
# else:
12+
# subprocess.run(['python.exe','process_fasta.py'])
1313

14-
def extract_feature(positive, negative):
14+
def extract_feature(positive, negative, outdir):
    """Pre-process the input sequences, then extract features into *outdir*.

    Args:
        positive: forwarded unchanged to ``process_fasta`` (presumably the
            positive-class FASTA input -- confirm against ``process_fasta``).
        negative: forwarded unchanged to ``process_fasta`` (presumably the
            negative-class FASTA input -- confirm against ``process_fasta``).
        outdir: directory handed to ``feature_extraction`` for its output.

    Returns:
        None.  Problems are reported on stdout rather than raised.
    """
    code = process_fasta(positive, negative)
    # Only the literal ``True`` counts as success, as in the original check.
    if code is not True:
        print('Error processing the fasta files !')
        return

    try:
        feature_extraction(outdir)
    except Exception as err:
        # Top-level boundary: keep the run from crashing, but show what went
        # wrong so the failure can be diagnosed.
        print('Failed to extract features... \n Code exiting with incomplete termination...')
        print('Reason: {!r}'.format(err))

discere/feature_extraction.py

Lines changed: 42 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -4,44 +4,45 @@
44
import pandas as pd
55
from Bio.SeqUtils.ProtParam import ProteinAnalysis
66

7-
# Feature-extraction script: reads <cwd>/data/data.txt, computes per-sequence
# protein descriptors with Biopython and appends them to SVM (.txt),
# Weka (.arff) and CSV files under <cwd>/output/.

def _remove_top_level_files(directory):
    """Delete the regular files directly inside *directory*, ignoring failures."""
    try:
        names = os.listdir(directory)
    except OSError:
        return  # missing or unreadable directory: nothing to clear
    for name in names:
        target = os.path.join(directory, name)
        if os.path.isfile(target):
            try:
                os.remove(target)
            except OSError:
                # Clearing is best effort; a leftover file must not abort the run.
                pass


def _formatted_or_nan(compute):
    """Return ``compute()`` formatted to two decimals, or ``"nan"`` if it fails."""
    try:
        return "%0.2f" % compute()
    except (ValueError, KeyError):
        # Biopython rejects residues it has no table entry for; emit an
        # explicit missing value instead of reusing another sequence's result.
        return "nan"


path_ = os.getcwd()  # every input/output location is relative to the working directory
data_file = os.path.join(path_, "data/data.txt")  # table with 'sequence' and 'class' columns

print('Reading data...')
if not os.path.exists(data_file):
    # Fail before anything is deleted; previously this surfaced later as a
    # NameError on ``data`` after the data directory had been emptied.
    raise FileNotFoundError('Input file not found: ' + data_file)
data = pd.read_table(data_file)  # read the file as a pandas DataFrame

print('Clearing existing files...')
_remove_top_level_files(os.path.join(path_, 'data'))

seq_list, cls_list = data['sequence'].tolist(), data['class'].tolist()

pth = path_ + "/output/"
os.makedirs(pth, exist_ok=True)
_remove_top_level_files(pth)  # rows are appended, so stale output must go first

# NOTE(review): pickle.load executes arbitrary code from the file; this is
# only acceptable while config/attrib is a trusted, project-shipped file.
with open(path_ + "/config/attrib", "rb") as attr_file:
    attr = pickle.load(attr_file)  # attribute (header) lines for the Weka file
with open(pth + "weka_output.arff", "a+") as wk:
    wk.write("".join('{}\n'.format(x) for x in attr))


def format_output(aa_count, cnt):
    """Append one sequence's features to the SVM, Weka and CSV output files.

    Args:
        aa_count: mapping of amino acid to count; values are written in the
            mapping's iteration order.
        cnt: class value; ``1`` is written as the positive class (``+1`` /
            ``serk``), anything else as the negative class (``-1`` / ``loc`` / ``0``).

    Reads the module-level ``aromat``, ``fraction``, ``iso``, ``mol_w`` and
    ``ins`` set by the main loop for the current sequence.
    """
    values = list(aa_count.values())
    positive = cnt == 1
    descriptors = [
        str(round(aromat, 3)),
        str(round(fraction[0], 3)),
        str(round(fraction[1], 3)),
        str(round(fraction[2], 3)),
        str(round(iso, 3)),
        str(mol_w),
        str(ins),
        str(cnt) if positive else "0",
    ]
    # Both classes go to the output directory; negative SVM rows used to be
    # written to path_ + "svm_out.txt", i.e. outside of it.
    with open(pth + "svm_out.txt", "a+") as s:
        s.write(("+1 " if positive else "-1 ")
                + ' '.join("{}:{}".format(k, v) for k, v in enumerate(values)) + "\n")
    with open(pth + "weka_output.arff", "a+") as w:
        w.write(' '.join("{},".format(x) for x in values)
                + (" serk\n" if positive else " loc\n"))
    with open(pth + "tain_DL.csv", "a+") as DPL:
        DPL.write(''.join("{},".format(x) for x in values) + ",".join(descriptors) + "\n")


for seq, cl in zip(seq_list, cls_list):  # main loop to extract the features
    analysis = ProteinAnalysis(seq)  # Biopython protein analysis
    aa_count = analysis.count_amino_acids()
    aromat = analysis.aromaticity()
    fraction = analysis.secondary_structure_fraction()
    iso = analysis.isoelectric_point()
    mol_w = _formatted_or_nan(analysis.molecular_weight)
    ins = _formatted_or_nan(analysis.instability_index)
    format_output(aa_count, cl)

print("Feature extraction complete...")
print("Extracted features are saved in " + pth + " directory in .txt, .arff and .csv formats")
7+
def _remove_top_level_files(directory):
    """Delete the regular files directly inside *directory*, ignoring failures."""
    try:
        names = os.listdir(directory)
    except OSError:
        return  # missing or unreadable directory: nothing to clear
    for name in names:
        target = os.path.join(directory, name)
        if os.path.isfile(target):
            try:
                os.remove(target)
            except OSError:
                # Clearing is best effort; a leftover file must not abort the run.
                pass


def _formatted_or_nan(compute):
    """Return ``compute()`` formatted to two decimals, or ``"nan"`` if it fails."""
    try:
        return "%0.2f" % compute()
    except (ValueError, KeyError):
        # Biopython rejects residues it has no table entry for; emit an
        # explicit missing value instead of reusing another sequence's result.
        return "nan"


def feature_extraction(outdir):
    """Compute per-sequence protein features and write them into *outdir*.

    Reads ``data/data.txt`` under the current working directory (a table with
    ``sequence`` and ``class`` columns), then deletes the files in ``data/``
    and any files already present in *outdir*.  For every sequence it writes
    one row to each of ``svm_out.txt``, ``weka_output.arff`` and
    ``tain_DL.csv`` inside *outdir*.  The Weka file starts with the header
    lines unpickled from ``config/attrib``.

    Args:
        outdir: output directory; created if it does not exist.  Existing
            files directly inside it are removed.

    Raises:
        FileNotFoundError: if ``data/data.txt`` or ``config/attrib`` is
            missing.  The data-file check happens before anything is deleted.
    """
    path_ = os.getcwd()  # every input location is relative to the working directory
    data_file = os.path.join(path_, "data/data.txt")

    print('Reading data...')
    if not os.path.exists(data_file):
        # Previously this surfaced later as a NameError on ``data``, after
        # the data directory had already been emptied.
        raise FileNotFoundError('Input file not found: ' + data_file)
    data = pd.read_table(data_file)

    print('Clearing existing files...')
    _remove_top_level_files(os.path.join(path_, 'data'))

    seq_list, cls_list = data['sequence'].tolist(), data['class'].tolist()

    os.makedirs(outdir, exist_ok=True)
    _remove_top_level_files(outdir)  # rows are appended, so stale output must go first

    # NOTE(review): pickle.load executes arbitrary code from the file; this is
    # only acceptable while config/attrib is a trusted, project-shipped file.
    with open(os.path.join(path_, "config", "attrib"), "rb") as attr_file:
        attr = pickle.load(attr_file)  # attribute (header) lines for the Weka file

    svm_path = os.path.join(outdir, "svm_out.txt")
    weka_path = os.path.join(outdir, "weka_output.arff")
    csv_path = os.path.join(outdir, "tain_DL.csv")

    # Open each output once instead of re-opening all three for every sequence.
    with open(svm_path, "a") as svm, open(weka_path, "a") as weka, open(csv_path, "a") as dl_csv:
        weka.write("".join('{}\n'.format(x) for x in attr))

        for seq, cl in zip(seq_list, cls_list):
            analysis = ProteinAnalysis(seq)  # Biopython protein analysis
            values = list(analysis.count_amino_acids().values())
            aromat = analysis.aromaticity()
            fraction = analysis.secondary_structure_fraction()
            iso = analysis.isoelectric_point()
            mol_w = _formatted_or_nan(analysis.molecular_weight)
            ins = _formatted_or_nan(analysis.instability_index)
            if "nan" in (mol_w, ins):
                print('Warning: molecular weight / instability index unavailable for a sequence; writing nan')

            # Class value 1 is the positive class; anything else is negative.
            positive = cl == 1
            descriptors = [
                str(round(aromat, 3)),
                str(round(fraction[0], 3)),
                str(round(fraction[1], 3)),
                str(round(fraction[2], 3)),
                str(round(iso, 3)),
                mol_w,
                ins,
                str(cl) if positive else "0",
            ]

            # Both classes go to *outdir*; negative SVM rows used to be written
            # to path_ + "svm_out.txt", i.e. outside the output directory.
            svm.write(("+1 " if positive else "-1 ")
                      + ' '.join("{}:{}".format(k, v) for k, v in enumerate(values)) + "\n")
            weka.write(' '.join("{},".format(x) for x in values)
                       + (" serk\n" if positive else " loc\n"))
            dl_csv.write(''.join("{},".format(x) for x in values) + ",".join(descriptors) + "\n")

    print("Feature extraction complete...")
    print("Extracted features are saved in {}/ directory in .txt, .arff and .csv formats".format(outdir))

0 commit comments

Comments
 (0)