Skip to content

Commit 34f2ba8

Browse files
committed
Add existing file
1 parent 47d46d4 commit 34f2ba8

File tree

5 files changed

+47
-35
lines changed

5 files changed

+47
-35
lines changed

README.md

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,9 @@ pip3 install discere
3939
## Usage
4040

4141
```
42-
from discere import discere
43-
```
42+
import discere
4443
45-
Steps to run the code..
46-
1. Add fasta file containing positive sequences with the file name "positive_training.fasta" to the "data" folder
47-
2. Add fasta file containing negative sequences with the file name "negative_training.fasta" to the "data" folder
48-
3. Run ML.py script*
49-
4. The output will be saved in /data/output/ folder in three different file formats upon the successful execution of the code
44+
discere.extract_feature('positive_training.fasta', 'negative_training.fasta')
5045
51-
*Although you could also run process_fasta.py and feature_extraction.py individually in the respective order
46+
```
47+
## Output

discere/config/attrib

630 Bytes
Binary file not shown.

discere/discere.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,26 @@
11
import subprocess
22
import sys
3+
from process_fasta import process_fasta
34

45
system = sys.platform
56

6-
def fasta_process():
7-
print("Processing fasta files....")
8-
if system == 'linux':
9-
subprocess.run(['python3','process_fasta.py'])
10-
else:
11-
subprocess.run(['python.exe','process_fasta.py'])
12-
137
def feat_ext():
148
print("Extracting features....")
159
if system == 'linux':
1610
subprocess.run(['python3','feature_extraction.py'])
1711
else:
1812
subprocess.run(['python.exe','process_fasta.py'])
1913

14+
def extract_feature(positive, negative):
15+
code = process_fasta(positive, negative)
16+
if code is True:
17+
try:
18+
feat_ext()
19+
except Exception:
20+
print('Failed to extract feautres... \n Code exiting with incomplete termination...')
21+
else:print('Error processing the fasta files !')
22+
2023
if __name__ == '__main__':
21-
fasta_process()
22-
feat_ext()
23-
print("Feature extraction complete...")
24-
print("Extracted features are saved in data/output directory in .txt, .arff and .csv formats")
24+
extract_feature(positive, negative)
2525
import os
2626
print("Have a great day ",os.getlogin(), "!")

discere/feature_extraction.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,28 +7,32 @@
77
path_=os.getcwd() # get the current working directory
88
_=(os.path.join(path_,"data/data.txt")) # path to input data (sequence) folder
99

10+
print('Reading data...')
1011
if os.path.exists(_):data=pd.read_table(_) #read the file as Pandas DataFrame
12+
print('Clearing existing files...')
13+
try:[os.remove(filenames[0]+x) for filenames in os.walk(path_+'/data/') for x in (filenames[2])] # remove the file if already exist
14+
except Exception:pass
1115

1216
seq_list, cls_list=data['sequence'].tolist(), data['class'].tolist() # get the sequence and class to lists
1317

14-
pth=(path_+"/data/output/")
18+
pth=(path_+"/output/")
1519
if not os.path.exists(pth):os.makedirs(pth)
1620

1721
try:[os.remove(filenames[0]+x) for filenames in os.walk(pth) for x in (filenames[2])] # remove the file if already exist
1822
except Exception:pass
1923

20-
attr=open(path_+"/data/attrib","rb")
24+
attr=open(path_+"/config/attrib","rb")
2125
attr=pickle.load(attr) # load the pickle file with attribute names (for weka)
2226
with open(pth+"/weka_output.arff","a+") as wk: wk.write("".join('{}\n'.format(x) for x in attr))
2327

2428
def format_output(aa_count,cnt): # write the extracted feature values to arff (weka), txt(svm) and csv file
2529
a=(dict(zip(it.count(), list(aa_count.values()))))
2630
if cnt==1:
27-
with open(pth+"/svm_out.txt","a+")as s: s.write("+1 "+' '.join("{}:{}".format(k, v) for k, v in a.items())+"\n")
31+
with open(pth+"svm_out.txt","a+")as s: s.write("+1 "+' '.join("{}:{}".format(k, v) for k, v in a.items())+"\n")
2832
with open(pth+"weka_output.arff","a+") as w: w.write(' '.join("{},".format(x) for x in list(aa_count.values()))+" serk\n")
2933
with open(pth+"tain_DL.csv","a+") as DPL: DPL.write(''.join("{},".format(x) for x in list(aa_count.values()))+str(round(aromat,3))+","+str(round(fraction[0],3))+","+str(round(fraction[1],3))+","+str(round(fraction[2],3))+","+str(round(iso,3))+","+str(mol_w)+","+str(ins)+","+str(cnt)+"\n")
3034
else:
31-
with open(path_+"/data/output/svm_out.txt","a+")as s:s.write("-1 "+' '.join("{}:{}".format(k, v) for k, v in a.items())+"\n")
35+
with open(path_+"svm_out.txt","a+")as s:s.write("-1 "+' '.join("{}:{}".format(k, v) for k, v in a.items())+"\n")
3236
with open(pth+"weka_output.arff","a+") as w: w.write(' '.join("{},".format(x) for x in list(aa_count.values()))+" loc\n")
3337
with open(pth+"tain_DL.csv","a+") as DPL: DPL.write(''.join("{},".format(x) for x in list(aa_count.values()))+str(round(aromat,3))+","+str(round(fraction[0],3))+","+str(round(fraction[1],3))+","+str(round(fraction[2],3))+","+str(round(iso,3))+","+str(mol_w)+","+str(ins)+","+"0"+"\n")
3438

@@ -39,3 +43,5 @@ def format_output(aa_count,cnt): # write the extracted feature values to arf
3943
try:mol_w, ins=("%0.2f" % _.molecular_weight()),("%0.2f" %_.instability_index())
4044
except Exception:mol_w,ins= mol_w,ins # aromaticity, sec_strucure_fraction, iso_electric point , molecular weight, instability index
4145
format_output(aa_count,cl)
46+
print("Feature extraction complete...")
47+
print("Extracted features are saved in data/output directory in .txt, .arff and .csv formats")

discere/process_fasta.py

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,29 @@
11
from Bio import SeqIO
22
import os
33

4-
path_=os.getcwd() # get the location of Current working directory
5-
_=(os.path.join(path_,"data/data.txt"))
4+
def process_fasta(positive, negative):
5+
print("Processing fasta files....")
6+
path_=os.getcwd() # get the location of Current working directory
7+
print('Cleaning existing data...')
8+
if os.path.isdir(path_+"/data") is True:
9+
os.rmdir(path_+"/data")
10+
os.mkdir(path_+"/data")
611

7-
if os.path.isfile(_):
8-
try:os.remove("data/data.txt") # Remove the data.txt file from previous run
9-
except Exception:pass
12+
_=(os.path.join(path_,"data/data.txt"))
1013

11-
o_put=open("data/data.txt","a+") # create and open a new data.txt file
12-
o_put.write("sequence"+"\t"+"class"+"\n")
13-
for record in SeqIO.parse("data/positive_training.fasta", "fasta"): # Code to parse multiple sequences using Bio Python
14-
o_put.write(str(record.seq)+"\t"+"1"+"\n") # Positive sequence output
14+
if os.path.isfile(_):
15+
try:os.remove("data.txt") # Remove the data.txt file from previous run
16+
except Exception:pass
17+
print('Generating intermediate files...')
18+
os.chdir(os.getcwd())
19+
o_put = open("data/data.txt","a+") # create and open a new data.txt file
20+
o_put.write("sequence"+"\t"+"class"+"\n")
21+
for record in SeqIO.parse(positive, "fasta"): # Code to parse multiple sequences using Bio Python
22+
o_put.write(str(record.seq)+"\t"+"1"+"\n") # Positive sequence output
1523

16-
for record in SeqIO.parse("data/negative_training.fasta", "fasta"): # Code to parse multiple sequences using Bio Python
17-
o_put.write(str(record.seq)+"\t"+"0"+"\n") # Negative sequence output
24+
for record in SeqIO.parse(negative, "fasta"): # Code to parse multiple sequences using Bio Python
25+
o_put.write(str(record.seq)+"\t"+"0"+"\n") # Negative sequence output
26+
o_put.close()
27+
return True
1828

19-
o_put.close()
29+
#process_fasta('positive_training.fasta', 'negative_training.fasta')

0 commit comments

Comments
 (0)