Add existing file

jithin8mathew · jithin8mathew · commit 34f2ba8eb3ae · 2019-11-21T18:30:48.000-06:00
diff --git a/README.md b/README.md
@@ -39,13 +39,9 @@ pip3 install discere
 ## Usage
 
 ```
-from discere import discere
-```
+import discere
 
-Steps to run the code..
-1. Add fasta file containing positive sequences with the file name "positive_training.fasta" to the "data" folder
-2. Add fasta file containing negative sequences with the file name "negative_training.fasta" to the "data" folder
-3. Run ML.py script*
-4. The output will be saved in /data/output/ folder in three different file formats upon the successful execution of the code
+discere.extract_feature('positive_training.fasta', 'negative_training.fasta')
 
-*Although you could also run process_fasta.py and feature_extraction.py individually in the respective order
+```
+## Output
diff --git a/discere/config/attrib b/discere/config/attrib
diff --git a/discere/discere.py b/discere/discere.py
@@ -1,26 +1,26 @@
 import subprocess
 import sys
+from process_fasta import process_fasta
 
 system = sys.platform
 
-def fasta_process():
-	print("Processing fasta files....")
-	if system == 'linux':
-		subprocess.run(['python3','process_fasta.py'])
-	else:
-		subprocess.run(['python.exe','process_fasta.py'])
-
 def feat_ext():
 	print("Extracting features....")
 	if system == 'linux':
 		subprocess.run(['python3','feature_extraction.py'])
 	else:
 		subprocess.run(['python.exe','process_fasta.py'])
 
+def extract_feature(positive, negative):
+	code = process_fasta(positive, negative)
+	if code is True:
+		try:
+			feat_ext()
+		except Exception:
+			print('Failed to extract feautres... \n Code exiting with incomplete termination...')
+	else:print('Error processing the fasta files !')
+
 if __name__ == '__main__':
-	fasta_process()
-	feat_ext()
-	print("Feature extraction complete...")
-	print("Extracted features are saved in data/output directory in .txt, .arff and .csv formats")
+	extract_feature(positive, negative)
 	import os
 	print("Have a great day ",os.getlogin(), "!")
diff --git a/discere/feature_extraction.py b/discere/feature_extraction.py
@@ -7,28 +7,32 @@
 path_=os.getcwd()		# get the current working directory
 _=(os.path.join(path_,"data/data.txt"))		# path to input data (sequence) folder
 
+print('Reading data...')
 if os.path.exists(_):data=pd.read_table(_)		#read the file as Pandas DataFrame
+print('Clearing existing files...')
+try:[os.remove(filenames[0]+x) for filenames in os.walk(path_+'/data/') for x in (filenames[2])]		# remove the file if already exist
+except Exception:pass
 
 seq_list, cls_list=data['sequence'].tolist(), data['class'].tolist() # get the sequence and class to lists
 
-pth=(path_+"/data/output/")
+pth=(path_+"/output/")
 if not os.path.exists(pth):os.makedirs(pth)
 
 try:[os.remove(filenames[0]+x) for filenames in os.walk(pth) for x in (filenames[2])]		# remove the file if already exist
 except Exception:pass
 
-attr=open(path_+"/data/attrib","rb")
+attr=open(path_+"/config/attrib","rb")
 attr=pickle.load(attr)		# load the pickle file with attribue names (for weka)
 with open(pth+"/weka_output.arff","a+") as wk: wk.write("".join('{}\n'.format(x) for x in attr))
 
 def format_output(aa_count,cnt):				 # write the extracted feature values to arff (weka), txt(svm) and csv file
 	a=(dict(zip(it.count(), list(aa_count.values()))))
 	if cnt==1:
-		with open(pth+"/svm_out.txt","a+")as s: s.write("+1 "+' '.join("{}:{}".format(k, v) for k, v in a.items())+"\n")
+		with open(pth+"svm_out.txt","a+")as s: s.write("+1 "+' '.join("{}:{}".format(k, v) for k, v in a.items())+"\n")
 		with open(pth+"weka_output.arff","a+") as w: w.write(' '.join("{},".format(x) for x in list(aa_count.values()))+" serk\n")
 		with open(pth+"tain_DL.csv","a+") as DPL: DPL.write(''.join("{},".format(x) for x in list(aa_count.values()))+str(round(aromat,3))+","+str(round(fraction[0],3))+","+str(round(fraction[1],3))+","+str(round(fraction[2],3))+","+str(round(iso,3))+","+str(mol_w)+","+str(ins)+","+str(cnt)+"\n")
 	else:
-		with open(path_+"/data/output/svm_out.txt","a+")as s:s.write("-1 "+' '.join("{}:{}".format(k, v) for k, v in a.items())+"\n")
+		with open(path_+"svm_out.txt","a+")as s:s.write("-1 "+' '.join("{}:{}".format(k, v) for k, v in a.items())+"\n")
 		with open(pth+"weka_output.arff","a+") as w: w.write(' '.join("{},".format(x) for x in list(aa_count.values()))+" loc\n")
 		with open(pth+"tain_DL.csv","a+") as DPL: DPL.write(''.join("{},".format(x) for x in list(aa_count.values()))+str(round(aromat,3))+","+str(round(fraction[0],3))+","+str(round(fraction[1],3))+","+str(round(fraction[2],3))+","+str(round(iso,3))+","+str(mol_w)+","+str(ins)+","+"0"+"\n")
 
@@ -39,3 +43,5 @@ def format_output(aa_count,cnt):				 # write the extracted feature values to arf
 	try:mol_w, ins=("%0.2f" % _.molecular_weight()),("%0.2f" %_.instability_index())
 	except Exception:mol_w,ins= mol_w,ins	# aromaticity, sec_strucure_fraction, iso_electric point , molecular weight, instability index
 	format_output(aa_count,cl)
+print("Feature extraction complete...")
+print("Extracted features are saved in data/output directory in .txt, .arff and .csv formats")
diff --git a/discere/process_fasta.py b/discere/process_fasta.py
@@ -1,19 +1,29 @@
 from Bio import SeqIO
 import os
 
-path_=os.getcwd()  			# get the location of Current working directory
-_=(os.path.join(path_,"data/data.txt"))
+def process_fasta(positive, negative):
+	print("Processing fasta files....")
+	path_=os.getcwd()  			# get the location of Current working directory
+	print('Cleaning existing data...')
+	if os.path.isdir(path_+"/data") is True:
+		os.rmdir(path_+"/data")
+	os.mkdir(path_+"/data")
 
-if os.path.isfile(_):
-	try:os.remove("data/data.txt")		# Remove the data.txt file from previous run
-	except Exception:pass
+	_=(os.path.join(path_,"data/data.txt"))
 
-o_put=open("data/data.txt","a+")		# create and open a new data.txt file
-o_put.write("sequence"+"\t"+"class"+"\n")
-for record in SeqIO.parse("data/positive_training.fasta", "fasta"):		# Code to parse multiple sequences using Bio Python
-	o_put.write(str(record.seq)+"\t"+"1"+"\n")								# Positive sequence output
+	if os.path.isfile(_):
+		try:os.remove("data.txt")		# Remove the data.txt file from previous run
+		except Exception:pass
+	print('Generating intermediate files...')
+	os.chdir(os.getcwd())
+	o_put = open("data/data.txt","a+")		# create and open a new data.txt file
+	o_put.write("sequence"+"\t"+"class"+"\n")
+	for record in SeqIO.parse(positive, "fasta"):		# Code to parse multiple sequences using Bio Python
+		o_put.write(str(record.seq)+"\t"+"1"+"\n")								# Positive sequence output
 
-for record in SeqIO.parse("data/negative_training.fasta", "fasta"):		# Code to parse multiple sequences using Bio Python
-	o_put.write(str(record.seq)+"\t"+"0"+"\n")								# Negative sequence output
+	for record in SeqIO.parse(negative, "fasta"):		# Code to parse multiple sequences using Bio Python
+		o_put.write(str(record.seq)+"\t"+"0"+"\n")								# Negative sequence output
+	o_put.close()
+	return True
 
-o_put.close()
+#process_fasta('positive_training.fasta', 'negative_training.fasta')