|
4 | 4 | import pandas as pd |
5 | 5 | from Bio.SeqUtils.ProtParam import ProteinAnalysis |
6 | 6 |
|
def _two_decimals(compute):
    """Return ``compute()`` formatted with two decimals, or ``"nan"`` on failure.

    Biopython raises ``ValueError`` (molecular weight) or ``KeyError``
    (instability index) for sequences containing non-standard residues.
    Emitting ``"nan"`` keeps one output row per input sequence without
    leaking the previous sequence's value into this row.
    """
    try:
        return "%0.2f" % compute()
    except (KeyError, ValueError):
        return "nan"


def _feature_rows(counts, descriptors, label):
    """Build the (svm, arff, csv) text rows for one sequence.

    counts      -- amino-acid counts, in the order Biopython returned them
    descriptors -- already-formatted strings appended to the csv row only
    label       -- value from the ``class`` column; ``1`` marks the positive class
    """
    positive = label == 1
    svm_row = (
        ("+1 " if positive else "-1 ")
        + ' '.join("{}:{}".format(idx, val) for idx, val in enumerate(counts))
        + "\n"
    )
    arff_row = (
        ' '.join("{},".format(val) for val in counts)
        + (" serk\n" if positive else " loc\n")
    )
    csv_row = (
        ''.join("{},".format(val) for val in counts)
        + ",".join(descriptors)
        + ","
        + (str(label) if positive else "0")
        + "\n"
    )
    return svm_row, arff_row, csv_row


def feature_extraction(outdir):
    """Extract protein sequence features and write them to *outdir*.

    Reads ``data/data.txt`` under the current working directory as a
    delimited table (``pandas.read_table``) with ``sequence`` and ``class``
    columns, analyses every sequence with Biopython's ``ProteinAnalysis``
    and writes three files into *outdir* (created if missing):

    * ``svm_out.txt``      -- ``+1``/``-1`` label followed by ``index:count`` pairs
    * ``weka_output.arff`` -- header lines unpickled from ``config/attrib``,
      then one comma-separated count row per sequence ending in ``serk``/``loc``
    * ``tain_DL.csv``      -- counts, aromaticity, the three secondary-structure
      fractions, isoelectric point, molecular weight, instability index and
      the class (``1`` or ``0``)

    Existing copies of those three files are overwritten; nothing else in
    *outdir* and nothing in the input ``data`` directory is removed.

    Raises:
        FileNotFoundError: if ``data/data.txt`` does not exist.
    """
    base_dir = os.getcwd()
    data_file = os.path.join(base_dir, "data", "data.txt")

    print('Reading data...')
    if not os.path.exists(data_file):
        # Fail here with a clear message instead of a NameError further down.
        raise FileNotFoundError("Input sequence file not found: " + data_file)
    data = pd.read_table(data_file)
    sequences = data['sequence'].tolist()
    labels = data['class'].tolist()

    # NOTE(review): pickle.load can execute arbitrary code; config/attrib must
    # come from a trusted source.
    with open(os.path.join(base_dir, "config", "attrib"), "rb") as attrib_file:
        arff_header = pickle.load(attrib_file)

    print('Clearing existing files...')
    os.makedirs(outdir, exist_ok=True)
    svm_path = os.path.join(outdir, "svm_out.txt")
    arff_path = os.path.join(outdir, "weka_output.arff")
    csv_path = os.path.join(outdir, "tain_DL.csv")

    # Mode "w" truncates stale output, so each run starts from empty files
    # without deleting anything else, and each file is opened only once.
    with open(svm_path, "w") as svm_file, \
            open(arff_path, "w") as arff_file, \
            open(csv_path, "w") as csv_file:
        arff_file.write("".join('{}\n'.format(line) for line in arff_header))

        for seq, label in zip(sequences, labels):
            analysis = ProteinAnalysis(seq)
            counts = list(analysis.count_amino_acids().values())
            fraction = analysis.secondary_structure_fraction()
            descriptors = [
                str(round(analysis.aromaticity(), 3)),
                str(round(fraction[0], 3)),
                str(round(fraction[1], 3)),
                str(round(fraction[2], 3)),
                str(round(analysis.isoelectric_point(), 3)),
                _two_decimals(analysis.molecular_weight),
                _two_decimals(analysis.instability_index),
            ]
            svm_row, arff_row, csv_row = _feature_rows(counts, descriptors, label)
            svm_file.write(svm_row)
            arff_file.write(arff_row)
            csv_file.write(csv_row)

    print("Feature extraction complete...")
    print("Extracted features are saved in " + outdir + "/ directory in .txt, .arff and .csv formats")
0 commit comments