Skip to content

Commit 120a966

Browse files
committed
Refactor feature sets for MEKA and WEKA
1 parent 33ef97a commit 120a966

File tree

4 files changed

+266
-83
lines changed

4 files changed

+266
-83
lines changed

swan-pipeline/src/main/java/de/fraunhofer/iem/swan/features/FeaturesHandler.java renamed to swan-pipeline/src/main/java/de/fraunhofer/iem/swan/features/FeatureSet.java

Lines changed: 95 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,12 @@
55
import de.fraunhofer.iem.swan.data.Method;
66
import de.fraunhofer.iem.swan.features.code.CodeFeatureHandler;
77
import de.fraunhofer.iem.swan.features.code.soot.SourceFileLoader;
8+
import de.fraunhofer.iem.swan.features.code.type.IFeature;
89
import de.fraunhofer.iem.swan.features.doc.DocFeatureHandler;
910
import de.fraunhofer.iem.swan.features.doc.manual.IDocFeature;
1011
import de.fraunhofer.iem.swan.features.doc.nlp.AnnotatedMethod;
11-
import de.fraunhofer.iem.swan.features.code.type.IFeature;
1212
import de.fraunhofer.iem.swan.io.dataset.SrmList;
13-
import de.fraunhofer.iem.swan.util.Util;
13+
import de.fraunhofer.iem.swan.model.ModelEvaluator;
1414
import weka.core.Attribute;
1515
import weka.core.DenseInstance;
1616
import weka.core.Instance;
@@ -19,66 +19,64 @@
1919
import java.util.*;
2020
import java.util.stream.Collectors;
2121

22-
/**
23-
* @author Oshando Johnson on 27.09.20
24-
*/
25-
public class FeaturesHandler {
22+
abstract class FeatureSet {
2623

27-
private Map<IFeature, Attribute> codeAttributes;
28-
private final HashMap<String, Integer> instanceMap;
29-
private final SwanOptions options;
30-
private SrmList trainData;
31-
private CodeFeatureHandler codeFeatureHandler;
32-
private SourceFileLoader testData;
33-
private DocFeatureHandler docFeatureHandler;
34-
private HashMap<String, Instances> instances;
24+
protected Map<IFeature, Attribute> codeAttributes;
25+
protected final HashMap<String, Integer> instanceMap;
26+
protected final SwanOptions options;
27+
protected SrmList trainData;
28+
protected CodeFeatureHandler codeFeatureHandler;
29+
protected SourceFileLoader testData;
30+
protected DocFeatureHandler docFeatureHandler;
31+
protected HashMap<String, Instances> instances;
32+
protected ModelEvaluator.Mode mode;
3533

3634
/**
3735
* Available feature sets:
3836
* CODE: source code features
3937
* DOC_MANUAL: Javadoc manual features
4038
* DOC_AUTO: Javadoc automatic (word embedding) features
4139
*/
42-
public enum FeatureSet {
40+
public enum Type {
    CODE("CODE"),
    DOC_AUTO("DOC-AUTO"),
    DOC_MANUAL("DOC-MANUAL");

    /** Textual label that {@link #getValue(String)} matches against. */
    private final String value;

    Type(String value) {
        this.value = value;
    }

    /**
     * Finds the first feature set type whose label contains the given text.
     * Matching is case-sensitive and constants are checked in declaration order.
     *
     * @param value text to look up, e.g. {@code "CODE"} or {@code "DOC-AUTO"}
     * @return the first type whose label contains {@code value}, or {@code null} when
     * {@code value} is {@code null}, blank, or matches no label
     */
    public static Type getValue(String value) {
        // Every string contains "", so without this guard an empty name would silently
        // resolve to CODE; a null name would throw a NullPointerException in contains().
        if (value == null || value.trim().isEmpty()) {
            return null;
        }

        for (Type featureSet : values()) {
            if (featureSet.value.contains(value)) {
                return featureSet;
            }
        }
        return null;
    }
}
6260

63-
public FeaturesHandler(SrmList trainData, SourceFileLoader testData, SwanOptions options) {
61+
/**
 * Stores the given train data, test data, options and evaluation mode, and
 * starts with empty instance maps.
 *
 * @param trainData data set kept as the training data
 * @param testData  source file loader kept as the test data
 * @param options   options object kept for later use
 * @param mode      model evaluator mode kept for later use
 */
public FeatureSet(SrmList trainData, SourceFileLoader testData, SwanOptions options, ModelEvaluator.Mode mode) {
    // Inputs handed in by the caller.
    this.trainData = trainData;
    this.testData = testData;
    this.options = options;
    this.mode = mode;

    // Internal state always starts out empty.
    this.instanceMap = new HashMap<>();
    this.instances = new HashMap<>();
}
7069

7170
/**
7271
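* Resolves the configured feature set names to {@link FeatureSet.Type} values, sets up the feature handlers for them (e.g. the code feature handler for CODE), and returns the resolved types.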
7372
*/
74-
public void createFeatures() {
73+
public List<FeatureSet.Type> initializeFeatures() {
7574

76-
List<FeaturesHandler.FeatureSet> featureSets = options.getFeatureSet().stream()
77-
.map(f -> FeaturesHandler.FeatureSet.getValue(f.toUpperCase()))
75+
List<FeatureSet.Type> featureSets = options.getFeatureSet().stream()
76+
.map(f -> FeatureSet.Type.getValue(f.toUpperCase()))
7877
.collect(Collectors.toList());
7978

80-
//Creat
81-
for (FeaturesHandler.FeatureSet featureSet : featureSets)
79+
for (FeatureSet.Type featureSet : featureSets)
8280
switch (featureSet) {
8381
case CODE:
8482
codeFeatureHandler = new CodeFeatureHandler(trainData.getClasspath(), testData.getClasspath());
@@ -98,38 +96,7 @@ public void createFeatures() {
9896
break;
9997
}
10098

101-
for (String category : options.getAllClasses()) {
102-
103-
//TRAIN
104-
//Create attributes for feature set
105-
ArrayList<Attribute> trainAttributes = createAttributes(getCategories(category), trainData.getMethods(), featureSets);
106-
107-
//Set attributes to the train instances.
108-
Instances trainInstances = createInstances(featureSets, Category.fromText(category), trainAttributes, trainData.getMethods(), category + "-train-instances");
109-
this.instances.put(category, trainInstances);
110-
Util.exportInstancesToArff(trainInstances);
111-
112-
//TEST
113-
ArrayList<Attribute> testAttributes = createAttributes(getCategories(category), testData.getMethods(), featureSets);
114-
115-
//Set attributes to the train instances.
116-
Instances testInstances = createInstances(featureSets, Category.fromText(category), testAttributes, testData.getMethods(), category + "-test-instances");
117-
//this.instances.put(category, trainInstances);
118-
Util.exportInstancesToArff(testInstances);
119-
}
120-
}
121-
122-
public HashSet<Category> getCategories(String cat) {
123-
124-
HashSet<Category> categories;
125-
126-
if (cat.contentEquals("authentication"))
127-
categories = new HashSet<>(Arrays.asList(Category.AUTHENTICATION_TO_HIGH,
128-
Category.AUTHENTICATION_TO_LOW, Category.AUTHENTICATION_NEUTRAL, Category.NONE));
129-
else
130-
categories = new HashSet<>(Arrays.asList(Category.fromText(cat), Category.NONE));
131-
132-
return categories;
99+
return featureSets;
133100
}
134101

135102
/**
@@ -139,11 +106,16 @@ public HashSet<Category> getCategories(String cat) {
139106
* @param methods list of training methods
140107
* @param featureSets classification mode
141108
*/
142-
public ArrayList<Attribute> createAttributes(Set<Category> categories, Set<Method> methods, List<FeaturesHandler.FeatureSet> featureSets) {
109+
public ArrayList<Attribute> createAttributes(Set<Category> categories, Set<Method> methods, List<FeatureSet.Type> featureSets) {
143110

144111
ArrayList<Attribute> attributes = new ArrayList<>();
112+
113+
// Add method signatures as id attribute
114+
Attribute idAttr = new Attribute("id", methods.stream().map(Method::getArffSafeSignature).collect(Collectors.toList()));
115+
attributes.add(idAttr);
116+
145117
//Create feature set and add to attributes
146-
for (FeaturesHandler.FeatureSet featureSet : featureSets)
118+
for (FeatureSet.Type featureSet : featureSets)
147119
switch (featureSet) {
148120

149121
case CODE:
@@ -155,14 +127,6 @@ public ArrayList<Attribute> createAttributes(Set<Category> categories, Set<Metho
155127
break;
156128
}
157129

158-
// Add method signatures as id attribute
159-
Attribute idAttr = new Attribute("id", methods.stream().map(Method::getArffSafeSignature).collect(Collectors.toList()));
160-
attributes.add(idAttr);
161-
162-
// Collect classes and add to attributes
163-
Attribute classAttr = new Attribute("class", categories.stream().map(Category::toString).collect(Collectors.toList()));
164-
attributes.add(classAttr);
165-
166130
return attributes;
167131
}
168132

@@ -204,7 +168,7 @@ public ArrayList<Attribute> addCodeAttributes(Set<Category> categories) {
204168
*
205169
* @param instanceSet feature set type for which documentation attributes are added
206170
*/
207-
public ArrayList<Attribute> addDocAttributes(FeaturesHandler.FeatureSet instanceSet) {
171+
public ArrayList<Attribute> addDocAttributes(FeatureSet.Type instanceSet) {
208172

209173
ArrayList<Attribute> attributes = new ArrayList<>();
210174

@@ -227,32 +191,33 @@ public ArrayList<Attribute> addDocAttributes(FeaturesHandler.FeatureSet instance
227191
return attributes;
228192
}
229193

230-
public Instances createInstances(List<FeaturesHandler.FeatureSet> featureSets, Category category, ArrayList<Attribute> attributes, Set<Method> methods, String name) {
194+
public Instances createInstances(List<Type> featureSets, ArrayList<Attribute> attributes,
195+
Set<Method> methods, Set<Category> categories, String name) {
231196

232197
Instances instances = new Instances(name, attributes, 0);
233-
instances.setClass(instances.attribute("class"));
234198

235-
for (FeaturesHandler.FeatureSet featureSet : featureSets)
199+
for (FeatureSet.Type featureSet : featureSets)
236200
switch (featureSet) {
237201
case CODE:
238-
instances.addAll(getCodeInstances(instances, methods, category, attributes));
202+
instances.addAll(getCodeInstances(instances, methods, categories, attributes));
239203
break;
240204
case DOC_MANUAL:
241205
case DOC_AUTO:
242-
instances.addAll(getDocInstances(instances, methods, category, featureSet, attributes));
206+
instances.addAll(getDocInstances(instances, methods, categories, featureSet, attributes));
243207
break;
244208
}
245209
return instances;
246210
}
247211

212+
248213
/**
249214
* Adds data for SWAN features to instance set.
250215
*
251216
* @param instances instance set
252217
* @param methods training set
253218
* @return instance set containing data from SWAN
254219
*/
255-
public ArrayList<Instance> getCodeInstances(Instances instances, Set<Method> methods, Category category, ArrayList<Attribute> attributes) {
220+
public ArrayList<Instance> getCodeInstances(Instances instances, Set<Method> methods, Set<Category> categories, ArrayList<Attribute> attributes) {
256221

257222
ArrayList<Instance> instanceList = new ArrayList<>();
258223

@@ -264,10 +229,34 @@ public ArrayList<Instance> getCodeInstances(Instances instances, Set<Method> met
264229
Instance inst = new DenseInstance(attributes.size());
265230
inst.setDataset(instances);
266231

267-
if (method.getSrm() != null || method.getCwe() != null)
268-
inst.setClassValue(getCategory(method, category));
232+
for (Category cat : categories) {
233+
if (cat.isAuthentication() && !method.getAuthSrm().isEmpty()) {
234+
235+
if (mode == ModelEvaluator.Mode.MEKA)
236+
inst.setValue(instances.attribute(cat.getId()), "1");
237+
else {
238+
for (Category auth : method.getAuthSrm()) {
239+
switch (auth) {
240+
case AUTHENTICATION_TO_LOW:
241+
inst.setValue(instances.attribute(cat.getId()), "1");
242+
break;
243+
case AUTHENTICATION_NEUTRAL:
244+
inst.setValue(instances.attribute(cat.getId()), "2");
245+
break;
246+
case AUTHENTICATION_TO_HIGH:
247+
inst.setValue(instances.attribute(cat.getId()), "3");
248+
break;
249+
}
250+
}
251+
}
252+
} else if (method.getAllCategories().contains(cat)) {
253+
inst.setValue(instances.attribute(cat.getId()), "1");
254+
} else
255+
inst.setValue(instances.attribute(cat.getId()), "0");
256+
}
269257

270258
for (Map.Entry<IFeature, Attribute> entry : codeAttributes.entrySet()) {
259+
271260
switch (entry.getKey().applies(method)) {
272261
case TRUE:
273262
inst.setValue(entry.getValue(), "true");
@@ -295,7 +284,8 @@ public ArrayList<Instance> getCodeInstances(Instances instances, Set<Method> met
295284
* @param instances instance set
296285
* @return Instances containing data from SWAN-DOC
297286
*/
298-
public ArrayList<Instance> getDocInstances(Instances instances, Set<Method> methods, Category category, FeaturesHandler.FeatureSet instanceSet, ArrayList<Attribute> attributes) {
287+
public ArrayList<Instance> getDocInstances(Instances instances, Set<Method> methods, Set<Category> categories,
288+
FeatureSet.Type instanceSet, ArrayList<Attribute> attributes) {
299289

300290
ArrayList<Instance> instanceList = new ArrayList<>();
301291

@@ -312,12 +302,25 @@ public ArrayList<Instance> getDocInstances(Instances instances, Set<Method> meth
312302
inst.setDataset(instances);
313303
isNewInstance = true;
314304

315-
if (method.getSrm() != null || method.getCwe() != null)
316-
inst.setClassValue(getCategory(method, category));
305+
switch (mode) {
306+
case MEKA:
307+
for (Category cat : categories) {
308+
if (method.getAllCategories().contains(cat) || (cat.isAuthentication() && !method.getAuthSrm().isEmpty())) {
309+
inst.setValue(instances.attribute(cat.getId()), "1");
310+
} else
311+
inst.setValue(instances.attribute(cat.getId()), "0");
312+
}
313+
break;
314+
315+
case WEKA:
316+
if (method.getSrm() != null || method.getCwe() != null)
317+
// inst.setClassValue(getCategory(method, categories));
318+
break;
319+
}
320+
317321
inst.setValue(instances.attribute("id"), method.getArffSafeSignature());
318322
}
319323

320-
321324
switch (instanceSet) {
322325
case DOC_MANUAL:
323326
for (Class<? extends IDocFeature> feature : docFeatureHandler.getManualFeatureSet()) {
@@ -326,8 +329,8 @@ public ArrayList<Instance> getDocInstances(Instances instances, Set<Method> meth
326329
IDocFeature javadocFeature = feature.newInstance();
327330
AnnotatedMethod annotatedMethod = docFeatureHandler.getManualFeatureData().get(method.getSignature());
328331

329-
if(annotatedMethod!=null)
330-
inst.setValue(instances.attribute(feature.getSimpleName()), javadocFeature.evaluate(annotatedMethod).getTotalValue());
332+
if (annotatedMethod != null)
333+
inst.setValue(instances.attribute(feature.getSimpleName()), javadocFeature.evaluate(annotatedMethod).getTotalValue());
331334
} catch (InstantiationException | IllegalAccessException e) {
332335
e.printStackTrace();
333336
}
@@ -374,6 +377,15 @@ public HashMap<String, Instances> getInstances() {
374377
return instances;
375378
}
376379

380+
public Instances getTrainInstances() {
381+
return instances.get("train");
382+
}
383+
384+
public Instances getTestInstances() {
385+
return instances.get("test");
386+
}
387+
388+
377389
/**
 * Replaces the map of instance sets held by this object.
 *
 * @param instances map that is stored as-is (no copy is made)
 */
public void setInstances(HashMap<String, Instances> instances) {
378390
this.instances = instances;
379391
}
@@ -393,4 +405,4 @@ public DocFeatureHandler getDocFeatureHandler() {
393405
/**
 * Replaces the documentation feature handler held by this object.
 *
 * @param docFeatureHandler handler that is stored as-is
 */
public void setDocFeatureHandler(DocFeatureHandler docFeatureHandler) {
394406
this.docFeatureHandler = docFeatureHandler;
395407
}
396-
}
408+
}

0 commit comments

Comments
 (0)