Skip to content

Commit 2e3ca9f

Browse files
authored
Merge pull request #47 from secure-software-engineering/fix/doc-features-setup
Refactor doc features creation and evaluation
2 parents d209237 + b54cdd0 commit 2e3ca9f

File tree

11 files changed

+97
-136
lines changed

11 files changed

+97
-136
lines changed

swan-pipeline/src/main/java/de/fraunhofer/iem/swan/cli/CliRunner.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ public class CliRunner implements Callable<Integer> {
2121
private String datasetJson = "/dataset/swan-dataset.json";
2222

2323
@CommandLine.Option(names = {"-in", "--train-instances"}, description = {"Path to ARFF files that contain training instances"})
24-
private List<String> instancesArff = new ArrayList<>();
24+
private List<String> arffInstancesFiles = new ArrayList<>();
2525

2626
@CommandLine.Option(names = {"-o", "--output"}, description = {"Directory to save output files"})
2727
private String outputDir = "";
@@ -72,7 +72,7 @@ public SwanOptions initializeOptions(){
7272
split,
7373
phase);
7474
options.setPredictionThreshold(predictionThreshold);
75-
options.setInstances(instancesArff);
75+
options.setInstances(arffInstancesFiles);
7676

7777
return options;
7878
}

swan-pipeline/src/main/java/de/fraunhofer/iem/swan/cli/SwanCli.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ public Integer run(SwanOptions options) throws Exception {
3737
options.setFeatureSet(Arrays.asList("code", "doc-manual", "doc-auto"));
3838
}
3939

40-
if (options.getInstances().isEmpty()) {
40+
if (options.getArffInstancesFiles().isEmpty() && options.getTrainDataDir().isEmpty()) {
4141

4242
List<String> instances = new ArrayList<>();
4343

swan-pipeline/src/main/java/de/fraunhofer/iem/swan/cli/SwanOptions.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ public class SwanOptions {
2424
private double trainTestSplit;
2525
private String phase;
2626
private double predictionThreshold;
27-
private List<String> instancesArff;
27+
private List<String> arffInstancesFiles;
2828

2929
public SwanOptions(String testDataDir, String trainDataDir, String datasetJson, String outputDir,
3030
List<String> featureSet, String toolkit, List<String> srmClasses,
@@ -178,12 +178,12 @@ public void setPredictionThreshold(double predictionThreshold) {
178178
this.predictionThreshold = predictionThreshold;
179179
}
180180

181-
public List<String> getInstances() {
182-
return instancesArff;
181+
public List<String> getArffInstancesFiles() {
182+
return arffInstancesFiles;
183183
}
184184

185185
public void setInstances(List<String> instancesArff) {
186-
this.instancesArff = instancesArff;
186+
this.arffInstancesFiles = instancesArff;
187187
}
188188

189189
@Override
@@ -192,7 +192,7 @@ public String toString() {
192192
"testData='" + testDataDir + '\'' +
193193
", trainData='" + trainDataDir + '\'' +
194194
", datasetJson='" + datasetJson + '\'' +
195-
", instances='" + instancesArff + '\'' +
195+
", instances='" + arffInstancesFiles + '\'' +
196196
", outputDir='" + outputDir + '\'' +
197197
", featureSet='" + featureSet + '\'' +
198198
", learningMode='" + toolkit + '\'' +

swan-pipeline/src/main/java/de/fraunhofer/iem/swan/features/FeatureSet.java

Lines changed: 34 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
import java.util.*;
1919
import java.util.stream.Collectors;
2020

21-
abstract class FeatureSet {
21+
public abstract class FeatureSet {
2222

2323
protected Map<IFeature, Attribute> codeAttributes;
2424
protected final HashMap<String, Integer> instanceMap;
@@ -28,6 +28,7 @@ abstract class FeatureSet {
2828
protected DocFeatureHandler docFeatureHandler;
2929
protected HashMap<String, Instances> instances;
3030
protected ModelEvaluator.Toolkit toolkit;
31+
protected List<FeatureSet.Type> featureSets;
3132

3233
/**
3334
* Available feature sets:
@@ -46,6 +47,10 @@ public enum Type {
4647
this.value = value;
4748
}
4849

50+
public String getValue(){
51+
return value.toLowerCase();
52+
}
53+
4954
public static FeatureSet.Type getValue(String value) {
5055
for (FeatureSet.Type featureSet : FeatureSet.Type.values()) {
5156
if (featureSet.value.contains(value)) {
@@ -62,16 +67,16 @@ public FeatureSet(Dataset dataset, SwanOptions options, ModelEvaluator.Toolkit t
6267
this.dataset = dataset;
6368
this.toolkit = toolkit;
6469
instances = new HashMap<>();
70+
71+
featureSets = options.getFeatureSet().stream()
72+
.map(f -> FeatureSet.Type.getValue(f.toUpperCase()))
73+
.collect(Collectors.toList());
6574
}
6675

6776
/**
68-
*
77+
* Initialize feature handlers.
6978
*/
70-
public List<FeatureSet.Type> initializeFeatures() {
71-
72-
List<FeatureSet.Type> featureSets = options.getFeatureSet().stream()
73-
.map(f -> FeatureSet.Type.getValue(f.toUpperCase()))
74-
.collect(Collectors.toList());
79+
public void initializeFeatures() {
7580

7681
for (FeatureSet.Type featureSet : featureSets)
7782
switch (featureSet) {
@@ -80,30 +85,23 @@ public List<FeatureSet.Type> initializeFeatures() {
8085
codeFeatureHandler.initializeFeatures();
8186
break;
8287
case DOC_MANUAL:
83-
84-
docFeatureHandler = new DocFeatureHandler(dataset.getTrainMethods());
88+
docFeatureHandler = new DocFeatureHandler();
8589
docFeatureHandler.initialiseManualFeatureSet();
86-
docFeatureHandler.evaluateManualFeatureData();
8790
break;
8891
case DOC_AUTO:
89-
90-
docFeatureHandler = new DocFeatureHandler(dataset.getTrainMethods());
92+
docFeatureHandler = new DocFeatureHandler();
9193
docFeatureHandler.initialiseAutomaticFeatureSet();
92-
docFeatureHandler.evaluateAutomaticFeatureData();
9394
break;
9495
}
95-
96-
return featureSets;
9796
}
9897

9998
/**
10099
* Creates instances and adds attributes for the features, classes, and method signatures.
101100
*
102-
* @param categories list of categories
103-
* @param methods list of training methods
104-
* @param featureSets classification mode
101+
* @param categories list of categories
102+
* @param methods list of training methods
105103
*/
106-
public ArrayList<Attribute> createAttributes(Set<Category> categories, Set<Method> methods, List<FeatureSet.Type> featureSets) {
104+
public ArrayList<Attribute> createAttributes(Set<Category> categories, Set<Method> methods) {
107105

108106
ArrayList<Attribute> attributes = new ArrayList<>();
109107

@@ -188,8 +186,22 @@ public ArrayList<Attribute> addDocAttributes(FeatureSet.Type instanceSet) {
188186
return attributes;
189187
}
190188

189+
public void evaluateFeatureData(Set<Method> methods) {
190+
191+
for (FeatureSet.Type featureSet : featureSets)
192+
switch (featureSet) {
193+
case CODE:
194+
break;
195+
case DOC_MANUAL:
196+
docFeatureHandler.evaluateManualFeatureData(methods);
197+
break;
198+
case DOC_AUTO:
199+
docFeatureHandler.evaluateAutomaticFeatureData(methods);
200+
break;
201+
}
202+
}
191203

192-
public Instances createInstances(Instances instances, List<Type> featureSets, ArrayList<Attribute> attributes,
204+
public Instances createInstances(Instances instances, ArrayList<Attribute> attributes,
193205
Set<Method> methods, Set<Category> categories) {
194206

195207
for (FeatureSet.Type featureSet : featureSets)
@@ -206,12 +218,12 @@ public Instances createInstances(Instances instances, List<Type> featureSets, Ar
206218
}
207219

208220

209-
public Instances createInstances(List<Type> featureSets, ArrayList<Attribute> attributes,
221+
public Instances createInstances(ArrayList<Attribute> attributes,
210222
Set<Method> methods, Set<Category> categories) {
211223

212224
Instances instances = new Instances("swan-srm", attributes, 0);
213225

214-
return createInstances(instances, featureSets, attributes, methods, categories);
226+
return createInstances(instances, attributes, methods, categories);
215227
}
216228

217229

swan-pipeline/src/main/java/de/fraunhofer/iem/swan/features/IFeatureSet.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,5 @@ public interface IFeatureSet {
1212

1313
void createFeatures();
1414

15-
ArrayList<Attribute> createAttributes(Set<Category> categories, Set<Method> methods, List<FeatureSet.Type> featureSets);
15+
ArrayList<Attribute> createAttributes(Set<Category> categories, Set<Method> methods);
1616
}

swan-pipeline/src/main/java/de/fraunhofer/iem/swan/features/MekaFeatureSet.java

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@
1919

2020
public class MekaFeatureSet extends FeatureSet implements IFeatureSet {
2121

22+
Instances trainInstances = null;
23+
Instances testInstances = null;
24+
Instances structure = null;
25+
2226
public MekaFeatureSet(Dataset dataset, SwanOptions options) {
2327
super(dataset, options, ModelEvaluator.Toolkit.MEKA);
2428
}
@@ -28,37 +32,47 @@ public MekaFeatureSet(Dataset dataset, SwanOptions options) {
2832
*/
2933
public void createFeatures() {
3034

31-
List<FeatureSet.Type> featureSets = initializeFeatures();
32-
33-
Instances trainInstances = null;
34-
Instances structure = null;
35+
initializeFeatures();
3536

3637
//Create and set attributes for the train instances
37-
if (options.getInstances().isEmpty()) {
38+
if (options.getArffInstancesFiles().isEmpty()) {
3839
ArrayList<Attribute> trainAttributes = createAttributes(getCategories(options.getAllClasses()), dataset.getTrainMethods(), featureSets);
39-
trainInstances = createInstances(featureSets, trainAttributes, dataset.getTrainMethods(), getCategories(options.getAllClasses()));
40+
structure = new Instances("swan-srm", trainAttributes, 0);
41+
convertToMekaInstances(structure);
42+
43+
Set<Method> methods = new HashSet<>(dataset.getTrainMethods());
44+
methods.addAll(dataset.getTestMethods());
45+
46+
evaluateFeatureData(methods);
47+
trainInstances = createInstances(structure, trainAttributes, dataset.getTrainMethods(), getCategories(options.getAllClasses()));
4048
} else {
4149
ArffLoader loader = new ArffLoader();
4250

4351
try {
44-
loader.setSource(new File(options.getInstances().get(0)));
52+
loader.setSource(new File(options.getArffInstancesFiles().get(0)));
4553
trainInstances = loader.getDataSet();
4654
structure = loader.getStructure();
4755
} catch (IOException e) {
4856
e.printStackTrace();
4957
}
58+
59+
createAttributes(getCategories(options.getAllClasses()), dataset.getTestMethods(), featureSets);
60+
evaluateFeatureData(dataset.getTestMethods());
5061
}
62+
testInstances = createTestSet();
63+
64+
this.instances.put("train", convertToMekaInstances(trainInstances));
65+
this.instances.put("test", convertToMekaInstances(testInstances));
66+
}
67+
68+
public Instances createTestSet() {
5169

5270
//Create and set attributes for the test instances.
5371
Attribute idAttr = new Attribute("id", dataset.getTestMethods().stream().map(Method::getArffSafeSignature).collect(Collectors.toList()));
5472
structure.replaceAttributeAt(idAttr, structure.attribute("id").index());
5573
ArrayList<Attribute> aList = Collections.list(structure.enumerateAttributes());
5674

57-
ArrayList<Attribute> testAttributes = createAttributes(getCategories(options.getAllClasses()), dataset.getTestMethods(), featureSets);
58-
Instances testInstances = (createInstances(structure, featureSets, aList, dataset.getTestMethods(), getCategories(options.getAllClasses())));
59-
60-
this.instances.put("train", convertToMekaInstances(trainInstances));
61-
this.instances.put("test", convertToMekaInstances(testInstances));
75+
return createInstances(structure, aList, dataset.getTestMethods(), getCategories(options.getAllClasses()));
6276
}
6377

6478
/**
@@ -78,7 +92,7 @@ public ArrayList<Attribute> createAttributes(Set<Category> categories, Set<Metho
7892
attributes.add(catAttribute);
7993
}
8094

81-
attributes.addAll(super.createAttributes(categories, methods, featureSets));
95+
attributes.addAll(super.createAttributes(categories, methods));
8296

8397
return attributes;
8498
}

swan-pipeline/src/main/java/de/fraunhofer/iem/swan/features/WekaFeatureSet.java

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,15 @@ public WekaFeatureSet(Dataset dataset, SwanOptions options) {
2626
*/
2727
public void createFeatures() {
2828

29-
List<FeatureSet.Type> featureSets = initializeFeatures();
29+
initializeFeatures();
3030

3131
for (Category category : options.getAllClasses().stream().map(Category::fromText).collect(Collectors.toList())) {
3232

3333
//Create and set attributes for the train instances
34-
ArrayList<Attribute> trainAttributes = createAttributes(category, dataset.getTrainMethods(), featureSets);
34+
ArrayList<Attribute> trainAttributes = createAttributes(category, dataset.getTrainMethods());
3535

3636
String instanceName = category.getId().toLowerCase() + "-train-instances";
37-
Instances trainInstances = createInstances(featureSets, trainAttributes, dataset.getTrainMethods(), Collections.singleton(category));
37+
Instances trainInstances = createInstances(trainAttributes, dataset.getTrainMethods(), Collections.singleton(category));
3838
this.instances.put(category.getId().toLowerCase(), trainInstances);
3939
Util.exportInstancesToArff(trainInstances);
4040

@@ -48,13 +48,12 @@ public void createFeatures() {
4848
/**
4949
* Creates instances and adds attributes for the features, classes, and method signatures.
5050
*
51-
* @param category list of categories
52-
* @param methods list of training methods
53-
* @param featureSets classification mode
51+
* @param category the single category for which attributes are created
52+
* @param methods list of training methods
5453
*/
55-
public ArrayList<Attribute> createAttributes(Category category, Set<Method> methods, List<FeatureSet.Type> featureSets) {
54+
public ArrayList<Attribute> createAttributes(Category category, Set<Method> methods) {
5655

57-
ArrayList<Attribute> attributes = new ArrayList<>(super.createAttributes(getCategories(category), methods, featureSets));
56+
ArrayList<Attribute> attributes = new ArrayList<>(super.createAttributes(getCategories(category), methods));
5857
ArrayList<String> attributeValues = new ArrayList<>(Arrays.asList("0", "1"));
5958

6059
// Collect classes and add to attributes

swan-pipeline/src/main/java/de/fraunhofer/iem/swan/features/doc/DocFeatureHandler.java

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,18 +23,12 @@ public class DocFeatureHandler {
2323
private ArrayList<String> automaticFeatureSet;
2424
private HashMap<String, AnnotatedMethod> manualFeatureData;
2525
private HashMap<String, HashMap<String, Double>> automaticFeatureData;
26-
private Set<Method> methodSet;
2726

28-
public DocFeatureHandler(Set<Method> trainingSet) {
27+
public DocFeatureHandler() {
2928
manualFeatureSet = new HashSet<>();
3029
manualFeatureData = new HashMap<>();
3130
automaticFeatureSet = new ArrayList<>();
3231
automaticFeatureData = new HashMap<>();
33-
methodSet = new HashSet<>(trainingSet);
34-
}
35-
36-
public Set<Method> getMethodSet() {
37-
return methodSet;
3832
}
3933

4034
public HashMap<String, AnnotatedMethod> getManualFeatureData() {
@@ -45,12 +39,12 @@ public HashMap<String, HashMap<String, Double>> getAutomaticFeatureData() {
4539
return automaticFeatureData;
4640
}
4741

48-
public void evaluateManualFeatureData() {
42+
public void evaluateManualFeatureData(Set<Method> methodSet) {
4943
CoreNLPExecutor nlpExecutor = new CoreNLPExecutor();
5044
manualFeatureData = nlpExecutor.run(new ArrayList<>(methodSet));
5145
}
5246

53-
public void evaluateAutomaticFeatureData() {
47+
public void evaluateAutomaticFeatureData(Set<Method> methodSet) {
5448
DocCommentVector docCommentVector = new DocCommentVector();
5549
docCommentVector.fitVectors();
5650

swan-pipeline/src/main/java/de/fraunhofer/iem/swan/features/doc/manual/annotated/LemmaCountFeature.java

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,16 +34,16 @@ public FeatureResult evaluate(AnnotatedMethod annotatedMethod) {
3434
return featureResult;
3535
}
3636

37-
public int countLemmas(List<CoreMap> sentences){
37+
public int countLemmas(List<CoreMap> sentences) {
3838

3939
Set<String> lemmas = new HashSet<>();
4040

41-
if(sentences!=null)
42-
for (CoreLabel token : sentences.get(0).get(CoreAnnotations.TokensAnnotation.class)) {
41+
if (sentences != null && sentences.size() > 0)
42+
for (CoreLabel token : sentences.get(0).get(CoreAnnotations.TokensAnnotation.class)) {
4343

44-
String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
45-
lemmas.add(lemma);
46-
}
44+
String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
45+
lemmas.add(lemma);
46+
}
4747

4848
return lemmas.size();
4949
}

swan-pipeline/src/main/java/de/fraunhofer/iem/swan/features/doc/manual/annotated/StopWordCountFeature.java

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -50,13 +50,13 @@ public int countStopWords(List<CoreMap> sentences) {
5050

5151
ArrayList<String> lemmas = new ArrayList<>();
5252

53-
if(sentences!=null)
54-
for (CoreLabel token : sentences.get(0).get(CoreAnnotations.TokensAnnotation.class)) {
53+
if (sentences != null && sentences.size() > 0)
54+
for (CoreLabel token : sentences.get(0).get(CoreAnnotations.TokensAnnotation.class)) {
5555

56-
String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
57-
if (stopWordSet.contains(lemma))
58-
lemmas.add(lemma);
59-
}
56+
String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
57+
if (stopWordSet.contains(lemma))
58+
lemmas.add(lemma);
59+
}
6060

6161
return lemmas.size();
6262
}

0 commit comments

Comments
 (0)