Skip to content

Commit 25d9b94

Browse files
committed
Move dataset filters to data processor
1 parent fe4309d commit 25d9b94

File tree

2 files changed

+32
-35
lines changed

2 files changed

+32
-35
lines changed

swan-pipeline/src/main/java/de/fraunhofer/iem/swan/io/dataset/DatasetProcessor.java

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
11
package de.fraunhofer.iem.swan.io.dataset;
22

33
import de.fraunhofer.iem.swan.cli.SwanOptions;
4+
import de.fraunhofer.iem.swan.data.Method;
45
import de.fraunhofer.iem.swan.io.doc.JavadocProcessor;
56
import de.fraunhofer.iem.swan.soot.Soot;
67
import de.fraunhofer.iem.swan.util.Util;
8+
import edu.stanford.nlp.util.StringUtils;
79
import org.slf4j.Logger;
810
import org.slf4j.LoggerFactory;
9-
1011
import java.io.IOException;
12+
import java.util.HashSet;
1113

1214
public class DatasetProcessor {
1315

@@ -31,23 +33,44 @@ public Dataset run() {
3133

3234
if (!options.getTrainDataDir().isEmpty())
3335
soot.cleanupList(dataset.getTrain());
36+
37+
logger.info("Importing {} SRMs from TRAIN dataset in {}, distribution={}",
38+
dataset.getTrainMethods().size(), options.getDatasetJson(),
39+
Util.countCategories(dataset.getTrainMethods()));
40+
41+
//Apply filters to dataset
42+
if (options.getDiscovery().size() > 0 || options.isDocumented()) {
43+
44+
for (Method method : new HashSet<>(dataset.getTrainMethods())) {
45+
46+
if (!options.getDiscovery().contains(method.getDiscovery()) ||
47+
((method.getJavadoc().getMethodComment().length() == 0
48+
|| StringUtils.split(method.getJavadoc().getMethodComment(), " ").size() <= 1) && options.isDocumented())) {
49+
dataset.getTrainMethods().remove(method);
50+
}
51+
}
52+
}
53+
3454
} catch (IOException e) {
3555
throw new RuntimeException(e);
3656
}
3757

38-
logger.info("Loaded {} training methods, distribution={}", dataset.getTrainMethods().size(), Util.countCategories(dataset.getTrainMethods()));
39-
4058
if (options.getPhase().equals("predict")) {
4159
//Load methods from the test set
4260
dataset.setTest(new SrmList(options.getTestDataDir()));
4361
dataset.getTest().setMethods(soot.loadMethods(dataset.getTest().getTestClasses()));
4462

63+
logger.info("Importing {} SRMs from TEST dataset in {}, distribution={}",
64+
dataset.getTestMethods().size(), options.getTestDataDir(),
65+
Util.countCategories(dataset.getTestMethods()));
66+
4567
if (options.getFeatureSet().contains("doc-")) {
4668

4769
//Extract doc comments and add to test set, if option is selected
4870
JavadocProcessor javadocProcessor = new JavadocProcessor(options.getTestDataSourceDir(), options.getOutputDir());
4971
javadocProcessor.run(dataset.getTestMethods(), options.getFeatureSet());
50-
logger.info("Loaded {} methods from {}", dataset.getTestMethods().size(), options.getTestDataDir());
72+
73+
logger.info("Extracting doc comments for {} methods in {}", dataset.getTestMethods().size(), options.getTestDataDir());
5174
}
5275
}
5376
return dataset;

swan-pipeline/src/main/java/de/fraunhofer/iem/swan/io/dataset/SrmListUtils.java

Lines changed: 5 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,12 @@
44
import de.fraunhofer.iem.swan.data.Method;
55
import de.fraunhofer.iem.swan.io.doc.Javadoc;
66
import de.fraunhofer.iem.swan.io.doc.ssldoclet.MethodBlockType;
7-
import edu.stanford.nlp.util.StringUtils;
87
import org.slf4j.Logger;
98
import org.slf4j.LoggerFactory;
109

1110
import java.io.File;
1211
import java.io.IOException;
1312
import java.util.ArrayList;
14-
import java.util.HashSet;
15-
import java.util.List;
1613
import java.util.Set;
1714

1815
/**
@@ -34,11 +31,8 @@ public class SrmListUtils {
3431
public static SrmList importFile(String file) throws IOException {
3532

3633
ObjectMapper objectMapper = new ObjectMapper();
37-
SrmList srmList = objectMapper.readValue(new File(file), SrmList.class);
38-
logger.info("Collected {} methods from the training set.", srmList.getMethods().size());
3934

40-
// removeManualMethods(srmList);
41-
return srmList;
35+
return objectMapper.readValue(new File(file), SrmList.class);
4236
}
4337

4438
/**
@@ -51,38 +45,18 @@ public static void exportFile(SrmList srmList, String file) throws IOException {
5145

5246
ObjectMapper objectMapper = new ObjectMapper();
5347
objectMapper.writeValue(new File(file), srmList);
54-
logger.info("{} SRMs exported to {}", srmList.getMethods().size(), file);
55-
}
56-
57-
public static void removeUndocumentedMethods(SrmList list) {
58-
Set<Method> temp = new HashSet<>(list.getMethods());
59-
60-
for (Method method : temp) {
61-
List<String> words = StringUtils.split(method.getJavadoc().getMethodComment(), " ");
62-
63-
if (method.getJavadoc().getMethodComment().length() == 0 || words.size() <= 1)
64-
list.getMethods().remove(method);
65-
}
66-
}
67-
68-
public static void removeManualMethods(SrmList list) {
69-
Set<Method> temp = new HashSet<>(list.getMethods());
70-
71-
for (Method method : temp) {
72-
73-
if(!method.getDiscovery().contains("manual"))
74-
list.getMethods().remove(method);
75-
}
48+
logger.info("Exporting {} SRMs to {}", srmList.getMethods().size(), file);
7649
}
7750

7851
/**
7952
* Adds doc comments to method set.
80-
* @param methods methods to be updated
53+
*
54+
* @param methods methods to be updated
8155
* @param javadocs list of Javadoc objects
8256
* @return updated method set
8357
*/
8458
public static Set<Method> addDocComments(Set<Method> methods, ArrayList<Javadoc> javadocs) {
85-
59+
//TODO Check if returned set is used
8660
for (Javadoc doc : javadocs) {
8761

8862
for (MethodBlockType methodBlock : doc.getMethodBlocks().values()) {

0 commit comments

Comments
 (0)