11package de .fraunhofer .iem .swan .io .dataset ;
22
33import de .fraunhofer .iem .swan .cli .SwanOptions ;
4+ import de .fraunhofer .iem .swan .data .Method ;
45import de .fraunhofer .iem .swan .io .doc .JavadocProcessor ;
56import de .fraunhofer .iem .swan .soot .Soot ;
67import de .fraunhofer .iem .swan .util .Util ;
8+ import edu .stanford .nlp .util .StringUtils ;
79import org .slf4j .Logger ;
810import org .slf4j .LoggerFactory ;
9-
1011import java .io .IOException ;
12+ import java .util .HashSet ;
1113
1214public class DatasetProcessor {
1315
@@ -31,23 +33,44 @@ public Dataset run() {
3133
3234 if (!options .getTrainDataDir ().isEmpty ())
3335 soot .cleanupList (dataset .getTrain ());
36+
37+ logger .info ("Importing {} SRMs from TRAIN dataset in {}, distribution={}" ,
38+ dataset .getTrainMethods ().size (), options .getDatasetJson (),
39+ Util .countCategories (dataset .getTrainMethods ()));
40+
41+ //Apply filters to dataset
42+ if (options .getDiscovery ().size () > 0 || options .isDocumented ()) {
43+
44+ for (Method method : new HashSet <>(dataset .getTrainMethods ())) {
45+
46+ if (!options .getDiscovery ().contains (method .getDiscovery ()) ||
47+ ((method .getJavadoc ().getMethodComment ().length () == 0
48+ || StringUtils .split (method .getJavadoc ().getMethodComment (), " " ).size () <= 1 ) && options .isDocumented ())) {
49+ dataset .getTrainMethods ().remove (method );
50+ }
51+ }
52+ }
53+
3454 } catch (IOException e ) {
3555 throw new RuntimeException (e );
3656 }
3757
38- logger .info ("Loaded {} training methods, distribution={}" , dataset .getTrainMethods ().size (), Util .countCategories (dataset .getTrainMethods ()));
39-
// Everything below applies to the "predict" phase only; other phases are done here.
if (!options.getPhase().equals("predict")) {
    return dataset;
}

// Build the test set from the configured test data directory and load its methods.
dataset.setTest(new SrmList(options.getTestDataDir()));
dataset.getTest().setMethods(
        soot.loadMethods(dataset.getTest().getTestClasses()));

logger.info("Importing {} SRMs from TEST dataset in {}, distribution={}",
        dataset.getTestMethods().size(),
        options.getTestDataDir(),
        Util.countCategories(dataset.getTestMethods()));

// Doc comments are only processed when the feature set contains "doc-".
if (options.getFeatureSet().contains("doc-")) {

    new JavadocProcessor(options.getTestDataSourceDir(), options.getOutputDir())
            .run(dataset.getTestMethods(), options.getFeatureSet());

    logger.info("Extracting doc comments for {} methods in {}",
            dataset.getTestMethods().size(),
            options.getTestDataDir());
}
5376 return dataset ;
0 commit comments