1- package de .fraunhofer .iem .swan .model ;
1+ package de .fraunhofer .iem .swan .features ;
22
33import de .fraunhofer .iem .swan .data .Category ;
44import de .fraunhofer .iem .swan .data .Method ;
5- import de .fraunhofer .iem .swan .doc .features .DocFeatureHandler ;
6- import de .fraunhofer .iem .swan .doc .features .automatic .DocCommentVector ;
7- import de .fraunhofer .iem .swan .doc .features .manual .IDocFeature ;
8- import de .fraunhofer .iem .swan .doc .nlp .AnnotatedMethod ;
9- import de .fraunhofer .iem .swan .doc .nlp .CoreNLPExecutor ;
10- import de .fraunhofer .iem .swan .doc .nlp .NLPUtils ;
11- import de .fraunhofer .iem .swan .features .type .IFeature ;
12- import de .fraunhofer .iem .swan .util .SwanConfig ;
13- import org .nd4j .linalg .cpu .nativecpu .NDArray ;
5+ import de .fraunhofer .iem .swan .features .doc .DocFeatureHandler ;
6+ import de .fraunhofer .iem .swan .features .doc .manual .IDocFeature ;
7+ import de .fraunhofer .iem .swan .features .doc .nlp .AnnotatedMethod ;
8+ import de .fraunhofer .iem .swan .features .code .type .IFeature ;
149import weka .core .Attribute ;
1510import weka .core .DenseInstance ;
1611import weka .core .Instance ;
1712import weka .core .Instances ;
18- import weka .core .converters .ArffSaver ;
1913
20- import java .io .File ;
21- import java .io .IOException ;
2214import java .util .*;
2315
2416/**
@@ -36,20 +28,18 @@ public class InstancesHandler {
/**
 * Selects which feature groups are used when the Weka data set is built.
 * <ul>
 *   <li>{@code SWAN}: only the SWAN code features are added.</li>
 *   <li>{@code SWANDOC_MANUAL}: only the manual documentation features are added.</li>
 *   <li>{@code SWANDOC_WORD_EMBEDDING}: only the automatic (vector) documentation
 *       features are added.</li>
 *   <li>{@code SWAN_SWANDOC_MANUAL} / {@code SWAN_SWANDOC_WORD_EMBEDDING}: the SWAN
 *       code features are added first, then the respective documentation features.</li>
 * </ul>
 */
public enum INSTANCE_SET {
    SWAN,
    SWAN_SWANDOC_MANUAL,
    SWAN_SWANDOC_WORD_EMBEDDING,
    SWANDOC_MANUAL,
    SWANDOC_WORD_EMBEDDING
}
4335
4436 private ArrayList <Attribute > attributes ;
4537 private Map <IFeature , Attribute > swanFeatureAttribs ;
46- private Set <Class <? extends IDocFeature >> swanDocFeatureSet ;
4738 private HashMap <String , Integer > instanceMap ;
48- private HashMap < String , Method > methodMap ;
39+ private Instances instances ;
4940
/**
 * Creates a handler with an empty signature-to-row index. The attribute
 * schema and the data set itself are not set up here.
 */
public InstancesHandler() {
    this.instanceMap = new HashMap<>();
}
5444
5545 /**
@@ -59,39 +49,34 @@ public InstancesHandler() {
5949 * @param instanceSet
6050 * @return
6151 */
62- public Instances createInstances (Set <Method > trainingSet ,
52+ public void createInstances (Set <Method > trainingSet ,
6353 Map <Category , Set <IFeature >> features , DocFeatureHandler docFeatures , Set <Category > categories ,
6454 INSTANCE_SET instanceSet ) {
6555
66- for (Method method : trainingSet ) {
67- methodMap .put (method .getSignature (), method );
68- }
69-
7056 //Initialize instances
7157 initializeInstances (features , docFeatures , categories , trainingSet , null , instanceSet );
7258
7359 // Set attributes to the train instances.
74- Instances trainInstances = new Instances ("training-methods" , attributes , 0 );
75- trainInstances .setClass (trainInstances .attribute ("class" ));
60+ instances = new Instances ("training-methods" , attributes , 0 );
61+ instances .setClass (instances .attribute ("class" ));
7662
7763 //Populate SWAN feature attributes
7864 switch (instanceSet ) {
7965 case SWAN :
80- trainInstances = addSwanInstances (trainInstances , trainingSet , categories );
66+ instances = addSwanInstances (instances , trainingSet , categories );
8167 break ;
8268 case SWANDOC_MANUAL :
83- case SWANDOC_AUTOMATIC :
84- trainInstances = addSwanDocInstances (trainInstances , docFeatures , categories , instanceSet );
69+ case SWANDOC_WORD_EMBEDDING :
70+ instances = addSwanDocInstances (instances , trainingSet , docFeatures , categories , instanceSet );
8571 break ;
8672 case SWAN_SWANDOC_MANUAL :
87- case SWAN_SWANDOC_AUTOMATIC :
88- trainInstances = addSwanInstances (trainInstances , trainingSet , categories );
89- trainInstances = addSwanDocInstances (trainInstances , docFeatures , categories , instanceSet );
73+ case SWAN_SWANDOC_WORD_EMBEDDING :
74+ instances = addSwanInstances (instances , trainingSet , categories );
75+ instances = addSwanDocInstances (instances , trainingSet , docFeatures , categories , instanceSet );
9076 break ;
9177 }
9278
93- exportInstances (trainInstances , categories );
94- return trainInstances ;
79+ //exportInstances(trainInstances, categories);
9580 }
9681
9782
@@ -116,30 +101,27 @@ public void initializeInstances(Map<Category, Set<IFeature>> features, DocFeatur
116101 initializeSwanFeatures (features , categories );
117102 break ;
118103 case SWAN_SWANDOC_MANUAL :
119- case SWAN_SWANDOC_AUTOMATIC :
104+ case SWAN_SWANDOC_WORD_EMBEDDING :
120105 initializeSwanFeatures (features , categories );
121106 initializeSwanDocFeatures (docFeatures , instanceSet );
122107 break ;
123108 case SWANDOC_MANUAL :
124- case SWANDOC_AUTOMATIC :
109+ case SWANDOC_WORD_EMBEDDING :
125110 initializeSwanDocFeatures (docFeatures , instanceSet );
126111 break ;
127112 }
128113
129114 // Add method signatures as id attribute
130115 ArrayList <String > methodStrings = new ArrayList <>();
131116
117+ // System.out.println("TOTAL SET...."+trainingSet.size());
132118 int c = 0 ;
133119 for (Method am : trainingSet ) {
134- methodStrings .add (am .getSignature ().replace ("," , "+" ));
120+ methodStrings .add (am .getArffSafeSignature ());
121+ //System.out.println("ini: "+am.getArffSafeSignature());
135122 c ++;
136123 }
137124
138- if (testSet != null )
139- for (Method am : testSet ) {
140- methodStrings .add (am .getSignature ());
141- }
142-
143125 Attribute idAttr = new Attribute ("id" , methodStrings );
144126 attributes .add (idAttr );
145127
@@ -203,8 +185,8 @@ public void initializeSwanDocFeatures(DocFeatureHandler features, INSTANCE_SET i
203185 attributes .add (attribute );
204186 }
205187 break ;
206- case SWAN_SWANDOC_AUTOMATIC :
207- case SWANDOC_AUTOMATIC :
188+ case SWAN_SWANDOC_WORD_EMBEDDING :
189+ case SWANDOC_WORD_EMBEDDING :
208190
209191 for (String feature : features .getAutomaticFeatureSet ()) {
210192 Attribute attribute = new Attribute (feature );
@@ -269,11 +251,11 @@ public Instances addSwanInstances(Instances trainInstances, Set<Method> training
269251 }
270252
271253 //Set id attribute
272- inst .setValue (trainInstances .attribute ("id" ), am .getSignature (). replace ( "," , "+" ));
254+ inst .setValue (trainInstances .attribute ("id" ), am .getArffSafeSignature ( ));
273255 c ++;
274256
275257 trainInstances .add (inst );
276- instanceMap .put (am .getSignature (), instanceIndex ++);
258+ instanceMap .put (am .getArffSafeSignature (), instanceIndex ++);
277259 }
278260 return trainInstances ;
279261 }
@@ -286,25 +268,30 @@ public Instances addSwanInstances(Instances trainInstances, Set<Method> training
286268 * @param categories set of categories
287269 * @return Instances containing data from SWAN-DOC
288270 */
289- public Instances addSwanDocInstances (Instances trainInstances , DocFeatureHandler docFeatures , Set <Category > categories , INSTANCE_SET instanceSet ) {
271+ public Instances addSwanDocInstances (Instances trainInstances , Set < Method > trainingSet , DocFeatureHandler docFeatures , Set <Category > categories , INSTANCE_SET instanceSet ) {
290272
291- for (Method method : docFeatures .getMethodSet ()) {
273+ // System.out.println("TOTAL DOCS...."+docFeatures.getMethodSet().size());
274+ for (Method method : trainingSet ) {
292275
276+ // System.out.println("Value not found: "+method.getArffSafeSignature());
293277 Instance inst ;
278+ boolean isNewInstance = false ;
294279
295280 //If instance exists already, update it. Otherwise create a new instance
296- if (instanceMap .containsKey (method .getSignature ())) {
281+ if (instanceMap .containsKey (method .getArffSafeSignature ())) {
297282
298- inst = trainInstances .instance (instanceMap .get (method .getSignature ()));
283+ inst = trainInstances .instance (instanceMap .get (method .getArffSafeSignature ()));
299284 // System.out.println("Instance for: "+ inst.attribute());
300285 // trainInstances.delete(instanceMap.get(method.getMethod().getSignature()));
301286 } else {
302287 inst = new DenseInstance (attributes .size ());
303288 inst .setDataset (trainInstances );
289+ isNewInstance = true ;
304290 }
305291
306- //System.out.println(method.getMethod().getSignature());
307- inst .setValue (trainInstances .attribute ("id" ), method .getSignature ().replace ("," , "+" ));
292+ // System.out.println("Value really not found: "+method.getArffSafeSignature());
293+ inst .setValue (trainInstances .attribute ("id" ), method .getArffSafeSignature ());
294+
308295 Category categoryClassified = null ;
309296
310297 for (Category category : method .getCategoriesTrained ()) {
@@ -329,11 +316,9 @@ public Instances addSwanDocInstances(Instances trainInstances, DocFeatureHandler
329316 case SWAN_SWANDOC_MANUAL :
330317 for (Class <? extends IDocFeature > feature : docFeatures .getManualFeatureSet ()) {
331318
332- IDocFeature javadocFeature ;
333-
334319 try {
335320
336- javadocFeature = feature .newInstance ();
321+ IDocFeature javadocFeature = feature .newInstance ();
337322 AnnotatedMethod annotatedMethod = docFeatures .getManualFeatureData ().get (method .getSignature ());
338323 //System.out.println("Adding: "+trainInstances.attribute(feature.getSimpleName())+" "+ javadocFeature.evaluate(method).getTotalValue());
339324 inst .setValue (trainInstances .attribute (feature .getSimpleName ()), javadocFeature .evaluate (annotatedMethod ).getTotalValue ());
@@ -342,39 +327,22 @@ public Instances addSwanDocInstances(Instances trainInstances, DocFeatureHandler
342327 }
343328 }
344329 break ;
345- case SWANDOC_AUTOMATIC :
346- case SWAN_SWANDOC_AUTOMATIC :
330+ case SWANDOC_WORD_EMBEDDING :
331+ case SWAN_SWANDOC_WORD_EMBEDDING :
347332 HashMap <String , Double > vectorValues = docFeatures .getAutomaticFeatureData ().get (method .getSignature ());
348333
349334 for (String key : vectorValues .keySet ()) {
350335 inst .setValue (trainInstances .attribute (key ), vectorValues .get (key ));
351336 }
352337 break ;
353338 }
354- trainInstances .add (inst );
339+ if (isNewInstance )
340+ trainInstances .add (inst );
355341 }
356342 return trainInstances ;
357343 }
358344
359- public void exportInstances (Instances trainInstances , Set <Category > categories ) {
360- SwanConfig swanConfig = new SwanConfig ();
361- Properties config = swanConfig .getConfig ();
362-
363- if (Boolean .parseBoolean (config .getProperty ("output_train_arff_data" ))) {
364- // Save arff data.
365- ArffSaver saver = new ArffSaver ();
366- saver .setInstances (trainInstances );
367- List <Category > fileNameList = new ArrayList <>(categories );
368- Collections .sort (fileNameList );
369- String fileName = fileNameList .toString ();
370- fileName = fileName .substring (1 , fileName .length () - 1 );
371- fileName = fileName .replace (", " , "_" );
372- try {
373- saver .setFile (new File ("/Users/oshando/Projects/thesis/03-code/swan/swan_core/swan-out/weka/" + "Train_" + fileName + ".arff" ));
374- saver .writeBatch ();
375- } catch (IOException e ) {
376- e .printStackTrace ();
377- }
378- }
345+ public Instances getInstances () {
346+ return instances ;
379347 }
380348}
0 commit comments