@@ -314,7 +314,7 @@ private static void extractFeatureWorker(
314314 // construct the feature name and set it to 1.0
315315 // however, only add the featureName if the featureName alphabet is allowed to grow.
316316 String fname = internalFeatureNamePrefix ;
317- addToFeatureVector (fv , fname , 1.0 );
317+ setInFeatureVector (fv , fname , 1.0 );
318318 } else {
319319 // First get the value as an Object; if there is no value, we have an Object that is null
320320 // If the sourceAnnotation is null, we already did not find the source at all,
@@ -344,22 +344,22 @@ private static void extractFeatureWorker(
344344 Iterable iterable = (Iterable ) valObj ;
345345 for (Object obj : iterable ) {
346346 String val = obj .toString ();
347- addToFeatureVector (fv , internalFeatureNamePrefix + VALSEP + val , 1.0 );
347+ setInFeatureVector (fv , internalFeatureNamePrefix + VALSEP + val , 1.0 );
348348 }
349349 } else if (valObj instanceof Map ) {
350350 Map map = (Map ) valObj ;
351351 for (Object key : map .keySet ()) {
352352 Object mapval = map .get (key );
353353 String val = key .toString () + "=" + mapval .toString ();
354- addToFeatureVector (fv , internalFeatureNamePrefix + VALSEP + val , 1.0 );
354+ setInFeatureVector (fv , internalFeatureNamePrefix + VALSEP + val , 1.0 );
355355 }
356356 } else if (valObj instanceof Object []) {
357357 for (Object obj : ((Object []) valObj )) {
358- addToFeatureVector (fv , internalFeatureNamePrefix + VALSEP + obj .toString (), 1.0 );
358+ setInFeatureVector (fv , internalFeatureNamePrefix + VALSEP + obj .toString (), 1.0 );
359359 }
360360 } else if (valObj instanceof int []) {
361361 for (int intval : ((int []) valObj )) {
362- addToFeatureVector (fv , internalFeatureNamePrefix + VALSEP + intval , 1.0 );
362+ setInFeatureVector (fv , internalFeatureNamePrefix + VALSEP + intval , 1.0 );
363363 }
364364 // TODO: other array types??
365365 } else {
@@ -371,18 +371,18 @@ private static void extractFeatureWorker(
371371 for (String v : vals ) {
372372 // NOTE: we automatically remove any empty elements here
373373 if (!v .trim ().isEmpty ()) {
374- addToFeatureVector (fv , internalFeatureNamePrefix + VALSEP + v .trim (), 1.0 );
374+ setInFeatureVector (fv , internalFeatureNamePrefix + VALSEP + v .trim (), 1.0 );
375375 }
376376 }
377377 } else // just take the value as is.
378378 // Only in this case we allow for optionally getting the score from a different
379379 // feature of the same annotation we got the value from.
380380 if (featureName4Value .isEmpty ()) {
381- addToFeatureVector (fv , internalFeatureNamePrefix + VALSEP + val , 1.0 );
381+ setInFeatureVector (fv , internalFeatureNamePrefix + VALSEP + val , 1.0 );
382382 } else {
383383 // NOTE: sourceAnnotation should always be non-null here since valObj is non-null
384384 double score = gate .plugin .learningframework .LFUtils .anyToDoubleOrElse (sourceAnnotation .getFeatures ().get (featureName4Value ), 1.0 );
385- addToFeatureVector (fv , internalFeatureNamePrefix + VALSEP + val , score );
385+ setInFeatureVector (fv , internalFeatureNamePrefix + VALSEP + val , score );
386386 }
387387 }
388388 } else {
@@ -397,7 +397,7 @@ private static void extractFeatureWorker(
397397 case zero_value : // we treat this identical to keep: no feature set
398398 break ;
399399 case special_value : // we use the predefined special value
400- addToFeatureVector (fv , internalFeatureNamePrefix + VALSEP + MVVALUE , 1.0 );
400+ setInFeatureVector (fv , internalFeatureNamePrefix + VALSEP + MVVALUE , 1.0 );
401401 break ;
402402 default :
403403 throw new NotImplementedException ("MV-Handling" );
@@ -416,12 +416,12 @@ private static void extractFeatureWorker(
416416 if (alphabet .contains (val )) {
417417 // add the featureName, using the value we have stored for it, but only if the featureName
418418 // itself can be added
419- addToFeatureVector (fv , internalFeatureNamePrefix , alphabet .lookupIndex (val ));
419+ setInFeatureVector (fv , internalFeatureNamePrefix , alphabet .lookupIndex (val ));
420420 } else // we have not seen this value: if the alphabet is allowed to grow add it and
421421 // then try to add the featureName, otherwise, do nothing
422422 if (!alphabet .growthStopped ()) {
423423 // the lookupIndex method automatically adds the value if it is not there yet
424- addToFeatureVector (fv , internalFeatureNamePrefix , alphabet .lookupIndex (val ));
424+ setInFeatureVector (fv , internalFeatureNamePrefix , alphabet .lookupIndex (val ));
425425 } else {
426426 //System.out.println("DEBUG: number, growStopped");
427427 }
@@ -433,21 +433,21 @@ private static void extractFeatureWorker(
433433 //System.out.println("DEBUG: other, mv, setProp");
434434 break ;
435435 case keep :
436- addToFeatureVector (fv , internalFeatureNamePrefix , Double .NaN );
436+ setInFeatureVector (fv , internalFeatureNamePrefix , Double .NaN );
437437 break ;
438438 case zero_value : // use the "special_value"
439- addToFeatureVector (fv , internalFeatureNamePrefix , 0.0 );
439+ setInFeatureVector (fv , internalFeatureNamePrefix , 0.0 );
440440 String val = MVVALUE ;
441441 if (alphabet .contains (val )) {
442- addToFeatureVector (fv , internalFeatureNamePrefix , alphabet .lookupIndex (MVVALUE ));
442+ setInFeatureVector (fv , internalFeatureNamePrefix , alphabet .lookupIndex (MVVALUE ));
443443 } else if (!alphabet .growthStopped ()) {
444- addToFeatureVector (fv , internalFeatureNamePrefix , alphabet .lookupIndex (MVVALUE ));
444+ setInFeatureVector (fv , internalFeatureNamePrefix , alphabet .lookupIndex (MVVALUE ));
445445 } else {
446446 //System.out.println("DEBUG: number, growStopped");
447447 }
448448 break ;
449449 case special_value : // we use the special value -1.0 which should get handled by Mallet somehow
450- addToFeatureVector (fv , internalFeatureNamePrefix , -1.0 );
450+ setInFeatureVector (fv , internalFeatureNamePrefix , -1.0 );
451451 break ;
452452 default :
453453 throw new NotImplementedException ("MV-Handling" );
@@ -462,27 +462,27 @@ private static void extractFeatureWorker(
462462 double val = 0.0 ;
463463 if (valObj instanceof Number ) {
464464 val = ((Number ) valObj ).doubleValue ();
465- addToFeatureVector (fv , internalFeatureNamePrefix , val );
465+ setInFeatureVector (fv , internalFeatureNamePrefix , val );
466466 } else if (valObj instanceof Boolean ) {
467467 if ((Boolean ) valObj ) {
468468 val = 1.0 ;
469469 } else {
470470 val = 0.0 ;
471471 }
472- addToFeatureVector (fv , internalFeatureNamePrefix , val );
472+ setInFeatureVector (fv , internalFeatureNamePrefix , val );
473473 } else if (valObj instanceof double []) {
474474 // create one feature for each entry in the array
475475 int i = 0 ;
476476 for (double el : ((double []) valObj )) {
477- addToFeatureVector (fv , internalFeatureNamePrefix + ELEMSEP + i , el );
477+ setInFeatureVector (fv , internalFeatureNamePrefix + ELEMSEP + i , el );
478478 i ++;
479479 }
480480 } else if (valObj instanceof Iterable ) {
481481 int i = 0 ;
482482 for (Object el : (Iterable ) valObj ) {
483483 val = LFUtils .anyToDoubleOrElse (el , 0.0 );
484484 if (val != 0.0 ) {
485- addToFeatureVector (fv , internalFeatureNamePrefix + ELEMSEP + i , LFUtils .anyToDoubleOrElse (el , val ));
485+ setInFeatureVector (fv , internalFeatureNamePrefix + ELEMSEP + i , LFUtils .anyToDoubleOrElse (el , val ));
486486 }
487487 i ++;
488488 }
@@ -497,7 +497,7 @@ private static void extractFeatureWorker(
497497 + // take it from the annotation, annType can be empty!
498498 " at offset " + gate .Utils .start (sourceAnnotation ) + " in document " + doc .getName ());
499499 }
500- addToFeatureVector (fv , internalFeatureNamePrefix , val );
500+ setInFeatureVector (fv , internalFeatureNamePrefix , val );
501501 }
502502 //System.err.println("DEBUG: for fname="+featureName+",dt="+dt+", valObj="+valObj+", fv="+fv.numLocations());
503503 } else {
@@ -508,14 +508,14 @@ private static void extractFeatureWorker(
508508 //System.out.println("DEBUG: numeric, mv, setProp");
509509 break ;
510510 case keep : // for this kind of codeas, we use the value NaN
511- addToFeatureVector (fv , internalFeatureNamePrefix , Double .NaN );
511+ setInFeatureVector (fv , internalFeatureNamePrefix , Double .NaN );
512512 break ;
513513 case zero_value : // use the first value, does not make much sense really, but ...
514514 // TODO: document that this combination should be avoided, probably
515- addToFeatureVector (fv , internalFeatureNamePrefix , 0.0 );
515+ setInFeatureVector (fv , internalFeatureNamePrefix , 0.0 );
516516 break ;
517517 case special_value : // we use the special value -1.0 which should get handled by Mallet somehow
518- addToFeatureVector (fv , internalFeatureNamePrefix , -1.0 );
518+ setInFeatureVector (fv , internalFeatureNamePrefix , -1.0 );
519519 break ;
520520 default :
521521 throw new NotImplementedException ("MV-Handling" );
@@ -547,7 +547,7 @@ private static void extractFeatureWorker(
547547 " at offset " + gate .Utils .start (sourceAnnotation ) + " in document " + doc .getName ());
548548 }
549549 }
550- addToFeatureVector (fv , internalFeatureNamePrefix , val );
550+ setInFeatureVector (fv , internalFeatureNamePrefix , val );
551551 } else {
552552 // we have a missing boolean value
553553 switch (mvt ) {
@@ -556,14 +556,14 @@ private static void extractFeatureWorker(
556556 inst .setProperty (PROP_IGNORE_HAS_MV , true );
557557 break ;
558558 case keep : // for this kind of codeas, we use the value NaN
559- addToFeatureVector (fv , internalFeatureNamePrefix , Double .NaN );
559+ setInFeatureVector (fv , internalFeatureNamePrefix , Double .NaN );
560560 break ;
561561 case zero_value : // Use zero which will make false identical to missing
562562 // and work well with sparse vectors
563- addToFeatureVector (fv , internalFeatureNamePrefix , 0.0 );
563+ setInFeatureVector (fv , internalFeatureNamePrefix , 0.0 );
564564 break ;
565565 case special_value : // we use the special value 0.5 which should get handled by Mallet somehow
566- addToFeatureVector (fv , internalFeatureNamePrefix , 0.5 );
566+ setInFeatureVector (fv , internalFeatureNamePrefix , 0.5 );
567567 break ;
568568 default :
569569 throw new NotImplementedException ("MV-Handling" );
@@ -693,15 +693,20 @@ private static void extractFeature(
693693 prefix = ng .name ;
694694 }
695695 prefix = prefix + NAMESEP + "N" + number ;
696- // NOTE: if we have a featureName4Value set, then we set the feature value to
697- // what we have calculated, otherwise we add one for each time the ngram occurs
698- // within the span.
699- if (featureName4Value .isEmpty ()) {
700- addToFeatureVector (fv , prefix + VALSEP + ngram , score );
701- } else {
702- setFeatureVector (fv , prefix + VALSEP + ngram , score );
703- }
696+ // NOTE: for now, we always add to any existing value of the feature vector we
697+ // may already have. That way, if some ngram occurs multiple times, we use the
698+ // sum of its scores (and the score either is just 1.0 or whatever we got from using
699+ // the featureName4Value value).
700+ accumulateInFeatureVector (fv , prefix + VALSEP + ngram , score );
701+ // NOTE: previously, we only accumulated if there was no weight feature, otherwise
702+ // the weight was directly used without accumulation
703+ //if (featureName4Value.isEmpty()) {
704+ // accumulateInFeatureVector(fv, prefix + VALSEP + ngram, score);
705+ //} else {
706+ // setInFeatureVector(fv, prefix + VALSEP + ngram, score);
707+ //}
704708 }
709+ //System.err.println("DEBUG: Vector after adding feature "+ng+" is now "+fv);
705710 } // extractFeature(NGram)
706711
707712 private static void extractFeature (
@@ -1079,28 +1084,44 @@ public static FeatureSpecAttribute lookupAttributeForFeatureName(List<FeatureSpe
10791084 /// HELPER AND UTILITY METHODS
10801085 ///=======================================
10811086 /**
1082- * Same inputAS the method, but makes sure a non-growable Alphabet is considered.
1087+ * Set a feature in the feature vector, to the given value.
1088+ * However, if growth is stopped, do not set the feature if the key is not known.
10831089 *
1090+ * This method assumes that each key is set only once per feature vector. If it is set
1091+ * again for the same feature vector, the old value is overridden and a debug message is printed to System.err.
1092+ *
10841093 * @param fv
10851094 * @param key
10861095 * @param val
10871096 */
1088- private static void addToFeatureVector (AugmentableFeatureVector fv , Object key , double val ) {
1097+ private static void setInFeatureVector (AugmentableFeatureVector fv , Object key , double val ) {
10891098 Alphabet a = fv .getAlphabet ();
10901099 if (!a .contains (key ) && a .growthStopped ()) {
10911100 //System.err.println("DEBUG: GROWTH STOPPED! key="+key+",a="+a);
10921101 return ;
10931102 }
1094- fv .add (key , val );
1103+ if (fv .contains (key )) {
1104+ System .err .println ("LF DEBUG: setting/overriding a value where there is already one! key=" +key );
1105+ fv .setValue (a .lookupIndex (key ), val );
1106+ } else {
1107+ fv .add (key , val );
1108+ }
10951109 }
10961110
1097- private static void setFeatureVector (AugmentableFeatureVector fv , Object key , double val ) {
1111+ private static void accumulateInFeatureVector (AugmentableFeatureVector fv , Object key , double val ) {
10981112 Alphabet a = fv .getAlphabet ();
10991113 if (!a .contains (key ) && a .growthStopped ()) {
11001114 return ;
11011115 }
1102- int index = a .lookupIndex (key );
1103- fv .setValue (index , val );
1116+ fv .add (key ,val );
1117+ // Instead of the previous statement the following was used for debugging:
1118+ //if(fv.contains(key)) {
1119+ // fv.add(key,val);
1120+ //System.err.println("DEBUG accumulate: adding to existing: key="+key+" index="+a.lookupIndex(key)+" loc="+fv.location(a.lookupIndex(key)));
1121+ //} else {
1122+ // fv.add(key,val);
1123+ //System.err.println("DEBUG accumulate: creating new: key="+key+" index="+a.lookupIndex(key)+" loc="+fv.location(a.lookupIndex(key)));
1124+ //}
11041125 }
11051126
11061127}
0 commit comments