From 50c3ca923745c005a0fad6280625bea5704692a5 Mon Sep 17 00:00:00 2001 From: Kosmas Xenakis Karapanagiotis Date: Sun, 26 Oct 2025 20:51:10 +0100 Subject: [PATCH 01/20] add refchecker class which does the core functionality of refchecking --- .../jabref/logic/integrity/RefChecker.java | 151 ++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java diff --git a/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java b/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java new file mode 100644 index 00000000000..4796f914527 --- /dev/null +++ b/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java @@ -0,0 +1,151 @@ +package org.jabref.logic.integrity; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +import org.jabref.logic.database.DuplicateCheck; +import org.jabref.logic.importer.FetcherException; +import org.jabref.logic.importer.IdBasedFetcher; +import org.jabref.logic.importer.fetcher.ArXivFetcher; +import org.jabref.logic.importer.fetcher.CrossRef; +import org.jabref.logic.importer.fetcher.DoiFetcher; +import org.jabref.logic.importer.plaincitation.PlainCitationParser; +import org.jabref.model.database.BibDatabaseMode; +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.BibEntryTypesManager; +import org.jabref.model.entry.identifier.DOI; + +/** + * Validates a BibEntry depending on if it + * is consistent with the fetched Entry + */ +public class RefChecker { + PlainCitationParser parser; + DoiFetcher doiFetcher; + ArXivFetcher arxivFetcher; + CrossRef crossRef; + DuplicateCheck duplicateCheck; + + public RefChecker(PlainCitationParser parser, + DoiFetcher doiFetcher, + ArXivFetcher arXivFetcher) { + this(parser, doiFetcher, arXivFetcher, new CrossRef(), new DuplicateCheck(new BibEntryTypesManager())); + } + + public RefChecker(PlainCitationParser parser, + DoiFetcher doiFetcher, + ArXivFetcher 
arXivFetcher, + CrossRef crossRef, + DuplicateCheck duplicateCheck) { + this.parser = parser; + this.doiFetcher = doiFetcher; + this.arxivFetcher = arXivFetcher; + this.crossRef = crossRef; + this.duplicateCheck = duplicateCheck; + } + + private ReferenceValidity referenceValidityOfEntry(BibEntry entry) throws FetcherException { + return validityFromDoiFetcher(entry).lazyOr(() -> + validityFromCrossRef(entry) + ).lazyOr(() -> validityFromArxiv(entry)); + } + + private ReferenceValidity validityFromFetcher(BibEntry entry, IdBasedFetcher fetcher) throws FetcherException { + Optional doi = entry.getDOI(); + if (doi.isEmpty()) { + return new Fake(); + } + + Optional other = fetcher.performSearchById(doi.get().asString()); + return other.map(o -> compareReferences(entry, o)) + .orElse(new Fake()); + } + + private ReferenceValidity validityFromDoiFetcher(BibEntry entry) throws FetcherException { + return validityFromFetcher(entry, doiFetcher); + } + + private ReferenceValidity validityFromCrossRef(BibEntry entry) throws FetcherException { + Optional doiFound = crossRef.findIdentifier(entry); + + if (doiFound.isEmpty()) { + return new Fake(); + } else { + DOI doi = doiFound.get(); + return doiFetcher.performSearchById(doi.asString()).map( + (found) -> compareReferences(entry, found) + ).orElse(new Fake()); + } + } + + private ReferenceValidity validityFromArxiv(BibEntry entry) throws FetcherException { + + var m = arxivFetcher.findIdentifier(entry); + if (m.isEmpty()) { + return new Fake(); + } + return arxivFetcher.performSearchById(m.get().asString()).map( + found -> compareReferences(entry, found) + ).orElse(new Fake()); + } + + private ReferenceValidity compareReferences(BibEntry original, BibEntry trueEntry) { + if (duplicateCheck.isDuplicate(original, trueEntry, BibDatabaseMode.BIBTEX)) { + return new Real(trueEntry); + } else { + return new Fake(); + } + } + + @FunctionalInterface + private interface ReferenceValiditySupplier { + ReferenceValidity get() throws 
FetcherException; + } + + public abstract sealed class ReferenceValidity permits Real, Unsure, Fake { + + public ReferenceValidity or(ReferenceValidity other) { + if (this instanceof Real || other instanceof Fake) { + return this; + } + if (other instanceof Unsure otherUnsure && this instanceof Unsure thisUnsure) { + otherUnsure.addAll(thisUnsure); + } + return other; + } + + private ReferenceValidity lazyOr(ReferenceValiditySupplier other) throws FetcherException { + if (this instanceof Real) { + return this; + } else { + return or(other.get()); + } + } + } + + public final class Real extends ReferenceValidity { + BibEntry matchingReference; + + public Real(BibEntry matchingReference) { + this.matchingReference = matchingReference; + } + } + + public final class Unsure extends ReferenceValidity { + List matchingReferences; + + public Unsure(BibEntry matchingReference) { + List matchingReferences = new ArrayList<>(); + matchingReferences.add(matchingReference); + this.matchingReferences = matchingReferences; + } + + void addAll(Unsure other) { + this.matchingReferences.addAll(other.matchingReferences); + } + } + + public final class Fake extends ReferenceValidity { + } +} From eeec045fb72ef2d8859be423ca7c1817cff4bb90 Mon Sep 17 00:00:00 2001 From: Kosmas Xenakis Karapanagiotis Date: Sun, 26 Oct 2025 20:52:59 +0100 Subject: [PATCH 02/20] add made some methods public that are useful --- .../main/java/org/jabref/logic/integrity/RefChecker.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java b/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java index 4796f914527..887d6d8d523 100644 --- a/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java +++ b/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java @@ -62,11 +62,11 @@ private ReferenceValidity validityFromFetcher(BibEntry entry, IdBasedFetcher fet .orElse(new Fake()); } - private ReferenceValidity 
validityFromDoiFetcher(BibEntry entry) throws FetcherException { + public ReferenceValidity validityFromDoiFetcher(BibEntry entry) throws FetcherException { return validityFromFetcher(entry, doiFetcher); } - private ReferenceValidity validityFromCrossRef(BibEntry entry) throws FetcherException { + public ReferenceValidity validityFromCrossRef(BibEntry entry) throws FetcherException { Optional doiFound = crossRef.findIdentifier(entry); if (doiFound.isEmpty()) { @@ -79,7 +79,7 @@ private ReferenceValidity validityFromCrossRef(BibEntry entry) throws FetcherExc } } - private ReferenceValidity validityFromArxiv(BibEntry entry) throws FetcherException { + public ReferenceValidity validityFromArxiv(BibEntry entry) throws FetcherException { var m = arxivFetcher.findIdentifier(entry); if (m.isEmpty()) { From 735f985e5e65dfffe533123327b5592fefc584c1 Mon Sep 17 00:00:00 2001 From: Kosmas Xenakis Karapanagiotis Date: Sun, 26 Oct 2025 21:04:47 +0100 Subject: [PATCH 03/20] removed unused Parser argument --- .../jabref/logic/integrity/RefChecker.java | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java b/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java index 887d6d8d523..67c40799b70 100644 --- a/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java +++ b/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java @@ -10,7 +10,6 @@ import org.jabref.logic.importer.fetcher.ArXivFetcher; import org.jabref.logic.importer.fetcher.CrossRef; import org.jabref.logic.importer.fetcher.DoiFetcher; -import org.jabref.logic.importer.plaincitation.PlainCitationParser; import org.jabref.model.database.BibDatabaseMode; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.BibEntryTypesManager; @@ -21,23 +20,22 @@ * is consistent with the fetched Entry */ public class RefChecker { - PlainCitationParser parser; DoiFetcher doiFetcher; ArXivFetcher 
arxivFetcher; CrossRef crossRef; DuplicateCheck duplicateCheck; - public RefChecker(PlainCitationParser parser, - DoiFetcher doiFetcher, - ArXivFetcher arXivFetcher) { - this(parser, doiFetcher, arXivFetcher, new CrossRef(), new DuplicateCheck(new BibEntryTypesManager())); + public RefChecker( + DoiFetcher doiFetcher, + ArXivFetcher arXivFetcher) { + this(doiFetcher, arXivFetcher, new CrossRef(), new DuplicateCheck(new BibEntryTypesManager())); } - public RefChecker(PlainCitationParser parser, - DoiFetcher doiFetcher, - ArXivFetcher arXivFetcher, - CrossRef crossRef, - DuplicateCheck duplicateCheck) { + public RefChecker( + DoiFetcher doiFetcher, + ArXivFetcher arXivFetcher, + CrossRef crossRef, + DuplicateCheck duplicateCheck) { this.parser = parser; this.doiFetcher = doiFetcher; this.arxivFetcher = arXivFetcher; From be4d58818df7cbba988568b666a2818f3a6927f0 Mon Sep 17 00:00:00 2001 From: Kosmas Xenakis Karapanagiotis Date: Sun, 26 Oct 2025 22:11:35 +0100 Subject: [PATCH 04/20] add tests for refchecker and modified refchecker to add equals methods --- .../jabref/logic/integrity/RefChecker.java | 46 ++++- .../logic/integrity/RefCheckerTest.java | 169 ++++++++++++++++++ 2 files changed, 209 insertions(+), 6 deletions(-) create mode 100644 jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java diff --git a/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java b/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java index 67c40799b70..4f6b1e1d268 100644 --- a/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java +++ b/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java @@ -15,6 +15,8 @@ import org.jabref.model.entry.BibEntryTypesManager; import org.jabref.model.entry.identifier.DOI; +import com.google.common.base.Objects; + /** * Validates a BibEntry depending on if it * is consistent with the fetched Entry @@ -36,14 +38,13 @@ public RefChecker( ArXivFetcher arXivFetcher, CrossRef crossRef, DuplicateCheck 
duplicateCheck) { - this.parser = parser; this.doiFetcher = doiFetcher; this.arxivFetcher = arXivFetcher; this.crossRef = crossRef; this.duplicateCheck = duplicateCheck; } - private ReferenceValidity referenceValidityOfEntry(BibEntry entry) throws FetcherException { + public ReferenceValidity referenceValidityOfEntry(BibEntry entry) throws FetcherException { return validityFromDoiFetcher(entry).lazyOr(() -> validityFromCrossRef(entry) ).lazyOr(() -> validityFromArxiv(entry)); @@ -101,7 +102,7 @@ private interface ReferenceValiditySupplier { ReferenceValidity get() throws FetcherException; } - public abstract sealed class ReferenceValidity permits Real, Unsure, Fake { + public static abstract sealed class ReferenceValidity permits Real, Unsure, Fake { public ReferenceValidity or(ReferenceValidity other) { if (this instanceof Real || other instanceof Fake) { @@ -122,15 +123,30 @@ private ReferenceValidity lazyOr(ReferenceValiditySupplier other) throws Fetcher } } - public final class Real extends ReferenceValidity { + public static final class Real extends ReferenceValidity { BibEntry matchingReference; public Real(BibEntry matchingReference) { this.matchingReference = matchingReference; } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + Real real = (Real) o; + return Objects.equal(matchingReference, real.matchingReference); + } + + @Override + public int hashCode() { + return Objects.hashCode(matchingReference); + } } - public final class Unsure extends ReferenceValidity { + public static final class Unsure extends ReferenceValidity { List matchingReferences; public Unsure(BibEntry matchingReference) { @@ -142,8 +158,26 @@ public Unsure(BibEntry matchingReference) { void addAll(Unsure other) { this.matchingReferences.addAll(other.matchingReferences); } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != 
o.getClass()) + return false; + Unsure unsure = (Unsure) o; + return Objects.equal(matchingReferences, unsure.matchingReferences); + } + + @Override + public int hashCode() { + return Objects.hashCode(matchingReferences); + } } - public final class Fake extends ReferenceValidity { + public static final class Fake extends ReferenceValidity { + public boolean equals(Object o) { + return o.getClass() == Fake.class; + } } } diff --git a/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java b/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java new file mode 100644 index 00000000000..22992c1ca80 --- /dev/null +++ b/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java @@ -0,0 +1,169 @@ +package org.jabref.logic.integrity; + +import java.util.List; + +import javafx.collections.FXCollections; + +import org.jabref.logic.importer.FetcherException; +import org.jabref.logic.importer.ImportFormatPreferences; +import org.jabref.logic.importer.fetcher.ArXivFetcher; +import org.jabref.logic.importer.fetcher.DoiFetcher; +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.field.InternalField; +import org.jabref.model.entry.field.StandardField; +import org.jabref.model.entry.field.UnknownField; +import org.jabref.model.entry.types.StandardEntryType; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; +import org.mockito.Answers; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class RefCheckerTest { + private static final ImportFormatPreferences importFormatPreferences = mock(ImportFormatPreferences.class, Answers.RETURNS_DEEP_STUBS); + + public BibEntry realEntry = new BibEntry(StandardEntryType.InProceedings) + .withCitationKey("Decker_2007") + 
.withField(StandardField.AUTHOR, "Decker, Gero and Kopp, Oliver and Leymann, Frank and Weske, Mathias") + .withField(StandardField.BOOKTITLE, "IEEE International Conference on Web Services (ICWS 2007)") + .withField(StandardField.MONTH, "#jul#") + .withField(StandardField.PUBLISHER, "IEEE") + .withField(StandardField.TITLE, "BPEL4Chor: Extending BPEL for Modeling Choreographies") + .withField(StandardField.YEAR, "2007") + .withField(StandardField.PAGES, "296--303") + .withField(StandardField.DOI, "10.1109/icws.2007.59"); + public BibEntry realEntryNoDoi = new BibEntry(StandardEntryType.InProceedings) + .withCitationKey("Decker_2007") + .withField(StandardField.AUTHOR, "Decker, Gero and Kopp, Oliver and Leymann, Frank and Weske, Mathias") + .withField(StandardField.BOOKTITLE, "IEEE International Conference on Web Services (ICWS 2007)") + .withField(StandardField.MONTH, "#jul#") + .withField(StandardField.PUBLISHER, "IEEE") + .withField(StandardField.TITLE, "BPEL4Chor: Extending BPEL for Modeling Choreographies") + .withField(StandardField.YEAR, "2007") + .withField(StandardField.PAGES, "296--303"); + public BibEntry realEntryArxiv = new BibEntry(StandardEntryType.Article) + .withField(StandardField.TITLE, "The Architecture of Mr. DLib's Scientific Recommender-System API") + .withField(StandardField.DATE, "2018-11-26") + .withField(StandardField.ABSTRACT, "Recommender systems in academia are not widely available. This may be in part due to the difficulty and cost of developing and maintaining recommender systems. Many operators of academic products such as digital libraries and reference managers avoid this effort, although a recommender system could provide significant benefits to their users. In this paper, we introduce Mr. DLib's \"Recommendations as-a-Service\" (RaaS) API that allows operators of academic products to easily integrate a scientific recommender system into their products. Mr. 
DLib generates recommendations for research articles but in the future, recommendations may include call for papers, grants, etc. Operators of academic products can request recommendations from Mr. DLib and display these recommendations to their users. Mr. DLib can be integrated in just a few hours or days; creating an equivalent recommender system from scratch would require several months for an academic operator. Mr. DLib has been used by GESIS Sowiport and by the reference manager JabRef. Mr. DLib is open source and its goal is to facilitate the application of, and research on, scientific recommender systems. In this paper, we present the motivation for Mr. DLib, the architecture and details about the effectiveness. Mr. DLib has delivered 94m recommendations over a span of two years with an average click-through rate of 0.12%.") + .withField(StandardField.EPRINT, "1811.10364") + .withField(StandardField.FILE, ":http\\://arxiv.org/pdf/1811.10364v1:PDF") + .withField(StandardField.EPRINTTYPE, "arXiv") + .withField(StandardField.EPRINTCLASS, "cs.IR") + .withField(new UnknownField("copyright"), "arXiv.org perpetual, non-exclusive license") + .withField(InternalField.KEY_FIELD, "https://doi.org/10.48550/arxiv.1811.10364") + .withField(StandardField.YEAR, "2018") + .withField(StandardField.KEYWORDS, "Information Retrieval (cs.IR), Artificial Intelligence (cs.AI), Digital Libraries (cs.DL), Machine Learning (cs.LG), FOS: Computer and information sciences") + .withField(StandardField.AUTHOR, "Beel, Joeran and Collins, Andrew and Aizawa, Akiko") + .withField(StandardField.PUBLISHER, "arXiv") + .withField(StandardField.DOI, "10.48550/ARXIV.1811.10364"); + public BibEntry closeToRealEntry = new BibEntry(StandardEntryType.InProceedings) + .withCitationKey("Decker_2007") + .withField(StandardField.AUTHOR, "Decker, Gero and Kopp, Oliver and Leymann, Frank and Weske, Mathias") + .withField(StandardField.BOOKTITLE, "IEEE International Conference on Web Services (ICWS 2007)") + 
.withField(StandardField.MONTH, "#jul#") + .withField(StandardField.PUBLISHER, "IEEE") + .withField(StandardField.TITLE, "BPEL4Chor: Extending BPEL for Modeling Choreographies") + .withField(StandardField.YEAR, "2008") + .withField(StandardField.PAGES, "296--303") + .withField(StandardField.DOI, "10.1109/icws.2007.59"); + public BibEntry fakeEntry = new BibEntry(StandardEntryType.InProceedings) + .withCitationKey("Decker_2003") + .withField(StandardField.AUTHOR, "Kopp, Oliver") + .withField(StandardField.BOOKTITLE, "IEEE International Conference on Web Services (ICWS 2007)") + .withField(StandardField.MONTH, "#jul#") + .withField(StandardField.PUBLISHER, "IEEE") + .withField(StandardField.TITLE, "Some Title") + .withField(StandardField.YEAR, "2013") + .withField(StandardField.PAGES, "296--303"); + + public RefChecker refChecker; + + @BeforeAll + public static void setUpAll() { + when(importFormatPreferences.bibEntryPreferences().getKeywordSeparator()).thenReturn(','); + // Used during DOI fetch process + when(importFormatPreferences.fieldPreferences().getNonWrappableFields()).thenReturn( + FXCollections.observableArrayList(List.of( + StandardField.PDF, + StandardField.PS, + StandardField.URL, + StandardField.DOI, + StandardField.FILE, + StandardField.ISBN, + StandardField.ISSN))); + } + + @BeforeEach + public void setUp() { + ArXivFetcher af = new ArXivFetcher(importFormatPreferences); + DoiFetcher df = new DoiFetcher(importFormatPreferences); + this.refChecker = new RefChecker(df, af); + } + + @Test + void findsRealEntry() throws FetcherException { + RefChecker.ReferenceValidity rv = refChecker.referenceValidityOfEntry(realEntry); + assertEquals(RefChecker.Real.class, rv.getClass()); + } + + @Test + void findsRealEntryFromDoi() throws FetcherException { + RefChecker.ReferenceValidity rv = refChecker.validityFromDoiFetcher(realEntry); + assertEquals(RefChecker.Real.class, rv.getClass()); + } + + @Test + void findsRealEntryWithoutDoi() throws FetcherException { + 
RefChecker.ReferenceValidity rv = refChecker.referenceValidityOfEntry(realEntryNoDoi); + assertEquals(RefChecker.Real.class, rv.getClass()); + } + + @Test + void noFakeEntry() throws FetcherException { + RefChecker.ReferenceValidity rv = refChecker.referenceValidityOfEntry(fakeEntry); + assertEquals(RefChecker.Fake.class, rv.getClass()); + } + + @Test + void findsRealFromArxiv() throws FetcherException { + RefChecker.ReferenceValidity rv = refChecker.referenceValidityOfEntry(realEntryArxiv); + assertEquals(RefChecker.Real.class, rv.getClass()); + } + + @Nested + public class ReferenceValidityTest { + @Test + void realEquals() { + var t1 = new RefChecker.Real(realEntry); + var t2 = new RefChecker.Real(realEntry); + assertEquals(t1, t2); + assertNotEquals(t1, new RefChecker.Real(fakeEntry)); + } + + @Test + void fakeEquals() { + var t1 = new RefChecker.Real(null); + var t2 = new RefChecker.Fake(); + + assertNotEquals(t1, t2); + + assertEquals(t2, new RefChecker.Fake()); + } + + @Test + void orTest() { + var t1 = new RefChecker.Real(realEntry); + var t2 = new RefChecker.Real(fakeEntry); + var t3 = new RefChecker.Fake(); + assertEquals(t1, t1.or(t2)); + assertEquals(t1, t1.or(t3)); + assertEquals(t2, t3.or(t2)); + } + } +} From 99398993fc1bfd2b2dae7fb6cf86ba341b2ac152 Mon Sep 17 00:00:00 2001 From: Kosmas Xenakis Karapanagiotis Date: Sun, 26 Oct 2025 23:05:12 +0100 Subject: [PATCH 05/20] added validation for entry lists and parsing from input --- .../jabref/logic/integrity/RefChecker.java | 25 ++++++++++++++++--- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java b/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java index 4f6b1e1d268..b87a4de2c1b 100644 --- a/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java +++ b/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java @@ -1,7 +1,10 @@ package org.jabref.logic.integrity; import java.util.ArrayList; +import 
java.util.HashMap; import java.util.List; +import java.util.Map; +import java.util.Objects; import java.util.Optional; import org.jabref.logic.database.DuplicateCheck; @@ -10,13 +13,13 @@ import org.jabref.logic.importer.fetcher.ArXivFetcher; import org.jabref.logic.importer.fetcher.CrossRef; import org.jabref.logic.importer.fetcher.DoiFetcher; +import org.jabref.logic.importer.plaincitation.PlainCitationParser; +import org.jabref.logic.importer.plaincitation.SeveralPlainCitationParser; import org.jabref.model.database.BibDatabaseMode; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.BibEntryTypesManager; import org.jabref.model.entry.identifier.DOI; -import com.google.common.base.Objects; - /** * Validates a BibEntry depending on if it * is consistent with the fetched Entry @@ -89,6 +92,20 @@ public ReferenceValidity validityFromArxiv(BibEntry entry) throws FetcherExcepti ).orElse(new Fake()); } + public Map validateListOfEntries(List entries) throws FetcherException { + + Map entriesToValidity = new HashMap<>(); + for (BibEntry entry : entries) { + entriesToValidity.put(entry, referenceValidityOfEntry(entry)); + } + return entriesToValidity; + } + + public Map parseListAndValidate(String input, PlainCitationParser parser) throws FetcherException { + SeveralPlainCitationParser citationParser = new SeveralPlainCitationParser(parser); + return validateListOfEntries(citationParser.parseSeveralPlainCitations(input)); + } + private ReferenceValidity compareReferences(BibEntry original, BibEntry trueEntry) { if (duplicateCheck.isDuplicate(original, trueEntry, BibDatabaseMode.BIBTEX)) { return new Real(trueEntry); @@ -137,7 +154,7 @@ public boolean equals(Object o) { if (o == null || getClass() != o.getClass()) return false; Real real = (Real) o; - return Objects.equal(matchingReference, real.matchingReference); + return Objects.equals(matchingReference, real.matchingReference); } @Override @@ -166,7 +183,7 @@ public boolean equals(Object o) { if (o 
== null || getClass() != o.getClass()) return false; Unsure unsure = (Unsure) o; - return Objects.equal(matchingReferences, unsure.matchingReferences); + return Objects.equals(matchingReferences, unsure.matchingReferences); } @Override From 037d24393fb706db268bc9df5fe6afd95cf0a4d3 Mon Sep 17 00:00:00 2001 From: Kosmas Xenakis Karapanagiotis Date: Mon, 27 Oct 2025 12:37:54 +0100 Subject: [PATCH 06/20] added test for list of entries --- .../jabref/logic/integrity/RefCheckerTest.java | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java b/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java index 22992c1ca80..d8c8c64e04e 100644 --- a/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java +++ b/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java @@ -134,6 +134,23 @@ void noFakeEntry() throws FetcherException { void findsRealFromArxiv() throws FetcherException { RefChecker.ReferenceValidity rv = refChecker.referenceValidityOfEntry(realEntryArxiv); assertEquals(RefChecker.Real.class, rv.getClass()); + assertEquals(RefChecker.Real.class, refChecker.validityFromArxiv(realEntryArxiv).getClass()); + } + + @Test + void validateListOfEntriesTest() throws FetcherException { + List entries = List.of(realEntry, realEntryNoDoi, fakeEntry); + var e = refChecker.validateListOfEntries(entries); + + assertEquals(e.size(), 3); + assertEquals(e.get(realEntry).getClass(), RefChecker.Real.class); + assertEquals(e.get(realEntryNoDoi).getClass(), RefChecker.Real.class); + assertEquals(e.get(fakeEntry).getClass(), RefChecker.Fake.class); + } + + @Test + void parseAndValidateTest() throws FetcherException { + } @Nested From 9725af03c76b6666b339dea6206705e35e4a93f3 Mon Sep 17 00:00:00 2001 From: Kosmas Xenakis Karapanagiotis Date: Mon, 27 Oct 2025 12:38:58 +0100 Subject: [PATCH 07/20] removed parse and validate, there is no reason to have that in this class --- 
.../main/java/org/jabref/logic/integrity/RefChecker.java | 7 ------- 1 file changed, 7 deletions(-) diff --git a/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java b/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java index b87a4de2c1b..4d507225767 100644 --- a/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java +++ b/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java @@ -13,8 +13,6 @@ import org.jabref.logic.importer.fetcher.ArXivFetcher; import org.jabref.logic.importer.fetcher.CrossRef; import org.jabref.logic.importer.fetcher.DoiFetcher; -import org.jabref.logic.importer.plaincitation.PlainCitationParser; -import org.jabref.logic.importer.plaincitation.SeveralPlainCitationParser; import org.jabref.model.database.BibDatabaseMode; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.BibEntryTypesManager; @@ -101,11 +99,6 @@ public Map validateListOfEntries(List ent return entriesToValidity; } - public Map parseListAndValidate(String input, PlainCitationParser parser) throws FetcherException { - SeveralPlainCitationParser citationParser = new SeveralPlainCitationParser(parser); - return validateListOfEntries(citationParser.parseSeveralPlainCitations(input)); - } - private ReferenceValidity compareReferences(BibEntry original, BibEntry trueEntry) { if (duplicateCheck.isDuplicate(original, trueEntry, BibDatabaseMode.BIBTEX)) { return new Real(trueEntry); From 43dc552bd98645fe90cb58762af668c261dc2ad6 Mon Sep 17 00:00:00 2001 From: Kosmas Xenakis Karapanagiotis Date: Mon, 27 Oct 2025 12:50:11 +0100 Subject: [PATCH 08/20] added documentation --- .../jabref/logic/integrity/RefChecker.java | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java b/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java index 4d507225767..42fe9660b02 100644 --- a/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java +++ 
b/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java @@ -45,6 +45,17 @@ public RefChecker( this.duplicateCheck = duplicateCheck; } + /** + * Tries to find the best reference validity + * among current ways. If any of the methods signal + * that it is real, it returns early. + *

+ * DoiFetcher -> CrossRef -> ArxivFetcher + * + * @param entry entry checking + * @return the reference validity + * @throws FetcherException any error from fetchers + */ public ReferenceValidity referenceValidityOfEntry(BibEntry entry) throws FetcherException { return validityFromDoiFetcher(entry).lazyOr(() -> validityFromCrossRef(entry) @@ -62,10 +73,24 @@ private ReferenceValidity validityFromFetcher(BibEntry entry, IdBasedFetcher fet .orElse(new Fake()); } + /** + * Tests validity only from the DoiFetcher. + * + * @param entry the entry + * @return the reference validity + * @throws FetcherException the fetcher exception + */ public ReferenceValidity validityFromDoiFetcher(BibEntry entry) throws FetcherException { return validityFromFetcher(entry, doiFetcher); } + /** + * Validity only from the CrossRef and later from the DoiFetcher. + * + * @param entry the entry + * @return the reference validity + * @throws FetcherException the fetcher exception + */ public ReferenceValidity validityFromCrossRef(BibEntry entry) throws FetcherException { Optional doiFound = crossRef.findIdentifier(entry); @@ -79,6 +104,13 @@ public ReferenceValidity validityFromCrossRef(BibEntry entry) throws FetcherExce } } + /** + * Validity only from the arxivFetcher. + * + * @param entry the entry + * @return the reference validity + * @throws FetcherException the fetcher exception + */ public ReferenceValidity validityFromArxiv(BibEntry entry) throws FetcherException { var m = arxivFetcher.findIdentifier(entry); @@ -90,6 +122,14 @@ public ReferenceValidity validityFromArxiv(BibEntry entry) throws FetcherExcepti ).orElse(new Fake()); } + /** + * Takes a list for entries and returns the mapping of them with their corresponding + * reference validity. 
+ * + * @param entries the entries + * @return the map + * @throws FetcherException the fetcher exception + */ public Map validateListOfEntries(List entries) throws FetcherException { Map entriesToValidity = new HashMap<>(); From b9027dab40739af50d3d0403d6ebe6ba6a7b238f Mon Sep 17 00:00:00 2001 From: Kosmas Xenakis Karapanagiotis Date: Fri, 31 Oct 2025 01:19:27 +0100 Subject: [PATCH 09/20] added degree of similarity to see if entries have similar fields --- .../jabref/logic/database/DuplicateCheck.java | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java b/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java index 5aae78d1729..8f4dd52848f 100644 --- a/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java +++ b/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java @@ -28,6 +28,7 @@ import org.jabref.model.strings.StringUtil; import com.google.common.collect.Sets; +import org.apache.commons.text.similarity.LevenshteinDistance; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -364,4 +365,26 @@ public Optional containsDuplicate(final BibDatabase database, return database.getEntries().stream().filter(other -> isDuplicate(entry, other, bibDatabaseMode)).findFirst(); } + + /** + * Checks across all fields of the entries, + * any matching ones get compared. + * If they are not the same the score goes down. 
+ * + * + * + * @param one The first entry + * @param two The second entry + * @return number [0,1] 1 representing the same (one potentially having more fields), 0 representing completely different + */ + public double degreeOfSimilarity(final BibEntry one, final BibEntry two) { + return one.getFields((f) -> two.getField(f).isPresent()) + .stream().mapToDouble((field) -> { + String first = one.getField(field).get(); + String second = two.getField(field).get(); + int maxLength = Math.max(first.length(), second.length()); + Integer levenshteinDistance = LevenshteinDistance.getDefaultInstance().apply(first, second); + return 1 - levenshteinDistance / (double) maxLength; + }).average().orElse(0.0); + } } From ead14fd936a147c8ac9a826500d4b2ebe0c7de98 Mon Sep 17 00:00:00 2001 From: Kosmas Xenakis Karapanagiotis Date: Fri, 31 Oct 2025 01:19:58 +0100 Subject: [PATCH 10/20] added tests for degree of similarity --- .../logic/database/DuplicateCheckTest.java | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/jablib/src/test/java/org/jabref/logic/database/DuplicateCheckTest.java b/jablib/src/test/java/org/jabref/logic/database/DuplicateCheckTest.java index 609b50109dc..567c569484f 100644 --- a/jablib/src/test/java/org/jabref/logic/database/DuplicateCheckTest.java +++ b/jablib/src/test/java/org/jabref/logic/database/DuplicateCheckTest.java @@ -7,6 +7,7 @@ import org.jabref.model.entry.BibEntryTypesManager; import org.jabref.model.entry.field.Field; import org.jabref.model.entry.field.StandardField; +import org.jabref.model.entry.field.UnknownField; import org.jabref.model.entry.types.StandardEntryType; import org.junit.jupiter.api.BeforeEach; @@ -611,4 +612,51 @@ void differentInCollectionWithTheSameISBNAreNotDuplicates() { assertFalse(duplicateChecker.isDuplicate(entryOne, entryTwo, BibDatabaseMode.BIBTEX)); } + + @Test + void degreeOfSimilarityOfSameEntryIsOne() { + + assertEquals(1.0, duplicateChecker.degreeOfSimilarity(getSimpleArticle(), 
getSimpleArticle())); + assertEquals(1.0, duplicateChecker.degreeOfSimilarity(getSimpleInCollection(), getSimpleInCollection())); + } + + @Test + void differentEntriesHaveSmallDegreeOfSimilarity() { + assertTrue(0.3 > + duplicateChecker.degreeOfSimilarity( + new BibEntry(StandardEntryType.Article) + .withField(StandardField.TITLE, "Some Article"), + new BibEntry(StandardEntryType.InCollection) + .withField(StandardField.TITLE, "Other Collection") + ) + ); + } + + @Test + void entriesWithNoMatchingFieldHaveNoSimilarity() { + assertEquals(0.0, duplicateChecker.degreeOfSimilarity( + new BibEntry(StandardEntryType.Article) + .withField(StandardField.TITLE, "Some Article"), + new BibEntry(StandardEntryType.Article) + .withField(StandardField.AUTHOR, "Some Author") + )); + } + + @Test + void moreFieldsDoesNotAffectTheSimilarity() { + assertEquals(1.0, duplicateChecker.degreeOfSimilarity( + getSimpleArticle(), + getSimpleArticle().withField(new UnknownField("secret"), "Something") + )); + } + + @Test + void similarEntriesHaveAHighDegreeOfSimilarity() { + double similarity = duplicateChecker.degreeOfSimilarity( + getSimpleArticle().withField(StandardField.YEAR, "2018"), + getSimpleArticle() + ); + assertTrue(0.8 < similarity); + assertTrue(1.0 > similarity); + } } From e442b24bb7c0a7aff612d58afe859af601a2eb28 Mon Sep 17 00:00:00 2001 From: Kosmas Xenakis Karapanagiotis Date: Fri, 31 Oct 2025 01:21:37 +0100 Subject: [PATCH 11/20] changed refchecker to work with degree of similarity instead of duplicate check --- .../main/java/org/jabref/logic/integrity/RefChecker.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java b/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java index 42fe9660b02..88a242c30de 100644 --- a/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java +++ b/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java @@ -13,7 +13,6 @@ import 
org.jabref.logic.importer.fetcher.ArXivFetcher; import org.jabref.logic.importer.fetcher.CrossRef; import org.jabref.logic.importer.fetcher.DoiFetcher; -import org.jabref.model.database.BibDatabaseMode; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.BibEntryTypesManager; import org.jabref.model.entry.identifier.DOI; @@ -140,8 +139,11 @@ public Map validateListOfEntries(List ent } private ReferenceValidity compareReferences(BibEntry original, BibEntry trueEntry) { - if (duplicateCheck.isDuplicate(original, trueEntry, BibDatabaseMode.BIBTEX)) { + double similarity = duplicateCheck.degreeOfSimilarity(original, trueEntry); + if (similarity >= 0.999) { return new Real(trueEntry); + } else if (similarity > 0.8) { + return new Unsure(trueEntry); } else { return new Fake(); } From 151aaa7e49de02b3a6db2fb6a5ddd76c62f679ba Mon Sep 17 00:00:00 2001 From: Kosmas Xenakis Karapanagiotis Date: Fri, 31 Oct 2025 14:53:36 +0100 Subject: [PATCH 12/20] added tests for updated refchecker with unsure --- .../jabref/logic/integrity/RefChecker.java | 7 ++++--- .../logic/integrity/RefCheckerTest.java | 19 ++++++++++++++++++- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java b/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java index 88a242c30de..5687c8a7977 100644 --- a/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java +++ b/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java @@ -1,11 +1,12 @@ package org.jabref.logic.integrity; -import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Optional; +import java.util.Set; import org.jabref.logic.database.DuplicateCheck; import org.jabref.logic.importer.FetcherException; @@ -199,10 +200,10 @@ public int hashCode() { } public static final class Unsure extends ReferenceValidity { - List 
matchingReferences; + Set matchingReferences; public Unsure(BibEntry matchingReference) { - List matchingReferences = new ArrayList<>(); + Set matchingReferences = new HashSet<>(); matchingReferences.add(matchingReference); this.matchingReferences = matchingReferences; } diff --git a/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java b/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java index d8c8c64e04e..e85a8f3def3 100644 --- a/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java +++ b/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java @@ -69,7 +69,7 @@ public class RefCheckerTest { .withField(StandardField.MONTH, "#jul#") .withField(StandardField.PUBLISHER, "IEEE") .withField(StandardField.TITLE, "BPEL4Chor: Extending BPEL for Modeling Choreographies") - .withField(StandardField.YEAR, "2008") + .withField(StandardField.YEAR, "2008") // Incorrect Field .withField(StandardField.PAGES, "296--303") .withField(StandardField.DOI, "10.1109/icws.2007.59"); public BibEntry fakeEntry = new BibEntry(StandardEntryType.InProceedings) @@ -118,6 +118,13 @@ void findsRealEntryFromDoi() throws FetcherException { assertEquals(RefChecker.Real.class, rv.getClass()); } + @Test + void closeToRealEntry() throws FetcherException { + RefChecker.ReferenceValidity rv = refChecker.referenceValidityOfEntry(closeToRealEntry); + System.out.println(((RefChecker.Unsure) rv).matchingReferences); + assertEquals(RefChecker.Unsure.class, rv.getClass()); + } + @Test void findsRealEntryWithoutDoi() throws FetcherException { RefChecker.ReferenceValidity rv = refChecker.referenceValidityOfEntry(realEntryNoDoi); @@ -182,5 +189,15 @@ void orTest() { assertEquals(t1, t1.or(t3)); assertEquals(t2, t3.or(t2)); } + + @Test + void unsureTest() { + var t1 = new RefChecker.Unsure(realEntry); + var t2 = new RefChecker.Unsure(fakeEntry); + assertNotEquals(t1, t2); + t1.or(t2); + t2.or(t1); + assertEquals(t1, t2); + } } } From 
c8bfb21d571cca3de0c2099f07b476a2d0f1fa8d Mon Sep 17 00:00:00 2001 From: Kosmas Xenakis Karapanagiotis Date: Fri, 31 Oct 2025 15:00:06 +0100 Subject: [PATCH 13/20] improved degree of similarity docs --- .../org/jabref/logic/database/DuplicateCheck.java | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java b/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java index 8f4dd52848f..58338815df0 100644 --- a/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java +++ b/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java @@ -370,8 +370,16 @@ public Optional containsDuplicate(final BibDatabase database, * Checks across all fields of the entries, * any matching ones get compared. * If they are not the same the score goes down. - * - * + * The score goes down depending on the + * Levenshtein distance between the two entries. + *
<p>
+ * If the result is zero, it means that either no common fields were found + * or that all common fields were very far apart lexically. + *
<p>
+ * If the result is one, it means that there was at least one common field + * and all the common fields were the same. + *
<p>
+ * Similar entries have a score of above 0.8 * * @param one The first entry * @param two The second entry From 6d783e9b0144344afe7a44b3a105b10c92697fac Mon Sep 17 00:00:00 2001 From: Kosmas Xenakis Karapanagiotis Date: Fri, 31 Oct 2025 15:33:06 +0100 Subject: [PATCH 14/20] fixed checkstyle --- .../org/jabref/logic/integrity/RefChecker.java | 16 ++++++++++++---- .../logic/database/DuplicateCheckTest.java | 1 - .../jabref/logic/integrity/RefCheckerTest.java | 15 +++++---------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java b/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java index 5687c8a7977..d4cc253e26b 100644 --- a/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java +++ b/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java @@ -185,10 +185,12 @@ public Real(BibEntry matchingReference) { @Override public boolean equals(Object o) { - if (this == o) + if (this == o) { return true; - if (o == null || getClass() != o.getClass()) + } + if (o == null || getClass() != o.getClass()) { return false; + } Real real = (Real) o; return Objects.equals(matchingReference, real.matchingReference); } @@ -214,10 +216,12 @@ void addAll(Unsure other) { @Override public boolean equals(Object o) { - if (this == o) + if (this == o) { return true; - if (o == null || getClass() != o.getClass()) + } + if (o == null || getClass() != o.getClass()) { return false; + } Unsure unsure = (Unsure) o; return Objects.equals(matchingReferences, unsure.matchingReferences); } @@ -232,5 +236,9 @@ public static final class Fake extends ReferenceValidity { public boolean equals(Object o) { return o.getClass() == Fake.class; } + + public int hashCode() { + return Objects.hashCode(Fake.class); + } } } diff --git a/jablib/src/test/java/org/jabref/logic/database/DuplicateCheckTest.java b/jablib/src/test/java/org/jabref/logic/database/DuplicateCheckTest.java index 567c569484f..c428a8a9961 100644 
--- a/jablib/src/test/java/org/jabref/logic/database/DuplicateCheckTest.java +++ b/jablib/src/test/java/org/jabref/logic/database/DuplicateCheckTest.java @@ -615,7 +615,6 @@ void differentInCollectionWithTheSameISBNAreNotDuplicates() { @Test void degreeOfSimilarityOfSameEntryIsOne() { - assertEquals(1.0, duplicateChecker.degreeOfSimilarity(getSimpleArticle(), getSimpleArticle())); assertEquals(1.0, duplicateChecker.degreeOfSimilarity(getSimpleInCollection(), getSimpleInCollection())); } diff --git a/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java b/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java index e85a8f3def3..ebe23d46d0f 100644 --- a/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java +++ b/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java @@ -26,7 +26,7 @@ import static org.mockito.Mockito.when; public class RefCheckerTest { - private static final ImportFormatPreferences importFormatPreferences = mock(ImportFormatPreferences.class, Answers.RETURNS_DEEP_STUBS); + private static final ImportFormatPreferences IMPORT_FORMAT_PREFERENCES = mock(ImportFormatPreferences.class, Answers.RETURNS_DEEP_STUBS); public BibEntry realEntry = new BibEntry(StandardEntryType.InProceedings) .withCitationKey("Decker_2007") @@ -86,9 +86,9 @@ public class RefCheckerTest { @BeforeAll public static void setUpAll() { - when(importFormatPreferences.bibEntryPreferences().getKeywordSeparator()).thenReturn(','); + when(IMPORT_FORMAT_PREFERENCES.bibEntryPreferences().getKeywordSeparator()).thenReturn(','); // Used during DOI fetch process - when(importFormatPreferences.fieldPreferences().getNonWrappableFields()).thenReturn( + when(IMPORT_FORMAT_PREFERENCES.fieldPreferences().getNonWrappableFields()).thenReturn( FXCollections.observableArrayList(List.of( StandardField.PDF, StandardField.PS, @@ -101,8 +101,8 @@ public static void setUpAll() { @BeforeEach public void setUp() { - ArXivFetcher af = new 
ArXivFetcher(importFormatPreferences); - DoiFetcher df = new DoiFetcher(importFormatPreferences); + ArXivFetcher af = new ArXivFetcher(IMPORT_FORMAT_PREFERENCES); + DoiFetcher df = new DoiFetcher(IMPORT_FORMAT_PREFERENCES); this.refChecker = new RefChecker(df, af); } @@ -155,11 +155,6 @@ void validateListOfEntriesTest() throws FetcherException { assertEquals(e.get(fakeEntry).getClass(), RefChecker.Fake.class); } - @Test - void parseAndValidateTest() throws FetcherException { - - } - @Nested public class ReferenceValidityTest { @Test From f77341aa996271da03bd515b0ca214bec9e18786 Mon Sep 17 00:00:00 2001 From: Kosmas Xenakis Karapanagiotis Date: Fri, 31 Oct 2025 15:41:48 +0100 Subject: [PATCH 15/20] fix rewrite run --- .../java/org/jabref/logic/integrity/RefCheckerTest.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java b/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java index ebe23d46d0f..37bd708629f 100644 --- a/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java +++ b/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java @@ -149,10 +149,10 @@ void validateListOfEntriesTest() throws FetcherException { List entries = List.of(realEntry, realEntryNoDoi, fakeEntry); var e = refChecker.validateListOfEntries(entries); - assertEquals(e.size(), 3); - assertEquals(e.get(realEntry).getClass(), RefChecker.Real.class); - assertEquals(e.get(realEntryNoDoi).getClass(), RefChecker.Real.class); - assertEquals(e.get(fakeEntry).getClass(), RefChecker.Fake.class); + assertEquals(3, e.size()); + assertEquals(RefChecker.Real.class, e.get(realEntry).getClass()); + assertEquals(RefChecker.Real.class, e.get(realEntryNoDoi).getClass()); + assertEquals(RefChecker.Fake.class, e.get(fakeEntry).getClass()); } @Nested From a33eb13a39a679663aae17c9d6a9dfd3b7041f5f Mon Sep 17 00:00:00 2001 From: Kosmas Xenakis Karapanagiotis Date: Sat, 1 Nov 2025 19:11:00 
+0100 Subject: [PATCH 16/20] refactored duplicateCheck#degreeOfSimilarity to use the StringSimilarity class --- .../main/java/org/jabref/logic/database/DuplicateCheck.java | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java b/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java index 58338815df0..154b591923e 100644 --- a/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java +++ b/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java @@ -28,7 +28,6 @@ import org.jabref.model.strings.StringUtil; import com.google.common.collect.Sets; -import org.apache.commons.text.similarity.LevenshteinDistance; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -390,9 +389,7 @@ public double degreeOfSimilarity(final BibEntry one, final BibEntry two) { .stream().mapToDouble((field) -> { String first = one.getField(field).get(); String second = two.getField(field).get(); - int maxLength = Math.max(first.length(), second.length()); - Integer levenshteinDistance = LevenshteinDistance.getDefaultInstance().apply(first, second); - return 1 - levenshteinDistance / (double) maxLength; + return new StringSimilarity().similarity(first, second); }).average().orElse(0.0); } } From 6c1aa0e6cbb3a2ce4c780a60f449788c75b06f0e Mon Sep 17 00:00:00 2001 From: Kosmas Xenakis Karapanagiotis Date: Sun, 2 Nov 2025 00:40:23 +0100 Subject: [PATCH 17/20] fix small changes for code quality --- .../main/java/org/jabref/logic/database/DuplicateCheck.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java b/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java index 154b591923e..a21e589cd4b 100644 --- a/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java +++ b/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java @@ -385,11 +385,12 @@ public Optional 
containsDuplicate(final BibDatabase database, * @return number [0,1] 1 representing the same (one potentially having more fields), 0 representing completely different */ public double degreeOfSimilarity(final BibEntry one, final BibEntry two) { - return one.getFields((f) -> two.getField(f).isPresent()) + StringSimilarity stringSimilarity = new StringSimilarity(); + return one.getFields((field) -> two.getField(field).isPresent()) .stream().mapToDouble((field) -> { String first = one.getField(field).get(); String second = two.getField(field).get(); - return new StringSimilarity().similarity(first, second); + return stringSimilarity.similarity(first, second); }).average().orElse(0.0); } } From 80b8058628d4e4e30e624536d06afc405123abf4 Mon Sep 17 00:00:00 2001 From: Kosmas Xenakis Karapanagiotis Date: Sun, 2 Nov 2025 01:20:18 +0100 Subject: [PATCH 18/20] changed the or functionality to return a new Unsure object instead of modifying the old one, small code quality changes --- .../jabref/logic/integrity/RefChecker.java | 48 ++++++++++++------- .../logic/integrity/RefCheckerTest.java | 6 +-- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java b/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java index d4cc253e26b..fced43e5e35 100644 --- a/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java +++ b/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java @@ -16,8 +16,11 @@ import org.jabref.logic.importer.fetcher.DoiFetcher; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.BibEntryTypesManager; +import org.jabref.model.entry.identifier.ArXivIdentifier; import org.jabref.model.entry.identifier.DOI; +import com.airhacks.afterburner.injection.Injector; + /** * Validates a BibEntry depending on if it * is consistent with the fetched Entry @@ -28,10 +31,8 @@ public class RefChecker { CrossRef crossRef; DuplicateCheck duplicateCheck; - public RefChecker( 
- DoiFetcher doiFetcher, - ArXivFetcher arXivFetcher) { - this(doiFetcher, arXivFetcher, new CrossRef(), new DuplicateCheck(new BibEntryTypesManager())); + public RefChecker(DoiFetcher doiFetcher, ArXivFetcher arXivFetcher) { + this(doiFetcher, arXivFetcher, new CrossRef(), new DuplicateCheck(Injector.instantiateModelOrService(BibEntryTypesManager.class))); } public RefChecker( @@ -69,7 +70,7 @@ private ReferenceValidity validityFromFetcher(BibEntry entry, IdBasedFetcher fet } Optional other = fetcher.performSearchById(doi.get().asString()); - return other.map(o -> compareReferences(entry, o)) + return other.map(foundEntry -> compareReferences(entry, foundEntry)) .orElse(new Fake()); } @@ -113,12 +114,12 @@ public ReferenceValidity validityFromCrossRef(BibEntry entry) throws FetcherExce */ public ReferenceValidity validityFromArxiv(BibEntry entry) throws FetcherException { - var m = arxivFetcher.findIdentifier(entry); - if (m.isEmpty()) { + Optional foundIdentifier = arxivFetcher.findIdentifier(entry); + if (foundIdentifier.isEmpty()) { return new Fake(); } - return arxivFetcher.performSearchById(m.get().asString()).map( - found -> compareReferences(entry, found) + return arxivFetcher.performSearchById(foundIdentifier.get().asString()).map( + foundEntry -> compareReferences(entry, foundEntry) ).orElse(new Fake()); } @@ -139,12 +140,12 @@ public Map validateListOfEntries(List ent return entriesToValidity; } - private ReferenceValidity compareReferences(BibEntry original, BibEntry trueEntry) { - double similarity = duplicateCheck.degreeOfSimilarity(original, trueEntry); + private ReferenceValidity compareReferences(BibEntry localEntry, BibEntry validFoundEntry) { + double similarity = duplicateCheck.degreeOfSimilarity(localEntry, validFoundEntry); if (similarity >= 0.999) { - return new Real(trueEntry); + return new Real(validFoundEntry); } else if (similarity > 0.8) { - return new Unsure(trueEntry); + return new Unsure(validFoundEntry); } else { return new Fake(); } 
@@ -162,7 +163,10 @@ public ReferenceValidity or(ReferenceValidity other) { return this; } if (other instanceof Unsure otherUnsure && this instanceof Unsure thisUnsure) { - otherUnsure.addAll(thisUnsure); + Unsure merge = new Unsure(); + merge.addAll(thisUnsure); + merge.addAll(otherUnsure); + return merge; } return other; } @@ -199,15 +203,21 @@ public boolean equals(Object o) { public int hashCode() { return Objects.hashCode(matchingReference); } + + public BibEntry getMatchingReference() { + return matchingReference; + } } public static final class Unsure extends ReferenceValidity { Set matchingReferences; public Unsure(BibEntry matchingReference) { - Set matchingReferences = new HashSet<>(); - matchingReferences.add(matchingReference); - this.matchingReferences = matchingReferences; + this.matchingReferences = new HashSet<>(Set.of(matchingReference)); + } + + private Unsure() { + this.matchingReferences = new HashSet<>(); } void addAll(Unsure other) { @@ -230,6 +240,10 @@ public boolean equals(Object o) { public int hashCode() { return Objects.hashCode(matchingReferences); } + + public Set getMatchingReferences() { + return matchingReferences; + } } public static final class Fake extends ReferenceValidity { diff --git a/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java b/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java index 37bd708629f..b1f4c9520b9 100644 --- a/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java +++ b/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java @@ -190,9 +190,9 @@ void unsureTest() { var t1 = new RefChecker.Unsure(realEntry); var t2 = new RefChecker.Unsure(fakeEntry); assertNotEquals(t1, t2); - t1.or(t2); - t2.or(t1); - assertEquals(t1, t2); + var result = t1.or(t2); + var otherResult = t2.or(t1); + assertEquals(result, otherResult); } } } From 6f0e9c7aa6e6b81ef5025f552fe029db4fb1e737 Mon Sep 17 00:00:00 2001 From: Kosmas Xenakis Karapanagiotis Date: Sun, 2 Nov 2025 
15:53:46 +0100 Subject: [PATCH 19/20] changed var to qualified names and removed a print statements --- .../logic/integrity/RefCheckerTest.java | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java b/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java index b1f4c9520b9..f835ff5dfe6 100644 --- a/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java +++ b/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java @@ -1,6 +1,7 @@ package org.jabref.logic.integrity; import java.util.List; +import java.util.Map; import javafx.collections.FXCollections; @@ -121,7 +122,6 @@ void findsRealEntryFromDoi() throws FetcherException { @Test void closeToRealEntry() throws FetcherException { RefChecker.ReferenceValidity rv = refChecker.referenceValidityOfEntry(closeToRealEntry); - System.out.println(((RefChecker.Unsure) rv).matchingReferences); assertEquals(RefChecker.Unsure.class, rv.getClass()); } @@ -147,7 +147,7 @@ void findsRealFromArxiv() throws FetcherException { @Test void validateListOfEntriesTest() throws FetcherException { List entries = List.of(realEntry, realEntryNoDoi, fakeEntry); - var e = refChecker.validateListOfEntries(entries); + Map e = refChecker.validateListOfEntries(entries); assertEquals(3, e.size()); assertEquals(RefChecker.Real.class, e.get(realEntry).getClass()); @@ -159,16 +159,16 @@ void validateListOfEntriesTest() throws FetcherException { public class ReferenceValidityTest { @Test void realEquals() { - var t1 = new RefChecker.Real(realEntry); - var t2 = new RefChecker.Real(realEntry); + RefChecker.ReferenceValidity t1 = new RefChecker.Real(realEntry); + RefChecker.ReferenceValidity t2 = new RefChecker.Real(realEntry); assertEquals(t1, t2); assertNotEquals(t1, new RefChecker.Real(fakeEntry)); } @Test void fakeEquals() { - var t1 = new RefChecker.Real(null); - var t2 = new RefChecker.Fake(); + 
RefChecker.ReferenceValidity t1 = new RefChecker.Real(null); + RefChecker.ReferenceValidity t2 = new RefChecker.Fake(); assertNotEquals(t1, t2); @@ -177,9 +177,9 @@ void fakeEquals() { @Test void orTest() { - var t1 = new RefChecker.Real(realEntry); - var t2 = new RefChecker.Real(fakeEntry); - var t3 = new RefChecker.Fake(); + RefChecker.ReferenceValidity t1 = new RefChecker.Real(realEntry); + RefChecker.ReferenceValidity t2 = new RefChecker.Real(fakeEntry); + RefChecker.ReferenceValidity t3 = new RefChecker.Fake(); assertEquals(t1, t1.or(t2)); assertEquals(t1, t1.or(t3)); assertEquals(t2, t3.or(t2)); @@ -187,11 +187,11 @@ void orTest() { @Test void unsureTest() { - var t1 = new RefChecker.Unsure(realEntry); - var t2 = new RefChecker.Unsure(fakeEntry); + RefChecker.ReferenceValidity t1 = new RefChecker.Unsure(realEntry); + RefChecker.ReferenceValidity t2 = new RefChecker.Unsure(fakeEntry); assertNotEquals(t1, t2); - var result = t1.or(t2); - var otherResult = t2.or(t1); + RefChecker.ReferenceValidity result = t1.or(t2); + RefChecker.ReferenceValidity otherResult = t2.or(t1); assertEquals(result, otherResult); } } From b98b78a09b3f8322ae7ae6edc25c746994aeba07 Mon Sep 17 00:00:00 2001 From: Kosmas Xenakis Karapanagiotis Date: Sun, 2 Nov 2025 16:00:25 +0100 Subject: [PATCH 20/20] fixed degree of similarity documentation --- .../main/java/org/jabref/logic/database/DuplicateCheck.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java b/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java index a21e589cd4b..c1152dcb167 100644 --- a/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java +++ b/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java @@ -369,8 +369,7 @@ public Optional containsDuplicate(final BibDatabase database, * Checks across all fields of the entries, * any matching ones get compared. * If they are not the same the score goes down. 
- * The score goes down depending on the - * Levenshtein distance between the two entries. + * The score goes down depending on the StringSimilarity score. *
<p>
* If the result is zero, it means that either no common fields were found * or that all common fields were very far apart lexically.