Skip to content

Commit a995a3d

Browse files
committed
Triples validation
- Check whether the extracted triples belong to English Wikidata and, if so, persist them - Add a fresh-fetch option to the summary builder
1 parent 4c415d0 commit a995a3d

File tree

4 files changed

+30
-18
lines changed

4 files changed

+30
-18
lines changed

commons/wiki_entity.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,18 +17,18 @@ def __init__(self, wiki_mapping: WikiMapping):
1717
self._wikidata_label = None
1818
self._wikidata_description = None
1919

20-
def get_summaries(self, keep_results=True):
20+
def get_summaries(self, keep_results=True, fresh=False):
2121
if self._summaries:
2222
return self._summaries
2323
if not keep_results:
24-
build_summaries(self.wikipedia_id, self.wikipedia_page_title, self.wikidata_id)
24+
build_summaries(self.wikipedia_id, self.wikipedia_page_title, self.wikidata_id, fresh=fresh)
2525
return
26-
self._summaries = build_summaries(self.wikipedia_id, self.wikipedia_page_title, self.wikidata_id)
26+
self._summaries = build_summaries(self.wikipedia_id, self.wikipedia_page_title, self.wikidata_id, fresh)
2727
return self._summaries
2828

29-
def get_detailed_summaries(self):
29+
def get_detailed_summaries(self, fresh=False):
3030
if not self._summaries:
31-
self._summaries = self.get_summaries()
31+
self._summaries = self.get_summaries(fresh=fresh)
3232
results = []
3333
for summary in self._summaries:
3434
from_wikipedia_id, from_wikipedia_title, from_wikidata_id = fetch_wiki_mapping(summary[0])

wiki-storage/src/main/java/com/github/msorkhpar/wikistorage/utils/WikidataEnglishInfoDTO.java

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,15 +23,5 @@ public WikidataEnglishInfoDTO(String title, AdditionalInfo data) {
2323
if (wikipediaInfo != null) {
2424
this.enWikiTitle = wikipediaInfo.getTitle();
2525
}
26-
String key = "";
27-
if (this.label == null && data.getLabels() != null && !data.getLabels().isEmpty()) {
28-
key = data.getLabels().keySet().iterator().next();
29-
this.label = data.getLabels().get(key).getValue();
30-
this.description = data.getDescription() != null ? data.getDescription().getOrDefault(key, null).getValue() : null;
31-
}
32-
33-
if (this.description == null && data.getDescription() != null && !data.getDescription().isEmpty()) {
34-
this.description = data.getDescription().values().iterator().next().getValue();
35-
}
3626
}
3727
}

wiki_summary/summary_builder.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,13 +53,28 @@ def _get_edge_candidates(root_wikidata_id, mentions: list[str]) -> dict[str, lis
5353
return edge_candidates
5454

5555

56-
def build_summaries(wikipedia_id, wikipedia_title, root_wikidata_id) -> list[tuple[str, str, str]]:
56+
def build_summaries(wikipedia_id, wikipedia_title, root_wikidata_id, fresh=False) -> list[tuple[str, str, str]]:
5757
"""
5858
:param wikipedia_title:
5959
:param root_wikidata_id:
6060
:return: list of summaries: [ (from_entity, predicate, to_entity), ...]
6161
"""
6262
# check if the summaries are already stored in Neo4j and return them
63+
if fresh:
64+
page_content = fetch_wikipedia_page_content(wikipedia_title)
65+
raw_abstract = extract_raw_abstract(page_content)
66+
mentions = extract_mention_titles(raw_abstract)
67+
edge_candidates = _get_edge_candidates(root_wikidata_id, mentions)
68+
summaries = []
69+
for index, candidates in edge_candidates.items():
70+
if len(candidates) > 1:
71+
abstract_embedding = compute_embeddings(dewiki(raw_abstract))
72+
candidate = _pick_most_relevant_predicate(abstract_embedding, candidates)
73+
else:
74+
candidate = candidates[0]
75+
summaries.append(candidate)
76+
return summaries
77+
6378
summaries = fetch_summaries(root_wikidata_id)
6479
if summaries:
6580
mark_wikipedia_page_processed(wikipedia_id)

wikidata-graph-builder/src/main/java/com/github/msorkhpar/graphbuilder/service/WikidataDumpFileService.java

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,16 @@ public CompletableFuture<Path> process(Path dumpFile, boolean multiStream) {
5252
String pageString = xmlBuilder.toString();
5353
try {
5454
Element xmlPage = createPage(pageString);
55-
boolean triplePersistentStatus = processTriples(WikiDataEntityExtractor.extractTriples(xmlPage));
55+
Optional<Set<KGTriple>> tripleSet = WikiDataEntityExtractor.extractTriples(xmlPage);
56+
Optional<WikidataEnglishInfoDTO> metadata = extractMetadata(xmlPage);
57+
if (tripleSet.isEmpty() || metadata.isEmpty() ||
58+
(metadata.get().getLabel().isEmpty() && metadata.get().getDescription().isEmpty())
59+
) {
60+
continue;
61+
}
62+
boolean triplePersistentStatus = processTriples(tripleSet);
5663
if (triplePersistentStatus) {
57-
processMetadata(extractMetadata(xmlPage));
64+
processMetadata(metadata);
5865
}
5966
} catch (Exception e) {
6067
logger.info("Extraction from the following text was not successful, {}", pageString);

0 commit comments

Comments
 (0)