Skip to content

Commit 4c415d0

Browse files
committed
- Fix missing data
1 parent 87c30fe commit 4c415d0

File tree

6 files changed

+24208
-5
lines changed

6 files changed

+24208
-5
lines changed

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,4 +42,9 @@ with include drop, create tables, create indexes, reset sequences
4242
EOT
4343
4444
pgloader ./sqlite-to-page-migration.load
45+
```
46+
47+
After running the experiment, we encountered some issues with the `wikimapper` library, so we developed the following script to correct the missing data:
48+
```shell
49+
python3 missing_data_correction.py
4550
```

commons/storage.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ def fetch_wiki_mapping(identifier: str) -> tuple[str, str, str] | None:
113113
record = (None, None, identifier)
114114
else:
115115
record = (None, None, None)
116+
print(query)
116117
print("Identifier not found!!!", identifier)
117118
except (Exception, psycopg2.DatabaseError) as error:
118119
print("Error while fetching abstract", error)
@@ -246,6 +247,28 @@ def mark_wikipedia_page_process_failed(root_wikipedia_id):
246247
postgresql_pool.putconn(connection)
247248

248249

250+
def insert_missing_mappings(wikipedia_id, wikipedia_title, wikidata_id):
    """Store the result of a repeated Wikidata lookup for one Wikipedia page.

    When ``wikidata_id`` is truthy, a row
    ``(wikipedia_id, wikipedia_title, wikidata_id)`` is inserted into
    ``wiki_page_to_wiki_data_mappings``. Otherwise the page's ``processed``
    flag in ``wikipedia_pages`` is set to TRUE.

    Errors are printed rather than raised, and the pooled connection is
    always handed back to ``postgresql_pool``.

    Args:
        wikipedia_id: Identifier of the Wikipedia page.
        wikipedia_title: Title of the Wikipedia page.
        wikidata_id: Resolved Wikidata identifier, or a falsy value when the
            lookup produced nothing.
    """
    connection = None
    try:
        connection = postgresql_pool.getconn()
        if connection:
            with connection.cursor() as cursor:
                if wikidata_id:
                    cursor.execute(
                        "INSERT INTO wiki_page_to_wiki_data_mappings(wikipedia_id, wikipedia_title, wikidata_id)"
                        " VALUES (%s, %s, %s)",
                        (wikipedia_id, wikipedia_title, wikidata_id))
                else:
                    # No mapping could be resolved: only flag the page.
                    cursor.execute("UPDATE wikipedia_pages SET processed = TRUE WHERE id = %s",
                                   (wikipedia_id,))
            connection.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        # The message names this operation and the affected page; the previous
        # text described a different function's work.
        print(f"Error while storing missing mapping for page {wikipedia_id}:", error)
    finally:
        if connection:
            postgresql_pool.putconn(connection)
270+
271+
249272
def create_neo4j_session() -> tuple[Driver, Session]:
    """Create a Neo4j driver and open a session on the configured database.

    Returns:
        A ``(driver, session)`` tuple; the session is opened on ``NEO4J_DB``
        using a driver built from ``NEO4J_URI`` and ``NEO4J_AUTH``.
    """
    neo4j_driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
    neo4j_session = neo4j_driver.session(database=NEO4J_DB)
    return neo4j_driver, neo4j_session

missing_data_correction.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import csv
2+
3+
import pandas as pd
4+
import wptools
5+
6+
from commons.storage import insert_missing_mappings
7+
from commons.wiki_entity import WikiEntity
8+
from commons.wiki_mapping import WikiMapping
9+
10+
11+
def fetch_wikidata_id(title):
    """Look up the Wikidata identifier of a Wikipedia page by its title.

    Args:
        title: Title of the Wikipedia page to query through ``wptools``.

    Returns:
        The value stored under ``'wikibase'`` in the fetched page data, or
        ``None`` when the lookup fails for any reason.
    """
    try:
        page = wptools.page(title, silent=True).get(show=False)
        return page.data['wikibase']
    except Exception as error:
        # Best-effort lookup: the caller treats None as "no Wikidata id".
        # Report the reason so real failures are not silently hidden.
        print(f"Could not fetch wikidata id for {title!r}:", error)
        return None
18+
19+
20+
if __name__ == '__main__':
    # Retry the Wikidata lookup for every page listed in the CSV and persist
    # the outcome. The CSV is read with 'id' and 'title' columns.
    csv_file = 'missing_data_pages.csv'
    df = pd.read_csv(csv_file)
    for _, row in df.iterrows():
        page_id = row['id']
        title = row['title']
        wikidata_id = fetch_wikidata_id(title)
        # Inserts the mapping when an id was found; otherwise flags the page
        # as processed.
        insert_missing_mappings(page_id, title, wikidata_id)
        if wikidata_id:
            # NOTE(review): presumably fetches summaries for the newly mapped
            # entity; the meaning of the False argument is not visible here.
            WikiEntity(WikiMapping(wikidata_id=wikidata_id)).get_summaries(False)

0 commit comments

Comments
 (0)