|
| 1 | +from __future__ import annotations as _annotations |
| 2 | + |
| 3 | +import os |
| 4 | +from typing import Any, cast |
| 5 | + |
| 6 | +from algoliasearch.search_client import SearchClient |
| 7 | +from bs4 import BeautifulSoup |
| 8 | +from mkdocs.config import Config |
| 9 | +from mkdocs.structure.files import Files |
| 10 | +from mkdocs.structure.pages import Page |
| 11 | + |
# Target Algolia application and index for the documentation search.
ALGOLIA_APP_ID = 'KPPUDTIAVX'
ALGOLIA_INDEX_NAME = 'logfire-docs'
# Write key is read from the environment; it is None when the variable is unset,
# and both hooks below return early in that case.
ALGOLIA_WRITE_API_KEY = os.getenv('ALGOLIA_WRITE_API_KEY')

# Section records accumulated by on_page_content and uploaded by on_post_build.
records: list[dict[str, Any]] = []
| 16 | + |
| 17 | + |
def on_page_content(html: str, page: Page, config: Config, files: Files) -> str:
    """Collect one search record per h1/h2 section of a rendered page.

    The signature matches the MkDocs ``on_page_content`` event. Each record is
    appended to the module-level ``records`` list; nothing is uploaded here.
    When ``ALGOLIA_WRITE_API_KEY`` is not set the function does nothing.

    Args:
        html: Rendered HTML of the page.
        page: Page being processed; its ``title`` and ``abs_url`` are read.
        config: Not used by this function.
        files: Not used by this function.

    Returns:
        ``html``, unmodified.

    Raises:
        AssertionError: If ``page.title`` is ``None``.
    """
    if not ALGOLIA_WRITE_API_KEY:
        return html

    assert page.title is not None, 'Page title must not be None'  # type: ignore[reportUnknownMemberType]
    title = cast(str, page.title)  # type: ignore[reportUnknownMemberType]

    soup = BeautifulSoup(html, 'html.parser')

    # Every h1/h2 heading starts a section that becomes its own record.
    for heading in soup.find_all(['h1', 'h2']):
        heading_id = heading.get('id', '')
        # Strip the '¶' character and the word 'dataclass' from the heading text
        # (presumably the permalink marker and an API-docs label -- confirm).
        section_title = heading.get_text().replace('¶', '').replace('dataclass', '').strip()

        # The section body is every following sibling element up to, but not
        # including, the next h1/h2 sibling.
        content: list[str] = []
        sibling = heading.find_next_sibling()
        while sibling and sibling.name not in ['h1', 'h2']:
            content.append(str(sibling))
            sibling = sibling.find_next_sibling()

        section_html = ''.join(content)

        # Headings without an id fall back to the bare page URL.
        anchor_url = f'{page.abs_url}#{heading_id}' if heading_id else page.abs_url

        records.append(
            {
                'content': section_html,
                'pageID': title,
                'abs_url': anchor_url,
                'title': f'{title} - {section_title}',
                'objectID': anchor_url,
            }
        )

    return html
| 60 | + |
| 61 | + |
def on_post_build(config: Config) -> None:
    """Upload the collected section records to the Algolia index.

    The signature matches the MkDocs ``on_post_build`` event. Does nothing when
    ``ALGOLIA_WRITE_API_KEY`` is not set. Otherwise the contents of the index
    named by ``ALGOLIA_INDEX_NAME`` are replaced with the filtered records, and
    the call blocks until ``.wait()`` returns.

    Args:
        config: Not used by this function.
    """
    if not ALGOLIA_WRITE_API_KEY:
        return

    client = SearchClient.create(ALGOLIA_APP_ID, ALGOLIA_WRITE_API_KEY)
    index = client.init_index(ALGOLIA_INDEX_NAME)

    # Temporary workaround: only upload records whose content is shorter than
    # 9000 characters (presumably to stay below Algolia's record size limit --
    # confirm). Longer records are skipped, not truncated.
    max_content_chars = 9000
    filtered_records = [record for record in records if len(record['content']) < max_content_chars]
    print(f'Uploading {len(filtered_records)} out of {len(records)} records to Algolia...')
    index.replace_all_objects(filtered_records, {'createIfNotExists': True}).wait()  # type: ignore[reportUnknownMemberType]
0 commit comments