diff --git a/prepare-api-docs/pixi.lock b/prepare-api-docs/pixi.lock index d4de257..86cec95 100644 --- a/prepare-api-docs/pixi.lock +++ b/prepare-api-docs/pixi.lock @@ -31,6 +31,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/python-3.11.13-h3f84c4b_0_cpython.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.11-8_cp311.conda - conda: https://conda.anaconda.org/conda-forge/noarch/requests-2.32.5-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/ruamel.yaml-0.18.15-py311h3485c13_1.conda + - conda: https://conda.anaconda.org/conda-forge/win-64/ruamel.yaml.clib-0.2.12-py311h3485c13_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/soupsieve-2.8-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/tk-8.6.13-h2c6b04d_2.conda @@ -357,6 +359,41 @@ packages: - pkg:pypi/requests?source=hash-mapping size: 59263 timestamp: 1755614348400 +- conda: https://conda.anaconda.org/conda-forge/win-64/ruamel.yaml-0.18.15-py311h3485c13_1.conda + sha256: 4a222db2ec50db5a11ace74090045170a611b24f82d80535e8d98bf478c32cc2 + md5: be5d5993e755c3edc3ef860da01c67b4 + depends: + - python >=3.11,<3.12.0a0 + - python_abi 3.11.* *_cp311 + - ruamel.yaml.clib >=0.1.2 + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + arch: x86_64 + platform: win + license: MIT + license_family: MIT + purls: + - pkg:pypi/ruamel-yaml?source=hash-mapping + size: 274899 + timestamp: 1756839144620 +- conda: https://conda.anaconda.org/conda-forge/win-64/ruamel.yaml.clib-0.2.12-py311h3485c13_1.conda + sha256: ad383a91985153438817e6b241c9f151692e01ef257279f83dec55f8d024e213 + md5: 713991ee78f7fbbcecfe03c7226dec24 + depends: + - python >=3.11,<3.12.0a0 + - python_abi 3.11.* *_cp311 + - ucrt >=10.0.20348.0 + - vc >=14.3,<15 + - vc14_runtime >=14.44.35208 + arch: x86_64 + platform: win + license: MIT + license_family: MIT + purls: + - pkg:pypi/ruamel-yaml-clib?source=hash-mapping + size: 107842 + timestamp: 1756829092915 - pypi: https://files.pythonhosted.org/packages/ad/d5/62a0e693230bace8e9a767d6d187a4d9421a7c6ee4b48551f8ff7bd1629a/sensai_utils-1.5.0-py3-none-any.whl name: sensai-utils version: 1.5.0 diff --git a/prepare-api-docs/pixi.toml b/prepare-api-docs/pixi.toml index 144800a..e836b6f 100644 --- a/prepare-api-docs/pixi.toml +++ b/prepare-api-docs/pixi.toml @@ -14,6 +14,7 @@ pixi-pycharm = ">=0.0.9,<0.0.10" beautifulsoup4 = ">=4.13.5,<5" markdownify = ">=1.1.0,<2" requests = ">=2.32.5,<3" +"ruamel.yaml" = ">=0.18.15,<0.19" [pypi-dependencies] sensai-utils = ">=1.5.0, <2" diff --git a/prepare-api-docs/prepare_api_docs.py b/prepare-api-docs/prepare_api_docs.py index d5b5d98..dc7997d 100644 --- a/prepare-api-docs/prepare_api_docs.py +++ b/prepare-api-docs/prepare_api_docs.py @@ -1,9 +1,14 @@ +import dataclasses import os +from dataclasses import dataclass +from io import StringIO from pathlib import Path import requests -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag from markdownify import MarkdownConverter +from ruamel.yaml import YAML +from ruamel.yaml.scalarstring import LiteralScalarString from sensai.util import logging log = logging.getLogger(__name__) @@ -20,7 +25,7 @@ class PenpotAPIContentMarkdownConverter(MarkdownConverter): if "class" in node.attrs and "tsd-breadcrumb" in node.attrs["class"]: return "" - # convert h3 and h4 to plain text + # convert h5 and h4 to plain text if node.name in ["h5", "h4"]: return soup.get_text() @@ -52,7 +57,9 @@ class PenpotAPIContentMarkdownConverter(MarkdownConverter): # convert
 blocks to markdown code blocks
         if node.name == "pre":
-            return f"\n```\n{text.strip()}\n```\n\n"
+            for button in soup.find_all("button"):
+                button.decompose()
+            return f"\n```\n{soup.get_text().strip()}\n```\n\n"
 
         # convert tsd-signature elements to code blocks, converting 
to newlines if "class" in node.attrs and "tsd-signature" in node.attrs["class"]: @@ -64,11 +71,53 @@ class PenpotAPIContentMarkdownConverter(MarkdownConverter): return super().process_tag(node, parent_tags=parent_tags) +@dataclass +class TypeInfo: + overview: str + """ + the main type information, which contains all the declarations/signatures but no descriptions + """ + members: dict[str, dict[str, str]] + """ + mapping from member type (e.g. "Properties", "Methods") to a mapping of member name to markdown description + """ + + +class YamlConverter: + """Convert dictionaries to YAML with all strings in block literal style""" + + def __init__(self): + self.yaml = YAML() + self.yaml.preserve_quotes = True + self.yaml.width = 4096 # Prevent line wrapping + + def _convert_strings_to_block(self, obj): + if isinstance(obj, dict): + return {k: self._convert_strings_to_block(v) for k, v in obj.items()} + elif isinstance(obj, list): + return [self._convert_strings_to_block(item) for item in obj] + elif isinstance(obj, str): + return LiteralScalarString(obj) + else: + return obj + + def to_yaml(self, data): + processed_data = self._convert_strings_to_block(data) + stream = StringIO() + self.yaml.dump(processed_data, stream) + return stream.getvalue() + + def to_file(self, data, filepath): + processed_data = self._convert_strings_to_block(data) + with open(filepath, 'w', encoding='utf-8') as f: + self.yaml.dump(processed_data, f) + + class PenpotAPIDocsProcessor: def __init__(self): self.md_converter = PenpotAPIContentMarkdownConverter() self.base_url = "https://penpot-plugins-api-doc.pages.dev" - self.pages = {} + self.types: dict[str, TypeInfo] = {} def run(self, target_dir: str): os.makedirs(target_dir, exist_ok=True) @@ -83,15 +132,16 @@ class PenpotAPIDocsProcessor: for link in links: href = link['href'] if href.startswith("interfaces/") or href.startswith("types/"): - page_name = href.split("/")[-1].replace(".html", "") - log.info("Processing page: %s", page_name) - page_md = self._process_page(href) + type_name = href.split("/")[-1].replace(".html", "") + log.info("Processing page: %s", type_name) + type_info = self._process_page(href) + self.types[type_name] = type_info - # save to md file - md_path = os.path.abspath(os.path.join(target_dir, f"{page_name}.md")) - log.info("Writing %s", md_path) - with open(md_path, "w", encoding="utf-8") as f: - f.write(page_md) + # save to yaml + yaml_path = os.path.join(target_dir, "api_types.yml") + log.info("Writing API type information to %s", yaml_path) + data_dict = {k: dataclasses.asdict(v) for k, v in self.types.items()} + YamlConverter().to_file(data_dict, yaml_path) def _fetch(self, rel_url: str) -> str: response = requests.get(f"{self.base_url}/{rel_url}") @@ -100,18 +150,49 @@ class PenpotAPIDocsProcessor: html_content = response.text return html_content - def _process_page(self, rel_url: str): + def _html_to_markdown(self, html_content: str) -> str: + md = self.md_converter.convert(html_content) + md = md.replace("\xa0", " ") # replace non-breaking spaces + return md.strip() + + def _process_page(self, rel_url: str) -> TypeInfo: html_content = self._fetch(rel_url) soup = BeautifulSoup(html_content, "html.parser") content = soup.find(attrs={"class": "col-content"}) + # full_text = self._html_to_markdown(str(content)) - markdown = self.md_converter.convert(str(content)) - return markdown + # extract individual members + members = {} + member_group_tags = [] + for el in content.children: + if isinstance(el, Tag): + if "class" in el.attrs and "tsd-member-group" in el.attrs["class"]: + member_group_tags.append(el) + members_type = el.find("h2").get_text().strip() + members_in_group = {} + members[members_type] = members_in_group + for member_tag in el.find_all(attrs={"class": "tsd-member"}): + member_anchor = member_tag.find("a", attrs={"class": "tsd-anchor"}, recursive=False) + member_name = member_anchor.attrs["id"] + member_tag.find("h3").decompose() # remove heading + members_in_group[member_name] = self._html_to_markdown(str(member_tag)) + + # remove the member groups from the soup + for tag in member_group_tags: + tag.decompose() + + # overview is what remains in content after removing member groups + overview = self._html_to_markdown(str(content)) + + return TypeInfo( + overview=overview, + members=members + ) def main(): - target_dir = Path(__file__).parent.parent / "mcp-server" / "data" / "api" + target_dir = Path(__file__).parent.parent / "mcp-server" / "data" PenpotAPIDocsProcessor().run(target_dir=str(target_dir))