Obtain structured information on each type, storing it in a YAML file

This commit is contained in:
Dominik Jain 2025-09-26 11:31:18 +02:00 committed by Dominik Jain
parent 068817709e
commit 536250410c
3 changed files with 135 additions and 16 deletions

View File

@ -31,6 +31,8 @@ environments:
- conda: https://conda.anaconda.org/conda-forge/win-64/python-3.11.13-h3f84c4b_0_cpython.conda
- conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.11-8_cp311.conda
- conda: https://conda.anaconda.org/conda-forge/noarch/requests-2.32.5-pyhd8ed1ab_0.conda
- conda: https://conda.anaconda.org/conda-forge/win-64/ruamel.yaml-0.18.15-py311h3485c13_1.conda
- conda: https://conda.anaconda.org/conda-forge/win-64/ruamel.yaml.clib-0.2.12-py311h3485c13_1.conda
- conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda
- conda: https://conda.anaconda.org/conda-forge/noarch/soupsieve-2.8-pyhd8ed1ab_0.conda
- conda: https://conda.anaconda.org/conda-forge/win-64/tk-8.6.13-h2c6b04d_2.conda
@ -357,6 +359,41 @@ packages:
- pkg:pypi/requests?source=hash-mapping
size: 59263
timestamp: 1755614348400
- conda: https://conda.anaconda.org/conda-forge/win-64/ruamel.yaml-0.18.15-py311h3485c13_1.conda
sha256: 4a222db2ec50db5a11ace74090045170a611b24f82d80535e8d98bf478c32cc2
md5: be5d5993e755c3edc3ef860da01c67b4
depends:
- python >=3.11,<3.12.0a0
- python_abi 3.11.* *_cp311
- ruamel.yaml.clib >=0.1.2
- ucrt >=10.0.20348.0
- vc >=14.3,<15
- vc14_runtime >=14.44.35208
arch: x86_64
platform: win
license: MIT
license_family: MIT
purls:
- pkg:pypi/ruamel-yaml?source=hash-mapping
size: 274899
timestamp: 1756839144620
- conda: https://conda.anaconda.org/conda-forge/win-64/ruamel.yaml.clib-0.2.12-py311h3485c13_1.conda
sha256: ad383a91985153438817e6b241c9f151692e01ef257279f83dec55f8d024e213
md5: 713991ee78f7fbbcecfe03c7226dec24
depends:
- python >=3.11,<3.12.0a0
- python_abi 3.11.* *_cp311
- ucrt >=10.0.20348.0
- vc >=14.3,<15
- vc14_runtime >=14.44.35208
arch: x86_64
platform: win
license: MIT
license_family: MIT
purls:
- pkg:pypi/ruamel-yaml-clib?source=hash-mapping
size: 107842
timestamp: 1756829092915
- pypi: https://files.pythonhosted.org/packages/ad/d5/62a0e693230bace8e9a767d6d187a4d9421a7c6ee4b48551f8ff7bd1629a/sensai_utils-1.5.0-py3-none-any.whl
name: sensai-utils
version: 1.5.0

View File

@ -14,6 +14,7 @@ pixi-pycharm = ">=0.0.9,<0.0.10"
beautifulsoup4 = ">=4.13.5,<5"
markdownify = ">=1.1.0,<2"
requests = ">=2.32.5,<3"
"ruamel.yaml" = ">=0.18.15,<0.19"
[pypi-dependencies]
sensai-utils = ">=1.5.0, <2"

View File

@ -1,9 +1,14 @@
import dataclasses
import os
from dataclasses import dataclass
from io import StringIO
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag
from markdownify import MarkdownConverter
from ruamel.yaml import YAML
from ruamel.yaml.scalarstring import LiteralScalarString
from sensai.util import logging
log = logging.getLogger(__name__)
@ -20,7 +25,7 @@ class PenpotAPIContentMarkdownConverter(MarkdownConverter):
if "class" in node.attrs and "tsd-breadcrumb" in node.attrs["class"]:
return ""
# convert h3 and h4 to plain text
# convert h5 and h4 to plain text
if node.name in ["h5", "h4"]:
return soup.get_text()
@ -52,7 +57,9 @@ class PenpotAPIContentMarkdownConverter(MarkdownConverter):
# convert <pre> blocks to markdown code blocks
if node.name == "pre":
return f"\n```\n{text.strip()}\n```\n\n"
for button in soup.find_all("button"):
button.decompose()
return f"\n```\n{soup.get_text().strip()}\n```\n\n"
# convert tsd-signature elements to code blocks, converting <br> to newlines
if "class" in node.attrs and "tsd-signature" in node.attrs["class"]:
@ -64,11 +71,53 @@ class PenpotAPIContentMarkdownConverter(MarkdownConverter):
return super().process_tag(node, parent_tags=parent_tags)
@dataclass
class TypeInfo:
    """Structured documentation extracted from one API type page (an interface or type alias)."""

    overview: str
    """
    the main type information, which contains all the declarations/signatures but no descriptions
    """

    members: dict[str, dict[str, str]]
    """
    mapping from member type (e.g. "Properties", "Methods") to a mapping of member name to markdown description
    """
class YamlConverter:
    """Convert dictionaries to YAML with all strings in block literal style.

    Every string value is wrapped in :class:`LiteralScalarString` so ruamel emits it
    as a block literal (``|``), which keeps multi-line markdown content readable
    in the generated YAML file.
    """

    def __init__(self) -> None:
        self.yaml = YAML()
        self.yaml.preserve_quotes = True
        self.yaml.width = 4096  # prevent line wrapping of long scalar lines

    def _convert_strings_to_block(self, obj):
        """Recursively wrap all strings in `obj` (dicts, lists, scalars) as block literal scalars."""
        if isinstance(obj, dict):
            return {k: self._convert_strings_to_block(v) for k, v in obj.items()}
        if isinstance(obj, list):
            return [self._convert_strings_to_block(item) for item in obj]
        if isinstance(obj, str):
            return LiteralScalarString(obj)
        return obj

    def to_yaml(self, data) -> str:
        """Return the YAML serialization of `data` as a string."""
        stream = StringIO()
        self.yaml.dump(self._convert_strings_to_block(data), stream)
        return stream.getvalue()

    def to_file(self, data, filepath) -> None:
        """Write the YAML serialization of `data` to `filepath` (UTF-8)."""
        with open(filepath, 'w', encoding='utf-8') as f:
            self.yaml.dump(self._convert_strings_to_block(data), f)
class PenpotAPIDocsProcessor:
def __init__(self):
    """Set up the HTML-to-markdown converter, the docs site base URL, and empty result stores."""
    # root URL of the published Penpot plugins API documentation
    self.base_url = "https://penpot-plugins-api-doc.pages.dev"
    self.md_converter = PenpotAPIContentMarkdownConverter()
    # collected results; self.types maps type name -> TypeInfo
    self.pages = {}
    self.types: dict[str, TypeInfo] = {}
def run(self, target_dir: str):
os.makedirs(target_dir, exist_ok=True)
@ -83,15 +132,16 @@ class PenpotAPIDocsProcessor:
for link in links:
href = link['href']
if href.startswith("interfaces/") or href.startswith("types/"):
page_name = href.split("/")[-1].replace(".html", "")
log.info("Processing page: %s", page_name)
page_md = self._process_page(href)
type_name = href.split("/")[-1].replace(".html", "")
log.info("Processing page: %s", type_name)
type_info = self._process_page(href)
self.types[type_name] = type_info
# save to md file
md_path = os.path.abspath(os.path.join(target_dir, f"{page_name}.md"))
log.info("Writing %s", md_path)
with open(md_path, "w", encoding="utf-8") as f:
f.write(page_md)
# save to yaml
yaml_path = os.path.join(target_dir, "api_types.yml")
log.info("Writing API type information to %s", yaml_path)
data_dict = {k: dataclasses.asdict(v) for k, v in self.types.items()}
YamlConverter().to_file(data_dict, yaml_path)
def _fetch(self, rel_url: str) -> str:
response = requests.get(f"{self.base_url}/{rel_url}")
@ -100,18 +150,49 @@ class PenpotAPIDocsProcessor:
html_content = response.text
return html_content
def _process_page(self, rel_url: str):
def _html_to_markdown(self, html_content: str) -> str:
    """Convert an HTML fragment to markdown, normalizing non-breaking spaces and trimming whitespace."""
    markdown = self.md_converter.convert(html_content)
    # \xa0 is the non-breaking space character; replace with a plain space
    return markdown.replace("\xa0", " ").strip()
def _process_page(self, rel_url: str) -> TypeInfo:
html_content = self._fetch(rel_url)
soup = BeautifulSoup(html_content, "html.parser")
content = soup.find(attrs={"class": "col-content"})
# full_text = self._html_to_markdown(str(content))
markdown = self.md_converter.convert(str(content))
return markdown
# extract individual members
members = {}
member_group_tags = []
for el in content.children:
if isinstance(el, Tag):
if "class" in el.attrs and "tsd-member-group" in el.attrs["class"]:
member_group_tags.append(el)
members_type = el.find("h2").get_text().strip()
members_in_group = {}
members[members_type] = members_in_group
for member_tag in el.find_all(attrs={"class": "tsd-member"}):
member_anchor = member_tag.find("a", attrs={"class": "tsd-anchor"}, recursive=False)
member_name = member_anchor.attrs["id"]
member_tag.find("h3").decompose() # remove heading
members_in_group[member_name] = self._html_to_markdown(str(member_tag))
# remove the member groups from the soup
for tag in member_group_tags:
tag.decompose()
# overview is what remains in content after removing member groups
overview = self._html_to_markdown(str(content))
return TypeInfo(
overview=overview,
members=members
)
def main():
    """Process the Penpot plugin API docs and write the results to mcp-server/data.

    The output directory is resolved relative to this script's location.
    """
    # NOTE: removed a dead duplicate assignment of target_dir (leftover of an earlier path)
    target_dir = Path(__file__).parent.parent / "mcp-server" / "data"
    PenpotAPIDocsProcessor().run(target_dir=str(target_dir))