Source code for papagai.markdown

#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-3.0-or-later

"""Markdown file parsing utilities."""

import logging
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Self

logger = logging.getLogger("papagai.markdown")

# Regex pattern for frontmatter key: value pairs
# Matches: key_name: value
KEY_VALUE_PATTERN = re.compile(r"^([a-zA-Z0-9_-]+):\s*(.*)$")


[docs] @dataclass class Markdown: """ Markdown file with parsed frontmatter. Attributes: frontmatter: Dictionary of frontmatter key-value pairs """ frontmatter: dict[str, str] = field(default_factory=dict) text: str = ""
[docs] @classmethod def from_string(cls, content: str) -> Self: """ Parse a markdown string and extract frontmatter. Frontmatter is delimited by --- at the start and end, and contains key: value pairs that may span multiple lines. Args: content: Markdown content as a string Returns: Markdown instance with parsed frontmatter """ lines = content.split("\n") # Check if content starts with --- if not lines or lines[0].strip() != "---": return cls(frontmatter={}, text=content) # Find the closing --- current_key = None current_value = [] frontmatter = {} text = content # Default to full content if no closing --- found for idx, line in enumerate(lines[1:], 1): if line.strip() == "---": # End of frontmatter if current_key: frontmatter[current_key] = "\n".join(current_value).strip() text = "\n".join(lines[idx + 1 :]) break # Check if this is a key: value line using regex match = KEY_VALUE_PATTERN.match(line) if match: # Save previous key-value if exists if current_key: frontmatter[current_key] = "\n".join(current_value).strip() # Start new key-value current_key = match.group(1) current_value = [match.group(2)] elif current_key: # Continuation of multi-line value current_value.append(line) else: # No closing --- found, reset to empty frontmatter frontmatter = {} text = content return cls(frontmatter=frontmatter, text=text)
[docs] @classmethod def from_file(cls, file_path: Path) -> Self: """ Parse a markdown file and extract frontmatter. Frontmatter is delimited by --- at the start and end, and contains key: value pairs that may span multiple lines. Args: file_path: Path to the markdown file Returns: Markdown instance with parsed frontmatter Raises: FileNotFoundError: If the file does not exist PermissionError: If the file cannot be read """ content = file_path.read_text() return cls.from_string(content)
[docs] @dataclass class MarkdownInstructions(Markdown): """ Markdown file with parsed instructions frontmatter. Inherits from Markdown and adds convenience fields for description and tools. Attributes: description: Description from frontmatter, or empty string if not found tools: List of tool strings parsed from frontmatter, or empty list if not found """ description: str = "" tools: list[str] = field(default_factory=list)
[docs] @classmethod def from_string(cls, content: str) -> Self: """ Parse a markdown string and extract instructions frontmatter. Parses the description and tools from the frontmatter. Tools are comma-separated, but commas inside parentheses/braces are preserved. Args: content: Markdown content as a string Returns: MarkdownInstructions instance with parsed frontmatter and fields """ # First parse the base frontmatter base = Markdown.from_string(content) # Extract description description = base.frontmatter.get("description", "") # Extract and parse tools tools_str = base.frontmatter.get("tools", "") tools = cls._parse_tools(tools_str) return cls( frontmatter=base.frontmatter, text=base.text, description=description, tools=tools, )
[docs] @classmethod def from_file(cls, file_path: Path) -> Self: """ Parse a markdown file and extract instructions frontmatter. Parses the description and tools from the frontmatter. Tools are comma-separated, but commas inside parentheses/braces are preserved. Args: file_path: Path to the markdown file Returns: MarkdownInstructions instance with parsed frontmatter and fields Raises: FileNotFoundError: If the file does not exist PermissionError: If the file cannot be read """ # First parse the base frontmatter base = Markdown.from_file(file_path) # Extract description description = base.frontmatter.get("description", "") # Extract and parse tools tools_str = base.frontmatter.get("tools", "") tools = cls._parse_tools(tools_str) return cls( frontmatter=base.frontmatter, text=base.text, description=description, tools=tools, )
[docs] def combine(self, other: Self) -> Self: """ Combine this MarkdownInstructions with another. Creates a new MarkdownInstructions with: - Text from both objects concatenated (self.text + other.text) - Tools from both objects combined (deduplicated) - Description from only the first object (self) - Frontmatter merged from both objects Args: other: Another MarkdownInstructions to combine with Returns: New MarkdownInstructions with combined content """ # Combine text from both objects combined_text = self.text + "\n" + other.text # Combine tools, preserving order and removing duplicates seen = set() combined_tools = [] for tool in self.tools + other.tools: if tool not in seen: seen.add(tool) combined_tools.append(tool) # Merge frontmatter (self takes precedence for duplicate keys) combined_frontmatter = {**other.frontmatter, **self.frontmatter} # Description only from first object description = self.description return type(self)( frontmatter=combined_frontmatter, text=combined_text, description=description, tools=combined_tools, )
@staticmethod def _parse_tools(tools_str: str) -> list[str]: """ Parse comma-separated tools, respecting nested parentheses and braces. Args: tools_str: Comma-separated tool specifications Returns: List of tool strings, or empty list if input is empty """ if not tools_str: return [] # Split by comma, but only if not inside parentheses or braces tools = [] current_tool = [] depth = 0 # Track nesting depth of () and {} for char in tools_str: if char in "({": depth += 1 current_tool.append(char) elif char in ")}": depth -= 1 current_tool.append(char) elif char == "," and depth == 0: # Comma at top level - end of current tool tool = "".join(current_tool).strip() if tool: tools.append(tool) current_tool = [] else: current_tool.append(char) # Add the last tool tool = "".join(current_tool).strip() if tool: tools.append(tool) return tools