# MD2WordPress/markdown_parser.py

#!/usr/bin/env python3
"""
Markdown Parser mit YAML-Frontmatter Unterstützung
Extrahiert Metadaten aus Markdown-Dateien für WordPress-Import
"""

import re
import yaml
from typing import Dict, Any, Optional, List


def extract_frontmatter(markdown_content: str) -> tuple[Optional[Dict[str, Any]], str]:
    """
    Split a Markdown document into its YAML frontmatter and its body.

    The frontmatter must begin on the first line and be enclosed by two
    ``---`` marker lines (trailing whitespace after a marker is allowed).
    The closing marker may also be the very last line of the document,
    with or without a final newline.

    Args:
        markdown_content: Complete Markdown text

    Returns:
        Tuple of (frontmatter_dict, markdown_body).
        - No frontmatter markers, invalid YAML, or YAML that is not a
          mapping: (None, markdown_content) - the text is returned untouched.
        - Empty frontmatter block: (None, body after the closing marker).
        - Otherwise: (parsed mapping, body after the closing marker).
    """
    # Closing marker is followed either by a newline plus the body, or by
    # the end of the document (frontmatter-only file).
    frontmatter_pattern = r'^---\s*\n(.*?)\n---\s*(?:\n(.*))?$'
    match = re.match(frontmatter_pattern, markdown_content, re.DOTALL)

    if not match:
        # No frontmatter found
        return None, markdown_content

    frontmatter_text = match.group(1)
    # Group 2 is None when the document ends right after the closing marker.
    markdown_body = match.group(2) or ''

    # Keep the try body down to the one call that can raise.
    try:
        frontmatter_data = yaml.safe_load(frontmatter_text)
    except yaml.YAMLError as e:
        print(f"Warnung: Fehler beim Parsen des YAML-Frontmatters: {e}")
        return None, markdown_content

    if frontmatter_data is None:
        # Empty (or comment-only) frontmatter: nothing to report, but the
        # marker block is still stripped from the body.
        return None, markdown_body

    if not isinstance(frontmatter_data, dict):
        # safe_load returns scalars/lists for non-mapping YAML, e.g. when a
        # document merely starts with a "---" horizontal rule. That is not
        # frontmatter, so hand the text back unchanged instead of returning
        # a value callers would treat as a dict.
        return None, markdown_content

    return frontmatter_data, markdown_body


def format_author_as_tag(author_name: str) -> str:
    """
    Format an author name as a tag of the form Firstname_Lastname.

    Leading and trailing whitespace is removed; every run of whitespace
    and/or underscores inside the name becomes a single underscore. All
    whitespace characters count (tabs, newlines, non-breaking spaces), not
    only the ASCII space, so the separators handled inside the name match
    what ``str.strip`` already treats as padding at the ends.

    Args:
        author_name: Author name (e.g. "Max Mustermann" or "Max")

    Returns:
        Tag-formatted name (e.g. "Max_Mustermann")
    """
    # One pass replaces the former "replace spaces, then collapse '__'" loop.
    return re.sub(r'[\s_]+', '_', author_name.strip())


def _split_terms(value: Any) -> Optional[List[Any]]:
    """Normalize a tags/categories field to a new list, or None if unusable."""
    if isinstance(value, list):
        # Copy so later appends never touch the caller's frontmatter.
        return list(value)
    if isinstance(value, str):
        # Comma-separated string; drop empty entries ("a, b," or "").
        return [term.strip() for term in value.split(',') if term.strip()]
    return None


def _plain_author_names(value: Any) -> List[str]:
    """Return the non-empty string names in a str-or-list author field."""
    if isinstance(value, str):
        value = [value]
    if not isinstance(value, list):
        return []
    return [name for name in value if isinstance(name, str) and name.strip()]


def _creator_names(creator: Any) -> List[str]:
    """Return "givenName familyName" for each dict in a creator field."""
    if isinstance(creator, dict):
        creator = [creator]
    if not isinstance(creator, list):
        return []
    names = []
    for person in creator:
        if not isinstance(person, dict):
            continue
        # "or ''" keeps a null YAML value from being rendered as "None".
        given = person.get('givenName') or ''
        family = person.get('familyName') or ''
        full_name = f"{given} {family}".strip()
        if full_name:
            names.append(full_name)
    return names


def _wordpress_status(work_status: Any) -> Optional[str]:
    """Map a creativeWorkStatus value to 'publish'/'draft', else None."""
    if not isinstance(work_status, str):
        return None
    lowered = work_status.lower()
    if 'publish' in lowered:
        return 'publish'
    if lowered == 'draft':
        return 'draft'
    return None


def extract_wordpress_metadata(frontmatter: Dict[str, Any],
                               default_author: str = "admin") -> Dict[str, Any]:
    """
    Extract WordPress-relevant metadata from parsed frontmatter.

    Fields are looked up at the top level first and then in the nested
    ``#commonMetadata`` / ``#staticSiteGenerator`` / ``cover`` mappings.
    For image, author, status and date a source that exists but lacks a
    usable value no longer blocks the next source in the priority list.
    The input mapping is never modified.

    Args:
        frontmatter: Parsed frontmatter data
        default_author: Fallback author used when no source names one

    Returns:
        Dictionary with WordPress metadata. 'tags' (list) and 'author' are
        always present; 'title', 'excerpt', 'featured_image', 'categories',
        'status' and 'date' only when a source provides them.
    """
    if not isinstance(frontmatter, dict):
        # Non-mapping input carries no fields; fall back to defaults only.
        frontmatter = {}

    common_raw = frontmatter.get('#commonMetadata')
    has_common = isinstance(common_raw, dict)
    common = common_raw if has_common else {}

    static_raw = frontmatter.get('#staticSiteGenerator')
    static_gen = static_raw if isinstance(static_raw, dict) else {}

    cover_raw = frontmatter.get('cover')
    cover = cover_raw if isinstance(cover_raw, dict) else {}

    metadata: Dict[str, Any] = {}

    # Title. Priority: title > name > #commonMetadata.name
    if 'title' in frontmatter:
        metadata['title'] = frontmatter['title']
    elif 'name' in frontmatter:
        metadata['title'] = frontmatter['name']
    elif has_common:
        metadata['title'] = common.get('name', '')

    # Excerpt. Priority: summary > description > #commonMetadata.description
    if 'summary' in frontmatter:
        metadata['excerpt'] = frontmatter['summary']
    elif 'description' in frontmatter:
        metadata['excerpt'] = frontmatter['description']
    elif has_common:
        metadata['excerpt'] = common.get('description', '')

    # Featured image. Priority: image > cover.image > #commonMetadata.image
    if 'image' in frontmatter:
        metadata['featured_image'] = frontmatter['image']
    else:
        cover_image = cover.get('image', '')
        if cover_image:
            metadata['featured_image'] = cover_image
        elif 'image' in common:
            # Reached even when a cover mapping exists without an image.
            metadata['featured_image'] = common['image']

    # Tags are always present in the result (possibly empty).
    metadata['tags'] = _split_terms(frontmatter.get('tags')) or []

    # Categories only when given as list or comma-separated string.
    categories = _split_terms(frontmatter.get('categories'))
    if categories is not None:
        metadata['categories'] = categories

    # Author names per source, in priority order:
    # author > #staticSiteGenerator.author > #commonMetadata.creator
    author_sources = [
        _plain_author_names(frontmatter.get('author')),
        _plain_author_names(static_gen.get('author')),
        _creator_names(common.get('creator')),
    ]

    # Primary author: first name of the first source that has any.
    metadata['author'] = next(
        (names[0] for names in author_sources if names),
        default_author,
    )

    # Every author from every source also becomes a tag
    # (format: Firstname_Lastname), without duplicates.
    seen_authors = set()
    for names in author_sources:
        for author_name in names:
            if author_name in seen_authors:
                continue
            seen_authors.add(author_name)
            author_tag = format_author_as_tag(author_name)
            if author_tag and author_tag not in metadata['tags']:
                metadata['tags'].append(author_tag)

    # Status. An explicit 'status' is passed through unchanged; otherwise
    # creativeWorkStatus (top level, then #commonMetadata) is mapped.
    if 'status' in frontmatter:
        metadata['status'] = frontmatter['status']
    else:
        for source in (frontmatter, common):
            mapped_status = _wordpress_status(source.get('creativeWorkStatus'))
            if mapped_status is not None:
                metadata['status'] = mapped_status
                break

    # Date. Priority: date > datePublished > #commonMetadata.datePublished
    # > #staticSiteGenerator.datePublished. Null values count as absent so
    # an empty YAML field does not become the string "None".
    date_candidates = (
        frontmatter.get('date'),
        frontmatter.get('datePublished'),
        common.get('datePublished'),
        static_gen.get('datePublished'),
    )
    date_value = next((d for d in date_candidates if d is not None), None)
    if date_value is not None:
        metadata['date'] = str(date_value)

    return metadata


def parse_markdown_with_metadata(markdown_content: str,
                                 default_author: str = "admin") -> Dict[str, Any]:
    """
    Parse a Markdown document and collect all WordPress-relevant data.

    Args:
        markdown_content: Complete Markdown text
        default_author: Fallback author passed on to the metadata extraction

    Returns:
        Dictionary with 'content' (the Markdown body) and 'metadata'
        (empty dict when the document has no usable frontmatter)
    """
    frontmatter, markdown_body = extract_frontmatter(markdown_content)

    # Missing or empty frontmatter leaves the metadata dict empty.
    metadata = (
        extract_wordpress_metadata(frontmatter, default_author)
        if frontmatter
        else {}
    )

    return {'content': markdown_body, 'metadata': metadata}


def get_base_url(url: str) -> str:
    """
    Return the directory part of a URL, ending in a slash.

    Useful as the base for resolving relative image paths.

    For absolute URLs (scheme and host present) only the path component is
    shortened: query string and fragment are dropped, and a URL without any
    path yields the site root (``https://example.com`` ->
    ``https://example.com/``). Any other string is cut after its last ``/``;
    a string without ``/`` is returned unchanged.

    Args:
        url: Complete URL

    Returns:
        Base URL (e.g. https://example.com/path/)
    """
    from urllib.parse import urlsplit, urlunsplit

    try:
        parts = urlsplit(url)
    except ValueError:
        # Malformed authority (e.g. broken IPv6 literal): use plain
        # string handling below rather than raising.
        parts = None

    if parts is not None and parts.scheme and parts.netloc:
        # Split on the path only, so the "//" after the scheme and any "/"
        # inside the query or fragment cannot be mistaken for a separator.
        directory = parts.path.rpartition('/')[0] + '/'
        return urlunsplit((parts.scheme, parts.netloc, directory, '', ''))

    head, separator, _ = url.rpartition('/')
    return head + '/' if separator else url


def resolve_relative_image_url(image_path: str, base_url: str) -> str:
    """
    Resolve an image reference against the base URL of its Markdown file.

    Resolution follows standard URL rules via ``urllib.parse.urljoin``:
    - references that already carry a scheme (http, https, data, ...) are
      returned unchanged
    - ``//host/img.png`` takes the scheme of base_url
    - ``/img.png`` is resolved against the host of base_url
    - ``img.png``, ``./img.png`` and ``../img.png`` are resolved against the
      directory of base_url, with dot segments removed

    Args:
        image_path: Image path (relative or absolute)
        base_url: Base URL of the Markdown file; should end in "/" when it
            names a directory, otherwise its last path segment is replaced

    Returns:
        Absolute URL of the image
    """
    from urllib.parse import urljoin

    return urljoin(base_url, image_path)