#!/usr/bin/env python3
"""
Markdown parser with YAML front matter support.

Extracts metadata from Markdown files for a WordPress import.
"""

import re
from typing import Dict, Any, Optional, List
from urllib.parse import urljoin, urlsplit

import yaml

# Front matter is a block delimited by two `---` lines at the very start of
# the document. The closing marker is either followed by a newline (group 2
# then holds the body) or sits at the very end of the text (group 2 is None).
# An optional leading BOM is tolerated so files read without `utf-8-sig`
# still work.
_FRONTMATTER_RE = re.compile(
    r'^\ufeff?---\s*\n(.*?)\n---(?:\s*\n(.*)|[ \t\r]*)\Z',
    re.DOTALL,
)

_COMMON_KEY = '#commonMetadata'
_STATIC_GEN_KEY = '#staticSiteGenerator'


def extract_frontmatter(markdown_content: str) -> tuple[Optional[Dict[str, Any]], str]:
    """
    Split a Markdown document into YAML front matter and Markdown body.

    Args:
        markdown_content: Complete Markdown text.

    Returns:
        Tuple of (frontmatter_dict, markdown_body). The first element is
        None when there is no front matter, when it is empty, when it fails
        to parse, or when it is not a YAML mapping. On a parse failure or a
        non-mapping block the full, unmodified input is returned as the body
        so that no text is lost.
    """
    match = _FRONTMATTER_RE.match(markdown_content)
    if match is None:
        # No front matter found
        return None, markdown_content

    frontmatter_text = match.group(1)
    markdown_body = match.group(2) or ''

    try:
        frontmatter_data = yaml.safe_load(frontmatter_text)
    except yaml.YAMLError as e:
        print(f"Warnung: Fehler beim Parsen des YAML-Frontmatters: {e}")
        return None, markdown_content

    if frontmatter_data is None:
        # Empty block (or comments only): nothing to extract, body is valid.
        return None, markdown_body
    if not isinstance(frontmatter_data, dict):
        # A scalar or list between two `---` lines is not metadata (it may
        # simply be text between two horizontal rules); keep it in the body.
        return None, markdown_content
    return frontmatter_data, markdown_body


def _split_terms(value: Any) -> Optional[list]:
    """Normalize a tag/category value to a list; None if the type is unsupported."""
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        # Drop empty entries produced by stray or trailing commas.
        return [term for term in (part.strip() for part in value.split(',')) if term]
    return None


def _pick_author(value: Any) -> Any:
    """Return the first entry of a non-empty list, a string as is, else None."""
    if isinstance(value, list) and value:
        return value[0]
    if isinstance(value, str):
        return value
    return None


def _creator_name(common: Dict[str, Any]) -> str:
    """Build 'givenName familyName' from the first creator entry; '' if unavailable."""
    creators = common.get('creator')
    if not (isinstance(creators, list) and creators):
        return ''
    first_creator = creators[0]
    if not isinstance(first_creator, dict):
        return ''
    # `or ''` keeps explicit nulls from being rendered as the text "None".
    given = first_creator.get('givenName') or ''
    family = first_creator.get('familyName') or ''
    return f"{given} {family}".strip()


def extract_wordpress_metadata(frontmatter: Dict[str, Any], default_author: str = "admin") -> Dict[str, Any]:
    """
    Extract WordPress-relevant metadata from parsed front matter.

    Args:
        frontmatter: Parsed front matter data. A non-dict value is treated
            as empty front matter.
        default_author: Fallback author used when no author can be determined.

    Returns:
        Dictionary that may contain the keys 'title', 'excerpt',
        'featured_image', 'tags', 'categories', 'status' and 'date', and
        always contains 'author'.
    """
    if not isinstance(frontmatter, dict):
        frontmatter = {}

    metadata: Dict[str, Any] = {}
    common = frontmatter.get(_COMMON_KEY)
    has_common = isinstance(common, dict)

    # Title. Priority: title > name > commonMetadata.name
    if 'title' in frontmatter:
        metadata['title'] = frontmatter['title']
    elif 'name' in frontmatter:
        metadata['title'] = frontmatter['name']
    elif has_common:
        metadata['title'] = common.get('name', '')

    # Excerpt. Priority: summary > description > commonMetadata.description
    if 'summary' in frontmatter:
        metadata['excerpt'] = frontmatter['summary']
    elif 'description' in frontmatter:
        metadata['excerpt'] = frontmatter['description']
    elif has_common:
        metadata['excerpt'] = common.get('description', '')

    # Featured image. Priority: image > cover.image > commonMetadata.image
    if 'image' in frontmatter:
        metadata['featured_image'] = frontmatter['image']
    else:
        cover = frontmatter.get('cover')
        cover_image = cover.get('image', '') if isinstance(cover, dict) else ''
        if cover_image:
            metadata['featured_image'] = cover_image
        elif has_common and 'image' in common:
            # Reached also when a `cover` mapping exists but carries no image.
            metadata['featured_image'] = common['image']

    # Tags and categories: a list is taken as is, a string is comma-split.
    for key in ('tags', 'categories'):
        if key in frontmatter:
            terms = _split_terms(frontmatter[key])
            if terms is not None:
                metadata[key] = terms

    # Author. Priority: author > staticSiteGenerator.author > first creator
    if 'author' in frontmatter:
        author = _pick_author(frontmatter['author'])
        metadata['author'] = default_author if author is None else author
    else:
        static_gen = frontmatter.get(_STATIC_GEN_KEY)
        author = None
        if isinstance(static_gen, dict):
            author = _pick_author(static_gen.get('author'))
        if author is None and has_common:
            # An empty name must not block the default author below.
            author = _creator_name(common) or None
        if author is not None:
            metadata['author'] = author

    # Author fallback
    if 'author' not in metadata:
        metadata['author'] = default_author

    # Status: explicit value wins, otherwise map creativeWorkStatus.
    if 'status' in frontmatter:
        metadata['status'] = frontmatter['status']
    elif has_common:
        work_status = common.get('creativeWorkStatus')
        # Guard against null / non-string values before normalizing.
        if isinstance(work_status, str):
            work_status = work_status.strip().lower()
            if work_status == 'published':
                metadata['status'] = 'publish'
            elif work_status == 'draft':
                metadata['status'] = 'draft'

    # Date. Priority: date > datePublished > commonMetadata.datePublished
    if 'date' in frontmatter:
        metadata['date'] = frontmatter['date']
    elif 'datePublished' in frontmatter:
        metadata['date'] = frontmatter['datePublished']
    elif has_common and 'datePublished' in common:
        metadata['date'] = common['datePublished']

    return metadata


def parse_markdown_with_metadata(markdown_content: str, default_author: str = "admin") -> Dict[str, Any]:
    """
    Parse a Markdown document and extract all WordPress-relevant data.

    Args:
        markdown_content: Complete Markdown text.
        default_author: Fallback author.

    Returns:
        Dictionary with 'metadata' (empty dict when there is no usable
        front matter) and 'content' (the Markdown body).
    """
    frontmatter, markdown_body = extract_frontmatter(markdown_content)

    result = {
        'content': markdown_body,
        'metadata': {}
    }

    if frontmatter:
        result['metadata'] = extract_wordpress_metadata(frontmatter, default_author)

    return result


def get_base_url(url: str) -> str:
    """
    Return the directory part of a URL, useful for resolving relative image paths.

    Query string and fragment are discarded. For an absolute URL without a
    path (e.g. https://example.com) the site root is returned.

    Args:
        url: Complete URL.

    Returns:
        Base URL ending in '/' (e.g. https://example.com/path/). Input that
        is not an absolute URL and contains no '/' is returned unchanged.
    """
    try:
        parts = urlsplit(url)
    except ValueError:
        # Malformed authority (e.g. broken IPv6 literal): plain string split.
        parts = None

    if parts is not None and parts.scheme and parts.netloc:
        # Only the path is cut, so the '//' after the scheme and any '/'
        # inside the query string cannot be mistaken for a path separator.
        directory = parts.path.rpartition('/')[0] + '/'
        return f"{parts.scheme}://{parts.netloc}{directory}"

    head, sep, _ = url.rpartition('/')
    return head + '/' if sep else url


def resolve_relative_image_url(image_path: str, base_url: str) -> str:
    """
    Resolve an image path against the base URL of the Markdown file.

    Args:
        image_path: Image path (relative or absolute).
        base_url: Base URL of the Markdown file.

    Returns:
        Absolute URL of the image. http(s) URLs are returned unchanged;
        everything else is resolved with standard URL joining, which handles
        server-absolute paths ('/x.png'), protocol-relative URLs
        ('//host/x.png') and dot segments ('../x.png').
    """
    # Already an absolute URL: return as is
    if image_path.startswith(('http://', 'https://')):
        return image_path

    return urljoin(base_url, image_path)