#!/usr/bin/env python3
"""
Markdown parser with YAML front matter support.

Extracts metadata from Markdown files for a WordPress import.
"""

import re
from typing import Dict, Any, Optional, List
from urllib.parse import urljoin, urlsplit

import yaml

# Front matter is the text between an opening and a closing ``---`` marker at
# the very start of the document. The closing marker may be the last thing in
# the file (no trailing newline), in which case the body group is ``None``.
_FRONTMATTER_RE = re.compile(r'^---\s*\n(.*?)\n---\s*(?:\n(.*))?$', re.DOTALL)


def extract_frontmatter(markdown_content: str) -> tuple[Optional[Dict[str, Any]], str]:
    """
    Split a Markdown document into YAML front matter and Markdown body.

    Args:
        markdown_content: Complete Markdown text

    Returns:
        Tuple of (frontmatter_dict, markdown_body). ``frontmatter_dict`` is
        ``None`` when no usable front matter exists. When the front matter
        block is missing, is invalid YAML, or is not a YAML mapping, the
        original text is returned unchanged as the body.
    """
    match = _FRONTMATTER_RE.match(markdown_content)
    if not match:
        # No front matter found
        return None, markdown_content

    frontmatter_text = match.group(1)
    markdown_body = match.group(2) or ''

    try:
        frontmatter_data = yaml.safe_load(frontmatter_text)
    except yaml.YAMLError as e:
        print(f"Warnung: Fehler beim Parsen des YAML-Frontmatters: {e}")
        return None, markdown_content

    if frontmatter_data is None:
        # Block was empty or contained only comments: nothing to report, but
        # the block itself is still stripped from the body.
        return None, markdown_body

    if not isinstance(frontmatter_data, dict):
        # A scalar or list is not metadata (e.g. plain text between two
        # horizontal rules); leave the document untouched.
        print("Warnung: YAML-Frontmatter ist kein Mapping und wird ignoriert")
        return None, markdown_content

    return frontmatter_data, markdown_body


def format_author_as_tag(author_name: str) -> str:
    """
    Format an author name as a tag of the form Firstname_Lastname.

    Args:
        author_name: Author name (e.g. "Max Mustermann" or "Max")

    Returns:
        Tag-formatted name (e.g. "Max_Mustermann")
    """
    # Any run of whitespace and/or underscores becomes a single underscore.
    return re.sub(r'[\s_]+', '_', author_name.strip())


def _as_dict(value: Any) -> Dict[str, Any]:
    """Return value if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _split_list(value: Any) -> Optional[List[Any]]:
    """Turn a list or comma-separated string into a new list, else None."""
    if isinstance(value, list):
        # Copy so later appends never mutate the caller's front matter.
        return list(value)
    if isinstance(value, str):
        return [part.strip() for part in value.split(',') if part.strip()]
    return None


def _author_names(value: Any) -> List[str]:
    """Return the non-empty string names in a str or list author field."""
    if isinstance(value, str):
        value = [value]
    if not isinstance(value, list):
        return []
    return [item for item in value if isinstance(item, str) and item.strip()]


def _creator_name(creator: Any) -> str:
    """Build "given family" from a creator dict; '' if nothing is usable."""
    if not isinstance(creator, dict):
        return ''
    given = creator.get('givenName') or ''
    family = creator.get('familyName') or ''
    return f"{given} {family}".strip()


def _collect_authors(frontmatter: Dict[str, Any]) -> List[str]:
    """Collect author names in priority order from all known sources."""
    names = _author_names(frontmatter.get('author'))
    static_gen = _as_dict(frontmatter.get('#staticSiteGenerator'))
    names.extend(_author_names(static_gen.get('author')))

    creators = _as_dict(frontmatter.get('#commonMetadata')).get('creator')
    if isinstance(creators, dict):
        creators = [creators]
    if isinstance(creators, list):
        names.extend(name for name in map(_creator_name, creators) if name)
    return names


def _normalize_work_status(value: Any) -> Optional[str]:
    """Map a creativeWorkStatus value to 'publish' / 'draft', else None."""
    text = str(value or '').strip().lower()
    if 'publish' in text:
        return 'publish'
    if text == 'draft':
        return 'draft'
    return None


def extract_wordpress_metadata(frontmatter: Dict[str, Any], default_author: str = "admin") -> Dict[str, Any]:
    """
    Extract the WordPress-relevant metadata from parsed front matter.

    Args:
        frontmatter: Parsed front matter data
        default_author: Fallback author used when no author is found

    Returns:
        Dictionary of WordPress metadata. 'tags' and 'author' are always
        present; 'title', 'excerpt', 'featured_image', 'categories',
        'status' and 'date' only when a source for them exists.
    """
    if not isinstance(frontmatter, dict):
        frontmatter = {}

    metadata: Dict[str, Any] = {}

    common_raw = frontmatter.get('#commonMetadata')
    has_common = isinstance(common_raw, dict)
    common = _as_dict(common_raw)
    static_gen = _as_dict(frontmatter.get('#staticSiteGenerator'))
    cover = _as_dict(frontmatter.get('cover'))

    # Title. Priority: title > name > #commonMetadata.name
    if 'title' in frontmatter:
        metadata['title'] = frontmatter['title']
    elif 'name' in frontmatter:
        metadata['title'] = frontmatter['name']
    elif has_common:
        metadata['title'] = common.get('name', '')

    # Excerpt. Priority: summary > description > #commonMetadata.description
    if 'summary' in frontmatter:
        metadata['excerpt'] = frontmatter['summary']
    elif 'description' in frontmatter:
        metadata['excerpt'] = frontmatter['description']
    elif has_common:
        metadata['excerpt'] = common.get('description', '')

    # Featured image. Priority: image > cover.image > #commonMetadata.image.
    # An empty cover must not hide the #commonMetadata fallback.
    if 'image' in frontmatter:
        metadata['featured_image'] = frontmatter['image']
    elif cover.get('image'):
        metadata['featured_image'] = cover['image']
    elif 'image' in common:
        metadata['featured_image'] = common['image']

    # Tags: always present, at least as an empty list
    metadata['tags'] = _split_list(frontmatter.get('tags')) or []

    # Categories: only set when given as list or comma-separated string
    categories = _split_list(frontmatter.get('categories'))
    if categories is not None:
        metadata['categories'] = categories

    # Author. Priority: author > #staticSiteGenerator.author >
    # #commonMetadata.creator > default_author
    all_authors = _collect_authors(frontmatter)
    metadata['author'] = all_authors[0] if all_authors else default_author

    # Every author also becomes a tag (Firstname_Lastname), without duplicates
    for author_name in all_authors:
        author_tag = format_author_as_tag(author_name)
        if author_tag and author_tag not in metadata['tags']:
            metadata['tags'].append(author_tag)

    # Status. An explicit 'status' is passed through unchanged; otherwise
    # creativeWorkStatus is normalized, top level before #commonMetadata.
    if 'status' in frontmatter:
        metadata['status'] = frontmatter['status']
    else:
        for source in (frontmatter, common):
            status = _normalize_work_status(source.get('creativeWorkStatus'))
            if status:
                metadata['status'] = status
                break

    # Date. Priority: date > datePublished > #commonMetadata.datePublished >
    # #staticSiteGenerator.datePublished. Empty (None) values are skipped.
    date_sources = (
        (frontmatter, 'date'),
        (frontmatter, 'datePublished'),
        (common, 'datePublished'),
        (static_gen, 'datePublished'),
    )
    for source, key in date_sources:
        if source.get(key) is not None:
            metadata['date'] = str(source[key])
            break

    return metadata


def parse_markdown_with_metadata(markdown_content: str, default_author: str = "admin") -> Dict[str, Any]:
    """
    Parse a Markdown document and extract all WordPress-relevant data.

    Args:
        markdown_content: Complete Markdown text
        default_author: Fallback author

    Returns:
        Dictionary with 'metadata' and 'content' (the Markdown body)
    """
    frontmatter, markdown_body = extract_frontmatter(markdown_content)

    result = {
        'content': markdown_body,
        'metadata': {},
    }

    if frontmatter:
        result['metadata'] = extract_wordpress_metadata(frontmatter, default_author)

    return result


def get_base_url(url: str) -> str:
    """
    Return the directory part of a URL, useful for relative image paths.

    Args:
        url: Complete URL

    Returns:
        Base URL ending in '/' (e.g. https://example.com/path/). A string
        without any '/' is returned unchanged.
    """
    parsed = urlsplit(url)
    if parsed.scheme and parsed.netloc:
        # Work on the path only, so a '/' inside the query or fragment and
        # the '//' after the scheme cannot be mistaken for a path separator.
        directory = parsed.path.rpartition('/')[0] + '/'
        return f"{parsed.scheme}://{parsed.netloc}{directory}"

    head, sep, _tail = url.rpartition('/')
    return head + sep if sep else url


def resolve_relative_image_url(image_path: str, base_url: str) -> str:
    """
    Resolve an image path against the base URL of the Markdown file.

    Args:
        image_path: Image path (relative or absolute)
        base_url: Base URL of the Markdown file

    Returns:
        Absolute URL of the image
    """
    # Already an absolute URL: return it untouched
    if image_path.startswith(('http://', 'https://')):
        return image_path

    # urljoin covers server-absolute paths ('/x'), protocol-relative URLs
    # ('//host/x'), plain relative paths and '../' segments.
    return urljoin(base_url, image_path)