Feature: Automatische Metadaten-Extraktion aus Frontmatter

- Neuer markdown_parser.py mit YAML-Frontmatter-Extraktion
- Unterstützung für drei Modi: einzelne URL, YAML-Batch, Forgejo-Repo
- Metadaten (name, description, tags, image, author) aus Frontmatter
- Schema.org-Support für commonMetadata
- Vereinfachte posts.yaml (nur URLs statt vollständiger Metadaten)
- Aktualisierte Dokumentation (README.md, QUICKSTART.md)
- Beispiel-Beitrag mit vollständigem Frontmatter
2025-10-01 08:10:09 +02:00 · 2025-10-01 08:10:09 +02:00 · 7a234be652
commit 7a234be652
parent e3b19bb0df
6 changed files with 880 additions and 180 deletions
--- a/markdown_parser.py
+++ b/markdown_parser.py
@ -0,0 +1,226 @@
+#!/usr/bin/env python3
+"""
+Markdown Parser mit YAML-Frontmatter Unterstützung
+Extrahiert Metadaten aus Markdown-Dateien für WordPress-Import
+"""
+
+import re
+import yaml
+from typing import Dict, Any, Optional, List
+
+
+def extract_frontmatter(markdown_content: str) -> tuple[Optional[Dict[str, Any]], str]:
+    """
+    Split a Markdown document into its YAML frontmatter and its body.
+
+    The frontmatter is the text between a leading ``---`` line and the next
+    ``---`` line. It is only accepted when it parses to a YAML mapping.
+
+    Args:
+        markdown_content: Complete Markdown text.
+
+    Returns:
+        Tuple of (frontmatter_dict, markdown_body).
+        - Valid mapping: the parsed dict and the text after the closing marker.
+        - Empty frontmatter block: (None, text after the closing marker).
+        - No frontmatter, invalid YAML, or YAML that is not a mapping:
+          (None, the unchanged input).
+    """
+    # The closing marker may be the very last line of the file, so accept
+    # end-of-string as well as a newline after it.
+    frontmatter_pattern = r'^---\s*\n(.*?)\n---\s*(?:\n|$)(.*)$'
+    match = re.match(frontmatter_pattern, markdown_content, re.DOTALL)
+
+    if not match:
+        # No frontmatter found
+        return None, markdown_content
+
+    frontmatter_text = match.group(1)
+    markdown_body = match.group(2)
+
+    # Keep the try body limited to the call that can actually raise.
+    try:
+        frontmatter_data = yaml.safe_load(frontmatter_text)
+    except yaml.YAMLError as e:
+        print(f"Warnung: Fehler beim Parsen des YAML-Frontmatters: {e}")
+        return None, markdown_content
+
+    if frontmatter_data is None:
+        # Empty frontmatter block: nothing to extract, but still strip it.
+        return None, markdown_body
+
+    if not isinstance(frontmatter_data, dict):
+        # A scalar or list between the markers is not metadata (the document
+        # may simply start with a horizontal rule). Returning it would make
+        # callers fail on dict access, and dropping it would lose content.
+        return None, markdown_content
+
+    return frontmatter_data, markdown_body
+
+
+def _split_terms(value: Any) -> Optional[List[Any]]:
+    """Normalize a tags/categories value to a list, or None if unusable."""
+    if isinstance(value, list):
+        return value
+    if isinstance(value, str):
+        # Skip empty entries produced by trailing or doubled commas.
+        return [term.strip() for term in value.split(',') if term.strip()]
+    return None
+
+
+def _first_author(value: Any) -> Optional[Any]:
+    """Return a string author, or the first entry of a non-empty list, else None."""
+    if isinstance(value, list) and value:
+        return value[0]
+    if isinstance(value, str):
+        return value
+    return None
+
+
+def _creator_name(creator: Any) -> Optional[str]:
+    """Build "givenName familyName" from the first creator dict, else None."""
+    if isinstance(creator, list) and creator and isinstance(creator[0], dict):
+        # `or ''` keeps explicit null values from rendering as "None".
+        given = creator[0].get('givenName') or ''
+        family = creator[0].get('familyName') or ''
+        return f"{given} {family}".strip() or None
+    return None
+
+
+def extract_wordpress_metadata(frontmatter: Dict[str, Any], 
+                               default_author: str = "admin") -> Dict[str, Any]:
+    """
+    Extract the WordPress-relevant metadata from parsed frontmatter.
+
+    Top-level keys take precedence; the nested ``#commonMetadata`` and
+    ``#staticSiteGenerator`` mappings are used as fallbacks. A fallback is
+    consulted whenever the preferred source yields no usable value, not only
+    when the preferred container is missing altogether.
+
+    Args:
+        frontmatter: Parsed frontmatter mapping.
+        default_author: Author used when no source provides one.
+
+    Returns:
+        Dictionary that always contains 'author' and, when available,
+        'title', 'excerpt', 'featured_image', 'tags', 'categories',
+        'status' and 'date'.
+    """
+    metadata: Dict[str, Any] = {}
+
+    common = frontmatter.get('#commonMetadata')
+    if not isinstance(common, dict):
+        common = None
+
+    # Title, priority: title > name > commonMetadata.name
+    if 'title' in frontmatter:
+        metadata['title'] = frontmatter['title']
+    elif 'name' in frontmatter:
+        metadata['title'] = frontmatter['name']
+    elif common is not None:
+        metadata['title'] = common.get('name', '')
+
+    # Excerpt, priority: summary > description > commonMetadata.description
+    if 'summary' in frontmatter:
+        metadata['excerpt'] = frontmatter['summary']
+    elif 'description' in frontmatter:
+        metadata['excerpt'] = frontmatter['description']
+    elif common is not None:
+        metadata['excerpt'] = common.get('description', '')
+
+    # Image, priority: image > cover.image > commonMetadata.image.
+    # A cover mapping without an image must not hide the commonMetadata image.
+    cover = frontmatter.get('cover')
+    cover_image = cover.get('image') if isinstance(cover, dict) else None
+    if 'image' in frontmatter:
+        metadata['featured_image'] = frontmatter['image']
+    elif cover_image:
+        metadata['featured_image'] = cover_image
+    elif common is not None and 'image' in common:
+        metadata['featured_image'] = common['image']
+
+    # Tags and categories: a list is taken as is, a string is comma-separated.
+    for key in ('tags', 'categories'):
+        terms = _split_terms(frontmatter.get(key))
+        if terms is not None:
+            metadata[key] = terms
+
+    # Author, priority: author > #staticSiteGenerator.author >
+    # commonMetadata.creator > default_author. Every source is tried in turn.
+    static_gen = frontmatter.get('#staticSiteGenerator')
+    candidates = [_first_author(frontmatter.get('author'))]
+    if isinstance(static_gen, dict):
+        candidates.append(_first_author(static_gen.get('author')))
+    if common is not None:
+        candidates.append(_creator_name(common.get('creator')))
+    metadata['author'] = next(
+        (candidate for candidate in candidates if candidate is not None),
+        default_author,
+    )
+
+    # Status: an explicit status wins, otherwise map creativeWorkStatus.
+    if 'status' in frontmatter:
+        metadata['status'] = frontmatter['status']
+    elif common is not None:
+        # The value may be null or a non-string in YAML; coerce before lower().
+        work_status = str(common.get('creativeWorkStatus') or '').lower()
+        if work_status == 'published':
+            metadata['status'] = 'publish'
+        elif work_status == 'draft':
+            metadata['status'] = 'draft'
+
+    # Date, priority: date > datePublished > commonMetadata.datePublished
+    if 'date' in frontmatter:
+        metadata['date'] = frontmatter['date']
+    elif 'datePublished' in frontmatter:
+        metadata['date'] = frontmatter['datePublished']
+    elif common is not None and 'datePublished' in common:
+        metadata['date'] = common['datePublished']
+
+    return metadata
+
+
+def parse_markdown_with_metadata(markdown_content: str, 
+                                 default_author: str = "admin") -> Dict[str, Any]:
+    """
+    Parse a Markdown document into its body and WordPress metadata.
+
+    Args:
+        markdown_content: Complete Markdown text.
+        default_author: Fallback author passed on to the metadata extraction.
+
+    Returns:
+        Dictionary with 'content' (the Markdown body) and 'metadata'
+        (empty when the document yields no frontmatter).
+    """
+    parsed_frontmatter, body_text = extract_frontmatter(markdown_content)
+
+    # Metadata is only extracted when frontmatter was found and is non-empty.
+    if parsed_frontmatter:
+        extracted = extract_wordpress_metadata(parsed_frontmatter, default_author)
+    else:
+        extracted = {}
+
+    return {'content': body_text, 'metadata': extracted}
+
+
+def get_base_url(url: str) -> str:
+    """
+    Return the directory part of a URL, for resolving relative image paths.
+
+    Query string and fragment are dropped. A URL without a path, such as
+    ``https://example.com``, yields the site root ``https://example.com/``.
+
+    Args:
+        url: Complete URL.
+
+    Returns:
+        Base URL ending in '/' (e.g. https://example.com/path/). Input that
+        contains no '/' at all is returned unchanged.
+    """
+    from urllib.parse import urlsplit, urlunsplit
+
+    parts = urlsplit(url)
+    if parts.scheme and parts.netloc:
+        # Cut inside the path component only, so neither the '//' after the
+        # scheme nor a '/' in the query string is mistaken for a separator.
+        directory = parts.path.rsplit('/', 1)[0] + '/'
+        return urlunsplit((parts.scheme, parts.netloc, directory, '', ''))
+
+    # Not an absolute URL (e.g. a plain relative path): cut at the last slash.
+    head, sep, _ = url.rpartition('/')
+    return head + '/' if sep else url
+
+
+def resolve_relative_image_url(image_path: str, base_url: str) -> str:
+    """
+    Resolve an image path against the base URL of its Markdown file.
+
+    Resolution follows standard URL reference rules via
+    ``urllib.parse.urljoin``: server-absolute paths ('/img/a.png'),
+    protocol-relative paths ('//cdn.example.com/a.png') and dot segments
+    ('../img/a.png') are all handled. If ``base_url`` does not end in '/',
+    its last path segment is treated as a file name and replaced.
+
+    Args:
+        image_path: Image path (relative or absolute).
+        base_url: Base URL of the Markdown file.
+
+    Returns:
+        Absolute URL of the image.
+    """
+    from urllib.parse import urljoin
+
+    # Already an absolute URL: return it untouched.
+    if image_path.startswith(('http://', 'https://')):
+        return image_path
+
+    return urljoin(base_url, image_path)