MD2WordPress/markdown_parser.py
Jörg Lohrer 9ba1aa7b10 Bugfix: Tag-Duplikate, Post-Duplikate und Veröffentlichungsdatum
Fixes:
- Tag/Kategorie-Erstellung: Bessere Fehlerbehandlung für bereits existierende Tags
- Post-Duplikatsprüfung: Verbesserte Suche mit status='any' und case-insensitive Vergleich
- Veröffentlichungsdatum: datePublished aus Frontmatter wird als WordPress-Datum gesetzt
- Erweiterte Datumsextraktion aus verschiedenen Frontmatter-Strukturen

Neue Datei:
- USAGE_MODES.md: Übersicht der drei Verwendungsmodi
2025-10-01 08:30:07 +02:00

231 lines
7.9 KiB
Python

#!/usr/bin/env python3
"""
Markdown Parser mit YAML-Frontmatter Unterstützung
Extrahiert Metadaten aus Markdown-Dateien für WordPress-Import
"""
import re
import yaml
from typing import Dict, Any, Optional, List
def extract_frontmatter(markdown_content: str) -> tuple[Optional[Dict[str, Any]], str]:
    """
    Split a Markdown document into its YAML frontmatter and its body.

    The frontmatter is the block between a leading ``---`` line and the
    next ``---`` line. The closing marker may be the last line of the
    document (with or without a trailing newline).

    Args:
        markdown_content: Complete Markdown text.

    Returns:
        Tuple ``(frontmatter_dict, markdown_body)``.

        * No frontmatter block, unparsable YAML, or YAML that is not a
          mapping: ``(None, markdown_content)`` (document left untouched).
        * Empty frontmatter block: ``(None, markdown_body)``.
        * Otherwise: the parsed mapping and the text after the block.
    """
    # The "\n<body>" part after the closing marker is optional so that a
    # document consisting only of frontmatter is still recognised.
    frontmatter_pattern = r'^---\s*\n(.*?)\n---\s*(?:\n(.*))?$'
    match = re.match(frontmatter_pattern, markdown_content, re.DOTALL)
    if not match:
        # No frontmatter found
        return None, markdown_content

    frontmatter_text = match.group(1)
    markdown_body = match.group(2) or ''

    # Keep the try body minimal: only the YAML parser can raise here.
    try:
        frontmatter_data = yaml.safe_load(frontmatter_text)
    except yaml.YAMLError as e:
        print(f"Warnung: Fehler beim Parsen des YAML-Frontmatters: {e}")
        return None, markdown_content

    if frontmatter_data is None:
        # Empty block (or comments only): no metadata, but the markers
        # are still removed from the body.
        return None, markdown_body

    if not isinstance(frontmatter_data, dict):
        # safe_load returns scalars/lists for non-mapping YAML, e.g. plain
        # text enclosed by two horizontal rules. That is not metadata, so
        # treat the document as having no frontmatter instead of handing
        # callers a value that breaks dict access.
        return None, markdown_content

    return frontmatter_data, markdown_body
def _split_terms(raw: Any) -> Optional[List[Any]]:
    """Normalise a tags/categories value to a list; None if it is unusable."""
    if isinstance(raw, list):
        return raw
    if isinstance(raw, str):
        # Skip empty entries produced by trailing or doubled commas.
        return [part.strip() for part in raw.split(',') if part.strip()]
    return None


def _author_from_field(raw: Any) -> Any:
    """Return the first entry of a non-empty list, or the string itself."""
    if isinstance(raw, list) and raw:
        return raw[0]
    if isinstance(raw, str):
        return raw
    return None


def _author_from_creators(raw: Any) -> Optional[str]:
    """Build "givenName familyName" from the first creator dict, if any."""
    if isinstance(raw, list) and raw and isinstance(raw[0], dict):
        given = raw[0].get('givenName') or ''
        family = raw[0].get('familyName') or ''
        return f"{given} {family}".strip() or None
    return None


def extract_wordpress_metadata(frontmatter: Dict[str, Any],
                               default_author: str = "admin") -> Dict[str, Any]:
    """
    Extract the WordPress-relevant metadata from parsed frontmatter.

    For image, author and date, a source that exists but does not carry
    the value no longer blocks the lower-priority sources.

    Args:
        frontmatter: Parsed frontmatter mapping.
        default_author: Author used when no source provides one.

    Returns:
        Dictionary that always contains ``author`` and, when available,
        ``title``, ``excerpt``, ``featured_image``, ``tags``,
        ``categories``, ``status`` and ``date`` (date as a string).
    """
    metadata: Dict[str, Any] = {}

    raw_common = frontmatter.get('#commonMetadata')
    has_common = isinstance(raw_common, dict)
    common = raw_common if has_common else {}
    raw_static = frontmatter.get('#staticSiteGenerator')
    static_gen = raw_static if isinstance(raw_static, dict) else {}

    # Title. Priority: title > name > #commonMetadata.name
    if 'title' in frontmatter:
        metadata['title'] = frontmatter['title']
    elif 'name' in frontmatter:
        metadata['title'] = frontmatter['name']
    elif has_common:
        metadata['title'] = common.get('name', '')

    # Excerpt. Priority: summary > description > #commonMetadata.description
    if 'summary' in frontmatter:
        metadata['excerpt'] = frontmatter['summary']
    elif 'description' in frontmatter:
        metadata['excerpt'] = frontmatter['description']
    elif has_common:
        metadata['excerpt'] = common.get('description', '')

    # Featured image. Priority: image > cover.image > #commonMetadata.image
    cover = frontmatter.get('cover')
    if 'image' in frontmatter:
        metadata['featured_image'] = frontmatter['image']
    elif isinstance(cover, dict) and cover.get('image'):
        metadata['featured_image'] = cover['image']
    elif 'image' in common:
        metadata['featured_image'] = common['image']

    # Tags and categories: a list, or a comma-separated string.
    for key in ('tags', 'categories'):
        if key in frontmatter:
            terms = _split_terms(frontmatter[key])
            if terms is not None:
                metadata[key] = terms

    # Author. Priority: author > #staticSiteGenerator.author >
    # first #commonMetadata.creator > default_author.
    author_candidates = (
        _author_from_field(frontmatter.get('author')),
        _author_from_field(static_gen.get('author')),
        _author_from_creators(common.get('creator')),
    )
    metadata['author'] = next(
        (name for name in author_candidates if name is not None and name != ''),
        default_author,
    )

    # Status: an explicit value wins; otherwise map creativeWorkStatus.
    if 'status' in frontmatter:
        metadata['status'] = frontmatter['status']
    else:
        work_status = common.get('creativeWorkStatus')
        # The value may be null or non-text in YAML; only strings are mapped.
        if isinstance(work_status, str):
            mapped = {'published': 'publish', 'draft': 'draft'}.get(work_status.lower())
            if mapped is not None:
                metadata['status'] = mapped

    # Date. Priority: date > datePublished > #commonMetadata.datePublished
    # > #staticSiteGenerator.datePublished. Null values are skipped so the
    # literal string 'None' is never stored.
    date_candidates = (
        frontmatter.get('date'),
        frontmatter.get('datePublished'),
        common.get('datePublished'),
        static_gen.get('datePublished'),
    )
    for candidate in date_candidates:
        if candidate is not None:
            metadata['date'] = str(candidate)
            break

    return metadata
def parse_markdown_with_metadata(markdown_content: str,
                                 default_author: str = "admin") -> Dict[str, Any]:
    """
    Parse a Markdown document into its body and WordPress metadata.

    Args:
        markdown_content: Complete Markdown text.
        default_author: Author passed on to the metadata extraction as
            fallback.

    Returns:
        Dictionary with the keys ``'content'`` (Markdown body) and
        ``'metadata'`` (extracted metadata; an empty dict when the
        document yields no frontmatter).
    """
    frontmatter, markdown_body = extract_frontmatter(markdown_content)

    # Metadata is only extracted when a non-empty frontmatter came back.
    if frontmatter:
        extracted = extract_wordpress_metadata(frontmatter, default_author)
    else:
        extracted = {}

    return {'content': markdown_body, 'metadata': extracted}
def get_base_url(url: str) -> str:
    """
    Return the directory part of a URL, ending in a slash.

    Useful as the base for resolving relative image paths. Query string
    and fragment are dropped, and a URL without any path yields the site
    root instead of a truncated ``scheme://``.

    Args:
        url: Complete URL (or a plain relative path).

    Returns:
        Base URL, e.g. ``https://example.com/path/`` for
        ``https://example.com/path/file.md``. A string that contains no
        slash at all is returned unchanged.
    """
    from urllib.parse import urlsplit, urlunsplit

    try:
        parts = urlsplit(url)
    except ValueError:
        # Malformed authority (e.g. a broken IPv6 literal): fall back to
        # plain string handling rather than raising.
        parts = None

    if parts is None or not (parts.scheme or parts.netloc):
        # Not a real URL: cut after the last slash, if there is one.
        head, sep, _ = url.rpartition('/')
        return head + sep if sep else url

    # Only the path decides the directory; the slashes in "scheme://" or
    # in a query string must not be mistaken for path separators.
    directory = parts.path[:parts.path.rfind('/') + 1] or '/'
    return urlunsplit((parts.scheme, parts.netloc, directory, '', ''))
def resolve_relative_image_url(image_path: str, base_url: str) -> str:
    """
    Resolve an image path against the base URL of the Markdown file.

    Args:
        image_path: Image path (relative, server-absolute, protocol-relative
            or already an absolute URL).
        base_url: Base URL of the Markdown file.

    Returns:
        Absolute URL of the image. ``http://`` and ``https://`` URLs are
        returned unchanged.
    """
    from urllib.parse import urljoin

    # Already an absolute URL: hand it back untouched.
    if image_path.startswith(('http://', 'https://')):
        return image_path

    # urljoin implements standard reference resolution: it handles
    # "/rooted" paths, "./" and "../" segments and protocol-relative
    # "//host/..." references, which plain concatenation got wrong.
    return urljoin(base_url, image_path)