- Previously only first author from list was converted to tag - Now collects all authors from all sources (author, #staticSiteGenerator.author, #commonMetadata.creator) - Handles both single authors and author lists - Removes duplicates if same author appears in multiple sources - All authors are added as individual tags in Vorname_Nachname format - Example: ['Florian Mayrhofer', 'Gina Buchwald-Chassée'] → tags 'Florian_Mayrhofer', 'Gina_Buchwald-Chassée'
321 lines
11 KiB
Python
321 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Markdown Parser mit YAML-Frontmatter Unterstützung
|
|
Extrahiert Metadaten aus Markdown-Dateien für WordPress-Import
|
|
"""
|
|
|
|
import re
|
|
import yaml
|
|
from typing import Dict, Any, Optional, List
|
|
|
|
|
|
def extract_frontmatter(markdown_content: str) -> tuple[Optional[Dict[str, Any]], str]:
    """
    Split a Markdown document into its YAML front matter and its body.

    Front matter is only recognised at the very start of the document,
    between two ``---`` marker lines, and only when it parses to a YAML
    mapping.

    Args:
        markdown_content: Complete Markdown text.

    Returns:
        Tuple of (frontmatter_dict, markdown_body). ``frontmatter_dict`` is
        ``None`` when there is no front matter, when it is empty, when the
        YAML is invalid, or when it does not describe a mapping. In the
        "invalid" and "not a mapping" cases the unmodified input is returned
        as the body so that no content is lost.
    """
    # The closing marker may be the last line of the file, so the newline
    # after it (and the body that follows) is optional.
    frontmatter_pattern = r'^---\s*\n(.*?)\n---\s*(?:\n(.*))?$'
    match = re.match(frontmatter_pattern, markdown_content, re.DOTALL)

    if not match:
        # No front matter found.
        return None, markdown_content

    frontmatter_text = match.group(1)
    # Group 2 is None when the document ends right after the closing marker.
    markdown_body = match.group(2) or ''

    try:
        frontmatter_data = yaml.safe_load(frontmatter_text)
    except yaml.YAMLError as e:
        print(f"Warnung: Fehler beim Parsen des YAML-Frontmatters: {e}")
        return None, markdown_content

    if frontmatter_data is None:
        # Empty or comment-only front matter: nothing to extract, but the
        # marker block itself is still removed from the body.
        return None, markdown_body

    if not isinstance(frontmatter_data, dict):
        # Valid YAML that is not a mapping (for example plain text between
        # two horizontal rules) is not front matter; treat it as content.
        return None, markdown_content

    return frontmatter_data, markdown_body
|
|
|
|
|
|
def format_author_as_tag(author_name: str) -> str:
    """
    Format an author name as a tag of the form ``Firstname_Lastname``.

    Leading and trailing whitespace is dropped, every run of whitespace
    (spaces, tabs, line breaks, non-breaking spaces) becomes a single
    underscore, and repeated underscores are collapsed to one.

    Args:
        author_name: Author name (e.g. "Max Mustermann" or "Max").

    Returns:
        Tag-formatted name (e.g. "Max_Mustermann"); an empty string if the
        name contains nothing but whitespace.
    """
    # str.split() without arguments strips the ends and splits on any
    # whitespace run, so tabs or line breaks cannot leak into the tag.
    tag_name = '_'.join(author_name.split())

    # Collapse underscores that were already adjacent in the name or became
    # adjacent after the whitespace replacement.
    while '__' in tag_name:
        tag_name = tag_name.replace('__', '_')

    return tag_name
|
|
|
|
|
|
def _nested_mapping(frontmatter: Dict[str, Any], key: str) -> Optional[Dict[str, Any]]:
    """Return the value stored under *key* if it is a dict, otherwise None."""
    value = frontmatter.get(key)
    return value if isinstance(value, dict) else None


def _names_from_author_field(value: Any) -> List[str]:
    """Return the non-blank string names found in an ``author`` value (string or list)."""
    if isinstance(value, str):
        candidates = [value]
    elif isinstance(value, list):
        candidates = value
    else:
        return []
    # Non-string entries (null, numbers, nested mappings) cannot be turned
    # into a name or a tag and are skipped instead of raising later on.
    return [name for name in candidates if isinstance(name, str) and name.strip()]


def _names_from_creator_field(value: Any) -> List[str]:
    """Return "givenName familyName" for every creator dict in *value* (dict or list of dicts)."""
    if isinstance(value, dict):
        entries = [value]
    elif isinstance(value, list):
        entries = value
    else:
        return []

    names = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        # `or ''` keeps an explicit null from being rendered as "None".
        given = entry.get('givenName') or ''
        family = entry.get('familyName') or ''
        full_name = f"{given} {family}".strip()
        if full_name:
            names.append(full_name)
    return names


def _status_from_work_status(value: Any) -> Optional[str]:
    """Map a ``creativeWorkStatus`` value to 'publish' or 'draft'; None if it is not recognised."""
    if not isinstance(value, str):
        return None
    work_status = value.lower()
    if 'publish' in work_status:
        return 'publish'
    if work_status == 'draft':
        return 'draft'
    return None


def extract_wordpress_metadata(frontmatter: Dict[str, Any],
                               default_author: str = "admin") -> Dict[str, Any]:
    """
    Extract the WordPress-relevant metadata from parsed front matter.

    Values are looked up in the top-level fields first and then in the
    nested ``#commonMetadata`` / ``#staticSiteGenerator`` mappings. A source
    that exists but does not carry the requested value no longer hides the
    lower-priority sources. The input mapping is not modified.

    Args:
        frontmatter: Parsed front matter data.
        default_author: Author used when no source provides a name.

    Returns:
        Dictionary with WordPress metadata. ``tags`` and ``author`` are
        always present; ``title``, ``excerpt``, ``featured_image``,
        ``categories``, ``status`` and ``date`` only when a source provides
        them. Every author found is additionally appended to ``tags`` in
        ``Firstname_Lastname`` form.
    """
    metadata: Dict[str, Any] = {}

    common = _nested_mapping(frontmatter, '#commonMetadata')
    static_gen = _nested_mapping(frontmatter, '#staticSiteGenerator')

    # Title. Priority: title > name > #commonMetadata.name
    if 'title' in frontmatter:
        metadata['title'] = frontmatter['title']
    elif 'name' in frontmatter:
        metadata['title'] = frontmatter['name']
    elif common is not None:
        metadata['title'] = common.get('name', '')

    # Excerpt. Priority: summary > description > #commonMetadata.description
    if 'summary' in frontmatter:
        metadata['excerpt'] = frontmatter['summary']
    elif 'description' in frontmatter:
        metadata['excerpt'] = frontmatter['description']
    elif common is not None:
        metadata['excerpt'] = common.get('description', '')

    # Featured image. Priority: image > cover.image > #commonMetadata.image
    cover = _nested_mapping(frontmatter, 'cover')
    cover_image = cover.get('image', '') if cover is not None else ''
    if 'image' in frontmatter:
        metadata['featured_image'] = frontmatter['image']
    elif cover_image:
        metadata['featured_image'] = cover_image
    elif common is not None and 'image' in common:
        # Also reached when `cover` exists but carries no image.
        metadata['featured_image'] = common['image']

    # Tags. Lists are copied because author tags are appended below and the
    # caller's front matter must stay untouched.
    tags = frontmatter.get('tags')
    if isinstance(tags, list):
        metadata['tags'] = list(tags)
    elif isinstance(tags, str):
        metadata['tags'] = [t.strip() for t in tags.split(',')]
    else:
        metadata['tags'] = []

    # Categories (optional)
    categories = frontmatter.get('categories')
    if isinstance(categories, list):
        metadata['categories'] = list(categories)
    elif isinstance(categories, str):
        metadata['categories'] = [c.strip() for c in categories.split(',')]

    # Authors, in priority order:
    # author > #staticSiteGenerator.author > #commonMetadata.creator
    author_sources = [
        _names_from_author_field(frontmatter.get('author')),
        _names_from_author_field(
            static_gen.get('author') if static_gen is not None else None),
        _names_from_creator_field(
            common.get('creator') if common is not None else None),
    ]

    # The post author is the first name of the highest-priority source that
    # provides one; otherwise the fallback author.
    metadata['author'] = next(
        (names[0] for names in author_sources if names), default_author)

    # Every author from every source becomes a tag. Checking against the
    # tags collected so far also removes duplicates across sources.
    for names in author_sources:
        for author_name in names:
            author_tag = format_author_as_tag(author_name)
            if author_tag and author_tag not in metadata['tags']:
                metadata['tags'].append(author_tag)

    # Status. Priority: status > creativeWorkStatus >
    # #commonMetadata.creativeWorkStatus
    if 'status' in frontmatter:
        metadata['status'] = frontmatter['status']
    else:
        if 'creativeWorkStatus' in frontmatter:
            # Present at top level when the "#commonMetadata:" line is a
            # comment in the source file.
            raw_status = frontmatter['creativeWorkStatus']
        elif common is not None:
            raw_status = common.get('creativeWorkStatus')
        else:
            raw_status = None
        status = _status_from_work_status(raw_status)
        if status is not None:
            metadata['status'] = status

    # Date. Priority: date > datePublished > #commonMetadata.datePublished
    # > #staticSiteGenerator.datePublished. Null values count as missing so
    # that an empty field does not become the string "None".
    date_candidates = [
        frontmatter.get('date'),
        frontmatter.get('datePublished'),
        common.get('datePublished') if common is not None else None,
        static_gen.get('datePublished') if static_gen is not None else None,
    ]
    for candidate in date_candidates:
        if candidate is not None:
            metadata['date'] = str(candidate)
            break

    return metadata
|
|
|
|
|
|
def parse_markdown_with_metadata(markdown_content: str,
                                 default_author: str = "admin") -> Dict[str, Any]:
    """
    Parse a Markdown document and collect everything needed for WordPress.

    Args:
        markdown_content: Complete Markdown text.
        default_author: Fallback author passed on to the metadata extraction.

    Returns:
        Dictionary with 'content' (the Markdown body) and 'metadata'
        (the extracted metadata, or an empty dict when the document has no
        usable front matter).
    """
    frontmatter, markdown_body = extract_frontmatter(markdown_content)

    # Missing or empty front matter yields empty metadata.
    if frontmatter:
        extracted = extract_wordpress_metadata(frontmatter, default_author)
    else:
        extracted = {}

    return {'content': markdown_body, 'metadata': extracted}
|
|
|
|
|
|
def get_base_url(url: str) -> str:
    """
    Return the base ("directory") URL of a full URL.

    Useful for resolving relative image paths.

    Args:
        url: Full URL, e.g. https://example.com/path/file.md

    Returns:
        Base URL ending in a slash (e.g. https://example.com/path/). Query
        string and fragment are dropped. A string without a host and
        without any slash is returned unchanged.
    """
    from urllib.parse import urlsplit, urlunsplit

    try:
        parts = urlsplit(url)
    except ValueError:
        # Malformed URL (e.g. a broken IPv6 literal): use plain string handling.
        parts = None

    if parts is not None and parts.netloc:
        # Cut only inside the path component. This keeps the "//" of the
        # scheme separator and any "/" in the query string from being
        # mistaken for the last path separator.
        directory = parts.path.rpartition('/')[0] + '/'
        return urlunsplit((parts.scheme, parts.netloc, directory, '', ''))

    # No host part: treat the value as a plain path.
    head, separator, _ = url.rpartition('/')
    return head + '/' if separator else url
|
|
|
|
|
|
def resolve_relative_image_url(image_path: str, base_url: str) -> str:
    """
    Resolve an image path against the base URL of the Markdown file.

    Args:
        image_path: Image path (relative, server-absolute, or a full URL).
        base_url: Base URL of the Markdown file. Either the directory URL
            (ending in a slash) or the URL of the file itself works.

    Returns:
        Absolute URL of the image.
    """
    from urllib.parse import urljoin

    # Already an absolute http(s) URL: hand it back untouched.
    if image_path.startswith(('http://', 'https://')):
        return image_path

    # urljoin implements standard URL reference resolution: it handles
    # server-absolute paths ("/img.png"), protocol-relative URLs
    # ("//cdn.example.com/img.png"), "./" and "../" segments, and base URLs
    # that do not end in a slash.
    return urljoin(base_url, image_path)
|