Feature: Automatische Metadaten-Extraktion aus Frontmatter
- Neuer markdown_parser.py mit YAML-Frontmatter Extraktion
- Unterstützung für drei Modi: Einzelne URL, YAML-Batch, Forgejo-Repo
- Metadaten (name, description, tags, image, author) aus Frontmatter
- Schema.org-Support für commonMetadata
- Vereinfachte posts.yaml (nur URLs statt vollständiger Metadaten)
- Aktualisierte Dokumentation (README.md, QUICKSTART.md)
- Beispiel-Beitrag mit vollständigem Frontmatter
This commit is contained in:
parent
e3b19bb0df
commit
7a234be652
6 changed files with 880 additions and 180 deletions
275
workflow.py
275
workflow.py
|
|
@ -1,8 +1,8 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
WordPress Import Workflow
|
||||
Liest Markdown-Dateien aus URLs oder lokalen Dateien und erstellt WordPress-Beiträge
|
||||
basierend auf einer YAML-Konfigurationsdatei.
|
||||
Liest Markdown-Dateien aus URLs oder lokalen Dateien und erstellt WordPress-Beiträge.
|
||||
Metadaten werden aus dem YAML-Frontmatter der Markdown-Dateien extrahiert.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
|
@ -14,6 +14,11 @@ from pathlib import Path
|
|||
from dotenv import load_dotenv
|
||||
from typing import Dict, Any, List, Optional
|
||||
from wordpress_api import WordPressAPI
|
||||
from markdown_parser import (
|
||||
parse_markdown_with_metadata,
|
||||
get_base_url,
|
||||
resolve_relative_image_url
|
||||
)
|
||||
|
||||
# Lade Umgebungsvariablen
|
||||
load_dotenv()
|
||||
|
|
@ -122,75 +127,110 @@ def process_featured_image(wp_api: WordPressAPI, image_path: str,
|
|||
def process_post(wp_api: WordPressAPI, post_config: Dict[str, Any],
|
||||
global_settings: Dict[str, Any]) -> Optional[int]:
|
||||
"""
|
||||
Verarbeitet einen einzelnen Beitrag aus der Konfiguration
|
||||
Verarbeitet einen einzelnen Beitrag aus der Konfiguration.
|
||||
Metadaten werden aus dem YAML-Frontmatter extrahiert.
|
||||
|
||||
Args:
|
||||
wp_api: WordPress API Client
|
||||
post_config: Beitrags-Konfiguration
|
||||
post_config: Beitrags-Konfiguration (kann nur URL enthalten)
|
||||
global_settings: Globale Einstellungen
|
||||
|
||||
Returns:
|
||||
Post-ID oder None
|
||||
"""
|
||||
title = post_config.get('title')
|
||||
if not title:
|
||||
print("Fehler: Titel fehlt in der Beitragskonfiguration")
|
||||
# URL oder Datei ermitteln
|
||||
source_url = post_config.get('url') or post_config.get('markdown_url')
|
||||
source_file = post_config.get('file') or post_config.get('markdown_file')
|
||||
|
||||
if not source_url and not source_file:
|
||||
print("Fehler: Keine URL oder Datei in der Beitragskonfiguration")
|
||||
return None
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Verarbeite Beitrag: {title}")
|
||||
if source_url:
|
||||
print(f"Verarbeite Markdown von URL: {source_url}")
|
||||
else:
|
||||
print(f"Verarbeite lokale Markdown-Datei: {source_file}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
# Markdown-Inhalt abrufen
|
||||
markdown_content = None
|
||||
if 'markdown_url' in post_config:
|
||||
print(f"Lade Markdown von URL: {post_config['markdown_url']}")
|
||||
markdown_content = download_markdown(post_config['markdown_url'])
|
||||
elif 'markdown_file' in post_config:
|
||||
print(f"Lese lokale Markdown-Datei: {post_config['markdown_file']}")
|
||||
markdown_content = read_local_markdown(post_config['markdown_file'])
|
||||
elif 'content' in post_config:
|
||||
markdown_content = post_config['content']
|
||||
base_url = None
|
||||
|
||||
if source_url:
|
||||
markdown_content = download_markdown(source_url)
|
||||
base_url = get_base_url(source_url)
|
||||
elif source_file:
|
||||
markdown_content = read_local_markdown(source_file)
|
||||
# Bei lokalen Dateien nehmen wir das Verzeichnis als Basis
|
||||
base_url = os.path.dirname(os.path.abspath(source_file)) + '/'
|
||||
|
||||
if not markdown_content:
|
||||
print(f"Fehler: Kein Inhalt für Beitrag '{title}'")
|
||||
print(f"Fehler: Konnte Markdown-Inhalt nicht laden")
|
||||
return None
|
||||
|
||||
# Markdown parsen und Metadaten extrahieren
|
||||
default_author = global_settings.get('default_author', 'admin')
|
||||
parsed = parse_markdown_with_metadata(markdown_content, default_author)
|
||||
|
||||
metadata = parsed['metadata']
|
||||
markdown_body = parsed['content']
|
||||
|
||||
# Titel prüfen
|
||||
title = metadata.get('title') or post_config.get('title')
|
||||
if not title:
|
||||
print("Fehler: Kein Titel gefunden (weder im Frontmatter noch in der Konfiguration)")
|
||||
return None
|
||||
|
||||
print(f"Titel: {title}")
|
||||
|
||||
# Markdown zu HTML konvertieren
|
||||
extensions = global_settings.get('markdown_extensions', ['extra', 'codehilite', 'toc'])
|
||||
html_content = markdown_to_html(markdown_content, extensions)
|
||||
html_content = markdown_to_html(markdown_body, extensions)
|
||||
|
||||
# Kategorien verarbeiten
|
||||
# Priorität: Frontmatter > post_config > global_settings
|
||||
category_ids = []
|
||||
if 'categories' in post_config:
|
||||
for cat_name in post_config['categories']:
|
||||
cat_id = wp_api.get_or_create_category(cat_name)
|
||||
if cat_id:
|
||||
category_ids.append(cat_id)
|
||||
categories_list = metadata.get('categories') or post_config.get('categories') or []
|
||||
|
||||
for cat_name in categories_list:
|
||||
cat_id = wp_api.get_or_create_category(cat_name)
|
||||
if cat_id:
|
||||
category_ids.append(cat_id)
|
||||
|
||||
# Tags verarbeiten
|
||||
tag_ids = []
|
||||
if 'tags' in post_config:
|
||||
for tag_name in post_config['tags']:
|
||||
tag_id = wp_api.get_or_create_tag(tag_name)
|
||||
if tag_id:
|
||||
tag_ids.append(tag_id)
|
||||
tags_list = metadata.get('tags') or post_config.get('tags') or []
|
||||
|
||||
for tag_name in tags_list:
|
||||
tag_id = wp_api.get_or_create_tag(tag_name)
|
||||
if tag_id:
|
||||
tag_ids.append(tag_id)
|
||||
|
||||
# Beitragsbild verarbeiten
|
||||
featured_media_id = None
|
||||
if 'featured_image' in post_config:
|
||||
featured_image = metadata.get('featured_image') or post_config.get('featured_image')
|
||||
|
||||
if featured_image:
|
||||
# Relative URLs auflösen
|
||||
if base_url and not featured_image.startswith('http'):
|
||||
featured_image = resolve_relative_image_url(featured_image, base_url)
|
||||
|
||||
skip_duplicate_media = global_settings.get('skip_duplicate_media', True)
|
||||
featured_media_id = process_featured_image(
|
||||
wp_api,
|
||||
post_config['featured_image'],
|
||||
featured_image,
|
||||
check_duplicate=skip_duplicate_media
|
||||
)
|
||||
|
||||
# Status
|
||||
status = post_config.get('status', global_settings.get('default_status', 'draft'))
|
||||
status = metadata.get('status') or post_config.get('status') or global_settings.get('default_status', 'draft')
|
||||
|
||||
# Excerpt
|
||||
excerpt = post_config.get('excerpt', '')
|
||||
excerpt = metadata.get('excerpt') or post_config.get('excerpt', '')
|
||||
|
||||
# Autor
|
||||
author_name = metadata.get('author') or post_config.get('author') or default_author
|
||||
|
||||
# Beitrag erstellen
|
||||
skip_duplicates = global_settings.get('skip_duplicates', True)
|
||||
|
|
@ -208,15 +248,180 @@ def process_post(wp_api: WordPressAPI, post_config: Dict[str, Any],
|
|||
return post_id
|
||||
|
||||
|
||||
def _split_forgejo_repo_url(repo_url: str):
    """Split a repository URL into ``(site_base, owner, repo)``.

    Returns None when the URL has no host or fewer than two path segments,
    i.e. when no owner/repository pair can be read from it.
    """
    from urllib.parse import urlsplit

    cleaned = repo_url.strip().rstrip('/')
    # Accept scheme-less input such as "codeberg.org/user/repo".
    if '://' not in cleaned:
        cleaned = 'https://' + cleaned

    url = urlsplit(cleaned)
    segments = [segment for segment in url.path.split('/') if segment]
    if not url.hostname or len(segments) < 2:
        return None

    owner, repo = segments[-2], segments[-1]
    # Clone URLs end in ".git"; the API and raw routes use the bare name.
    if repo.endswith('.git'):
        repo = repo[:-len('.git')]
    if not repo:
        return None

    if url.hostname == 'codeberg.org':
        site_base = 'https://codeberg.org'
    else:
        # Self-hosted instance: keep scheme, host and port as given.
        site_base = f"{url.scheme}://{url.netloc}"

    return site_base, owner, repo


def fetch_forgejo_repo_markdown_files(repo_url: str, branch: str = 'main') -> List[str]:
    """
    Collect the raw URLs of all Markdown files in a Forgejo/Gitea repository.

    The file list is read from the instance's git-trees API endpoint
    (``/api/v1/repos/{owner}/{repo}/git/trees/{branch}?recursive=true``) and
    every blob whose path ends in ``.md`` is turned into a raw-file URL on
    the same host the repository lives on.

    Args:
        repo_url: URL of the repository (e.g. https://codeberg.org/user/repo)
        branch: Branch name (default: main)

    Returns:
        List of raw URLs to Markdown files. Empty if the URL is invalid,
        the request fails, or the response cannot be decoded.
    """
    from urllib.parse import quote

    location = _split_forgejo_repo_url(repo_url)
    if location is None:
        print(f"Fehler: Ungültige Repository-URL: {repo_url}")
        return []
    site_base, owner, repo = location

    known_markers = ('codeberg.org', 'gitea', 'forgejo')
    if not any(marker in site_base.lower() for marker in known_markers):
        print("Warnung: Unbekannte Forgejo-Instanz, versuche generischen API-Pfad")

    api_url = f"{site_base}/api/v1/repos/{owner}/{repo}/git/trees/{branch}?recursive=true"

    try:
        response = requests.get(api_url, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"Fehler beim Abrufen der Repository-Dateien: {e}")
        return []

    if not isinstance(data, dict):
        print("Fehler beim Abrufen der Repository-Dateien: Unerwartetes Antwortformat")
        return []

    # NOTE(review): the API reports a `truncated` flag for large trees; only
    # the first page is read here, so files may be missing in that case.
    if data.get('truncated'):
        print("Warnung: Dateiliste wurde vom Server gekürzt, möglicherweise fehlen Markdown-Dateien")

    markdown_files = []
    for item in data.get('tree') or []:
        path = item.get('path', '')
        if item.get('type') == 'blob' and path.endswith('.md'):
            # Build the raw URL on the repository's own host and
            # percent-encode the path so spaces, '#', '?' etc. survive.
            raw_url = f"{site_base}/{owner}/{repo}/raw/branch/{branch}/{quote(path)}"
            markdown_files.append(raw_url)

    return markdown_files
|
||||
|
||||
|
||||
def main():
|
||||
"""Hauptfunktion des Workflows"""
|
||||
|
||||
# Konfigurationsdatei laden
|
||||
config_file = sys.argv[1] if len(sys.argv) > 1 else 'posts.yaml'
|
||||
# Kommandozeilen-Argumente verarbeiten
|
||||
if len(sys.argv) > 1:
|
||||
arg = sys.argv[1]
|
||||
|
||||
# Prüfe ob es eine direkte URL ist
|
||||
if arg.startswith('http://') or arg.startswith('https://'):
|
||||
# Direkter URL-Modus
|
||||
print(f"Direkt-Modus: Verarbeite URL: {arg}")
|
||||
|
||||
# WordPress-Credentials
|
||||
wp_url = os.getenv('WORDPRESS_URL')
|
||||
wp_username = os.getenv('WORDPRESS_USERNAME')
|
||||
wp_password = os.getenv('WORDPRESS_APP_PASSWORD')
|
||||
|
||||
if not all([wp_url, wp_username, wp_password]):
|
||||
print("Fehler: WordPress-Credentials fehlen in .env-Datei")
|
||||
sys.exit(1)
|
||||
|
||||
wp_api = WordPressAPI(wp_url, wp_username, wp_password)
|
||||
|
||||
# Erstelle minimale Konfiguration
|
||||
post_config = {'url': arg}
|
||||
global_settings = {
|
||||
'default_status': 'draft',
|
||||
'default_author': 'admin',
|
||||
'skip_duplicates': True,
|
||||
'skip_duplicate_media': True
|
||||
}
|
||||
|
||||
try:
|
||||
post_id = process_post(wp_api, post_config, global_settings)
|
||||
if post_id:
|
||||
print(f"\n✅ Erfolgreich: Beitrag erstellt (ID: {post_id})")
|
||||
else:
|
||||
print(f"\n❌ Fehler beim Erstellen des Beitrags")
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
print(f"Fehler: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
return
|
||||
|
||||
# Prüfe ob es eine Forgejo-Repo-URL ist
|
||||
elif '--forgejo-repo' in sys.argv or '--repo' in sys.argv:
|
||||
repo_index = sys.argv.index('--forgejo-repo') if '--forgejo-repo' in sys.argv else sys.argv.index('--repo')
|
||||
if len(sys.argv) > repo_index + 1:
|
||||
repo_url = sys.argv[repo_index + 1]
|
||||
branch = sys.argv[repo_index + 2] if len(sys.argv) > repo_index + 2 else 'main'
|
||||
|
||||
print(f"Forgejo-Modus: Verarbeite Repository: {repo_url}")
|
||||
print(f"Branch: {branch}")
|
||||
|
||||
markdown_urls = fetch_forgejo_repo_markdown_files(repo_url, branch)
|
||||
|
||||
if not markdown_urls:
|
||||
print("Keine Markdown-Dateien im Repository gefunden")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"\nGefundene Markdown-Dateien: {len(markdown_urls)}")
|
||||
|
||||
# WordPress-Credentials
|
||||
wp_url = os.getenv('WORDPRESS_URL')
|
||||
wp_username = os.getenv('WORDPRESS_USERNAME')
|
||||
wp_password = os.getenv('WORDPRESS_APP_PASSWORD')
|
||||
|
||||
if not all([wp_url, wp_username, wp_password]):
|
||||
print("Fehler: WordPress-Credentials fehlen in .env-Datei")
|
||||
sys.exit(1)
|
||||
|
||||
wp_api = WordPressAPI(wp_url, wp_username, wp_password)
|
||||
|
||||
global_settings = {
|
||||
'default_status': 'draft',
|
||||
'default_author': 'admin',
|
||||
'skip_duplicates': True,
|
||||
'skip_duplicate_media': True
|
||||
}
|
||||
|
||||
success_count = 0
|
||||
error_count = 0
|
||||
|
||||
for url in markdown_urls:
|
||||
try:
|
||||
post_config = {'url': url}
|
||||
post_id = process_post(wp_api, post_config, global_settings)
|
||||
if post_id:
|
||||
success_count += 1
|
||||
else:
|
||||
error_count += 1
|
||||
except Exception as e:
|
||||
print(f"Fehler bei der Verarbeitung von {url}: {e}")
|
||||
error_count += 1
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"ZUSAMMENFASSUNG")
|
||||
print(f"{'='*60}")
|
||||
print(f"Erfolgreich: {success_count}")
|
||||
print(f"Fehler: {error_count}")
|
||||
print(f"Gesamt: {len(markdown_urls)}")
|
||||
print(f"{'='*60}\n")
|
||||
|
||||
return
|
||||
|
||||
# Sonst als Konfigurationsdatei behandeln
|
||||
config_file = arg
|
||||
else:
|
||||
config_file = 'posts.yaml'
|
||||
|
||||
# Konfigurationsdatei-Modus
|
||||
if not os.path.exists(config_file):
|
||||
print(f"Fehler: Konfigurationsdatei '{config_file}' nicht gefunden")
|
||||
print("Verwendung: python workflow.py [config.yaml]")
|
||||
print("\nVerwendung:")
|
||||
print(" python workflow.py [config.yaml] # YAML-Konfiguration")
|
||||
print(" python workflow.py <URL> # Einzelne Markdown-URL")
|
||||
print(" python workflow.py --repo <REPO_URL> [branch] # Forgejo-Repository")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"Lade Konfiguration aus: {config_file}")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue