- Added path_filter parameter to fetch_forgejo_repo_markdown_files() - Filter supports wildcard patterns (e.g., 'posts/*/index.md') - Fixed hardcoded base URL - now detects git.rpi-virtuell.de and other instances - Added --filter command line option for repo mode - Updated help text with filter examples - Enables selective import of specific markdown files from repository Usage: python workflow.py --repo <URL> [branch] --filter 'Website/content/posts/*/index.md' Example: Imports only index.md files from posts subdirectories (59 files found)
544 lines
19 KiB
Python
544 lines
19 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
WordPress Import Workflow
|
|
Liest Markdown-Dateien aus URLs oder lokalen Dateien und erstellt WordPress-Beiträge.
|
|
Metadaten werden aus dem YAML-Frontmatter der Markdown-Dateien extrahiert.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import yaml
|
|
import requests
|
|
import markdown
|
|
from pathlib import Path
|
|
from dotenv import load_dotenv
|
|
from typing import Dict, Any, List, Optional
|
|
from wordpress_api import WordPressAPI
|
|
from markdown_parser import (
|
|
parse_markdown_with_metadata,
|
|
get_base_url,
|
|
resolve_relative_image_url
|
|
)
|
|
|
|
# Load environment variables via dotenv's load_dotenv (imported above)
|
|
load_dotenv()
|
|
|
|
|
|
def download_markdown(url: str) -> Optional[str]:
    """Fetch the Markdown document located at ``url``.

    Args:
        url: Address of the Markdown file.

    Returns:
        The response body as text, or ``None`` if the request failed
        (the failure is reported on stdout).
    """
    try:
        reply = requests.get(url, timeout=30)
        reply.raise_for_status()
        body = reply.text
    except requests.exceptions.RequestException as err:
        print(f"Fehler beim Herunterladen von {url}: {err}")
        return None
    return body
|
|
|
|
|
|
def read_local_markdown(file_path: str) -> Optional[str]:
    """Read a local Markdown file as UTF-8 text.

    Args:
        file_path: Path to the local Markdown file.

    Returns:
        The file content as a string, or ``None`` if the file cannot be
        opened, read, or decoded (the failure is reported on stdout).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    # UnicodeDecodeError is a ValueError, not an OSError: without catching it
    # a file that is not valid UTF-8 would raise instead of returning None.
    except (OSError, UnicodeDecodeError) as e:
        print(f"Fehler beim Lesen von {file_path}: {e}")
        return None
|
|
|
|
|
|
def markdown_to_html(markdown_text: str, extensions: Optional[List[str]] = None) -> str:
    """Render Markdown source as HTML.

    Args:
        markdown_text: Markdown source text.
        extensions: Names of the Markdown extensions to enable. When
            omitted, 'extra', 'codehilite' and 'toc' are used.

    Returns:
        The HTML string produced by ``markdown.markdown``.
    """
    active_extensions = ['extra', 'codehilite', 'toc'] if extensions is None else extensions
    return markdown.markdown(markdown_text, extensions=active_extensions)
|
|
|
|
|
|
def process_featured_image(wp_api: WordPressAPI, image_path: str,
                           check_duplicate: bool = True) -> Optional[int]:
    """Upload a featured image to WordPress.

    Args:
        wp_api: WordPress API client; its ``upload_media`` method is used.
        image_path: Local file path or http(s) URL of the image.
        check_duplicate: Forwarded to ``upload_media`` as ``check_duplicate``.

    Returns:
        Whatever ``upload_media`` returns (the media ID), or ``None`` if the
        image could not be downloaded, found, or uploaded.
    """
    # Function-scope imports keep this block self-contained.
    import tempfile
    from urllib.parse import urlsplit

    if image_path.startswith(('http://', 'https://')):
        try:
            response = requests.get(image_path, timeout=30)
            response.raise_for_status()

            # Use only the URL path for the file name so that neither the
            # query string nor a fragment ends up in it.
            filename = os.path.basename(urlsplit(image_path).path)
            if not filename:
                raise ValueError(f"Kein Dateiname in der URL: {image_path}")

            # A private temporary directory is portable (no hard-coded /tmp),
            # cannot collide with files of other runs, and is removed even
            # when the upload raises. The file keeps its original name.
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_path = os.path.join(temp_dir, filename)
                with open(temp_path, 'wb') as f:
                    f.write(response.content)
                return wp_api.upload_media(temp_path, check_duplicate=check_duplicate)

        # Deliberately broad: any download/upload problem must not abort
        # the import of the post itself.
        except Exception as e:
            print(f"Fehler beim Verarbeiten des Bilds von URL: {e}")
            return None

    # Local file
    if os.path.exists(image_path):
        return wp_api.upload_media(image_path, check_duplicate=check_duplicate)

    print(f"Bilddatei nicht gefunden: {image_path}")
    return None
|
|
|
|
|
|
def _as_name_list(value: Any) -> List[Any]:
    """Return a categories/tags value as a list of names."""
    if not value:
        return []
    if isinstance(value, str):
        # A scalar such as `tags: News` is ONE name; iterating the string
        # would create one term per character.
        return [value]
    return list(value)


def _format_publish_date(value: Any) -> Any:
    """Bring a publish date into the ISO 8601 string form sent to WordPress."""
    import datetime  # function-scope import keeps the block self-contained

    # PyYAML turns unquoted timestamps into date/datetime objects.
    # NOTE(review): whether parse_markdown_with_metadata passes such objects
    # through is not visible here; strings are handled exactly as before.
    if isinstance(value, datetime.datetime):
        return value.isoformat()
    if isinstance(value, datetime.date):
        value = value.isoformat()
    # Date only (YYYY-MM-DD): append a time part
    if isinstance(value, str) and len(value) == 10:
        return f"{value}T00:00:00"
    return value


def process_post(wp_api: WordPressAPI, post_config: Dict[str, Any],
                 global_settings: Dict[str, Any]) -> Optional[int]:
    """Create one WordPress post from a Markdown source.

    The Markdown is loaded from a URL or a local file, its frontmatter is
    parsed, and values found there take precedence over ``post_config``.

    Args:
        wp_api: WordPress API client.
        post_config: Post configuration. Needs 'url'/'markdown_url' or
            'file'/'markdown_file'; may also provide 'title', 'categories',
            'tags', 'featured_image', 'status', 'excerpt' and 'date' as
            fallbacks for missing frontmatter values.
        global_settings: Global settings ('default_author',
            'markdown_extensions', 'skip_duplicate_media', 'default_status',
            'skip_duplicates').

    Returns:
        The value returned by ``wp_api.create_post`` (the post ID), or
        ``None`` if the source is missing, cannot be loaded, or has no title.
    """
    # Determine the source: URL or local file
    source_url = post_config.get('url') or post_config.get('markdown_url')
    source_file = post_config.get('file') or post_config.get('markdown_file')

    if not source_url and not source_file:
        print("Fehler: Keine URL oder Datei in der Beitragskonfiguration")
        return None

    print(f"\n{'='*60}")
    if source_url:
        print(f"Verarbeite Markdown von URL: {source_url}")
    else:
        print(f"Verarbeite lokale Markdown-Datei: {source_file}")
    print(f"{'='*60}")

    # Fetch the Markdown content; base_url is used to resolve relative images
    if source_url:
        markdown_content = download_markdown(source_url)
        base_url = get_base_url(source_url)
    else:
        markdown_content = read_local_markdown(source_file)
        # For local files the containing directory serves as the base
        base_url = os.path.dirname(os.path.abspath(source_file)) + '/'

    if not markdown_content:
        print("Fehler: Konnte Markdown-Inhalt nicht laden")
        return None

    # Parse the Markdown and extract the metadata
    default_author = global_settings.get('default_author', 'admin')
    parsed = parse_markdown_with_metadata(markdown_content, default_author)
    metadata = parsed['metadata']
    markdown_body = parsed['content']

    # A title is mandatory
    title = metadata.get('title') or post_config.get('title')
    if not title:
        print("Fehler: Kein Titel gefunden (weder im Frontmatter noch in der Konfiguration)")
        return None

    print(f"Titel: {title}")

    # Convert Markdown to HTML
    extensions = global_settings.get('markdown_extensions', ['extra', 'codehilite', 'toc'])
    html_content = markdown_to_html(markdown_body, extensions)

    # Categories (priority: frontmatter > post_config)
    category_ids = []
    categories_list = _as_name_list(metadata.get('categories') or post_config.get('categories'))
    for cat_name in categories_list:
        cat_id = wp_api.get_or_create_category(cat_name)
        if cat_id:
            category_ids.append(cat_id)

    # Tags (priority: frontmatter > post_config)
    tag_ids = []
    tags_list = _as_name_list(metadata.get('tags') or post_config.get('tags'))
    if tags_list:
        print(f"Tags aus Frontmatter: {tags_list}")

    for tag_name in tags_list:
        tag_id = wp_api.get_or_create_tag(tag_name)
        if tag_id:
            tag_ids.append(tag_id)
            print(f" → Tag '{tag_name}' ID: {tag_id}")

    if tag_ids:
        print(f"Gesamt Tag-IDs: {tag_ids}")

    # Featured image
    featured_media_id = None
    featured_image = metadata.get('featured_image') or post_config.get('featured_image')
    if featured_image:
        # Resolve relative references against the source location
        if base_url and not featured_image.startswith('http'):
            featured_image = resolve_relative_image_url(featured_image, base_url)

        skip_duplicate_media = global_settings.get('skip_duplicate_media', True)
        featured_media_id = process_featured_image(
            wp_api,
            featured_image,
            check_duplicate=skip_duplicate_media
        )

    status = (metadata.get('status') or post_config.get('status')
              or global_settings.get('default_status', 'publish'))
    excerpt = metadata.get('excerpt') or post_config.get('excerpt', '')

    # NOTE(review): the original resolved an author name here (frontmatter >
    # post_config > default_author) but never passed it on to create_post;
    # the unused local was dropped. Confirm whether the author should be sent.

    # Publish date; WordPress expects ISO 8601, e.g. 2025-09-02T12:00:00
    publish_date = metadata.get('date') or post_config.get('date')

    # Additional WordPress fields
    extra_fields = {}
    if publish_date:
        publish_date = _format_publish_date(publish_date)

        # 'date' and 'date_gmt' are both set to the same value (no time zone
        # conversion) for explicit control over the stored date.
        extra_fields['date'] = publish_date
        extra_fields['date_gmt'] = publish_date

        print(f"Veröffentlichungsdatum: {publish_date}")
        if status == 'draft':
            print(" (Hinweis: Datum wird erst beim Veröffentlichen aktiv)")

    # Create the post
    skip_duplicates = global_settings.get('skip_duplicates', True)
    post_id = wp_api.create_post(
        title=title,
        content=html_content,
        status=status,
        featured_media=featured_media_id,
        categories=category_ids if category_ids else None,
        tags=tag_ids if tag_ids else None,
        excerpt=excerpt,
        check_duplicate=skip_duplicates,
        **extra_fields  # date and other fields
    )

    return post_id
|
|
|
|
|
|
def fetch_forgejo_repo_markdown_files(repo_url: str, branch: str = 'main',
                                      path_filter: Optional[str] = None) -> List[str]:
    """Collect the raw URLs of the Markdown files in a Forgejo/Gitea repository.

    The file list is read from the API endpoint
    ``{host}/api/v1/repos/{owner}/{repo}/git/trees/{branch}?recursive=true``.

    Args:
        repo_url: Repository URL (e.g. https://codeberg.org/user/repo).
        branch: Branch name (default: main).
        path_filter: Optional wildcard filter for repository paths, e.g.
            'posts/*/index.md'. ``*`` matches one or more characters within
            a single path segment; every other character is literal.

    Returns:
        List of raw-file URLs for all matching ``.md`` files. Empty if the
        URL is invalid or the request fails.
    """
    # Function-scope imports keep this block self-contained.
    import re
    from urllib.parse import quote, urlsplit

    # Parse the URL; a missing scheme (e.g. 'codeberg.org/user/repo') is
    # tolerated by assuming https.
    cleaned_url = repo_url.strip().rstrip('/')
    try:
        split_url = urlsplit(cleaned_url if '://' in cleaned_url else f'https://{cleaned_url}')
    except ValueError:
        split_url = None
    segments = [s for s in split_url.path.split('/') if s] if split_url else []

    # Both an owner and a repository segment are required; otherwise the
    # host name would be mistaken for the owner or repository.
    if split_url is None or not split_url.netloc or len(segments) < 2:
        print(f"Fehler: Ungültige Repository-URL: {repo_url}")
        return []

    owner, repo = segments[-2], segments[-1]

    # Base URL (also used for the raw URLs)
    if 'codeberg.org' in repo_url:
        base_url = 'https://codeberg.org'
    elif 'git.rpi-virtuell.de' in repo_url:
        base_url = 'https://git.rpi-virtuell.de'
    else:
        # Generic approach for self-hosted instances
        if 'gitea' not in repo_url and 'forgejo' not in repo_url:
            print("Warnung: Unbekannte Forgejo-Instanz, versuche generischen API-Pfad")
        base_url = f"{split_url.scheme}://{split_url.netloc}"
    api_base = base_url + '/api/v1'

    api_url = f"{api_base}/repos/{owner}/{repo}/git/trees/{branch}?recursive=true"

    # Compile the filter once. Literal parts are escaped so characters such
    # as '+', '(' or '[' in a path cannot act as regex syntax;
    # e.g. 'posts/*/index.md' -> 'posts/[^/]+/index\.md'
    path_regex = None
    if path_filter:
        path_regex = re.compile(
            '[^/]+'.join(re.escape(part) for part in path_filter.split('*'))
        )

    try:
        response = requests.get(api_url, timeout=30)
        response.raise_for_status()
        data = response.json()
    # ValueError covers a non-JSON body on requests versions whose JSON
    # decode error is not a RequestException.
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Fehler beim Abrufen der Repository-Dateien: {e}")
        return []

    if not isinstance(data, dict):
        return []

    if data.get('truncated'):
        # The API marks an incomplete tree listing with 'truncated'.
        print("Warnung: Dateiliste wurde vom Server gekürzt, es fehlen möglicherweise Dateien")

    markdown_files = []
    for item in data.get('tree', []):
        path = item.get('path') or ''
        if item.get('type') != 'blob' or not path.endswith('.md'):
            continue
        if path_regex is not None and not path_regex.fullmatch(path):
            continue

        # Percent-encode the path so spaces, '#', '?' etc. survive in the URL
        raw_url = f"{base_url}/{owner}/{repo}/raw/branch/{branch}/{quote(path)}"
        markdown_files.append(raw_url)

    return markdown_files
|
|
|
|
|
|
def _default_global_settings() -> Dict[str, Any]:
    """Return the settings used by the modes that run without a YAML file."""
    return {
        'default_status': 'publish',
        'default_author': 'admin',
        'skip_duplicates': True,
        'skip_duplicate_media': True
    }


def _wp_api_from_env(show_details: bool = False) -> "WordPressAPI":
    """Build the WordPress client from WORDPRESS_* variables; exit(1) if one is missing."""
    wp_url = os.getenv('WORDPRESS_URL')
    wp_username = os.getenv('WORDPRESS_USERNAME')
    wp_password = os.getenv('WORDPRESS_APP_PASSWORD')

    if not all([wp_url, wp_username, wp_password]):
        print("Fehler: WordPress-Credentials fehlen in .env-Datei")
        if show_details:
            print("Benötigt: WORDPRESS_URL, WORDPRESS_USERNAME, WORDPRESS_APP_PASSWORD")
        sys.exit(1)

    if show_details:
        print(f"\nVerbinde mit WordPress: {wp_url}")

    return WordPressAPI(wp_url, wp_username, wp_password)


def _process_batch(wp_api: Any, post_configs: List[Any], global_settings: Dict[str, Any],
                   name_url_in_errors: bool = False) -> None:
    """Run process_post for every entry and print the success/error summary."""
    success_count = 0
    error_count = 0

    for post_config in post_configs:
        try:
            if process_post(wp_api, post_config, global_settings):
                success_count += 1
            else:
                error_count += 1
        # Deliberately broad: one failing post must not stop the batch.
        except Exception as e:
            if name_url_in_errors:
                print(f"Fehler bei der Verarbeitung von {post_config['url']}: {e}")
            else:
                print(f"Fehler bei der Verarbeitung: {e}")
            error_count += 1

    print(f"\n{'='*60}")
    print("ZUSAMMENFASSUNG")
    print(f"{'='*60}")
    print(f"Erfolgreich: {success_count}")
    print(f"Fehler: {error_count}")
    print(f"Gesamt: {len(post_configs)}")
    print(f"{'='*60}\n")


def main():
    """Entry point of the workflow.

    Three modes, selected from ``sys.argv``:

    * first argument is an http(s) URL: import that single Markdown file;
    * ``--repo``/``--forgejo-repo <URL> [branch] [--filter <pattern>]``:
      import the Markdown files of a Forgejo repository;
    * otherwise: treat the first argument (default ``posts.yaml``) as a
      YAML configuration file with ``settings`` and ``posts``.

    Exits with status 1 on missing credentials, a missing or invalid
    configuration file, an empty repository result, or a failed single import.
    """
    config_file = 'posts.yaml'

    # Handle the command line arguments
    if len(sys.argv) > 1:
        arg = sys.argv[1]

        # Direct URL mode
        if arg.startswith(('http://', 'https://')):
            print(f"Direkt-Modus: Verarbeite URL: {arg}")

            wp_api = _wp_api_from_env()

            # Minimal configuration
            post_config = {'url': arg}
            global_settings = _default_global_settings()

            try:
                post_id = process_post(wp_api, post_config, global_settings)
                if post_id:
                    print(f"\n✅ Erfolgreich: Beitrag erstellt (ID: {post_id})")
                else:
                    print("\n❌ Fehler beim Erstellen des Beitrags")
                    sys.exit(1)
            except Exception as e:
                print(f"Fehler: {e}")
                sys.exit(1)

            return

        # Forgejo repository mode
        if '--forgejo-repo' in sys.argv or '--repo' in sys.argv:
            repo_index = (sys.argv.index('--forgejo-repo') if '--forgejo-repo' in sys.argv
                          else sys.argv.index('--repo'))
            if len(sys.argv) > repo_index + 1:
                repo_url = sys.argv[repo_index + 1]
                # The branch is optional; a following option is not a branch
                has_branch = (len(sys.argv) > repo_index + 2
                              and not sys.argv[repo_index + 2].startswith('--'))
                branch = sys.argv[repo_index + 2] if has_branch else 'main'

                # Path filter for specific files (e.g. only index.md in posts/)
                path_filter = None
                if '--filter' in sys.argv:
                    filter_index = sys.argv.index('--filter')
                    if len(sys.argv) > filter_index + 1:
                        path_filter = sys.argv[filter_index + 1]

                print(f"Forgejo-Modus: Verarbeite Repository: {repo_url}")
                print(f"Branch: {branch}")
                if path_filter:
                    print(f"Filter: {path_filter}")

                markdown_urls = fetch_forgejo_repo_markdown_files(repo_url, branch, path_filter)

                if not markdown_urls:
                    print("Keine Markdown-Dateien im Repository gefunden")
                    sys.exit(1)

                print(f"\nGefundene Markdown-Dateien: {len(markdown_urls)}")

                wp_api = _wp_api_from_env()
                _process_batch(
                    wp_api,
                    [{'url': url} for url in markdown_urls],
                    _default_global_settings(),
                    name_url_in_errors=True,
                )
                return

        # Otherwise treat the argument as a configuration file
        config_file = arg

    # Configuration file mode
    if not os.path.exists(config_file):
        print(f"Fehler: Konfigurationsdatei '{config_file}' nicht gefunden")
        print("\nVerwendung:")
        print(" python workflow.py [config.yaml] # YAML-Konfiguration")
        print(" python workflow.py <URL> # Einzelne Markdown-URL")
        print(" python workflow.py --repo <REPO_URL> [branch] # Forgejo-Repository (alle .md)")
        print(" python workflow.py --repo <REPO_URL> [branch] --filter <pattern> # Mit Filter")
        print("\nFilter-Beispiele:")
        print(" --filter 'posts/*/index.md' # Nur index.md in posts/-Unterverzeichnissen")
        print(" --filter 'Website/content/posts/*/index.md' # Mit vollständigem Pfad")
        sys.exit(1)

    print(f"Lade Konfiguration aus: {config_file}")

    with open(config_file, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty file; treat that as "no config"
        config = yaml.safe_load(f) or {}

    if not isinstance(config, dict):
        print(f"Fehler: Konfigurationsdatei '{config_file}' enthält kein gültiges Mapping")
        sys.exit(1)

    # WordPress credentials come from the environment
    wp_api = _wp_api_from_env(show_details=True)

    # `settings:` / `posts:` may be present but null in the YAML
    global_settings = config.get('settings') or {}
    posts = config.get('posts') or []
    if not posts:
        print("Warnung: Keine Beiträge in der Konfiguration gefunden")
        return

    print(f"\nVerarbeite {len(posts)} Beitrag/Beiträge...\n")

    _process_batch(wp_api, posts, global_settings)
|
|
|
|
|
|
# Run the workflow only when this file is executed as a script.
if __name__ == "__main__":
    main()
|