From 86717185c410ea1876486eeeb6afddb24ecc180f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Lohrer?= Date: Wed, 5 Nov 2025 06:11:15 +0100 Subject: [PATCH] Feature: Add path filter for Forgejo repository imports - Added path_filter parameter to fetch_forgejo_repo_markdown_files() - Filter supports wildcard patterns (e.g., 'posts/*/index.md') - Fixed hardcoded base URL - now detects git.rpi-virtuell.de and other instances - Added --filter command line option for repo mode - Updated help text with filter examples - Enables selective import of specific markdown files from repository Usage: python workflow.py --repo <repo-url> [branch] --filter 'Website/content/posts/*/index.md' Example: Imports only index.md files from posts subdirectories (59 files found) --- workflow.py | 50 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/workflow.py b/workflow.py index 8dc7fa4..2735734 100644 --- a/workflow.py +++ b/workflow.py @@ -279,13 +279,15 @@ def process_post(wp_api: WordPressAPI, post_config: Dict[str, Any], return post_id -def fetch_forgejo_repo_markdown_files(repo_url: str, branch: str = 'main') -> List[str]: +def fetch_forgejo_repo_markdown_files(repo_url: str, branch: str = 'main', + path_filter: str = None) -> List[str]: """ Holt alle Markdown-URLs aus einem Forgejo-Repository Args: repo_url: URL zum Repository (z.B. https://codeberg.org/user/repo) branch: Branch-Name (Standard: main) + path_filter: Optionaler Filter für Pfade (z.B. 
'posts/*/index.md') Returns: Liste von URLs zu Markdown-Dateien @@ -302,17 +304,23 @@ def fetch_forgejo_repo_markdown_files(repo_url: str, branch: str = 'main') -> Li owner = parts[-2] repo = parts[-1] - # API-URL ermitteln + # Basis-URL ermitteln (für Raw-URLs) if 'codeberg.org' in repo_url: + base_url = 'https://codeberg.org' api_base = 'https://codeberg.org/api/v1' + elif 'git.rpi-virtuell.de' in repo_url: + base_url = 'https://git.rpi-virtuell.de' + api_base = 'https://git.rpi-virtuell.de/api/v1' elif 'gitea' in repo_url or 'forgejo' in repo_url: # Generischer Ansatz für selbst-gehostete Instanzen base_parts = repo_url.split('/')[:3] - api_base = '/'.join(base_parts) + '/api/v1' + base_url = '/'.join(base_parts) + api_base = base_url + '/api/v1' else: print(f"Warnung: Unbekannte Forgejo-Instanz, versuche generischen API-Pfad") base_parts = repo_url.split('/')[:3] - api_base = '/'.join(base_parts) + '/api/v1' + base_url = '/'.join(base_parts) + api_base = base_url + '/api/v1' api_url = f"{api_base}/repos/{owner}/{repo}/git/trees/{branch}?recursive=true" @@ -324,8 +332,17 @@ def fetch_forgejo_repo_markdown_files(repo_url: str, branch: str = 'main') -> Li markdown_files = [] for item in data.get('tree', []): if item['type'] == 'blob' and item['path'].endswith('.md'): + # Filter anwenden falls vorhanden + if path_filter: + # Konvertiere Wildcard-Filter zu regex-Pattern + # z.B. 
'posts/*/index.md' -> '^posts/[^/]+/index\.md$' + import re + pattern = path_filter.replace('*', '[^/]+').replace('.', r'\.') + if not re.match(f'^{pattern}$', item['path']): + continue + # Raw-URL konstruieren - raw_url = f"https://codeberg.org/{owner}/{repo}/raw/branch/{branch}/{item['path']}" + raw_url = f"{base_url}/{owner}/{repo}/raw/branch/{branch}/{item['path']}" markdown_files.append(raw_url) return markdown_files @@ -385,12 +402,21 @@ def main(): repo_index = sys.argv.index('--forgejo-repo') if '--forgejo-repo' in sys.argv else sys.argv.index('--repo') if len(sys.argv) > repo_index + 1: repo_url = sys.argv[repo_index + 1] - branch = sys.argv[repo_index + 2] if len(sys.argv) > repo_index + 2 else 'main' + branch = sys.argv[repo_index + 2] if len(sys.argv) > repo_index + 2 and not sys.argv[repo_index + 2].startswith('--') else 'main' + + # Pfad-Filter für spezifische Dateien (z.B. nur index.md in posts/) + path_filter = None + if '--filter' in sys.argv: + filter_index = sys.argv.index('--filter') + if len(sys.argv) > filter_index + 1: + path_filter = sys.argv[filter_index + 1] print(f"Forgejo-Modus: Verarbeite Repository: {repo_url}") print(f"Branch: {branch}") + if path_filter: + print(f"Filter: {path_filter}") - markdown_urls = fetch_forgejo_repo_markdown_files(repo_url, branch) + markdown_urls = fetch_forgejo_repo_markdown_files(repo_url, branch, path_filter) if not markdown_urls: print("Keine Markdown-Dateien im Repository gefunden") @@ -450,9 +476,13 @@ def main(): if not os.path.exists(config_file): print(f"Fehler: Konfigurationsdatei '{config_file}' nicht gefunden") print("\nVerwendung:") - print(" python workflow.py [config.yaml] # YAML-Konfiguration") - print(" python workflow.py # Einzelne Markdown-URL") - print(" python workflow.py --repo [branch] # Forgejo-Repository") + print(" python workflow.py [config.yaml] # YAML-Konfiguration") + print(" python workflow.py # Einzelne Markdown-URL") + print(" python workflow.py --repo [branch] # 
Forgejo-Repository (alle .md)") + print(" python workflow.py --repo [branch] --filter # Mit Filter") + print("\nFilter-Beispiele:") + print(" --filter 'posts/*/index.md' # Nur index.md in posts/-Unterverzeichnissen") + print(" --filter 'Website/content/posts/*/index.md' # Mit vollständigem Pfad") sys.exit(1) print(f"Lade Konfiguration aus: {config_file}")