Feature: Add path filter for Forgejo repository imports

- Added path_filter parameter to fetch_forgejo_repo_markdown_files()
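- Filter supports '*' wildcards matched against the full repository path; each '*' matches one or more characters within a single path segment, never '/' (e.g., 'posts/*/index.md')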
- Fixed hardcoded codeberg.org base URL in raw file URLs - the base URL is now derived from the repository URL (detects git.rpi-virtuell.de and other instances)
- Added --filter command line option for repo mode
- Updated help text with filter examples
- Enables selective import of specific markdown files from repository

Usage: python workflow.py --repo <URL> [branch] --filter 'Website/content/posts/*/index.md'
Example: Imports only index.md files from posts subdirectories (59 files found)
This commit is contained in:
Jörg Lohrer 2025-11-05 06:11:15 +01:00
parent 98d7244e9d
commit 86717185c4

View file

@ -279,13 +279,15 @@ def process_post(wp_api: WordPressAPI, post_config: Dict[str, Any],
return post_id
def fetch_forgejo_repo_markdown_files(repo_url: str, branch: str = 'main') -> List[str]:
def fetch_forgejo_repo_markdown_files(repo_url: str, branch: str = 'main',
path_filter: str = None) -> List[str]:
"""
Holt alle Markdown-URLs aus einem Forgejo-Repository
Args:
repo_url: URL zum Repository (z.B. https://codeberg.org/user/repo)
branch: Branch-Name (Standard: main)
path_filter: Optionaler Filter für Pfade (z.B. 'posts/*/index.md')
Returns:
Liste von URLs zu Markdown-Dateien
@ -302,17 +304,23 @@ def fetch_forgejo_repo_markdown_files(repo_url: str, branch: str = 'main') -> Li
owner = parts[-2]
repo = parts[-1]
# API-URL ermitteln
# Basis-URL ermitteln (für Raw-URLs)
if 'codeberg.org' in repo_url:
base_url = 'https://codeberg.org'
api_base = 'https://codeberg.org/api/v1'
elif 'git.rpi-virtuell.de' in repo_url:
base_url = 'https://git.rpi-virtuell.de'
api_base = 'https://git.rpi-virtuell.de/api/v1'
elif 'gitea' in repo_url or 'forgejo' in repo_url:
# Generischer Ansatz für selbst-gehostete Instanzen
base_parts = repo_url.split('/')[:3]
api_base = '/'.join(base_parts) + '/api/v1'
base_url = '/'.join(base_parts)
api_base = base_url + '/api/v1'
else:
print(f"Warnung: Unbekannte Forgejo-Instanz, versuche generischen API-Pfad")
base_parts = repo_url.split('/')[:3]
api_base = '/'.join(base_parts) + '/api/v1'
base_url = '/'.join(base_parts)
api_base = base_url + '/api/v1'
api_url = f"{api_base}/repos/{owner}/{repo}/git/trees/{branch}?recursive=true"
@ -324,8 +332,17 @@ def fetch_forgejo_repo_markdown_files(repo_url: str, branch: str = 'main') -> Li
markdown_files = []
for item in data.get('tree', []):
if item['type'] == 'blob' and item['path'].endswith('.md'):
# Filter anwenden falls vorhanden
if path_filter:
# Konvertiere Wildcard-Filter zu regex-Pattern
# z.B. 'posts/*/index.md' -> '^posts/[^/]+/index\.md$'
import re
pattern = path_filter.replace('*', '[^/]+').replace('.', r'\.')
if not re.match(f'^{pattern}$', item['path']):
continue
# Raw-URL konstruieren
raw_url = f"https://codeberg.org/{owner}/{repo}/raw/branch/{branch}/{item['path']}"
raw_url = f"{base_url}/{owner}/{repo}/raw/branch/{branch}/{item['path']}"
markdown_files.append(raw_url)
return markdown_files
@ -385,12 +402,21 @@ def main():
repo_index = sys.argv.index('--forgejo-repo') if '--forgejo-repo' in sys.argv else sys.argv.index('--repo')
if len(sys.argv) > repo_index + 1:
repo_url = sys.argv[repo_index + 1]
branch = sys.argv[repo_index + 2] if len(sys.argv) > repo_index + 2 else 'main'
branch = sys.argv[repo_index + 2] if len(sys.argv) > repo_index + 2 and not sys.argv[repo_index + 2].startswith('--') else 'main'
# Pfad-Filter für spezifische Dateien (z.B. nur index.md in posts/)
path_filter = None
if '--filter' in sys.argv:
filter_index = sys.argv.index('--filter')
if len(sys.argv) > filter_index + 1:
path_filter = sys.argv[filter_index + 1]
print(f"Forgejo-Modus: Verarbeite Repository: {repo_url}")
print(f"Branch: {branch}")
if path_filter:
print(f"Filter: {path_filter}")
markdown_urls = fetch_forgejo_repo_markdown_files(repo_url, branch)
markdown_urls = fetch_forgejo_repo_markdown_files(repo_url, branch, path_filter)
if not markdown_urls:
print("Keine Markdown-Dateien im Repository gefunden")
@ -450,9 +476,13 @@ def main():
if not os.path.exists(config_file):
print(f"Fehler: Konfigurationsdatei '{config_file}' nicht gefunden")
print("\nVerwendung:")
print(" python workflow.py [config.yaml] # YAML-Konfiguration")
print(" python workflow.py <URL> # Einzelne Markdown-URL")
print(" python workflow.py --repo <REPO_URL> [branch] # Forgejo-Repository")
print(" python workflow.py [config.yaml] # YAML-Konfiguration")
print(" python workflow.py <URL> # Einzelne Markdown-URL")
print(" python workflow.py --repo <REPO_URL> [branch] # Forgejo-Repository (alle .md)")
print(" python workflow.py --repo <REPO_URL> [branch] --filter <pattern> # Mit Filter")
print("\nFilter-Beispiele:")
print(" --filter 'posts/*/index.md' # Nur index.md in posts/-Unterverzeichnissen")
print(" --filter 'Website/content/posts/*/index.md' # Mit vollständigem Pfad")
sys.exit(1)
print(f"Lade Konfiguration aus: {config_file}")