From c31423d811c0bfc4dc4482095adc8a3c8ab3daf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Lohrer?= Date: Wed, 1 Oct 2025 15:37:55 +0200 Subject: [PATCH] feat: implement core parser and Forgejo API client MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implementiert: - YAML Front Matter Extractor mit parse/remove/has Funktionen - AMB Metadata Extractor für Schema.org-konforme Metadaten - Core Parser mit unified/remark Pipeline - parseMarkdownFile für lokale Dateien - parseMarkdownString für Strings - extractHeadings, extractLinks, extractImages Utilities - Forgejo API Client - getFileContent, listDirectory, listPosts - getPostContent, getAllPosts - Volle API-Integration mit Token-Auth - Public API in src/index.js - 3 Beispiele (parse-forgejo, list-all-posts, parse-local) - 11 Unit Tests (alle passing) - Test-Fixtures mit AMB-konformen Beispieldaten Tests: 11 passing ✅ Beispiel erfolgreich mit echtem Forgejo-Repo getestet ✅ --- examples/list-all-posts.js | 67 ++++++++ examples/parse-forgejo.js | 101 ++++++++++++ examples/parse-local.js | 90 +++++++++++ src/extractors/amb-extractor.js | 267 +++++++++++++++++++++++++++++++ src/extractors/yaml-extractor.js | 65 ++++++++ src/forgejo-client.js | 232 +++++++++++++++++++++++++++ src/index.js | 70 ++++++++ src/parser.js | 234 +++++++++++++++++++++++++++ test/fixtures/example.md | 66 ++++++++ test/parser.test.js | 184 +++++++++++++++++++++ 10 files changed, 1376 insertions(+) create mode 100644 examples/list-all-posts.js create mode 100644 examples/parse-forgejo.js create mode 100644 examples/parse-local.js create mode 100644 src/extractors/amb-extractor.js create mode 100644 src/extractors/yaml-extractor.js create mode 100644 src/forgejo-client.js create mode 100644 src/index.js create mode 100644 src/parser.js create mode 100644 test/fixtures/example.md create mode 100644 test/parser.test.js diff --git a/examples/list-all-posts.js b/examples/list-all-posts.js new file mode 
// examples/list-all-posts.js

/**
 * Example: list every post directory of the configured Forgejo repository and
 * print a short metadata summary for the first few posts.
 *
 * Uses `createForgejoClient` and `parseMarkdownString`, which this example file
 * imports from `../src/forgejo-client.js` and `../src/parser.js`.
 *
 * A failure while fetching or parsing a single post is only logged and the
 * loop continues; a failure of the listing itself terminates the process with
 * exit code 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('🚀 Alle Posts von Forgejo abrufen\n')

  try {
    const client = createForgejoClient()

    console.log('📡 Liste alle Posts...')
    const postDirs = await client.listPosts()

    console.log(`✅ ${postDirs.length} Posts gefunden\n`)

    // Parse at most `limit` posts. `count` is the number that is really
    // processed, so the progress counter is correct for small repositories.
    const limit = 5
    const count = Math.min(limit, postDirs.length)
    console.log(`🔍 Parse die ersten ${count} Posts...\n`)

    for (const [index, dir] of postDirs.slice(0, count).entries()) {
      console.log(`\n📄 [${index + 1}/${count}] ${dir.name}`)
      console.log('─'.repeat(60))

      try {
        const markdown = await client.getPostContent(dir.name)
        const result = await parseMarkdownString(markdown)

        if (!result.metadata) {
          console.log(' ⚠️ Keine Metadaten gefunden')
          continue
        }

        const { metadata } = result
        console.log(` Titel: ${metadata.name || 'Unbekannt'}`)
        console.log(` Typ: ${metadata.type}`)
        console.log(` Datum: ${metadata.datePublished || 'N/A'}`)
        console.log(` Lizenz: ${metadata.license || 'N/A'}`)

        if (Array.isArray(metadata.creator) && metadata.creator.length > 0) {
          // Fall back to the name parts, skipping missing ones so that no
          // literal "undefined" ends up in the output.
          const authors = metadata.creator
            .map(c => c.name || [c.givenName, c.familyName].filter(Boolean).join(' '))
            .join(', ')
          console.log(` Autoren: ${authors}`)
        }

        console.log(` Content: ${result.content.length} Zeichen`)
      } catch (error) {
        console.log(` ❌ Fehler: ${error.message}`)
      }
    }

    console.log('\n\n📊 Zusammenfassung:')
    console.log(` Gesamt: ${postDirs.length} Posts im Repository`)
    console.log(` Analysiert: ${count} Posts`)
    console.log('\n✅ Fertig!')
  } catch (error) {
    console.error('❌ Fehler:', error.message)
    process.exit(1)
  }
}

main()
// examples/parse-forgejo.js

/**
 * Prints the metadata section of a parse result: title, type, license, date,
 * the list of creators and any extractor warnings.
 *
 * @param {Object} metadata - `result.metadata` as returned by parseMarkdownString
 */
function printMetadata(metadata) {
  console.log('🏷️ Metadaten:')
  console.log(` Titel: ${metadata.name}`)
  console.log(` Typ: ${metadata.type}`)
  console.log(` Lizenz: ${metadata.license}`)
  console.log(` Datum: ${metadata.datePublished}`)

  if (metadata.creator) {
    console.log(' Autoren:')
    metadata.creator.forEach(person => {
      const label = person.name || `${person.givenName} ${person.familyName}`
      console.log(` - ${label}`)
      if (person.id) console.log(` ORCID: ${person.id}`)
    })
  }

  const warnings = metadata._warnings
  if (warnings && warnings.length > 0) {
    console.log('\n⚠️ Warnings:')
    warnings.forEach(warning => console.log(` - ${warning}`))
  }
}

/**
 * Prints at most the first five headings, indented by heading level, followed
 * by a note about how many more there are.
 *
 * @param {Array<{level: number, text: string}>} headings
 */
function printHeadings(headings) {
  if (headings.length === 0) return

  const shown = 5
  console.log('\n📑 Überschriften:')
  for (const heading of headings.slice(0, shown)) {
    const indent = ' '.repeat(heading.level - 1)
    console.log(` ${indent}H${heading.level}: ${heading.text}`)
  }
  if (headings.length > shown) {
    console.log(` ... und ${headings.length - shown} weitere`)
  }
}

/**
 * Example: fetch one post from the Forgejo API, parse it and print the result.
 *
 * Uses `createForgejoClient` and `parseMarkdownString`, which this example file
 * imports. Any error is reported on stderr and ends the process with code 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('🚀 Forgejo API Beispiel\n')

  try {
    // The client takes its configuration from the environment (.env).
    const client = createForgejoClient()

    console.log('📡 Verbinde mit Forgejo API...')
    console.log(` Repository: ${client.owner}/${client.repo}`)
    console.log(` Branch: ${client.branch}\n`)

    // Repository information
    const repo = await client.getRepository()
    console.log('✅ Repository gefunden:')
    console.log(` Name: ${repo.name}`)
    console.log(` Beschreibung: ${repo.description}`)
    console.log(` Sprache: ${repo.language}\n`)

    // Fetch the sample post
    const postPath = '2025-04-20-OER-und-Symbole'
    console.log(`📄 Rufe Post ab: ${postPath}`)

    const markdown = await client.getPostContent(postPath)
    console.log(`✅ Markdown geladen (${markdown.length} Zeichen)\n`)

    // Parse it
    console.log('🔍 Parse Markdown...')
    const result = await parseMarkdownString(markdown)

    // Report
    console.log('\n📊 Parse-Ergebnisse:\n')

    if (result.metadata) {
      printMetadata(result.metadata)
    }

    console.log('\n📝 Content:')
    console.log(` Länge: ${result.content.length} Zeichen`)
    console.log(` AST Nodes: ${countNodes(result.ast)}`)

    const parserModule = await import('../src/parser.js')
    printHeadings(parserModule.extractHeadings(result.ast))

    console.log('\n✅ Erfolgreich!')
  } catch (error) {
    console.error('❌ Fehler:', error.message)
    process.exit(1)
  }
}

/**
 * Counts the nodes of an AST, including the given node itself.
 *
 * @param {Object} node - AST node, optionally with a `children` array
 * @returns {number} Number of nodes in the subtree rooted at `node`
 */
function countNodes(node) {
  if (!node.children) return 1
  return node.children.reduce((total, child) => total + countNodes(child), 1)
}

// Run the example
main()
// examples/parse-local.js

/**
 * Example: parse the local fixture `test/fixtures/example.md`, print metadata,
 * YAML front matter, AST summary, headings, links and images, and store the
 * complete result as JSON in `test/output/result.json`.
 *
 * Uses `parseMarkdownFile`, `join`, `dirname`, `writeFile` and `__dirname`,
 * which this example file imports/defines at module level.
 * Any error is reported on stderr and ends the process with code 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('🚀 Lokale Datei parsen\n')

  try {
    // Sample Markdown file
    const filePath = join(__dirname, '../test/fixtures/example.md')

    console.log(`📄 Parse Datei: ${filePath}`)

    const result = await parseMarkdownFile(filePath)

    console.log('\n✅ Erfolgreich geparst!\n')

    // Metadata
    if (result.metadata) {
      console.log('📋 Metadaten:')
      console.log(JSON.stringify(result.metadata, null, 2))
    }

    // YAML front matter
    if (result.yaml) {
      console.log('\n📝 YAML Front Matter:')
      console.log(JSON.stringify(result.yaml, null, 2))
    }

    // AST structure
    console.log('\n🌲 AST Root:')
    console.log(` Type: ${result.ast.type}`)
    console.log(` Children: ${result.ast.children?.length || 0}`)

    // One dynamic import for all three AST utilities instead of three
    // separate imports of the same module.
    const { extractHeadings, extractLinks, extractImages } = await import('../src/parser.js')

    // Headings
    const headings = extractHeadings(result.ast)
    if (headings.length > 0) {
      console.log('\n📑 Überschriften:')
      headings.forEach(h => {
        const indent = ' '.repeat(h.level - 1)
        console.log(` ${indent}H${h.level}: ${h.text}`)
      })
    }

    // Links (first five only)
    const links = extractLinks(result.ast)
    if (links.length > 0) {
      console.log('\n🔗 Links:')
      links.slice(0, 5).forEach(link => {
        console.log(` - ${link.text || 'Kein Text'}: ${link.url}`)
      })
      if (links.length > 5) {
        console.log(` ... und ${links.length - 5} weitere`)
      }
    }

    // Images
    const images = extractImages(result.ast)
    if (images.length > 0) {
      console.log('\n🖼️ Bilder:')
      images.forEach(img => {
        console.log(` - ${img.alt || 'Kein Alt-Text'}: ${img.url}`)
      })
    }

    // Optional: store the result as JSON. The output directory is not part of
    // the repository, so create it first; otherwise writeFile fails with ENOENT.
    const outputPath = join(__dirname, '../test/output/result.json')
    const { mkdir } = await import('fs/promises')
    await mkdir(dirname(outputPath), { recursive: true })
    await writeFile(outputPath, JSON.stringify(result, null, 2))
    console.log(`\n💾 Ergebnis gespeichert: ${outputPath}`)
  } catch (error) {
    console.error('❌ Fehler:', error.message)
    process.exit(1)
  }
}

main()
/**
 * Builds a Schema.org-style AMB metadata record from parsed YAML front matter.
 *
 * Values are taken from `yamlObject.commonMetadata` first; for some fields a
 * top-level front matter key (`title`, `summary`/`description`, `url`,
 * `datePublished`, `author`, `cover.image`) serves as fallback. Missing values
 * become `null`. Warnings collected while resolving `name` and `description`
 * are attached as `_warnings`, top-level `tags` as `_tags`.
 *
 * @param {Object} yamlObject - Parsed YAML front matter
 * @returns {Object} Metadata record; the default record of
 *   `createEmptyMetadata()` when `yamlObject` is falsy or not an object
 */
export function extractAMBMetadata(yamlObject) {
  const usable = Boolean(yamlObject) && typeof yamlObject === 'object'
  if (!usable) {
    return createEmptyMetadata()
  }

  const collectedWarnings = []
  const common = yamlObject.commonMetadata || {}
  const orNull = (key) => common[key] || null

  // Resolve the two fields that may emit warnings first, title before
  // description, so the warning order stays stable.
  const name = extractField(common, 'name', yamlObject.title, collectedWarnings)
  const description = extractField(
    common,
    'description',
    yamlObject.summary || yamlObject.description,
    collectedWarnings
  )

  const metadata = {
    '@context': common['@context'] || 'https://schema.org/',
    type: common.type || 'LearningResource',
    name,
    description,
    license: orNull('license'),
    // Identifier / URL
    id: common.id || common.url || yamlObject.url || null,
    inLanguage: orNull('inLanguage'),
    datePublished: extractDate(common.datePublished || yamlObject.datePublished),
    // Authors / creators
    creator: extractCreators(common.creator, yamlObject.author),
    image: extractImage(common.image, yamlObject.cover?.image),
    // Subjects
    about: orNull('about'),
    learningResourceType: orNull('learningResourceType'),
    educationalLevel: orNull('educationalLevel'),
    creativeWorkStatus: orNull('creativeWorkStatus')
  }

  if (collectedWarnings.length > 0) {
    metadata._warnings = collectedWarnings
  }

  // Extra data from the static site generator section (Hugo/PaperMod)
  if (yamlObject.tags) {
    metadata._tags = yamlObject.tags
  }

  return metadata
}
/**
 * Creates the default metadata record used when no front matter is available.
 *
 * @returns {Object} Record with default `@context`/`type`, empty `name` and
 *   `description`, and a warning explaining that no YAML metadata was found
 */
function createEmptyMetadata() {
  return {
    '@context': 'https://schema.org/',
    type: 'LearningResource',
    name: null,
    description: null,
    _warnings: ['Keine YAML-Metadaten gefunden']
  }
}

/**
 * Reads a field from `source`, falling back to `fallback` and recording a
 * warning whenever the primary value is missing.
 *
 * @param {Object} source - Primary source (commonMetadata)
 * @param {string} field - Field name
 * @param {*} fallback - Value used when `source[field]` is falsy
 * @param {Array<string>} warnings - Array that receives warning messages (mutated)
 * @returns {*} `source[field]`, else `fallback`, else `null`
 */
function extractField(source, field, fallback, warnings) {
  if (source && source[field]) {
    return source[field]
  }

  if (fallback) {
    warnings.push(`Feld 'commonMetadata.${field}' fehlt, verwende Fallback`)
    return fallback
  }

  warnings.push(`Pflichtfeld 'commonMetadata.${field}' fehlt`)
  return null
}

/**
 * Normalizes a date to `YYYY-MM-DD`.
 *
 * For strings that already start with an ISO calendar date, that date is kept
 * exactly as written. Converting through UTC would shift timestamps with a
 * positive offset to the previous day (`2025-04-20T00:30:00+02:00` would
 * become `2025-04-19`). All other parseable values use the UTC date of the
 * resulting `Date`.
 *
 * @param {string|Date|number} dateValue - Date as string, Date object or timestamp
 * @returns {string|null|*} `YYYY-MM-DD`, `null` for falsy input, or the
 *   original value unchanged when it cannot be parsed as a date
 */
function extractDate(dateValue) {
  if (!dateValue) return null

  const date = new Date(dateValue)
  if (Number.isNaN(date.getTime())) {
    return dateValue // not parseable: hand the original value back
  }

  if (typeof dateValue === 'string') {
    const leadingDate = /^(\d{4}-\d{2}-\d{2})(?:[T\s]|$)/.exec(dateValue)
    if (leadingDate) return leadingDate[1]
  }

  return date.toISOString().split('T')[0] // YYYY-MM-DD
}

/**
 * Builds the creator list.
 *
 * `creators` (from commonMetadata) has priority and is normalized with
 * `normalizeCreator`; otherwise `authors` (plain names from the static site
 * generator section) are turned into Person objects.
 *
 * @param {Array|Object|string} creators - Creator(s) from commonMetadata
 * @param {Array<string>|string} authors - Author name(s) used as fallback
 * @returns {Array<Object>|null} Creator objects, or `null` when neither is set
 */
function extractCreators(creators, authors) {
  if (creators) {
    const list = Array.isArray(creators) ? creators : [creators]
    return list.map(creator => normalizeCreator(creator))
  }

  if (authors) {
    const names = Array.isArray(authors) ? authors : [authors]
    return names.map(name => ({ type: 'Person', name }))
  }

  return null
}
/**
 * Normalizes a creator entry to a Schema.org-style object.
 *
 * - A string becomes a Person with that name.
 * - An object keeps its `type` (default `Person`), gets a `name` composed from
 *   `givenName`/`familyName` when either is present (otherwise `name` is
 *   copied), and keeps `id` and a normalized `affiliation` when set.
 * - `null`/`undefined` or other non-object values (e.g. an empty YAML list
 *   item) yield a bare `{ type: 'Person' }` instead of throwing.
 *
 * Only name parts that are actually present are written, so the result never
 * carries `undefined`-valued keys.
 *
 * @param {Object|string|null} creator - Creator entry
 * @returns {Object} Normalized creator object
 */
function normalizeCreator(creator) {
  if (typeof creator === 'string') {
    return { type: 'Person', name: creator }
  }

  if (creator === null || typeof creator !== 'object') {
    return { type: 'Person' }
  }

  const normalized = { type: creator.type || 'Person' }
  const { givenName, familyName } = creator

  if (givenName || familyName) {
    if (givenName) normalized.givenName = givenName
    if (familyName) normalized.familyName = familyName
    normalized.name = `${givenName || ''} ${familyName || ''}`.trim()
  } else if (creator.name) {
    normalized.name = creator.name
  }

  // Identifier (ORCID, ROR, ...)
  if (creator.id) {
    normalized.id = creator.id
  }

  if (creator.affiliation) {
    normalized.affiliation = normalizeOrganization(creator.affiliation)
  }

  return normalized
}

/**
 * Normalizes an organization entry.
 *
 * @param {Object|string} org - Organization name or object with `name`/`id`
 * @returns {Object} `{ type: 'Organization', name }` for strings, otherwise
 *   `{ type: 'Organization', name, id }` with `id` defaulting to `null`
 */
function normalizeOrganization(org) {
  if (typeof org === 'string') {
    return { type: 'Organization', name: org }
  }

  return {
    type: 'Organization',
    name: org.name,
    id: org.id || null
  }
}

/**
 * Picks the image URL.
 *
 * @param {string} ambImage - Image from commonMetadata (preferred)
 * @param {string} coverImage - Image from the `cover` section (fallback)
 * @returns {string|null} Image URL or `null` when neither is set
 */
function extractImage(ambImage, coverImage) {
  return ambImage || coverImage || null
}

/**
 * Checks AMB metadata for completeness.
 *
 * Missing required fields (`name`, `description`, `license`) are reported as
 * errors, missing recommended fields (`creator`, `datePublished`, `about`,
 * `id`) as warnings. A missing metadata object (`null`, as returned by the
 * parser for documents without front matter) is treated as an empty record
 * rather than causing a TypeError.
 *
 * @param {Object|null} metadata - Metadata to validate
 * @returns {{valid: boolean, errors: string[], warnings: string[]}} Result;
 *   `valid` is true when no required field is missing
 */
export function validateAMBMetadata(metadata) {
  const record = metadata !== null && typeof metadata === 'object' ? metadata : {}

  const requiredFields = ['name', 'description', 'license']
  const recommendedFields = ['creator', 'datePublished', 'about', 'id']

  const errors = requiredFields
    .filter(field => !record[field])
    .map(field => `Pflichtfeld fehlt: ${field}`)

  const warnings = recommendedFields
    .filter(field => !record[field])
    .map(field => `Empfohlenes Feld fehlt: ${field}`)

  return {
    valid: errors.length === 0,
    errors,
    warnings
  }
}
// src/extractors/yaml-extractor.js
// YAML extractor: finds, parses and removes YAML front matter in Markdown.

/**
 * Matches a YAML front matter block at the very start of the content:
 * an opening `---` line, the YAML text (capture group 1, may be absent for an
 * empty block) and a closing `---` line. The closing line may be terminated by
 * a newline or by the end of the input, and a leading byte order mark is
 * tolerated. The regex has no `g` flag, so it is safe to share between calls.
 */
const FRONT_MATTER_PATTERN = /^\uFEFF?---\s*\n(?:([\s\S]*?)\n)?---\s*(?:\n|$)/

/**
 * Extracts and parses the YAML front matter of a Markdown document.
 *
 * Relies on `parseYaml` (the `parse` function of the `yaml` package), which
 * this module imports.
 *
 * @param {string} markdownContent - Raw Markdown content
 * @returns {Object|null} Parsed YAML value; `null` when the input is not a
 *   non-empty string or has no (or an empty) front matter block; an object
 *   `{ _error, _errorDetails }` when the YAML text cannot be parsed (the
 *   failure is also logged with `console.error`)
 */
export function extractYAML(markdownContent) {
  if (!markdownContent || typeof markdownContent !== 'string') {
    return null
  }

  const match = markdownContent.match(FRONT_MATTER_PATTERN)
  if (!match || !match[1]) {
    return null
  }

  try {
    return parseYaml(match[1])
  } catch (error) {
    console.error('YAML Parse Error:', error.message)
    return {
      _error: 'YAML parsing failed',
      _errorDetails: error.message
    }
  }
}

/**
 * Removes the YAML front matter from a Markdown document.
 *
 * @param {string} markdownContent - Markdown, optionally with front matter
 * @returns {string} Markdown without front matter, trimmed; input that is
 *   falsy or not a string is returned unchanged
 */
export function removeYAML(markdownContent) {
  if (!markdownContent || typeof markdownContent !== 'string') {
    return markdownContent
  }

  return markdownContent.replace(FRONT_MATTER_PATTERN, '').trim()
}

/**
 * Tells whether a string starts with a YAML front matter block.
 *
 * @param {string} markdownContent - Content to check
 * @returns {boolean} True when a front matter block is present
 */
export function hasYAML(markdownContent) {
  if (!markdownContent || typeof markdownContent !== 'string') {
    return false
  }

  return FRONT_MATTER_PATTERN.test(markdownContent)
}
// src/forgejo-client.js
// Forgejo/Gitea API client: read access to repository contents via the REST API.

/**
 * Percent-encodes every segment of a repository path while keeping the `/`
 * separators, so names containing spaces, `#`, `?` or non-ASCII characters
 * address the intended file. Expects a raw (not already encoded) path.
 *
 * @param {string} path - Path inside the repository
 * @returns {string} Path that is safe to embed in a URL
 */
function encodeRepoPath(path) {
  return String(path ?? '')
    .split('/')
    .map(segment => encodeURIComponent(segment))
    .join('/')
}

/**
 * Builds the `contents` endpoint for a path and ref of the client's repository.
 *
 * @param {ForgejoClient} client - Client providing owner, repo and default branch
 * @param {string} path - Raw path inside the repository
 * @param {string|null} ref - Branch/tag/commit; falls back to `client.branch`
 * @returns {string} Endpoint relative to the API base URL
 */
function buildContentsEndpoint(client, path, ref) {
  const owner = encodeURIComponent(client.owner)
  const repo = encodeURIComponent(client.repo)
  const branch = encodeURIComponent(ref || client.branch)
  return `/repos/${owner}/${repo}/contents/${encodeRepoPath(path)}?ref=${branch}`
}

/**
 * Forgejo API client
 */
export class ForgejoClient {
  /**
   * Every option falls back to the matching environment variable
   * (`FORGEJO_API_BASE_URL`, `FORGEJO_OWNER`, `FORGEJO_REPO`, `FORGEJO_BRANCH`,
   * `FORGEJO_TOKEN`).
   *
   * @param {Object} options - Configuration
   * @param {string} options.baseUrl - API base URL (trailing slashes are removed)
   * @param {string} options.owner - Repository owner
   * @param {string} options.repo - Repository name
   * @param {string} options.branch - Branch (default: main)
   * @param {string} options.token - API token (optional for public repositories)
   * @throws {Error} When baseUrl, owner or repo cannot be resolved
   */
  constructor(options = {}) {
    const baseUrl = options.baseUrl || process.env.FORGEJO_API_BASE_URL
    // Endpoints always start with "/", so a trailing slash here would
    // produce "//" in the request URL.
    this.baseUrl = typeof baseUrl === 'string' ? baseUrl.replace(/\/+$/, '') : baseUrl
    this.owner = options.owner || process.env.FORGEJO_OWNER
    this.repo = options.repo || process.env.FORGEJO_REPO
    this.branch = options.branch || process.env.FORGEJO_BRANCH || 'main'
    this.token = options.token || process.env.FORGEJO_TOKEN

    if (!this.baseUrl || !this.owner || !this.repo) {
      throw new Error('Forgejo client requires baseUrl, owner, and repo')
    }
  }

  /**
   * Creates the request headers; the Authorization header is only added when
   * a token is configured.
   * @returns {Object} Headers
   */
  getHeaders() {
    const headers = {
      'Accept': 'application/json'
    }

    if (this.token) {
      headers['Authorization'] = `token ${this.token}`
    }

    return headers
  }

  /**
   * Performs a GET request against the API and decodes the JSON response.
   * @param {string} endpoint - API endpoint, starting with "/"
   * @returns {Promise<*>} Decoded response data
   * @throws {Error} On network failure, non-2xx status or invalid JSON; the
   *   underlying error is available as `cause`
   */
  async request(endpoint) {
    const url = `${this.baseUrl}${endpoint}`

    try {
      const response = await fetch(url, {
        headers: this.getHeaders()
      })

      if (!response.ok) {
        throw new Error(
          `Forgejo API Error: ${response.status} ${response.statusText}`
        )
      }

      return await response.json()
    } catch (error) {
      throw new Error(`Failed to fetch from Forgejo: ${error.message}`, { cause: error })
    }
  }

  /**
   * Fetches the content of a file in the repository.
   * @param {string} path - Raw file path inside the repository
   * @param {string} ref - Branch/tag/commit (optional, default: client branch)
   * @returns {Promise<string>} File content as UTF-8 string
   * @throws {Error} When the request fails or the response carries neither
   *   base64 content nor a download URL
   */
  async getFileContent(path, ref = null) {
    const endpoint = buildContentsEndpoint(this, path, ref)

    try {
      const data = await this.request(endpoint)

      // The API delivers the content base64-encoded. An empty string is a
      // valid (empty) file and must not trigger the download fallback.
      if (typeof data.content === 'string' && data.encoding === 'base64') {
        return Buffer.from(data.content, 'base64').toString('utf-8')
      }

      // Fallback: use the download URL (requested without API headers)
      if (data.download_url) {
        const response = await fetch(data.download_url)
        if (!response.ok) {
          throw new Error(`Download failed: ${response.status} ${response.statusText}`)
        }
        return await response.text()
      }

      throw new Error('No content or download_url in response')
    } catch (error) {
      throw new Error(`Failed to get file content: ${error.message}`, { cause: error })
    }
  }

  /**
   * Lists the entries of a directory.
   * @param {string} path - Raw directory path inside the repository
   * @param {string} ref - Branch/tag/commit (optional, default: client branch)
   * @returns {Promise<Array<Object>>} Entries with name, path, type
   *   ('file' or 'dir'), size, sha, url and download_url
   * @throws {Error} When the request fails or the path is a single file
   */
  async listDirectory(path, ref = null) {
    const endpoint = buildContentsEndpoint(this, path, ref)

    try {
      const data = await this.request(endpoint)

      if (!Array.isArray(data)) {
        throw new Error('Expected directory listing, got single file')
      }

      return data.map(({ name, path: itemPath, type, size, sha, url, download_url }) => ({
        name,
        path: itemPath,
        type, // 'file' or 'dir'
        size,
        sha,
        url,
        download_url
      }))
    } catch (error) {
      throw new Error(`Failed to list directory: ${error.message}`, { cause: error })
    }
  }

  /**
   * Lists all post directories inside the posts directory.
   * @param {string} postsDir - Path of the posts directory
   * @returns {Promise<Array<Object>>} Directory entries (files are skipped)
   */
  async listPosts(postsDir = 'Website/content/posts') {
    try {
      const contents = await this.listDirectory(postsDir)
      return contents.filter(item => item.type === 'dir')
    } catch (error) {
      throw new Error(`Failed to list posts: ${error.message}`, { cause: error })
    }
  }

  /**
   * Fetches `index.md` of a post directory.
   * @param {string} postDir - Post directory (e.g. "2025-04-20-OER-und-Symbole")
   * @param {string} postsBaseDir - Base path (default: "Website/content/posts")
   * @returns {Promise<string>} Markdown content
   */
  async getPostContent(postDir, postsBaseDir = 'Website/content/posts') {
    const indexPath = `${postsBaseDir}/${postDir}/index.md`
    return await this.getFileContent(indexPath)
  }

  /**
   * Fetches all posts including their content. Posts whose `index.md` cannot
   * be fetched are logged with `console.warn` and left out of the result.
   * @param {string} postsDir - Posts directory
   * @returns {Promise<Array<Object>>} Posts as
   *   `{ directory, path, content, metadata }`, where `metadata` is the
   *   directory entry
   */
  async getAllPosts(postsDir = 'Website/content/posts') {
    try {
      const postDirs = await this.listPosts(postsDir)

      const posts = await Promise.all(
        postDirs.map(async (dir) => {
          try {
            const content = await this.getPostContent(dir.name, postsDir)
            return {
              directory: dir.name,
              path: `${postsDir}/${dir.name}/index.md`,
              content,
              metadata: dir
            }
          } catch (error) {
            console.warn(`Failed to fetch post ${dir.name}:`, error.message)
            return null
          }
        })
      )

      // Drop the nulls left behind by failed requests
      return posts.filter(post => post !== null)
    } catch (error) {
      throw new Error(`Failed to get all posts: ${error.message}`, { cause: error })
    }
  }

  /**
   * Fetches the repository information.
   * @returns {Promise<Object>} Repository data as returned by the API
   */
  async getRepository() {
    const endpoint = `/repos/${encodeURIComponent(this.owner)}/${encodeURIComponent(this.repo)}`
    return await this.request(endpoint)
  }

  /**
   * Searches for files in the repository.
   * NOTE(review): confirm that the targeted Forgejo version offers this
   * `/search` endpoint for repositories.
   * @param {string} query - Search term
   * @returns {Promise<Array>} `data` array of the response, or an empty array
   */
  async searchFiles(query) {
    const owner = encodeURIComponent(this.owner)
    const repo = encodeURIComponent(this.repo)
    const endpoint = `/repos/${owner}/${repo}/search?q=${encodeURIComponent(query)}`

    try {
      const data = await this.request(endpoint)
      return data.data || []
    } catch (error) {
      throw new Error(`Failed to search files: ${error.message}`, { cause: error })
    }
  }
}

/**
 * Factory: creates a ForgejoClient; everything not given in `overrides` comes
 * from the environment (.env).
 * @param {Object} overrides - Optional overrides, see the ForgejoClient constructor
 * @returns {ForgejoClient} Configured client
 */
export function createForgejoClient(overrides = {}) {
  return new ForgejoClient(overrides)
}
// ── src/index.js (parse + default export) ───────────────────────────────────

// `export { … } from '…'` re-exports names without creating local bindings,
// so the default export below needs real imports; without them, evaluating
// this module throws a ReferenceError.
import * as parserApi from './parser.js'
import * as yamlExtractorApi from './extractors/yaml-extractor.js'
import * as ambExtractorApi from './extractors/amb-extractor.js'
import * as forgejoApi from './forgejo-client.js'

/**
 * Convenience function: parses Markdown from different kinds of sources.
 *
 * - A single-line string starting with `/`, `./` or `../` is read as a file.
 * - A single-token `http://` / `https://` URL is downloaded and parsed.
 * - Everything else is parsed as Markdown text.
 *
 * @param {string} source - File path, URL or Markdown string
 * @param {Object} options - Parser options, passed on unchanged
 * @returns {Promise<Object>} Parsed result
 * @throws {TypeError} When `source` is not a string
 * @throws {Error} When the URL answers with a non-2xx status
 */
export async function parse(source, options = {}) {
  if (typeof source !== 'string') {
    throw new TypeError('parse() expects source to be a string')
  }

  const { parseMarkdownFile, parseMarkdownString } = parserApi
  const isSingleLine = !/[\r\n]/.test(source)

  // File path?
  const looksLikePath = ['/', './', '../'].some(prefix => source.startsWith(prefix))
  if (isSingleLine && looksLikePath) {
    return parseMarkdownFile(source, options)
  }

  // URL? (a Markdown text that merely begins with a link is not a URL)
  if (/^https?:\/\/\S+$/.test(source)) {
    const response = await fetch(source)
    if (!response.ok) {
      throw new Error(
        `Failed to fetch Markdown from ${source}: ${response.status} ${response.statusText}`
      )
    }
    const markdown = await response.text()
    return parseMarkdownString(markdown, options)
  }

  // Otherwise treat it as Markdown text
  return parseMarkdownString(source, options)
}

// Default export
export default {
  parse,
  parseMarkdownFile: parserApi.parseMarkdownFile,
  parseMarkdownString: parserApi.parseMarkdownString,
  ForgejoClient: forgejoApi.ForgejoClient,
  createForgejoClient: forgejoApi.createForgejoClient,
  extractYAML: yamlExtractorApi.extractYAML,
  extractAMBMetadata: ambExtractorApi.extractAMBMetadata
}

// ── src/parser.js (parseMarkdownFile) ───────────────────────────────────────
// Core Markdown parser: unified/remark based parsing with YAML front matter.

/**
 * Parses a Markdown file with YAML front matter.
 *
 * Reads the file as UTF-8 with `readFile` (imported by the parser module) and
 * delegates to `parseMarkdownString`.
 *
 * @param {string} filePath - Path of the Markdown file
 * @param {Object} options - Optional configuration
 * @param {boolean} options.extractYaml - Extract YAML (default: true)
 * @param {boolean} options.parseGfm - GitHub Flavored Markdown (default: true)
 * @param {boolean} options.extractAMB - Extract AMB metadata (default: true)
 * @returns {Promise<Object>} Parsed result
 * @throws {Error} When reading or parsing fails; the original error is
 *   available as `cause`
 */
export async function parseMarkdownFile(filePath, options = {}) {
  const {
    extractYaml = true,
    parseGfm = true,
    extractAMB = true
  } = options

  try {
    const markdownContent = await readFile(filePath, 'utf-8')

    return await parseMarkdownString(markdownContent, {
      extractYaml,
      parseGfm,
      extractAMB
    })
  } catch (error) {
    throw new Error(`Failed to parse Markdown file: ${error.message}`, { cause: error })
  }
}
function parseMarkdownString(markdownContent, options = {}) {
  // Parses a Markdown string and returns
  // `{ raw, yaml, metadata, ast, content }`:
  //   raw      - the input string, untouched
  //   yaml     - result of extractYAML(), or null when `extractYaml` is false
  //   metadata - result of extractAMBMetadata(yaml), or null when disabled
  //              or when no YAML was found
  //   content  - the input after removeYAML()
  //   ast      - syntax tree produced from `content`
  // Options (all default to true): extractYaml, parseGfm, extractAMB.
  // Throws Error("Failed to parse Markdown string: ...") with the original
  // error attached as `cause`.
  const {
    extractYaml = true,
    parseGfm = true,
    extractAMB = true
  } = options

  const result = {
    raw: markdownContent,
    yaml: null,
    metadata: null,
    ast: null,
    content: null
  }

  try {
    if (extractYaml) {
      result.yaml = extractYAML(markdownContent)

      // AMB metadata is derived from the YAML object, so it is only
      // attempted when YAML extraction ran and produced something.
      if (extractAMB && result.yaml) {
        result.metadata = extractAMBMetadata(result.yaml)
      }
    }

    // removeYAML() runs regardless of `extractYaml`; the tree is built
    // from its output.
    result.content = removeYAML(markdownContent)

    // Same pipeline as before (remark-parse, remark-frontmatter, optional
    // remark-gfm), now built by the shared factory instead of a copy.
    result.ast = createMarkdownProcessor({ parseGfm }).parse(result.content)

    return result
  } catch (error) {
    // Keep the original error reachable for debugging via `cause`.
    throw new Error(`Failed to parse Markdown string: ${error.message}`, { cause: error })
  }
}

/**
 * Converts a Markdown AST back into a Markdown string.
 *
 * The serializer is given the same syntax extensions the parser uses by
 * default, so nodes introduced by those extensions can be written out too.
 *
 * @param {Object} ast - Markdown abstract syntax tree
 * @param {Object} [options] - Serializer options
 * @param {boolean} [options.parseGfm=true] - Register remark-gfm
 * @param {boolean} [options.frontmatter=true] - Register remark-frontmatter (yaml)
 * @returns {Promise} Markdown string
 */
export async function astToMarkdown(ast, options = {}) {
  const {
    parseGfm = true,
    frontmatter = true
  } = options

  const processor = unified()
    .use(remarkStringify)

  if (frontmatter) {
    processor.use(remarkFrontmatter, ['yaml'])
  }

  if (parseGfm) {
    processor.use(remarkGfm)
  }

  return processor.stringify(ast)
}

/**
 * Creates a preconfigured unified processor for parsing Markdown.
 *
 * @param {Object} [options] - Pipeline options
 * @param {boolean} [options.parseGfm=true] - Register remark-gfm
 * @param {boolean} [options.frontmatter=true] - Register remark-frontmatter (yaml)
 * @returns {Object} unified processor
 */
export function createMarkdownProcessor(options = {}) {
  const {
    parseGfm = true,
    frontmatter = true
  } = options

  const processor = unified()
    .use(remarkParse)

  if (frontmatter) {
    processor.use(remarkFrontmatter, ['yaml'])
  }

  if (parseGfm) {
    processor.use(remarkGfm)
  }

  return processor
}

/**
 * Extracts all headings from a Markdown AST, in document order.
 *
 * @param {Object} ast - Markdown AST (null/undefined yields an empty array)
 * @returns {Array} Array of `{ level, text }` objects
 */
export function extractHeadings(ast) {
  return collectNodesOfType(ast, 'heading').map((node) => ({
    level: node.depth,
    text: extractTextFromNode(node)
  }))
}

/**
 * Extracts all links from a Markdown AST, in document order.
 *
 * @param {Object} ast - Markdown AST (null/undefined yields an empty array)
 * @returns {Array} Array of `{ url, title, text }` objects; `title` is null when absent
 */
export function extractLinks(ast) {
  return collectNodesOfType(ast, 'link').map((node) => ({
    url: node.url,
    title: node.title || null,
    text: extractTextFromNode(node)
  }))
}

/**
 * Extracts all images from a Markdown AST, in document order.
 *
 * @param {Object} ast - Markdown AST (null/undefined yields an empty array)
 * @returns {Array} Array of `{ url, alt, title }` objects; `alt`/`title` are null when absent or empty
 */
export function extractImages(ast) {
  return collectNodesOfType(ast, 'image').map((node) => ({
    url: node.url,
    alt: node.alt || null,
    title: node.title || null
  }))
}

/**
 * Helper: collects every node of the given type, visiting a node before
 * its children. Children of a matching node are still visited.
 *
 * @param {Object} ast - Root node to start from
 * @param {string} type - Node `type` to collect
 * @returns {Array} Matching nodes in document order
 */
function collectNodesOfType(ast, type) {
  const matches = []

  const visit = (node) => {
    // Tolerate a missing tree or malformed child entries instead of throwing.
    if (!node || typeof node !== 'object') {
      return
    }

    if (node.type === type) {
      matches.push(node)
    }

    if (Array.isArray(node.children)) {
      node.children.forEach(visit)
    }
  }

  visit(ast)
  return matches
}

/**
 * Helper: extracts the plain text of an AST node.
 *
 * `text` and `inlineCode` nodes contribute their `value`, images contribute
 * their `alt` text, and parent nodes contribute the concatenation of their
 * children. Everything else contributes an empty string.
 *
 * @param {Object} node - AST node
 * @returns {string} Extracted text
 */
function extractTextFromNode(node) {
  if (!node || typeof node !== 'object') {
    return ''
  }

  // Inline code carries its text in `value` just like a plain text node.
  if (node.type === 'text' || node.type === 'inlineCode') {
    return node.value ?? ''
  }

  // An image has no children; its alt text is its textual content.
  if (node.type === 'image') {
    return node.alt || ''
  }

  if (Array.isArray(node.children)) {
    return node.children
      .map((child) => extractTextFromNode(child))
      .join('')
  }

  return ''
}

// Patch residue that shares this source line (start of the next file's diff), kept verbatim:
// diff --git a/test/fixtures/example.md b/test/fixtures/example.md new file mode 100644 index 0000000..be8ff05 --- /dev/null +++ b/test/fixtures/example.md @@ -0,0 +1,66 @@ +--- +commonMetadata: + '@context': https://schema.org/ + creativeWorkStatus: Published + type: LearningResource + name: Beispiel für OER-Material + description: >- + Dies 
ist ein Beispiel für eine Markdown-Datei mit AMB-konformen Metadaten. + Sie demonstriert die verschiedenen Felder des AMB-Standards. + license: https://creativecommons.org/licenses/by/4.0/deed.de + id: https://example.org/beispiel-oer + creator: + - givenName: Max + familyName: Mustermann + id: https://orcid.org/0000-0000-0000-0001 + type: Person + affiliation: + name: Beispiel-Universität + id: https://ror.org/example123 + type: Organization + - givenName: Erika + familyName: Musterfrau + id: https://orcid.org/0000-0000-0000-0002 + type: Person + inLanguage: + - de + about: + - https://w3id.org/kim/hochschulfaechersystematik/n079 + image: https://example.org/images/beispiel.jpg + learningResourceType: + - https://w3id.org/kim/hcrt/text + educationalLevel: + - https://w3id.org/kim/educationalLevel/level_A + datePublished: '2025-10-01' +title: Beispiel für OER-Material +tags: + - OER + - Bildung + - Beispiel +--- + +# Beispiel für OER-Material + +## Einleitung + +Dies ist ein **Beispiel** für eine Markdown-Datei mit YAML Front Matter nach AMB-Standard. + +## Hauptinhalt + +### Erste Unterüberschrift + +Hier ist etwas *Text* mit verschiedenen Formatierungen: + +- Liste Item 1 +- Liste Item 2 +- Liste Item 3 + +### Zweite Unterüberschrift + +Ein Link zu [GitHub](https://github.com) und ein Bild: + +![Alt-Text für Bild](https://example.org/image.png "Bild-Titel") + +## Fazit + +Das war ein einfaches Beispiel für strukturierte Bildungsinhalte. 
// diff --git a/test/parser.test.js b/test/parser.test.js new file mode 100644 index 0000000..04146a1 --- /dev/null +++ b/test/parser.test.js @@ -0,0 +1,184 @@
/**
 * Tests for MDParser.
 * Runs on the Node.js built-in test runner (`node:test`).
 */

import { test } from 'node:test'
import assert from 'node:assert'
import {
  parseMarkdownFile,
  parseMarkdownString,
  extractHeadings,
  extractLinks,
  extractImages
} from '../src/parser.js'
import { extractYAML, hasYAML, removeYAML } from '../src/extractors/yaml-extractor.js'
import { extractAMBMetadata, validateAMBMetadata } from '../src/extractors/amb-extractor.js'
import { join, dirname } from 'node:path'
import { fileURLToPath } from 'node:url'

// ES modules have no built-in __dirname; derive it from this module's URL
// so the fixture path resolves independently of the working directory.
const __dirname = dirname(fileURLToPath(import.meta.url))

// --- YAML extractor ---------------------------------------------------

test('YAML Extractor: extrahiert YAML Front Matter', () => {
  const markdown = `---
title: Test
author: Max
---
# Content`

  const yaml = extractYAML(markdown)

  assert.ok(yaml, 'YAML sollte extrahiert werden')
  assert.strictEqual(yaml.title, 'Test')
  assert.strictEqual(yaml.author, 'Max')
})

test('YAML Extractor: erkennt YAML Front Matter', () => {
  const withYAML = `---\ntitle: Test\n---\nContent`
  const withoutYAML = `# Heading\nContent`

  assert.strictEqual(hasYAML(withYAML), true)
  assert.strictEqual(hasYAML(withoutYAML), false)
})

test('YAML Extractor: entfernt YAML Front Matter', () => {
  const markdown = `---
title: Test
---
# Content`

  const result = removeYAML(markdown)

  assert.ok(!result.includes('---'), 'YAML sollte entfernt sein')
  assert.ok(result.includes('# Content'), 'Content sollte bleiben')
})

// --- AMB extractor ----------------------------------------------------

test('AMB Extractor: extrahiert Metadaten aus YAML', () => {
  const yaml = {
    commonMetadata: {
      '@context': 'https://schema.org/',
      type: 'LearningResource',
      name: 'Test Resource',
      description: 'Test Description',
      license: 'https://creativecommons.org/licenses/by/4.0/',
      datePublished: '2025-10-01',
      creator: [{
        givenName: 'Max',
        familyName: 'Mustermann',
        type: 'Person'
      }]
    }
  }

  const metadata = extractAMBMetadata(yaml)

  assert.strictEqual(metadata.name, 'Test Resource')
  assert.strictEqual(metadata.description, 'Test Description')
  assert.strictEqual(metadata.type, 'LearningResource')
  assert.ok(metadata.creator, 'Creator sollte vorhanden sein')
  assert.strictEqual(metadata.creator[0].name, 'Max Mustermann')
})

test('AMB Extractor: verwendet Fallbacks', () => {
  // No `commonMetadata` block: only top-level keys are supplied.
  const yaml = {
    title: 'Fallback Title',
    summary: 'Fallback Description',
    author: 'Max Mustermann'
  }

  const metadata = extractAMBMetadata(yaml)

  assert.strictEqual(metadata.name, 'Fallback Title')
  assert.strictEqual(metadata.description, 'Fallback Description')
  assert.ok(metadata._warnings, 'Warnings sollten vorhanden sein')
  assert.ok(metadata._warnings.length > 0, 'Es sollten Warnings existieren')
})

test('AMB Extractor: validiert Metadaten', () => {
  const completeMetadata = {
    name: 'Test',
    description: 'Description',
    license: 'CC-BY-4.0',
    creator: [{ name: 'Max' }],
    datePublished: '2025-10-01',
    about: ['topic'],
    id: 'https://example.org/test'
  }

  const validation = validateAMBMetadata(completeMetadata)

  assert.strictEqual(validation.valid, true)
  assert.strictEqual(validation.errors.length, 0)
})

// --- Parser -----------------------------------------------------------

test('Parser: parst Markdown-String', async () => {
  const markdown = `---
title: Test
---
# Heading

Some **bold** text.`

  const result = await parseMarkdownString(markdown)

  assert.ok(result.yaml, 'YAML sollte extrahiert sein')
  assert.ok(result.ast, 'AST sollte existieren')
  assert.ok(result.content, 'Content sollte existieren')
  assert.strictEqual(result.yaml.title, 'Test')
})

test('Parser: parst lokale Markdown-Datei', async () => {
  const filePath = join(__dirname, 'fixtures/example.md')

  const result = await parseMarkdownFile(filePath)

  assert.ok(result.yaml, 'YAML sollte extrahiert sein')
  assert.ok(result.metadata, 'Metadaten sollten extrahiert sein')
  assert.ok(result.ast, 'AST sollte existieren')

  // Check the AMB metadata
  assert.strictEqual(result.metadata.name, 'Beispiel für OER-Material')
  assert.strictEqual(result.metadata.type, 'LearningResource')
  assert.ok(result.metadata.creator, 'Creator sollte vorhanden sein')
  assert.strictEqual(result.metadata.creator.length, 2, 'Sollte 2 Creators haben')
})

test('Parser: extrahiert Überschriften', async () => {
  const markdown = `# H1
## H2
### H3`

  const result = await parseMarkdownString(markdown)
  const headings = extractHeadings(result.ast)

  assert.strictEqual(headings.length, 3)
  assert.strictEqual(headings[0].level, 1)
  assert.strictEqual(headings[0].text, 'H1')
  assert.strictEqual(headings[1].level, 2)
  assert.strictEqual(headings[2].level, 3)
})

test('Parser: extrahiert Links', async () => {
  const markdown = `[Link 1](https://example.com)
[Link 2](https://github.com "GitHub")`

  const result = await parseMarkdownString(markdown)
  const links = extractLinks(result.ast)

  assert.strictEqual(links.length, 2)
  assert.strictEqual(links[0].url, 'https://example.com')
  assert.strictEqual(links[0].text, 'Link 1')
  assert.strictEqual(links[1].url, 'https://github.com')
  assert.strictEqual(links[1].title, 'GitHub')
})

test('Parser: extrahiert Bilder', async () => {
  const markdown = `![Alt Text](image.png "Title")`

  const result = await parseMarkdownString(markdown)
  const images = extractImages(result.ast)

  assert.strictEqual(images.length, 1)
  assert.strictEqual(images[0].url, 'image.png')
  assert.strictEqual(images[0].alt, 'Alt Text')
  assert.strictEqual(images[0].title, 'Title')
})

// No unconditional "all tests passed" log here: a top-level statement runs
// while the module is evaluated, i.e. before the registered tests have
// finished and regardless of their outcome. The test runner reports the
// actual pass/fail result.