feat: implement core parser and Forgejo API client

Implementiert:
- YAML Front Matter Extractor mit parse/remove/has Funktionen
- AMB Metadata Extractor für Schema.org-konforme Metadaten
- Core Parser mit unified/remark Pipeline
  - parseMarkdownFile für lokale Dateien
  - parseMarkdownString für Strings
  - extractHeadings, extractLinks, extractImages Utilities
- Forgejo API Client
  - getFileContent, listDirectory, listPosts
  - getPostContent, getAllPosts
  - Volle API-Integration mit Token-Auth
- Public API in src/index.js
- 3 Beispiele (parse-forgejo, list-all-posts, parse-local)
- 11 Unit Tests (alle passing)
- Test-Fixtures mit AMB-konformen Beispieldaten

Tests: 11 passing 
Beispiel erfolgreich mit echtem Forgejo-Repo getestet 
This commit is contained in:
Jörg Lohrer 2025-10-01 15:37:55 +02:00
parent fbd6630f6d
commit c31423d811
10 changed files with 1376 additions and 0 deletions

View file

@ -0,0 +1,67 @@
/**
* Beispiel: Alle Posts von Forgejo abrufen und analysieren
*/
import { createForgejoClient } from '../src/forgejo-client.js'
import { parseMarkdownString } from '../src/parser.js'
/**
 * Example entry point: lists every post directory of the configured Forgejo
 * repository and prints a short metadata summary for the first few of them.
 *
 * A failure while loading or parsing a single post is reported and the loop
 * continues with the next post. A failure while creating the client or while
 * listing the posts terminates the process with exit code 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('🚀 Alle Posts von Forgejo abrufen\n')
  try {
    const client = createForgejoClient()
    console.log('📡 Liste alle Posts...')
    const postDirs = await client.listPosts()
    console.log(`${postDirs.length} Posts gefunden\n`)

    // Parse at most `limit` posts; `count` is how many that really is, so the
    // progress output never claims "[1/5]" when only three posts exist.
    const limit = 5
    const count = Math.min(limit, postDirs.length)
    console.log(`🔍 Parse die ersten ${count} Posts...\n`)

    for (let i = 0; i < count; i++) {
      const dir = postDirs[i]
      console.log(`\n📄 [${i + 1}/${count}] ${dir.name}`)
      console.log('─'.repeat(60))
      try {
        const markdown = await client.getPostContent(dir.name)
        const result = await parseMarkdownString(markdown)
        if (result.metadata) {
          console.log(` Titel: ${result.metadata.name || 'Unbekannt'}`)
          console.log(` Typ: ${result.metadata.type}`)
          console.log(` Datum: ${result.metadata.datePublished || 'N/A'}`)
          console.log(` Lizenz: ${result.metadata.license || 'N/A'}`)
          if (result.metadata.creator) {
            // Prefer the full name; otherwise assemble it from its parts
            const authors = result.metadata.creator
              .map(c => c.name || `${c.givenName} ${c.familyName}`)
              .join(', ')
            console.log(` Autoren: ${authors}`)
          }
          console.log(` Content: ${result.content.length} Zeichen`)
        } else {
          console.log(' ⚠️ Keine Metadaten gefunden')
        }
      } catch (error) {
        // One broken post must not abort the whole listing
        console.log(` ❌ Fehler: ${error.message}`)
      }
    }

    console.log('\n\n📊 Zusammenfassung:')
    console.log(` Gesamt: ${postDirs.length} Posts im Repository`)
    console.log(` Analysiert: ${count} Posts`)
    console.log('\n✅ Fertig!')
  } catch (error) {
    console.error('❌ Fehler:', error.message)
    process.exit(1)
  }
}
main()

101
examples/parse-forgejo.js Normal file
View file

@ -0,0 +1,101 @@
/**
* Beispiel: Markdown-Datei von Forgejo API abrufen und parsen
*/
import { createForgejoClient } from '../src/forgejo-client.js'
import { parseMarkdownString } from '../src/parser.js'
/**
 * Example entry point: fetches one post from the Forgejo API, parses it and
 * prints repository info, metadata, content statistics and the first headings.
 * Any error is printed and terminates the process with exit code 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('🚀 Forgejo API Beispiel\n')
  try {
    // The client reads its configuration from the environment (.env)
    const client = createForgejoClient()
    console.log('📡 Verbinde mit Forgejo API...')
    console.log(` Repository: ${client.owner}/${client.repo}`)
    console.log(` Branch: ${client.branch}\n`)

    // Repository information
    const repository = await client.getRepository()
    console.log('✅ Repository gefunden:')
    console.log(` Name: ${repository.name}`)
    console.log(` Beschreibung: ${repository.description}`)
    console.log(` Sprache: ${repository.language}\n`)

    // Load the sample post
    const postPath = '2025-04-20-OER-und-Symbole'
    console.log(`📄 Rufe Post ab: ${postPath}`)
    const source = await client.getPostContent(postPath)
    console.log(`✅ Markdown geladen (${source.length} Zeichen)\n`)

    // Parse it
    console.log('🔍 Parse Markdown...')
    const parsed = await parseMarkdownString(source)
    const { metadata } = parsed

    // Report
    console.log('\n📊 Parse-Ergebnisse:\n')
    if (metadata) {
      console.log('🏷️ Metadaten:')
      console.log(` Titel: ${metadata.name}`)
      console.log(` Typ: ${metadata.type}`)
      console.log(` Lizenz: ${metadata.license}`)
      console.log(` Datum: ${metadata.datePublished}`)
      if (metadata.creator) {
        console.log(' Autoren:')
        for (const person of metadata.creator) {
          const displayName = person.name || `${person.givenName} ${person.familyName}`
          console.log(` - ${displayName}`)
          if (person.id) {
            console.log(` ORCID: ${person.id}`)
          }
        }
      }
      if (metadata._warnings?.length > 0) {
        console.log('\n⚠ Warnings:')
        for (const warning of metadata._warnings) {
          console.log(` - ${warning}`)
        }
      }
    }

    console.log('\n📝 Content:')
    console.log(` Länge: ${parsed.content.length} Zeichen`)
    console.log(` AST Nodes: ${countNodes(parsed.ast)}`)

    // Headings: show the first five and summarise the rest
    const { extractHeadings } = await import('../src/parser.js')
    const headings = extractHeadings(parsed.ast)
    const shownCount = 5
    if (headings.length > 0) {
      console.log('\n📑 Überschriften:')
      for (const heading of headings.slice(0, shownCount)) {
        const indent = ' '.repeat(heading.level - 1)
        console.log(` ${indent}H${heading.level}: ${heading.text}`)
      }
      if (headings.length > shownCount) {
        console.log(` ... und ${headings.length - 5} weitere`)
      }
    }
    console.log('\n✅ Erfolgreich!')
  } catch (error) {
    console.error('❌ Fehler:', error.message)
    process.exit(1)
  }
}
/**
 * Helper: counts the nodes of an AST, including the given node itself.
 *
 * @param {Object} node - AST node; child nodes are read from `node.children`
 * @returns {number} Number of nodes in the subtree rooted at `node`
 */
function countNodes(node) {
  if (!node.children) {
    return 1
  }
  // Start at 1 for the node itself, then add every child subtree
  return node.children.reduce((total, child) => total + countNodes(child), 1)
}
// Run the example
main()

90
examples/parse-local.js Normal file
View file

@ -0,0 +1,90 @@
/**
* Beispiel: Lokale Markdown-Datei parsen
*/
import { parseMarkdownFile } from '../src/parser.js'
import { join, dirname } from 'path'
import { fileURLToPath } from 'url'
import { writeFile } from 'fs/promises'
const __dirname = dirname(fileURLToPath(import.meta.url))
/**
 * Example entry point: parses the bundled fixture file, prints metadata, YAML,
 * AST summary, headings, links and images, and stores the full parse result
 * as JSON under test/output/.
 * Any error is printed and terminates the process with exit code 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('🚀 Lokale Datei parsen\n')
  try {
    // Sample Markdown file
    const filePath = join(__dirname, '../test/fixtures/example.md')
    console.log(`📄 Parse Datei: ${filePath}`)
    const result = await parseMarkdownFile(filePath)
    console.log('\n✅ Erfolgreich geparst!\n')

    // Metadata
    if (result.metadata) {
      console.log('📋 Metadaten:')
      console.log(JSON.stringify(result.metadata, null, 2))
    }

    // YAML
    if (result.yaml) {
      console.log('\n📝 YAML Front Matter:')
      console.log(JSON.stringify(result.yaml, null, 2))
    }

    // AST structure
    console.log('\n🌲 AST Root:')
    console.log(` Type: ${result.ast.type}`)
    console.log(` Children: ${result.ast.children?.length || 0}`)

    // Load all extraction helpers with one import instead of three
    const { extractHeadings, extractLinks, extractImages } = await import('../src/parser.js')

    // Headings
    const headings = extractHeadings(result.ast)
    if (headings.length > 0) {
      console.log('\n📑 Überschriften:')
      headings.forEach(h => {
        const indent = ' '.repeat(h.level - 1)
        console.log(` ${indent}H${h.level}: ${h.text}`)
      })
    }

    // Links (first five only)
    const links = extractLinks(result.ast)
    if (links.length > 0) {
      console.log('\n🔗 Links:')
      links.slice(0, 5).forEach(link => {
        console.log(` - ${link.text || 'Kein Text'}: ${link.url}`)
      })
      if (links.length > 5) {
        console.log(` ... und ${links.length - 5} weitere`)
      }
    }

    // Images
    const images = extractImages(result.ast)
    if (images.length > 0) {
      console.log('\n🖼 Bilder:')
      images.forEach(img => {
        console.log(` - ${img.alt || 'Kein Alt-Text'}: ${img.url}`)
      })
    }

    // Optional: store the result as JSON. The output directory is not part of
    // the repository, so create it first; otherwise writeFile fails with ENOENT.
    const outputPath = join(__dirname, '../test/output/result.json')
    const { mkdir } = await import('fs/promises')
    await mkdir(dirname(outputPath), { recursive: true })
    await writeFile(outputPath, JSON.stringify(result, null, 2))
    console.log(`\n💾 Ergebnis gespeichert: ${outputPath}`)
  } catch (error) {
    console.error('❌ Fehler:', error.message)
    process.exit(1)
  }
}
main()

View file

@ -0,0 +1,267 @@
/**
* AMB Metadata Extractor
* Extrahiert und transformiert Schema.org-konforme AMB-Metadaten
* aus YAML Front Matter
*/
/**
 * Builds a Schema.org-style AMB metadata record from parsed YAML front matter.
 *
 * Values are read from `yamlObject.commonMetadata` first. `name`,
 * `description`, `id`, `datePublished`, `creator` and `image` fall back to
 * top-level front-matter fields. Fallbacks and missing required fields for
 * `name`/`description` are recorded in `_warnings`; top-level `tags` are
 * copied to `_tags`.
 *
 * @param {Object} yamlObject - Parsed YAML front matter
 * @returns {Object} Metadata record; a default record with a warning when
 *   `yamlObject` is missing or not an object
 */
export function extractAMBMetadata(yamlObject) {
  const hasFrontMatter = Boolean(yamlObject) && typeof yamlObject === 'object'
  if (!hasFrontMatter) {
    return createEmptyMetadata()
  }

  const amb = yamlObject.commonMetadata || {}
  const collectedWarnings = []
  const orNull = (value) => value || null

  // Fields with fallback handling (these may append to collectedWarnings)
  const name = extractField(amb, 'name', yamlObject.title, collectedWarnings)
  const description = extractField(
    amb,
    'description',
    yamlObject.summary || yamlObject.description,
    collectedWarnings
  )

  // Key order is kept stable because the record is typically serialised to JSON
  const record = {
    '@context': amb['@context'] || 'https://schema.org/',
    type: amb.type || 'LearningResource',
    name,
    description,
    license: orNull(amb.license),
    id: amb.id || amb.url || yamlObject.url || null,
    inLanguage: orNull(amb.inLanguage),
    datePublished: extractDate(amb.datePublished || yamlObject.datePublished),
    creator: extractCreators(amb.creator, yamlObject.author),
    image: extractImage(amb.image, yamlObject.cover?.image),
    about: orNull(amb.about),
    learningResourceType: orNull(amb.learningResourceType),
    educationalLevel: orNull(amb.educationalLevel),
    creativeWorkStatus: orNull(amb.creativeWorkStatus)
  }

  // Only attach warnings when something was actually recorded
  if (collectedWarnings.length > 0) {
    record._warnings = collectedWarnings
  }

  // Extra metadata from the static site generator front matter
  if (yamlObject.tags) {
    record._tags = yamlObject.tags
  }

  return record
}
/**
 * Creates the default metadata record used when no YAML metadata is available.
 *
 * @returns {Object} Record with default context/type, empty name/description
 *   and a single warning
 */
function createEmptyMetadata() {
  const defaults = {
    '@context': 'https://schema.org/',
    type: 'LearningResource'
  }
  return {
    ...defaults,
    name: null,
    description: null,
    _warnings: ['Keine YAML-Metadaten gefunden']
  }
}
/**
 * Reads a field from the primary source, falling back to an alternative value.
 * Using the fallback, or finding no value at all, appends a message to
 * `warnings`.
 *
 * @param {Object} source - Primary source object
 * @param {string} field - Name of the field to read
 * @param {*} fallback - Value used when the field is missing or falsy
 * @param {Array} warnings - Array that receives warning messages
 * @returns {*} The field value, the fallback, or null
 */
function extractField(source, field, fallback, warnings) {
  const primary = source ? source[field] : undefined
  if (primary) {
    return primary
  }

  const hasFallback = Boolean(fallback)
  const message = hasFallback
    ? `Feld 'commonMetadata.${field}' fehlt, verwende Fallback`
    : `Pflichtfeld 'commonMetadata.${field}' fehlt`
  warnings.push(message)

  return hasFallback ? fallback : null
}
/**
 * Extracts and normalises a date to its calendar day (YYYY-MM-DD).
 *
 * - A string that starts with `YYYY-MM-DD` is returned as that written day.
 *   Converting such a value through `Date#toISOString()` would move a
 *   timestamp like `2025-10-01T00:30:00+02:00` to the previous day in UTC.
 * - A number is returned as a string (e.g. the year `2025`) instead of being
 *   interpreted as milliseconds since the epoch, which would yield 1970-01-01.
 * - Anything else is parsed with `Date`; if that fails the original value is
 *   returned unchanged.
 *
 * @param {string|number|Date} dateValue - Date from the front matter
 * @returns {string|Date|null} Normalised date, the original value when it
 *   cannot be parsed, or null when no value was given
 */
function extractDate(dateValue) {
  if (!dateValue) return null

  if (typeof dateValue === 'number') {
    return String(dateValue)
  }

  if (typeof dateValue === 'string') {
    const written = /^\s*(\d{4}-\d{2}-\d{2})(?:$|[T\s])/.exec(dateValue)
    if (written) {
      return written[1]
    }
  }

  try {
    return new Date(dateValue).toISOString().split('T')[0] // YYYY-MM-DD
  } catch {
    // toISOString() throws a RangeError for invalid dates: keep the original
    return dateValue
  }
}
/**
 * Extracts creator/author information.
 * `creators` (from commonMetadata) takes priority and is normalised entry by
 * entry; otherwise `authors` (plain names from the front matter) is turned
 * into Person entries.
 *
 * @param {Array|Object} creators - Creator(s) from commonMetadata
 * @param {Array|string} authors - Author name(s) from the front matter
 * @returns {Array|null} Array of creator objects, or null when neither is given
 */
function extractCreators(creators, authors) {
  const asList = (value) => (Array.isArray(value) ? value : [value])

  // Priority: commonMetadata.creator
  if (creators) {
    return asList(creators).map((entry) => normalizeCreator(entry))
  }

  // Fallback: author as a single name or a list of names
  if (authors) {
    return asList(authors).map((name) => ({ type: 'Person', name }))
  }

  return null
}
/**
 * Normalises a creator entry.
 * A plain string becomes a Person with that name. For objects, the name is
 * assembled from givenName/familyName when either is present, otherwise the
 * given `name` is used; `id` and `affiliation` are carried over when set.
 *
 * @param {Object|string} creator - Creator entry
 * @returns {Object} Normalised creator object
 */
function normalizeCreator(creator) {
  if (typeof creator === 'string') {
    return { type: 'Person', name: creator }
  }

  const { type, givenName, familyName, name, id, affiliation } = creator
  const result = { type: type || 'Person' }

  if (givenName || familyName) {
    // Both name parts are copied as-is, even when one of them is undefined
    result.givenName = givenName
    result.familyName = familyName
    result.name = [givenName || '', familyName || ''].join(' ').trim()
  } else if (name) {
    result.name = name
  }

  // Identifier (ORCID, ROR, etc.)
  if (id) {
    result.id = id
  }

  if (affiliation) {
    result.affiliation = normalizeOrganization(affiliation)
  }

  return result
}
/**
 * Normalises an organization entry.
 * A plain string becomes an Organization with that name; for objects the
 * name and id are taken over (id defaults to null).
 *
 * @param {Object|string} org - Organization entry
 * @returns {Object} Normalised organization object
 */
function normalizeOrganization(org) {
  const organization = { type: 'Organization' }

  if (typeof org === 'string') {
    organization.name = org
    return organization
  }

  organization.name = org.name
  organization.id = org.id || null
  return organization
}
/**
 * Picks the image URL, preferring the commonMetadata image over the cover image.
 *
 * @param {string} ambImage - Image from commonMetadata
 * @param {string} coverImage - Image from the cover section
 * @returns {string|null} Image URL, or null when neither is set
 */
function extractImage(ambImage, coverImage) {
  if (ambImage) {
    return ambImage
  }
  if (coverImage) {
    return coverImage
  }
  return null
}
/**
 * Checks AMB metadata for completeness.
 * Missing required fields (`name`, `description`, `license`) are reported as
 * errors, missing recommended fields (`creator`, `datePublished`, `about`,
 * `id`) as warnings.
 *
 * A missing or non-object `metadata` value is treated like an empty record
 * instead of throwing, because the parser yields `metadata: null` for
 * documents without front matter.
 *
 * @param {Object|null|undefined} metadata - Metadata to validate
 * @returns {{valid: boolean, errors: string[], warnings: string[]}} Result;
 *   `valid` is true when no required field is missing
 */
export function validateAMBMetadata(metadata) {
  const record = metadata && typeof metadata === 'object' ? metadata : {}
  const isMissing = (field) => !record[field]

  // Required fields
  const errors = ['name', 'description', 'license']
    .filter(isMissing)
    .map((field) => `Pflichtfeld fehlt: ${field}`)

  // Recommended fields
  const warnings = ['creator', 'datePublished', 'about', 'id']
    .filter(isMissing)
    .map((field) => `Empfohlenes Feld fehlt: ${field}`)

  return {
    valid: errors.length === 0,
    errors,
    warnings
  }
}

View file

@ -0,0 +1,65 @@
/**
* YAML Extractor
* Extrahiert und parst YAML Front Matter aus Markdown-Inhalten
*/
import { parse as parseYaml } from 'yaml'
/**
 * Matches a YAML front matter block at the very start of a document:
 * an opening `---` line, the YAML body (capture group 1) and a closing `---`
 * line. The closing fence may be followed by a newline OR by the end of the
 * input, so a file that ends directly after the front matter is recognised.
 * Shared by all three functions below so they always agree on what counts as
 * front matter. The pattern has no `g`/`y` flag, so reusing it is stateless.
 */
const FRONT_MATTER_PATTERN = /^---\s*\n([\s\S]*?)\n---\s*(?:\n|$)/

/**
 * Extracts and parses the YAML front matter of a Markdown document.
 *
 * @param {string} markdownContent - Raw Markdown content
 * @returns {Object|null} Parsed YAML value; null when the input is not a
 *   non-empty string or has no front matter; an object with `_error` and
 *   `_errorDetails` when the YAML cannot be parsed
 */
export function extractYAML(markdownContent) {
  if (!markdownContent || typeof markdownContent !== 'string') {
    return null
  }

  const match = markdownContent.match(FRONT_MATTER_PATTERN)
  if (!match || !match[1]) {
    return null
  }

  try {
    return parseYaml(match[1])
  } catch (error) {
    // Parse failures are reported in-band so callers can surface them
    console.error('YAML Parse Error:', error.message)
    return {
      _error: 'YAML parsing failed',
      _errorDetails: error.message
    }
  }
}

/**
 * Removes the YAML front matter from a Markdown document.
 *
 * @param {string} markdownContent - Markdown, possibly with front matter
 * @returns {string} Trimmed Markdown without front matter; input that is not
 *   a non-empty string is returned unchanged
 */
export function removeYAML(markdownContent) {
  if (!markdownContent || typeof markdownContent !== 'string') {
    return markdownContent
  }
  return markdownContent.replace(FRONT_MATTER_PATTERN, '').trim()
}

/**
 * Tells whether a string starts with a YAML front matter block.
 *
 * @param {string} markdownContent - Content to check
 * @returns {boolean} True when front matter is present
 */
export function hasYAML(markdownContent) {
  if (!markdownContent || typeof markdownContent !== 'string') {
    return false
  }
  return FRONT_MATTER_PATTERN.test(markdownContent)
}

232
src/forgejo-client.js Normal file
View file

@ -0,0 +1,232 @@
/**
* Forgejo/Gitea API Client
* Ermöglicht Zugriff auf Repository-Inhalte über die Forgejo/Gitea API
*/
import { config } from 'dotenv'
// Environment-Variablen laden
config()
/**
 * Read-only client for the Forgejo/Gitea REST API.
 * Provides access to repository information and repository contents.
 */
export class ForgejoClient {
  /**
   * Creates a client. Every option falls back to the matching environment
   * variable (FORGEJO_API_BASE_URL, FORGEJO_OWNER, FORGEJO_REPO,
   * FORGEJO_BRANCH, FORGEJO_TOKEN).
   *
   * @param {Object} options - Configuration
   * @param {string} options.baseUrl - API base URL
   * @param {string} options.owner - Repository owner
   * @param {string} options.repo - Repository name
   * @param {string} options.branch - Branch (default: main)
   * @param {string} options.token - API token (optional for public repositories)
   * @throws {Error} When baseUrl, owner or repo is missing
   */
  constructor(options = {}) {
    this.baseUrl = options.baseUrl || process.env.FORGEJO_API_BASE_URL
    this.owner = options.owner || process.env.FORGEJO_OWNER
    this.repo = options.repo || process.env.FORGEJO_REPO
    this.branch = options.branch || process.env.FORGEJO_BRANCH || 'main'
    this.token = options.token || process.env.FORGEJO_TOKEN
    if (!this.baseUrl || !this.owner || !this.repo) {
      throw new Error('Forgejo client requires baseUrl, owner, and repo')
    }
  }

  /**
   * Builds the request headers, adding the token when one is configured.
   *
   * @returns {Object} Headers
   */
  getHeaders() {
    const headers = {
      'Accept': 'application/json'
    }
    if (this.token) {
      headers['Authorization'] = `token ${this.token}`
    }
    return headers
  }

  /**
   * Performs a GET request against the API and returns the decoded JSON body.
   *
   * @param {string} endpoint - API endpoint, starting with `/`
   * @returns {Promise<Object>} Response data
   * @throws {Error} On network failures and non-2xx responses; the original
   *   error is available as `cause`
   */
  async request(endpoint) {
    // Drop trailing slashes from the base URL so the result never contains `//`
    const url = `${this.baseUrl.replace(/\/+$/, '')}${endpoint}`
    try {
      const response = await fetch(url, {
        headers: this.getHeaders()
      })
      if (!response.ok) {
        throw new Error(
          `Forgejo API Error: ${response.status} ${response.statusText}`
        )
      }
      return await response.json()
    } catch (error) {
      throw new Error(`Failed to fetch from Forgejo: ${error.message}`, { cause: error })
    }
  }

  /**
   * Fetches the content of a file from the repository.
   *
   * @param {string} path - File path inside the repository
   * @param {string} ref - Branch/tag/commit (optional, defaults to the client branch)
   * @returns {Promise<string>} File content as a UTF-8 string
   * @throws {Error} When the file cannot be fetched or the response carries
   *   neither content nor a download URL
   */
  async getFileContent(path, ref = null) {
    const branch = ref || this.branch
    // The ref is a query value and must be encoded (e.g. `feature/x`, `#`)
    const endpoint = `/repos/${this.owner}/${this.repo}/contents/${path}?ref=${encodeURIComponent(branch)}`
    try {
      const data = await this.request(endpoint)

      // Base64-encoded content; the type check keeps an empty file ('') on this path
      if (typeof data.content === 'string' && data.encoding === 'base64') {
        return Buffer.from(data.content, 'base64').toString('utf-8')
      }

      // Fallback: use the download URL
      if (data.download_url) {
        const response = await fetch(data.download_url)
        // Without this check an error page would be returned as file content
        if (!response.ok) {
          throw new Error(
            `Download failed: ${response.status} ${response.statusText}`
          )
        }
        return await response.text()
      }

      throw new Error('No content or download_url in response')
    } catch (error) {
      throw new Error(`Failed to get file content: ${error.message}`, { cause: error })
    }
  }

  /**
   * Lists the entries of a directory.
   *
   * @param {string} path - Directory path inside the repository
   * @param {string} ref - Branch/tag/commit (optional, defaults to the client branch)
   * @returns {Promise<Array>} Entries with name, path, type, size, sha, url
   *   and download_url
   * @throws {Error} When the request fails or the path is not a directory
   */
  async listDirectory(path, ref = null) {
    const branch = ref || this.branch
    const endpoint = `/repos/${this.owner}/${this.repo}/contents/${path}?ref=${encodeURIComponent(branch)}`
    try {
      const data = await this.request(endpoint)
      if (!Array.isArray(data)) {
        throw new Error('Expected directory listing, got single file')
      }
      return data.map(item => ({
        name: item.name,
        path: item.path,
        type: item.type, // 'file' or 'dir'
        size: item.size,
        sha: item.sha,
        url: item.url,
        download_url: item.download_url
      }))
    } catch (error) {
      throw new Error(`Failed to list directory: ${error.message}`, { cause: error })
    }
  }

  /**
   * Lists all post directories inside the posts directory.
   *
   * @param {string} postsDir - Path of the posts directory
   * @returns {Promise<Array>} Directory entries only (files are filtered out)
   * @throws {Error} When the directory cannot be listed
   */
  async listPosts(postsDir = 'Website/content/posts') {
    try {
      const contents = await this.listDirectory(postsDir)
      return contents.filter(item => item.type === 'dir')
    } catch (error) {
      throw new Error(`Failed to list posts: ${error.message}`, { cause: error })
    }
  }

  /**
   * Fetches `index.md` of a post directory.
   *
   * @param {string} postDir - Post directory (e.g. "2025-04-20-OER-und-Symbole")
   * @param {string} postsBaseDir - Base path (default: "Website/content/posts")
   * @returns {Promise<string>} Markdown content
   */
  async getPostContent(postDir, postsBaseDir = 'Website/content/posts') {
    const indexPath = `${postsBaseDir}/${postDir}/index.md`
    return await this.getFileContent(indexPath)
  }

  /**
   * Fetches all posts including their content. Posts whose content cannot be
   * loaded are logged with `console.warn` and left out of the result.
   *
   * @param {string} postsDir - Posts directory
   * @returns {Promise<Array>} Posts with directory, path, content and the
   *   directory entry as `metadata`
   * @throws {Error} When the posts directory cannot be listed
   */
  async getAllPosts(postsDir = 'Website/content/posts') {
    try {
      const postDirs = await this.listPosts(postsDir)
      const posts = await Promise.all(
        postDirs.map(async (dir) => {
          try {
            const content = await this.getPostContent(dir.name, postsDir)
            return {
              directory: dir.name,
              path: `${postsDir}/${dir.name}/index.md`,
              content,
              metadata: dir
            }
          } catch (error) {
            console.warn(`Failed to fetch post ${dir.name}:`, error.message)
            return null
          }
        })
      )
      // Drop the null entries left by failed requests
      return posts.filter(post => post !== null)
    } catch (error) {
      throw new Error(`Failed to get all posts: ${error.message}`, { cause: error })
    }
  }

  /**
   * Fetches the repository information.
   *
   * @returns {Promise<Object>} Repository data
   */
  async getRepository() {
    const endpoint = `/repos/${this.owner}/${this.repo}`
    return await this.request(endpoint)
  }

  /**
   * Searches the repository.
   * NOTE(review): confirm that `/repos/{owner}/{repo}/search` exists in the
   * targeted Forgejo/Gitea version; the request is sent unchanged.
   *
   * @param {string} query - Search term
   * @returns {Promise<Array>} The `data` array of the response, or [] when absent
   * @throws {Error} When the request fails
   */
  async searchFiles(query) {
    const endpoint = `/repos/${this.owner}/${this.repo}/search?q=${encodeURIComponent(query)}`
    try {
      const data = await this.request(endpoint)
      return data.data || []
    } catch (error) {
      throw new Error(`Failed to search files: ${error.message}`, { cause: error })
    }
  }
}
/**
 * Factory: creates a ForgejoClient. Options that are not overridden are taken
 * from the environment by the ForgejoClient constructor.
 *
 * @param {Object} overrides - Optional configuration overrides
 * @returns {ForgejoClient} Configured client
 */
export function createForgejoClient(overrides = {}) {
  const client = new ForgejoClient(overrides)
  return client
}

70
src/index.js Normal file
View file

@ -0,0 +1,70 @@
/**
* MDParser - Main Entry Point
* Markdown to JSON Parser für AMB-konforme Inhalte
*/
// Parser
export {
parseMarkdownFile,
parseMarkdownString,
astToMarkdown,
createMarkdownProcessor,
extractHeadings,
extractLinks,
extractImages
} from './parser.js'
// YAML Extractor
export {
extractYAML,
removeYAML,
hasYAML
} from './extractors/yaml-extractor.js'
// AMB Metadata Extractor
export {
extractAMBMetadata,
validateAMBMetadata
} from './extractors/amb-extractor.js'
// Forgejo Client
export {
ForgejoClient,
createForgejoClient
} from './forgejo-client.js'
/**
 * Convenience function: parses Markdown from different kinds of sources.
 *
 * - A string starting with `/`, `./` or `../` is read as a file path.
 * - A string starting with `http://` or `https://` is fetched and the
 *   response body is parsed.
 * - Any other string is parsed as Markdown itself.
 *
 * @param {string} source - File path, URL or Markdown string
 * @param {Object} options - Parser options, passed through to the parser
 * @returns {Promise<Object>} Parse result
 * @throws {TypeError} When `source` is not a string
 * @throws {Error} When the URL responds with a non-2xx status
 */
export async function parse(source, options = {}) {
  // Fail early with a clear message instead of "source.startsWith is not a function"
  if (typeof source !== 'string') {
    throw new TypeError('parse() expects a file path, URL or Markdown string')
  }

  const { parseMarkdownFile, parseMarkdownString } = await import('./parser.js')

  // File path?
  const isFilePath = ['/', './', '../'].some((prefix) => source.startsWith(prefix))
  if (isFilePath) {
    return parseMarkdownFile(source, options)
  }

  // URL?
  if (source.startsWith('http://') || source.startsWith('https://')) {
    const response = await fetch(source)
    // Do not parse an error page as if it were the requested document
    if (!response.ok) {
      throw new Error(
        `Failed to fetch Markdown from ${source}: ${response.status} ${response.statusText}`
      )
    }
    const markdown = await response.text()
    return parseMarkdownString(markdown, options)
  }

  // Otherwise treat the input as a Markdown string
  return parseMarkdownString(source, options)
}
// Default export
// `export { … } from '…'` re-exports do not create bindings in this module's
// scope, so the names used in the object below have to be imported
// explicitly. Without these imports, loading this module throws a
// ReferenceError. Import declarations are hoisted, so their position in the
// file does not matter.
import { parseMarkdownFile, parseMarkdownString } from './parser.js'
import { extractYAML } from './extractors/yaml-extractor.js'
import { extractAMBMetadata } from './extractors/amb-extractor.js'
import { ForgejoClient, createForgejoClient } from './forgejo-client.js'

export default {
  parse,
  parseMarkdownFile,
  parseMarkdownString,
  ForgejoClient,
  createForgejoClient,
  extractYAML,
  extractAMBMetadata
}

234
src/parser.js Normal file
View file

@ -0,0 +1,234 @@
/**
* Core Markdown Parser
* Nutzt unified/remark für Markdown-Parsing mit YAML Front Matter
*/
import { unified } from 'unified'
import remarkParse from 'remark-parse'
import remarkFrontmatter from 'remark-frontmatter'
import remarkGfm from 'remark-gfm'
import remarkStringify from 'remark-stringify'
import { readFile } from 'fs/promises'
import { extractYAML, removeYAML } from './extractors/yaml-extractor.js'
import { extractAMBMetadata } from './extractors/amb-extractor.js'
/**
 * Reads a Markdown file (UTF-8) and parses it, including its YAML front matter.
 *
 * @param {string} filePath - Path to the Markdown file
 * @param {Object} options - Optional configuration
 * @param {boolean} options.extractYaml - Extract YAML (default: true)
 * @param {boolean} options.parseGfm - GitHub Flavored Markdown (default: true)
 * @param {boolean} options.extractAMB - Extract AMB metadata (default: true)
 * @returns {Promise<Object>} Parse result
 * @throws {Error} When the file cannot be read or parsed
 */
export async function parseMarkdownFile(filePath, options = {}) {
  const {
    extractYaml: wantsYaml = true,
    parseGfm: wantsGfm = true,
    extractAMB: wantsAMB = true
  } = options

  // Only the three known options are forwarded to the string parser
  const parserOptions = {
    extractYaml: wantsYaml,
    parseGfm: wantsGfm,
    extractAMB: wantsAMB
  }

  try {
    const source = await readFile(filePath, 'utf-8')
    return await parseMarkdownString(source, parserOptions)
  } catch (failure) {
    throw new Error(`Failed to parse Markdown file: ${failure.message}`)
  }
}
/**
 * Parses a Markdown string, including its YAML front matter.
 *
 * The result holds the raw input (`raw`), the parsed front matter (`yaml`),
 * the AMB metadata derived from it (`metadata`), the Markdown AST of the
 * content without front matter (`ast`) and that content itself (`content`).
 *
 * @param {string} markdownContent - Markdown content
 * @param {Object} options - Optional configuration
 * @param {boolean} options.extractYaml - Extract YAML (default: true)
 * @param {boolean} options.parseGfm - GitHub Flavored Markdown (default: true)
 * @param {boolean} options.extractAMB - Extract AMB metadata (default: true)
 * @returns {Promise<Object>} Parse result
 * @throws {Error} When extraction or parsing fails
 */
export async function parseMarkdownString(markdownContent, options = {}) {
  const {
    extractYaml: wantsYaml = true,
    parseGfm: wantsGfm = true,
    extractAMB: wantsAMB = true
  } = options

  try {
    // Front matter, and the AMB metadata derived from it
    const yaml = wantsYaml ? extractYAML(markdownContent) : null
    const metadata = wantsYaml && wantsAMB && yaml ? extractAMBMetadata(yaml) : null

    // The front matter is always stripped before the Markdown is parsed
    const body = removeYAML(markdownContent)

    // Same pipeline as before: remark-parse, front matter support, optional GFM
    const ast = createMarkdownProcessor({ parseGfm: wantsGfm }).parse(body)

    return {
      raw: markdownContent,
      yaml,
      metadata,
      ast,
      content: body
    }
  } catch (failure) {
    throw new Error(`Failed to parse Markdown string: ${failure.message}`)
  }
}
/**
 * Converts a Markdown AST back into a Markdown string.
 *
 * The serializer is set up with the same extensions the parser uses. Without
 * them, remark-stringify cannot handle the node types those extensions
 * produce (tables, strikethrough, task lists, footnotes, `yaml` nodes) and
 * fails on ASTs created by this module's default parser configuration.
 *
 * @param {Object} ast - Markdown abstract syntax tree
 * @param {Object} options - Serializer options
 * @param {boolean} options.gfm - Support GitHub Flavored Markdown nodes (default: true)
 * @param {boolean} options.frontmatter - Support YAML front matter nodes (default: true)
 * @returns {Promise<string>} Markdown string
 */
export async function astToMarkdown(ast, options = {}) {
  const { gfm = true, frontmatter = true } = options

  const processor = unified().use(remarkStringify)
  if (frontmatter) {
    processor.use(remarkFrontmatter, ['yaml'])
  }
  if (gfm) {
    processor.use(remarkGfm)
  }

  return processor.stringify(ast)
}
/**
 * Creates a preconfigured unified pipeline for parsing Markdown.
 *
 * @param {Object} options - Pipeline options
 * @param {boolean} options.parseGfm - Enable GitHub Flavored Markdown (default: true)
 * @param {boolean} options.frontmatter - Enable YAML front matter support (default: true)
 * @returns {Object} unified processor
 */
export function createMarkdownProcessor(options = {}) {
  const { parseGfm: enableGfm = true, frontmatter: enableFrontmatter = true } = options

  // Each entry is [plugin, ...pluginSettings], applied in this order
  const plugins = [[remarkParse]]
  if (enableFrontmatter) {
    plugins.push([remarkFrontmatter, ['yaml']])
  }
  if (enableGfm) {
    plugins.push([remarkGfm])
  }

  return plugins.reduce(
    (pipeline, [plugin, ...settings]) => pipeline.use(plugin, ...settings),
    unified()
  )
}
/**
 * Collects all headings of a Markdown AST in document order.
 *
 * @param {Object} ast - Markdown AST
 * @returns {Array} Headings as `{ level, text }`
 */
export function extractHeadings(ast) {
  const found = []
  // Explicit stack instead of recursion; children are pushed in reverse so
  // they are popped, and therefore visited, in document order.
  const pending = [ast]

  while (pending.length > 0) {
    const current = pending.pop()
    if (current.type === 'heading') {
      found.push({
        level: current.depth,
        text: extractTextFromNode(current)
      })
    }
    if (current.children) {
      for (let i = current.children.length - 1; i >= 0; i -= 1) {
        pending.push(current.children[i])
      }
    }
  }

  return found
}
/**
 * Collects all links of a Markdown AST in document order.
 *
 * @param {Object} ast - Markdown AST
 * @returns {Array} Links as `{ url, title, text }`; `title` is null when absent
 */
export function extractLinks(ast) {
  const collected = []

  const walk = (node) => {
    if (node.type === 'link') {
      const { url, title } = node
      collected.push({
        url,
        title: title || null,
        text: extractTextFromNode(node)
      })
    }
    if (!node.children) {
      return
    }
    for (const child of node.children) {
      walk(child)
    }
  }

  walk(ast)
  return collected
}
/**
 * Collects all images of a Markdown AST in document order.
 *
 * @param {Object} ast - Markdown AST
 * @returns {Array} Images as `{ url, alt, title }`; `alt` and `title` are
 *   null when absent or empty
 */
export function extractImages(ast) {
  // Returns the images of the node itself followed by those of its subtree
  const collect = (node) => {
    const own = node.type === 'image'
      ? [{ url: node.url, alt: node.alt || null, title: node.title || null }]
      : []
    const nested = node.children
      ? node.children.flatMap((child) => collect(child))
      : []
    return [...own, ...nested]
  }

  return collect(ast)
}
/**
 * Helper: extracts the plain text of an AST node and its descendants.
 *
 * Besides `text` nodes this includes the value of `inlineCode` nodes and the
 * alt text of images. Otherwise a heading such as "Using `foo`" would yield
 * "Using " and a link that wraps only an image would have empty text.
 *
 * @param {Object} node - AST node
 * @returns {string} Extracted text ('' when the node carries no text)
 */
function extractTextFromNode(node) {
  if (node.type === 'text' || node.type === 'inlineCode') {
    return node.value
  }
  if (node.type === 'image') {
    return node.alt || ''
  }
  if (node.children) {
    return node.children
      .map((child) => extractTextFromNode(child))
      .join('')
  }
  return ''
}

66
test/fixtures/example.md vendored Normal file
View file

@ -0,0 +1,66 @@
---
commonMetadata:
'@context': https://schema.org/
creativeWorkStatus: Published
type: LearningResource
name: Beispiel für OER-Material
description: >-
Dies ist ein Beispiel für eine Markdown-Datei mit AMB-konformen Metadaten.
Sie demonstriert die verschiedenen Felder des AMB-Standards.
license: https://creativecommons.org/licenses/by/4.0/deed.de
id: https://example.org/beispiel-oer
creator:
- givenName: Max
familyName: Mustermann
id: https://orcid.org/0000-0000-0000-0001
type: Person
affiliation:
name: Beispiel-Universität
id: https://ror.org/example123
type: Organization
- givenName: Erika
familyName: Musterfrau
id: https://orcid.org/0000-0000-0000-0002
type: Person
inLanguage:
- de
about:
- https://w3id.org/kim/hochschulfaechersystematik/n079
image: https://example.org/images/beispiel.jpg
learningResourceType:
- https://w3id.org/kim/hcrt/text
educationalLevel:
- https://w3id.org/kim/educationalLevel/level_A
datePublished: '2025-10-01'
title: Beispiel für OER-Material
tags:
- OER
- Bildung
- Beispiel
---
# Beispiel für OER-Material
## Einleitung
Dies ist ein **Beispiel** für eine Markdown-Datei mit YAML Front Matter nach AMB-Standard.
## Hauptinhalt
### Erste Unterüberschrift
Hier ist etwas *Text* mit verschiedenen Formatierungen:
- Liste Item 1
- Liste Item 2
- Liste Item 3
### Zweite Unterüberschrift
Ein Link zu [GitHub](https://github.com) und ein Bild:
![Alt-Text für Bild](https://example.org/image.png "Bild-Titel")
## Fazit
Das war ein einfaches Beispiel für strukturierte Bildungsinhalte.

184
test/parser.test.js Normal file
View file

@ -0,0 +1,184 @@
/**
* Tests für MDParser
* Nutzt Node.js native test runner
*/
import { test } from 'node:test'
import assert from 'node:assert'
import { parseMarkdownFile, parseMarkdownString } from '../src/parser.js'
import { extractYAML, hasYAML, removeYAML } from '../src/extractors/yaml-extractor.js'
import { extractAMBMetadata, validateAMBMetadata } from '../src/extractors/amb-extractor.js'
import { join, dirname } from 'path'
import { fileURLToPath } from 'url'
const __dirname = dirname(fileURLToPath(import.meta.url))
// --- YAML extractor -------------------------------------------------------

// Front matter is parsed into an object
test('YAML Extractor: extrahiert YAML Front Matter', () => {
  const markdown = `---
title: Test
author: Max
---
# Content`
  const yaml = extractYAML(markdown)
  assert.ok(yaml, 'YAML sollte extrahiert werden')
  assert.strictEqual(yaml.title, 'Test')
  assert.strictEqual(yaml.author, 'Max')
})

// Detection of front matter
test('YAML Extractor: erkennt YAML Front Matter', () => {
  const withYAML = `---\ntitle: Test\n---\nContent`
  const withoutYAML = `# Heading\nContent`
  assert.strictEqual(hasYAML(withYAML), true)
  assert.strictEqual(hasYAML(withoutYAML), false)
})

// Removal of front matter keeps the content
test('YAML Extractor: entfernt YAML Front Matter', () => {
  const markdown = `---
title: Test
---
# Content`
  const result = removeYAML(markdown)
  assert.ok(!result.includes('---'), 'YAML sollte entfernt sein')
  assert.ok(result.includes('# Content'), 'Content sollte bleiben')
})

// --- AMB extractor --------------------------------------------------------

// Metadata is read from commonMetadata
test('AMB Extractor: extrahiert Metadaten aus YAML', () => {
  const yaml = {
    commonMetadata: {
      '@context': 'https://schema.org/',
      type: 'LearningResource',
      name: 'Test Resource',
      description: 'Test Description',
      license: 'https://creativecommons.org/licenses/by/4.0/',
      datePublished: '2025-10-01',
      creator: [{
        givenName: 'Max',
        familyName: 'Mustermann',
        type: 'Person'
      }]
    }
  }
  const metadata = extractAMBMetadata(yaml)
  assert.strictEqual(metadata.name, 'Test Resource')
  assert.strictEqual(metadata.description, 'Test Description')
  assert.strictEqual(metadata.type, 'LearningResource')
  assert.ok(metadata.creator, 'Creator sollte vorhanden sein')
  assert.strictEqual(metadata.creator[0].name, 'Max Mustermann')
})

// Top-level front matter fields are used as fallbacks and produce warnings
test('AMB Extractor: verwendet Fallbacks', () => {
  const yaml = {
    title: 'Fallback Title',
    summary: 'Fallback Description',
    author: 'Max Mustermann'
  }
  const metadata = extractAMBMetadata(yaml)
  assert.strictEqual(metadata.name, 'Fallback Title')
  assert.strictEqual(metadata.description, 'Fallback Description')
  assert.ok(metadata._warnings, 'Warnings sollten vorhanden sein')
  assert.ok(metadata._warnings.length > 0, 'Es sollten Warnings existieren')
})

// A complete record validates without errors
test('AMB Extractor: validiert Metadaten', () => {
  const completeMetadata = {
    name: 'Test',
    description: 'Description',
    license: 'CC-BY-4.0',
    creator: [{ name: 'Max' }],
    datePublished: '2025-10-01',
    about: ['topic'],
    id: 'https://example.org/test'
  }
  const validation = validateAMBMetadata(completeMetadata)
  assert.strictEqual(validation.valid, true)
  assert.strictEqual(validation.errors.length, 0)
})

// --- Parser ---------------------------------------------------------------

test('Parser: parst Markdown-String', async () => {
  const markdown = `---
title: Test
---
# Heading
Some **bold** text.`
  const result = await parseMarkdownString(markdown)
  assert.ok(result.yaml, 'YAML sollte extrahiert sein')
  assert.ok(result.ast, 'AST sollte existieren')
  assert.ok(result.content, 'Content sollte existieren')
  assert.strictEqual(result.yaml.title, 'Test')
})

test('Parser: parst lokale Markdown-Datei', async () => {
  const filePath = join(__dirname, 'fixtures/example.md')
  const result = await parseMarkdownFile(filePath)
  assert.ok(result.yaml, 'YAML sollte extrahiert sein')
  assert.ok(result.metadata, 'Metadaten sollten extrahiert sein')
  assert.ok(result.ast, 'AST sollte existieren')
  // Check the AMB metadata
  assert.strictEqual(result.metadata.name, 'Beispiel für OER-Material')
  assert.strictEqual(result.metadata.type, 'LearningResource')
  assert.ok(result.metadata.creator, 'Creator sollte vorhanden sein')
  assert.strictEqual(result.metadata.creator.length, 2, 'Sollte 2 Creators haben')
})

test('Parser: extrahiert Überschriften', async () => {
  const markdown = `# H1
## H2
### H3`
  const result = await parseMarkdownString(markdown)
  const { extractHeadings } = await import('../src/parser.js')
  const headings = extractHeadings(result.ast)
  assert.strictEqual(headings.length, 3)
  assert.strictEqual(headings[0].level, 1)
  assert.strictEqual(headings[0].text, 'H1')
  assert.strictEqual(headings[1].level, 2)
  assert.strictEqual(headings[2].level, 3)
})

test('Parser: extrahiert Links', async () => {
  const markdown = `[Link 1](https://example.com)
[Link 2](https://github.com "GitHub")`
  const result = await parseMarkdownString(markdown)
  const { extractLinks } = await import('../src/parser.js')
  const links = extractLinks(result.ast)
  assert.strictEqual(links.length, 2)
  assert.strictEqual(links[0].url, 'https://example.com')
  assert.strictEqual(links[0].text, 'Link 1')
  assert.strictEqual(links[1].url, 'https://github.com')
  assert.strictEqual(links[1].title, 'GitHub')
})

test('Parser: extrahiert Bilder', async () => {
  const markdown = `![Alt Text](image.png "Title")`
  const result = await parseMarkdownString(markdown)
  const { extractImages } = await import('../src/parser.js')
  const images = extractImages(result.ast)
  assert.strictEqual(images.length, 1)
  assert.strictEqual(images[0].url, 'image.png')
  assert.strictEqual(images[0].alt, 'Alt Text')
  assert.strictEqual(images[0].title, 'Title')
})

// No success message is printed here: a top-level console.log runs while the
// module is loaded, before the tests above have finished and regardless of
// their outcome. The node:test runner reports the pass/fail summary itself.