M3-Kernobjekte und Ports für Quellkandidaten und PDF-Auslese eingeführt

2026-04-01 18:11:35 +02:00
parent d425815788
commit dd282e8f7b
10 changed files with 550 additions and 2 deletions
@@ -0,0 +1,65 @@
+package de.gecheckt.pdf.umbenenner.application.port.out;
+
+import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionResult;
+import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate;
+
+/**
+ * Outbound port for extracting text content and page count from a PDF document.
+ * <p>
+ * This interface abstracts PDF text extraction, allowing the application layer
+ * to remain independent of the underlying PDF library or extraction mechanism.
+ * <p>
+ * Responsibilities of the implementing adapter:
+ * <ul>
+ *   <li>Read the PDF file identified by the candidate</li>
+ *   <li>Extract all text content from the PDF</li>
+ *   <li>Count the total number of pages</li>
+ *   <li>Distinguish between content errors (PDF exists but is not extractable) and technical errors</li>
+ *   <li>Return a structured {@link PdfExtractionResult} encoding success or failure type</li>
+ *   <li>Encapsulate all PDF library details (no PDFBox types appear in domain/application)</li>
+ * </ul>
+ * <p>
+ * Architecture notes:
+ * <ul>
+ *   <li>The adapter implementation is the ONLY place where PDFBox appears in the codebase</li>
+ *   <li>Extraction outcomes are returned as structured types, never thrown as exceptions</li>
+ *   <li>The three result variants ({@code PdfExtractionSuccess}, {@code PdfExtractionContentError},
+ *       {@code PdfExtractionTechnicalError}) allow callers to distinguish recoverable from non-recoverable failures</li>
+ * </ul>
+ * <p>
+ * Non-goals of this port (handled in later milestones):
+ * <ul>
+ *   <li>Validation of extracted text content (e.g., minimum length, character sets)</li>
+ *   <li>Checking page count against limits</li>
+ *   <li>Any form of document fingerprinting</li>
+ *   <li>Persistence or caching of results</li>
+ * </ul>
+ *
+ * @since M3-AP-001
+ */
+public interface PdfTextExtractionPort {
+
+    /**
+     * Extracts text content and page count from a single PDF document.
+     * <p>
+     * Performs a single extraction operation. Extraction failures are never thrown as exceptions;
+     * they are encoded in the result type. A null candidate is the one documented exception (see below).
+     * <p>
+     * The candidate parameter serves as both:
+     * <ul>
+     *   <li>Document identification (e.g., filename for logging)</li>
+     *   <li>Adapter guidance (the adapter uses the candidate's locator to locate the actual file)</li>
+     * </ul>
+     *
+     * @param candidate the document to extract; non-null
+     * @return a {@link PdfExtractionResult} encoding the outcome:
+     *         <ul>
+     *           <li>Success: contains extracted text and page count</li>
+     *           <li>Content error: PDF exists but is not extractable (e.g., image-only, encrypted)</li>
+     *           <li>Technical error: I/O failure, PDFBox library error, etc.</li>
+     *         </ul>
+     * @throws NullPointerException if candidate is null
+     * @see PdfExtractionResult
+     */
+    PdfExtractionResult extractTextAndPageCount(SourceDocumentCandidate candidate);
+}
@@ -0,0 +1,39 @@
+package de.gecheckt.pdf.umbenenner.application.port.out;
+
+/**
+ * Signals that the source documents could not be accessed or read.
+ * <p>
+ * The {@link SourceDocumentCandidatesPort} raises this exception when:
+ * <ul>
+ *   <li>The source folder does not exist or is not readable</li>
+ *   <li>Permission issues prevent directory listing</li>
+ *   <li>Filesystem I/O errors occur during candidate discovery</li>
+ * </ul>
+ * <p>
+ * Being a {@link RuntimeException}, it lets adapters propagate filesystem errors
+ * without forcing try/catch blocks in the application layer.
+ *
+ * @since M3-AP-001
+ */
+public final class SourceDocumentAccessException extends RuntimeException {
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * Creates the exception from a message and the failure that triggered it.
+     *
+     * @param message descriptive message about the access failure
+     * @param cause the underlying exception that caused this failure
+     */
+    public SourceDocumentAccessException(String message, Throwable cause) {
+        super(message, cause);
+    }
+
+    /**
+     * Creates the exception from a message only, without an underlying cause.
+     *
+     * @param message descriptive message about the access failure
+     */
+    public SourceDocumentAccessException(String message) {
+        super(message);
+    }
+}
@@ -0,0 +1,49 @@
+package de.gecheckt.pdf.umbenenner.application.port.out;
+
+import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate;
+
+import java.util.List;
+
+/**
+ * Outbound port for loading PDF document candidates from the source folder.
+ * <p>
+ * This interface abstracts filesystem access, allowing the application layer
+ * to remain independent of how candidates are discovered or retrieved.
+ * <p>
+ * Responsibilities of the implementing adapter:
+ * <ul>
+ *   <li>Scan the configured source folder</li>
+ *   <li>Identify only PDF files (by extension or content type)</li>
+ *   <li>Ignore non-PDF files and directories</li>
+ *   <li>Return candidates in deterministic order (e.g., alphabetical by filename)</li>
+ *   <li>Convert filesystem objects to {@link SourceDocumentCandidate} records</li>
+ *   <li>Encapsulate all filesystem access details (no Path or File leakage)</li>
+ * </ul>
+ * <p>
+ * Architecture notes:
+ * <ul>
+ *   <li>The adapter implementation is the ONLY place where java.nio.file.Path or java.io.File appears</li>
+ *   <li>The application never sees or handles raw filesystem objects</li>
+ *   <li>This enforces the hexagonal boundary and allows easy adapter swapping (e.g., cloud storage)</li>
+ * </ul>
+ *
+ * @since M3-AP-001
+ */
+public interface SourceDocumentCandidatesPort {
+
+    /**
+     * Loads all PDF candidates from the configured source folder.
+     * <p>
+     * Returns a list of candidates representing discovered PDF files.
+     * The list is guaranteed to be in deterministic order to enable reproducible runs.
+     * <p>
+     * Non-goal: This method does NOT filter candidates by any business criteria
+     * (such as file size, corruption checks, or content validation).
+     * It delivers raw candidates from the filesystem; business evaluation happens later.
+     *
+     * @return a list of discovered PDF candidates in deterministic order (may be empty if no PDFs found)
+     * @throws SourceDocumentAccessException (unchecked) if the source folder cannot be read or accessed
+     * @see SourceDocumentCandidate
+     */
+    List<SourceDocumentCandidate> loadCandidates() throws SourceDocumentAccessException;
+}
@@ -14,10 +14,20 @@
 *       — System time access for timestamps and run context</li>
 * </ul>
 * <p>
+ * M3-AP-001 ports:
+ * <ul>
+ *   <li>{@link de.gecheckt.pdf.umbenenner.application.port.out.SourceDocumentCandidatesPort}
+ *       — Load PDF document candidates from the source folder</li>
+ *   <li>{@link de.gecheckt.pdf.umbenenner.application.port.out.PdfTextExtractionPort}
+ *       — Extract text content and page count from a single PDF</li>
+ * </ul>
+ * <p>
 * Exception types:
 * <ul>
 *   <li>{@link de.gecheckt.pdf.umbenenner.application.port.out.RunLockUnavailableException}
- *       — Thrown when run lock cannot be acquired (another instance running)</li>
+ *       — Thrown when run lock cannot be acquired (another instance running) (M2)</li>
+ *   <li>{@link de.gecheckt.pdf.umbenenner.application.port.out.SourceDocumentAccessException}
+ *       — Thrown when source folder cannot be read or accessed (M3)</li>
 * </ul>
 * <p>
 * Architecture Rule: Outbound ports are implementation-agnostic and contain no business logic.
@@ -0,0 +1,107 @@
+package de.gecheckt.pdf.umbenenner.domain.model;
+
+import java.util.Objects;
+
+/**
+ * Sealed interface representing the outcome of M3 document pre-checks.
+ * <p>
+ * This is a placeholder interface introduced in AP-001 to establish the architectural
+ * pattern for M3 pre-check results. The actual pre-check logic (business validation
+ * such as "usable text" and "page limit") is implemented in later APs (AP-004, AP-005).
+ * <p>
+ * There are two allowed implementations:
+ * <ul>
+ *   <li>{@link M3PreCheckPassed}: Document passed all M3 pre-checks and is ready for AI integration</li>
+ *   <li>{@link M3PreCheckFailed}: Document failed an M3 pre-check and will not proceed further in this run</li>
+ * </ul>
+ * <p>
+ * Design principles:
+ * <ul>
+ *   <li>Sealed: enforces exhaustive handling of all cases</li>
+ *   <li>Carries both success path ({@link M3PreCheckPassed}) and failure reason ({@link M3PreCheckFailed})</li>
+ *   <li>Defined early (AP-001) to ensure architecture is established before logic arrives</li>
+ *   <li>Future-extensible for additional pre-check variants in later milestones</li>
+ * </ul>
+ *
+ * @since M3-AP-001
+ */
+public sealed interface M3ProcessingDecision
+    permits M3PreCheckPassed, M3PreCheckFailed {
+    // Marker interface; concrete implementations define structure. NOTE(review): both permitted records are declared without `public` in this file, so only this package can name or construct them — confirm intended.
+}
+
+/**
+ * Pre-check decision for a document that cleared every M3 pre-check.
+ * <p>
+ * Bundles the two pieces of information carried forward:
+ * <ul>
+ *   <li>the metadata of the original document candidate</li>
+ *   <li>the successful result of the PDF text extraction</li>
+ * </ul>
+ * <p>
+ * Documents carrying this decision may continue to M4 and later milestones
+ * (fingerprinting, persistence, AI integration, filename generation, target copy).
+ *
+ * @param candidate the source document metadata
+ * @param extraction the successful text extraction result
+ * @since M3-AP-001
+ */
+record M3PreCheckPassed(SourceDocumentCandidate candidate, PdfExtractionSuccess extraction)
+    implements M3ProcessingDecision {
+    /**
+     * Rejects null components so that every instance is fully populated.
+     *
+     * @throws NullPointerException if {@code candidate} or {@code extraction} is null
+     */
+    M3PreCheckPassed {
+        if (candidate == null) {
+            throw new NullPointerException("candidate must not be null");
+        }
+        if (extraction == null) {
+            throw new NullPointerException("extraction must not be null");
+        }
+    }
+}
+
+/**
+ * Pre-check decision for a document that was rejected by an M3 pre-check.
+ * <p>
+ * Bundles:
+ * <ul>
+ *   <li>the metadata of the original document candidate (for correlation)</li>
+ *   <li>a description of why the pre-check failed</li>
+ * </ul>
+ * <p>
+ * Typical reasons:
+ * <ul>
+ *   <li>"No usable text" – extraction yielded no meaningful content</li>
+ *   <li>"Page limit exceeded" – document exceeds the configured page limit</li>
+ *   <li>"Technical extraction error" – I/O or PDFBox failure (may be retryable later)</li>
+ * </ul>
+ * <p>
+ * Documents carrying this decision are not processed any further in the current batch run.
+ *
+ * @param candidate the source document metadata
+ * @param failureReason a human-readable explanation of the pre-check failure
+ * @since M3-AP-001
+ */
+record M3PreCheckFailed(SourceDocumentCandidate candidate, String failureReason)
+    implements M3ProcessingDecision {
+    /**
+     * Validates both components before the record is created.
+     * <p>
+     * The checks run in declaration order: the candidate first, then the
+     * failure reason (null before empty).
+     *
+     * @param candidate must be non-null
+     * @param failureReason must be non-null and non-empty
+     * @throws NullPointerException if either parameter is null
+     * @throws IllegalArgumentException if failureReason is empty
+     */
+    M3PreCheckFailed {
+        Objects.requireNonNull(candidate, "candidate must not be null");
+        if (Objects.requireNonNull(failureReason, "failureReason must not be null").isEmpty()) {
+            throw new IllegalArgumentException("failureReason must not be empty");
+        }
+    }
+}
@@ -0,0 +1,122 @@
+package de.gecheckt.pdf.umbenenner.domain.model;
+
+import java.util.Objects;
+
+/**
+ * Sealed interface representing the outcome of PDF text extraction.
+ * <p>
+ * This interface uses Java 17+ sealed types to enforce exhaustive case handling.
+ * There are exactly three allowed implementations:
+ * <ul>
+ *   <li>{@link PdfExtractionSuccess}: Text and page count were successfully extracted</li>
+ *   <li>{@link PdfExtractionContentError}: PDF exists but is not extractable (content problem)</li>
+ *   <li>{@link PdfExtractionTechnicalError}: Technical failure during extraction (I/O, etc.)</li>
+ * </ul>
+ * <p>
+ * Design principles:
+ * <ul>
+ *   <li>No exceptions thrown: extraction outcomes are encoded in the type system</li>
+ *   <li>Exhaustive: pattern matching forces handling of all cases</li>
+ *   <li>Distinct error types: allows retry logic to differentiate recoverable from non-recoverable failures</li>
+ *   <li>No PDFBox or filesystem types: pure domain representation</li>
+ * </ul>
+ *
+ * @since M3-AP-001
+ */
+public sealed interface PdfExtractionResult
+    permits PdfExtractionSuccess, PdfExtractionContentError, PdfExtractionTechnicalError {
+    // Marker interface; concrete implementations define structure. NOTE(review): the permitted records are declared without `public` in this file, so an adapter in another package cannot construct or match them — confirm intended.
+}
+
+/**
+ * Outcome variant for a PDF whose text and page count could both be extracted.
+ * <p>
+ * An instance always carries a non-null text and a non-null page count;
+ * the text itself may still be the empty string.
+ *
+ * @param extractedText the full text content extracted from the PDF (non-null, may be empty string)
+ * @param pageCount the number of pages in the PDF (non-null, validated &gt;= 1)
+ * @since M3-AP-001
+ */
+record PdfExtractionSuccess(String extractedText, PdfPageCount pageCount)
+    implements PdfExtractionResult {
+    /**
+     * Rejects null components; an empty {@code extractedText} is accepted.
+     *
+     * @throws NullPointerException if {@code extractedText} or {@code pageCount} is null
+     */
+    PdfExtractionSuccess {
+        if (extractedText == null) {
+            throw new NullPointerException("extractedText must not be null");
+        }
+        if (pageCount == null) {
+            throw new NullPointerException("pageCount must not be null");
+        }
+    }
+}
+
+/**
+ * Outcome variant for a PDF that could be read but whose content cannot be extracted.
+ * <p>
+ * The file itself is readable (no I/O error); the problem lies in its content.
+ * <p>
+ * Examples:
+ * <ul>
+ *   <li>PDF is image-only (not OCR'd)</li>
+ *   <li>PDF is encrypted and cannot be unlocked</li>
+ *   <li>PDF is severely corrupted in the content layer</li>
+ * </ul>
+ * <p>
+ * This is typically a deterministic, non-retryable condition for a given source file
+ * (unless the source file is modified and re-scanned in a later run).
+ *
+ * @param reason a human-readable explanation of why extraction failed (non-null, non-empty)
+ * @since M3-AP-001
+ */
+record PdfExtractionContentError(String reason) implements PdfExtractionResult {
+    /**
+     * Validates the reason: null is checked first, then emptiness.
+     *
+     * @param reason must be non-null and non-empty
+     * @throws NullPointerException if reason is null
+     * @throws IllegalArgumentException if reason is empty
+     */
+    PdfExtractionContentError {
+        if (Objects.requireNonNull(reason, "reason must not be null").isEmpty()) {
+            throw new IllegalArgumentException("reason must not be empty");
+        }
+    }
+}
+
+/**
+ * Outcome variant for a technical (infrastructure) failure during PDF text extraction.
+ * <p>
+ * Something went wrong with the extraction process itself, for example:
+ * <ul>
+ *   <li>file I/O errors</li>
+ *   <li>PDFBox library problems</li>
+ *   <li>out-of-memory conditions</li>
+ * </ul>
+ * Such failures are typically retryable in later batch runs, as they may be
+ * transient infrastructure issues.
+ *
+ * @param errorMessage a description of what went wrong (non-null, non-empty)
+ * @param cause the underlying exception, if any (may be null)
+ * @since M3-AP-001
+ */
+record PdfExtractionTechnicalError(String errorMessage, Throwable cause)
+    implements PdfExtractionResult {
+    /**
+     * Validates the message (null first, then emptiness); {@code cause} is not checked.
+     *
+     * @param errorMessage must be non-null and non-empty
+     * @param cause may be null
+     * @throws NullPointerException if errorMessage is null
+     * @throws IllegalArgumentException if errorMessage is empty
+     */
+    PdfExtractionTechnicalError {
+        if (Objects.requireNonNull(errorMessage, "errorMessage must not be null").isEmpty()) {
+            throw new IllegalArgumentException("errorMessage must not be empty");
+        }
+    }
+}
@@ -0,0 +1,45 @@
+package de.gecheckt.pdf.umbenenner.domain.model;
+
+/**
+ * Typed representation of a PDF document's page count.
+ * <p>
+ * Wrapping the number in its own record keeps page counts apart from other
+ * numeric values (character counts, file sizes, error codes, etc.).
+ * <p>
+ * Design principles:
+ * <ul>
+ *   <li>Validated: only positive page counts are allowed</li>
+ *   <li>Type-safe: prevents confusion with other numeric metrics</li>
+ *   <li>Self-documenting: {@code PdfPageCount} is clearer than naked {@code int}</li>
+ *   <li>Future-extensible: can add validation rules per milestone without signature changes</li>
+ * </ul>
+ *
+ * @since M3-AP-001
+ */
+public record PdfPageCount(int value) {
+    /**
+     * Validating compact constructor.
+     * <p>
+     * Zero and negative values are rejected.
+     *
+     * @param value must be &gt;= 1
+     * @throws IllegalArgumentException if value &lt; 1
+     */
+    public PdfPageCount {
+        if (value <= 0) {
+            throw new IllegalArgumentException("Page count must be >= 1, but got: " + value);
+        }
+    }
+
+    /**
+     * Tells whether this page count is strictly greater than the given limit.
+     * <p>
+     * A count equal to the limit does not exceed it.
+     *
+     * @param limit the maximum allowed page count
+     * @return true if this count exceeds the limit
+     */
+    public boolean exceedsLimit(int limit) {
+        return limit < value;
+    }
+}
@@ -0,0 +1,59 @@
+package de.gecheckt.pdf.umbenenner.domain.model;
+
+import java.util.Objects;
+
+/**
+ * Represents a discovered PDF candidate from the source folder.
+ * <p>
+ * This record encapsulates the minimal, infrastructure-agnostic metadata
+ * needed to identify, correlate, and later extract a document during processing.
+ * <p>
+ * It deliberately does NOT expose filesystem paths or file handles directly.
+ * The hexagonal boundary is maintained: adapters map filesystem objects to this
+ * representation. The physical document location is carried opaquely via
+ * {@link SourceDocumentLocator}, which only adapters interpret.
+ * <p>
+ * Fields:
+ * <ul>
+ *   <li>{@code uniqueIdentifier} — human-readable name for logging and correlation (e.g. filename)</li>
+ *   <li>{@code fileSizeBytes} — size in bytes; {@code 0} is allowed so that empty documents can be detected later</li>
+ *   <li>{@code locator} — opaque reference passed through unchanged to the extraction adapter;
+ *       Domain and Application never interpret its value</li>
+ * </ul>
+ * <p>
+ * No java.io.File or java.nio.file.Path references appear in this record.
+ *
+ * @since M3-AP-001
+ */
+public record SourceDocumentCandidate(
+    String uniqueIdentifier,
+    long fileSizeBytes,
+    SourceDocumentLocator locator
+) {
+    /**
+     * Compact constructor with validation.
+     * <p>
+     * Ensures all parameters are non-null and meaningful:
+     * <ul>
+     *   <li>{@code uniqueIdentifier} must be non-null and non-empty</li>
+     *   <li>{@code fileSizeBytes} must not be negative (zero-byte files remain representable)</li>
+     *   <li>{@code locator} must be non-null</li>
+     * </ul>
+     *
+     * @param uniqueIdentifier non-null, non-empty identifier for logging and correlation
+     * @param fileSizeBytes must be &gt;= 0
+     * @param locator non-null opaque locator; only adapters interpret its value
+     * @throws NullPointerException if uniqueIdentifier or locator is null
+     * @throws IllegalArgumentException if uniqueIdentifier is empty or fileSizeBytes &lt; 0
+     */
+    public SourceDocumentCandidate {
+        Objects.requireNonNull(uniqueIdentifier, "uniqueIdentifier must not be null");
+        if (uniqueIdentifier.isEmpty()) {
+            throw new IllegalArgumentException("uniqueIdentifier must not be empty");
+        }
+        if (fileSizeBytes < 0) {
+            throw new IllegalArgumentException("fileSizeBytes must not be negative, but got: " + fileSizeBytes);
+        }
+        Objects.requireNonNull(locator, "locator must not be null");
+    }
+}
@@ -0,0 +1,45 @@
+package de.gecheckt.pdf.umbenenner.domain.model;
+
+import java.util.Objects;
+
+/**
+ * Opaque handle with which an adapter can find again the physical document
+ * that the source-scan adapter originally discovered.
+ * <p>
+ * <strong>This type is deliberately opaque to Domain and Application.</strong>
+ * Neither layer interprets or constructs the contained value. Only adapters
+ * read and write the {@code value} field:
+ * <ul>
+ *   <li>The {@code SourceDocumentCandidatesPort} adapter writes the value when
+ *       constructing a {@link SourceDocumentCandidate}.</li>
+ *   <li>The {@code PdfTextExtractionPort} adapter reads the value to locate
+ *       the physical file for extraction.</li>
+ * </ul>
+ * <p>
+ * The value itself is an adapter-internal convention (e.g. an absolute path string).
+ * Domain and Application never interpret it — they only pass it through.
+ * This preserves the hexagonal boundary: no {@code java.nio.file.Path},
+ * {@code java.io.File}, or other infrastructure types appear outside the adapter layer.
+ * <p>
+ * Coupling: Both the scan adapter and the extraction adapter live in the same
+ * {@code adapter-out} module and share the same encoding convention for the value.
+ * This is an intentional intra-adapter contract, not a cross-layer concern.
+ *
+ * @since M3-AP-001
+ */
+public record SourceDocumentLocator(String value) {
+
+    /**
+     * Validating compact constructor: null is rejected first, then the empty string.
+     * Blank (whitespace-only) values are not rejected by this check.
+     *
+     * @param value non-null, non-empty opaque locator value; content is an adapter-internal convention
+     * @throws NullPointerException     if value is null
+     * @throws IllegalArgumentException if value is empty
+     */
+    public SourceDocumentLocator {
+        if (Objects.requireNonNull(value, "value must not be null").isEmpty()) {
+            throw new IllegalArgumentException("value must not be empty");
+        }
+    }
+}
@@ -3,7 +3,14 @@
 * <p>
 * This package contains the fundamental domain entities and status models required for document processing:
 * <ul>
- *   <li>{@link de.gecheckt.pdf.umbenenner.domain.model.ProcessingStatus} — enumeration of all valid document processing states</li>
+ *   <li>{@link de.gecheckt.pdf.umbenenner.domain.model.ProcessingStatus} — enumeration of all valid document processing states (M2-AP-001)</li>
+ *   <li>{@link de.gecheckt.pdf.umbenenner.domain.model.RunId} — unique identifier for a batch run (M2-AP-003)</li>
+ *   <li>{@link de.gecheckt.pdf.umbenenner.domain.model.BatchRunContext} — technical context for a batch run (M2-AP-003)</li>
+ *   <li>{@link de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate} — discovered PDF from source folder (M3-AP-001)</li>
+ *   <li>{@link de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentLocator} — opaque locator passed from scan adapter to extraction adapter (M3-AP-001)</li>
+ *   <li>{@link de.gecheckt.pdf.umbenenner.domain.model.PdfPageCount} — typed page count validation (M3-AP-001)</li>
+ *   <li>{@link de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionResult} — sealed result of PDF text extraction (M3-AP-001)</li>
+ *   <li>{@link de.gecheckt.pdf.umbenenner.domain.model.M3ProcessingDecision} — sealed result of M3 pre-checks (M3-AP-001)</li>
 * </ul>
 * <p>
 * All classes in this package are: