M3-AP-004: Vorprüfung auf Unicode-fähigen brauchbaren Text korrigiert

2026-04-01 19:07:03 +02:00
parent a9407aaba2
commit c482b20df9
9 changed files with 568 additions and 81 deletions
@@ -0,0 +1,46 @@
+package de.gecheckt.pdf.umbenenner.domain.model;
+
+import java.util.Objects;
+
+/**
+ * Decision variant of {@link M3ProcessingDecision} for a document that failed an M3 pre-check.
+ * <p>
+ * The record carries two components:
+ * <ul>
+ *   <li>{@code candidate} – the original document candidate metadata (for correlation)</li>
+ *   <li>{@code failureReason} – a free-text description of why the pre-check failed</li>
+ * </ul>
+ * <p>
+ * Reasons include:
+ * <ul>
+ *   <li>"No usable text" – extraction yielded no meaningful content</li>
+ *   <li>"Page limit exceeded" – document exceeds the configured page limit</li>
+ *   <li>"Technical extraction error" – I/O or PDFBox failure (NOTE(review): may be outdated – confirm)</li>
+ * </ul>
+ * <p>
+ * A document with this decision will not proceed further in the current batch run.
+ *
+ * @param candidate the source document metadata; never {@code null}
+ * @param failureReason a human-readable explanation of the pre-check failure; never {@code null} or empty
+ * @since M3-AP-001
+ */
+public record M3PreCheckFailed(
+    SourceDocumentCandidate candidate,
+    String failureReason
+) implements M3ProcessingDecision {
+    /**
+     * Compact canonical constructor that validates both components before they are stored.
+     * Only {@link String#isEmpty()} is checked, so a whitespace-only reason is accepted.
+     * @param candidate must be non-null
+     * @param failureReason must be non-null and non-empty
+     * @throws NullPointerException if either parameter is null
+     * @throws IllegalArgumentException if failureReason is empty
+     */
+    public M3PreCheckFailed {
+        Objects.requireNonNull(candidate, "candidate must not be null");
+        Objects.requireNonNull(failureReason, "failureReason must not be null");
+        if (failureReason.isEmpty()) {
+            throw new IllegalArgumentException("failureReason must not be empty");
+        }
+    }
+}
@@ -0,0 +1,54 @@
+package de.gecheckt.pdf.umbenenner.domain.model;
+
+/**
+ * Enumeration of M3 pre-check failure reasons, each paired with a fixed description text.
+ * <p>
+ * These are the deterministic content errors that can occur during M3 pre-check evaluation.
+ * They describe failures caused by the document content, as opposed to technical extraction failures.
+ * <p>
+ * Deterministic content errors:
+ * <ul>
+ *   <li>{@link #NO_USABLE_TEXT}: The extracted text contains no meaningful content after normalization.</li>
+ *   <li>{@link #PAGE_LIMIT_EXCEEDED}: The document exceeds the configured page limit.</li>
+ * </ul>
+ * <p>
+ * Note: Technical extraction failures (I/O errors, PDFBox failures) are not M3 pre-check reasons;
+ * they are represented as {@link PdfExtractionTechnicalError} in the extraction result.
+ *
+ * @since M3-AP-004
+ */
+public enum M3PreCheckFailureReason {
+    /**
+     * The extracted PDF text, after normalization, contains no letters or digits.
+     * <p>
+     * This is a deterministic content error: reprocessing the same file in a later run
+     * will have the same outcome unless the source file is changed.
+     * <p>
+     * M3 retry policy: exactly one retry in a later batch run.
+     */
+    NO_USABLE_TEXT("No usable text in extracted PDF content"),
+
+    /**
+     * The document's page count exceeds the configured limit.
+     * <p>
+     * This is a deterministic content error: the page count will not change unless the source file is modified.
+     * <p>
+     * M3 retry policy: exactly one retry in a later batch run.
+     */
+    PAGE_LIMIT_EXCEEDED("Document page count exceeds configured limit");
+
+    private final String description; // human-readable text, assigned once per constant
+    /** Stores the description text passed in by the constant declaration. */
+    M3PreCheckFailureReason(String description) {
+        this.description = description;
+    }
+
+    /**
+     * Returns the human-readable description assigned to this failure reason.
+     *
+     * @return the description text given in the constant's declaration; never {@code null}
+     */
+    public String getDescription() {
+        return description;
+    }
+}
@@ -0,0 +1,36 @@
+package de.gecheckt.pdf.umbenenner.domain.model;
+
+import java.util.Objects;
+
+/**
+ * Decision variant of {@link M3ProcessingDecision} for a document that passed all M3 pre-checks.
+ * <p>
+ * The record carries two components:
+ * <ul>
+ *   <li>{@code candidate} – the original document candidate metadata</li>
+ *   <li>{@code extraction} – the successful PDF text extraction result</li>
+ * </ul>
+ * <p>
+ * A document with this decision is ready to proceed to M4 and later milestones
+ * (fingerprinting, persistence, AI integration, filename generation, target copy).
+ *
+ * @param candidate the source document metadata; never {@code null}
+ * @param extraction the successful text extraction result; never {@code null}
+ * @since M3-AP-001
+ */
+public record M3PreCheckPassed(
+    SourceDocumentCandidate candidate,
+    PdfExtractionSuccess extraction
+) implements M3ProcessingDecision {
+    /**
+     * Compact canonical constructor that rejects {@code null} components before they are stored.
+     *
+     * @param candidate must be non-null
+     * @param extraction must be non-null
+     * @throws NullPointerException if either parameter is null
+     */
+    public M3PreCheckPassed {
+        Objects.requireNonNull(candidate, "candidate must not be null");
+        Objects.requireNonNull(extraction, "extraction must not be null");
+    }
+}
@@ -1,13 +1,12 @@
 package de.gecheckt.pdf.umbenenner.domain.model;

-import java.util.Objects;
-
 /**
 * Sealed interface representing the outcome of M3 document pre-checks.
 * <p>
- * This is a placeholder interface introduced in AP-001 to establish the architectural
+ * This interface, introduced in AP-001, establishes the architectural
 * pattern for M3 pre-check results. The actual pre-check logic (fachlich validation
- * such as "brauchbarer Text" and "Seitenlimit") is implemented in later APs (AP-004, AP-005).
+ * such as "brauchbarer Text" and "Seitenlimit") is implemented in AP-004 via
+ * {@link de.gecheckt.pdf.umbenenner.application.service.M3PreCheckEvaluator}.
 * <p>
 * There are two allowed implementations:
 * <ul>
@@ -29,79 +28,3 @@ public sealed interface M3ProcessingDecision
    permits M3PreCheckPassed, M3PreCheckFailed {
    // Marker interface; concrete implementations define structure
 }
-
-/**
- * Represents a document that passed all M3 pre-checks.
- * <p>
- * This result encapsulates:
- * <ul>
- *   <li>The original document candidate metadata</li>
- *   <li>The successful PDF text extraction result</li>
- * </ul>
- * <p>
- * A document with this decision is ready to proceed to M4 and later milestones
- * (fingerprinting, persistence, KI integration, filename generation, target copy).
- *
- * @param candidate the source document metadata
- * @param extraction the successful text extraction result
- * @since M3-AP-001
- */
-record M3PreCheckPassed(
-    SourceDocumentCandidate candidate,
-    PdfExtractionSuccess extraction
-) implements M3ProcessingDecision {
-    /**
-     * Constructor with validation.
-     *
-     * @param candidate must be non-null
-     * @param extraction must be non-null
-     * @throws NullPointerException if either parameter is null
-     */
-    M3PreCheckPassed {
-        Objects.requireNonNull(candidate, "candidate must not be null");
-        Objects.requireNonNull(extraction, "extraction must not be null");
-    }
-}
-
-/**
- * Represents a document that failed an M3 pre-check.
- * <p>
- * This result encapsulates:
- * <ul>
- *   <li>The original document candidate metadata (for correlation)</li>
- *   <li>A description of why the pre-check failed</li>
- * </ul>
- * <p>
- * Reasons include:
- * <ul>
- *   <li>"No usable text" – extraction yielded no meaningful content</li>
- *   <li>"Page limit exceeded" – document exceeds the configured page limit</li>
- *   <li>"Technical extraction error" – I/O or PDFBox failure (may be retryable later)</li>
- * </ul>
- * <p>
- * A document with this decision will not proceed further in the current batch run.
- *
- * @param candidate the source document metadata
- * @param failureReason a human-readable explanation of the pre-check failure
- * @since M3-AP-001
- */
-record M3PreCheckFailed(
-    SourceDocumentCandidate candidate,
-    String failureReason
-) implements M3ProcessingDecision {
-    /**
-     * Constructor with validation.
-     *
-     * @param candidate must be non-null
-     * @param failureReason must be non-null and non-empty
-     * @throws NullPointerException if either parameter is null
-     * @throws IllegalArgumentException if failureReason is empty
-     */
-    M3PreCheckFailed {
-        Objects.requireNonNull(candidate, "candidate must not be null");
-        Objects.requireNonNull(failureReason, "failureReason must not be null");
-        if (failureReason.isEmpty()) {
-            throw new IllegalArgumentException("failureReason must not be empty");
-        }
-    }
-}
@@ -13,6 +13,17 @@
 *   <li>{@link de.gecheckt.pdf.umbenenner.domain.model.M3ProcessingDecision} — sealed result of M3 pre-checks (M3-AP-001)</li>
 * </ul>
 * <p>
+ * Additional classes introduced in M3:
+ * <ul>
+ *   <li>{@link de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckFailureReason} — enumeration of M3 pre-check failure reasons (M3-AP-004)</li>
+ * </ul>
+ * <p>
+ * Implementation classes:
+ * <ul>
+ *   <li>{@link de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckPassed} — document passed M3 pre-checks (M3-AP-001, M3-AP-004)</li>
+ *   <li>{@link de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckFailed} — document failed M3 pre-check (M3-AP-001, M3-AP-004)</li>
+ * </ul>
+ * <p>
 * All classes in this package are:
 * <ul>
 *   <li>Infrastructure-agnostic (no database, filesystem, network, or framework dependencies)</li>