M3-AP-004: Vorprüfung auf Unicode-fähigen brauchbaren Text korrigiert

2026-04-01 19:07:03 +02:00
parent a9407aaba2
commit c482b20df9
9 changed files with 568 additions and 81 deletions
@@ -0,0 +1,119 @@
+package de.gecheckt.pdf.umbenenner.application.service;
+
+import de.gecheckt.pdf.umbenenner.application.config.StartConfiguration;
+import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckFailureReason;
+import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckFailed;
+import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckPassed;
+import de.gecheckt.pdf.umbenenner.domain.model.M3ProcessingDecision;
+import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionSuccess;
+import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate;
+
+import java.util.Objects;
+
+/**
+ * Evaluates whether a successfully extracted PDF passes M3 pre-checks.
+ * <p>
+ * M3 pre-checks verify that:
+ * <ul>
+ *   <li>The extracted text contains at least one letter or digit (Unicode-aware)</li>
+ *   <li>The document's page count does not exceed the configured limit</li>
+ * </ul>
+ * <p>
+ * A document that passes both pre-checks is ready to proceed to M4 and later milestones.
+ * A document that fails a pre-check is classified with a specific deterministic failure reason
+ * and will not proceed further in the current batch run.
+ * <p>
+ * This service is stateless and thread-safe.
+ *
+ * @since M3-AP-004
+ */
+public class M3PreCheckEvaluator {
+
+    /**
+     * Evaluates M3 pre-checks for a successfully extracted PDF document.
+     * <p>
+     * Pre-check logic, applied in this order (the first failing check determines the reason):
+     * <ol>
+     *   <li>Check if extracted text contains at least one letter or digit</li>
+     *   <li>Check if document page count does not exceed the configured limit</li>
+     * </ol>
+     * <p>
+     * Returns {@link M3PreCheckPassed} if both checks pass, or {@link M3PreCheckFailed}
+     * with a specific reason if any check fails.
+     *
+     * @param candidate the source document metadata
+     * @param extraction the successfully extracted PDF content
+     * @param configuration the startup configuration (used for maxPages limit)
+     * @return the pre-check decision: passed or failed with reason
+     * @throws NullPointerException if any parameter is null
+     */
+    public static M3ProcessingDecision evaluate(
+            SourceDocumentCandidate candidate,
+            PdfExtractionSuccess extraction,
+            StartConfiguration configuration) {
+
+        Objects.requireNonNull(candidate, "candidate must not be null");
+        Objects.requireNonNull(extraction, "extraction must not be null");
+        Objects.requireNonNull(configuration, "configuration must not be null");
+
+        // Pre-check 1: Verify document has usable text
+        if (!hasUsableText(extraction.extractedText())) {
+            return new M3PreCheckFailed(
+                    candidate,
+                    M3PreCheckFailureReason.NO_USABLE_TEXT.getDescription()
+            );
+        }
+
+        // Pre-check 2: Verify document page count does not exceed configured limit
+        if (extraction.pageCount().exceedsLimit(configuration.maxPages())) {
+            return new M3PreCheckFailed(
+                    candidate,
+                    M3PreCheckFailureReason.PAGE_LIMIT_EXCEEDED.getDescription()
+            );
+        }
+
+        // All pre-checks passed
+        return new M3PreCheckPassed(candidate, extraction);
+    }
+
+    /**
+     * Determines whether the extracted text contains at least one meaningful character.
+     * <p>
+     * Definition of "usable text" for M3:
+     * <ul>
+     *   <li>At least one letter or digit is present anywhere in the text</li>
+     *   <li>Pure whitespace or only special characters do not qualify as usable text</li>
+     *   <li>Letters and digits include Unicode characters (e.g., ÄÖÜß, äöüß, etc.)</li>
+     * </ul>
+     * <p>
+     * Examples: {@code "  42 "} and {@code "Ä"} are usable; {@code ""}, {@code "   "} and
+     * {@code "---"} are not.
+     * <p>
+     * Implementation notes:
+     * <ul>
+     *   <li>The text is scanned by code point, not by UTF-16 {@code char}, so letters and digits
+     *       outside the Basic Multilingual Plane (stored as surrogate pairs) are recognized</li>
+     *   <li>Classification uses {@link Character#isLetterOrDigit(int)} (not limited to ASCII)</li>
+     *   <li>Unpaired surrogates are never classified as letters or digits</li>
+     *   <li>No trimming is needed: whitespace is never a letter or digit</li>
+     * </ul>
+     *
+     * @param text the extracted text from the PDF (non-null, may be empty)
+     * @return true if text contains at least one letter or digit (Unicode-aware)
+     * @throws NullPointerException if {@code text} is null
+     */
+    private static boolean hasUsableText(String text) {
+        Objects.requireNonNull(text, "text must not be null");
+
+        // Iterate code points: a per-char scan sees supplementary characters only as
+        // surrogate halves, which are never classified as letters or digits.
+        return text.codePoints().anyMatch(Character::isLetterOrDigit);
+    }
+
+    /**
+     * Prevents instantiation; this class only exposes static methods.
+     */
+    private M3PreCheckEvaluator() {
+        // Static utility class – no instances
+    }
+}
@@ -0,0 +1,20 @@
+/**
+ * Application-level services for business logic evaluation.
+ * <p>
+ * This package contains stateless, pure-logic services that evaluate document content
+ * and apply business rules. Services in this package:
+ * <ul>
+ *   <li>Do not manage state or resources</li>
+ *   <li>Do not depend on infrastructure (database, filesystem, network)</li>
+ *   <li>Can be tested with simple unit tests and in-memory mocks</li>
+ *   <li>Are reused by multiple use cases or adapters</li>
+ * </ul>
+ * <p>
+ * Current services:
+ * <ul>
+ *   <li>{@link de.gecheckt.pdf.umbenenner.application.service.M3PreCheckEvaluator} — M3 pre-check evaluation (M3-AP-004)</li>
+ * </ul>
+ *
+ * @since M3-AP-004
+ */
+package de.gecheckt.pdf.umbenenner.application.service;