M3-AP-004: Vorprüfung auf Unicode-fähigen brauchbaren Text korrigiert

2026-04-01 19:07:03 +02:00
parent a9407aaba2
commit c482b20df9
9 changed files with 568 additions and 81 deletions
@@ -0,0 +1,276 @@
+package de.gecheckt.pdf.umbenenner.application.service;
+
+import de.gecheckt.pdf.umbenenner.application.config.StartConfiguration;
+import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckFailed;
+import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckFailureReason;
+import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckPassed;
+import de.gecheckt.pdf.umbenenner.domain.model.M3ProcessingDecision;
+import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionSuccess;
+import de.gecheckt.pdf.umbenenner.domain.model.PdfPageCount;
+import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate;
+import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentLocator;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import java.net.URI;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+/**
+ * Tests for {@link M3PreCheckEvaluator}.
+ * <p>
+ * The tests pin the decisions returned by {@code M3PreCheckEvaluator.evaluate}:
+ * text counts as usable as soon as it contains at least one letter or digit
+ * (including non-ASCII letters), and a document is rejected only when its page
+ * count is strictly greater than the configured maximum.
+ */
+class M3PreCheckEvaluatorTest {
+
+    @TempDir
+    Path tempDir;
+
+    // =========================================================================
+    // Usable-text check
+    // =========================================================================
+
+    @Test
+    void evaluate_passesWhenDocumentHasUsableTextAndValidPageCount() throws Exception {
+        StartConfiguration config = buildConfig(maxPages(10));
+        SourceDocumentCandidate candidate = buildCandidate();
+        PdfExtractionSuccess extraction = new PdfExtractionSuccess("Some meaningful text", new PdfPageCount(5));
+
+        M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);
+
+        assertPassed(result, "Should pass when text is usable and page count is valid");
+        M3PreCheckPassed passed = (M3PreCheckPassed) result;
+        // JUnit argument order is (expected, actual, message).
+        assertSame(candidate, passed.candidate(), "Candidate should be preserved");
+        assertSame(extraction, passed.extraction(), "Extraction should be preserved");
+    }
+
+    @Test
+    void evaluate_failsWithNoUsableTextWhenExtractedTextIsEmpty() throws Exception {
+        M3ProcessingDecision result = evaluate("", 1, 10);
+
+        assertFailedWith(M3PreCheckFailureReason.NO_USABLE_TEXT, result, "Should fail with empty text");
+    }
+
+    @Test
+    void evaluate_failsWithNoUsableTextWhenTextIsOnlyWhitespace() throws Exception {
+        M3ProcessingDecision result = evaluate("   \n\t  \r\n  ", 1, 10);
+
+        assertFailedWith(M3PreCheckFailureReason.NO_USABLE_TEXT, result, "Should fail with whitespace-only text");
+    }
+
+    @Test
+    void evaluate_failsWithNoUsableTextWhenTextContainsOnlySpecialCharacters() throws Exception {
+        M3ProcessingDecision result = evaluate("!@#$%^&*()_+-=[]{}|;:',.<>?/", 1, 10);
+
+        assertFailedWith(M3PreCheckFailureReason.NO_USABLE_TEXT, result, "Should fail with special characters only");
+    }
+
+    @Test
+    void evaluate_passesWithTextContainingSingleLetter() throws Exception {
+        M3ProcessingDecision result = evaluate("a", 1, 10);
+
+        assertPassed(result, "Should pass with single letter");
+    }
+
+    @Test
+    void evaluate_passesWithTextContainingSingleDigit() throws Exception {
+        M3ProcessingDecision result = evaluate("5", 1, 10);
+
+        assertPassed(result, "Should pass with single digit");
+    }
+
+    @Test
+    void evaluate_passesWithTextMixedWithSpecialCharactersIfLettersOrDigitsPresent() throws Exception {
+        M3ProcessingDecision result = evaluate("!@#a$%^&*", 1, 10);
+
+        assertPassed(result, "Should pass when letters/digits are present among special chars");
+    }
+
+    @Test
+    void evaluate_passesWithWhitespaceAroundUsableText() throws Exception {
+        M3ProcessingDecision result = evaluate("   meaningful text   ", 1, 10);
+
+        assertPassed(result, "Should pass when text has meaningful content despite whitespace");
+    }
+
+    @Test
+    void evaluate_passesWithUnicodeGermanUmlauts() throws Exception {
+        M3ProcessingDecision result = evaluate("Äußerst äöüß Großes", 1, 10);
+
+        assertPassed(result, "Should pass with German umlauts (ÄÖÜß)");
+    }
+
+    @Test
+    void evaluate_passesWithOtherUnicodeCharacters() throws Exception {
+        M3ProcessingDecision result = evaluate("Αβγδ 中文 καλημέρα", 1, 10);
+
+        assertPassed(result, "Should pass with Greek, Chinese, and other Unicode letters");
+    }
+
+    // =========================================================================
+    // Page-limit check
+    // =========================================================================
+
+    @Test
+    void evaluate_passesWhenPageCountEqualsLimit() throws Exception {
+        // Boundary case: the limit itself is still allowed, only "greater than" fails.
+        M3ProcessingDecision result = evaluate("Valid text", 5, 5);
+
+        assertPassed(result, "Should pass when page count equals limit (not exceeded)");
+    }
+
+    @Test
+    void evaluate_failsWithPageLimitExceededWhenPageCountExceedsLimit() throws Exception {
+        M3ProcessingDecision result = evaluate("Valid text", 6, 5);
+
+        assertFailedWith(M3PreCheckFailureReason.PAGE_LIMIT_EXCEEDED, result, "Should fail when page count exceeds limit");
+    }
+
+    @Test
+    void evaluate_failsWithPageLimitExceededEvenIfTextIsValid() throws Exception {
+        M3ProcessingDecision result = evaluate("Excellent meaningful text with lots of content", 100, 2);
+
+        assertFailedWith(M3PreCheckFailureReason.PAGE_LIMIT_EXCEEDED, result,
+                "Should fail with page limit exceeded even if text is good");
+    }
+
+    @Test
+    void evaluate_reportsFailureReasonWhenBothChecksFail() throws Exception {
+        // Both pre-checks are violated here (empty text, 10 pages with a limit of 1).
+        // Which of the two reasons is reported is deliberately NOT pinned by this test;
+        // it only requires that the failure carries a non-empty reason.
+        M3ProcessingDecision result = evaluate("", 10, 1);
+
+        assertTrue(result instanceof M3PreCheckFailed, "Should fail when both checks fail");
+        M3PreCheckFailed failed = (M3PreCheckFailed) result;
+        assertNotNull(failed.failureReason());
+        assertFalse(failed.failureReason().isEmpty());
+    }
+
+    // =========================================================================
+    // Null handling
+    // =========================================================================
+
+    @Test
+    void evaluate_throwsNullPointerExceptionWhenCandidateIsNull() throws Exception {
+        StartConfiguration config = buildConfig(maxPages(10));
+        PdfExtractionSuccess extraction = new PdfExtractionSuccess("Valid text", new PdfPageCount(1));
+
+        assertThrows(NullPointerException.class, () ->
+            M3PreCheckEvaluator.evaluate(null, extraction, config)
+        );
+    }
+
+    @Test
+    void evaluate_throwsNullPointerExceptionWhenExtractionIsNull() throws Exception {
+        StartConfiguration config = buildConfig(maxPages(10));
+        SourceDocumentCandidate candidate = buildCandidate();
+
+        assertThrows(NullPointerException.class, () ->
+            M3PreCheckEvaluator.evaluate(candidate, null, config)
+        );
+    }
+
+    @Test
+    void evaluate_throwsNullPointerExceptionWhenConfigurationIsNull() throws Exception {
+        SourceDocumentCandidate candidate = buildCandidate();
+        PdfExtractionSuccess extraction = new PdfExtractionSuccess("Valid text", new PdfPageCount(1));
+
+        assertThrows(NullPointerException.class, () ->
+            M3PreCheckEvaluator.evaluate(candidate, extraction, null)
+        );
+    }
+
+    // =========================================================================
+    // Helpers
+    // =========================================================================
+
+    /**
+     * Runs the evaluator against a fresh candidate, an extraction built from the
+     * given text and page count, and a configuration with the given page limit.
+     *
+     * @param text      extracted text handed to the evaluator
+     * @param pageCount page count of the simulated extraction
+     * @param pageLimit maximum page count placed into the configuration
+     * @return the decision returned by {@code M3PreCheckEvaluator.evaluate}
+     * @throws Exception if the temporary fixture files cannot be created
+     */
+    private M3ProcessingDecision evaluate(String text, int pageCount, int pageLimit) throws Exception {
+        StartConfiguration config = buildConfig(maxPages(pageLimit));
+        SourceDocumentCandidate candidate = buildCandidate();
+        PdfExtractionSuccess extraction = new PdfExtractionSuccess(text, new PdfPageCount(pageCount));
+
+        return M3PreCheckEvaluator.evaluate(candidate, extraction, config);
+    }
+
+    /** Asserts that the decision is an {@link M3PreCheckPassed}. */
+    private static void assertPassed(M3ProcessingDecision result, String message) {
+        assertTrue(result instanceof M3PreCheckPassed, message);
+    }
+
+    /**
+     * Asserts that the decision is an {@link M3PreCheckFailed} whose failure reason
+     * equals the description of the expected {@link M3PreCheckFailureReason}.
+     */
+    private static void assertFailedWith(M3PreCheckFailureReason expected,
+                                         M3ProcessingDecision result,
+                                         String message) {
+        assertTrue(result instanceof M3PreCheckFailed, message);
+        M3PreCheckFailed failed = (M3PreCheckFailed) result;
+        assertEquals(expected.getDescription(), failed.failureReason());
+    }
+
+    /**
+     * Builds a {@link StartConfiguration} whose paths all live below {@link #tempDir}.
+     * Only the page limit varies between tests; the remaining values are fixed
+     * placeholders.
+     *
+     * @param maxPages page limit to place into the configuration
+     * @throws Exception if the directories or files cannot be created
+     */
+    private StartConfiguration buildConfig(int maxPages) throws Exception {
+        Path sourceDir = Files.createDirectories(tempDir.resolve("source"));
+        Path targetDir = Files.createDirectories(tempDir.resolve("target"));
+        Path dbFile = ensureFile(tempDir.resolve("db.sqlite"));
+        Path promptFile = ensureFile(tempDir.resolve("prompt.txt"));
+
+        return new StartConfiguration(
+                sourceDir,
+                targetDir,
+                dbFile,
+                URI.create("https://api.example.com"),
+                "gpt-4",
+                30,
+                3,
+                maxPages,
+                50000,
+                promptFile,
+                tempDir.resolve("lock.lock"),
+                tempDir.resolve("logs"),
+                "INFO",
+                "test-key"
+        );
+    }
+
+    /** Identity helper that names the otherwise bare page-limit number at call sites. */
+    private int maxPages(int limit) {
+        return limit;
+    }
+
+    /**
+     * Builds a candidate that points at an empty {@code test.pdf} inside the
+     * temporary source directory.
+     *
+     * @throws Exception if the directory or file cannot be created
+     */
+    private SourceDocumentCandidate buildCandidate() throws Exception {
+        Path sourceDir = Files.createDirectories(tempDir.resolve("source"));
+        Path pdfFile = ensureFile(sourceDir.resolve("test.pdf"));
+        SourceDocumentLocator locator = new SourceDocumentLocator(pdfFile.toString());
+        return new SourceDocumentCandidate(pdfFile.getFileName().toString(), 0L, locator);
+    }
+
+    /**
+     * Creates the file if it does not exist yet, so the fixture builders can be
+     * called more than once per test without a {@code FileAlreadyExistsException}.
+     */
+    private static Path ensureFile(Path file) throws Exception {
+        if (!Files.exists(file)) {
+            Files.createFile(file);
+        }
+        return file;
+    }
+}