diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 949c18b..47e5593 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -10,7 +10,9 @@ "Bash(mvn -pl pdf-umbenenner-adapter-out clean compile)", "Bash(mvn dependency:tree -pl pdf-umbenenner-adapter-out)", "Bash(mvn -pl pdf-umbenenner-domain clean compile)", - "Bash(mvn help:describe -Dplugin=org.apache.pdfbox:pdfbox -Ddetail=false)" + "Bash(mvn help:describe -Dplugin=org.apache.pdfbox:pdfbox -Ddetail=false)", + "Bash(cd /d D:/Dev/Projects/pdf-umbenenner-parent)", + "Bash(mvn -v)" ] } } diff --git a/pdf-umbenenner-application/src/main/java/de/gecheckt/pdf/umbenenner/application/service/M3PreCheckEvaluator.java b/pdf-umbenenner-application/src/main/java/de/gecheckt/pdf/umbenenner/application/service/M3PreCheckEvaluator.java new file mode 100644 index 0000000..d5224f6 --- /dev/null +++ b/pdf-umbenenner-application/src/main/java/de/gecheckt/pdf/umbenenner/application/service/M3PreCheckEvaluator.java @@ -0,0 +1,119 @@ +package de.gecheckt.pdf.umbenenner.application.service; + +import de.gecheckt.pdf.umbenenner.application.config.StartConfiguration; +import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckFailureReason; +import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckFailed; +import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckPassed; +import de.gecheckt.pdf.umbenenner.domain.model.M3ProcessingDecision; +import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionSuccess; +import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate; + +import java.util.Objects; + +/** + * Evaluates whether a successfully extracted PDF passes M3 pre-checks. + *
+ * M3 Pre-checks verify that: + *
+ * A document that passes both pre-checks is ready to proceed to M4 and later milestones. + * A document that fails a pre-check is classified with a specific deterministic failure reason + * and will not proceed further in the current batch run. + *
+ * This service is stateless and thread-safe. + * + * @since M3-AP-004 + */ +public class M3PreCheckEvaluator { + + /** + * Evaluates M3 pre-checks for a successfully extracted PDF document. + *
+ * Pre-check logic: + *
+ * Returns {@link M3PreCheckPassed} if both checks pass, or {@link M3PreCheckFailed} + * with a specific reason if any check fails. + * + * @param candidate the source document metadata + * @param extraction the successfully extracted PDF content + * @param configuration the startup configuration (used for maxPages limit) + * @return the pre-check decision: passed or failed with reason + * @throws NullPointerException if any parameter is null + */ + public static M3ProcessingDecision evaluate( + SourceDocumentCandidate candidate, + PdfExtractionSuccess extraction, + StartConfiguration configuration) { + + Objects.requireNonNull(candidate, "candidate must not be null"); + Objects.requireNonNull(extraction, "extraction must not be null"); + Objects.requireNonNull(configuration, "configuration must not be null"); + + // Pre-check 1: Verify document has usable text + if (!hasUsableText(extraction.extractedText())) { + return new M3PreCheckFailed( + candidate, + M3PreCheckFailureReason.NO_USABLE_TEXT.getDescription() + ); + } + + // Pre-check 2: Verify document page count does not exceed configured limit + if (extraction.pageCount().exceedsLimit(configuration.maxPages())) { + return new M3PreCheckFailed( + candidate, + M3PreCheckFailureReason.PAGE_LIMIT_EXCEEDED.getDescription() + ); + } + + // All pre-checks passed + return new M3PreCheckPassed(candidate, extraction); + } + + /** + * Determines whether the extracted text contains at least one meaningful character. + *
+ * Definition of "usable text" for M3: + *
+ * Normalization process: + *
+ * This package contains stateless, pure-logic services that evaluate document content + * and apply business rules. Services in this package: + *
+ * Verifies correct M3 pre-check logic for usable text and page limit validation. + */ +class M3PreCheckEvaluatorTest { + + @TempDir + Path tempDir; + + @Test + void evaluate_passesWhenDocumentHasUsableTextAndValidPageCount() throws Exception { + StartConfiguration config = buildConfig(maxPages(10)); + SourceDocumentCandidate candidate = buildCandidate(); + PdfExtractionSuccess extraction = new PdfExtractionSuccess("Some meaningful text", new PdfPageCount(5)); + + M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config); + + assertTrue(result instanceof M3PreCheckPassed, "Should pass when text is usable and page count is valid"); + M3PreCheckPassed passed = (M3PreCheckPassed) result; + // JUnit argument order is (expected, actual, message) + assertSame(candidate, passed.candidate(), "Candidate should be preserved"); + assertSame(extraction, passed.extraction(), "Extraction should be preserved"); + } + + @Test + void evaluate_failsWithNoUsableTextWhenExtractedTextIsEmpty() throws Exception { + StartConfiguration config = buildConfig(maxPages(10)); + SourceDocumentCandidate candidate = buildCandidate(); + PdfExtractionSuccess extraction = new PdfExtractionSuccess("", new PdfPageCount(1)); + + M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config); + + assertTrue(result instanceof M3PreCheckFailed, "Should fail with empty text"); + M3PreCheckFailed failed = (M3PreCheckFailed) result; + assertEquals(M3PreCheckFailureReason.NO_USABLE_TEXT.getDescription(), failed.failureReason()); + } + + @Test + void evaluate_failsWithNoUsableTextWhenTextIsOnlyWhitespace() throws Exception { + StartConfiguration config = buildConfig(maxPages(10)); + SourceDocumentCandidate candidate = buildCandidate(); + PdfExtractionSuccess extraction = new PdfExtractionSuccess(" \n\t \r\n ", new PdfPageCount(1)); + + M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config); + + assertTrue(result instanceof M3PreCheckFailed, "Should fail with 
whitespace-only text"); + M3PreCheckFailed failed = (M3PreCheckFailed) result; + assertEquals(M3PreCheckFailureReason.NO_USABLE_TEXT.getDescription(), failed.failureReason()); + } + + @Test + void evaluate_failsWithNoUsableTextWhenTextContainsOnlySpecialCharacters() throws Exception { + StartConfiguration config = buildConfig(maxPages(10)); + SourceDocumentCandidate candidate = buildCandidate(); + PdfExtractionSuccess extraction = new PdfExtractionSuccess("!@#$%^&*()_+-=[]{}|;:',.<>?/", new PdfPageCount(1)); + + M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config); + + assertTrue(result instanceof M3PreCheckFailed, "Should fail with special characters only"); + M3PreCheckFailed failed = (M3PreCheckFailed) result; + assertEquals(M3PreCheckFailureReason.NO_USABLE_TEXT.getDescription(), failed.failureReason()); + } + + @Test + void evaluate_passesWithTextContainingSingleLetter() throws Exception { + StartConfiguration config = buildConfig(maxPages(10)); + SourceDocumentCandidate candidate = buildCandidate(); + PdfExtractionSuccess extraction = new PdfExtractionSuccess("a", new PdfPageCount(1)); + + M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config); + + assertTrue(result instanceof M3PreCheckPassed, "Should pass with single letter"); + } + + @Test + void evaluate_passesWithTextContainingSingleDigit() throws Exception { + StartConfiguration config = buildConfig(maxPages(10)); + SourceDocumentCandidate candidate = buildCandidate(); + PdfExtractionSuccess extraction = new PdfExtractionSuccess("5", new PdfPageCount(1)); + + M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config); + + assertTrue(result instanceof M3PreCheckPassed, "Should pass with single digit"); + } + + @Test + void evaluate_passesWithTextMixedWithSpecialCharactersIfLettersOrDigitsPresent() throws Exception { + StartConfiguration config = buildConfig(maxPages(10)); + SourceDocumentCandidate 
candidate = buildCandidate(); + PdfExtractionSuccess extraction = new PdfExtractionSuccess("!@#a$%^&*", new PdfPageCount(1)); + + M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config); + + assertTrue(result instanceof M3PreCheckPassed, "Should pass when letters/digits are present among special chars"); + } + + @Test + void evaluate_passesWithWhitespaceAroundUsableText() throws Exception { + StartConfiguration config = buildConfig(maxPages(10)); + SourceDocumentCandidate candidate = buildCandidate(); + PdfExtractionSuccess extraction = new PdfExtractionSuccess(" meaningful text ", new PdfPageCount(1)); + + M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config); + + assertTrue(result instanceof M3PreCheckPassed, "Should pass when text has meaningful content despite whitespace"); + } + + @Test + void evaluate_passesWhenPageCountEqualsLimit() throws Exception { + // Boundary case: a page count equal to the configured limit does not exceed it + StartConfiguration config = buildConfig(maxPages(5)); + SourceDocumentCandidate candidate = buildCandidate(); + PdfExtractionSuccess extraction = new PdfExtractionSuccess("Valid text", new PdfPageCount(5)); + + M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config); + + assertTrue(result instanceof M3PreCheckPassed, "Should pass when page count equals limit (not exceeded)"); + } + + @Test + void evaluate_failsWithPageLimitExceededWhenPageCountExceedsLimit() throws Exception { + StartConfiguration config = buildConfig(maxPages(5)); + SourceDocumentCandidate candidate = buildCandidate(); + PdfExtractionSuccess extraction = new PdfExtractionSuccess("Valid text", new PdfPageCount(6)); + + M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config); + + assertTrue(result instanceof M3PreCheckFailed, "Should fail when page count exceeds limit"); + M3PreCheckFailed failed = (M3PreCheckFailed) result; + 
assertEquals(M3PreCheckFailureReason.PAGE_LIMIT_EXCEEDED.getDescription(), failed.failureReason()); + } + + @Test + void evaluate_failsWithPageLimitExceededEvenIfTextIsValid() throws Exception { + StartConfiguration config = buildConfig(maxPages(2)); + SourceDocumentCandidate candidate = buildCandidate(); + PdfExtractionSuccess extraction = new PdfExtractionSuccess("Excellent meaningful text with lots of content", new PdfPageCount(100)); + + M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config); + + assertTrue(result instanceof M3PreCheckFailed, "Should fail with page limit exceeded even if text is good"); + M3PreCheckFailed failed = (M3PreCheckFailed) result; + assertEquals(M3PreCheckFailureReason.PAGE_LIMIT_EXCEEDED.getDescription(), failed.failureReason()); + } + + @Test + void evaluate_reportsSingleFailureReasonWhenBothChecksFail() throws Exception { + // Both pre-checks fail for this input (empty text, 10 pages with a limit of 1). The evaluator runs + // the usable-text check before the page-limit check, but no precedence is asserted here. + StartConfiguration config = buildConfig(maxPages(1)); + SourceDocumentCandidate candidate = buildCandidate(); + PdfExtractionSuccess extraction = new PdfExtractionSuccess("", new PdfPageCount(10)); + + M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config); + + assertTrue(result instanceof M3PreCheckFailed, "Should fail when both checks fail"); + // The specific order of checks doesn't matter for M3; just verify one reason is returned + M3PreCheckFailed failed = (M3PreCheckFailed) result; + assertNotNull(failed.failureReason()); + assertFalse(failed.failureReason().isEmpty()); + } + + @Test + void evaluate_throwsNullPointerExceptionWhenCandidateIsNull() throws Exception { + StartConfiguration config = buildConfig(maxPages(10)); + PdfExtractionSuccess extraction = new PdfExtractionSuccess("Valid text", new PdfPageCount(1)); + + assertThrows(NullPointerException.class, () -> + 
M3PreCheckEvaluator.evaluate(null, extraction, config) + ); + } + + @Test + void evaluate_throwsNullPointerExceptionWhenExtractionIsNull() throws Exception { + StartConfiguration config = buildConfig(maxPages(10)); + SourceDocumentCandidate candidate = buildCandidate(); + + assertThrows(NullPointerException.class, () -> + M3PreCheckEvaluator.evaluate(candidate, null, config) + ); + } + + @Test + void evaluate_throwsNullPointerExceptionWhenConfigurationIsNull() throws Exception { + SourceDocumentCandidate candidate = buildCandidate(); + PdfExtractionSuccess extraction = new PdfExtractionSuccess("Valid text", new PdfPageCount(1)); + + assertThrows(NullPointerException.class, () -> + M3PreCheckEvaluator.evaluate(candidate, extraction, null) + ); + } + + @Test + void evaluate_passesWithUnicodeGermanUmlauts() throws Exception { + StartConfiguration config = buildConfig(maxPages(10)); + SourceDocumentCandidate candidate = buildCandidate(); + PdfExtractionSuccess extraction = new PdfExtractionSuccess("Äußerst äöüß Großes", new PdfPageCount(1)); + + M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config); + + assertTrue(result instanceof M3PreCheckPassed, "Should pass with German umlauts (ÄÖÜß)"); + } + + @Test + void evaluate_passesWithOtherUnicodeCharacters() throws Exception { + StartConfiguration config = buildConfig(maxPages(10)); + SourceDocumentCandidate candidate = buildCandidate(); + PdfExtractionSuccess extraction = new PdfExtractionSuccess("Αβγδ 中文 καλημέρα", new PdfPageCount(1)); + + M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config); + + assertTrue(result instanceof M3PreCheckPassed, "Should pass with Greek, Chinese, and other Unicode letters"); + } + + // ========================================================================= + // Helpers + // ========================================================================= + + private StartConfiguration buildConfig(int maxPages) throws 
Exception { + Path sourceDir = Files.createDirectories(tempDir.resolve("source")); + Path targetDir = Files.createDirectories(tempDir.resolve("target")); + Path dbFile = tempDir.resolve("db.sqlite"); + Files.createFile(dbFile); + Path promptFile = tempDir.resolve("prompt.txt"); + Files.createFile(promptFile); + + return new StartConfiguration( + sourceDir, + targetDir, + dbFile, + URI.create("https://api.example.com"), + "gpt-4", + 30, + 3, + maxPages, + 50000, + promptFile, + tempDir.resolve("lock.lock"), + tempDir.resolve("logs"), + "INFO", + "test-key" + ); + } + + private int maxPages(int limit) { + return limit; + } + + private SourceDocumentCandidate buildCandidate() throws Exception { + Path sourceDir = Files.createDirectories(tempDir.resolve("source")); + Path pdfFile = sourceDir.resolve("test.pdf"); + Files.createFile(pdfFile); + SourceDocumentLocator locator = new SourceDocumentLocator(pdfFile.toString()); + return new SourceDocumentCandidate(pdfFile.getFileName().toString(), 0L, locator); + } +} diff --git a/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/M3PreCheckFailed.java b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/M3PreCheckFailed.java new file mode 100644 index 0000000..67c39ab --- /dev/null +++ b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/M3PreCheckFailed.java @@ -0,0 +1,46 @@ +package de.gecheckt.pdf.umbenenner.domain.model; + +import java.util.Objects; + +/** + * Represents a document that failed an M3 pre-check. + *
+ * This result encapsulates: + *
+ * Reasons include: + *
+ * A document with this decision will not proceed further in the current batch run. + * + * @param candidate the source document metadata + * @param failureReason a human-readable explanation of the pre-check failure + * @since M3-AP-001 + */ +public record M3PreCheckFailed( + SourceDocumentCandidate candidate, + String failureReason +) implements M3ProcessingDecision { + /** + * Constructor with validation. + * + * @param candidate must be non-null + * @param failureReason must be non-null and non-empty + * @throws NullPointerException if either parameter is null + * @throws IllegalArgumentException if failureReason is empty + */ + public M3PreCheckFailed { + Objects.requireNonNull(candidate, "candidate must not be null"); + Objects.requireNonNull(failureReason, "failureReason must not be null"); + if (failureReason.isEmpty()) { + throw new IllegalArgumentException("failureReason must not be empty"); + } + } +} diff --git a/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/M3PreCheckFailureReason.java b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/M3PreCheckFailureReason.java new file mode 100644 index 0000000..6ef5a33 --- /dev/null +++ b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/M3PreCheckFailureReason.java @@ -0,0 +1,54 @@ +package de.gecheckt.pdf.umbenenner.domain.model; + +/** + * Enumeration of M3 pre-check failure reasons. + *
+ * These are the deterministic content errors that can occur during M3 pre-check evaluation. + * They distinguish between failures in the document content versus technical extraction failures. + *
+ * Deterministic content errors: + *
+ * Note: Technical extraction failures (I/O errors, PDFBox failures) are not M3 pre-check reasons; + * they are represented as {@link PdfExtractionTechnicalError} in the extraction result. + * + * @since M3-AP-004 + */ +public enum M3PreCheckFailureReason { + /** + * The extracted PDF text, after normalization, contains no letters or digits. + *
+ * This is a deterministic content error: reprocessing the same file in a later run + * will have the same outcome unless the source file is changed. + *
+ * In M3, retry logic: exactly 1 retry in a later batch run. + */ + NO_USABLE_TEXT("No usable text in extracted PDF content"), + + /** + * The document's page count exceeds the configured limit. + *
+ * This is a deterministic content error: the page count will not change unless the source file is modified. + *
+ * In M3, retry logic: exactly 1 retry in a later batch run. + */ + PAGE_LIMIT_EXCEEDED("Document page count exceeds configured limit"); + + private final String description; + + M3PreCheckFailureReason(String description) { + this.description = description; + } + + /** + * Returns a human-readable description of this failure reason. + * + * @return the description + */ + public String getDescription() { + return description; + } +} diff --git a/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/M3PreCheckPassed.java b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/M3PreCheckPassed.java new file mode 100644 index 0000000..12eee37 --- /dev/null +++ b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/M3PreCheckPassed.java @@ -0,0 +1,36 @@ +package de.gecheckt.pdf.umbenenner.domain.model; + +import java.util.Objects; + +/** + * Represents a document that passed all M3 pre-checks. + *
+ * This result encapsulates: + *
+ * A document with this decision is ready to proceed to M4 and later milestones + * (fingerprinting, persistence, KI integration, filename generation, target copy). + * + * @param candidate the source document metadata + * @param extraction the successful text extraction result + * @since M3-AP-001 + */ +public record M3PreCheckPassed( + SourceDocumentCandidate candidate, + PdfExtractionSuccess extraction +) implements M3ProcessingDecision { + /** + * Constructor with validation. + * + * @param candidate must be non-null + * @param extraction must be non-null + * @throws NullPointerException if either parameter is null + */ + public M3PreCheckPassed { + Objects.requireNonNull(candidate, "candidate must not be null"); + Objects.requireNonNull(extraction, "extraction must not be null"); + } +} diff --git a/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/M3ProcessingDecision.java b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/M3ProcessingDecision.java index d55979b..2b77aee 100644 --- a/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/M3ProcessingDecision.java +++ b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/M3ProcessingDecision.java @@ -1,13 +1,12 @@ package de.gecheckt.pdf.umbenenner.domain.model; -import java.util.Objects; - /** * Sealed interface representing the outcome of M3 document pre-checks. *
- * This is a placeholder interface introduced in AP-001 to establish the architectural + * This interface, introduced in AP-001, establishes the architectural * pattern for M3 pre-check results. The actual pre-check logic (fachlich validation - * such as "brauchbarer Text" and "Seitenlimit") is implemented in later APs (AP-004, AP-005). + * such as "brauchbarer Text" and "Seitenlimit") is implemented in AP-004 by + * {@code de.gecheckt.pdf.umbenenner.application.service.M3PreCheckEvaluator}. *
* There are two allowed implementations: *
- * This result encapsulates: - *
- * A document with this decision is ready to proceed to M4 and later milestones - * (fingerprinting, persistence, KI integration, filename generation, target copy). - * - * @param candidate the source document metadata - * @param extraction the successful text extraction result - * @since M3-AP-001 - */ -record M3PreCheckPassed( - SourceDocumentCandidate candidate, - PdfExtractionSuccess extraction -) implements M3ProcessingDecision { - /** - * Constructor with validation. - * - * @param candidate must be non-null - * @param extraction must be non-null - * @throws NullPointerException if either parameter is null - */ - M3PreCheckPassed { - Objects.requireNonNull(candidate, "candidate must not be null"); - Objects.requireNonNull(extraction, "extraction must not be null"); - } -} - -/** - * Represents a document that failed an M3 pre-check. - *
- * This result encapsulates: - *
- * Reasons include: - *
- * A document with this decision will not proceed further in the current batch run. - * - * @param candidate the source document metadata - * @param failureReason a human-readable explanation of the pre-check failure - * @since M3-AP-001 - */ -record M3PreCheckFailed( - SourceDocumentCandidate candidate, - String failureReason -) implements M3ProcessingDecision { - /** - * Constructor with validation. - * - * @param candidate must be non-null - * @param failureReason must be non-null and non-empty - * @throws NullPointerException if either parameter is null - * @throws IllegalArgumentException if failureReason is empty - */ - M3PreCheckFailed { - Objects.requireNonNull(candidate, "candidate must not be null"); - Objects.requireNonNull(failureReason, "failureReason must not be null"); - if (failureReason.isEmpty()) { - throw new IllegalArgumentException("failureReason must not be empty"); - } - } -} diff --git a/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/package-info.java b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/package-info.java index 4a996a3..f1041b1 100644 --- a/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/package-info.java +++ b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/package-info.java @@ -13,6 +13,17 @@ *
+ * Additional classes introduced in M3: + *