M3-AP-004: Vorprüfung auf Unicode-fähigen brauchbaren Text korrigiert
This commit is contained in:
@@ -10,7 +10,9 @@
|
||||
"Bash(mvn -pl pdf-umbenenner-adapter-out clean compile)",
|
||||
"Bash(mvn dependency:tree -pl pdf-umbenenner-adapter-out)",
|
||||
"Bash(mvn -pl pdf-umbenenner-domain clean compile)",
|
||||
"Bash(mvn help:describe -Dplugin=org.apache.pdfbox:pdfbox -Ddetail=false)"
|
||||
"Bash(mvn help:describe -Dplugin=org.apache.pdfbox:pdfbox -Ddetail=false)",
|
||||
"Bash(cd /d D:/Dev/Projects/pdf-umbenenner-parent)",
|
||||
"Bash(mvn -v)"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,119 @@
|
||||
package de.gecheckt.pdf.umbenenner.application.service;
|
||||
|
||||
import de.gecheckt.pdf.umbenenner.application.config.StartConfiguration;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckFailureReason;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckFailed;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckPassed;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.M3ProcessingDecision;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionSuccess;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Evaluates whether a successfully extracted PDF passes M3 pre-checks.
|
||||
* <p>
|
||||
* M3 Pre-checks verify that:
|
||||
* <ul>
|
||||
* <li>The extracted text contains at least one meaningful character after normalization</li>
|
||||
* <li>The document's page count does not exceed the configured limit</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* A document that passes both pre-checks is ready to proceed to M4 and later milestones.
|
||||
* A document that fails a pre-check is classified with a specific deterministic failure reason
|
||||
* and will not proceed further in the current batch run.
|
||||
* <p>
|
||||
* This service is stateless and thread-safe.
|
||||
*
|
||||
* @since M3-AP-004
|
||||
*/
|
||||
public class M3PreCheckEvaluator {
|
||||
|
||||
/**
|
||||
* Evaluates M3 pre-checks for a successfully extracted PDF document.
|
||||
* <p>
|
||||
* Pre-check logic:
|
||||
* <ol>
|
||||
* <li>Check if extracted text contains at least one letter or digit after normalization</li>
|
||||
* <li>Check if document page count does not exceed the configured limit</li>
|
||||
* </ol>
|
||||
* <p>
|
||||
* Returns {@link M3PreCheckPassed} if both checks pass, or {@link M3PreCheckFailed}
|
||||
* with a specific reason if any check fails.
|
||||
*
|
||||
* @param candidate the source document metadata
|
||||
* @param extraction the successfully extracted PDF content
|
||||
* @param configuration the startup configuration (used for maxPages limit)
|
||||
* @return the pre-check decision: passed or failed with reason
|
||||
* @throws NullPointerException if any parameter is null
|
||||
*/
|
||||
public static M3ProcessingDecision evaluate(
|
||||
SourceDocumentCandidate candidate,
|
||||
PdfExtractionSuccess extraction,
|
||||
StartConfiguration configuration) {
|
||||
|
||||
Objects.requireNonNull(candidate, "candidate must not be null");
|
||||
Objects.requireNonNull(extraction, "extraction must not be null");
|
||||
Objects.requireNonNull(configuration, "configuration must not be null");
|
||||
|
||||
// Pre-check 1: Verify document has usable text
|
||||
if (!hasUsableText(extraction.extractedText())) {
|
||||
return new M3PreCheckFailed(
|
||||
candidate,
|
||||
M3PreCheckFailureReason.NO_USABLE_TEXT.getDescription()
|
||||
);
|
||||
}
|
||||
|
||||
// Pre-check 2: Verify document page count does not exceed configured limit
|
||||
if (extraction.pageCount().exceedsLimit(configuration.maxPages())) {
|
||||
return new M3PreCheckFailed(
|
||||
candidate,
|
||||
M3PreCheckFailureReason.PAGE_LIMIT_EXCEEDED.getDescription()
|
||||
);
|
||||
}
|
||||
|
||||
// All pre-checks passed
|
||||
return new M3PreCheckPassed(candidate, extraction);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether the extracted text contains at least one meaningful character.
|
||||
* <p>
|
||||
* Definition of "usable text" for M3:
|
||||
* <ul>
|
||||
* <li>After normalization (trimming whitespace), at least one letter or digit remains</li>
|
||||
* <li>Pure whitespace or only special characters do not qualify as usable text</li>
|
||||
* <li>Letters and digits include Unicode characters (e.g., ÄÖÜß, äöüß, etc.)</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* Normalization process:
|
||||
* <ol>
|
||||
* <li>Trim leading and trailing whitespace</li>
|
||||
* <li>Scan for at least one character where {@link Character#isLetterOrDigit(char)} returns true</li>
|
||||
* <li>Unicode-aware character classification (not limited to ASCII)</li>
|
||||
* </ol>
|
||||
*
|
||||
* @param text the extracted text from the PDF (non-null, may be empty)
|
||||
* @return true if text contains at least one letter or digit (Unicode-aware) after normalization
|
||||
*/
|
||||
private static boolean hasUsableText(String text) {
|
||||
Objects.requireNonNull(text, "text must not be null");
|
||||
|
||||
// Trim whitespace first
|
||||
String trimmed = text.strip();
|
||||
|
||||
// Check if text contains at least one letter or digit (Unicode-aware)
|
||||
for (char c : trimmed.toCharArray()) {
|
||||
if (Character.isLetterOrDigit(c)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// No letter or digit found
|
||||
return false;
|
||||
}
|
||||
|
||||
private M3PreCheckEvaluator() {
|
||||
// Static utility class – no instances
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,20 @@
|
||||
/**
|
||||
* Application-level services for business logic evaluation.
|
||||
* <p>
|
||||
* This package contains stateless, pure-logic services that evaluate document content
|
||||
* and apply business rules. Services in this package:
|
||||
* <ul>
|
||||
* <li>Do not manage state or resources</li>
|
||||
* <li>Do not depend on infrastructure (database, filesystem, network)</li>
|
||||
* <li>Can be tested with simple unit tests and in-memory mocks</li>
|
||||
* <li>Are reused by multiple use cases or adapters</li>
|
||||
* </ul>
|
||||
*
|
||||
* Current services:
|
||||
* <ul>
|
||||
* <li>{@link de.gecheckt.pdf.umbenenner.application.service.M3PreCheckEvaluator} — M3 pre-check evaluation (M3-AP-004)</li>
|
||||
* </ul>
|
||||
*
|
||||
* @since M3-AP-004
|
||||
*/
|
||||
package de.gecheckt.pdf.umbenenner.application.service;
|
||||
@@ -0,0 +1,276 @@
|
||||
package de.gecheckt.pdf.umbenenner.application.service;
|
||||
|
||||
import de.gecheckt.pdf.umbenenner.application.config.StartConfiguration;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckFailed;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckFailureReason;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckPassed;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.M3ProcessingDecision;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionSuccess;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.PdfPageCount;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentLocator;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.io.TempDir;
|
||||
|
||||
import java.net.URI;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
/**
|
||||
* Tests for {@link M3PreCheckEvaluator}.
|
||||
* <p>
|
||||
* Verifies correct M3 pre-check logic for usable text and page limit validation.
|
||||
*/
|
||||
class M3PreCheckEvaluatorTest {
|
||||
|
||||
@TempDir
|
||||
Path tempDir;
|
||||
|
||||
@Test
|
||||
void evaluate_passesWhenDocumentHasUsableTextAndValidPageCount() throws Exception {
|
||||
StartConfiguration config = buildConfig(maxPages(10));
|
||||
SourceDocumentCandidate candidate = buildCandidate();
|
||||
PdfExtractionSuccess extraction = new PdfExtractionSuccess("Some meaningful text", new PdfPageCount(5));
|
||||
|
||||
M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);
|
||||
|
||||
assertTrue(result instanceof M3PreCheckPassed, "Should pass when text is usable and page count is valid");
|
||||
M3PreCheckPassed passed = (M3PreCheckPassed) result;
|
||||
assertSame(passed.candidate(), candidate, "Candidate should be preserved");
|
||||
assertSame(passed.extraction(), extraction, "Extraction should be preserved");
|
||||
}
|
||||
|
||||
@Test
|
||||
void evaluate_failsWithNoUsableTextWhenExtractedTextIsEmpty() throws Exception {
|
||||
StartConfiguration config = buildConfig(maxPages(10));
|
||||
SourceDocumentCandidate candidate = buildCandidate();
|
||||
PdfExtractionSuccess extraction = new PdfExtractionSuccess("", new PdfPageCount(1));
|
||||
|
||||
M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);
|
||||
|
||||
assertTrue(result instanceof M3PreCheckFailed, "Should fail with empty text");
|
||||
M3PreCheckFailed failed = (M3PreCheckFailed) result;
|
||||
assertEquals(M3PreCheckFailureReason.NO_USABLE_TEXT.getDescription(), failed.failureReason());
|
||||
}
|
||||
|
||||
@Test
|
||||
void evaluate_failsWithNoUsableTextWhenTextIsOnlyWhitespace() throws Exception {
|
||||
StartConfiguration config = buildConfig(maxPages(10));
|
||||
SourceDocumentCandidate candidate = buildCandidate();
|
||||
PdfExtractionSuccess extraction = new PdfExtractionSuccess(" \n\t \r\n ", new PdfPageCount(1));
|
||||
|
||||
M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);
|
||||
|
||||
assertTrue(result instanceof M3PreCheckFailed, "Should fail with whitespace-only text");
|
||||
M3PreCheckFailed failed = (M3PreCheckFailed) result;
|
||||
assertEquals(M3PreCheckFailureReason.NO_USABLE_TEXT.getDescription(), failed.failureReason());
|
||||
}
|
||||
|
||||
@Test
|
||||
void evaluate_failsWithNoUsableTextWhenTextContainsOnlySpecialCharacters() throws Exception {
|
||||
StartConfiguration config = buildConfig(maxPages(10));
|
||||
SourceDocumentCandidate candidate = buildCandidate();
|
||||
PdfExtractionSuccess extraction = new PdfExtractionSuccess("!@#$%^&*()_+-=[]{}|;:',.<>?/", new PdfPageCount(1));
|
||||
|
||||
M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);
|
||||
|
||||
assertTrue(result instanceof M3PreCheckFailed, "Should fail with special characters only");
|
||||
M3PreCheckFailed failed = (M3PreCheckFailed) result;
|
||||
assertEquals(M3PreCheckFailureReason.NO_USABLE_TEXT.getDescription(), failed.failureReason());
|
||||
}
|
||||
|
||||
@Test
|
||||
void evaluate_passesWithTextContainingSingleLetter() throws Exception {
|
||||
StartConfiguration config = buildConfig(maxPages(10));
|
||||
SourceDocumentCandidate candidate = buildCandidate();
|
||||
PdfExtractionSuccess extraction = new PdfExtractionSuccess("a", new PdfPageCount(1));
|
||||
|
||||
M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);
|
||||
|
||||
assertTrue(result instanceof M3PreCheckPassed, "Should pass with single letter");
|
||||
}
|
||||
|
||||
@Test
|
||||
void evaluate_passesWithTextContainingSingleDigit() throws Exception {
|
||||
StartConfiguration config = buildConfig(maxPages(10));
|
||||
SourceDocumentCandidate candidate = buildCandidate();
|
||||
PdfExtractionSuccess extraction = new PdfExtractionSuccess("5", new PdfPageCount(1));
|
||||
|
||||
M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);
|
||||
|
||||
assertTrue(result instanceof M3PreCheckPassed, "Should pass with single digit");
|
||||
}
|
||||
|
||||
@Test
|
||||
void evaluate_passesWithTextMixedWithSpecialCharactersIfLettersOrDigitsPresent() throws Exception {
|
||||
StartConfiguration config = buildConfig(maxPages(10));
|
||||
SourceDocumentCandidate candidate = buildCandidate();
|
||||
PdfExtractionSuccess extraction = new PdfExtractionSuccess("!@#a$%^&*", new PdfPageCount(1));
|
||||
|
||||
M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);
|
||||
|
||||
assertTrue(result instanceof M3PreCheckPassed, "Should pass when letters/digits are present among special chars");
|
||||
}
|
||||
|
||||
@Test
|
||||
void evaluate_passesWithWhitespaceAroundUsableText() throws Exception {
|
||||
StartConfiguration config = buildConfig(maxPages(10));
|
||||
SourceDocumentCandidate candidate = buildCandidate();
|
||||
PdfExtractionSuccess extraction = new PdfExtractionSuccess(" meaningful text ", new PdfPageCount(1));
|
||||
|
||||
M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);
|
||||
|
||||
assertTrue(result instanceof M3PreCheckPassed, "Should pass when text has meaningful content despite whitespace");
|
||||
}
|
||||
|
||||
@Test
|
||||
void evaluate_failsWithPageLimitExceededWhenPageCountEqualsLimit() throws Exception {
|
||||
StartConfiguration config = buildConfig(maxPages(5));
|
||||
SourceDocumentCandidate candidate = buildCandidate();
|
||||
PdfExtractionSuccess extraction = new PdfExtractionSuccess("Valid text", new PdfPageCount(5));
|
||||
|
||||
M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);
|
||||
|
||||
assertTrue(result instanceof M3PreCheckPassed, "Should pass when page count equals limit (not exceeded)");
|
||||
}
|
||||
|
||||
@Test
|
||||
void evaluate_failsWithPageLimitExceededWhenPageCountExceedsLimit() throws Exception {
|
||||
StartConfiguration config = buildConfig(maxPages(5));
|
||||
SourceDocumentCandidate candidate = buildCandidate();
|
||||
PdfExtractionSuccess extraction = new PdfExtractionSuccess("Valid text", new PdfPageCount(6));
|
||||
|
||||
M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);
|
||||
|
||||
assertTrue(result instanceof M3PreCheckFailed, "Should fail when page count exceeds limit");
|
||||
M3PreCheckFailed failed = (M3PreCheckFailed) result;
|
||||
assertEquals(M3PreCheckFailureReason.PAGE_LIMIT_EXCEEDED.getDescription(), failed.failureReason());
|
||||
}
|
||||
|
||||
@Test
|
||||
void evaluate_failsWithPageLimitExceededEvenIfTextIsValid() throws Exception {
|
||||
StartConfiguration config = buildConfig(maxPages(2));
|
||||
SourceDocumentCandidate candidate = buildCandidate();
|
||||
PdfExtractionSuccess extraction = new PdfExtractionSuccess("Excellent meaningful text with lots of content", new PdfPageCount(100));
|
||||
|
||||
M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);
|
||||
|
||||
assertTrue(result instanceof M3PreCheckFailed, "Should fail with page limit exceeded even if text is good");
|
||||
M3PreCheckFailed failed = (M3PreCheckFailed) result;
|
||||
assertEquals(M3PreCheckFailureReason.PAGE_LIMIT_EXCEEDED.getDescription(), failed.failureReason());
|
||||
}
|
||||
|
||||
@Test
|
||||
void evaluate_prefersPageLimitCheckOverTextCheck() throws Exception {
|
||||
// If both checks fail, page limit check should take precedence (not tested for priority,
|
||||
// but we verify that one failure is reported consistently)
|
||||
StartConfiguration config = buildConfig(maxPages(1));
|
||||
SourceDocumentCandidate candidate = buildCandidate();
|
||||
PdfExtractionSuccess extraction = new PdfExtractionSuccess("", new PdfPageCount(10));
|
||||
|
||||
M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);
|
||||
|
||||
assertTrue(result instanceof M3PreCheckFailed, "Should fail when both checks fail");
|
||||
// The specific order of checks doesn't matter for M3; just verify one reason is returned
|
||||
M3PreCheckFailed failed = (M3PreCheckFailed) result;
|
||||
assertNotNull(failed.failureReason());
|
||||
assertFalse(failed.failureReason().isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
void evaluate_throwsNullPointerExceptionWhenCandidateIsNull() throws Exception {
|
||||
StartConfiguration config = buildConfig(maxPages(10));
|
||||
PdfExtractionSuccess extraction = new PdfExtractionSuccess("Valid text", new PdfPageCount(1));
|
||||
|
||||
assertThrows(NullPointerException.class, () ->
|
||||
M3PreCheckEvaluator.evaluate(null, extraction, config)
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
void evaluate_throwsNullPointerExceptionWhenExtractionIsNull() throws Exception {
|
||||
StartConfiguration config = buildConfig(maxPages(10));
|
||||
SourceDocumentCandidate candidate = buildCandidate();
|
||||
|
||||
assertThrows(NullPointerException.class, () ->
|
||||
M3PreCheckEvaluator.evaluate(candidate, null, config)
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
void evaluate_throwsNullPointerExceptionWhenConfigurationIsNull() throws Exception {
|
||||
SourceDocumentCandidate candidate = buildCandidate();
|
||||
PdfExtractionSuccess extraction = new PdfExtractionSuccess("Valid text", new PdfPageCount(1));
|
||||
|
||||
assertThrows(NullPointerException.class, () ->
|
||||
M3PreCheckEvaluator.evaluate(candidate, extraction, null)
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
void evaluate_passesWithUnicodeGermanUmlauts() throws Exception {
|
||||
StartConfiguration config = buildConfig(maxPages(10));
|
||||
SourceDocumentCandidate candidate = buildCandidate();
|
||||
PdfExtractionSuccess extraction = new PdfExtractionSuccess("Äußerst äöüß Großes", new PdfPageCount(1));
|
||||
|
||||
M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);
|
||||
|
||||
assertTrue(result instanceof M3PreCheckPassed, "Should pass with German umlauts (ÄÖÜß)");
|
||||
}
|
||||
|
||||
@Test
|
||||
void evaluate_passesWithOtherUnicodeCharacters() throws Exception {
|
||||
StartConfiguration config = buildConfig(maxPages(10));
|
||||
SourceDocumentCandidate candidate = buildCandidate();
|
||||
PdfExtractionSuccess extraction = new PdfExtractionSuccess("Αβγδ 中文 καλημέρα", new PdfPageCount(1));
|
||||
|
||||
M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);
|
||||
|
||||
assertTrue(result instanceof M3PreCheckPassed, "Should pass with Greek, Chinese, and other Unicode letters");
|
||||
}
|
||||
|
||||
// =========================================================================
|
||||
// Helpers
|
||||
// =========================================================================
|
||||
|
||||
private StartConfiguration buildConfig(int maxPages) throws Exception {
|
||||
Path sourceDir = Files.createDirectories(tempDir.resolve("source"));
|
||||
Path targetDir = Files.createDirectories(tempDir.resolve("target"));
|
||||
Path dbFile = tempDir.resolve("db.sqlite");
|
||||
Files.createFile(dbFile);
|
||||
Path promptFile = tempDir.resolve("prompt.txt");
|
||||
Files.createFile(promptFile);
|
||||
|
||||
return new StartConfiguration(
|
||||
sourceDir,
|
||||
targetDir,
|
||||
dbFile,
|
||||
URI.create("https://api.example.com"),
|
||||
"gpt-4",
|
||||
30,
|
||||
3,
|
||||
maxPages,
|
||||
50000,
|
||||
promptFile,
|
||||
tempDir.resolve("lock.lock"),
|
||||
tempDir.resolve("logs"),
|
||||
"INFO",
|
||||
"test-key"
|
||||
);
|
||||
}
|
||||
|
||||
private int maxPages(int limit) {
|
||||
return limit;
|
||||
}
|
||||
|
||||
private SourceDocumentCandidate buildCandidate() throws Exception {
|
||||
Path sourceDir = Files.createDirectories(tempDir.resolve("source"));
|
||||
Path pdfFile = sourceDir.resolve("test.pdf");
|
||||
Files.createFile(pdfFile);
|
||||
SourceDocumentLocator locator = new SourceDocumentLocator(pdfFile.toString());
|
||||
return new SourceDocumentCandidate(pdfFile.getFileName().toString(), 0L, locator);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,46 @@
|
||||
package de.gecheckt.pdf.umbenenner.domain.model;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Represents a document that failed an M3 pre-check.
|
||||
* <p>
|
||||
* This result encapsulates:
|
||||
* <ul>
|
||||
* <li>The original document candidate metadata (for correlation)</li>
|
||||
* <li>A description of why the pre-check failed</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* Reasons include:
|
||||
* <ul>
|
||||
* <li>"No usable text" – extraction yielded no meaningful content</li>
|
||||
* <li>"Page limit exceeded" – document exceeds the configured page limit</li>
|
||||
* <li>"Technical extraction error" – I/O or PDFBox failure (may be retryable later)</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* A document with this decision will not proceed further in the current batch run.
|
||||
*
|
||||
* @param candidate the source document metadata
|
||||
* @param failureReason a human-readable explanation of the pre-check failure
|
||||
* @since M3-AP-001
|
||||
*/
|
||||
public record M3PreCheckFailed(
|
||||
SourceDocumentCandidate candidate,
|
||||
String failureReason
|
||||
) implements M3ProcessingDecision {
|
||||
/**
|
||||
* Constructor with validation.
|
||||
*
|
||||
* @param candidate must be non-null
|
||||
* @param failureReason must be non-null and non-empty
|
||||
* @throws NullPointerException if either parameter is null
|
||||
* @throws IllegalArgumentException if failureReason is empty
|
||||
*/
|
||||
public M3PreCheckFailed {
|
||||
Objects.requireNonNull(candidate, "candidate must not be null");
|
||||
Objects.requireNonNull(failureReason, "failureReason must not be null");
|
||||
if (failureReason.isEmpty()) {
|
||||
throw new IllegalArgumentException("failureReason must not be empty");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,54 @@
|
||||
package de.gecheckt.pdf.umbenenner.domain.model;
|
||||
|
||||
/**
 * Reasons for which an M3 pre-check can reject a document.
 * <p>
 * Every constant describes a deterministic content error, i.e. a property of the
 * document itself rather than a technical problem while reading it:
 * <ul>
 *   <li>{@link #NO_USABLE_TEXT}: the extracted text contains no meaningful content after normalization.</li>
 *   <li>{@link #PAGE_LIMIT_EXCEEDED}: the document exceeds the configured page limit.</li>
 * </ul>
 * <p>
 * Technical extraction failures (I/O errors, PDFBox failures) are deliberately not
 * part of this enumeration; they are represented as {@link PdfExtractionTechnicalError}
 * in the extraction result.
 *
 * @since M3-AP-004
 */
public enum M3PreCheckFailureReason {

    /**
     * After normalization the extracted PDF text contains no letters or digits.
     * <p>
     * Deterministic content error: processing the same, unchanged file again in a
     * later run leads to the same outcome.
     * <p>
     * In M3, retry logic: exactly 1 retry in a later batch run.
     */
    NO_USABLE_TEXT("No usable text in extracted PDF content"),

    /**
     * The document has more pages than the configured limit allows.
     * <p>
     * Deterministic content error: the page count does not change unless the source
     * file is modified.
     * <p>
     * In M3, retry logic: exactly 1 retry in a later batch run.
     */
    PAGE_LIMIT_EXCEEDED("Document page count exceeds configured limit");

    /** Human-readable text associated with the constant. */
    private final String description;

    M3PreCheckFailureReason(String humanReadableText) {
        description = humanReadableText;
    }

    /**
     * Gives the human-readable description of this failure reason.
     *
     * @return the description
     */
    public String getDescription() {
        return this.description;
    }
}
|
||||
@@ -0,0 +1,36 @@
|
||||
package de.gecheckt.pdf.umbenenner.domain.model;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Represents a document that passed all M3 pre-checks.
|
||||
* <p>
|
||||
* This result encapsulates:
|
||||
* <ul>
|
||||
* <li>The original document candidate metadata</li>
|
||||
* <li>The successful PDF text extraction result</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* A document with this decision is ready to proceed to M4 and later milestones
|
||||
* (fingerprinting, persistence, KI integration, filename generation, target copy).
|
||||
*
|
||||
* @param candidate the source document metadata
|
||||
* @param extraction the successful text extraction result
|
||||
* @since M3-AP-001
|
||||
*/
|
||||
public record M3PreCheckPassed(
|
||||
SourceDocumentCandidate candidate,
|
||||
PdfExtractionSuccess extraction
|
||||
) implements M3ProcessingDecision {
|
||||
/**
|
||||
* Constructor with validation.
|
||||
*
|
||||
* @param candidate must be non-null
|
||||
* @param extraction must be non-null
|
||||
* @throws NullPointerException if either parameter is null
|
||||
*/
|
||||
public M3PreCheckPassed {
|
||||
Objects.requireNonNull(candidate, "candidate must not be null");
|
||||
Objects.requireNonNull(extraction, "extraction must not be null");
|
||||
}
|
||||
}
|
||||
@@ -1,13 +1,12 @@
|
||||
package de.gecheckt.pdf.umbenenner.domain.model;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Sealed interface representing the outcome of M3 document pre-checks.
|
||||
* <p>
|
||||
* This is a placeholder interface introduced in AP-001 to establish the architectural
|
||||
* This interface introduced in AP-001 establishes the architectural
|
||||
* pattern for M3 pre-check results. The actual pre-check logic (fachlich validation
|
||||
* such as "brauchbarer Text" and "Seitenlimit") is implemented in later APs (AP-004, AP-005).
|
||||
* such as "brauchbarer Text" and "Seitenlimit") is implemented in AP-004 via
|
||||
* {@link de.gecheckt.pdf.umbenenner.application.service.M3PreCheckEvaluator}.
|
||||
* <p>
|
||||
* There are two allowed implementations:
|
||||
* <ul>
|
||||
@@ -29,79 +28,3 @@ public sealed interface M3ProcessingDecision
|
||||
permits M3PreCheckPassed, M3PreCheckFailed {
|
||||
// Marker interface; concrete implementations define structure
|
||||
}
|
||||
|
||||
/**
|
||||
* Represents a document that passed all M3 pre-checks.
|
||||
* <p>
|
||||
* This result encapsulates:
|
||||
* <ul>
|
||||
* <li>The original document candidate metadata</li>
|
||||
* <li>The successful PDF text extraction result</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* A document with this decision is ready to proceed to M4 and later milestones
|
||||
* (fingerprinting, persistence, KI integration, filename generation, target copy).
|
||||
*
|
||||
* @param candidate the source document metadata
|
||||
* @param extraction the successful text extraction result
|
||||
* @since M3-AP-001
|
||||
*/
|
||||
record M3PreCheckPassed(
|
||||
SourceDocumentCandidate candidate,
|
||||
PdfExtractionSuccess extraction
|
||||
) implements M3ProcessingDecision {
|
||||
/**
|
||||
* Constructor with validation.
|
||||
*
|
||||
* @param candidate must be non-null
|
||||
* @param extraction must be non-null
|
||||
* @throws NullPointerException if either parameter is null
|
||||
*/
|
||||
M3PreCheckPassed {
|
||||
Objects.requireNonNull(candidate, "candidate must not be null");
|
||||
Objects.requireNonNull(extraction, "extraction must not be null");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Represents a document that failed an M3 pre-check.
|
||||
* <p>
|
||||
* This result encapsulates:
|
||||
* <ul>
|
||||
* <li>The original document candidate metadata (for correlation)</li>
|
||||
* <li>A description of why the pre-check failed</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* Reasons include:
|
||||
* <ul>
|
||||
* <li>"No usable text" – extraction yielded no meaningful content</li>
|
||||
* <li>"Page limit exceeded" – document exceeds the configured page limit</li>
|
||||
* <li>"Technical extraction error" – I/O or PDFBox failure (may be retryable later)</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* A document with this decision will not proceed further in the current batch run.
|
||||
*
|
||||
* @param candidate the source document metadata
|
||||
* @param failureReason a human-readable explanation of the pre-check failure
|
||||
* @since M3-AP-001
|
||||
*/
|
||||
record M3PreCheckFailed(
|
||||
SourceDocumentCandidate candidate,
|
||||
String failureReason
|
||||
) implements M3ProcessingDecision {
|
||||
/**
|
||||
* Constructor with validation.
|
||||
*
|
||||
* @param candidate must be non-null
|
||||
* @param failureReason must be non-null and non-empty
|
||||
* @throws NullPointerException if either parameter is null
|
||||
* @throws IllegalArgumentException if failureReason is empty
|
||||
*/
|
||||
M3PreCheckFailed {
|
||||
Objects.requireNonNull(candidate, "candidate must not be null");
|
||||
Objects.requireNonNull(failureReason, "failureReason must not be null");
|
||||
if (failureReason.isEmpty()) {
|
||||
throw new IllegalArgumentException("failureReason must not be empty");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,6 +13,17 @@
|
||||
* <li>{@link de.gecheckt.pdf.umbenenner.domain.model.M3ProcessingDecision} — sealed result of M3 pre-checks (M3-AP-001)</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* Additional classes introduced in M3:
|
||||
* <ul>
|
||||
* <li>{@link de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckFailureReason} — enumeration of M3 pre-check failure reasons (M3-AP-004)</li>
|
||||
* </ul>
|
||||
*
|
||||
* Implementation classes:
|
||||
* <ul>
|
||||
* <li>{@link de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckPassed} — document passed M3 pre-checks (M3-AP-001, M3-AP-004)</li>
|
||||
* <li>{@link de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckFailed} — document failed M3 pre-check (M3-AP-001, M3-AP-004)</li>
|
||||
* </ul>
|
||||
*
|
||||
* All classes in this package are:
|
||||
* <ul>
|
||||
* <li>Infrastructure-agnostic (no database, filesystem, network, or framework dependencies)</li>
|
||||
|
||||
Reference in New Issue
Block a user