1
0

M3-AP-004: Vorprüfung auf Unicode-fähigen brauchbaren Text korrigiert

This commit is contained in:
2026-04-01 19:07:03 +02:00
parent a9407aaba2
commit c482b20df9
9 changed files with 568 additions and 81 deletions

View File

@@ -0,0 +1,119 @@
package de.gecheckt.pdf.umbenenner.application.service;
import de.gecheckt.pdf.umbenenner.application.config.StartConfiguration;
import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckFailureReason;
import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckFailed;
import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckPassed;
import de.gecheckt.pdf.umbenenner.domain.model.M3ProcessingDecision;
import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionSuccess;
import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate;
import java.util.Objects;
/**
 * Evaluates whether a successfully extracted PDF passes the M3 pre-checks.
 * <p>
 * The M3 pre-checks verify that:
 * <ul>
 *   <li>the extracted text contains at least one letter or digit, and</li>
 *   <li>the document's page count does not exceed the configured limit.</li>
 * </ul>
 * <p>
 * A document that passes both pre-checks yields an {@link M3PreCheckPassed}; a document that
 * fails one yields an {@link M3PreCheckFailed} carrying the description of the first failing
 * check. The text check is evaluated before the page-limit check.
 * <p>
 * This class has no fields and only static methods, so it is stateless and thread-safe.
 *
 * @since M3-AP-004
 */
public class M3PreCheckEvaluator {

    private M3PreCheckEvaluator() {
        // Static utility class, no instances
    }

    /**
     * Evaluates the M3 pre-checks for a successfully extracted PDF document.
     * <p>
     * Pre-check order:
     * <ol>
     *   <li>The extracted text must contain at least one letter or digit
     *       (failure: {@link M3PreCheckFailureReason#NO_USABLE_TEXT}).</li>
     *   <li>The page count must not exceed {@code configuration.maxPages()}
     *       (failure: {@link M3PreCheckFailureReason#PAGE_LIMIT_EXCEEDED}).</li>
     * </ol>
     * <p>
     * The first failing check determines the reported reason; later checks are not evaluated.
     *
     * @param candidate     the source document metadata
     * @param extraction    the successfully extracted PDF content
     * @param configuration the startup configuration (used for the maxPages limit)
     * @return {@link M3PreCheckPassed} if both checks pass, otherwise {@link M3PreCheckFailed}
     *         with the description of the failing check
     * @throws NullPointerException if any parameter is null, or if the extracted text is null
     */
    public static M3ProcessingDecision evaluate(
            SourceDocumentCandidate candidate,
            PdfExtractionSuccess extraction,
            StartConfiguration configuration) {
        Objects.requireNonNull(candidate, "candidate must not be null");
        Objects.requireNonNull(extraction, "extraction must not be null");
        Objects.requireNonNull(configuration, "configuration must not be null");

        // Pre-check 1: the document must contain usable text
        if (!hasUsableText(extraction.extractedText())) {
            return new M3PreCheckFailed(
                    candidate,
                    M3PreCheckFailureReason.NO_USABLE_TEXT.getDescription());
        }

        // Pre-check 2: the page count must not exceed the configured limit
        if (extraction.pageCount().exceedsLimit(configuration.maxPages())) {
            return new M3PreCheckFailed(
                    candidate,
                    M3PreCheckFailureReason.PAGE_LIMIT_EXCEEDED.getDescription());
        }

        // All pre-checks passed
        return new M3PreCheckPassed(candidate, extraction);
    }

    /**
     * Determines whether the extracted text contains at least one meaningful character.
     * <p>
     * Definition of "usable text" for M3:
     * <ul>
     *   <li>At least one Unicode letter or digit is present.</li>
     *   <li>Pure whitespace or only punctuation/special characters do not qualify.</li>
     *   <li>Letters and digits are classified per Unicode, not limited to ASCII
     *       (e.g., ÄÖÜß, Greek, CJK).</li>
     * </ul>
     * <p>
     * The text is scanned by Unicode code point and tested with
     * {@link Character#isLetterOrDigit(int)}. Scanning individual {@code char} values would
     * miss letters and digits outside the Basic Multilingual Plane, because those are encoded
     * as surrogate pairs and a lone surrogate is never classified as a letter or digit.
     * <p>
     * No explicit trimming is needed: whitespace is never a letter or digit, so leading and
     * trailing whitespace cannot influence the result.
     *
     * @param text the extracted text from the PDF (non-null, may be empty)
     * @return true if the text contains at least one Unicode letter or digit
     * @throws NullPointerException if {@code text} is null
     */
    private static boolean hasUsableText(String text) {
        Objects.requireNonNull(text, "text must not be null");
        // Iterate code points (not UTF-16 units) so supplementary-plane characters are recognized.
        return text.codePoints().anyMatch(Character::isLetterOrDigit);
    }
}

View File

@@ -0,0 +1,20 @@
/**
* Application-level services for business logic evaluation.
* <p>
* This package contains stateless, pure-logic services that evaluate document content
* and apply business rules. Services in this package:
* <ul>
* <li>Do not manage state or resources</li>
* <li>Do not depend on infrastructure (database, filesystem, network)</li>
* <li>Can be tested with simple unit tests and in-memory mocks</li>
* <li>Are reused by multiple use cases or adapters</li>
* </ul>
*
* Current services:
* <ul>
* <li>{@link de.gecheckt.pdf.umbenenner.application.service.M3PreCheckEvaluator} — M3 pre-check evaluation (M3-AP-004)</li>
* </ul>
*
* @since M3-AP-004
*/
package de.gecheckt.pdf.umbenenner.application.service;

View File

@@ -0,0 +1,276 @@
package de.gecheckt.pdf.umbenenner.application.service;
import de.gecheckt.pdf.umbenenner.application.config.StartConfiguration;
import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckFailed;
import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckFailureReason;
import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckPassed;
import de.gecheckt.pdf.umbenenner.domain.model.M3ProcessingDecision;
import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionSuccess;
import de.gecheckt.pdf.umbenenner.domain.model.PdfPageCount;
import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate;
import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentLocator;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Path;
import static org.junit.jupiter.api.Assertions.*;
/**
 * Tests for {@link M3PreCheckEvaluator}.
 * <p>
 * Verifies the M3 pre-check logic for usable text and page limit validation, including
 * null-argument handling and non-ASCII (Unicode) text.
 */
class M3PreCheckEvaluatorTest {

    /** Fresh temporary directory used to create the files and directories the fixtures point at. */
    @TempDir
    Path tempDir;

    // =========================================================================
    // Happy path
    // =========================================================================

    @Test
    void evaluate_passesWhenDocumentHasUsableTextAndValidPageCount() throws Exception {
        StartConfiguration config = buildConfig(maxPages(10));
        SourceDocumentCandidate candidate = buildCandidate();
        PdfExtractionSuccess extraction = new PdfExtractionSuccess("Some meaningful text", new PdfPageCount(5));

        M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);

        assertTrue(result instanceof M3PreCheckPassed, "Should pass when text is usable and page count is valid");
        M3PreCheckPassed passed = (M3PreCheckPassed) result;
        // JUnit argument order is (expected, actual, message)
        assertSame(candidate, passed.candidate(), "Candidate should be preserved");
        assertSame(extraction, passed.extraction(), "Extraction should be preserved");
    }

    // =========================================================================
    // Usable-text check
    // =========================================================================

    @Test
    void evaluate_failsWithNoUsableTextWhenExtractedTextIsEmpty() throws Exception {
        StartConfiguration config = buildConfig(maxPages(10));
        SourceDocumentCandidate candidate = buildCandidate();
        PdfExtractionSuccess extraction = new PdfExtractionSuccess("", new PdfPageCount(1));

        M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);

        assertTrue(result instanceof M3PreCheckFailed, "Should fail with empty text");
        M3PreCheckFailed failed = (M3PreCheckFailed) result;
        assertEquals(M3PreCheckFailureReason.NO_USABLE_TEXT.getDescription(), failed.failureReason());
    }

    @Test
    void evaluate_failsWithNoUsableTextWhenTextIsOnlyWhitespace() throws Exception {
        StartConfiguration config = buildConfig(maxPages(10));
        SourceDocumentCandidate candidate = buildCandidate();
        PdfExtractionSuccess extraction = new PdfExtractionSuccess(" \n\t \r\n ", new PdfPageCount(1));

        M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);

        assertTrue(result instanceof M3PreCheckFailed, "Should fail with whitespace-only text");
        M3PreCheckFailed failed = (M3PreCheckFailed) result;
        assertEquals(M3PreCheckFailureReason.NO_USABLE_TEXT.getDescription(), failed.failureReason());
    }

    @Test
    void evaluate_failsWithNoUsableTextWhenTextContainsOnlySpecialCharacters() throws Exception {
        StartConfiguration config = buildConfig(maxPages(10));
        SourceDocumentCandidate candidate = buildCandidate();
        PdfExtractionSuccess extraction = new PdfExtractionSuccess("!@#$%^&*()_+-=[]{}|;:',.<>?/", new PdfPageCount(1));

        M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);

        assertTrue(result instanceof M3PreCheckFailed, "Should fail with special characters only");
        M3PreCheckFailed failed = (M3PreCheckFailed) result;
        assertEquals(M3PreCheckFailureReason.NO_USABLE_TEXT.getDescription(), failed.failureReason());
    }

    @Test
    void evaluate_passesWithTextContainingSingleLetter() throws Exception {
        StartConfiguration config = buildConfig(maxPages(10));
        SourceDocumentCandidate candidate = buildCandidate();
        PdfExtractionSuccess extraction = new PdfExtractionSuccess("a", new PdfPageCount(1));

        M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);

        assertTrue(result instanceof M3PreCheckPassed, "Should pass with single letter");
    }

    @Test
    void evaluate_passesWithTextContainingSingleDigit() throws Exception {
        StartConfiguration config = buildConfig(maxPages(10));
        SourceDocumentCandidate candidate = buildCandidate();
        PdfExtractionSuccess extraction = new PdfExtractionSuccess("5", new PdfPageCount(1));

        M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);

        assertTrue(result instanceof M3PreCheckPassed, "Should pass with single digit");
    }

    @Test
    void evaluate_passesWithTextMixedWithSpecialCharactersIfLettersOrDigitsPresent() throws Exception {
        StartConfiguration config = buildConfig(maxPages(10));
        SourceDocumentCandidate candidate = buildCandidate();
        PdfExtractionSuccess extraction = new PdfExtractionSuccess("!@#a$%^&*", new PdfPageCount(1));

        M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);

        assertTrue(result instanceof M3PreCheckPassed, "Should pass when letters/digits are present among special chars");
    }

    @Test
    void evaluate_passesWithWhitespaceAroundUsableText() throws Exception {
        StartConfiguration config = buildConfig(maxPages(10));
        SourceDocumentCandidate candidate = buildCandidate();
        PdfExtractionSuccess extraction = new PdfExtractionSuccess(" meaningful text ", new PdfPageCount(1));

        M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);

        assertTrue(result instanceof M3PreCheckPassed, "Should pass when text has meaningful content despite whitespace");
    }

    // =========================================================================
    // Page-limit check
    // =========================================================================

    @Test
    void evaluate_passesWhenPageCountEqualsLimit() throws Exception {
        // Boundary case: a page count equal to the limit does not exceed it.
        StartConfiguration config = buildConfig(maxPages(5));
        SourceDocumentCandidate candidate = buildCandidate();
        PdfExtractionSuccess extraction = new PdfExtractionSuccess("Valid text", new PdfPageCount(5));

        M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);

        assertTrue(result instanceof M3PreCheckPassed, "Should pass when page count equals limit (not exceeded)");
    }

    @Test
    void evaluate_failsWithPageLimitExceededWhenPageCountExceedsLimit() throws Exception {
        StartConfiguration config = buildConfig(maxPages(5));
        SourceDocumentCandidate candidate = buildCandidate();
        PdfExtractionSuccess extraction = new PdfExtractionSuccess("Valid text", new PdfPageCount(6));

        M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);

        assertTrue(result instanceof M3PreCheckFailed, "Should fail when page count exceeds limit");
        M3PreCheckFailed failed = (M3PreCheckFailed) result;
        assertEquals(M3PreCheckFailureReason.PAGE_LIMIT_EXCEEDED.getDescription(), failed.failureReason());
    }

    @Test
    void evaluate_failsWithPageLimitExceededEvenIfTextIsValid() throws Exception {
        StartConfiguration config = buildConfig(maxPages(2));
        SourceDocumentCandidate candidate = buildCandidate();
        PdfExtractionSuccess extraction = new PdfExtractionSuccess("Excellent meaningful text with lots of content", new PdfPageCount(100));

        M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);

        assertTrue(result instanceof M3PreCheckFailed, "Should fail with page limit exceeded even if text is good");
        M3PreCheckFailed failed = (M3PreCheckFailed) result;
        assertEquals(M3PreCheckFailureReason.PAGE_LIMIT_EXCEEDED.getDescription(), failed.failureReason());
    }

    @Test
    void evaluate_reportsAFailureReasonWhenBothChecksFail() throws Exception {
        // Both the text check and the page-limit check would fail for this input.
        // This test deliberately does not pin which of the two reasons is reported;
        // it only verifies that the result is a failure carrying a non-empty reason.
        StartConfiguration config = buildConfig(maxPages(1));
        SourceDocumentCandidate candidate = buildCandidate();
        PdfExtractionSuccess extraction = new PdfExtractionSuccess("", new PdfPageCount(10));

        M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);

        assertTrue(result instanceof M3PreCheckFailed, "Should fail when both checks fail");
        M3PreCheckFailed failed = (M3PreCheckFailed) result;
        assertNotNull(failed.failureReason());
        assertFalse(failed.failureReason().isEmpty());
    }

    // =========================================================================
    // Null handling
    // =========================================================================

    @Test
    void evaluate_throwsNullPointerExceptionWhenCandidateIsNull() throws Exception {
        StartConfiguration config = buildConfig(maxPages(10));
        PdfExtractionSuccess extraction = new PdfExtractionSuccess("Valid text", new PdfPageCount(1));

        assertThrows(NullPointerException.class, () ->
                M3PreCheckEvaluator.evaluate(null, extraction, config)
        );
    }

    @Test
    void evaluate_throwsNullPointerExceptionWhenExtractionIsNull() throws Exception {
        StartConfiguration config = buildConfig(maxPages(10));
        SourceDocumentCandidate candidate = buildCandidate();

        assertThrows(NullPointerException.class, () ->
                M3PreCheckEvaluator.evaluate(candidate, null, config)
        );
    }

    @Test
    void evaluate_throwsNullPointerExceptionWhenConfigurationIsNull() throws Exception {
        SourceDocumentCandidate candidate = buildCandidate();
        PdfExtractionSuccess extraction = new PdfExtractionSuccess("Valid text", new PdfPageCount(1));

        assertThrows(NullPointerException.class, () ->
                M3PreCheckEvaluator.evaluate(candidate, extraction, null)
        );
    }

    // =========================================================================
    // Unicode text
    // =========================================================================

    @Test
    void evaluate_passesWithUnicodeGermanUmlauts() throws Exception {
        StartConfiguration config = buildConfig(maxPages(10));
        SourceDocumentCandidate candidate = buildCandidate();
        PdfExtractionSuccess extraction = new PdfExtractionSuccess("Äußerst äöüß Großes", new PdfPageCount(1));

        M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);

        assertTrue(result instanceof M3PreCheckPassed, "Should pass with German umlauts (ÄÖÜß)");
    }

    @Test
    void evaluate_passesWithOtherUnicodeCharacters() throws Exception {
        StartConfiguration config = buildConfig(maxPages(10));
        SourceDocumentCandidate candidate = buildCandidate();
        PdfExtractionSuccess extraction = new PdfExtractionSuccess("Αβγδ 中文 καλημέρα", new PdfPageCount(1));

        M3ProcessingDecision result = M3PreCheckEvaluator.evaluate(candidate, extraction, config);

        assertTrue(result instanceof M3PreCheckPassed, "Should pass with Greek, Chinese, and other Unicode letters");
    }

    // =========================================================================
    // Helpers
    // =========================================================================

    /**
     * Builds a {@link StartConfiguration} whose paths live under {@link #tempDir}.
     * Only {@code maxPages} varies between tests; all other values are fixed placeholders.
     * Creates the db and prompt files, so it must be called at most once per test.
     */
    private StartConfiguration buildConfig(int maxPages) throws Exception {
        Path sourceDir = Files.createDirectories(tempDir.resolve("source"));
        Path targetDir = Files.createDirectories(tempDir.resolve("target"));
        Path dbFile = tempDir.resolve("db.sqlite");
        Files.createFile(dbFile);
        Path promptFile = tempDir.resolve("prompt.txt");
        Files.createFile(promptFile);
        return new StartConfiguration(
                sourceDir,
                targetDir,
                dbFile,
                URI.create("https://api.example.com"),
                "gpt-4",
                30,
                3,
                maxPages,
                50000,
                promptFile,
                tempDir.resolve("lock.lock"),
                tempDir.resolve("logs"),
                "INFO",
                "test-key"
        );
    }

    /** Identity helper that names the argument at call sites: {@code buildConfig(maxPages(10))}. */
    private int maxPages(int limit) {
        return limit;
    }

    /**
     * Builds a candidate pointing at an empty {@code test.pdf} created under {@link #tempDir}.
     * Creates the file, so it must be called at most once per test.
     */
    private SourceDocumentCandidate buildCandidate() throws Exception {
        Path sourceDir = Files.createDirectories(tempDir.resolve("source"));
        Path pdfFile = sourceDir.resolve("test.pdf");
        Files.createFile(pdfFile);
        SourceDocumentLocator locator = new SourceDocumentLocator(pdfFile.toString());
        return new SourceDocumentCandidate(pdfFile.getFileName().toString(), 0L, locator);
    }
}