M3-AP-004: Vorprüfung auf Unicode-fähigen brauchbaren Text korrigiert
This commit is contained in:
@@ -0,0 +1,119 @@
|
||||
package de.gecheckt.pdf.umbenenner.application.service;
|
||||
|
||||
import de.gecheckt.pdf.umbenenner.application.config.StartConfiguration;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckFailureReason;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckFailed;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckPassed;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.M3ProcessingDecision;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionSuccess;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Evaluates whether a successfully extracted PDF passes M3 pre-checks.
|
||||
* <p>
|
||||
* M3 Pre-checks verify that:
|
||||
* <ul>
|
||||
* <li>The extracted text contains at least one meaningful character after normalization</li>
|
||||
* <li>The document's page count does not exceed the configured limit</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* A document that passes both pre-checks is ready to proceed to M4 and later milestones.
|
||||
* A document that fails a pre-check is classified with a specific deterministic failure reason
|
||||
* and will not proceed further in the current batch run.
|
||||
* <p>
|
||||
* This service is stateless and thread-safe.
|
||||
*
|
||||
* @since M3-AP-004
|
||||
*/
|
||||
public class M3PreCheckEvaluator {
|
||||
|
||||
/**
|
||||
* Evaluates M3 pre-checks for a successfully extracted PDF document.
|
||||
* <p>
|
||||
* Pre-check logic:
|
||||
* <ol>
|
||||
* <li>Check if extracted text contains at least one letter or digit after normalization</li>
|
||||
* <li>Check if document page count does not exceed the configured limit</li>
|
||||
* </ol>
|
||||
* <p>
|
||||
* Returns {@link M3PreCheckPassed} if both checks pass, or {@link M3PreCheckFailed}
|
||||
* with a specific reason if any check fails.
|
||||
*
|
||||
* @param candidate the source document metadata
|
||||
* @param extraction the successfully extracted PDF content
|
||||
* @param configuration the startup configuration (used for maxPages limit)
|
||||
* @return the pre-check decision: passed or failed with reason
|
||||
* @throws NullPointerException if any parameter is null
|
||||
*/
|
||||
public static M3ProcessingDecision evaluate(
|
||||
SourceDocumentCandidate candidate,
|
||||
PdfExtractionSuccess extraction,
|
||||
StartConfiguration configuration) {
|
||||
|
||||
Objects.requireNonNull(candidate, "candidate must not be null");
|
||||
Objects.requireNonNull(extraction, "extraction must not be null");
|
||||
Objects.requireNonNull(configuration, "configuration must not be null");
|
||||
|
||||
// Pre-check 1: Verify document has usable text
|
||||
if (!hasUsableText(extraction.extractedText())) {
|
||||
return new M3PreCheckFailed(
|
||||
candidate,
|
||||
M3PreCheckFailureReason.NO_USABLE_TEXT.getDescription()
|
||||
);
|
||||
}
|
||||
|
||||
// Pre-check 2: Verify document page count does not exceed configured limit
|
||||
if (extraction.pageCount().exceedsLimit(configuration.maxPages())) {
|
||||
return new M3PreCheckFailed(
|
||||
candidate,
|
||||
M3PreCheckFailureReason.PAGE_LIMIT_EXCEEDED.getDescription()
|
||||
);
|
||||
}
|
||||
|
||||
// All pre-checks passed
|
||||
return new M3PreCheckPassed(candidate, extraction);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether the extracted text contains at least one meaningful character.
|
||||
* <p>
|
||||
* Definition of "usable text" for M3:
|
||||
* <ul>
|
||||
* <li>After normalization (trimming whitespace), at least one letter or digit remains</li>
|
||||
* <li>Pure whitespace or only special characters do not qualify as usable text</li>
|
||||
* <li>Letters and digits include Unicode characters (e.g., ÄÖÜß, äöüß, etc.)</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* Normalization process:
|
||||
* <ol>
|
||||
* <li>Trim leading and trailing whitespace</li>
|
||||
* <li>Scan for at least one character where {@link Character#isLetterOrDigit(char)} returns true</li>
|
||||
* <li>Unicode-aware character classification (not limited to ASCII)</li>
|
||||
* </ol>
|
||||
*
|
||||
* @param text the extracted text from the PDF (non-null, may be empty)
|
||||
* @return true if text contains at least one letter or digit (Unicode-aware) after normalization
|
||||
*/
|
||||
private static boolean hasUsableText(String text) {
|
||||
Objects.requireNonNull(text, "text must not be null");
|
||||
|
||||
// Trim whitespace first
|
||||
String trimmed = text.strip();
|
||||
|
||||
// Check if text contains at least one letter or digit (Unicode-aware)
|
||||
for (char c : trimmed.toCharArray()) {
|
||||
if (Character.isLetterOrDigit(c)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// No letter or digit found
|
||||
return false;
|
||||
}
|
||||
|
||||
private M3PreCheckEvaluator() {
|
||||
// Static utility class – no instances
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,20 @@
|
||||
/**
|
||||
* Application-level services for business logic evaluation.
|
||||
* <p>
|
||||
* This package contains stateless, pure-logic services that evaluate document content
|
||||
* and apply business rules. Services in this package:
|
||||
* <ul>
|
||||
* <li>Do not manage state or resources</li>
|
||||
* <li>Do not depend on infrastructure (database, filesystem, network)</li>
|
||||
* <li>Can be tested with simple unit tests and in-memory mocks</li>
|
||||
* <li>Are reused by multiple use cases or adapters</li>
|
||||
* </ul>
|
||||
*
|
||||
* Current services:
|
||||
* <ul>
|
||||
* <li>{@link de.gecheckt.pdf.umbenenner.application.service.M3PreCheckEvaluator} — M3 pre-check evaluation (M3-AP-004)</li>
|
||||
* </ul>
|
||||
*
|
||||
* @since M3-AP-004
|
||||
*/
|
||||
package de.gecheckt.pdf.umbenenner.application.service;
|
||||
Reference in New Issue
Block a user