1
0

M3-AP-004: Vorprüfung auf Unicode-fähigen brauchbaren Text korrigiert

This commit is contained in:
2026-04-01 19:07:03 +02:00
parent a9407aaba2
commit c482b20df9
9 changed files with 568 additions and 81 deletions

View File

@@ -0,0 +1,46 @@
package de.gecheckt.pdf.umbenenner.domain.model;
import java.util.Objects;
/**
 * Decision for a document that did not pass an M3 pre-check.
 * <p>
 * The record carries:
 * <ul>
 *   <li>the original document candidate metadata (for correlation), and</li>
 *   <li>a human-readable description of why the pre-check failed.</li>
 * </ul>
 * <p>
 * Typical reasons:
 * <ul>
 *   <li>"No usable text": extraction yielded no meaningful content</li>
 *   <li>"Page limit exceeded": the document exceeds the configured page limit</li>
 *   <li>"Technical extraction error": I/O or PDFBox failure (may be retryable later)</li>
 * </ul>
 * <p>
 * A document with this decision will not proceed further in the current batch run.
 *
 * @param candidate     the source document metadata; never {@code null}
 * @param failureReason a human-readable explanation of the pre-check failure;
 *                      never {@code null}, empty, or whitespace-only
 * @since M3-AP-001
 */
public record M3PreCheckFailed(
        SourceDocumentCandidate candidate,
        String failureReason
) implements M3ProcessingDecision {

    /**
     * Validates both components before the record is created.
     *
     * @param candidate     must be non-null
     * @param failureReason must be non-null and contain at least one non-whitespace character
     * @throws NullPointerException     if either parameter is null
     * @throws IllegalArgumentException if failureReason is empty or consists only of whitespace
     */
    public M3PreCheckFailed {
        Objects.requireNonNull(candidate, "candidate must not be null");
        Objects.requireNonNull(failureReason, "failureReason must not be null");
        // isBlank() also covers the empty string; a whitespace-only reason is as
        // useless to a reader as an empty one, so both are rejected.
        if (failureReason.isBlank()) {
            throw new IllegalArgumentException("failureReason must not be blank");
        }
    }
}

View File

@@ -0,0 +1,54 @@
package de.gecheckt.pdf.umbenenner.domain.model;
/**
 * Reasons for which an M3 pre-check can reject a document.
 * <p>
 * Every constant describes a deterministic content error, i.e. a problem with the
 * document content itself rather than with the technical extraction:
 * <ul>
 *   <li>{@link #NO_USABLE_TEXT}: the extracted text contains no meaningful content
 *       after normalization.</li>
 *   <li>{@link #PAGE_LIMIT_EXCEEDED}: the document exceeds the configured page limit.</li>
 * </ul>
 * <p>
 * Technical extraction failures (I/O errors, PDFBox failures) are deliberately not part of
 * this enumeration; they are represented as {@link PdfExtractionTechnicalError} in the
 * extraction result.
 *
 * @since M3-AP-004
 */
public enum M3PreCheckFailureReason {

    /**
     * After normalization, the extracted PDF text contains no letters or digits.
     * <p>
     * Deterministic content error: processing the same, unchanged file again in a later
     * run leads to the same outcome.
     * <p>
     * Retry logic in M3: exactly 1 retry in a later batch run.
     */
    NO_USABLE_TEXT("No usable text in extracted PDF content"),

    /**
     * The page count of the document is above the configured limit.
     * <p>
     * Deterministic content error: the page count stays the same as long as the source
     * file is not modified.
     * <p>
     * Retry logic in M3: exactly 1 retry in a later batch run.
     */
    PAGE_LIMIT_EXCEEDED("Document page count exceeds configured limit");

    /** Human-readable text associated with the constant. */
    private final String description;

    M3PreCheckFailureReason(String text) {
        description = text;
    }

    /**
     * Provides the human-readable description of this failure reason.
     *
     * @return the description
     */
    public String getDescription() {
        return this.description;
    }
}

View File

@@ -0,0 +1,36 @@
package de.gecheckt.pdf.umbenenner.domain.model;
import java.util.Objects;
/**
 * Decision for a document that passed every M3 pre-check.
 * <p>
 * The record bundles:
 * <ul>
 *   <li>the original document candidate metadata, and</li>
 *   <li>the successful PDF text extraction result.</li>
 * </ul>
 * <p>
 * A document carrying this decision may continue to M4 and later milestones
 * (fingerprinting, persistence, AI integration, filename generation, target copy).
 *
 * @param candidate  the source document metadata
 * @param extraction the successful text extraction result
 * @since M3-AP-001
 */
public record M3PreCheckPassed(
        SourceDocumentCandidate candidate,
        PdfExtractionSuccess extraction
) implements M3ProcessingDecision {

    /**
     * Canonical constructor that rejects null components.
     *
     * @param candidate  must be non-null
     * @param extraction must be non-null
     * @throws NullPointerException if either parameter is null
     */
    public M3PreCheckPassed(SourceDocumentCandidate candidate, PdfExtractionSuccess extraction) {
        this.candidate = Objects.requireNonNull(candidate, "candidate must not be null");
        this.extraction = Objects.requireNonNull(extraction, "extraction must not be null");
    }
}

View File

@@ -1,13 +1,12 @@
package de.gecheckt.pdf.umbenenner.domain.model;
import java.util.Objects;
/**
* Sealed interface representing the outcome of M3 document pre-checks.
* <p>
* This is a placeholder interface introduced in AP-001 to establish the architectural
* This interface introduced in AP-001 establishes the architectural
* pattern for M3 pre-check results. The actual pre-check logic (fachlich validation
* such as "brauchbarer Text" and "Seitenlimit") is implemented in later APs (AP-004, AP-005).
* such as "brauchbarer Text" and "Seitenlimit") is implemented in AP-004 via
* {@link de.gecheckt.pdf.umbenenner.application.service.M3PreCheckEvaluator}.
* <p>
* There are two allowed implementations:
* <ul>
@@ -29,79 +28,3 @@ public sealed interface M3ProcessingDecision
permits M3PreCheckPassed, M3PreCheckFailed {
// Marker interface; concrete implementations define structure
}
/**
 * Outcome for a document that cleared all M3 pre-checks.
 * <p>
 * Contents of this result:
 * <ul>
 *   <li>the original document candidate metadata</li>
 *   <li>the successful PDF text extraction result</li>
 * </ul>
 * <p>
 * Such a document is ready for M4 and later milestones
 * (fingerprinting, persistence, AI integration, filename generation, target copy).
 *
 * @param candidate  the source document metadata
 * @param extraction the successful text extraction result
 * @since M3-AP-001
 */
record M3PreCheckPassed(
        SourceDocumentCandidate candidate,
        PdfExtractionSuccess extraction
) implements M3ProcessingDecision {

    /**
     * Compact constructor guarding against null components.
     *
     * @param candidate  must be non-null
     * @param extraction must be non-null
     * @throws NullPointerException if either parameter is null
     */
    M3PreCheckPassed {
        if (candidate == null) {
            throw new NullPointerException("candidate must not be null");
        }
        if (extraction == null) {
            throw new NullPointerException("extraction must not be null");
        }
    }
}
/**
 * Outcome for a document that failed an M3 pre-check.
 * <p>
 * Contents of this result:
 * <ul>
 *   <li>the original document candidate metadata (for correlation)</li>
 *   <li>a description of why the pre-check failed</li>
 * </ul>
 * <p>
 * Typical reasons:
 * <ul>
 *   <li>"No usable text": extraction yielded no meaningful content</li>
 *   <li>"Page limit exceeded": the document exceeds the configured page limit</li>
 *   <li>"Technical extraction error": I/O or PDFBox failure (may be retryable later)</li>
 * </ul>
 * <p>
 * A document with this decision will not proceed further in the current batch run.
 *
 * @param candidate     the source document metadata; never {@code null}
 * @param failureReason a human-readable explanation of the pre-check failure;
 *                      never {@code null}, empty, or whitespace-only
 * @since M3-AP-001
 */
record M3PreCheckFailed(
        SourceDocumentCandidate candidate,
        String failureReason
) implements M3ProcessingDecision {

    /**
     * Compact constructor validating both components.
     *
     * @param candidate     must be non-null
     * @param failureReason must be non-null and contain at least one non-whitespace character
     * @throws NullPointerException     if either parameter is null
     * @throws IllegalArgumentException if failureReason is empty or consists only of whitespace
     */
    M3PreCheckFailed {
        Objects.requireNonNull(candidate, "candidate must not be null");
        Objects.requireNonNull(failureReason, "failureReason must not be null");
        // A whitespace-only reason carries no information, so it is rejected
        // together with the empty string (isBlank() covers both).
        if (failureReason.isBlank()) {
            throw new IllegalArgumentException("failureReason must not be blank");
        }
    }
}

View File

@@ -13,6 +13,17 @@
* <li>{@link de.gecheckt.pdf.umbenenner.domain.model.M3ProcessingDecision} — sealed result of M3 pre-checks (M3-AP-001)</li>
* </ul>
* <p>
* Additional classes introduced in M3:
* <ul>
* <li>{@link de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckFailureReason} — enumeration of M3 pre-check failure reasons (M3-AP-004)</li>
* </ul>
* <p>
* Implementation classes:
* <ul>
* <li>{@link de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckPassed} — document passed M3 pre-checks (M3-AP-001, M3-AP-004)</li>
* <li>{@link de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckFailed} — document failed M3 pre-check (M3-AP-001, M3-AP-004)</li>
* </ul>
* <p>
* All classes in this package are:
* <ul>
* <li>Infrastructure-agnostic (no database, filesystem, network, or framework dependencies)</li>