1
0

Inhaltsfehler bei PDF-Extraktion korrekt klassifiziert

This commit is contained in:
2026-04-02 15:10:47 +02:00
parent 60498ab3c8
commit 747f22438d
2 changed files with 13 additions and 10 deletions

View File

@@ -2,6 +2,7 @@ package de.gecheckt.pdf.umbenenner.application.service;
import de.gecheckt.pdf.umbenenner.application.config.StartConfiguration; import de.gecheckt.pdf.umbenenner.application.config.StartConfiguration;
import de.gecheckt.pdf.umbenenner.domain.model.DocumentProcessingOutcome; import de.gecheckt.pdf.umbenenner.domain.model.DocumentProcessingOutcome;
import de.gecheckt.pdf.umbenenner.domain.model.PreCheckFailed;
import de.gecheckt.pdf.umbenenner.domain.model.TechnicalDocumentError; import de.gecheckt.pdf.umbenenner.domain.model.TechnicalDocumentError;
import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionContentError; import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionContentError;
import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionResult; import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionResult;
@@ -16,7 +17,8 @@ import java.util.Objects;
* <p> * <p>
* Converts technical extraction results into processing outcomes through this pipeline: * Converts technical extraction results into processing outcomes through this pipeline:
* <ol> * <ol>
* <li>If extraction fails (content or technical): {@link TechnicalDocumentError}</li> * <li>If extraction fails (technical): {@link TechnicalDocumentError}</li>
* <li>If extraction fails (content): {@link PreCheckFailed}</li>
* <li>If extraction succeeds: Evaluate pre-checks via {@link PreCheckEvaluator}</li> * <li>If extraction succeeds: Evaluate pre-checks via {@link PreCheckEvaluator}</li>
* </ol> * </ol>
* <p> * <p>
@@ -24,7 +26,7 @@ import java.util.Objects;
* all document processing outcomes: * all document processing outcomes:
* <ul> * <ul>
* <li>Pre-check passed (document ready for further processing)</li> * <li>Pre-check passed (document ready for further processing)</li>
* <li>Pre-check failed (deterministic content error: no usable text, page limit exceeded)</li> * <li>Pre-check failed (deterministic content error: no usable text, page limit exceeded, content not extractable)</li>
* <li>Technical document error (I/O, access, PDF parsing, etc.)</li> * <li>Technical document error (I/O, access, PDF parsing, etc.)</li>
* </ul> * </ul>
* <p> * <p>
@@ -38,7 +40,8 @@ public class DocumentProcessingService {
* Pipeline: * Pipeline:
* <ol> * <ol>
* <li>Extract text and page count from the PDF candidate</li> * <li>Extract text and page count from the PDF candidate</li>
* <li>If extraction fails (technical or content): classify as technical document error</li> * <li>If extraction fails (technical): classify as technical document error</li>
* <li>If extraction fails (content): classify as pre-check failed (deterministic content error)</li>
* <li>If extraction succeeds: evaluate pre-checks</li> * <li>If extraction succeeds: evaluate pre-checks</li>
* </ol> * </ol>
* *
@@ -63,8 +66,8 @@ public class DocumentProcessingService {
PreCheckEvaluator.evaluate(candidate, success, configuration); PreCheckEvaluator.evaluate(candidate, success, configuration);
case PdfExtractionContentError contentError -> case PdfExtractionContentError contentError ->
// PDF content not extractable: classify as technical document error // PDF content not extractable: classify as pre-check failed (deterministic content error)
new TechnicalDocumentError(candidate, "PDF content not extractable: " + contentError.reason(), null); new PreCheckFailed(candidate, "PDF content not extractable: " + contentError.reason());
case PdfExtractionTechnicalError technicalError -> case PdfExtractionTechnicalError technicalError ->
// Technical failure during extraction: potentially retryable // Technical failure during extraction: potentially retryable

View File

@@ -119,18 +119,18 @@ class DocumentProcessingServiceTest {
@Test @Test
void testProcessDocument_WithContentError() { void testProcessDocument_WithContentError() {
// Arrange: PDF content not extractable (classified as technical document error) // Arrange: PDF content not extractable (classified as pre-check failed)
var contentError = new PdfExtractionContentError("PDF is corrupted"); var contentError = new PdfExtractionContentError("PDF is corrupted");
// Act // Act
DocumentProcessingOutcome outcome = DocumentProcessingService.processDocument( DocumentProcessingOutcome outcome = DocumentProcessingService.processDocument(
candidate, contentError, configuration); candidate, contentError, configuration);
// Assert: Should produce TechnicalDocumentError // Assert: Should produce PreCheckFailed
assertInstanceOf(TechnicalDocumentError.class, outcome); assertInstanceOf(PreCheckFailed.class, outcome);
TechnicalDocumentError result = (TechnicalDocumentError) outcome; PreCheckFailed result = (PreCheckFailed) outcome;
assertEquals(candidate, result.candidate()); assertEquals(candidate, result.candidate());
assertTrue(result.errorMessage().contains("PDF is corrupted")); assertTrue(result.failureReason().contains("PDF is corrupted"));
} }
@Test @Test