Inhaltsfehler bei PDF-Extraktion korrekt klassifiziert

2026-04-02 15:10:47 +02:00
parent 60498ab3c8
commit 747f22438d
2 changed files with 13 additions and 10 deletions
@@ -2,6 +2,7 @@ package de.gecheckt.pdf.umbenenner.application.service;
 import de.gecheckt.pdf.umbenenner.application.config.StartConfiguration;
 import de.gecheckt.pdf.umbenenner.domain.model.DocumentProcessingOutcome;
 import de.gecheckt.pdf.umbenenner.domain.model.PreCheckFailed;
 import de.gecheckt.pdf.umbenenner.domain.model.TechnicalDocumentError;
 import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionContentError;
 import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionResult;
@@ -16,7 +17,8 @@ import java.util.Objects;
 * <p>
 * Converts technical extraction results into processing outcomes through this pipeline:
 * <ol>
- *   <li>If extraction fails (content or technical): {@link TechnicalDocumentError}</li>
+ *   <li>If extraction fails (technical): {@link TechnicalDocumentError}</li>
+ *   <li>If extraction fails (content): {@link PreCheckFailed}</li>
 *   <li>If extraction succeeds: Evaluate pre-checks via {@link PreCheckEvaluator}</li>
 * </ol>
 * <p>
@@ -24,7 +26,7 @@ import java.util.Objects;
 * all document processing outcomes:
 * <ul>
 *   <li>Pre-check passed (document ready for further processing)</li>
- *   <li>Pre-check failed (deterministic content error: no usable text, page limit exceeded)</li>
+ *   <li>Pre-check failed (deterministic content error: no usable text, page limit exceeded, content not extractable)</li>
 *   <li>Technical document error (I/O, access, PDF parsing, etc.)</li>
 * </ul>
 * <p>
@@ -38,7 +40,8 @@ public class DocumentProcessingService {
     * Pipeline:
     * <ol>
     *   <li>Extract text and page count from the PDF candidate</li>
-     *   <li>If extraction fails (technical or content): classify as technical document error</li>
+     *   <li>If extraction fails (technical): classify as technical document error</li>
+     *   <li>If extraction fails (content): classify as pre-check failed (deterministic content error)</li>
     *   <li>If extraction succeeds: evaluate pre-checks</li>
     * </ol>
     *
@@ -63,8 +66,8 @@ public class DocumentProcessingService {
                PreCheckEvaluator.evaluate(candidate, success, configuration);
            case PdfExtractionContentError contentError ->
-                // PDF content not extractable: classify as technical document error
+                // PDF content not extractable: classify as pre-check failed (deterministic content error)
-                new TechnicalDocumentError(candidate, "PDF content not extractable: " + contentError.reason(), null);
+                new PreCheckFailed(candidate, "PDF content not extractable: " + contentError.reason());
            case PdfExtractionTechnicalError technicalError ->
                // Technical failure during extraction: potentially retryable
@@ -119,18 +119,18 @@ class DocumentProcessingServiceTest {
    @Test
    void testProcessDocument_WithContentError() {
-        // Arrange: PDF content not extractable (classified as technical document error)
+        // Arrange: PDF content not extractable (classified as pre-check failed)
        var contentError = new PdfExtractionContentError("PDF is corrupted");
        // Act
        DocumentProcessingOutcome outcome = DocumentProcessingService.processDocument(
                candidate, contentError, configuration);
-        // Assert: Should produce TechnicalDocumentError
+        // Assert: Should produce PreCheckFailed
-        assertInstanceOf(TechnicalDocumentError.class, outcome);
+        assertInstanceOf(PreCheckFailed.class, outcome);
-        TechnicalDocumentError result = (TechnicalDocumentError) outcome;
+        PreCheckFailed result = (PreCheckFailed) outcome;
        assertEquals(candidate, result.candidate());
-        assertTrue(result.errorMessage().contains("PDF is corrupted"));
+        assertTrue(result.failureReason().contains("PDF is corrupted"));
    }
    @Test