Inhaltsfehler bei PDF-Extraktion korrekt klassifiziert
This commit is contained in:
@@ -2,6 +2,7 @@ package de.gecheckt.pdf.umbenenner.application.service;
|
|||||||
|
|
||||||
import de.gecheckt.pdf.umbenenner.application.config.StartConfiguration;
|
import de.gecheckt.pdf.umbenenner.application.config.StartConfiguration;
|
||||||
import de.gecheckt.pdf.umbenenner.domain.model.DocumentProcessingOutcome;
|
import de.gecheckt.pdf.umbenenner.domain.model.DocumentProcessingOutcome;
|
||||||
|
import de.gecheckt.pdf.umbenenner.domain.model.PreCheckFailed;
|
||||||
import de.gecheckt.pdf.umbenenner.domain.model.TechnicalDocumentError;
|
import de.gecheckt.pdf.umbenenner.domain.model.TechnicalDocumentError;
|
||||||
import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionContentError;
|
import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionContentError;
|
||||||
import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionResult;
|
import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionResult;
|
||||||
@@ -16,7 +17,8 @@ import java.util.Objects;
|
|||||||
* <p>
|
* <p>
|
||||||
* Converts technical extraction results into processing outcomes through this pipeline:
|
* Converts technical extraction results into processing outcomes through this pipeline:
|
||||||
* <ol>
|
* <ol>
|
||||||
* <li>If extraction fails (content or technical): {@link TechnicalDocumentError}</li>
|
* <li>If extraction fails (technical): {@link TechnicalDocumentError}</li>
|
||||||
|
* <li>If extraction fails (content): {@link PreCheckFailed}</li>
|
||||||
* <li>If extraction succeeds: Evaluate pre-checks via {@link PreCheckEvaluator}</li>
|
* <li>If extraction succeeds: Evaluate pre-checks via {@link PreCheckEvaluator}</li>
|
||||||
* </ol>
|
* </ol>
|
||||||
* <p>
|
* <p>
|
||||||
@@ -24,7 +26,7 @@ import java.util.Objects;
|
|||||||
* all document processing outcomes:
|
* all document processing outcomes:
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li>Pre-check passed (document ready for further processing)</li>
|
* <li>Pre-check passed (document ready for further processing)</li>
|
||||||
* <li>Pre-check failed (deterministic content error: no usable text, page limit exceeded)</li>
|
* <li>Pre-check failed (deterministic content error: no usable text, page limit exceeded, content not extractable)</li>
|
||||||
* <li>Technical document error (I/O, access, PDF parsing, etc.)</li>
|
* <li>Technical document error (I/O, access, PDF parsing, etc.)</li>
|
||||||
* </ul>
|
* </ul>
|
||||||
* <p>
|
* <p>
|
||||||
@@ -38,7 +40,8 @@ public class DocumentProcessingService {
|
|||||||
* Pipeline:
|
* Pipeline:
|
||||||
* <ol>
|
* <ol>
|
||||||
* <li>Extract text and page count from the PDF candidate</li>
|
* <li>Extract text and page count from the PDF candidate</li>
|
||||||
* <li>If extraction fails (technical or content): classify as technical document error</li>
|
* <li>If extraction fails (technical): classify as technical document error</li>
|
||||||
|
* <li>If extraction fails (content): classify as pre-check failed (deterministic content error)</li>
|
||||||
* <li>If extraction succeeds: evaluate pre-checks</li>
|
* <li>If extraction succeeds: evaluate pre-checks</li>
|
||||||
* </ol>
|
* </ol>
|
||||||
*
|
*
|
||||||
@@ -63,8 +66,8 @@ public class DocumentProcessingService {
|
|||||||
PreCheckEvaluator.evaluate(candidate, success, configuration);
|
PreCheckEvaluator.evaluate(candidate, success, configuration);
|
||||||
|
|
||||||
case PdfExtractionContentError contentError ->
|
case PdfExtractionContentError contentError ->
|
||||||
// PDF content not extractable: classify as technical document error
|
// PDF content not extractable: classify as pre-check failed (deterministic content error)
|
||||||
new TechnicalDocumentError(candidate, "PDF content not extractable: " + contentError.reason(), null);
|
new PreCheckFailed(candidate, "PDF content not extractable: " + contentError.reason());
|
||||||
|
|
||||||
case PdfExtractionTechnicalError technicalError ->
|
case PdfExtractionTechnicalError technicalError ->
|
||||||
// Technical failure during extraction: potentially retryable
|
// Technical failure during extraction: potentially retryable
|
||||||
|
|||||||
@@ -119,18 +119,18 @@ class DocumentProcessingServiceTest {
|
|||||||
|
|
||||||
@Test
|
@Test
|
||||||
void testProcessDocument_WithContentError() {
|
void testProcessDocument_WithContentError() {
|
||||||
// Arrange: PDF content not extractable (classified as technical document error)
|
// Arrange: PDF content not extractable (classified as pre-check failed)
|
||||||
var contentError = new PdfExtractionContentError("PDF is corrupted");
|
var contentError = new PdfExtractionContentError("PDF is corrupted");
|
||||||
|
|
||||||
// Act
|
// Act
|
||||||
DocumentProcessingOutcome outcome = DocumentProcessingService.processDocument(
|
DocumentProcessingOutcome outcome = DocumentProcessingService.processDocument(
|
||||||
candidate, contentError, configuration);
|
candidate, contentError, configuration);
|
||||||
|
|
||||||
// Assert: Should produce TechnicalDocumentError
|
// Assert: Should produce PreCheckFailed
|
||||||
assertInstanceOf(TechnicalDocumentError.class, outcome);
|
assertInstanceOf(PreCheckFailed.class, outcome);
|
||||||
TechnicalDocumentError result = (TechnicalDocumentError) outcome;
|
PreCheckFailed result = (PreCheckFailed) outcome;
|
||||||
assertEquals(candidate, result.candidate());
|
assertEquals(candidate, result.candidate());
|
||||||
assertTrue(result.errorMessage().contains("PDF is corrupted"));
|
assertTrue(result.failureReason().contains("PDF is corrupted"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|||||||
Reference in New Issue
Block a user