Inhaltsfehler bei PDF-Extraktion korrekt klassifiziert
This commit is contained in:
@@ -2,6 +2,7 @@ package de.gecheckt.pdf.umbenenner.application.service;
|
||||
|
||||
import de.gecheckt.pdf.umbenenner.application.config.StartConfiguration;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.DocumentProcessingOutcome;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.PreCheckFailed;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.TechnicalDocumentError;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionContentError;
|
||||
import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionResult;
|
||||
@@ -16,7 +17,8 @@ import java.util.Objects;
|
||||
* <p>
|
||||
* Converts technical extraction results into processing outcomes through this pipeline:
|
||||
* <ol>
|
||||
- * <li>If extraction fails (content or technical): {@link TechnicalDocumentError}</li>
+ * <li>If extraction fails (technical): {@link TechnicalDocumentError}</li>
+ * <li>If extraction fails (content): {@link PreCheckFailed}</li>
|
||||
* <li>If extraction succeeds: Evaluate pre-checks via {@link PreCheckEvaluator}</li>
|
||||
* </ol>
|
||||
* <p>
|
||||
@@ -24,7 +26,7 @@ import java.util.Objects;
|
||||
* all document processing outcomes:
|
||||
* <ul>
|
||||
* <li>Pre-check passed (document ready for further processing)</li>
|
||||
- * <li>Pre-check failed (deterministic content error: no usable text, page limit exceeded)</li>
+ * <li>Pre-check failed (deterministic content error: no usable text, page limit exceeded, content not extractable)</li>
|
||||
* <li>Technical document error (I/O, access, PDF parsing, etc.)</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
@@ -38,7 +40,8 @@ public class DocumentProcessingService {
|
||||
* Pipeline:
|
||||
* <ol>
|
||||
* <li>Extract text and page count from the PDF candidate</li>
|
||||
- * <li>If extraction fails (technical or content): classify as technical document error</li>
+ * <li>If extraction fails (technical): classify as technical document error</li>
+ * <li>If extraction fails (content): classify as pre-check failed (deterministic content error)</li>
|
||||
* <li>If extraction succeeds: evaluate pre-checks</li>
|
||||
* </ol>
|
||||
*
|
||||
@@ -63,8 +66,8 @@ public class DocumentProcessingService {
|
||||
PreCheckEvaluator.evaluate(candidate, success, configuration);
|
||||
|
||||
case PdfExtractionContentError contentError ->
|
||||
- // PDF content not extractable: classify as technical document error
- new TechnicalDocumentError(candidate, "PDF content not extractable: " + contentError.reason(), null);
+ // PDF content not extractable: classify as pre-check failed (deterministic content error)
+ new PreCheckFailed(candidate, "PDF content not extractable: " + contentError.reason());
|
||||
|
||||
case PdfExtractionTechnicalError technicalError ->
|
||||
// Technical failure during extraction: potentially retryable
|
||||
|
||||
@@ -119,18 +119,18 @@ class DocumentProcessingServiceTest {
|
||||
|
||||
@Test
|
||||
void testProcessDocument_WithContentError() {
|
||||
- // Arrange: PDF content not extractable (classified as technical document error)
+ // Arrange: PDF content not extractable (classified as pre-check failed)
|
||||
var contentError = new PdfExtractionContentError("PDF is corrupted");
|
||||
|
||||
// Act
|
||||
DocumentProcessingOutcome outcome = DocumentProcessingService.processDocument(
|
||||
candidate, contentError, configuration);
|
||||
|
||||
- // Assert: Should produce TechnicalDocumentError
- assertInstanceOf(TechnicalDocumentError.class, outcome);
- TechnicalDocumentError result = (TechnicalDocumentError) outcome;
+ // Assert: Should produce PreCheckFailed
+ assertInstanceOf(PreCheckFailed.class, outcome);
+ PreCheckFailed result = (PreCheckFailed) outcome;
|
||||
assertEquals(candidate, result.candidate());
|
||||
- assertTrue(result.errorMessage().contains("PDF is corrupted"));
+ assertTrue(result.failureReason().contains("PDF is corrupted"));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
||||
Reference in New Issue
Block a user