M3-APP-03: PDFBox-Extraktion technisch sauber abgegrenzt und

Fehlersemantik korrigiert
2026-04-01 18:54:35 +02:00
parent 8f138d4cfa
commit a9407aaba2
8 changed files with 446 additions and 96 deletions
@@ -0,0 +1,164 @@
+package de.gecheckt.pdf.umbenenner.adapter.outbound.pdfextraction;
+
+import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionResult;
+import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionSuccess;
+import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionTechnicalError;
+import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate;
+import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentLocator;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertInstanceOf;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Tests for {@link PdfTextExtractionPortAdapter}.
+ * <p>
+ * M3-AP-003: Minimal tests validating basic extraction functionality and technical error handling.
+ * In AP-003 scope: all extraction problems are treated as TechnicalError, not ContentError.
+ * No fachliche validation of text content (that is AP-004).
+ * PDFs are created programmatically using PDFBox to avoid external dependencies on test files.
+ *
+ * @since M3-AP-003
+ */
+class PdfTextExtractionPortAdapterTest {
+
+    private PdfTextExtractionPortAdapter adapter;
+
+    @TempDir
+    Path tempDir;
+
+    @BeforeEach
+    void setUp() {
+        adapter = new PdfTextExtractionPortAdapter();
+    }
+
+    @Test
+    void testNullCandidateThrowsNullPointerException() {
+        assertThrows(NullPointerException.class, () -> adapter.extractTextAndPageCount(null));
+    }
+
+    @Test
+    void testNonExistentFileReturnsTechnicalError() throws Exception {
+        SourceDocumentCandidate candidate = new SourceDocumentCandidate(
+            "nonexistent.pdf",
+            1,
+            new SourceDocumentLocator("/path/that/does/not/exist.pdf")
+        );
+
+        PdfExtractionResult result = adapter.extractTextAndPageCount(candidate);
+
+        assertInstanceOf(PdfExtractionTechnicalError.class, result);
+        PdfExtractionTechnicalError error = (PdfExtractionTechnicalError) result;
+        assertTrue(error.errorMessage().contains("not found"));
+    }
+
+    @Test
+    void testSimplePdfExtractionSuccess() throws Exception {
+        // Create a simple single-page PDF
+        Path pdfFile = tempDir.resolve("simple.pdf");
+        createSimplePdf(pdfFile);
+
+        SourceDocumentCandidate candidate = new SourceDocumentCandidate(
+            "simple.pdf",
+            Files.size(pdfFile),
+            new SourceDocumentLocator(pdfFile.toAbsolutePath().toString())
+        );
+
+        PdfExtractionResult result = adapter.extractTextAndPageCount(candidate);
+
+        assertInstanceOf(PdfExtractionSuccess.class, result);
+        PdfExtractionSuccess success = (PdfExtractionSuccess) result;
+        assertEquals(1, success.pageCount().value());
+        assertNotNull(success.extractedText());
+    }
+
+    @Test
+    void testMultiPagePdfExtractionSuccess() throws Exception {
+        // Create a three-page PDF
+        Path pdfFile = tempDir.resolve("multipage.pdf");
+        createMultiPagePdf(pdfFile, 3);
+
+        SourceDocumentCandidate candidate = new SourceDocumentCandidate(
+            "multipage.pdf",
+            Files.size(pdfFile),
+            new SourceDocumentLocator(pdfFile.toAbsolutePath().toString())
+        );
+
+        PdfExtractionResult result = adapter.extractTextAndPageCount(candidate);
+
+        assertInstanceOf(PdfExtractionSuccess.class, result);
+        PdfExtractionSuccess success = (PdfExtractionSuccess) result;
+        assertEquals(3, success.pageCount().value());
+        assertNotNull(success.extractedText());
+    }
+
+    @Test
+    void testReadablePdfWithEmptyTextReturnsSuccess() throws Exception {
+        // Create a PDF with no text content (blank page)
+        // This is a technically readable PDF, so it should succeed
+        Path pdfFile = tempDir.resolve("blank.pdf");
+        createBlankPdf(pdfFile);
+
+        SourceDocumentCandidate candidate = new SourceDocumentCandidate(
+            "blank.pdf",
+            Files.size(pdfFile),
+            new SourceDocumentLocator(pdfFile.toAbsolutePath().toString())
+        );
+
+        PdfExtractionResult result = adapter.extractTextAndPageCount(candidate);
+
+        // AP-003: Empty text is SUCCESS, not an error
+        // Fachliche Bewertung of text content happens in AP-004
+        assertInstanceOf(PdfExtractionSuccess.class, result);
+        PdfExtractionSuccess success = (PdfExtractionSuccess) result;
+        assertEquals(1, success.pageCount().value());
+        assertNotNull(success.extractedText());  // May be empty, but not null
+    }
+
+    // --- Helper methods to create test PDFs ---
+
+    /**
+     * Creates a simple single-page PDF.
+     */
+    private void createSimplePdf(Path filePath) throws Exception {
+        PDDocument document = new PDDocument();
+        PDPage page = new PDPage();
+        document.addPage(page);
+        document.save(filePath.toAbsolutePath().toString());
+        document.close();
+    }
+
+    /**
+     * Creates a PDF with multiple blank pages.
+     */
+    private void createMultiPagePdf(Path filePath, int pageCount) throws Exception {
+        PDDocument document = new PDDocument();
+        for (int i = 0; i < pageCount; i++) {
+            PDPage page = new PDPage();
+            document.addPage(page);
+        }
+        document.save(filePath.toAbsolutePath().toString());
+        document.close();
+    }
+
+    /**
+     * Creates a blank PDF with a single page and no text.
+     */
+    private void createBlankPdf(Path filePath) throws Exception {
+        PDDocument document = new PDDocument();
+        PDPage page = new PDPage();
+        document.addPage(page);
+        document.save(filePath.toAbsolutePath().toString());
+        document.close();
+    }
+}