M3-APP-03: PDFBox-Extraktion technisch sauber abgegrenzt und

Fehlersemantik korrigiert
2026-04-01 18:54:35 +02:00
parent 8f138d4cfa
commit a9407aaba2
8 changed files with 446 additions and 96 deletions
@@ -6,7 +6,11 @@
      "Bash(mvn clean:*)",
      "Bash(mvn verify:*)",
      "Bash(mvn test:*)",
-      "Bash(find D:/Dev/Projects/pdf-umbenenner-parent -not -path */target/* -type d)"
+      "Bash(find D:/Dev/Projects/pdf-umbenenner-parent -not -path */target/* -type d)",
+      "Bash(mvn -pl pdf-umbenenner-adapter-out clean compile)",
+      "Bash(mvn dependency:tree -pl pdf-umbenenner-adapter-out)",
+      "Bash(mvn -pl pdf-umbenenner-domain clean compile)",
+      "Bash(mvn help:describe -Dplugin=org.apache.pdfbox:pdfbox -Ddetail=false)"
    ]
  }
 }
@@ -0,0 +1,141 @@
+package de.gecheckt.pdf.umbenenner.adapter.outbound.pdfextraction;
+
+import de.gecheckt.pdf.umbenenner.application.port.out.PdfTextExtractionPort;
+import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionResult;
+import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionSuccess;
+import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionTechnicalError;
+import de.gecheckt.pdf.umbenenner.domain.model.PdfPageCount;
+import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.Objects;
+
+/**
+ * PDFBox-based implementation of {@link PdfTextExtractionPort}.
+ * <p>
+ * AP-003 implementation: extracts the text content and the page count of a single PDF
+ * document using Apache PDFBox. Every technical problem encountered on the way is
+ * reported as a {@link de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionTechnicalError}.
+ * <p>
+ * Design:
+ * <ul>
+ *   <li>Uses the PDFBox 3.x {@code Loader} API for PDF processing</li>
+ *   <li>Extracts the complete text of all pages (the text may be empty)</li>
+ *   <li>Determines the total page count</li>
+ *   <li>Returns the outcome as a typed {@link PdfExtractionResult} instead of throwing</li>
+ *   <li>All extraction failures are treated as technical errors (AP-003 scope)</li>
+ *   <li>PDFBox types are confined to this adapter and never appear in its signature</li>
+ * </ul>
+ * <p>
+ * Success criteria:
+ * <ul>
+ *   <li>The PDF file can be loaded by PDFBox</li>
+ *   <li>The page count is at least 1</li>
+ *   <li>Text extraction completes (an empty string is acceptable)</li>
+ *   <li>Text and page count are combined into a {@link PdfExtractionSuccess}</li>
+ * </ul>
+ * <p>
+ * Technical error cases (AP-003):
+ * <ul>
+ *   <li>File not found or not readable</li>
+ *   <li>PDF cannot be loaded by PDFBox (any load error)</li>
+ *   <li>PDF reports fewer than one page</li>
+ *   <li>Text extraction fails or throws an exception</li>
+ * </ul>
+ * <p>
+ * Non-goals (handled in later work packages):
+ * <ul>
+ *   <li>Business-level evaluation of the extracted text (AP-004)</li>
+ *   <li>Page limit checking (AP-004)</li>
+ *   <li>Text normalization or preprocessing</li>
+ * </ul>
+ *
+ * @since M3-AP-003
+ */
+public class PdfTextExtractionPortAdapter implements PdfTextExtractionPort {
+
+    /**
+     * Extracts text content and page count from a single PDF document.
+     * <p>
+     * Reads the file identified by the candidate's locator, lets PDFBox extract the
+     * text of all pages and determines the total page count.
+     * <p>
+     * The locator is expected to contain an absolute file path as a String
+     * (adapter-internal convention).
+     * <p>
+     * In M3-AP-003 all technical problems are reported as {@link PdfExtractionTechnicalError}.
+     * Business-level judgements such as "text is not usable" are deferred to AP-004.
+     *
+     * @param candidate the document to extract; must be non-null
+     * @return a {@link PdfExtractionResult} encoding the outcome:
+     *         <ul>
+     *           <li>Success: PDF loaded, text extracted (may be empty), page count determined</li>
+     *           <li>Technical error: any I/O, file access, or PDFBox loading/parsing problem</li>
+     *         </ul>
+     * @throws NullPointerException if candidate is null
+     */
+    @Override
+    public PdfExtractionResult extractTextAndPageCount(SourceDocumentCandidate candidate) {
+        Objects.requireNonNull(candidate, "candidate must not be null");
+
+        String filePath = candidate.locator().value();
+
+        try {
+            // Fail fast with a precise message before handing the file to PDFBox.
+            var path = Paths.get(filePath);
+            if (!Files.exists(path)) {
+                return new PdfExtractionTechnicalError(
+                    "PDF file not found: " + filePath,
+                    null);
+            }
+            if (!Files.isReadable(path)) {
+                return new PdfExtractionTechnicalError(
+                    "PDF file is not readable: " + filePath,
+                    null);
+            }
+
+            // try-with-resources closes the document on every exit path. If both the
+            // extraction and close() fail, the extraction exception stays primary and
+            // the close() failure is attached as suppressed instead of replacing it.
+            try (PDDocument document = Loader.loadPDF(path.toFile())) {
+                int pageCount = document.getNumberOfPages();
+
+                // PdfPageCount requires >= 1, so a page-less document is reported as a
+                // technical error instead of letting the value object reject it.
+                if (pageCount < 1) {
+                    return new PdfExtractionTechnicalError(
+                        "PDF has zero pages, cannot extract content",
+                        null);
+                }
+
+                // An empty string is a valid result here; judging the text is AP-004.
+                String extractedText = new PDFTextStripper().getText(document);
+                return new PdfExtractionSuccess(extractedText, new PdfPageCount(pageCount));
+            }
+
+        } catch (IOException e) {
+            // All I/O and PDFBox loading/parsing errors are technical errors in AP-003.
+            return new PdfExtractionTechnicalError(
+                "Failed to load or parse PDF: " + describe(e),
+                e);
+        } catch (Exception e) {
+            // Adapter boundary: unexpected runtime failures (e.g. an invalid path string)
+            // are also converted into a typed result rather than propagated.
+            return new PdfExtractionTechnicalError(
+                "Unexpected error during PDF extraction: " + describe(e),
+                e);
+        }
+    }
+
+    /**
+     * Returns the exception's message, falling back to {@code toString()} when the
+     * exception carries no message, so the resulting error text is never "null".
+     */
+    private static String describe(Exception e) {
+        String message = e.getMessage();
+        return message != null ? message : e.toString();
+    }
+}
@@ -0,0 +1,34 @@
+/**
+ * PDFBox-based adapter for PDF text extraction.
+ * <p>
+ * <strong>M3-AP-003:</strong> This package contains the sole implementation
+ * of {@link de.gecheckt.pdf.umbenenner.application.port.out.PdfTextExtractionPort},
+ * using Apache PDFBox to extract text and page count from PDF documents.
+ * <p>
+ * <strong>Scope (AP-003):</strong>
+ * <ul>
+ *   <li>Pure technical extraction: read PDF, extract text, count pages</li>
+ *   <li>All extraction problems (file not found, PDF unreadable, PDFBox errors) → {@link de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionTechnicalError}</li>
+ *   <li>No business-level validation: empty text is SUCCESS, not an error</li>
+ *   <li>PDFBox is <em>only</em> used in this package; no PDFBox types appear in Domain or Application</li>
+ *   <li>Results always typed as {@link de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionResult}, never exceptions</li>
+ * </ul>
+ * <p>
+ * <strong>Restriction:</strong>
+ * <ul>
+ *   <li>{@link de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionContentError} is reserved for later APs</li>
+ *   <li>AP-003 adapter uses only {@link de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionSuccess} and
+ *       {@link de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionTechnicalError}</li>
+ * </ul>
+ * <p>
+ * <strong>Out of scope (handled in later APs):</strong>
+ * <ul>
+ *   <li>Text validation or quality assessment (AP-004)</li>
+ *   <li>Page limit checking (AP-004)</li>
+ *   <li>Text normalization or preprocessing</li>
+ *   <li>Business-level assessment of extracted content</li>
+ * </ul>
+ *
+ * @since M3-AP-003
+ */
+package de.gecheckt.pdf.umbenenner.adapter.outbound.pdfextraction;
@@ -0,0 +1,164 @@
+package de.gecheckt.pdf.umbenenner.adapter.outbound.pdfextraction;
+
+import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionResult;
+import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionSuccess;
+import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionTechnicalError;
+import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate;
+import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentLocator;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertInstanceOf;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Tests for {@link PdfTextExtractionPortAdapter}.
+ * <p>
+ * M3-AP-003: minimal tests covering basic extraction and technical error handling.
+ * Within AP-003 every extraction problem is a TechnicalError, never a ContentError,
+ * and the extracted text is not evaluated on a business level (that is AP-004).
+ * Test PDFs are generated programmatically with PDFBox so no fixture files are needed.
+ *
+ * @since M3-AP-003
+ */
+class PdfTextExtractionPortAdapterTest {
+
+    private PdfTextExtractionPortAdapter adapter;
+
+    @TempDir
+    Path tempDir;
+
+    @BeforeEach
+    void setUp() {
+        adapter = new PdfTextExtractionPortAdapter();
+    }
+
+    @Test
+    void testNullCandidateThrowsNullPointerException() {
+        assertThrows(NullPointerException.class, () -> adapter.extractTextAndPageCount(null));
+    }
+
+    @Test
+    void testNonExistentFileReturnsTechnicalError() throws Exception {
+        SourceDocumentCandidate candidate = new SourceDocumentCandidate(
+            "nonexistent.pdf",
+            1,
+            new SourceDocumentLocator("/path/that/does/not/exist.pdf")
+        );
+
+        PdfExtractionResult result = adapter.extractTextAndPageCount(candidate);
+
+        assertInstanceOf(PdfExtractionTechnicalError.class, result);
+        PdfExtractionTechnicalError error = (PdfExtractionTechnicalError) result;
+        assertTrue(error.errorMessage().contains("not found"));
+    }
+
+    @Test
+    void testSimplePdfExtractionSuccess() throws Exception {
+        // Single-page PDF written into the per-test temp directory
+        Path pdfFile = tempDir.resolve("simple.pdf");
+        createSimplePdf(pdfFile);
+
+        SourceDocumentCandidate candidate = new SourceDocumentCandidate(
+            "simple.pdf",
+            Files.size(pdfFile),
+            new SourceDocumentLocator(pdfFile.toAbsolutePath().toString())
+        );
+
+        PdfExtractionResult result = adapter.extractTextAndPageCount(candidate);
+
+        assertInstanceOf(PdfExtractionSuccess.class, result);
+        PdfExtractionSuccess success = (PdfExtractionSuccess) result;
+        assertEquals(1, success.pageCount().value());
+        assertNotNull(success.extractedText());
+    }
+
+    @Test
+    void testMultiPagePdfExtractionSuccess() throws Exception {
+        // Three-page PDF to verify the page count is read from the document
+        Path pdfFile = tempDir.resolve("multipage.pdf");
+        createMultiPagePdf(pdfFile, 3);
+
+        SourceDocumentCandidate candidate = new SourceDocumentCandidate(
+            "multipage.pdf",
+            Files.size(pdfFile),
+            new SourceDocumentLocator(pdfFile.toAbsolutePath().toString())
+        );
+
+        PdfExtractionResult result = adapter.extractTextAndPageCount(candidate);
+
+        assertInstanceOf(PdfExtractionSuccess.class, result);
+        PdfExtractionSuccess success = (PdfExtractionSuccess) result;
+        assertEquals(3, success.pageCount().value());
+        assertNotNull(success.extractedText());
+    }
+
+    @Test
+    void testReadablePdfWithEmptyTextReturnsSuccess() throws Exception {
+        // A blank page is still a technically readable PDF, so extraction must succeed
+        Path pdfFile = tempDir.resolve("blank.pdf");
+        createBlankPdf(pdfFile);
+
+        SourceDocumentCandidate candidate = new SourceDocumentCandidate(
+            "blank.pdf",
+            Files.size(pdfFile),
+            new SourceDocumentLocator(pdfFile.toAbsolutePath().toString())
+        );
+
+        PdfExtractionResult result = adapter.extractTextAndPageCount(candidate);
+
+        // AP-003: empty text is SUCCESS, not an error;
+        // business-level evaluation of the text happens in AP-004
+        assertInstanceOf(PdfExtractionSuccess.class, result);
+        PdfExtractionSuccess success = (PdfExtractionSuccess) result;
+        assertEquals(1, success.pageCount().value());
+        assertNotNull(success.extractedText());  // May be empty, but not null
+    }
+
+    // --- Helper methods to create test PDFs ---
+
+    /**
+     * Creates a simple single-page PDF (the page carries no text).
+     */
+    private void createSimplePdf(Path filePath) throws Exception {
+        writePdfWithBlankPages(filePath, 1);
+    }
+
+    /**
+     * Creates a PDF with the given number of blank pages.
+     */
+    private void createMultiPagePdf(Path filePath, int pageCount) throws Exception {
+        writePdfWithBlankPages(filePath, pageCount);
+    }
+
+    /**
+     * Creates a blank PDF with a single page and no text.
+     */
+    private void createBlankPdf(Path filePath) throws Exception {
+        writePdfWithBlankPages(filePath, 1);
+    }
+
+    /**
+     * Writes a PDF consisting of {@code pageCount} empty pages to {@code filePath}.
+     * The document is closed via try-with-resources so it is released even when
+     * saving fails.
+     */
+    private void writePdfWithBlankPages(Path filePath, int pageCount) throws Exception {
+        try (PDDocument document = new PDDocument()) {
+            for (int i = 0; i < pageCount; i++) {
+                document.addPage(new PDPage());
+            }
+            document.save(filePath.toAbsolutePath().toString());
+        }
+    }
+}
@@ -0,0 +1,36 @@
+package de.gecheckt.pdf.umbenenner.domain.model;
+
+import java.util.Objects;
+
+/**
+ * Extraction outcome for a PDF whose content is unsuitable for text extraction.
+ * <p>
+ * The file itself could be read (no I/O error), but its content does not allow
+ * text to be extracted.
+ * <p>
+ * Examples: the PDF is image-only (not OCR'd), the PDF is encrypted and cannot be
+ * unlocked, or the PDF is severely corrupted in the content layer.
+ * <p>
+ * For a given source file this is typically deterministic and not worth retrying
+ * (unless the source file is modified and re-scanned in a later run).
+ *
+ * @param reason human-readable explanation of why extraction failed (non-null, non-empty)
+ * @since M3-AP-001
+ */
+public record PdfExtractionContentError(String reason) implements PdfExtractionResult {
+
+    /**
+     * Compact canonical constructor that validates {@code reason}.
+     *
+     * @param reason must be non-null and non-empty
+     * @throws NullPointerException if reason is null
+     * @throws IllegalArgumentException if reason is empty
+     */
+    public PdfExtractionContentError {
+        // The null check runs first, so a null reason yields the NPE, not the IAE.
+        if (Objects.requireNonNull(reason, "reason must not be null").isEmpty()) {
+            throw new IllegalArgumentException("reason must not be empty");
+        }
+    }
+}
@@ -1,7 +1,5 @@
 package de.gecheckt.pdf.umbenenner.domain.model;

-import java.util.Objects;
-
 /**
 * Sealed interface representing the outcome of PDF text extraction.
 * <p>
@@ -27,96 +25,3 @@ public sealed interface PdfExtractionResult
    permits PdfExtractionSuccess, PdfExtractionContentError, PdfExtractionTechnicalError {
    // Marker interface; concrete implementations define structure
 }
-
-/**
- * Represents successful PDF text extraction.
- * <p>
- * When this result is obtained, both text content and page count have been
- * successfully extracted and are guaranteed to be valid.
- *
- * @param extractedText the full text content extracted from the PDF (non-null, may be empty string)
- * @param pageCount the number of pages in the PDF (non-null, validated &gt;= 1)
- * @since M3-AP-001
- */
-record PdfExtractionSuccess(
-    String extractedText,
-    PdfPageCount pageCount
-) implements PdfExtractionResult {
-    /**
-     * Constructor with validation.
-     *
-     * @param extractedText must be non-null (may be empty)
-     * @param pageCount must be non-null
-     * @throws NullPointerException if either parameter is null
-     */
-    PdfExtractionSuccess {
-        Objects.requireNonNull(extractedText, "extractedText must not be null");
-        Objects.requireNonNull(pageCount, "pageCount must not be null");
-    }
-}
-
-/**
- * Represents a content-related failure during PDF text extraction.
- * <p>
- * This indicates that the PDF file itself is readable (no I/O error),
- * but its content is not suitable for text extraction.
- * <p>
- * Examples: PDF is image-only (not OCR'd), PDF is encrypted and cannot be unlocked,
- * PDF is severely corrupted in the content layer.
- * <p>
- * This is typically a deterministic, non-retryable condition for a given source file
- * (unless the source file is modified and re-scanned in a later run).
- *
- * @param reason a human-readable explanation of why extraction failed (non-null, non-empty)
- * @since M3-AP-001
- */
-record PdfExtractionContentError(
-    String reason
-) implements PdfExtractionResult {
-    /**
-     * Constructor with validation.
-     *
-     * @param reason must be non-null and non-empty
-     * @throws NullPointerException if reason is null
-     * @throws IllegalArgumentException if reason is empty
-     */
-    PdfExtractionContentError {
-        Objects.requireNonNull(reason, "reason must not be null");
-        if (reason.isEmpty()) {
-            throw new IllegalArgumentException("reason must not be empty");
-        }
-    }
-}
-
-/**
- * Represents a technical (infrastructure) failure during PDF text extraction.
- * <p>
- * This indicates that something went wrong with the extraction process itself,
- * such as file I/O errors, PDFBox library problems, or out-of-memory conditions.
- * <p>
- * These are typically retryable conditions in later batch runs, as they may be
- * transient infrastructure issues.
- *
- * @param errorMessage a description of what went wrong (non-null, non-empty)
- * @param cause the underlying exception, if any (may be null)
- * @since M3-AP-001
- */
-record PdfExtractionTechnicalError(
-    String errorMessage,
-    Throwable cause
-) implements PdfExtractionResult {
-    /**
-     * Constructor with validation.
-     *
-     * @param errorMessage must be non-null and non-empty
-     * @param cause may be null
-     * @throws NullPointerException if errorMessage is null
-     * @throws IllegalArgumentException if errorMessage is empty
-     */
-    PdfExtractionTechnicalError {
-        Objects.requireNonNull(errorMessage, "errorMessage must not be null");
-        if (errorMessage.isEmpty()) {
-            throw new IllegalArgumentException("errorMessage must not be empty");
-        }
-    }
-}
@@ -0,0 +1,30 @@
+package de.gecheckt.pdf.umbenenner.domain.model;
+
+import java.util.Objects;
+
+/**
+ * Represents successful PDF text extraction.
+ * <p>
+ * When this result is obtained, both text content and page count have been
+ * successfully extracted and are guaranteed to be valid.
+ *
+ * @param extractedText the full text content extracted from the PDF (non-null, may be empty string)
+ * @param pageCount the number of pages in the PDF (non-null, validated &gt;= 1)
+ * @since M3-AP-001
+ */
+public record PdfExtractionSuccess(
+    String extractedText,
+    PdfPageCount pageCount
+) implements PdfExtractionResult {
+    /**
+     * Constructor with validation.
+     *
+     * @param extractedText must be non-null (may be empty)
+     * @param pageCount must be non-null
+     * @throws NullPointerException if either parameter is null
+     */
+    public PdfExtractionSuccess {
+        Objects.requireNonNull(extractedText, "extractedText must not be null");
+        Objects.requireNonNull(pageCount, "pageCount must not be null");
+    }
+}
@@ -0,0 +1,36 @@
+package de.gecheckt.pdf.umbenenner.domain.model;
+
+import java.util.Objects;
+
+/**
+ * Represents a technical (infrastructure) failure during PDF text extraction.
+ * <p>
+ * This indicates that something went wrong with the extraction process itself,
+ * such as file I/O errors, PDFBox library problems, or out-of-memory conditions.
+ * <p>
+ * These are typically retryable conditions in later batch runs, as they may be
+ * transient infrastructure issues.
+ *
+ * @param errorMessage a description of what went wrong (non-null, non-empty)
+ * @param cause the underlying exception, if any (may be null)
+ * @since M3-AP-001
+ */
+public record PdfExtractionTechnicalError(
+    String errorMessage,
+    Throwable cause
+) implements PdfExtractionResult {
+    /**
+     * Constructor with validation.
+     *
+     * @param errorMessage must be non-null and non-empty
+     * @param cause may be null
+     * @throws NullPointerException if errorMessage is null
+     * @throws IllegalArgumentException if errorMessage is empty
+     */
+    public PdfExtractionTechnicalError {
+        Objects.requireNonNull(errorMessage, "errorMessage must not be null");
+        if (errorMessage.isEmpty()) {
+            throw new IllegalArgumentException("errorMessage must not be empty");
+        }
+    }
+}