From a9407aaba2ad5b66e03b5b63604022d16cfd3757 Mon Sep 17 00:00:00 2001 From: Marcus van Elst Date: Wed, 1 Apr 2026 18:54:35 +0200 Subject: [PATCH] M3-APP-03: PDFBox-Extraktion technisch sauber abgegrenzt und Fehlersemantik korrigiert --- .claude/settings.local.json | 6 +- .../PdfTextExtractionPortAdapter.java | 141 +++++++++++++++ .../outbound/pdfextraction/package-info.java | 34 ++++ .../PdfTextExtractionPortAdapterTest.java | 164 ++++++++++++++++++ .../model/PdfExtractionContentError.java | 36 ++++ .../domain/model/PdfExtractionResult.java | 95 ---------- .../domain/model/PdfExtractionSuccess.java | 30 ++++ .../model/PdfExtractionTechnicalError.java | 36 ++++ 8 files changed, 446 insertions(+), 96 deletions(-) create mode 100644 pdf-umbenenner-adapter-out/src/main/java/de/gecheckt/pdf/umbenenner/adapter/outbound/pdfextraction/PdfTextExtractionPortAdapter.java create mode 100644 pdf-umbenenner-adapter-out/src/main/java/de/gecheckt/pdf/umbenenner/adapter/outbound/pdfextraction/package-info.java create mode 100644 pdf-umbenenner-adapter-out/src/test/java/de/gecheckt/pdf/umbenenner/adapter/outbound/pdfextraction/PdfTextExtractionPortAdapterTest.java create mode 100644 pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PdfExtractionContentError.java create mode 100644 pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PdfExtractionSuccess.java create mode 100644 pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PdfExtractionTechnicalError.java diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 377d543..949c18b 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -6,7 +6,11 @@ "Bash(mvn clean:*)", "Bash(mvn verify:*)", "Bash(mvn test:*)", - "Bash(find D:/Dev/Projects/pdf-umbenenner-parent -not -path */target/* -type d)" + "Bash(find D:/Dev/Projects/pdf-umbenenner-parent -not -path */target/* -type d)", + "Bash(mvn -pl pdf-umbenenner-adapter-out 
clean compile)", + "Bash(mvn dependency:tree -pl pdf-umbenenner-adapter-out)", + "Bash(mvn -pl pdf-umbenenner-domain clean compile)", + "Bash(mvn help:describe -Dplugin=org.apache.pdfbox:pdfbox -Ddetail=false)" ] } } diff --git a/pdf-umbenenner-adapter-out/src/main/java/de/gecheckt/pdf/umbenenner/adapter/outbound/pdfextraction/PdfTextExtractionPortAdapter.java b/pdf-umbenenner-adapter-out/src/main/java/de/gecheckt/pdf/umbenenner/adapter/outbound/pdfextraction/PdfTextExtractionPortAdapter.java new file mode 100644 index 0000000..ef92100 --- /dev/null +++ b/pdf-umbenenner-adapter-out/src/main/java/de/gecheckt/pdf/umbenenner/adapter/outbound/pdfextraction/PdfTextExtractionPortAdapter.java @@ -0,0 +1,141 @@ +package de.gecheckt.pdf.umbenenner.adapter.outbound.pdfextraction; + +import de.gecheckt.pdf.umbenenner.application.port.out.PdfTextExtractionPort; +import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionResult; +import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionSuccess; +import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionTechnicalError; +import de.gecheckt.pdf.umbenenner.domain.model.PdfPageCount; +import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate; +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.text.PDFTextStripper; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.Objects; + +/** + * PDFBox-based implementation of {@link PdfTextExtractionPort}. + *

+ * AP-003 Implementation: Extracts text content and page count from a single PDF document + * using Apache PDFBox. All technical problems during extraction are reported as + * {@link de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionTechnicalError}. + *

+ * Design: + *

+ *

+ * Success criteria: + *

+ *

+ * Technical error cases (AP-003): + *

+ *

+ * Non-goals (handled in later APs): + *

+ * + * @since M3-AP-003 + */ +public class PdfTextExtractionPortAdapter implements PdfTextExtractionPort { + + /** + * Extracts text content and page count from a single PDF document. + *

+ * Reads the file identified by the candidate's locator, uses PDFBox to extract + * text from all pages, and counts the total page count. + *

+ * The locator is expected to contain an absolute file path as a String (adapter-internal convention). + *

+     * In M3-AP-003, all technical problems are reported as {@link PdfExtractionTechnicalError}.
+     * Business-level assessments such as "text is not usable" are deferred to AP-004.
+     *
+     * @param candidate the document to extract; must be non-null
+     * @return a {@link PdfExtractionResult} encoding the outcome:
+     *

+     * @throws NullPointerException if candidate is null
+     */
+    @Override
+    public PdfExtractionResult extractTextAndPageCount(SourceDocumentCandidate candidate) {
+        Objects.requireNonNull(candidate, "candidate must not be null");
+
+        String filePath = candidate.locator().value();
+
+        try {
+            // Check existence and readability up front so each case gets its own message.
+            var path = Paths.get(filePath);
+            if (!Files.exists(path)) {
+                return new PdfExtractionTechnicalError(
+                        "PDF file not found: " + filePath,
+                        null);
+            }
+            if (!Files.isReadable(path)) {
+                return new PdfExtractionTechnicalError(
+                        "PDF file is not readable: " + filePath,
+                        null);
+            }
+
+            // try-with-resources closes the document on every exit path. If close() fails
+            // after an earlier failure, that failure stays the primary exception and the
+            // close() failure is attached to it as suppressed.
+            try (PDDocument document = Loader.loadPDF(path.toFile())) {
+                int pageCount = document.getNumberOfPages();
+
+                // AP-003: zero pages is a technical error
+                // (PdfPageCount requires >= 1, so this would be a constraint violation).
+                if (pageCount < 1) {
+                    return new PdfExtractionTechnicalError(
+                            "PDF has zero pages, cannot extract content",
+                            null);
+                }
+
+                // Empty text is a valid result in AP-003; business-level validation of the
+                // extracted content is deferred to AP-004.
+                String extractedText = new PDFTextStripper().getText(document);
+                return new PdfExtractionSuccess(extractedText, new PdfPageCount(pageCount));
+            }
+        } catch (IOException e) {
+            // I/O and PDFBox loading/parsing failures.
+            return technicalError("Failed to load or parse PDF: ", e);
+        } catch (Exception e) {
+            // Catch-all so unexpected runtime failures also surface as a result value.
+            return technicalError("Unexpected error during PDF extraction: ", e);
+        }
+    }
+
+    /**
+     * Builds a technical-error result from an exception.
+     * Falls back to {@code e.toString()} when the exception has no message.
+     *
+     * @param prefix context text placed in front of the exception detail
+     * @param e the failure to report; also stored as the result's cause
+     * @return the technical-error result
+     */
+    private static PdfExtractionTechnicalError technicalError(String prefix, Exception e) {
+        String detail = e.getMessage() != null ? e.getMessage() : e.toString();
+        return new PdfExtractionTechnicalError(prefix + detail, e);
+    }
+}
diff --git a/pdf-umbenenner-adapter-out/src/main/java/de/gecheckt/pdf/umbenenner/adapter/outbound/pdfextraction/package-info.java b/pdf-umbenenner-adapter-out/src/main/java/de/gecheckt/pdf/umbenenner/adapter/outbound/pdfextraction/package-info.java
new file mode 100644
index 0000000..1f17276
--- /dev/null
+++ b/pdf-umbenenner-adapter-out/src/main/java/de/gecheckt/pdf/umbenenner/adapter/outbound/pdfextraction/package-info.java
@@ -0,0 +1,34 @@
+/**
+ * PDFBox-based adapter for PDF text extraction.
+ *

+ * M3-AP-003: This package contains the sole implementation + * of {@link de.gecheckt.pdf.umbenenner.application.port.out.PdfTextExtractionPort}, + * using Apache PDFBox to extract text and page count from PDF documents. + *

+ * Scope (AP-003): + *

+ *

+ * Restriction: + *

+ *

+ * Out of scope (handled in later APs): + *

+ * + * @since M3-AP-003 + */ +package de.gecheckt.pdf.umbenenner.adapter.outbound.pdfextraction; diff --git a/pdf-umbenenner-adapter-out/src/test/java/de/gecheckt/pdf/umbenenner/adapter/outbound/pdfextraction/PdfTextExtractionPortAdapterTest.java b/pdf-umbenenner-adapter-out/src/test/java/de/gecheckt/pdf/umbenenner/adapter/outbound/pdfextraction/PdfTextExtractionPortAdapterTest.java new file mode 100644 index 0000000..f1d5927 --- /dev/null +++ b/pdf-umbenenner-adapter-out/src/test/java/de/gecheckt/pdf/umbenenner/adapter/outbound/pdfextraction/PdfTextExtractionPortAdapterTest.java @@ -0,0 +1,164 @@ +package de.gecheckt.pdf.umbenenner.adapter.outbound.pdfextraction; + +import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionResult; +import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionSuccess; +import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionTechnicalError; +import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate; +import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentLocator; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.nio.file.Files; +import java.nio.file.Path; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Tests for {@link PdfTextExtractionPortAdapter}. + *

+ * M3-AP-003: Minimal tests validating basic extraction functionality and technical error handling.
+ * In AP-003 scope: all extraction problems are treated as TechnicalError, not ContentError.
+ * No business-level validation of text content (that is AP-004).
+ * PDFs are created programmatically using PDFBox to avoid external dependencies on test files.
+ *
+ * @since M3-AP-003
+ */
+class PdfTextExtractionPortAdapterTest {
+
+    private PdfTextExtractionPortAdapter adapter;
+
+    @TempDir
+    Path tempDir;
+
+    @BeforeEach
+    void setUp() {
+        adapter = new PdfTextExtractionPortAdapter();
+    }
+
+    @Test
+    void testNullCandidateThrowsNullPointerException() {
+        assertThrows(NullPointerException.class, () -> adapter.extractTextAndPageCount(null));
+    }
+
+    @Test
+    void testNonExistentFileReturnsTechnicalError() throws Exception {
+        SourceDocumentCandidate candidate = new SourceDocumentCandidate(
+                "nonexistent.pdf",
+                1,
+                new SourceDocumentLocator("/path/that/does/not/exist.pdf")
+        );
+
+        PdfExtractionResult result = adapter.extractTextAndPageCount(candidate);
+
+        assertInstanceOf(PdfExtractionTechnicalError.class, result);
+        PdfExtractionTechnicalError error = (PdfExtractionTechnicalError) result;
+        assertTrue(error.errorMessage().contains("not found"));
+    }
+
+    @Test
+    void testSimplePdfExtractionSuccess() throws Exception {
+        // Create a simple single-page PDF
+        Path pdfFile = tempDir.resolve("simple.pdf");
+        createSimplePdf(pdfFile);
+
+        SourceDocumentCandidate candidate = new SourceDocumentCandidate(
+                "simple.pdf",
+                Files.size(pdfFile),
+                new SourceDocumentLocator(pdfFile.toAbsolutePath().toString())
+        );
+
+        PdfExtractionResult result = adapter.extractTextAndPageCount(candidate);
+
+        assertInstanceOf(PdfExtractionSuccess.class, result);
+        PdfExtractionSuccess success = (PdfExtractionSuccess) result;
+        assertEquals(1, success.pageCount().value());
+        assertNotNull(success.extractedText());
+    }
+
+    @Test
+    void testMultiPagePdfExtractionSuccess() throws Exception {
+        // Create a three-page PDF
+        Path pdfFile = tempDir.resolve("multipage.pdf");
+        createMultiPagePdf(pdfFile, 3);
+
+        SourceDocumentCandidate candidate = new SourceDocumentCandidate(
+                "multipage.pdf",
+                Files.size(pdfFile),
+                new SourceDocumentLocator(pdfFile.toAbsolutePath().toString())
+        );
+
+        PdfExtractionResult result = adapter.extractTextAndPageCount(candidate);
+
+        assertInstanceOf(PdfExtractionSuccess.class, result);
+        PdfExtractionSuccess success = (PdfExtractionSuccess) result;
+        assertEquals(3, success.pageCount().value());
+        assertNotNull(success.extractedText());
+    }
+
+    @Test
+    void testReadablePdfWithEmptyTextReturnsSuccess() throws Exception {
+        // Create a PDF with no text content (blank page)
+        // This is a technically readable PDF, so it should succeed
+        Path pdfFile = tempDir.resolve("blank.pdf");
+        createBlankPdf(pdfFile);
+
+        SourceDocumentCandidate candidate = new SourceDocumentCandidate(
+                "blank.pdf",
+                Files.size(pdfFile),
+                new SourceDocumentLocator(pdfFile.toAbsolutePath().toString())
+        );
+
+        PdfExtractionResult result = adapter.extractTextAndPageCount(candidate);
+
+        // AP-003: Empty text is SUCCESS, not an error
+        // Business-level assessment of text content happens in AP-004
+        assertInstanceOf(PdfExtractionSuccess.class, result);
+        PdfExtractionSuccess success = (PdfExtractionSuccess) result;
+        assertEquals(1, success.pageCount().value());
+        assertNotNull(success.extractedText()); // May be empty, but not null
+    }
+
+    // --- Helper methods to create test PDFs (documents are closed even if saving fails) ---
+
+    /**
+     * Creates a simple single-page PDF.
+     */
+    private void createSimplePdf(Path filePath) throws Exception {
+        try (PDDocument document = new PDDocument()) {
+            PDPage page = new PDPage();
+            document.addPage(page);
+            document.save(filePath.toAbsolutePath().toString());
+        }
+    }
+
+    /**
+     * Creates a PDF with multiple blank pages.
+     */
+    private void createMultiPagePdf(Path filePath, int pageCount) throws Exception {
+        try (PDDocument document = new PDDocument()) {
+            for (int i = 0; i < pageCount; i++) {
+                PDPage page = new PDPage();
+                document.addPage(page);
+            }
+            document.save(filePath.toAbsolutePath().toString());
+        }
+    }
+
+    /**
+     * Creates a blank PDF with a single page and no text.
+     */
+    private void createBlankPdf(Path filePath) throws Exception {
+        try (PDDocument document = new PDDocument()) {
+            PDPage page = new PDPage();
+            document.addPage(page);
+            document.save(filePath.toAbsolutePath().toString());
+        }
+    }
+}
diff --git a/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PdfExtractionContentError.java b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PdfExtractionContentError.java
new file mode 100644
index 0000000..8465767
--- /dev/null
+++ b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PdfExtractionContentError.java
@@ -0,0 +1,36 @@
+package de.gecheckt.pdf.umbenenner.domain.model;
+
+import java.util.Objects;
+
+/**
+ * Represents a content-related failure during PDF text extraction.
+ *

+ * This indicates that the PDF file itself is readable (no I/O error), + * but its content is not suitable for text extraction. + *

+ * Examples: PDF is image-only (not OCR'd), PDF is encrypted and cannot be unlocked, + * PDF is severely corrupted in the content layer. + *

+ * This is typically a deterministic, non-retryable condition for a given source file + * (unless the source file is modified and re-scanned in a later run). + * + * @param reason a human-readable explanation of why extraction failed (non-null, non-empty) + * @since M3-AP-001 + */ +public record PdfExtractionContentError( + String reason +) implements PdfExtractionResult { + /** + * Constructor with validation. + * + * @param reason must be non-null and non-empty + * @throws NullPointerException if reason is null + * @throws IllegalArgumentException if reason is empty + */ + public PdfExtractionContentError { + Objects.requireNonNull(reason, "reason must not be null"); + if (reason.isEmpty()) { + throw new IllegalArgumentException("reason must not be empty"); + } + } +} diff --git a/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PdfExtractionResult.java b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PdfExtractionResult.java index c73002a..32f0c05 100644 --- a/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PdfExtractionResult.java +++ b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PdfExtractionResult.java @@ -1,7 +1,5 @@ package de.gecheckt.pdf.umbenenner.domain.model; -import java.util.Objects; - /** * Sealed interface representing the outcome of PDF text extraction. *

@@ -27,96 +25,3 @@ public sealed interface PdfExtractionResult permits PdfExtractionSuccess, PdfExtractionContentError, PdfExtractionTechnicalError { // Marker interface; concrete implementations define structure } - -/** - * Represents successful PDF text extraction. - *

- * When this result is obtained, both text content and page count have been - * successfully extracted and are guaranteed to be valid. - * - * @param extractedText the full text content extracted from the PDF (non-null, may be empty string) - * @param pageCount the number of pages in the PDF (non-null, validated >= 1) - * @since M3-AP-001 - */ -record PdfExtractionSuccess( - String extractedText, - PdfPageCount pageCount -) implements PdfExtractionResult { - /** - * Constructor with validation. - * - * @param extractedText must be non-null (may be empty) - * @param pageCount must be non-null - * @throws NullPointerException if either parameter is null - */ - PdfExtractionSuccess { - Objects.requireNonNull(extractedText, "extractedText must not be null"); - Objects.requireNonNull(pageCount, "pageCount must not be null"); - } -} - -/** - * Represents a content-related failure during PDF text extraction. - *

- * This indicates that the PDF file itself is readable (no I/O error), - * but its content is not suitable for text extraction. - *

- * Examples: PDF is image-only (not OCR'd), PDF is encrypted and cannot be unlocked, - * PDF is severely corrupted in the content layer. - *

- * This is typically a deterministic, non-retryable condition for a given source file - * (unless the source file is modified and re-scanned in a later run). - * - * @param reason a human-readable explanation of why extraction failed (non-null, non-empty) - * @since M3-AP-001 - */ -record PdfExtractionContentError( - String reason -) implements PdfExtractionResult { - /** - * Constructor with validation. - * - * @param reason must be non-null and non-empty - * @throws NullPointerException if reason is null - * @throws IllegalArgumentException if reason is empty - */ - PdfExtractionContentError { - Objects.requireNonNull(reason, "reason must not be null"); - if (reason.isEmpty()) { - throw new IllegalArgumentException("reason must not be empty"); - } - } -} - -/** - * Represents a technical (infrastructure) failure during PDF text extraction. - *

- * This indicates that something went wrong with the extraction process itself, - * such as file I/O errors, PDFBox library problems, or out-of-memory conditions. - *

- * These are typically retryable conditions in later batch runs, as they may be - * transient infrastructure issues. - * - * @param errorMessage a description of what went wrong (non-null, non-empty) - * @param cause the underlying exception, if any (may be null) - * @since M3-AP-001 - */ -record PdfExtractionTechnicalError( - String errorMessage, - Throwable cause -) implements PdfExtractionResult { - /** - * Constructor with validation. - * - * @param errorMessage must be non-null and non-empty - * @param cause may be null - * @throws NullPointerException if errorMessage is null - * @throws IllegalArgumentException if errorMessage is empty - */ - PdfExtractionTechnicalError { - Objects.requireNonNull(errorMessage, "errorMessage must not be null"); - if (errorMessage.isEmpty()) { - throw new IllegalArgumentException("errorMessage must not be empty"); - } - } -} diff --git a/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PdfExtractionSuccess.java b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PdfExtractionSuccess.java new file mode 100644 index 0000000..96f982e --- /dev/null +++ b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PdfExtractionSuccess.java @@ -0,0 +1,30 @@ +package de.gecheckt.pdf.umbenenner.domain.model; + +import java.util.Objects; + +/** + * Represents successful PDF text extraction. + *

+ * When this result is obtained, both text content and page count have been + * successfully extracted and are guaranteed to be valid. + * + * @param extractedText the full text content extracted from the PDF (non-null, may be empty string) + * @param pageCount the number of pages in the PDF (non-null, validated >= 1) + * @since M3-AP-001 + */ +public record PdfExtractionSuccess( + String extractedText, + PdfPageCount pageCount +) implements PdfExtractionResult { + /** + * Constructor with validation. + * + * @param extractedText must be non-null (may be empty) + * @param pageCount must be non-null + * @throws NullPointerException if either parameter is null + */ + public PdfExtractionSuccess { + Objects.requireNonNull(extractedText, "extractedText must not be null"); + Objects.requireNonNull(pageCount, "pageCount must not be null"); + } +} diff --git a/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PdfExtractionTechnicalError.java b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PdfExtractionTechnicalError.java new file mode 100644 index 0000000..fd706d9 --- /dev/null +++ b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PdfExtractionTechnicalError.java @@ -0,0 +1,36 @@ +package de.gecheckt.pdf.umbenenner.domain.model; + +import java.util.Objects; + +/** + * Represents a technical (infrastructure) failure during PDF text extraction. + *

+ * This indicates that something went wrong with the extraction process itself, + * such as file I/O errors, PDFBox library problems, or out-of-memory conditions. + *

+ * These are typically retryable conditions in later batch runs, as they may be + * transient infrastructure issues. + * + * @param errorMessage a description of what went wrong (non-null, non-empty) + * @param cause the underlying exception, if any (may be null) + * @since M3-AP-001 + */ +public record PdfExtractionTechnicalError( + String errorMessage, + Throwable cause +) implements PdfExtractionResult { + /** + * Constructor with validation. + * + * @param errorMessage must be non-null and non-empty + * @param cause may be null + * @throws NullPointerException if errorMessage is null + * @throws IllegalArgumentException if errorMessage is empty + */ + public PdfExtractionTechnicalError { + Objects.requireNonNull(errorMessage, "errorMessage must not be null"); + if (errorMessage.isEmpty()) { + throw new IllegalArgumentException("errorMessage must not be empty"); + } + } +}