1
0

M3-APP-03: PDFBox-Extraktion technisch sauber abgegrenzt und Fehlersemantik korrigiert
This commit is contained in:
2026-04-01 18:54:35 +02:00
parent 8f138d4cfa
commit a9407aaba2
8 changed files with 446 additions and 96 deletions

View File

@@ -0,0 +1,164 @@
package de.gecheckt.pdf.umbenenner.adapter.outbound.pdfextraction;
import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionResult;
import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionSuccess;
import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionTechnicalError;
import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate;
import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentLocator;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import java.nio.file.Files;
import java.nio.file.Path;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertInstanceOf;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
* Tests for {@link PdfTextExtractionPortAdapter}.
* <p>
* M3-AP-003: Minimal tests validating basic extraction functionality and technical error handling.
* In AP-003 scope: all extraction problems are treated as TechnicalError, not ContentError.
* No fachliche validation of text content (that is AP-004).
* PDFs are created programmatically using PDFBox to avoid external dependencies on test files.
*
* @since M3-AP-003
*/
class PdfTextExtractionPortAdapterTest {
private PdfTextExtractionPortAdapter adapter;
@TempDir
Path tempDir;
@BeforeEach
void setUp() {
adapter = new PdfTextExtractionPortAdapter();
}
@Test
void testNullCandidateThrowsNullPointerException() {
assertThrows(NullPointerException.class, () -> adapter.extractTextAndPageCount(null));
}
@Test
void testNonExistentFileReturnsTechnicalError() throws Exception {
SourceDocumentCandidate candidate = new SourceDocumentCandidate(
"nonexistent.pdf",
1,
new SourceDocumentLocator("/path/that/does/not/exist.pdf")
);
PdfExtractionResult result = adapter.extractTextAndPageCount(candidate);
assertInstanceOf(PdfExtractionTechnicalError.class, result);
PdfExtractionTechnicalError error = (PdfExtractionTechnicalError) result;
assertTrue(error.errorMessage().contains("not found"));
}
@Test
void testSimplePdfExtractionSuccess() throws Exception {
// Create a simple single-page PDF
Path pdfFile = tempDir.resolve("simple.pdf");
createSimplePdf(pdfFile);
SourceDocumentCandidate candidate = new SourceDocumentCandidate(
"simple.pdf",
Files.size(pdfFile),
new SourceDocumentLocator(pdfFile.toAbsolutePath().toString())
);
PdfExtractionResult result = adapter.extractTextAndPageCount(candidate);
assertInstanceOf(PdfExtractionSuccess.class, result);
PdfExtractionSuccess success = (PdfExtractionSuccess) result;
assertEquals(1, success.pageCount().value());
assertNotNull(success.extractedText());
}
@Test
void testMultiPagePdfExtractionSuccess() throws Exception {
// Create a three-page PDF
Path pdfFile = tempDir.resolve("multipage.pdf");
createMultiPagePdf(pdfFile, 3);
SourceDocumentCandidate candidate = new SourceDocumentCandidate(
"multipage.pdf",
Files.size(pdfFile),
new SourceDocumentLocator(pdfFile.toAbsolutePath().toString())
);
PdfExtractionResult result = adapter.extractTextAndPageCount(candidate);
assertInstanceOf(PdfExtractionSuccess.class, result);
PdfExtractionSuccess success = (PdfExtractionSuccess) result;
assertEquals(3, success.pageCount().value());
assertNotNull(success.extractedText());
}
@Test
void testReadablePdfWithEmptyTextReturnsSuccess() throws Exception {
// Create a PDF with no text content (blank page)
// This is a technically readable PDF, so it should succeed
Path pdfFile = tempDir.resolve("blank.pdf");
createBlankPdf(pdfFile);
SourceDocumentCandidate candidate = new SourceDocumentCandidate(
"blank.pdf",
Files.size(pdfFile),
new SourceDocumentLocator(pdfFile.toAbsolutePath().toString())
);
PdfExtractionResult result = adapter.extractTextAndPageCount(candidate);
// AP-003: Empty text is SUCCESS, not an error
// Fachliche Bewertung of text content happens in AP-004
assertInstanceOf(PdfExtractionSuccess.class, result);
PdfExtractionSuccess success = (PdfExtractionSuccess) result;
assertEquals(1, success.pageCount().value());
assertNotNull(success.extractedText()); // May be empty, but not null
}
// --- Helper methods to create test PDFs ---
/**
* Creates a simple single-page PDF.
*/
private void createSimplePdf(Path filePath) throws Exception {
PDDocument document = new PDDocument();
PDPage page = new PDPage();
document.addPage(page);
document.save(filePath.toAbsolutePath().toString());
document.close();
}
/**
* Creates a PDF with multiple blank pages.
*/
private void createMultiPagePdf(Path filePath, int pageCount) throws Exception {
PDDocument document = new PDDocument();
for (int i = 0; i < pageCount; i++) {
PDPage page = new PDPage();
document.addPage(page);
}
document.save(filePath.toAbsolutePath().toString());
document.close();
}
/**
* Creates a blank PDF with a single page and no text.
*/
private void createBlankPdf(Path filePath) throws Exception {
PDDocument document = new PDDocument();
PDPage page = new PDPage();
document.addPage(page);
document.save(filePath.toAbsolutePath().toString());
document.close();
}
}