1
0

M3-Quellordneradapter korrigiert und leere PDF-Kandidaten zugelassen

This commit is contained in:
2026-04-01 18:35:28 +02:00
parent dd282e8f7b
commit 8f138d4cfa
4 changed files with 378 additions and 6 deletions

View File

@@ -0,0 +1,151 @@
package de.gecheckt.pdf.umbenenner.adapter.outbound.sourcedocument;
import de.gecheckt.pdf.umbenenner.application.port.out.SourceDocumentAccessException;
import de.gecheckt.pdf.umbenenner.application.port.out.SourceDocumentCandidatesPort;
import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate;
import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentLocator;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.stream.Stream;
/**
* File-system based implementation of {@link SourceDocumentCandidatesPort}.
* <p>
* AP-002 Implementation: Scans a configured source folder and returns only PDF files
* (by extension) as {@link SourceDocumentCandidate} objects.
* <p>
* Design:
* <ul>
* <li>Reads exactly one directory level (no recursion)</li>
* <li>Filters for files with `.pdf` extension (case-insensitive)</li>
* <li>Ignores directories and non-PDF files</li>
* <li>Returns candidates in deterministic, stable order (sorted by absolute path)</li>
* <li>Each candidate's locator contains the absolute path as a String (adapter-internal convention)</li>
* <li>Technical filesystem errors are wrapped in {@link SourceDocumentAccessException}</li>
* </ul>
* <p>
* Non-goals:
* <ul>
* <li>No PDF validation (that is AP-003)</li>
* <li>No recursion into subdirectories</li>
* <li>No content evaluation (that happens in AP-004: usable-text assessment)</li>
* <li>No business-level (domain) evaluation of candidates</li>
* </ul>
*
* @since M3-AP-002
*/
public class SourceDocumentCandidatesPortAdapter implements SourceDocumentCandidatesPort {

    /** Lower-case file-name suffix that marks a file as a PDF candidate. */
    private static final String PDF_EXTENSION = ".pdf";

    /** Directory that is scanned for candidates; never {@code null}. */
    private final Path sourceFolder;

    /**
     * Creates a new SourceDocumentCandidatesPortAdapter for the given source folder.
     * <p>
     * Only the reference is checked here. Whether the path exists, is a directory and is
     * readable is verified on every call to {@link #loadCandidates()}.
     *
     * @param sourceFolder the directory to scan for PDF files; must not be null
     * @throws NullPointerException if sourceFolder is null
     */
    public SourceDocumentCandidatesPortAdapter(Path sourceFolder) {
        // Fail fast: without this check a null folder would only surface later inside
        // loadCandidates(), wrapped as an "unexpected error".
        this.sourceFolder = Objects.requireNonNull(sourceFolder, "sourceFolder must not be null");
    }

    /**
     * Loads all PDF candidates from the source folder.
     * <p>
     * Scans the source folder at exactly one level (no recursion), keeps regular files whose
     * name ends with {@code .pdf} (case-insensitive), and returns them as candidates in the
     * natural order of their paths.
     * <p>
     * Each returned candidate carries:
     * <ul>
     *   <li>{@code uniqueIdentifier}: the filename (e.g. "document.pdf")</li>
     *   <li>{@code fileSizeBytes}: the file size in bytes</li>
     *   <li>{@code locator}: opaque reference containing the absolute path as a String.
     *       This is an adapter-internal convention and is never interpreted by Domain or
     *       Application.</li>
     * </ul>
     *
     * @return a list of discovered PDF candidates sorted by path (may be empty)
     * @throws SourceDocumentAccessException if the source folder does not exist, is not a
     *         directory, is not readable, or cannot be scanned
     */
    @Override
    public List<SourceDocumentCandidate> loadCandidates() throws SourceDocumentAccessException {
        try {
            // Validate that source folder exists and is readable
            if (!Files.exists(sourceFolder)) {
                throw new SourceDocumentAccessException(
                        "Source folder does not exist: " + sourceFolder.toAbsolutePath());
            }
            if (!Files.isDirectory(sourceFolder)) {
                throw new SourceDocumentAccessException(
                        "Source folder is not a directory: " + sourceFolder.toAbsolutePath());
            }
            if (!Files.isReadable(sourceFolder)) {
                throw new SourceDocumentAccessException(
                        "Source folder is not readable: " + sourceFolder.toAbsolutePath());
            }

            // Scan folder: list exactly one level, filter for PDF files, sort deterministically.
            // The stream holds an open directory handle, hence try-with-resources.
            List<Path> pdfPaths;
            try (Stream<Path> stream = Files.list(sourceFolder)) {
                pdfPaths = stream
                        .filter(Files::isRegularFile) // Only files, not directories
                        .filter(this::isPdfFile)      // Only .pdf extension
                        .sorted()                     // Deterministic order (by path)
                        .toList();
            }

            // Convert paths to candidates. A plain loop is used because reading the file
            // size may throw a checked IOException.
            List<SourceDocumentCandidate> candidates = new ArrayList<>(pdfPaths.size());
            for (Path path : pdfPaths) {
                candidates.add(toSourceDocumentCandidate(path));
            }
            return candidates;
        } catch (SourceDocumentAccessException e) {
            // Already carries a precise message; do not wrap it a second time.
            throw e;
        } catch (IOException | UncheckedIOException e) {
            // Files.list reports I/O failures during lazy iteration as UncheckedIOException;
            // both variants mean the folder could not be read.
            throw new SourceDocumentAccessException(
                    "Failed to read source folder: " + sourceFolder.toAbsolutePath(), e);
        } catch (Exception e) {
            // Adapter boundary: anything else (e.g. SecurityException) is still reported
            // through the port's exception type.
            throw new SourceDocumentAccessException(
                    "Unexpected error while scanning source folder: " + sourceFolder.toAbsolutePath(), e);
        }
    }

    /**
     * Checks if a file is a PDF by extension (case-insensitive).
     *
     * @param path the file path to check
     * @return true if the file name ends with .pdf (case-insensitive), false otherwise
     */
    private boolean isPdfFile(Path path) {
        // Locale.ROOT keeps the case folding independent of the JVM's default locale.
        String fileName = path.getFileName().toString().toLowerCase(Locale.ROOT);
        return fileName.endsWith(PDF_EXTENSION);
    }

    /**
     * Converts a file path to a SourceDocumentCandidate.
     * <p>
     * The locator is set to the absolute path as a String. This is an adapter-internal
     * convention that allows the extraction adapter to re-locate the file later.
     * Domain and Application never interpret this value.
     *
     * @param path the file path
     * @return a new SourceDocumentCandidate with metadata extracted from the path
     * @throws IOException if the file size cannot be determined
     */
    private SourceDocumentCandidate toSourceDocumentCandidate(Path path) throws IOException {
        long fileSizeBytes = Files.size(path);
        String fileName = path.getFileName().toString();
        String absolutePath = path.toAbsolutePath().toString();
        return new SourceDocumentCandidate(
                fileName,
                fileSizeBytes,
                new SourceDocumentLocator(absolutePath)
        );
    }
}

View File

@@ -0,0 +1,12 @@
/**
* Source document adapters for discovering and accessing PDF candidates.
* <p>
* M3-AP-002 implementations:
* <ul>
* <li>{@link de.gecheckt.pdf.umbenenner.adapter.outbound.sourcedocument.SourceDocumentCandidatesPortAdapter}
* — File-system based discovery of PDF candidates from the source folder</li>
* </ul>
*
* @since M3-AP-002
*/
package de.gecheckt.pdf.umbenenner.adapter.outbound.sourcedocument;

View File

@@ -0,0 +1,209 @@
package de.gecheckt.pdf.umbenenner.adapter.outbound.sourcedocument;
import de.gecheckt.pdf.umbenenner.application.port.out.SourceDocumentAccessException;
import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import static org.junit.jupiter.api.Assertions.*;
/**
* Tests for {@link SourceDocumentCandidatesPortAdapter}.
*
* @since M3-AP-002
*/
class SourceDocumentCandidatesPortAdapterTest {

    /** Fresh, empty directory provided by JUnit for every test method. */
    @TempDir
    Path tempDir;

    /** Adapter under test, bound to {@link #tempDir}. */
    private SourceDocumentCandidatesPortAdapter adapter;

    @BeforeEach
    void setUp() {
        adapter = new SourceDocumentCandidatesPortAdapter(tempDir);
    }

    /** An empty source folder yields an empty, non-null list. */
    @Test
    void testLoadCandidates_EmptyFolder() throws IOException {
        List<SourceDocumentCandidate> candidates = adapter.loadCandidates();

        assertNotNull(candidates);
        assertTrue(candidates.isEmpty(), "Empty folder should return empty list");
    }

    /** Every PDF in the folder is returned as a candidate. */
    @Test
    void testLoadCandidates_OnlyPdfFiles() throws IOException {
        // Create test PDF files
        Path pdf1 = tempDir.resolve("document1.pdf");
        Path pdf2 = tempDir.resolve("document2.pdf");
        Files.writeString(pdf1, "pdf content");
        Files.writeString(pdf2, "pdf content");

        List<SourceDocumentCandidate> candidates = adapter.loadCandidates();

        assertEquals(2, candidates.size(), "Should return exactly 2 PDF candidates");
        assertTrue(candidates.stream()
                        .allMatch(c -> c.uniqueIdentifier().endsWith(".pdf")),
                "All candidates should be PDF files");
    }

    /** Files with other extensions are not returned. */
    @Test
    void testLoadCandidates_FiltersNonPdfFiles() throws IOException {
        // Create mixed file types
        Files.writeString(tempDir.resolve("document.pdf"), "content");
        Files.writeString(tempDir.resolve("image.png"), "content");
        Files.writeString(tempDir.resolve("text.txt"), "content");
        Files.writeString(tempDir.resolve("data.xlsx"), "content");

        List<SourceDocumentCandidate> candidates = adapter.loadCandidates();

        assertEquals(1, candidates.size(), "Should return only 1 PDF candidate");
        assertEquals("document.pdf", candidates.get(0).uniqueIdentifier());
    }

    /** Subdirectories and their contents are ignored (no recursion). */
    @Test
    void testLoadCandidates_IgnoresDirectories() throws IOException {
        // Create files and subdirectories
        Files.writeString(tempDir.resolve("document.pdf"), "content");
        Files.createDirectory(tempDir.resolve("subfolder"));
        Files.writeString(tempDir.resolve("subfolder/nested.pdf"), "content");

        List<SourceDocumentCandidate> candidates = adapter.loadCandidates();

        assertEquals(1, candidates.size(), "Should return only 1 PDF candidate (in root folder)");
        assertEquals("document.pdf", candidates.get(0).uniqueIdentifier());
    }

    /** The extension check does not depend on letter case. */
    @Test
    void testLoadCandidates_CaseInsensitiveExtension() throws IOException {
        // Create PDFs with various case combinations
        Files.writeString(tempDir.resolve("file1.pdf"), "content");
        Files.writeString(tempDir.resolve("file2.PDF"), "content");
        Files.writeString(tempDir.resolve("file3.Pdf"), "content");
        Files.writeString(tempDir.resolve("file4.pDf"), "content");

        List<SourceDocumentCandidate> candidates = adapter.loadCandidates();

        assertEquals(4, candidates.size(), "Should recognize PDF in any case combination");
    }

    /** Candidates come back sorted by path, and repeated calls agree with each other. */
    @Test
    void testLoadCandidates_DeterministicOrder() throws IOException {
        // Create PDFs in non-alphabetical order
        Files.writeString(tempDir.resolve("zebra.pdf"), "content");
        Files.writeString(tempDir.resolve("apple.pdf"), "content");
        Files.writeString(tempDir.resolve("monkey.pdf"), "content");

        List<SourceDocumentCandidate> candidates = adapter.loadCandidates();

        // Verify the actual order, not only that two calls agree: all files share the same
        // parent, so sorting by path is sorting by file name.
        List<String> names = candidates.stream()
                .map(SourceDocumentCandidate::uniqueIdentifier)
                .toList();
        assertEquals(List.of("apple.pdf", "monkey.pdf", "zebra.pdf"), names,
                "Candidates should be sorted by path");

        List<SourceDocumentCandidate> candidates2 = adapter.loadCandidates();
        assertEquals(candidates, candidates2, "Multiple calls should return same order");
    }

    /** The reported size equals the number of bytes on disk. */
    @Test
    void testLoadCandidates_FileSizeMetadata() throws IOException {
        Path pdfFile = tempDir.resolve("test.pdf");
        // 18 ASCII characters, written as UTF-8, are exactly 18 bytes.
        Files.writeString(pdfFile, "test content 12345");

        List<SourceDocumentCandidate> candidates = adapter.loadCandidates();

        assertEquals(1, candidates.size());
        SourceDocumentCandidate candidate = candidates.get(0);
        assertEquals(18, candidate.fileSizeBytes(), "File size should match written content");
    }

    /** The unique identifier is the plain file name. */
    @Test
    void testLoadCandidates_UniqueIdentifier() throws IOException {
        Path pdfFile = tempDir.resolve("myfile.pdf");
        Files.writeString(pdfFile, "content");

        List<SourceDocumentCandidate> candidates = adapter.loadCandidates();

        assertEquals(1, candidates.size());
        assertEquals("myfile.pdf", candidates.get(0).uniqueIdentifier(),
                "uniqueIdentifier should be filename");
    }

    /** The locator value is the absolute path of the file. */
    @Test
    void testLoadCandidates_LocatorContainsAbsolutePath() throws IOException {
        Path pdfFile = tempDir.resolve("test.pdf");
        Files.writeString(pdfFile, "content");

        List<SourceDocumentCandidate> candidates = adapter.loadCandidates();

        assertEquals(1, candidates.size());
        String locatorValue = candidates.get(0).locator().value();
        assertEquals(pdfFile.toAbsolutePath().toString(), locatorValue,
                "Locator should be the absolute path of the file");
        assertTrue(Path.of(locatorValue).isAbsolute(),
                "Locator value should be an absolute path");
    }

    /** A missing source folder is reported through the port's exception type. */
    @Test
    void testLoadCandidates_SourceFolderNotFound() {
        Path nonExistentFolder = tempDir.resolve("does-not-exist");
        SourceDocumentCandidatesPortAdapter adapterForMissing =
                new SourceDocumentCandidatesPortAdapter(nonExistentFolder);

        SourceDocumentAccessException ex = assertThrows(
                SourceDocumentAccessException.class,
                adapterForMissing::loadCandidates,
                "Should throw exception for non-existent source folder");
        assertTrue(ex.getMessage().contains("does not exist"));
    }

    /** A source path that points at a regular file is rejected. */
    @Test
    void testLoadCandidates_SourceFolderIsFile() throws IOException {
        Path fileInsteadOfFolder = tempDir.resolve("regular-file");
        Files.createFile(fileInsteadOfFolder);
        SourceDocumentCandidatesPortAdapter adapterForFile =
                new SourceDocumentCandidatesPortAdapter(fileInsteadOfFolder);

        SourceDocumentAccessException ex = assertThrows(
                SourceDocumentAccessException.class,
                adapterForFile::loadCandidates,
                "Should throw exception if source path is a file, not a folder");
        assertTrue(ex.getMessage().contains("not a directory"));
    }

    /** Every candidate carries a non-null, non-empty locator. */
    @Test
    void testLoadCandidates_HasLocatorForEachCandidate() throws IOException {
        Files.createFile(tempDir.resolve("file1.pdf"));
        Files.createFile(tempDir.resolve("file2.pdf"));

        List<SourceDocumentCandidate> candidates = adapter.loadCandidates();

        // Guard against a vacuous pass: the loop below asserts nothing on an empty list.
        assertEquals(2, candidates.size(), "Both files should be returned as candidates");
        for (SourceDocumentCandidate candidate : candidates) {
            assertNotNull(candidate.locator(), "Each candidate must have a locator");
            assertNotNull(candidate.locator().value(), "Locator value must not be null");
            assertFalse(candidate.locator().value().isEmpty(), "Locator value must not be empty");
        }
    }

    /** Zero-byte PDF files are still candidates; content is judged in a later step. */
    @Test
    void testLoadCandidates_EmptyPdfFilesAreIncluded() throws IOException {
        // Create empty PDF files (M3-AP-002 requirement: PDF files in the source folder)
        Files.createFile(tempDir.resolve("empty1.pdf"));
        Files.createFile(tempDir.resolve("empty2.pdf"));
        // Also add a non-empty PDF for contrast
        Files.writeString(tempDir.resolve("nonempty.pdf"), "content");

        List<SourceDocumentCandidate> candidates = adapter.loadCandidates();

        assertEquals(3, candidates.size(),
                "Empty PDF files should be included as candidates; content evaluation happens in AP-004");
        assertTrue(candidates.stream().allMatch(c -> c.uniqueIdentifier().endsWith(".pdf")),
                "All candidates should be PDF files");
    }
}

View File

@@ -16,7 +16,7 @@ import java.util.Objects;
* Fields:
* <ul>
* <li>{@code uniqueIdentifier} — human-readable name for logging and correlation (e.g. filename)</li>
* <li>{@code fileSizeBytes} — file size for metadata and tracing; may be zero for empty files (content evaluation happens later in AP-004)</li>
* <li>{@code locator} — opaque reference passed through unchanged to the extraction adapter;
* Domain and Application never interpret its value</li>
* </ul>
@@ -36,23 +36,23 @@ public record SourceDocumentCandidate(
* Ensures all parameters are non-null and meaningful:
* <ul>
* <li>{@code uniqueIdentifier} must be non-null and non-empty</li>
* <li>{@code fileSizeBytes} must be non-negative (may be zero for empty files; content evaluation is AP-004)</li>
* <li>{@code locator} must be non-null</li>
* </ul>
*
* @param uniqueIdentifier non-null, non-empty identifier for logging and correlation
* @param fileSizeBytes must be &gt;= 0 (may be 0; content evaluation happens in AP-004)
* @param locator non-null opaque locator; only adapters interpret its value
* @throws NullPointerException if uniqueIdentifier or locator is null
* @throws IllegalArgumentException if uniqueIdentifier is empty or fileSizeBytes &lt; 0
*/
public SourceDocumentCandidate {
    Objects.requireNonNull(uniqueIdentifier, "uniqueIdentifier must not be null");
    if (uniqueIdentifier.isEmpty()) {
        throw new IllegalArgumentException("uniqueIdentifier must not be empty");
    }
    // Zero is a valid size: empty files are accepted as candidates and their content is
    // evaluated in a later step (AP-004). Only negative sizes are rejected.
    if (fileSizeBytes < 0) {
        throw new IllegalArgumentException("fileSizeBytes must not be negative");
    }
    Objects.requireNonNull(locator, "locator must not be null");
}