M3-Quellordneradapter korrigiert und leere PDF-Kandidaten zugelassen

2026-04-01 18:35:28 +02:00
parent dd282e8f7b
commit 8f138d4cfa
4 changed files with 378 additions and 6 deletions
@@ -0,0 +1,151 @@
+package de.gecheckt.pdf.umbenenner.adapter.outbound.sourcedocument;
+
+import de.gecheckt.pdf.umbenenner.application.port.out.SourceDocumentAccessException;
+import de.gecheckt.pdf.umbenenner.application.port.out.SourceDocumentCandidatesPort;
+import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate;
+import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentLocator;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+import java.util.stream.Stream;
+
+/**
+ * File-system based implementation of {@link SourceDocumentCandidatesPort}.
+ * <p>
+ * AP-002 Implementation: Scans a configured source folder and returns only PDF files
+ * (by extension) as {@link SourceDocumentCandidate} objects.
+ * <p>
+ * Design:
+ * <ul>
+ *   <li>Reads exactly one directory level (no recursion)</li>
+ *   <li>Filters for files with `.pdf` extension (case-insensitive)</li>
+ *   <li>Ignores directories and non-PDF files</li>
+ *   <li>Returns candidates in deterministic, stable order (sorted by absolute path)</li>
+ *   <li>Each candidate's locator contains the absolute path as a String (adapter-internal convention)</li>
+ *   <li>Technical filesystem errors are wrapped in {@link SourceDocumentAccessException}</li>
+ * </ul>
+ * <p>
+ * Non-goals:
+ * <ul>
+ *   <li>No PDF validation (that is AP-003)</li>
+ *   <li>No recursion into subdirectories</li>
+ *   <li>No content evaluation (that happens in AP-004: brauchbarer Text assessment)</li>
+ *   <li>No fachlich evaluation of candidates</li>
+ * </ul>
+ *
+ * @since M3-AP-002
+ */
+public class SourceDocumentCandidatesPortAdapter implements SourceDocumentCandidatesPort {
+
+    private static final String PDF_EXTENSION = ".pdf";
+
+    private final Path sourceFolder;
+
+    /**
+     * Creates a new SourceDocumentCandidatesPortAdapter for the given source folder.
+     *
+     * @param sourceFolder the directory to scan for PDF files; must be a readable directory
+     * @throws NullPointerException if sourceFolder is null
+     */
+    public SourceDocumentCandidatesPortAdapter(Path sourceFolder) {
+        this.sourceFolder = sourceFolder;
+    }
+
+    /**
+     * Loads all PDF candidates from the source folder.
+     * <p>
+     * Scans the source folder at exactly one level (no recursion), identifies PDF files
+     * (by extension), and returns them as candidates in sorted order.
+     * <p>
+     * Each returned candidate carries:
+     * <ul>
+     *   <li>{@code uniqueIdentifier}: the filename (e.g. "document.pdf")</li>
+     *   <li>{@code fileSizeBytes}: the file size in bytes</li>
+     *   <li>{@code locator}: opaque reference containing the absolute path as a String.
+     *       This is an adapter-internal convention and is never interpreted by Domain or Application.</li>
+     * </ul>
+     *
+     * @return a list of discovered PDF candidates sorted by absolute path (may be empty)
+     * @throws SourceDocumentAccessException if the source folder cannot be read or accessed
+     */
+    @Override
+    public List<SourceDocumentCandidate> loadCandidates() throws SourceDocumentAccessException {
+        try {
+            // Validate that source folder exists and is readable
+            if (!Files.exists(sourceFolder)) {
+                throw new SourceDocumentAccessException(
+                    "Source folder does not exist: " + sourceFolder.toAbsolutePath());
+            }
+            if (!Files.isDirectory(sourceFolder)) {
+                throw new SourceDocumentAccessException(
+                    "Source folder is not a directory: " + sourceFolder.toAbsolutePath());
+            }
+            if (!Files.isReadable(sourceFolder)) {
+                throw new SourceDocumentAccessException(
+                    "Source folder is not readable: " + sourceFolder.toAbsolutePath());
+            }
+
+            // Scan folder: list exactly one level, filter for PDF files, sort deterministically
+            List<Path> pdfPaths;
+            try (Stream<Path> stream = Files.list(sourceFolder)) {
+                pdfPaths = stream
+                    .filter(Files::isRegularFile)                    // Only files, not directories
+                    .filter(this::isPdfFile)                         // Only .pdf extension
+                    .sorted()                                        // Deterministic order (by path)
+                    .toList();
+            }
+
+            // Convert paths to candidates
+            List<SourceDocumentCandidate> candidates = new java.util.ArrayList<>();
+            for (Path path : pdfPaths) {
+                candidates.add(toSourceDocumentCandidate(path));
+            }
+            return candidates;
+
+        } catch (SourceDocumentAccessException e) {
+            throw e;
+        } catch (IOException e) {
+            throw new SourceDocumentAccessException(
+                "Failed to read source folder: " + sourceFolder.toAbsolutePath(), e);
+        } catch (Exception e) {
+            throw new SourceDocumentAccessException(
+                "Unexpected error while scanning source folder: " + sourceFolder.toAbsolutePath(), e);
+        }
+    }
+
+    /**
+     * Checks if a file is a PDF by extension (case-insensitive).
+     *
+     * @param path the file path to check
+     * @return true if the file name ends with .pdf (case-insensitive), false otherwise
+     */
+    private boolean isPdfFile(Path path) {
+        String fileName = path.getFileName().toString().toLowerCase();
+        return fileName.endsWith(PDF_EXTENSION);
+    }
+
+    /**
+     * Converts a file path to a SourceDocumentCandidate.
+     * <p>
+     * The locator is set to the absolute path as a String. This is an adapter-internal
+     * convention that allows the extraction adapter to re-locate the file later.
+     * Domain and Application never interpret this value.
+     *
+     * @param path the file path
+     * @return a new SourceDocumentCandidate with metadata extracted from the path
+     * @throws IOException if file size or path cannot be determined
+     */
+    private SourceDocumentCandidate toSourceDocumentCandidate(Path path) throws IOException {
+        long fileSizeBytes = Files.size(path);
+        String fileName = path.getFileName().toString();
+        String absolutePath = path.toAbsolutePath().toString();
+
+        return new SourceDocumentCandidate(
+            fileName,
+            fileSizeBytes,
+            new SourceDocumentLocator(absolutePath)
+        );
+    }
+}
@@ -0,0 +1,12 @@
+/**
+ * Source document adapters for discovering and accessing PDF candidates.
+ * <p>
+ * M3-AP-002 implementations:
+ * <ul>
+ *   <li>{@link de.gecheckt.pdf.umbenenner.adapter.outbound.sourcedocument.SourceDocumentCandidatesPortAdapter}
+ *       — File-system based discovery of PDF candidates from the source folder</li>
+ * </ul>
+ *
+ * @since M3-AP-002
+ */
+package de.gecheckt.pdf.umbenenner.adapter.outbound.sourcedocument;
@@ -0,0 +1,209 @@
+package de.gecheckt.pdf.umbenenner.adapter.outbound.sourcedocument;
+
+import de.gecheckt.pdf.umbenenner.application.port.out.SourceDocumentAccessException;
+import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+/**
+ * Tests for {@link SourceDocumentCandidatesPortAdapter}.
+ * <p>
+ * Every test works against a fresh JUnit-managed temporary directory, so the tests are
+ * independent of each other and of the host file system layout.
+ *
+ * @since M3-AP-002
+ */
+class SourceDocumentCandidatesPortAdapterTest {
+
+    @TempDir
+    Path tempDir;
+
+    private SourceDocumentCandidatesPortAdapter adapter;
+
+    @BeforeEach
+    void setUp() {
+        adapter = new SourceDocumentCandidatesPortAdapter(tempDir);
+    }
+
+    @Test
+    void testLoadCandidates_EmptyFolder() throws IOException {
+        List<SourceDocumentCandidate> candidates = adapter.loadCandidates();
+
+        assertNotNull(candidates);
+        assertTrue(candidates.isEmpty(), "Empty folder should return empty list");
+    }
+
+    @Test
+    void testLoadCandidates_OnlyPdfFiles() throws IOException {
+        // Create test PDF files
+        Files.writeString(tempDir.resolve("document1.pdf"), "pdf content");
+        Files.writeString(tempDir.resolve("document2.pdf"), "pdf content");
+
+        List<SourceDocumentCandidate> candidates = adapter.loadCandidates();
+
+        assertEquals(2, candidates.size(), "Should return exactly 2 PDF candidates");
+        assertTrue(candidates.stream()
+                .allMatch(c -> c.uniqueIdentifier().endsWith(".pdf")),
+            "All candidates should be PDF files");
+    }
+
+    @Test
+    void testLoadCandidates_FiltersNonPdfFiles() throws IOException {
+        // Create mixed file types
+        Files.writeString(tempDir.resolve("document.pdf"), "content");
+        Files.writeString(tempDir.resolve("image.png"), "content");
+        Files.writeString(tempDir.resolve("text.txt"), "content");
+        Files.writeString(tempDir.resolve("data.xlsx"), "content");
+
+        List<SourceDocumentCandidate> candidates = adapter.loadCandidates();
+
+        assertEquals(1, candidates.size(), "Should return only 1 PDF candidate");
+        assertEquals("document.pdf", candidates.get(0).uniqueIdentifier());
+    }
+
+    @Test
+    void testLoadCandidates_IgnoresDirectories() throws IOException {
+        // Create a PDF in the root folder and another one nested one level deeper
+        Files.writeString(tempDir.resolve("document.pdf"), "content");
+        Path subfolder = Files.createDirectory(tempDir.resolve("subfolder"));
+        Files.writeString(subfolder.resolve("nested.pdf"), "content");
+
+        List<SourceDocumentCandidate> candidates = adapter.loadCandidates();
+
+        assertEquals(1, candidates.size(), "Should return only 1 PDF candidate (in root folder)");
+        assertEquals("document.pdf", candidates.get(0).uniqueIdentifier());
+    }
+
+    @Test
+    void testLoadCandidates_CaseInsensitiveExtension() throws IOException {
+        // Create PDFs with various case combinations
+        Files.writeString(tempDir.resolve("file1.pdf"), "content");
+        Files.writeString(tempDir.resolve("file2.PDF"), "content");
+        Files.writeString(tempDir.resolve("file3.Pdf"), "content");
+        Files.writeString(tempDir.resolve("file4.pDf"), "content");
+
+        List<SourceDocumentCandidate> candidates = adapter.loadCandidates();
+
+        assertEquals(4, candidates.size(), "Should recognize PDF in any case combination");
+    }
+
+    @Test
+    void testLoadCandidates_DeterministicOrder() throws IOException {
+        // Create PDFs in non-alphabetical order
+        Files.writeString(tempDir.resolve("zebra.pdf"), "content");
+        Files.writeString(tempDir.resolve("apple.pdf"), "content");
+        Files.writeString(tempDir.resolve("monkey.pdf"), "content");
+
+        List<SourceDocumentCandidate> candidates = adapter.loadCandidates();
+
+        // Pin the actual order, not just repeatability: all files share one parent folder,
+        // so sorting by path is equivalent to sorting by file name.
+        List<String> names = candidates.stream()
+            .map(SourceDocumentCandidate::uniqueIdentifier)
+            .toList();
+        assertEquals(List.of("apple.pdf", "monkey.pdf", "zebra.pdf"), names,
+            "Candidates should be sorted by path");
+
+        List<SourceDocumentCandidate> candidates2 = adapter.loadCandidates();
+        assertEquals(candidates, candidates2, "Multiple calls should return same order");
+    }
+
+    @Test
+    void testLoadCandidates_FileSizeMetadata() throws IOException {
+        // 18 ASCII characters are written as 18 bytes in UTF-8
+        Path pdfFile = tempDir.resolve("test.pdf");
+        Files.writeString(pdfFile, "test content 12345");
+
+        List<SourceDocumentCandidate> candidates = adapter.loadCandidates();
+
+        assertEquals(1, candidates.size());
+        SourceDocumentCandidate candidate = candidates.get(0);
+        assertEquals(18, candidate.fileSizeBytes(), "File size should match written content");
+    }
+
+    @Test
+    void testLoadCandidates_UniqueIdentifier() throws IOException {
+        Path pdfFile = tempDir.resolve("myfile.pdf");
+        Files.writeString(pdfFile, "content");
+
+        List<SourceDocumentCandidate> candidates = adapter.loadCandidates();
+
+        assertEquals(1, candidates.size());
+        assertEquals("myfile.pdf", candidates.get(0).uniqueIdentifier(),
+            "uniqueIdentifier should be filename");
+    }
+
+    @Test
+    void testLoadCandidates_LocatorContainsAbsolutePath() throws IOException {
+        Path pdfFile = tempDir.resolve("test.pdf");
+        Files.writeString(pdfFile, "content");
+
+        List<SourceDocumentCandidate> candidates = adapter.loadCandidates();
+
+        assertEquals(1, candidates.size());
+        String locatorValue = candidates.get(0).locator().value();
+        assertTrue(locatorValue.contains("test.pdf"), "Locator should contain filename");
+        assertTrue(Path.of(locatorValue).isAbsolute(),
+            "Locator value should be an absolute path");
+    }
+
+    @Test
+    void testLoadCandidates_SourceFolderNotFound() {
+        Path nonExistentFolder = tempDir.resolve("does-not-exist");
+        SourceDocumentCandidatesPortAdapter adapterForMissing =
+            new SourceDocumentCandidatesPortAdapter(nonExistentFolder);
+
+        SourceDocumentAccessException ex = assertThrows(
+            SourceDocumentAccessException.class,
+            adapterForMissing::loadCandidates,
+            "Should throw exception for non-existent source folder");
+
+        assertTrue(ex.getMessage().contains("does not exist"));
+    }
+
+    @Test
+    void testLoadCandidates_SourceFolderIsFile() throws IOException {
+        Path fileInsteadOfFolder = tempDir.resolve("regular-file");
+        Files.createFile(fileInsteadOfFolder);
+
+        SourceDocumentCandidatesPortAdapter adapterForFile =
+            new SourceDocumentCandidatesPortAdapter(fileInsteadOfFolder);
+
+        SourceDocumentAccessException ex = assertThrows(
+            SourceDocumentAccessException.class,
+            adapterForFile::loadCandidates,
+            "Should throw exception if source path is a file, not a folder");
+
+        assertTrue(ex.getMessage().contains("not a directory"));
+    }
+
+    @Test
+    void testLoadCandidates_HasLocatorForEachCandidate() throws IOException {
+        Files.createFile(tempDir.resolve("file1.pdf"));
+        Files.createFile(tempDir.resolve("file2.pdf"));
+
+        List<SourceDocumentCandidate> candidates = adapter.loadCandidates();
+
+        // Guard against a vacuous pass: the loop below asserts nothing on an empty list
+        assertEquals(2, candidates.size(), "Both PDF files should be returned as candidates");
+        for (SourceDocumentCandidate candidate : candidates) {
+            assertNotNull(candidate.locator(), "Each candidate must have a locator");
+            assertNotNull(candidate.locator().value(), "Locator value must not be null");
+            assertFalse(candidate.locator().value().isEmpty(), "Locator value must not be empty");
+        }
+    }
+
+    @Test
+    void testLoadCandidates_EmptyPdfFilesAreIncluded() throws IOException {
+        // Create empty PDF files (M3-AP-002 requirement: PDF files in the source folder)
+        Files.createFile(tempDir.resolve("empty1.pdf"));
+        Files.createFile(tempDir.resolve("empty2.pdf"));
+        // Also add a non-empty PDF for contrast
+        Files.writeString(tempDir.resolve("nonempty.pdf"), "content");
+
+        List<SourceDocumentCandidate> candidates = adapter.loadCandidates();
+
+        assertEquals(3, candidates.size(),
+            "Empty PDF files should be included as candidates; content evaluation happens in AP-004");
+        assertTrue(candidates.stream().allMatch(c -> c.uniqueIdentifier().endsWith(".pdf")),
+            "All candidates should be PDF files");
+        assertEquals(2, candidates.stream().filter(c -> c.fileSizeBytes() == 0).count(),
+            "Empty PDF files should be reported with a file size of zero");
+    }
+}
@@ -16,7 +16,7 @@ import java.util.Objects;
 * Fields:
 * <ul>
 *   <li>{@code uniqueIdentifier} — human-readable name for logging and correlation (e.g. filename)</li>
- *   <li>{@code fileSizeBytes} — enables early detection of corrupt/empty documents</li>
+ *   <li>{@code fileSizeBytes} — file size for metadata and tracing; may be zero for empty files (content evaluation happens later in AP-004)</li>
 *   <li>{@code locator} — opaque reference passed through unchanged to the extraction adapter;
 *       Domain and Application never interpret its value</li>
 * </ul>
@@ -36,23 +36,23 @@ public record SourceDocumentCandidate(
     * Ensures all parameters are non-null and meaningful:
     * <ul>
     *   <li>{@code uniqueIdentifier} must be non-null and non-empty</li>
-     *   <li>{@code fileSizeBytes} must be positive</li>
+     *   <li>{@code fileSizeBytes} must be non-negative (may be zero for empty files; content evaluation is AP-004)</li>
     *   <li>{@code locator} must be non-null</li>
     * </ul>
     *
     * @param uniqueIdentifier non-null, non-empty identifier for logging and correlation
-     * @param fileSizeBytes must be &gt; 0
+     * @param fileSizeBytes must be &gt;= 0 (may be 0; content evaluation happens in AP-004)
     * @param locator non-null opaque locator; only adapters interpret its value
     * @throws NullPointerException if uniqueIdentifier or locator is null
-     * @throws IllegalArgumentException if uniqueIdentifier is empty or fileSizeBytes &lt;= 0
+     * @throws IllegalArgumentException if uniqueIdentifier is empty or fileSizeBytes &lt; 0
     */
    public SourceDocumentCandidate {
        Objects.requireNonNull(uniqueIdentifier, "uniqueIdentifier must not be null");
        if (uniqueIdentifier.isEmpty()) {
            throw new IllegalArgumentException("uniqueIdentifier must not be empty");
        }
-        if (fileSizeBytes <= 0) {
-            throw new IllegalArgumentException("fileSizeBytes must be positive");
+        if (fileSizeBytes < 0) {
+            throw new IllegalArgumentException("fileSizeBytes must not be negative");
        }
        Objects.requireNonNull(locator, "locator must not be null");
    }