diff --git a/pdf-umbenenner-application/src/main/java/de/gecheckt/pdf/umbenenner/application/port/out/PdfTextExtractionPort.java b/pdf-umbenenner-application/src/main/java/de/gecheckt/pdf/umbenenner/application/port/out/PdfTextExtractionPort.java new file mode 100644 index 0000000..ad04445 --- /dev/null +++ b/pdf-umbenenner-application/src/main/java/de/gecheckt/pdf/umbenenner/application/port/out/PdfTextExtractionPort.java @@ -0,0 +1,65 @@ +package de.gecheckt.pdf.umbenenner.application.port.out; + +import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionResult; +import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate; + +/** + * Outbound port for extracting text content and page count from a PDF document. + *

+ * This interface abstracts PDF text extraction, allowing the application layer + * to remain independent of the underlying PDF library or extraction mechanism. + *

+ * Responsibilities of the implementing adapter: + *

+ *

+ * Architecture notes: + *

+ *

+ * Non-goals of this port (handled in later milestones): + *

+ * + * @since M3-AP-001 + */ +public interface PdfTextExtractionPort { + + /** + * Extracts text content and page count from a single PDF document. + *

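+ * Performs a single extraction operation and returns the result. + * Extraction failures (content-related or technical) are never thrown as exceptions; + * they are encoded in the returned result type. The only exception is a null candidate, + * which is a programming error and is rejected with a NullPointerException. + *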

+ * The candidate parameter serves as both: + *

+ * + * @param candidate the document to extract; non-null + * @return a {@link PdfExtractionResult} encoding the outcome: + * + * @throws NullPointerException if candidate is null + * @see PdfExtractionResult + */ + PdfExtractionResult extractTextAndPageCount(SourceDocumentCandidate candidate); +} diff --git a/pdf-umbenenner-application/src/main/java/de/gecheckt/pdf/umbenenner/application/port/out/SourceDocumentAccessException.java b/pdf-umbenenner-application/src/main/java/de/gecheckt/pdf/umbenenner/application/port/out/SourceDocumentAccessException.java new file mode 100644 index 0000000..ab3eaab --- /dev/null +++ b/pdf-umbenenner-application/src/main/java/de/gecheckt/pdf/umbenenner/application/port/out/SourceDocumentAccessException.java @@ -0,0 +1,39 @@ +package de.gecheckt.pdf.umbenenner.application.port.out; + +/** + * Exception thrown when source documents cannot be accessed or read. + *

+ * This exception is raised by the {@link SourceDocumentCandidatesPort} when: + *

+ *

+ * This is a runtime exception, allowing adapters to propagate filesystem errors + * without forcing try/catch blocks in the application layer. + * + * @since M3-AP-001 + */ +public final class SourceDocumentAccessException extends RuntimeException { + private static final long serialVersionUID = 1L; + + /** + * Constructs an exception with a message. + * + * @param message descriptive message about the access failure + */ + public SourceDocumentAccessException(String message) { + super(message); + } + + /** + * Constructs an exception with a message and cause. + * + * @param message descriptive message about the access failure + * @param cause the underlying exception that caused this failure + */ + public SourceDocumentAccessException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/pdf-umbenenner-application/src/main/java/de/gecheckt/pdf/umbenenner/application/port/out/SourceDocumentCandidatesPort.java b/pdf-umbenenner-application/src/main/java/de/gecheckt/pdf/umbenenner/application/port/out/SourceDocumentCandidatesPort.java new file mode 100644 index 0000000..39a172b --- /dev/null +++ b/pdf-umbenenner-application/src/main/java/de/gecheckt/pdf/umbenenner/application/port/out/SourceDocumentCandidatesPort.java @@ -0,0 +1,49 @@ +package de.gecheckt.pdf.umbenenner.application.port.out; + +import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate; + +import java.util.List; + +/** + * Outbound port for loading PDF document candidates from the source folder. + *

+ * This interface abstracts filesystem access, allowing the application layer + * to remain independent of how candidates are discovered or retrieved. + *

+ * Responsibilities of the implementing adapter: + *

+ *

+ * Architecture note: + *

+ * + * @since M3-AP-001 + */ +public interface SourceDocumentCandidatesPort { + + /** + * Loads all PDF candidates from the configured source folder. + *

+ * Returns a list of candidates representing discovered PDF files. + * The list is guaranteed to be in deterministic order to enable reproducible runs. + *

+ * Non-goal: This method does NOT filter candidates by any fachlich criteria + * (such as file size, corruption checks, or content validation). + * It delivers raw candidates from the filesystem; fachlich evaluation happens later. + * + * @return a list of discovered PDF candidates in deterministic order (may be empty if no PDFs found) + * @throws SourceDocumentAccessException if the source folder cannot be read or accessed + * @see SourceDocumentCandidate + */ + List loadCandidates() throws SourceDocumentAccessException; +} diff --git a/pdf-umbenenner-application/src/main/java/de/gecheckt/pdf/umbenenner/application/port/out/package-info.java b/pdf-umbenenner-application/src/main/java/de/gecheckt/pdf/umbenenner/application/port/out/package-info.java index 08254df..8a34c0e 100644 --- a/pdf-umbenenner-application/src/main/java/de/gecheckt/pdf/umbenenner/application/port/out/package-info.java +++ b/pdf-umbenenner-application/src/main/java/de/gecheckt/pdf/umbenenner/application/port/out/package-info.java @@ -14,10 +14,20 @@ * — System time access for timestamps and run context * *

+ * M3-AP-001 ports: + *

+ *

* Exception types: *

*

* Architecture Rule: Outbound ports are implementation-agnostic and contain no business logic. diff --git a/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/M3ProcessingDecision.java b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/M3ProcessingDecision.java new file mode 100644 index 0000000..d55979b --- /dev/null +++ b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/M3ProcessingDecision.java @@ -0,0 +1,107 @@ +package de.gecheckt.pdf.umbenenner.domain.model; + +import java.util.Objects; + +/** + * Sealed interface representing the outcome of M3 document pre-checks. + *

+ * This is a placeholder interface introduced in AP-001 to establish the architectural + * pattern for M3 pre-check results. The actual pre-check logic (domain-level validation + * such as "usable text" ("brauchbarer Text") and "page limit" ("Seitenlimit")) is + * implemented in later APs (AP-004, AP-005). + *

+ * There are two allowed implementations: + *

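+ * <ul> + * <li>{@code M3PreCheckPassed}: the document passed all M3 pre-checks</li> + * <li>{@code M3PreCheckFailed}: the document failed an M3 pre-check</li> + * </ul> + *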

+ * Design principles: + *

+ * + * @since M3-AP-001 + */ +public sealed interface M3ProcessingDecision + permits M3PreCheckPassed, M3PreCheckFailed { + // Marker interface; concrete implementations define structure +} + +/** + * Represents a document that passed all M3 pre-checks. + *

+ * This result encapsulates: + *

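+ * <ul> + * <li>{@code candidate}: the source document metadata</li> + * <li>{@code extraction}: the successful text extraction result</li> + * </ul> + *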

+ * A document with this decision is ready to proceed to M4 and later milestones + * (fingerprinting, persistence, KI integration, filename generation, target copy). + * + * @param candidate the source document metadata + * @param extraction the successful text extraction result + * @since M3-AP-001 + */ +record M3PreCheckPassed( + SourceDocumentCandidate candidate, + PdfExtractionSuccess extraction +) implements M3ProcessingDecision { + /** + * Constructor with validation. + * + * @param candidate must be non-null + * @param extraction must be non-null + * @throws NullPointerException if either parameter is null + */ + M3PreCheckPassed { + Objects.requireNonNull(candidate, "candidate must not be null"); + Objects.requireNonNull(extraction, "extraction must not be null"); + } +} + +/** + * Represents a document that failed an M3 pre-check. + *

+ * This result encapsulates: + *

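+ * <ul> + * <li>{@code candidate}: the source document metadata</li> + * <li>{@code failureReason}: a human-readable explanation of the pre-check failure</li> + * </ul> + *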

+ * Reasons include: + *

+ *

+ * A document with this decision will not proceed further in the current batch run. + * + * @param candidate the source document metadata + * @param failureReason a human-readable explanation of the pre-check failure + * @since M3-AP-001 + */ +record M3PreCheckFailed( + SourceDocumentCandidate candidate, + String failureReason +) implements M3ProcessingDecision { + /** + * Constructor with validation. + * + * @param candidate must be non-null + * @param failureReason must be non-null and non-empty + * @throws NullPointerException if either parameter is null + * @throws IllegalArgumentException if failureReason is empty + */ + M3PreCheckFailed { + Objects.requireNonNull(candidate, "candidate must not be null"); + Objects.requireNonNull(failureReason, "failureReason must not be null"); + if (failureReason.isEmpty()) { + throw new IllegalArgumentException("failureReason must not be empty"); + } + } +} diff --git a/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PdfExtractionResult.java b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PdfExtractionResult.java new file mode 100644 index 0000000..c73002a --- /dev/null +++ b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PdfExtractionResult.java @@ -0,0 +1,122 @@ +package de.gecheckt.pdf.umbenenner.domain.model; + +import java.util.Objects; + +/** + * Sealed interface representing the outcome of PDF text extraction. + *

+ * This interface uses Java 17+ sealed types to enforce exhaustive case handling. + * There are exactly three allowed implementations: + *

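+ * <ul> + * <li>{@code PdfExtractionSuccess}: text content and page count were extracted successfully</li> + * <li>{@code PdfExtractionContentError}: the PDF file is readable, but its content is not suitable for text extraction</li> + * <li>{@code PdfExtractionTechnicalError}: a technical (infrastructure) failure occurred during extraction</li> + * </ul> + *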

+ * Design principles: + *

+ * + * @since M3-AP-001 + */ +public sealed interface PdfExtractionResult + permits PdfExtractionSuccess, PdfExtractionContentError, PdfExtractionTechnicalError { + // Marker interface; concrete implementations define structure +} + +/** + * Represents successful PDF text extraction. + *

+ * When this result is obtained, both text content and page count have been + * successfully extracted and are guaranteed to be valid. + * + * @param extractedText the full text content extracted from the PDF (non-null, may be empty string) + * @param pageCount the number of pages in the PDF (non-null, validated >= 1) + * @since M3-AP-001 + */ +record PdfExtractionSuccess( + String extractedText, + PdfPageCount pageCount +) implements PdfExtractionResult { + /** + * Constructor with validation. + * + * @param extractedText must be non-null (may be empty) + * @param pageCount must be non-null + * @throws NullPointerException if either parameter is null + */ + PdfExtractionSuccess { + Objects.requireNonNull(extractedText, "extractedText must not be null"); + Objects.requireNonNull(pageCount, "pageCount must not be null"); + } +} + +/** + * Represents a content-related failure during PDF text extraction. + *

+ * This indicates that the PDF file itself is readable (no I/O error), + * but its content is not suitable for text extraction. + *

+ * Examples: PDF is image-only (not OCR'd), PDF is encrypted and cannot be unlocked, + * PDF is severely corrupted in the content layer. + *

+ * This is typically a deterministic, non-retryable condition for a given source file + * (unless the source file is modified and re-scanned in a later run). + * + * @param reason a human-readable explanation of why extraction failed (non-null, non-empty) + * @since M3-AP-001 + */ +record PdfExtractionContentError( + String reason +) implements PdfExtractionResult { + /** + * Constructor with validation. + * + * @param reason must be non-null and non-empty + * @throws NullPointerException if reason is null + * @throws IllegalArgumentException if reason is empty + */ + PdfExtractionContentError { + Objects.requireNonNull(reason, "reason must not be null"); + if (reason.isEmpty()) { + throw new IllegalArgumentException("reason must not be empty"); + } + } +} + +/** + * Represents a technical (infrastructure) failure during PDF text extraction. + *

+ * This indicates that something went wrong with the extraction process itself, + * such as file I/O errors, PDFBox library problems, or out-of-memory conditions. + *

+ * These are typically retryable conditions in later batch runs, as they may be + * transient infrastructure issues. + * + * @param errorMessage a description of what went wrong (non-null, non-empty) + * @param cause the underlying exception, if any (may be null) + * @since M3-AP-001 + */ +record PdfExtractionTechnicalError( + String errorMessage, + Throwable cause +) implements PdfExtractionResult { + /** + * Constructor with validation. + * + * @param errorMessage must be non-null and non-empty + * @param cause may be null + * @throws NullPointerException if errorMessage is null + * @throws IllegalArgumentException if errorMessage is empty + */ + PdfExtractionTechnicalError { + Objects.requireNonNull(errorMessage, "errorMessage must not be null"); + if (errorMessage.isEmpty()) { + throw new IllegalArgumentException("errorMessage must not be empty"); + } + } +} diff --git a/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PdfPageCount.java b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PdfPageCount.java new file mode 100644 index 0000000..617e2e8 --- /dev/null +++ b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PdfPageCount.java @@ -0,0 +1,45 @@ +package de.gecheckt.pdf.umbenenner.domain.model; + +/** + * Typed representation of a PDF document's page count. + *

+ * This record provides type safety and semantic clarity, distinguishing page count + * from other numeric values (character counts, file sizes, error codes, etc.). + *

+ * Design principles: + *

+ * + * @since M3-AP-001 + */ +public record PdfPageCount(int value) { + /** + * Constructor with validation. + *

+ * Ensures the page count is meaningful and valid. + * + * @param value must be >= 1 + * @throws IllegalArgumentException if value < 1 + */ + public PdfPageCount { + if (value < 1) { + throw new IllegalArgumentException("Page count must be >= 1, but got: " + value); + } + } + + /** + * Returns whether this page count exceeds a given limit. + *

+ * Convenience method for page limit checks in later milestones. + * + * @param limit the maximum allowed page count + * @return true if this count exceeds the limit + */ + public boolean exceedsLimit(int limit) { + return this.value > limit; + } +} diff --git a/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/SourceDocumentCandidate.java b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/SourceDocumentCandidate.java new file mode 100644 index 0000000..89c5464 --- /dev/null +++ b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/SourceDocumentCandidate.java @@ -0,0 +1,59 @@ +package de.gecheckt.pdf.umbenenner.domain.model; + +import java.util.Objects; + +/** + * Represents a discovered PDF candidate from the source folder. + *

+ * This record encapsulates the minimal, infrastructure-agnostic metadata + * needed to identify, correlate, and later extract a document during processing. + *

+ * It deliberately does NOT expose filesystem paths or file handles directly. + * The hexagonal boundary is maintained: adapters map filesystem objects to this + * representation. The physical document location is carried opaquely via + * {@link SourceDocumentLocator}, which only adapters interpret. + *

+ * Fields: + *

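+ * <ul> + * <li>{@code uniqueIdentifier}: non-null, non-empty identifier for logging and correlation</li> + * <li>{@code fileSizeBytes}: file size in bytes; must be greater than zero</li> + * <li>{@code locator}: non-null opaque {@link SourceDocumentLocator}; only adapters interpret its value</li> + * </ul> + *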

+ * No java.io.File or java.nio.file.Path references appear in this record. + * + * @since M3-AP-001 + */ +public record SourceDocumentCandidate( + String uniqueIdentifier, + long fileSizeBytes, + SourceDocumentLocator locator +) { + /** + * Compact constructor with validation. + *

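+ * Validates every component: {@code uniqueIdentifier} must be non-null and non-empty, + * {@code fileSizeBytes} must be greater than zero, and {@code locator} must be non-null. + *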

+ * + * @param uniqueIdentifier non-null, non-empty identifier for logging and correlation + * @param fileSizeBytes must be > 0 + * @param locator non-null opaque locator; only adapters interpret its value + * @throws NullPointerException if uniqueIdentifier or locator is null + * @throws IllegalArgumentException if uniqueIdentifier is empty or fileSizeBytes <= 0 + */ + public SourceDocumentCandidate { + Objects.requireNonNull(uniqueIdentifier, "uniqueIdentifier must not be null"); + if (uniqueIdentifier.isEmpty()) { + throw new IllegalArgumentException("uniqueIdentifier must not be empty"); + } + if (fileSizeBytes <= 0) { + throw new IllegalArgumentException("fileSizeBytes must be positive"); + } + Objects.requireNonNull(locator, "locator must not be null"); + } +} diff --git a/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/SourceDocumentLocator.java b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/SourceDocumentLocator.java new file mode 100644 index 0000000..fd5e1b9 --- /dev/null +++ b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/SourceDocumentLocator.java @@ -0,0 +1,45 @@ +package de.gecheckt.pdf.umbenenner.domain.model; + +import java.util.Objects; + +/** + * Opaque locator that allows an adapter to re-find the physical document + * that was originally discovered by the source-scan adapter. + *

+ * This type is deliberately opaque to Domain and Application. + * Neither layer interprets or constructs the contained value. Only adapters + * read and write the {@code value} field: + *

+ *

+ * The value itself is an adapter-internal convention (e.g. an absolute path string). + * Domain and Application never interpret it — they only pass it through. + * This preserves the hexagonal boundary: no {@code java.nio.file.Path}, + * {@code java.io.File}, or other infrastructure types appear outside the adapter layer. + *

+ * Coupling: Both the scan adapter and the extraction adapter live in the same + * {@code adapter-out} module and share the same encoding convention for the value. + * This is an intentional intra-adapter contract, not a cross-layer concern. + * + * @since M3-AP-001 + */ +public record SourceDocumentLocator(String value) { + + /** + * Compact constructor with validation. + * + * @param value non-null, non-empty opaque locator value; content is an adapter-internal convention + * @throws NullPointerException if value is null + * @throws IllegalArgumentException if value is empty + */ + public SourceDocumentLocator { + Objects.requireNonNull(value, "value must not be null"); + if (value.isEmpty()) { + throw new IllegalArgumentException("value must not be empty"); + } + } +} diff --git a/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/package-info.java b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/package-info.java index 2746a19..4a996a3 100644 --- a/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/package-info.java +++ b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/package-info.java @@ -3,7 +3,14 @@ *

* This package contains the fundamental domain entities and status models required for document processing: *

*

* All classes in this package are: