M3-AP-005: Batchlauf im Use-Case integriert und sauber von Bootstrap entkoppelt
2026-04-01 20:34:15 +02:00
parent c482b20df9
commit d60d050948
3 changed files with 413 additions and 80 deletions
@@ -3,41 +3,56 @@ package de.gecheckt.pdf.umbenenner.application.usecase;
 import de.gecheckt.pdf.umbenenner.application.config.StartConfiguration;
 import de.gecheckt.pdf.umbenenner.application.port.in.BatchRunOutcome;
 import de.gecheckt.pdf.umbenenner.application.port.in.RunBatchProcessingUseCase;
+import de.gecheckt.pdf.umbenenner.application.port.out.PdfTextExtractionPort;
 import de.gecheckt.pdf.umbenenner.application.port.out.RunLockPort;
 import de.gecheckt.pdf.umbenenner.application.port.out.RunLockUnavailableException;
+import de.gecheckt.pdf.umbenenner.application.port.out.SourceDocumentAccessException;
+import de.gecheckt.pdf.umbenenner.application.port.out.SourceDocumentCandidatesPort;
+import de.gecheckt.pdf.umbenenner.application.service.M3PreCheckEvaluator;
 import de.gecheckt.pdf.umbenenner.domain.model.BatchRunContext;
+import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckFailed;
+import de.gecheckt.pdf.umbenenner.domain.model.M3PreCheckPassed;
+import de.gecheckt.pdf.umbenenner.domain.model.M3ProcessingDecision;
+import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionContentError;
+import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionResult;
+import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionSuccess;
+import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionTechnicalError;
+import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate;

 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;

+import java.util.List;
+
 /**
- * M2 implementation of {@link RunBatchProcessingUseCase}.
+ * M3 batch processing implementation of {@link RunBatchProcessingUseCase}.
 * <p>
- * This use case orchestrates the batch processing workflow with start protection
- * and controlled execution lifecycle, but without actual document processing.
- * <p>
- * Responsibilities:
- * <ul>
+ * Orchestrates the complete M3 batch processing workflow:
+ * <ol>
 *   <li>Acquire exclusive run lock to prevent concurrent instances</li>
- *   <li>Initialize batch execution with the provided run context</li>
- *   <li>Release lock only if it was successfully acquired</li>
- *   <li>Return structured outcome for Bootstrap exit code mapping</li>
+ *   <li>Scan source folder for PDF candidates</li>
+ *   <li>For each candidate: extract text and page count, run M3 pre-checks</li>
+ *   <li>Log the per-document M3 decision; finish each document in a controlled manner without AI call or target copy</li>
+ *   <li>Release lock and return structured outcome for Bootstrap exit code mapping</li>
+ * </ol>
+ * <p>
+ * M3 processing boundary:
+ * <ul>
+ *   <li>Documents that pass the M3 pre-checks finish in a controlled manner and are ready for M4+ (AI, persistence, copy)</li>
+ *   <li>Documents with deterministic content errors (no usable text, page limit exceeded) finish in a controlled manner</li>
+ *   <li>Documents with technical extraction errors finish in a controlled manner; they do not abort the overall run</li>
+ *   <li>If the source folder itself is inaccessible, the run fails with {@link BatchRunOutcome#FAILURE}</li>
 * </ul>
 * <p>
- * M2 Non-Goals (not implemented):
+ * M3 Non-Goals (not implemented):
 * <ul>
- *   <li>No source folder scanning</li>
- *   <li>No PDF filtering or text extraction</li>
- *   <li>No fingerprinting</li>
- *   <li>No SQLite persistence</li>
- *   <li>No AI integration</li>
- *   <li>No filename generation</li>
- *   <li>No target file copying</li>
- *   <li>No business-level retry logic</li>
- *   <li>No single-document processing</li>
+ *   <li>No fingerprinting or SQLite persistence</li>
+ *   <li>No KI/AI integration or prompt loading</li>
+ *   <li>No filename generation or target file copy</li>
+ *   <li>No cross-run retry logic</li>
 * </ul>
 *
- * @since M2-AP-004
+ * @since M2-AP-004 (extended in M3-AP-005)
 */
 public class M2BatchRunProcessingUseCase implements RunBatchProcessingUseCase {

@@ -45,25 +60,35 @@ public class M2BatchRunProcessingUseCase implements RunBatchProcessingUseCase {

    private final StartConfiguration configuration;
    private final RunLockPort runLockPort;
+    private final SourceDocumentCandidatesPort sourceDocumentCandidatesPort;
+    private final PdfTextExtractionPort pdfTextExtractionPort;

    /**
-     * Creates the M2 batch use case with the already-loaded startup configuration and run lock port.
+     * Creates the batch use case with the already-loaded startup configuration and all required ports.
     * <p>
     * The configuration is loaded and validated by Bootstrap before use case creation;
     * the use case receives the result directly and does not re-read it.
+     * <p>
+     * The constructor only stores the four references in the corresponding final fields.
     *
     * @param configuration the validated startup configuration
     * @param runLockPort for exclusive run locking
+     * @param sourceDocumentCandidatesPort for loading PDF candidates from the source folder
+     * @param pdfTextExtractionPort for extracting text and page count from a single PDF
     * @throws NullPointerException if any parameter is null
+     *         (NOTE(review): no null check is visible in this constructor, so a null argument
+     *         would be stored as-is — confirm, or enforce with {@code Objects.requireNonNull})
     */
-    public M2BatchRunProcessingUseCase(StartConfiguration configuration, RunLockPort runLockPort) {
+    public M2BatchRunProcessingUseCase(
+            StartConfiguration configuration,
+            RunLockPort runLockPort,
+            SourceDocumentCandidatesPort sourceDocumentCandidatesPort,
+            PdfTextExtractionPort pdfTextExtractionPort) {
        this.configuration = configuration;
        this.runLockPort = runLockPort;
+        this.sourceDocumentCandidatesPort = sourceDocumentCandidatesPort;
+        this.pdfTextExtractionPort = pdfTextExtractionPort;
    }

    @Override
    public BatchRunOutcome execute(BatchRunContext context) {
-        LOG.info("M2 batch processing initiated with RunId: {}", context.runId());
+        LOG.info("Batch processing initiated. RunId: {}", context.runId());
        boolean lockAcquired = false;

        try {
@@ -77,18 +102,28 @@ public class M2BatchRunProcessingUseCase implements RunBatchProcessingUseCase {
                return BatchRunOutcome.LOCK_UNAVAILABLE;
            }

-            // Step 2: M2 Batch execution frame (no document processing)
            LOG.debug("Configuration in use: source={}, target={}", configuration.sourceFolder(), configuration.targetFolder());
-            LOG.info("Batch execution frame initialized - RunId: {}, Start: {}", context.runId(), context.startInstant());
+            LOG.info("Batch run started. RunId: {}, Start: {}", context.runId(), context.startInstant());

-            // M2 Non-goal: No source folder scanning, PDF processing, persistence, or filename generation
-            // This is a controlled no-op batch cycle that validates the entire orchestration path.
+            // Step 2: Load PDF candidates from source folder
+            List<SourceDocumentCandidate> candidates;
+            try {
+                candidates = sourceDocumentCandidatesPort.loadCandidates();
+            } catch (SourceDocumentAccessException e) {
+                LOG.error("Cannot access source folder: {}", e.getMessage(), e);
+                return BatchRunOutcome.FAILURE;
+            }
+            LOG.info("Found {} PDF candidate(s) in source folder.", candidates.size());

-            LOG.info("Batch execution frame completed successfully");
+            // Step 3: Process each candidate through the M3 pipeline
+            for (SourceDocumentCandidate candidate : candidates) {
+                processCandidate(candidate);
+            }
+
+            LOG.info("Batch run completed. Processed {} candidate(s). RunId: {}", candidates.size(), context.runId());
            return BatchRunOutcome.SUCCESS;

        } catch (Exception e) {
-            // Unexpected error during batch orchestration
            LOG.error("Unexpected error during batch processing", e);
            return BatchRunOutcome.FAILURE;
        } finally {
@@ -98,11 +133,53 @@ public class M2BatchRunProcessingUseCase implements RunBatchProcessingUseCase {
            if (lockAcquired) {
                try {
                    runLockPort.release();
-                    LOG.debug("Run lock released");
+                    LOG.debug("Run lock released.");
                } catch (Exception e) {
-                    LOG.warn("Warning: Failed to release run lock", e);
+                    LOG.warn("Warning: Failed to release run lock.", e);
                }
            }
        }
    }
+
+    /**
+     * Processes a single PDF candidate through the M3 pipeline.
+     * <p>
+     * M3 processing steps per document:
+     * <ol>
+     *   <li>Extract text and page count from the PDF via {@link PdfTextExtractionPort}</li>
+     *   <li>On successful extraction: run M3 pre-checks via {@link M3PreCheckEvaluator}</li>
+     *   <li>Log the per-document M3 decision and finish the document in a controlled manner</li>
+     * </ol>
+     * <p>
+     * Every modelled per-document outcome (extraction content error, technical extraction
+     * error, pre-check failure, pre-check pass) is only logged; the method then returns
+     * normally, so none of these outcomes aborts the overall batch run.
+     * <p>
+     * NOTE(review): this method has no try/catch of its own. An exception thrown by the
+     * extraction port or by the evaluator would propagate to the caller and skip the
+     * remaining candidates — confirm that both collaborators report failures through
+     * their result types only.
+     * <p>
+     * M3 processing boundary: no AI call, no persistence, no filename generation,
+     * no target file copy is initiated here, even for candidates that pass all pre-checks.
+     *
+     * @param candidate the candidate to process
+     */
+    private void processCandidate(SourceDocumentCandidate candidate) {
+        PdfExtractionResult extractionResult = pdfTextExtractionPort.extractTextAndPageCount(candidate);
+
+        switch (extractionResult) { // no default branch: the three cases are presumably the complete sealed set — TODO confirm
+            case PdfExtractionSuccess success -> { // extraction worked: evaluate the M3 pre-checks on the extracted data
+                M3ProcessingDecision decision = M3PreCheckEvaluator.evaluate(candidate, success, configuration);
+                switch (decision) { // the decision is only logged; nothing is handed on to later stages here
+                    case M3PreCheckPassed passed ->
+                        LOG.info("M3 pre-checks passed for '{}'. Candidate ready for further processing (M4+).",
+                                candidate.uniqueIdentifier());
+                    case M3PreCheckFailed failed -> // logged at INFO together with the failure reason
+                        LOG.info("M3 pre-check failed for '{}': {}",
+                                candidate.uniqueIdentifier(), failed.failureReason());
+                }
+            }
+            case PdfExtractionContentError contentError -> // content-related outcome: logged at INFO
+                LOG.info("PDF content not extractable for '{}': {}",
+                        candidate.uniqueIdentifier(), contentError.reason());
+            case PdfExtractionTechnicalError technicalError -> // technical outcome: logged at WARN, message only
+                LOG.warn("Technical error extracting PDF '{}': {}",
+                        candidate.uniqueIdentifier(), technicalError.errorMessage());
+        }
+    }
 }