1
0

M4 AP-006 Idempotenz- und Persistenzlogik integrieren

This commit is contained in:
2026-04-02 23:36:22 +02:00
parent 8ee4041feb
commit 00c4cf1e5c
7 changed files with 1598 additions and 174 deletions

View File

@@ -0,0 +1,558 @@
package de.gecheckt.pdf.umbenenner.application.service;
import de.gecheckt.pdf.umbenenner.application.port.out.DocumentKnownProcessable;
import de.gecheckt.pdf.umbenenner.application.port.out.DocumentPersistenceException;
import de.gecheckt.pdf.umbenenner.application.port.out.DocumentRecord;
import de.gecheckt.pdf.umbenenner.application.port.out.DocumentRecordLookupResult;
import de.gecheckt.pdf.umbenenner.application.port.out.DocumentRecordRepository;
import de.gecheckt.pdf.umbenenner.application.port.out.DocumentTerminalFinalFailure;
import de.gecheckt.pdf.umbenenner.application.port.out.DocumentTerminalSuccess;
import de.gecheckt.pdf.umbenenner.application.port.out.DocumentUnknown;
import de.gecheckt.pdf.umbenenner.application.port.out.FailureCounters;
import de.gecheckt.pdf.umbenenner.application.port.out.PersistenceLookupTechnicalFailure;
import de.gecheckt.pdf.umbenenner.application.port.out.ProcessingAttempt;
import de.gecheckt.pdf.umbenenner.application.port.out.ProcessingAttemptRepository;
import de.gecheckt.pdf.umbenenner.domain.model.BatchRunContext;
import de.gecheckt.pdf.umbenenner.domain.model.DocumentFingerprint;
import de.gecheckt.pdf.umbenenner.domain.model.DocumentProcessingOutcome;
import de.gecheckt.pdf.umbenenner.domain.model.PreCheckFailed;
import de.gecheckt.pdf.umbenenner.domain.model.PreCheckPassed;
import de.gecheckt.pdf.umbenenner.domain.model.ProcessingStatus;
import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate;
import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentLocator;
import de.gecheckt.pdf.umbenenner.domain.model.TechnicalDocumentError;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.time.Instant;
import java.util.Objects;
/**
* Application-level service that implements the M4 per-document processing logic.
* <p>
* This service is the single authoritative place for the M4 decision rules:
* idempotency checks, status/counter mapping, and consistent two-level persistence.
* It is intentionally tightly scoped to AP-006 and contains no M5+ logic.
*
* <h2>M4 processing order per candidate</h2>
* <ol>
* <li>Load the document master record by fingerprint.</li>
* <li>If the overall status is {@link ProcessingStatus#SUCCESS} → create and persist
* a skip attempt with {@link ProcessingStatus#SKIPPED_ALREADY_PROCESSED}.</li>
* <li>If the overall status is {@link ProcessingStatus#FAILED_FINAL} → create and persist
* a skip attempt with {@link ProcessingStatus#SKIPPED_FINAL_FAILURE}.</li>
* <li>Otherwise execute the M3 flow (already done by the caller) and map the result
* into M4 status, counters and retryable flag.</li>
* <li>Persist exactly one historised processing attempt for the identified document.</li>
* <li>Persist the updated document master record.</li>
* </ol>
*
* <h2>M4 minimal rules</h2>
* <ul>
* <li>Already successful documents are skipped in later runs.</li>
* <li>Already finally failed documents are skipped in later runs.</li>
* <li>First historised deterministic content failure from M3 →
* {@link ProcessingStatus#FAILED_RETRYABLE}, content error counter becomes 1,
* {@code retryable=true}.</li>
* <li>Second historised deterministic content failure in a later run →
* {@link ProcessingStatus#FAILED_FINAL}, content error counter becomes 2,
* {@code retryable=false}.</li>
* <li>Document-related technical failures after successful fingerprinting remain
* {@link ProcessingStatus#FAILED_RETRYABLE}, increment transient error counter,
* {@code retryable=true}.</li>
* <li>Skip events do not change error counters.</li>
* </ul>
*
* <h2>Persistence consistency</h2>
* <p>
* For every identified document, both the processing attempt and the master record are
* written in sequence. If either write fails, the failure is logged and the batch run
* continues with the next candidate. No partial state is intentionally left; if the
* attempt write succeeds but the master record write fails, the inconsistency is bounded
* to that one document and is logged clearly. True transactionality across two separate
* repository calls is not available without a larger architectural change; this is
* documented as a known limitation of the M4 scope.
*
* <h2>Pre-fingerprint failures</h2>
* <p>
* Failures that occur before a successful fingerprint is available are <em>not</em>
* historised in SQLite. They are handled by the caller and logged as non-identifiable
* run events.
*
* @since M4-AP-006
*/
public class M4DocumentProcessor {

    private static final Logger LOG = LogManager.getLogger(M4DocumentProcessor.class);

    private final DocumentRecordRepository documentRecordRepository;
    private final ProcessingAttemptRepository processingAttemptRepository;

    /**
     * Creates the M4 document processor with the required persistence ports.
     *
     * @param documentRecordRepository port for reading and writing the document master record;
     *                                 must not be null
     * @param processingAttemptRepository port for writing and reading the attempt history;
     *                                    must not be null
     * @throws NullPointerException if any parameter is null
     */
    public M4DocumentProcessor(
            DocumentRecordRepository documentRecordRepository,
            ProcessingAttemptRepository processingAttemptRepository) {
        this.documentRecordRepository =
                Objects.requireNonNull(documentRecordRepository, "documentRecordRepository must not be null");
        this.processingAttemptRepository =
                Objects.requireNonNull(processingAttemptRepository, "processingAttemptRepository must not be null");
    }

    /**
     * Applies the full M4 processing logic for one identified document candidate.
     * <p>
     * The caller must have already computed a valid {@link DocumentFingerprint} for the
     * candidate. The M3 outcome (from the PDF extraction and pre-check pipeline) is
     * provided as {@code m3Outcome} and is used only when the document is not in a
     * terminal state.
     * <p>
     * This method never throws. All persistence failures are caught, logged, and
     * treated as controlled per-document failures so the batch run can continue.
     *
     * @param candidate    the source document candidate being processed; must not be null
     * @param fingerprint  the successfully computed fingerprint for this candidate;
     *                     must not be null
     * @param m3Outcome    the result of the M3 pipeline (PDF extraction + pre-checks);
     *                     must not be null
     * @param context      the current batch run context (for run ID and timing);
     *                     must not be null
     * @param attemptStart the instant at which processing of this candidate began;
     *                     must not be null
     */
    public void process(
            SourceDocumentCandidate candidate,
            DocumentFingerprint fingerprint,
            DocumentProcessingOutcome m3Outcome,
            BatchRunContext context,
            Instant attemptStart) {
        Objects.requireNonNull(candidate, "candidate must not be null");
        Objects.requireNonNull(fingerprint, "fingerprint must not be null");
        Objects.requireNonNull(m3Outcome, "m3Outcome must not be null");
        Objects.requireNonNull(context, "context must not be null");
        Objects.requireNonNull(attemptStart, "attemptStart must not be null");

        // Step 1: Load the document master record by fingerprint.
        DocumentRecordLookupResult lookupResult =
                documentRecordRepository.findByFingerprint(fingerprint);

        // Step 2: Dispatch on the lookup result. All result variants — including the
        // technical lookup failure — are handled uniformly in one pattern switch so the
        // decision logic is visible in a single place.
        switch (lookupResult) {
            case PersistenceLookupTechnicalFailure failure ->
                    // Lookup failed technically: we cannot safely decide anything for this
                    // document, so we log and move on to the next candidate.
                    LOG.error("Cannot process '{}': master record lookup failed: {}",
                            candidate.uniqueIdentifier(), failure.errorMessage());
            case DocumentTerminalSuccess terminalSuccess -> {
                // Document already successfully processed → skip, counters unchanged.
                LOG.info("Skipping '{}': already successfully processed (fingerprint: {}).",
                        candidate.uniqueIdentifier(), fingerprint.sha256Hex());
                persistSkipAttempt(
                        candidate, fingerprint, terminalSuccess.record(),
                        ProcessingStatus.SKIPPED_ALREADY_PROCESSED,
                        context, attemptStart);
            }
            case DocumentTerminalFinalFailure terminalFailure -> {
                // Document finally failed → skip, counters unchanged.
                LOG.info("Skipping '{}': already finally failed (fingerprint: {}).",
                        candidate.uniqueIdentifier(), fingerprint.sha256Hex());
                persistSkipAttempt(
                        candidate, fingerprint, terminalFailure.record(),
                        ProcessingStatus.SKIPPED_FINAL_FAILURE,
                        context, attemptStart);
            }
            case DocumentUnknown ignored ->
                    // New document: process and create a new master record.
                    processAndPersistNewDocument(candidate, fingerprint, m3Outcome, context, attemptStart);
            case DocumentKnownProcessable knownProcessable ->
                    // Known but not terminal: process and update the existing master record.
                    processAndPersistKnownDocument(
                            candidate, fingerprint, m3Outcome, knownProcessable.record(),
                            context, attemptStart);
            default ->
                    // Defensive guard: the lookup result hierarchy is sealed, so this branch
                    // should be unreachable; it only fires if a new variant is added without
                    // extending this switch.
                    LOG.error("Unexpected lookup result type for '{}': {}",
                            candidate.uniqueIdentifier(), lookupResult.getClass().getSimpleName());
        }
    }

    // -------------------------------------------------------------------------
    // Skip path
    // -------------------------------------------------------------------------

    /**
     * Persists a skip attempt and updates the master record's {@code updatedAt} timestamp.
     * <p>
     * Skip events do not change any failure counter. The master record's overall status
     * remains unchanged (terminal).
     *
     * @param candidate      the candidate being skipped
     * @param fingerprint    the document fingerprint
     * @param existingRecord the current master record (already terminal)
     * @param skipStatus     the skip status to record ({@link ProcessingStatus#SKIPPED_ALREADY_PROCESSED}
     *                       or {@link ProcessingStatus#SKIPPED_FINAL_FAILURE})
     * @param context        the current batch run context
     * @param attemptStart   the start instant of this processing attempt
     */
    private void persistSkipAttempt(
            SourceDocumentCandidate candidate,
            DocumentFingerprint fingerprint,
            DocumentRecord existingRecord,
            ProcessingStatus skipStatus,
            BatchRunContext context,
            Instant attemptStart) {
        Instant now = Instant.now();
        try {
            int attemptNumber = processingAttemptRepository.loadNextAttemptNumber(fingerprint);
            ProcessingAttempt skipAttempt = new ProcessingAttempt(
                    fingerprint,
                    context.runId(),
                    attemptNumber,
                    attemptStart,
                    now,
                    skipStatus,
                    null,  // no failure class for skip
                    null,  // no failure message for skip
                    false  // not retryable
            );
            // Write attempt first, then update master record.
            processingAttemptRepository.save(skipAttempt);
            // Update master record: only updatedAt (and the locator snapshot) changes;
            // terminal status and counters stay the same.
            DocumentRecord updatedRecord = new DocumentRecord(
                    existingRecord.fingerprint(),
                    new SourceDocumentLocator(candidate.locator().value()),
                    candidate.uniqueIdentifier(),
                    existingRecord.overallStatus(),   // terminal status unchanged
                    existingRecord.failureCounters(), // counters unchanged for skip
                    existingRecord.lastFailureInstant(),
                    existingRecord.lastSuccessInstant(),
                    existingRecord.createdAt(),
                    now                               // updatedAt = now
            );
            documentRecordRepository.update(updatedRecord);
            LOG.debug("Skip attempt #{} persisted for '{}' with status {}.",
                    attemptNumber, candidate.uniqueIdentifier(), skipStatus);
        } catch (DocumentPersistenceException e) {
            // Persistence failure for a skip event is logged and swallowed by design:
            // the batch run must continue with the next candidate.
            LOG.error("Failed to persist skip attempt for '{}': {}",
                    candidate.uniqueIdentifier(), e.getMessage(), e);
        }
    }

    // -------------------------------------------------------------------------
    // New document path
    // -------------------------------------------------------------------------

    /**
     * Processes a newly discovered document (no existing master record) and persists
     * both the attempt and the new master record.
     *
     * @param candidate    the candidate being processed
     * @param fingerprint  the document fingerprint
     * @param m3Outcome    the M3 pipeline result
     * @param context      the current batch run context
     * @param attemptStart the start instant of this processing attempt
     */
    private void processAndPersistNewDocument(
            SourceDocumentCandidate candidate,
            DocumentFingerprint fingerprint,
            DocumentProcessingOutcome m3Outcome,
            BatchRunContext context,
            Instant attemptStart) {
        Instant now = Instant.now();
        // Map M3 outcome to M4 status/counters for a brand-new document (counters start at zero).
        M4Outcome m4 = mapM3OutcomeForNewDocument(m3Outcome);
        try {
            // The repository supplies the next monotonic attempt number; for a document
            // with no prior history this is expected to be 1, but the repository remains
            // the single source of truth.
            int attemptNumber = processingAttemptRepository.loadNextAttemptNumber(fingerprint);
            ProcessingAttempt attempt = buildAttempt(
                    fingerprint, context, attemptNumber, attemptStart, now, m4);
            // Create the new master record.
            DocumentRecord newRecord = new DocumentRecord(
                    fingerprint,
                    new SourceDocumentLocator(candidate.locator().value()),
                    candidate.uniqueIdentifier(),
                    m4.overallStatus(),
                    m4.counters(),
                    m4.overallStatus() == ProcessingStatus.SUCCESS ? null : now, // lastFailureInstant
                    m4.overallStatus() == ProcessingStatus.SUCCESS ? now : null, // lastSuccessInstant
                    now, // createdAt
                    now  // updatedAt
            );
            // Persist attempt first, then master record.
            processingAttemptRepository.save(attempt);
            documentRecordRepository.create(newRecord);
            LOG.info("New document '{}' processed: status={}, contentErrors={}, transientErrors={}.",
                    candidate.uniqueIdentifier(),
                    m4.overallStatus(),
                    m4.counters().contentErrorCount(),
                    m4.counters().transientErrorCount());
        } catch (DocumentPersistenceException e) {
            LOG.error("Failed to persist processing result for new document '{}': {}",
                    candidate.uniqueIdentifier(), e.getMessage(), e);
        }
    }

    // -------------------------------------------------------------------------
    // Known processable document path
    // -------------------------------------------------------------------------

    /**
     * Processes a known but non-terminal document and updates both the attempt history
     * and the master record.
     *
     * @param candidate      the candidate being processed
     * @param fingerprint    the document fingerprint
     * @param m3Outcome      the M3 pipeline result
     * @param existingRecord the current master record (not terminal)
     * @param context        the current batch run context
     * @param attemptStart   the start instant of this processing attempt
     */
    private void processAndPersistKnownDocument(
            SourceDocumentCandidate candidate,
            DocumentFingerprint fingerprint,
            DocumentProcessingOutcome m3Outcome,
            DocumentRecord existingRecord,
            BatchRunContext context,
            Instant attemptStart) {
        Instant now = Instant.now();
        // Map M3 outcome to M4 status/counters, taking existing counters into account.
        M4Outcome m4 = mapM3OutcomeForKnownDocument(m3Outcome, existingRecord.failureCounters());
        try {
            int attemptNumber = processingAttemptRepository.loadNextAttemptNumber(fingerprint);
            ProcessingAttempt attempt = buildAttempt(
                    fingerprint, context, attemptNumber, attemptStart, now, m4);
            // Update the master record with new status, counters and timestamps.
            // On success only lastSuccessInstant moves forward; on failure only
            // lastFailureInstant does — the other timestamp is carried over unchanged.
            DocumentRecord updatedRecord = new DocumentRecord(
                    existingRecord.fingerprint(),
                    new SourceDocumentLocator(candidate.locator().value()),
                    candidate.uniqueIdentifier(),
                    m4.overallStatus(),
                    m4.counters(),
                    m4.overallStatus() == ProcessingStatus.SUCCESS
                            ? existingRecord.lastFailureInstant() : now,
                    m4.overallStatus() == ProcessingStatus.SUCCESS
                            ? now : existingRecord.lastSuccessInstant(),
                    existingRecord.createdAt(),
                    now // updatedAt
            );
            // Persist attempt first, then master record.
            processingAttemptRepository.save(attempt);
            documentRecordRepository.update(updatedRecord);
            LOG.info("Known document '{}' processed: status={}, contentErrors={}, transientErrors={}.",
                    candidate.uniqueIdentifier(),
                    m4.overallStatus(),
                    m4.counters().contentErrorCount(),
                    m4.counters().transientErrorCount());
        } catch (DocumentPersistenceException e) {
            LOG.error("Failed to persist processing result for known document '{}': {}",
                    candidate.uniqueIdentifier(), e.getMessage(), e);
        }
    }

    // -------------------------------------------------------------------------
    // M3 → M4 outcome mapping
    // -------------------------------------------------------------------------

    /**
     * Maps an M3 outcome to M4 status, counters, and retryable flag for a brand-new
     * document (no prior history, counters start at zero).
     *
     * @param m3Outcome the M3 pipeline result
     * @return the M4 outcome with status, counters and retryable flag
     */
    private M4Outcome mapM3OutcomeForNewDocument(DocumentProcessingOutcome m3Outcome) {
        return mapM3OutcomeForKnownDocument(m3Outcome, FailureCounters.zero());
    }

    /**
     * Maps an M3 outcome to M4 status, counters, and retryable flag, taking the
     * existing failure counters into account.
     * <p>
     * <strong>M4 minimal rules applied here:</strong>
     * <ul>
     *   <li>M3 success → {@link ProcessingStatus#SUCCESS}, counters unchanged,
     *       {@code retryable=false}.</li>
     *   <li>M3 deterministic content error (first occurrence, contentErrorCount == 0) →
     *       {@link ProcessingStatus#FAILED_RETRYABLE}, contentErrorCount +1,
     *       {@code retryable=true}.</li>
     *   <li>M3 deterministic content error (second occurrence, contentErrorCount >= 1) →
     *       {@link ProcessingStatus#FAILED_FINAL}, contentErrorCount +1,
     *       {@code retryable=false}.</li>
     *   <li>M3 technical error → {@link ProcessingStatus#FAILED_RETRYABLE},
     *       transientErrorCount +1, {@code retryable=true}.</li>
     * </ul>
     *
     * @param m3Outcome        the M3 pipeline result
     * @param existingCounters the current failure counters from the master record
     * @return the M4 outcome with updated status, counters and retryable flag
     */
    private M4Outcome mapM3OutcomeForKnownDocument(
            DocumentProcessingOutcome m3Outcome,
            FailureCounters existingCounters) {
        return switch (m3Outcome) {
            case PreCheckPassed ignored -> {
                // M3 success: document passed all pre-checks.
                // In M4 scope (no KI, no target copy), PreCheckPassed is the terminal success.
                yield new M4Outcome(
                        ProcessingStatus.SUCCESS,
                        existingCounters, // counters unchanged on success
                        false             // not retryable
                );
            }
            case PreCheckFailed contentError -> {
                // Deterministic content error: apply the 1-retry rule.
                FailureCounters updatedCounters = existingCounters.withIncrementedContentErrorCount();
                boolean isFirstOccurrence = existingCounters.contentErrorCount() == 0;
                if (isFirstOccurrence) {
                    // First content error → FAILED_RETRYABLE
                    yield new M4Outcome(
                            ProcessingStatus.FAILED_RETRYABLE,
                            updatedCounters,
                            true
                    );
                } else {
                    // Second (or later) content error → FAILED_FINAL
                    yield new M4Outcome(
                            ProcessingStatus.FAILED_FINAL,
                            updatedCounters,
                            false
                    );
                }
            }
            case TechnicalDocumentError ignored -> {
                // Technical error after fingerprinting: always FAILED_RETRYABLE,
                // increment the transient error counter.
                yield new M4Outcome(
                        ProcessingStatus.FAILED_RETRYABLE,
                        existingCounters.withIncrementedTransientErrorCount(),
                        true
                );
            }
        };
    }

    // -------------------------------------------------------------------------
    // Helper: build ProcessingAttempt
    // -------------------------------------------------------------------------

    /**
     * Constructs a {@link ProcessingAttempt} from the given parameters and M4 outcome.
     *
     * @param fingerprint   the document fingerprint
     * @param context       the current batch run context
     * @param attemptNumber the monotonic attempt number
     * @param startedAt     the start instant of this attempt
     * @param endedAt       the end instant of this attempt
     * @param m4            the M4 outcome (status, counters, retryable)
     * @return the constructed processing attempt
     */
    private ProcessingAttempt buildAttempt(
            DocumentFingerprint fingerprint,
            BatchRunContext context,
            int attemptNumber,
            Instant startedAt,
            Instant endedAt,
            M4Outcome m4) {
        String failureClass = null;
        String failureMessage = null;
        if (m4.overallStatus() == ProcessingStatus.FAILED_RETRYABLE
                || m4.overallStatus() == ProcessingStatus.FAILED_FINAL) {
            // NOTE(review): failureClass currently records the M4 status name, not the
            // originating M3 failure type — confirm this is the intended semantics.
            failureClass = m4.overallStatus().name();
            failureMessage = buildFailureMessage(m4);
        }
        return new ProcessingAttempt(
                fingerprint,
                context.runId(),
                attemptNumber,
                startedAt,
                endedAt,
                m4.overallStatus(),
                failureClass,
                failureMessage,
                m4.retryable()
        );
    }

    /**
     * Builds a human-readable failure message from the M4 outcome.
     *
     * @param m4 the M4 outcome
     * @return a non-null failure message string
     */
    private String buildFailureMessage(M4Outcome m4) {
        return switch (m4.overallStatus()) {
            case FAILED_RETRYABLE -> "Processing failed (retryable). "
                    + "ContentErrors=" + m4.counters().contentErrorCount()
                    + ", TransientErrors=" + m4.counters().transientErrorCount();
            case FAILED_FINAL -> "Processing failed finally (not retryable). "
                    + "ContentErrors=" + m4.counters().contentErrorCount()
                    + ", TransientErrors=" + m4.counters().transientErrorCount();
            default -> m4.overallStatus().name();
        };
    }

    // -------------------------------------------------------------------------
    // Internal value type: M4 outcome
    // -------------------------------------------------------------------------

    /**
     * Internal value type carrying the M4 status, updated counters, and retryable flag
     * after mapping from an M3 outcome.
     * <p>
     * Tightly scoped to {@link M4DocumentProcessor}; not exposed outside this class.
     *
     * @param overallStatus the M4 overall status to persist
     * @param counters      the updated failure counters to persist
     * @param retryable     whether the failure is retryable in a later run
     */
    private record M4Outcome(
            ProcessingStatus overallStatus,
            FailureCounters counters,
            boolean retryable) {
    }
}

View File

@@ -1,28 +1,65 @@
/** /**
* Application-level services for business logic evaluation. * Application-level services for business logic evaluation and M4 orchestration.
* <p> * <p>
* This package contains stateless, pure-logic services that evaluate document content * This package contains stateless, pure-logic services that evaluate document content,
* and apply business rules. Services in this package: * apply business rules, and orchestrate the M4 per-document processing flow.
* Services in this package:
* <ul> * <ul>
* <li>Do not manage state or resources</li> * <li>Do not manage state or resources</li>
* <li>Do not depend on infrastructure (database, filesystem, network)</li> * <li>Do not depend on infrastructure (database, filesystem, network) directly;
* they interact with infrastructure exclusively through outbound ports</li>
* <li>Can be tested with simple unit tests and in-memory mocks</li> * <li>Can be tested with simple unit tests and in-memory mocks</li>
* <li>Are reused by multiple use cases or adapters</li>
* </ul> * </ul>
* *
* Current services: * <h2>Current services</h2>
* <ul> * <ul>
* <li>{@link de.gecheckt.pdf.umbenenner.application.service.PreCheckEvaluator} — Pre-check evaluation</li> * <li>{@link de.gecheckt.pdf.umbenenner.application.service.PreCheckEvaluator}
* <li>{@link de.gecheckt.pdf.umbenenner.application.service.DocumentProcessingService} — Complete document processing pipeline orchestration</li> * — Pre-check evaluation (M3)</li>
* <li>{@link de.gecheckt.pdf.umbenenner.application.service.DocumentProcessingService}
* — Complete M3 document processing pipeline orchestration</li>
* <li>{@link de.gecheckt.pdf.umbenenner.application.service.M4DocumentProcessor}
* — M4 per-document idempotency, status/counter mapping and consistent
* two-level persistence (AP-006)</li>
* </ul> * </ul>
* *
* Document Processing Pipeline: * <h2>M4 processing flow ({@code M4DocumentProcessor})</h2>
* The {@link de.gecheckt.pdf.umbenenner.application.service.DocumentProcessingService} coordinates * <p>
* the complete processing workflow: * The {@link de.gecheckt.pdf.umbenenner.application.service.M4DocumentProcessor}
* implements the binding M4 processing order per candidate:
* <ol> * <ol>
* <li>Convert technical PDF extraction results to processing outcomes</li> * <li>Load the document master record by fingerprint.</li>
* <li>Route successful extractions through pre-check validation</li> * <li>If overall status is {@code SUCCESS} → persist a skip attempt with
* <li>Classify extraction and pre-check failures with appropriate error types</li> * {@code SKIPPED_ALREADY_PROCESSED}; do not change counters.</li>
* <li>If overall status is {@code FAILED_FINAL} → persist a skip attempt with
* {@code SKIPPED_FINAL_FAILURE}; do not change counters.</li>
* <li>Otherwise map the M3 outcome into M4 status, counters and retryable flag
* using the M4 minimal rules.</li>
* <li>Persist exactly one historised processing attempt.</li>
* <li>Persist the updated document master record.</li>
* </ol> * </ol>
*
* <h2>M4 minimal rules (status and counter semantics)</h2>
* <ul>
* <li>First deterministic content error → {@code FAILED_RETRYABLE},
* content error counter +1, {@code retryable=true}.</li>
* <li>Second deterministic content error → {@code FAILED_FINAL},
* content error counter +1 (cumulative = 2), {@code retryable=false}.</li>
* <li>Technical error after fingerprinting → {@code FAILED_RETRYABLE},
* transient error counter +1, {@code retryable=true}.</li>
* <li>Skip events do not change any failure counter.</li>
* </ul>
*
* <h2>Persistence consistency</h2>
* <p>
* For every identified document, the processing attempt and the master record are
* written in sequence. If either write fails, the failure is caught and logged;
* the batch run continues with the next candidate. True transactionality across
* two separate repository calls is not available in the M4 scope; this is a known
* and documented limitation.
*
* <h2>Pre-fingerprint failures</h2>
* <p>
* Failures that occur before a successful fingerprint is available are not handled
* by this package. They are handled by the use case and are not historised in SQLite.
*/ */
package de.gecheckt.pdf.umbenenner.application.service; package de.gecheckt.pdf.umbenenner.application.service;

View File

@@ -3,54 +3,80 @@ package de.gecheckt.pdf.umbenenner.application.usecase;
import de.gecheckt.pdf.umbenenner.application.config.StartConfiguration; import de.gecheckt.pdf.umbenenner.application.config.StartConfiguration;
import de.gecheckt.pdf.umbenenner.application.port.in.BatchRunOutcome; import de.gecheckt.pdf.umbenenner.application.port.in.BatchRunOutcome;
import de.gecheckt.pdf.umbenenner.application.port.in.BatchRunProcessingUseCase; import de.gecheckt.pdf.umbenenner.application.port.in.BatchRunProcessingUseCase;
import de.gecheckt.pdf.umbenenner.application.port.out.FingerprintPort;
import de.gecheckt.pdf.umbenenner.application.port.out.FingerprintResult;
import de.gecheckt.pdf.umbenenner.application.port.out.FingerprintSuccess;
import de.gecheckt.pdf.umbenenner.application.port.out.FingerprintTechnicalError;
import de.gecheckt.pdf.umbenenner.application.port.out.PdfTextExtractionPort; import de.gecheckt.pdf.umbenenner.application.port.out.PdfTextExtractionPort;
import de.gecheckt.pdf.umbenenner.application.port.out.RunLockPort; import de.gecheckt.pdf.umbenenner.application.port.out.RunLockPort;
import de.gecheckt.pdf.umbenenner.application.port.out.RunLockUnavailableException; import de.gecheckt.pdf.umbenenner.application.port.out.RunLockUnavailableException;
import de.gecheckt.pdf.umbenenner.application.port.out.SourceDocumentAccessException; import de.gecheckt.pdf.umbenenner.application.port.out.SourceDocumentAccessException;
import de.gecheckt.pdf.umbenenner.application.port.out.SourceDocumentCandidatesPort; import de.gecheckt.pdf.umbenenner.application.port.out.SourceDocumentCandidatesPort;
import de.gecheckt.pdf.umbenenner.application.service.DocumentProcessingService; import de.gecheckt.pdf.umbenenner.application.service.DocumentProcessingService;
import de.gecheckt.pdf.umbenenner.application.service.M4DocumentProcessor;
import de.gecheckt.pdf.umbenenner.domain.model.BatchRunContext; import de.gecheckt.pdf.umbenenner.domain.model.BatchRunContext;
import de.gecheckt.pdf.umbenenner.domain.model.PreCheckFailed; import de.gecheckt.pdf.umbenenner.domain.model.DocumentFingerprint;
import de.gecheckt.pdf.umbenenner.domain.model.PreCheckPassed; import de.gecheckt.pdf.umbenenner.domain.model.DocumentProcessingOutcome;
import de.gecheckt.pdf.umbenenner.domain.model.TechnicalDocumentError;
import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionContentError;
import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionResult; import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionResult;
import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionSuccess;
import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionTechnicalError;
import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate; import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import java.time.Instant;
import java.util.List; import java.util.List;
import java.util.Objects;
/** /**
* Batch processing implementation of {@link BatchRunProcessingUseCase}. * Batch processing implementation of {@link BatchRunProcessingUseCase}.
* <p> * <p>
* Orchestrates the complete batch processing workflow: * Orchestrates the complete M4 batch processing workflow per candidate:
* <ol> * <ol>
* <li>Acquire exclusive run lock to prevent concurrent instances</li> * <li>Acquire exclusive run lock to prevent concurrent instances.</li>
* <li>Scan source folder for PDF candidates</li> * <li>Scan source folder for PDF candidates.</li>
* <li>For each candidate: extract text and page count, run pre-checks</li> * <li>For each candidate, execute the M4 processing order:
* <li>Log per-document decision; end each document controlled without KI or target copy</li> * <ol type="a">
* <li>Release lock and return structured outcome for Bootstrap exit code mapping</li> * <li>Compute fingerprint.</li>
* <li>Load document master record.</li>
* <li>If already {@code SUCCESS} → persist skip attempt with
* {@code SKIPPED_ALREADY_PROCESSED}.</li>
* <li>If already {@code FAILED_FINAL} → persist skip attempt with
* {@code SKIPPED_FINAL_FAILURE}.</li>
* <li>Otherwise execute the M3 pipeline (extraction + pre-checks).</li>
* <li>Map M3 result into M4 status, counters and retryable flag.</li>
* <li>Persist exactly one historised processing attempt.</li>
* <li>Persist the updated document master record.</li>
* </ol> * </ol>
* </li>
* <li>Release lock and return structured outcome for Bootstrap exit code mapping.</li>
* </ol>
*
* <h2>Idempotency</h2>
* <p> * <p>
* Processing boundary: * Documents are identified exclusively by their SHA-256 content fingerprint. A document
* <ul> * whose overall status is {@code SUCCESS} or {@code FAILED_FINAL} is skipped in every
* <li>Documents that pass pre-checks end controlled and are ready for further processing (KI, persistence, copy)</li> * subsequent run; only a skip attempt is historised.
* <li>Documents with deterministic content errors (no usable text, page limit exceeded) end controlled</li> *
* <li>Documents with technical extraction errors end controlled; they do not abort the overall run</li> * <h2>Pre-fingerprint failures</h2>
* <li>If the source folder itself is inaccessible, the run fails with {@link BatchRunOutcome#FAILURE}</li>
* </ul>
* <p> * <p>
* Non-Goals (not implemented): * If the fingerprint computation fails (e.g. the file is no longer readable), the
* candidate is logged as a non-identifiable run event and is <em>not</em> historised
* in SQLite. The batch run continues with the next candidate.
*
* <h2>Persistence consistency</h2>
* <p>
* For every identified document, the processing attempt and the master record are
* written in sequence by {@link M4DocumentProcessor}. Persistence failures for a single
* document are caught and logged; the batch run continues with the remaining candidates.
*
* <h2>Non-Goals (not implemented in M4)</h2>
* <ul> * <ul>
* <li>No fingerprinting or SQLite persistence</li> * <li>No KI/AI integration or prompt loading.</li>
* <li>No KI/AI integration or prompt loading</li> * <li>No filename generation or target file copy.</li>
* <li>No filename generation or target file copy</li> * <li>No M5+ retry rules for KI or target copy failures.</li>
* <li>No cross-run retry logic</li>
* </ul> * </ul>
*
* @since M3-AP-004 (extended in M4-AP-006)
*/ */
public class DefaultBatchRunProcessingUseCase implements BatchRunProcessingUseCase { public class DefaultBatchRunProcessingUseCase implements BatchRunProcessingUseCase {
@@ -60,28 +86,44 @@ public class DefaultBatchRunProcessingUseCase implements BatchRunProcessingUseCa
private final RunLockPort runLockPort; private final RunLockPort runLockPort;
private final SourceDocumentCandidatesPort sourceDocumentCandidatesPort; private final SourceDocumentCandidatesPort sourceDocumentCandidatesPort;
private final PdfTextExtractionPort pdfTextExtractionPort; private final PdfTextExtractionPort pdfTextExtractionPort;
private final FingerprintPort fingerprintPort;
private final M4DocumentProcessor m4DocumentProcessor;
/** /**
* Creates the batch use case with the already-loaded startup configuration and all required ports. * Creates the batch use case with the already-loaded startup configuration and all
* required ports for the M4 flow.
* <p> * <p>
* The configuration is loaded and validated by Bootstrap before use case creation; * The configuration is loaded and validated by Bootstrap before use case creation;
* the use case receives the result directly and does not re-read it. * the use case receives the result directly and does not re-read the properties file.
* *
* @param configuration the validated startup configuration * @param configuration the validated startup configuration; must not be null
* @param runLockPort for exclusive run locking * @param runLockPort for exclusive run locking; must not be null
* @param sourceDocumentCandidatesPort for loading PDF candidates from the source folder * @param sourceDocumentCandidatesPort for loading PDF candidates from the source folder;
* @param pdfTextExtractionPort for extracting text and page count from a single PDF * must not be null
* @param pdfTextExtractionPort for extracting text and page count from a single PDF;
* must not be null
* @param fingerprintPort for computing the SHA-256 fingerprint of a candidate;
* must not be null
* @param m4DocumentProcessor for applying M4 decision logic and persisting results;
* must not be null
* @throws NullPointerException if any parameter is null * @throws NullPointerException if any parameter is null
*/ */
public DefaultBatchRunProcessingUseCase( public DefaultBatchRunProcessingUseCase(
StartConfiguration configuration, StartConfiguration configuration,
RunLockPort runLockPort, RunLockPort runLockPort,
SourceDocumentCandidatesPort sourceDocumentCandidatesPort, SourceDocumentCandidatesPort sourceDocumentCandidatesPort,
PdfTextExtractionPort pdfTextExtractionPort) { PdfTextExtractionPort pdfTextExtractionPort,
this.configuration = configuration; FingerprintPort fingerprintPort,
this.runLockPort = runLockPort; M4DocumentProcessor m4DocumentProcessor) {
this.sourceDocumentCandidatesPort = sourceDocumentCandidatesPort; this.configuration = Objects.requireNonNull(configuration, "configuration must not be null");
this.pdfTextExtractionPort = pdfTextExtractionPort; this.runLockPort = Objects.requireNonNull(runLockPort, "runLockPort must not be null");
this.sourceDocumentCandidatesPort = Objects.requireNonNull(
sourceDocumentCandidatesPort, "sourceDocumentCandidatesPort must not be null");
this.pdfTextExtractionPort = Objects.requireNonNull(
pdfTextExtractionPort, "pdfTextExtractionPort must not be null");
this.fingerprintPort = Objects.requireNonNull(fingerprintPort, "fingerprintPort must not be null");
this.m4DocumentProcessor = Objects.requireNonNull(
m4DocumentProcessor, "m4DocumentProcessor must not be null");
} }
@Override @Override
@@ -96,12 +138,15 @@ public class DefaultBatchRunProcessingUseCase implements BatchRunProcessingUseCa
lockAcquired = true; lockAcquired = true;
LOG.debug("Run lock acquired successfully."); LOG.debug("Run lock acquired successfully.");
} catch (RunLockUnavailableException e) { } catch (RunLockUnavailableException e) {
LOG.warn("Run lock not available another instance is already running. This instance terminates immediately."); LOG.warn("Run lock not available another instance is already running. "
+ "This instance terminates immediately.");
return BatchRunOutcome.LOCK_UNAVAILABLE; return BatchRunOutcome.LOCK_UNAVAILABLE;
} }
LOG.debug("Configuration in use: source={}, target={}", configuration.sourceFolder(), configuration.targetFolder()); LOG.debug("Configuration in use: source={}, target={}",
LOG.info("Batch run started. RunId: {}, Start: {}", context.runId(), context.startInstant()); configuration.sourceFolder(), configuration.targetFolder());
LOG.info("Batch run started. RunId: {}, Start: {}",
context.runId(), context.startInstant());
// Step 2: Load PDF candidates from source folder // Step 2: Load PDF candidates from source folder
List<SourceDocumentCandidate> candidates; List<SourceDocumentCandidate> candidates;
@@ -113,12 +158,13 @@ public class DefaultBatchRunProcessingUseCase implements BatchRunProcessingUseCa
} }
LOG.info("Found {} PDF candidate(s) in source folder.", candidates.size()); LOG.info("Found {} PDF candidate(s) in source folder.", candidates.size());
// Step 3: Process each candidate through the pipeline // Step 3: Process each candidate through the M4 pipeline
for (SourceDocumentCandidate candidate : candidates) { for (SourceDocumentCandidate candidate : candidates) {
processCandidate(candidate); processCandidate(candidate, context);
} }
LOG.info("Batch run completed. Processed {} candidate(s). RunId: {}", candidates.size(), context.runId()); LOG.info("Batch run completed. Processed {} candidate(s). RunId: {}",
candidates.size(), context.runId());
return BatchRunOutcome.SUCCESS; return BatchRunOutcome.SUCCESS;
} catch (Exception e) { } catch (Exception e) {
@@ -126,8 +172,8 @@ public class DefaultBatchRunProcessingUseCase implements BatchRunProcessingUseCa
return BatchRunOutcome.FAILURE; return BatchRunOutcome.FAILURE;
} finally { } finally {
// Release the run lock only if it was successfully acquired. // Release the run lock only if it was successfully acquired.
// If acquire() threw RunLockUnavailableException, the lock belongs to another instance // If acquire() threw RunLockUnavailableException, the lock belongs to another
// and must not be deleted by this instance. // instance and must not be deleted by this instance.
if (lockAcquired) { if (lockAcquired) {
try { try {
runLockPort.release(); runLockPort.release();
@@ -140,56 +186,105 @@ public class DefaultBatchRunProcessingUseCase implements BatchRunProcessingUseCa
} }
/** /**
* Processes a single PDF candidate through the complete pipeline. * Processes a single PDF candidate through the complete M4 pipeline.
* <p> * <p>
* Processing steps per document: * M4 processing order:
* <ol> * <ol>
* <li>Log candidate recognition</li> * <li>Record the attempt start instant.</li>
* <li>Extract text and page count from the PDF via {@link PdfTextExtractionPort}</li> * <li>Compute the SHA-256 fingerprint of the candidate file content.</li>
* <li>Process extraction result through pre-checks via {@link DocumentProcessingService}</li> * <li>If fingerprint computation fails: log as non-identifiable run event and
* <li>Log extraction outcome and final decision</li> * return — no SQLite record is created.</li>
* <li>Execute the M3 pipeline (PDF extraction + pre-checks).</li>
* <li>Delegate to {@link M4DocumentProcessor} for idempotency check, status/counter
* mapping, and consistent two-level persistence.</li>
* </ol> * </ol>
* <p> * <p>
* Per-document errors (extraction failure, technical error, pre-check failure) do not abort the overall * Per-document errors do not abort the overall batch run. Each candidate ends
* batch run. Each candidate ends controlled regardless of its outcome. * controlled regardless of its outcome.
* <p>
* Processing boundary: no KI call, no persistence, no filename generation,
* no target file copy is initiated here, even for candidates that pass all pre-checks.
* *
* @param candidate the candidate to process * @param candidate the candidate to process
* @param context the current batch run context
*/ */
private void processCandidate(SourceDocumentCandidate candidate) { private void processCandidate(SourceDocumentCandidate candidate, BatchRunContext context) {
LOG.debug("Processing candidate: {}", candidate.uniqueIdentifier()); LOG.debug("Processing candidate: {}", candidate.uniqueIdentifier());
PdfExtractionResult extractionResult = pdfTextExtractionPort.extractTextAndPageCount(candidate); // Record the attempt start instant before any work begins
Instant attemptStart = Instant.now();
// Step M4-1: Compute fingerprint
FingerprintResult fingerprintResult = fingerprintPort.computeFingerprint(candidate);
switch (fingerprintResult) {
case FingerprintTechnicalError fingerprintError -> {
// Pre-fingerprint failure: not historised in SQLite
LOG.warn("Fingerprint computation failed for '{}': {} — candidate skipped (not historised).",
candidate.uniqueIdentifier(), fingerprintError.errorMessage());
return;
}
case FingerprintSuccess fingerprintSuccess -> {
DocumentFingerprint fingerprint = fingerprintSuccess.fingerprint();
LOG.debug("Fingerprint computed for '{}': {}",
candidate.uniqueIdentifier(), fingerprint.sha256Hex());
// Step M4-2..M4-8: Execute M3 pipeline and delegate M4 logic to the processor
// The M3 pipeline runs only if the document is not in a terminal state;
// M4DocumentProcessor handles the terminal check internally.
// We run M3 eagerly here and pass the result; M4DocumentProcessor will
// ignore it for terminal documents.
DocumentProcessingOutcome m3Outcome = runM3Pipeline(candidate);
// Delegate idempotency check, status mapping, and persistence to M4DocumentProcessor
m4DocumentProcessor.process(candidate, fingerprint, m3Outcome, context, attemptStart);
}
}
}
/**
* Runs the M3 pipeline (PDF text extraction + pre-checks) for the given candidate.
* <p>
* This method is called after a successful fingerprint computation. The result is
* passed to {@link M4DocumentProcessor}, which applies it only when the document is
* not in a terminal state.
*
* @param candidate the candidate to run through the M3 pipeline
* @return the M3 pipeline outcome (pre-check passed, pre-check failed, or technical error)
*/
private DocumentProcessingOutcome runM3Pipeline(SourceDocumentCandidate candidate) {
PdfExtractionResult extractionResult =
pdfTextExtractionPort.extractTextAndPageCount(candidate);
// Log extraction outcome // Log extraction outcome
switch (extractionResult) { switch (extractionResult) {
case PdfExtractionSuccess success -> case de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionSuccess success ->
LOG.debug("PDF extraction successful for '{}'. Pages: {}, Text length: {} chars.", LOG.debug("PDF extraction successful for '{}'. Pages: {}, Text length: {} chars.",
candidate.uniqueIdentifier(), success.pageCount().value(), success.extractedText().length()); candidate.uniqueIdentifier(),
case PdfExtractionContentError contentError -> success.pageCount().value(),
success.extractedText().length());
case de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionContentError contentError ->
LOG.debug("PDF content extraction failed for '{}' (content problem): {}", LOG.debug("PDF content extraction failed for '{}' (content problem): {}",
candidate.uniqueIdentifier(), contentError.reason()); candidate.uniqueIdentifier(), contentError.reason());
case PdfExtractionTechnicalError technicalError -> case de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionTechnicalError technicalError ->
LOG.debug("PDF extraction technical error for '{}': {}", LOG.debug("PDF extraction technical error for '{}': {}",
candidate.uniqueIdentifier(), technicalError.errorMessage()); candidate.uniqueIdentifier(), technicalError.errorMessage());
} }
// Process through complete pipeline DocumentProcessingOutcome outcome =
var outcome = DocumentProcessingService.processDocument(candidate, extractionResult, configuration); DocumentProcessingService.processDocument(candidate, extractionResult, configuration);
// Log processing outcome // Log M3 outcome
switch (outcome) { switch (outcome) {
case PreCheckPassed passed -> case de.gecheckt.pdf.umbenenner.domain.model.PreCheckPassed passed ->
LOG.info("Pre-checks PASSED for '{}'. Candidate ready for further processing.", LOG.info("Pre-checks PASSED for '{}'. Candidate ready for M4 persistence.",
candidate.uniqueIdentifier()); candidate.uniqueIdentifier());
case PreCheckFailed failed -> case de.gecheckt.pdf.umbenenner.domain.model.PreCheckFailed failed ->
LOG.info("Pre-checks FAILED for '{}': {} (Deterministic content error may retry in later run).", LOG.info("Pre-checks FAILED for '{}': {} (Deterministic content error).",
candidate.uniqueIdentifier(), failed.failureReasonDescription()); candidate.uniqueIdentifier(), failed.failureReasonDescription());
case TechnicalDocumentError technicalError -> case de.gecheckt.pdf.umbenenner.domain.model.TechnicalDocumentError technicalError ->
LOG.warn("Processing FAILED for '{}': {} (Technical error may retry in later run).", LOG.warn("Processing FAILED for '{}': {} (Technical error retryable).",
candidate.uniqueIdentifier(), technicalError.errorMessage()); candidate.uniqueIdentifier(), technicalError.errorMessage());
} }
return outcome;
} }
} }

View File

@@ -4,9 +4,25 @@
* Implementations: * Implementations:
* <ul> * <ul>
* <li>{@link de.gecheckt.pdf.umbenenner.application.usecase.DefaultBatchRunProcessingUseCase} * <li>{@link de.gecheckt.pdf.umbenenner.application.usecase.DefaultBatchRunProcessingUseCase}
* — Production implementation with run lock and controlled batch cycle</li> * — Production implementation with run lock, M4 fingerprint-based idempotency,
* and consistent two-level persistence (extended in M4-AP-006)</li>
* </ul> * </ul>
* <p> * <p>
* <h2>M4 processing order (AP-006)</h2>
* <p>
* For each candidate, {@link de.gecheckt.pdf.umbenenner.application.usecase.DefaultBatchRunProcessingUseCase}
* enforces this order:
* <ol>
* <li>Compute SHA-256 fingerprint of the candidate file content.</li>
* <li>If fingerprint computation fails: log as non-identifiable run event;
* do NOT write any SQLite record; continue with next candidate.</li>
* <li>Run the M3 pipeline (PDF extraction + pre-checks).</li>
* <li>Delegate to {@link de.gecheckt.pdf.umbenenner.application.service.M4DocumentProcessor}
* for idempotency check, status/counter mapping, and consistent persistence.</li>
* </ol>
* <p>
* All implementations are infrastructure-agnostic and interact only through ports. * All implementations are infrastructure-agnostic and interact only through ports.
*
* @since M2 (extended in M4-AP-006)
*/ */
package de.gecheckt.pdf.umbenenner.application.usecase; package de.gecheckt.pdf.umbenenner.application.usecase;

View File

@@ -0,0 +1,425 @@
package de.gecheckt.pdf.umbenenner.application.service;
import de.gecheckt.pdf.umbenenner.application.port.out.DocumentKnownProcessable;
import de.gecheckt.pdf.umbenenner.application.port.out.DocumentPersistenceException;
import de.gecheckt.pdf.umbenenner.application.port.out.DocumentRecord;
import de.gecheckt.pdf.umbenenner.application.port.out.DocumentRecordLookupResult;
import de.gecheckt.pdf.umbenenner.application.port.out.DocumentRecordRepository;
import de.gecheckt.pdf.umbenenner.application.port.out.DocumentTerminalFinalFailure;
import de.gecheckt.pdf.umbenenner.application.port.out.DocumentTerminalSuccess;
import de.gecheckt.pdf.umbenenner.application.port.out.DocumentUnknown;
import de.gecheckt.pdf.umbenenner.application.port.out.FailureCounters;
import de.gecheckt.pdf.umbenenner.application.port.out.PersistenceLookupTechnicalFailure;
import de.gecheckt.pdf.umbenenner.application.port.out.ProcessingAttempt;
import de.gecheckt.pdf.umbenenner.application.port.out.ProcessingAttemptRepository;
import de.gecheckt.pdf.umbenenner.domain.model.BatchRunContext;
import de.gecheckt.pdf.umbenenner.domain.model.DocumentFingerprint;
import de.gecheckt.pdf.umbenenner.domain.model.DocumentProcessingOutcome;
import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionSuccess;
import de.gecheckt.pdf.umbenenner.domain.model.PdfPageCount;
import de.gecheckt.pdf.umbenenner.domain.model.PreCheckFailed;
import de.gecheckt.pdf.umbenenner.domain.model.PreCheckFailureReason;
import de.gecheckt.pdf.umbenenner.domain.model.PreCheckPassed;
import de.gecheckt.pdf.umbenenner.domain.model.ProcessingStatus;
import de.gecheckt.pdf.umbenenner.domain.model.RunId;
import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate;
import de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentLocator;
import de.gecheckt.pdf.umbenenner.domain.model.TechnicalDocumentError;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.time.Instant;
import java.util.ArrayList;
import java.util.List;
import static org.junit.jupiter.api.Assertions.*;
/**
* Unit tests for {@link M4DocumentProcessor}.
* <p>
* Covers:
* <ul>
* <li>M4 minimal rules: status, counter and retryable flag mapping</li>
* <li>Skip logic for SUCCESS and FAILED_FINAL documents</li>
* <li>New document path (DocumentUnknown)</li>
* <li>Known processable document path (DocumentKnownProcessable)</li>
* <li>Persistence lookup failure: no attempt written</li>
* <li>Persistence write failure: controlled failure, no crash</li>
* <li>Skip events do not change error counters</li>
* </ul>
*/
class M4DocumentProcessorTest {
private static final String FINGERPRINT_HEX =
"a".repeat(64); // 64 lowercase hex chars
private CapturingDocumentRecordRepository recordRepo;
private CapturingProcessingAttemptRepository attemptRepo;
private M4DocumentProcessor processor;
private SourceDocumentCandidate candidate;
private DocumentFingerprint fingerprint;
private BatchRunContext context;
private Instant attemptStart;
@BeforeEach
void setUp() {
recordRepo = new CapturingDocumentRecordRepository();
attemptRepo = new CapturingProcessingAttemptRepository();
processor = new M4DocumentProcessor(recordRepo, attemptRepo);
candidate = new SourceDocumentCandidate(
"test.pdf", 1024L, new SourceDocumentLocator("/tmp/test.pdf"));
fingerprint = new DocumentFingerprint(FINGERPRINT_HEX);
context = new BatchRunContext(new RunId("run-001"), Instant.now());
attemptStart = Instant.now();
}
// -------------------------------------------------------------------------
// New document (DocumentUnknown) path
// -------------------------------------------------------------------------
@Test
void process_newDocument_preCheckPassed_persistsSuccessStatus() {
recordRepo.setLookupResult(new DocumentUnknown());
DocumentProcessingOutcome m3Outcome = new PreCheckPassed(
candidate, new PdfExtractionSuccess("text", new PdfPageCount(1)));
processor.process(candidate, fingerprint, m3Outcome, context, attemptStart);
// One attempt written
assertEquals(1, attemptRepo.savedAttempts.size());
ProcessingAttempt attempt = attemptRepo.savedAttempts.get(0);
assertEquals(ProcessingStatus.SUCCESS, attempt.status());
assertFalse(attempt.retryable());
assertNull(attempt.failureClass());
assertNull(attempt.failureMessage());
// One master record created
assertEquals(1, recordRepo.createdRecords.size());
DocumentRecord record = recordRepo.createdRecords.get(0);
assertEquals(ProcessingStatus.SUCCESS, record.overallStatus());
assertEquals(0, record.failureCounters().contentErrorCount());
assertEquals(0, record.failureCounters().transientErrorCount());
assertNotNull(record.lastSuccessInstant());
assertNull(record.lastFailureInstant());
}
@Test
void process_newDocument_firstContentError_persistsFailedRetryable_contentCounterOne() {
recordRepo.setLookupResult(new DocumentUnknown());
DocumentProcessingOutcome m3Outcome = new PreCheckFailed(
candidate, PreCheckFailureReason.NO_USABLE_TEXT);
processor.process(candidate, fingerprint, m3Outcome, context, attemptStart);
assertEquals(1, attemptRepo.savedAttempts.size());
ProcessingAttempt attempt = attemptRepo.savedAttempts.get(0);
assertEquals(ProcessingStatus.FAILED_RETRYABLE, attempt.status());
assertTrue(attempt.retryable());
assertEquals(1, recordRepo.createdRecords.size());
DocumentRecord record = recordRepo.createdRecords.get(0);
assertEquals(ProcessingStatus.FAILED_RETRYABLE, record.overallStatus());
assertEquals(1, record.failureCounters().contentErrorCount());
assertEquals(0, record.failureCounters().transientErrorCount());
assertNotNull(record.lastFailureInstant());
assertNull(record.lastSuccessInstant());
}
@Test
void process_newDocument_technicalError_persistsFailedRetryable_transientCounterOne() {
recordRepo.setLookupResult(new DocumentUnknown());
DocumentProcessingOutcome m3Outcome = new TechnicalDocumentError(
candidate, "I/O error", null);
processor.process(candidate, fingerprint, m3Outcome, context, attemptStart);
assertEquals(1, attemptRepo.savedAttempts.size());
ProcessingAttempt attempt = attemptRepo.savedAttempts.get(0);
assertEquals(ProcessingStatus.FAILED_RETRYABLE, attempt.status());
assertTrue(attempt.retryable());
assertEquals(1, recordRepo.createdRecords.size());
DocumentRecord record = recordRepo.createdRecords.get(0);
assertEquals(ProcessingStatus.FAILED_RETRYABLE, record.overallStatus());
assertEquals(0, record.failureCounters().contentErrorCount());
assertEquals(1, record.failureCounters().transientErrorCount());
}
// -------------------------------------------------------------------------
// Known processable document path (DocumentKnownProcessable)
// -------------------------------------------------------------------------
@Test
void process_knownDocument_secondContentError_persistsFailedFinal_contentCounterTwo() {
// Existing record: first content error already recorded
DocumentRecord existingRecord = buildRecord(
ProcessingStatus.FAILED_RETRYABLE,
new FailureCounters(1, 0));
recordRepo.setLookupResult(new DocumentKnownProcessable(existingRecord));
DocumentProcessingOutcome m3Outcome = new PreCheckFailed(
candidate, PreCheckFailureReason.PAGE_LIMIT_EXCEEDED);
processor.process(candidate, fingerprint, m3Outcome, context, attemptStart);
assertEquals(1, attemptRepo.savedAttempts.size());
ProcessingAttempt attempt = attemptRepo.savedAttempts.get(0);
assertEquals(ProcessingStatus.FAILED_FINAL, attempt.status());
assertFalse(attempt.retryable());
assertEquals(1, recordRepo.updatedRecords.size());
DocumentRecord record = recordRepo.updatedRecords.get(0);
assertEquals(ProcessingStatus.FAILED_FINAL, record.overallStatus());
assertEquals(2, record.failureCounters().contentErrorCount());
assertEquals(0, record.failureCounters().transientErrorCount());
}
@Test
void process_knownDocument_technicalError_incrementsTransientCounter_remainsRetryable() {
DocumentRecord existingRecord = buildRecord(
ProcessingStatus.FAILED_RETRYABLE,
new FailureCounters(0, 2));
recordRepo.setLookupResult(new DocumentKnownProcessable(existingRecord));
DocumentProcessingOutcome m3Outcome = new TechnicalDocumentError(
candidate, "Timeout", null);
processor.process(candidate, fingerprint, m3Outcome, context, attemptStart);
assertEquals(1, recordRepo.updatedRecords.size());
DocumentRecord record = recordRepo.updatedRecords.get(0);
assertEquals(ProcessingStatus.FAILED_RETRYABLE, record.overallStatus());
assertEquals(0, record.failureCounters().contentErrorCount());
assertEquals(3, record.failureCounters().transientErrorCount());
assertTrue(attemptRepo.savedAttempts.get(0).retryable());
}
@Test
void process_knownDocument_preCheckPassed_persistsSuccess() {
DocumentRecord existingRecord = buildRecord(
ProcessingStatus.FAILED_RETRYABLE,
new FailureCounters(0, 1));
recordRepo.setLookupResult(new DocumentKnownProcessable(existingRecord));
DocumentProcessingOutcome m3Outcome = new PreCheckPassed(
candidate, new PdfExtractionSuccess("text", new PdfPageCount(1)));
processor.process(candidate, fingerprint, m3Outcome, context, attemptStart);
assertEquals(1, recordRepo.updatedRecords.size());
DocumentRecord record = recordRepo.updatedRecords.get(0);
assertEquals(ProcessingStatus.SUCCESS, record.overallStatus());
// Counters unchanged on success
assertEquals(0, record.failureCounters().contentErrorCount());
assertEquals(1, record.failureCounters().transientErrorCount());
assertNotNull(record.lastSuccessInstant());
}
// -------------------------------------------------------------------------
// Skip paths
// -------------------------------------------------------------------------
@Test
void process_terminalSuccess_persistsSkipAttemptWithSkippedAlreadyProcessed() {
DocumentRecord existingRecord = buildRecord(
ProcessingStatus.SUCCESS,
FailureCounters.zero());
recordRepo.setLookupResult(new DocumentTerminalSuccess(existingRecord));
DocumentProcessingOutcome m3Outcome = new PreCheckPassed(
candidate, new PdfExtractionSuccess("text", new PdfPageCount(1)));
processor.process(candidate, fingerprint, m3Outcome, context, attemptStart);
assertEquals(1, attemptRepo.savedAttempts.size());
ProcessingAttempt attempt = attemptRepo.savedAttempts.get(0);
assertEquals(ProcessingStatus.SKIPPED_ALREADY_PROCESSED, attempt.status());
assertFalse(attempt.retryable());
assertNull(attempt.failureClass());
assertNull(attempt.failureMessage());
// Master record updated (only updatedAt changes)
assertEquals(1, recordRepo.updatedRecords.size());
DocumentRecord record = recordRepo.updatedRecords.get(0);
// Status and counters remain unchanged
assertEquals(ProcessingStatus.SUCCESS, record.overallStatus());
assertEquals(0, record.failureCounters().contentErrorCount());
assertEquals(0, record.failureCounters().transientErrorCount());
}
@Test
void process_terminalFinalFailure_persistsSkipAttemptWithSkippedFinalFailure() {
DocumentRecord existingRecord = buildRecord(
ProcessingStatus.FAILED_FINAL,
new FailureCounters(2, 0));
recordRepo.setLookupResult(new DocumentTerminalFinalFailure(existingRecord));
DocumentProcessingOutcome m3Outcome = new PreCheckFailed(
candidate, PreCheckFailureReason.NO_USABLE_TEXT);
processor.process(candidate, fingerprint, m3Outcome, context, attemptStart);
assertEquals(1, attemptRepo.savedAttempts.size());
ProcessingAttempt attempt = attemptRepo.savedAttempts.get(0);
assertEquals(ProcessingStatus.SKIPPED_FINAL_FAILURE, attempt.status());
assertFalse(attempt.retryable());
// Master record updated (only updatedAt changes); counters unchanged
assertEquals(1, recordRepo.updatedRecords.size());
DocumentRecord record = recordRepo.updatedRecords.get(0);
assertEquals(ProcessingStatus.FAILED_FINAL, record.overallStatus());
assertEquals(2, record.failureCounters().contentErrorCount());
assertEquals(0, record.failureCounters().transientErrorCount());
}
@Test
void process_skipEvent_doesNotChangeErrorCounters() {
FailureCounters originalCounters = new FailureCounters(1, 3);
DocumentRecord existingRecord = buildRecord(ProcessingStatus.SUCCESS, originalCounters);
recordRepo.setLookupResult(new DocumentTerminalSuccess(existingRecord));
processor.process(candidate, fingerprint,
new PreCheckPassed(candidate, new PdfExtractionSuccess("t", new PdfPageCount(1))),
context, attemptStart);
DocumentRecord updated = recordRepo.updatedRecords.get(0);
assertEquals(originalCounters.contentErrorCount(), updated.failureCounters().contentErrorCount(),
"Skip must not change content error counter");
assertEquals(originalCounters.transientErrorCount(), updated.failureCounters().transientErrorCount(),
"Skip must not change transient error counter");
}
// -------------------------------------------------------------------------
// Persistence lookup failure
// -------------------------------------------------------------------------
@Test
void process_persistenceLookupFailure_noAttemptWritten_noException() {
recordRepo.setLookupResult(new PersistenceLookupTechnicalFailure("DB unavailable", null));
DocumentProcessingOutcome m3Outcome = new PreCheckPassed(
candidate, new PdfExtractionSuccess("text", new PdfPageCount(1)));
// Must not throw
assertDoesNotThrow(() ->
processor.process(candidate, fingerprint, m3Outcome, context, attemptStart));
// No attempt written, no record created/updated
assertEquals(0, attemptRepo.savedAttempts.size(),
"No attempt must be written when lookup fails");
assertEquals(0, recordRepo.createdRecords.size());
assertEquals(0, recordRepo.updatedRecords.size());
}
// -------------------------------------------------------------------------
// Persistence write failure: controlled, no crash
// -------------------------------------------------------------------------
@Test
void process_persistenceWriteFailure_doesNotThrow_batchContinues() {
recordRepo.setLookupResult(new DocumentUnknown());
// Make the attempt save throw
attemptRepo.failOnSave = true;
DocumentProcessingOutcome m3Outcome = new PreCheckPassed(
candidate, new PdfExtractionSuccess("text", new PdfPageCount(1)));
// Must not propagate the exception
assertDoesNotThrow(() ->
processor.process(candidate, fingerprint, m3Outcome, context, attemptStart));
}
// -------------------------------------------------------------------------
// Attempt number monotonicity
// -------------------------------------------------------------------------
@Test
void process_attemptNumberIsAssignedFromRepository() {
recordRepo.setLookupResult(new DocumentUnknown());
attemptRepo.nextAttemptNumber = 3; // Simulate 2 prior attempts
DocumentProcessingOutcome m3Outcome = new PreCheckPassed(
candidate, new PdfExtractionSuccess("text", new PdfPageCount(1)));
processor.process(candidate, fingerprint, m3Outcome, context, attemptStart);
assertEquals(1, attemptRepo.savedAttempts.size());
assertEquals(3, attemptRepo.savedAttempts.get(0).attemptNumber(),
"Attempt number must be taken from the repository");
}
// -------------------------------------------------------------------------
// Helpers
// -------------------------------------------------------------------------
private DocumentRecord buildRecord(ProcessingStatus status, FailureCounters counters) {
Instant now = Instant.now();
return new DocumentRecord(
fingerprint,
new SourceDocumentLocator("/tmp/test.pdf"),
"test.pdf",
status,
counters,
status == ProcessingStatus.SUCCESS ? null : now,
status == ProcessingStatus.SUCCESS ? now : null,
now,
now
);
}
// -------------------------------------------------------------------------
// Capturing test doubles
// -------------------------------------------------------------------------
private static class CapturingDocumentRecordRepository implements DocumentRecordRepository {
private DocumentRecordLookupResult lookupResult = new DocumentUnknown();
final List<DocumentRecord> createdRecords = new ArrayList<>();
final List<DocumentRecord> updatedRecords = new ArrayList<>();
void setLookupResult(DocumentRecordLookupResult result) {
this.lookupResult = result;
}
@Override
public DocumentRecordLookupResult findByFingerprint(DocumentFingerprint fingerprint) {
return lookupResult;
}
@Override
public void create(DocumentRecord record) {
createdRecords.add(record);
}
@Override
public void update(DocumentRecord record) {
updatedRecords.add(record);
}
}
private static class CapturingProcessingAttemptRepository implements ProcessingAttemptRepository {
final List<ProcessingAttempt> savedAttempts = new ArrayList<>();
int nextAttemptNumber = 1;
boolean failOnSave = false;
@Override
public int loadNextAttemptNumber(DocumentFingerprint fingerprint) {
return nextAttemptNumber;
}
@Override
public void save(ProcessingAttempt attempt) {
if (failOnSave) {
throw new DocumentPersistenceException("Simulated save failure");
}
savedAttempts.add(attempt);
}
@Override
public List<ProcessingAttempt> findAllByFingerprint(DocumentFingerprint fingerprint) {
return List.copyOf(savedAttempts);
}
}
}

View File

@@ -2,12 +2,24 @@ package de.gecheckt.pdf.umbenenner.application.usecase;
import de.gecheckt.pdf.umbenenner.application.config.StartConfiguration; import de.gecheckt.pdf.umbenenner.application.config.StartConfiguration;
import de.gecheckt.pdf.umbenenner.application.port.in.BatchRunOutcome; import de.gecheckt.pdf.umbenenner.application.port.in.BatchRunOutcome;
import de.gecheckt.pdf.umbenenner.application.port.out.DocumentRecord;
import de.gecheckt.pdf.umbenenner.application.port.out.DocumentRecordLookupResult;
import de.gecheckt.pdf.umbenenner.application.port.out.DocumentRecordRepository;
import de.gecheckt.pdf.umbenenner.application.port.out.DocumentUnknown;
import de.gecheckt.pdf.umbenenner.application.port.out.FingerprintPort;
import de.gecheckt.pdf.umbenenner.application.port.out.FingerprintResult;
import de.gecheckt.pdf.umbenenner.application.port.out.FingerprintSuccess;
import de.gecheckt.pdf.umbenenner.application.port.out.FingerprintTechnicalError;
import de.gecheckt.pdf.umbenenner.application.port.out.PdfTextExtractionPort; import de.gecheckt.pdf.umbenenner.application.port.out.PdfTextExtractionPort;
import de.gecheckt.pdf.umbenenner.application.port.out.ProcessingAttempt;
import de.gecheckt.pdf.umbenenner.application.port.out.ProcessingAttemptRepository;
import de.gecheckt.pdf.umbenenner.application.port.out.RunLockPort; import de.gecheckt.pdf.umbenenner.application.port.out.RunLockPort;
import de.gecheckt.pdf.umbenenner.application.port.out.RunLockUnavailableException; import de.gecheckt.pdf.umbenenner.application.port.out.RunLockUnavailableException;
import de.gecheckt.pdf.umbenenner.application.port.out.SourceDocumentAccessException; import de.gecheckt.pdf.umbenenner.application.port.out.SourceDocumentAccessException;
import de.gecheckt.pdf.umbenenner.application.port.out.SourceDocumentCandidatesPort; import de.gecheckt.pdf.umbenenner.application.port.out.SourceDocumentCandidatesPort;
import de.gecheckt.pdf.umbenenner.application.service.M4DocumentProcessor;
import de.gecheckt.pdf.umbenenner.domain.model.BatchRunContext; import de.gecheckt.pdf.umbenenner.domain.model.BatchRunContext;
import de.gecheckt.pdf.umbenenner.domain.model.DocumentFingerprint;
import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionContentError; import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionContentError;
import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionResult; import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionResult;
import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionSuccess; import de.gecheckt.pdf.umbenenner.domain.model.PdfExtractionSuccess;
@@ -37,10 +49,11 @@ import static org.junit.jupiter.api.Assertions.*;
* <ul> * <ul>
* <li>Lock acquisition and release lifecycle</li> * <li>Lock acquisition and release lifecycle</li>
* <li>Source folder scanning and per-document processing loop</li> * <li>Source folder scanning and per-document processing loop</li>
* <li>Happy path: candidate passes pre-checks, ends controlled without KI or target copy</li> * <li>Happy path: candidate passes pre-checks, M4 persistence is invoked</li>
* <li>Deterministic content errors: no usable text, page limit exceeded</li> * <li>Deterministic content errors: no usable text, page limit exceeded</li>
* <li>Technical extraction errors: controlled per-document end, batch continues</li> * <li>Technical extraction errors: controlled per-document end, batch continues</li>
* <li>Source folder access failure: batch fails with FAILURE outcome</li> * <li>Source folder access failure: batch fails with FAILURE outcome</li>
* <li>M4 idempotency: fingerprint failure → not historised</li>
* </ul> * </ul>
*/ */
class BatchRunProcessingUseCaseTest { class BatchRunProcessingUseCaseTest {
@@ -57,8 +70,9 @@ class BatchRunProcessingUseCaseTest {
MockRunLockPort lockPort = new MockRunLockPort(); MockRunLockPort lockPort = new MockRunLockPort();
StartConfiguration config = buildConfig(tempDir); StartConfiguration config = buildConfig(tempDir);
DefaultBatchRunProcessingUseCase useCase = new DefaultBatchRunProcessingUseCase( DefaultBatchRunProcessingUseCase useCase = buildUseCase(
config, lockPort, new EmptyCandidatesPort(), new NoOpExtractionPort()); config, lockPort, new EmptyCandidatesPort(), new NoOpExtractionPort(),
new AlwaysSuccessFingerprintPort(), new NoOpM4DocumentProcessor());
BatchRunContext context = new BatchRunContext(new RunId("test-run-1"), Instant.now()); BatchRunContext context = new BatchRunContext(new RunId("test-run-1"), Instant.now());
BatchRunOutcome outcome = useCase.execute(context); BatchRunOutcome outcome = useCase.execute(context);
@@ -73,8 +87,9 @@ class BatchRunProcessingUseCaseTest {
CountingRunLockPort lockPort = new CountingRunLockPort(true); CountingRunLockPort lockPort = new CountingRunLockPort(true);
StartConfiguration config = buildConfig(tempDir); StartConfiguration config = buildConfig(tempDir);
DefaultBatchRunProcessingUseCase useCase = new DefaultBatchRunProcessingUseCase( DefaultBatchRunProcessingUseCase useCase = buildUseCase(
config, lockPort, new EmptyCandidatesPort(), new NoOpExtractionPort()); config, lockPort, new EmptyCandidatesPort(), new NoOpExtractionPort(),
new AlwaysSuccessFingerprintPort(), new NoOpM4DocumentProcessor());
BatchRunContext context = new BatchRunContext(new RunId("test-run-2"), Instant.now()); BatchRunContext context = new BatchRunContext(new RunId("test-run-2"), Instant.now());
BatchRunOutcome outcome = useCase.execute(context); BatchRunOutcome outcome = useCase.execute(context);
@@ -92,8 +107,9 @@ class BatchRunProcessingUseCaseTest {
CountingRunLockPort lockPort = new CountingRunLockPort(true); CountingRunLockPort lockPort = new CountingRunLockPort(true);
StartConfiguration config = buildConfig(tempDir); StartConfiguration config = buildConfig(tempDir);
DefaultBatchRunProcessingUseCase useCase = new DefaultBatchRunProcessingUseCase( DefaultBatchRunProcessingUseCase useCase = buildUseCase(
config, lockPort, new EmptyCandidatesPort(), new NoOpExtractionPort()); config, lockPort, new EmptyCandidatesPort(), new NoOpExtractionPort(),
new AlwaysSuccessFingerprintPort(), new NoOpM4DocumentProcessor());
BatchRunContext context = new BatchRunContext(new RunId("test-run-f1"), Instant.now()); BatchRunContext context = new BatchRunContext(new RunId("test-run-f1"), Instant.now());
useCase.execute(context); useCase.execute(context);
@@ -108,8 +124,9 @@ class BatchRunProcessingUseCaseTest {
ErrorAfterAcquireLockPort lockPort = new ErrorAfterAcquireLockPort(); ErrorAfterAcquireLockPort lockPort = new ErrorAfterAcquireLockPort();
StartConfiguration config = buildConfig(tempDir); StartConfiguration config = buildConfig(tempDir);
DefaultBatchRunProcessingUseCase useCase = new DefaultBatchRunProcessingUseCase( DefaultBatchRunProcessingUseCase useCase = buildUseCase(
config, lockPort, new EmptyCandidatesPort(), new NoOpExtractionPort()); config, lockPort, new EmptyCandidatesPort(), new NoOpExtractionPort(),
new AlwaysSuccessFingerprintPort(), new NoOpM4DocumentProcessor());
BatchRunContext context = new BatchRunContext(new RunId("test-run-3"), Instant.now()); BatchRunContext context = new BatchRunContext(new RunId("test-run-3"), Instant.now());
BatchRunOutcome outcome = useCase.execute(context); BatchRunOutcome outcome = useCase.execute(context);
@@ -128,8 +145,9 @@ class BatchRunProcessingUseCaseTest {
MockRunLockPort lockPort = new MockRunLockPort(); MockRunLockPort lockPort = new MockRunLockPort();
StartConfiguration config = buildConfig(tempDir); StartConfiguration config = buildConfig(tempDir);
DefaultBatchRunProcessingUseCase useCase = new DefaultBatchRunProcessingUseCase( DefaultBatchRunProcessingUseCase useCase = buildUseCase(
config, lockPort, new EmptyCandidatesPort(), new NoOpExtractionPort()); config, lockPort, new EmptyCandidatesPort(), new NoOpExtractionPort(),
new AlwaysSuccessFingerprintPort(), new NoOpM4DocumentProcessor());
BatchRunContext context = new BatchRunContext(new RunId("empty"), Instant.now()); BatchRunContext context = new BatchRunContext(new RunId("empty"), Instant.now());
BatchRunOutcome outcome = useCase.execute(context); BatchRunOutcome outcome = useCase.execute(context);
@@ -138,25 +156,26 @@ class BatchRunProcessingUseCaseTest {
} }
@Test @Test
void execute_happyPath_candidatePassesPreChecks_endsControlledWithoutKiOrCopy() throws Exception { void execute_happyPath_candidatePassesPreChecks_m4PersistenceInvoked() throws Exception {
MockRunLockPort lockPort = new MockRunLockPort(); MockRunLockPort lockPort = new MockRunLockPort();
StartConfiguration config = buildConfig(tempDir); StartConfiguration config = buildConfig(tempDir);
// Candidate with usable text within page limit
SourceDocumentCandidate candidate = makeCandidate("document.pdf"); SourceDocumentCandidate candidate = makeCandidate("document.pdf");
PdfExtractionSuccess success = new PdfExtractionSuccess("Invoice text", new PdfPageCount(1)); PdfExtractionSuccess success = new PdfExtractionSuccess("Invoice text", new PdfPageCount(1));
FixedCandidatesPort candidatesPort = new FixedCandidatesPort(List.of(candidate)); FixedCandidatesPort candidatesPort = new FixedCandidatesPort(List.of(candidate));
FixedExtractionPort extractionPort = new FixedExtractionPort(success); FixedExtractionPort extractionPort = new FixedExtractionPort(success);
TrackingM4DocumentProcessor m4Processor = new TrackingM4DocumentProcessor();
DefaultBatchRunProcessingUseCase useCase = new DefaultBatchRunProcessingUseCase( DefaultBatchRunProcessingUseCase useCase = buildUseCase(
config, lockPort, candidatesPort, extractionPort); config, lockPort, candidatesPort, extractionPort,
new AlwaysSuccessFingerprintPort(), m4Processor);
BatchRunContext context = new BatchRunContext(new RunId("happy"), Instant.now()); BatchRunContext context = new BatchRunContext(new RunId("happy"), Instant.now());
BatchRunOutcome outcome = useCase.execute(context); BatchRunOutcome outcome = useCase.execute(context);
// Batch run succeeds; document ended controlled at boundary (no KI, no copy)
assertTrue(outcome.isSuccess(), "Happy path should yield SUCCESS"); assertTrue(outcome.isSuccess(), "Happy path should yield SUCCESS");
assertEquals(1, extractionPort.callCount(), "Extraction should be called exactly once"); assertEquals(1, extractionPort.callCount(), "Extraction should be called exactly once");
assertEquals(1, m4Processor.processCallCount(), "M4 processor should be called exactly once");
} }
@Test @Test
@@ -165,40 +184,42 @@ class BatchRunProcessingUseCaseTest {
StartConfiguration config = buildConfig(tempDir); StartConfiguration config = buildConfig(tempDir);
SourceDocumentCandidate candidate = makeCandidate("image-only.pdf"); SourceDocumentCandidate candidate = makeCandidate("image-only.pdf");
// Extraction returns text with no letters or digits
PdfExtractionSuccess emptySuccess = new PdfExtractionSuccess(" ", new PdfPageCount(1)); PdfExtractionSuccess emptySuccess = new PdfExtractionSuccess(" ", new PdfPageCount(1));
FixedCandidatesPort candidatesPort = new FixedCandidatesPort(List.of(candidate)); FixedCandidatesPort candidatesPort = new FixedCandidatesPort(List.of(candidate));
FixedExtractionPort extractionPort = new FixedExtractionPort(emptySuccess); FixedExtractionPort extractionPort = new FixedExtractionPort(emptySuccess);
TrackingM4DocumentProcessor m4Processor = new TrackingM4DocumentProcessor();
DefaultBatchRunProcessingUseCase useCase = new DefaultBatchRunProcessingUseCase( DefaultBatchRunProcessingUseCase useCase = buildUseCase(
config, lockPort, candidatesPort, extractionPort); config, lockPort, candidatesPort, extractionPort,
new AlwaysSuccessFingerprintPort(), m4Processor);
BatchRunContext context = new BatchRunContext(new RunId("no-text"), Instant.now()); BatchRunContext context = new BatchRunContext(new RunId("no-text"), Instant.now());
BatchRunOutcome outcome = useCase.execute(context); BatchRunOutcome outcome = useCase.execute(context);
// Document ends with pre-check failure; batch itself still succeeds
assertTrue(outcome.isSuccess(), "No-usable-text pre-check failure should not abort the batch run"); assertTrue(outcome.isSuccess(), "No-usable-text pre-check failure should not abort the batch run");
assertEquals(1, m4Processor.processCallCount(), "M4 processor should still be called for content errors");
} }
@Test @Test
void execute_pageLimitExceeded_candidateEndsControlled_batchContinues() throws Exception { void execute_pageLimitExceeded_candidateEndsControlled_batchContinues() throws Exception {
MockRunLockPort lockPort = new MockRunLockPort(); MockRunLockPort lockPort = new MockRunLockPort();
// Config has maxPages=3; document has 10 pages
StartConfiguration config = buildConfig(tempDir); StartConfiguration config = buildConfig(tempDir);
SourceDocumentCandidate candidate = makeCandidate("big.pdf"); SourceDocumentCandidate candidate = makeCandidate("big.pdf");
PdfExtractionSuccess manyPages = new PdfExtractionSuccess("Some text", new PdfPageCount(10)); PdfExtractionSuccess manyPages = new PdfExtractionSuccess("Some text", new PdfPageCount(10));
FixedCandidatesPort candidatesPort = new FixedCandidatesPort(List.of(candidate)); FixedCandidatesPort candidatesPort = new FixedCandidatesPort(List.of(candidate));
FixedExtractionPort extractionPort = new FixedExtractionPort(manyPages); FixedExtractionPort extractionPort = new FixedExtractionPort(manyPages);
TrackingM4DocumentProcessor m4Processor = new TrackingM4DocumentProcessor();
DefaultBatchRunProcessingUseCase useCase = new DefaultBatchRunProcessingUseCase( DefaultBatchRunProcessingUseCase useCase = buildUseCase(
config, lockPort, candidatesPort, extractionPort); config, lockPort, candidatesPort, extractionPort,
new AlwaysSuccessFingerprintPort(), m4Processor);
BatchRunContext context = new BatchRunContext(new RunId("page-limit"), Instant.now()); BatchRunContext context = new BatchRunContext(new RunId("page-limit"), Instant.now());
BatchRunOutcome outcome = useCase.execute(context); BatchRunOutcome outcome = useCase.execute(context);
// maxPages in buildConfig is 3; 10 pages exceeds limit pre-check fails, batch continues
assertTrue(outcome.isSuccess(), "Page limit exceeded should not abort the batch run"); assertTrue(outcome.isSuccess(), "Page limit exceeded should not abort the batch run");
assertEquals(1, m4Processor.processCallCount(), "M4 processor should still be called for page limit errors");
} }
@Test @Test
@@ -210,14 +231,17 @@ class BatchRunProcessingUseCaseTest {
PdfExtractionContentError contentError = new PdfExtractionContentError("PDF is encrypted"); PdfExtractionContentError contentError = new PdfExtractionContentError("PDF is encrypted");
FixedCandidatesPort candidatesPort = new FixedCandidatesPort(List.of(candidate)); FixedCandidatesPort candidatesPort = new FixedCandidatesPort(List.of(candidate));
FixedExtractionPort extractionPort = new FixedExtractionPort(contentError); FixedExtractionPort extractionPort = new FixedExtractionPort(contentError);
TrackingM4DocumentProcessor m4Processor = new TrackingM4DocumentProcessor();
DefaultBatchRunProcessingUseCase useCase = new DefaultBatchRunProcessingUseCase( DefaultBatchRunProcessingUseCase useCase = buildUseCase(
config, lockPort, candidatesPort, extractionPort); config, lockPort, candidatesPort, extractionPort,
new AlwaysSuccessFingerprintPort(), m4Processor);
BatchRunContext context = new BatchRunContext(new RunId("content-error"), Instant.now()); BatchRunContext context = new BatchRunContext(new RunId("content-error"), Instant.now());
BatchRunOutcome outcome = useCase.execute(context); BatchRunOutcome outcome = useCase.execute(context);
assertTrue(outcome.isSuccess(), "Extraction content error should not abort the batch run"); assertTrue(outcome.isSuccess(), "Extraction content error should not abort the batch run");
assertEquals(1, m4Processor.processCallCount(), "M4 processor should be called for content errors");
} }
@Test @Test
@@ -229,14 +253,17 @@ class BatchRunProcessingUseCaseTest {
PdfExtractionTechnicalError technicalError = new PdfExtractionTechnicalError("I/O error reading file", null); PdfExtractionTechnicalError technicalError = new PdfExtractionTechnicalError("I/O error reading file", null);
FixedCandidatesPort candidatesPort = new FixedCandidatesPort(List.of(candidate)); FixedCandidatesPort candidatesPort = new FixedCandidatesPort(List.of(candidate));
FixedExtractionPort extractionPort = new FixedExtractionPort(technicalError); FixedExtractionPort extractionPort = new FixedExtractionPort(technicalError);
TrackingM4DocumentProcessor m4Processor = new TrackingM4DocumentProcessor();
DefaultBatchRunProcessingUseCase useCase = new DefaultBatchRunProcessingUseCase( DefaultBatchRunProcessingUseCase useCase = buildUseCase(
config, lockPort, candidatesPort, extractionPort); config, lockPort, candidatesPort, extractionPort,
new AlwaysSuccessFingerprintPort(), m4Processor);
BatchRunContext context = new BatchRunContext(new RunId("tech-error"), Instant.now()); BatchRunContext context = new BatchRunContext(new RunId("tech-error"), Instant.now());
BatchRunOutcome outcome = useCase.execute(context); BatchRunOutcome outcome = useCase.execute(context);
assertTrue(outcome.isSuccess(), "Technical extraction error should not abort the batch run"); assertTrue(outcome.isSuccess(), "Technical extraction error should not abort the batch run");
assertEquals(1, m4Processor.processCallCount(), "M4 processor should be called for technical errors");
} }
@Test @Test
@@ -248,18 +275,71 @@ class BatchRunProcessingUseCaseTest {
throw new SourceDocumentAccessException("Source folder not readable"); throw new SourceDocumentAccessException("Source folder not readable");
}; };
DefaultBatchRunProcessingUseCase useCase = new DefaultBatchRunProcessingUseCase( DefaultBatchRunProcessingUseCase useCase = buildUseCase(
config, lockPort, failingPort, new NoOpExtractionPort()); config, lockPort, failingPort, new NoOpExtractionPort(),
new AlwaysSuccessFingerprintPort(), new NoOpM4DocumentProcessor());
BatchRunContext context = new BatchRunContext(new RunId("access-fail"), Instant.now()); BatchRunContext context = new BatchRunContext(new RunId("access-fail"), Instant.now());
BatchRunOutcome outcome = useCase.execute(context); BatchRunOutcome outcome = useCase.execute(context);
assertTrue(outcome.isFailure(), "Source folder access failure should yield FAILURE outcome"); assertTrue(outcome.isFailure(), "Source folder access failure should yield FAILURE outcome");
assertFalse(outcome.isSuccess(), "Source folder access failure must not be SUCCESS"); assertFalse(outcome.isSuccess(), "Source folder access failure must not be SUCCESS");
// Lock must still be released
assertTrue(lockPort.wasReleaseCalled(), "Lock should be released even when source access fails"); assertTrue(lockPort.wasReleaseCalled(), "Lock should be released even when source access fails");
} }
// -------------------------------------------------------------------------
// M4-specific: fingerprint failure → not historised
// -------------------------------------------------------------------------
@Test
void execute_fingerprintFailure_candidateNotHistorised_batchContinues() throws Exception {
MockRunLockPort lockPort = new MockRunLockPort();
StartConfiguration config = buildConfig(tempDir);
SourceDocumentCandidate candidate = makeCandidate("unreadable.pdf");
FixedCandidatesPort candidatesPort = new FixedCandidatesPort(List.of(candidate));
TrackingM4DocumentProcessor m4Processor = new TrackingM4DocumentProcessor();
// Fingerprint always fails → M4 processor must NOT be called
FingerprintPort alwaysFailingFingerprintPort = c ->
new FingerprintTechnicalError("Cannot read file", null);
DefaultBatchRunProcessingUseCase useCase = buildUseCase(
config, lockPort, candidatesPort, new NoOpExtractionPort(),
alwaysFailingFingerprintPort, m4Processor);
BatchRunContext context = new BatchRunContext(new RunId("fp-fail"), Instant.now());
BatchRunOutcome outcome = useCase.execute(context);
assertTrue(outcome.isSuccess(), "Fingerprint failure should not abort the batch run");
assertEquals(0, m4Processor.processCallCount(),
"M4 processor must NOT be called when fingerprint computation fails (pre-fingerprint failure)");
}
@Test
void execute_fingerprintFailure_extractionNotCalled() throws Exception {
MockRunLockPort lockPort = new MockRunLockPort();
StartConfiguration config = buildConfig(tempDir);
SourceDocumentCandidate candidate = makeCandidate("unreadable.pdf");
FixedCandidatesPort candidatesPort = new FixedCandidatesPort(List.of(candidate));
FixedExtractionPort extractionPort = new FixedExtractionPort(
new PdfExtractionSuccess("text", new PdfPageCount(1)));
FingerprintPort alwaysFailingFingerprintPort = c ->
new FingerprintTechnicalError("Cannot read file", null);
DefaultBatchRunProcessingUseCase useCase = buildUseCase(
config, lockPort, candidatesPort, extractionPort,
alwaysFailingFingerprintPort, new NoOpM4DocumentProcessor());
BatchRunContext context = new BatchRunContext(new RunId("fp-fail-no-extract"), Instant.now());
useCase.execute(context);
assertEquals(0, extractionPort.callCount(),
"PDF extraction must NOT be called when fingerprint computation fails");
}
/** /**
* Mixed-batch test: one document per outcome type in a single run. * Mixed-batch test: one document per outcome type in a single run.
* Proves that no individual outcome aborts the overall batch. * Proves that no individual outcome aborts the overall batch.
@@ -267,7 +347,6 @@ class BatchRunProcessingUseCaseTest {
@Test @Test
void execute_mixedBatch_allOutcomeTypes_batchOverallSucceeds() throws Exception { void execute_mixedBatch_allOutcomeTypes_batchOverallSucceeds() throws Exception {
MockRunLockPort lockPort = new MockRunLockPort(); MockRunLockPort lockPort = new MockRunLockPort();
// maxPages=3 in buildConfig; pageLimitCandidate has 10 pages → exceeds limit
StartConfiguration config = buildConfig(tempDir); StartConfiguration config = buildConfig(tempDir);
SourceDocumentCandidate goodCandidate = makeCandidate("good.pdf"); SourceDocumentCandidate goodCandidate = makeCandidate("good.pdf");
@@ -275,10 +354,11 @@ class BatchRunProcessingUseCaseTest {
SourceDocumentCandidate pageLimitCandidate = makeCandidate("toobig.pdf"); SourceDocumentCandidate pageLimitCandidate = makeCandidate("toobig.pdf");
SourceDocumentCandidate technicalErrorCandidate = makeCandidate("broken.pdf"); SourceDocumentCandidate technicalErrorCandidate = makeCandidate("broken.pdf");
SourceDocumentCandidate contentErrorCandidate = makeCandidate("encrypted.pdf"); SourceDocumentCandidate contentErrorCandidate = makeCandidate("encrypted.pdf");
SourceDocumentCandidate fpFailCandidate = makeCandidate("unreadable.pdf");
FixedCandidatesPort candidatesPort = new FixedCandidatesPort(List.of( FixedCandidatesPort candidatesPort = new FixedCandidatesPort(List.of(
goodCandidate, noTextCandidate, pageLimitCandidate, goodCandidate, noTextCandidate, pageLimitCandidate,
technicalErrorCandidate, contentErrorCandidate)); technicalErrorCandidate, contentErrorCandidate, fpFailCandidate));
MappedExtractionPort extractionPort = new MappedExtractionPort() MappedExtractionPort extractionPort = new MappedExtractionPort()
.with(goodCandidate, new PdfExtractionSuccess("Invoice text", new PdfPageCount(1))) .with(goodCandidate, new PdfExtractionSuccess("Invoice text", new PdfPageCount(1)))
@@ -287,16 +367,31 @@ class BatchRunProcessingUseCaseTest {
.with(technicalErrorCandidate, new PdfExtractionTechnicalError("I/O error", null)) .with(technicalErrorCandidate, new PdfExtractionTechnicalError("I/O error", null))
.with(contentErrorCandidate, new PdfExtractionContentError("PDF is encrypted")); .with(contentErrorCandidate, new PdfExtractionContentError("PDF is encrypted"));
DefaultBatchRunProcessingUseCase useCase = new DefaultBatchRunProcessingUseCase( // fpFailCandidate gets a fingerprint failure; others get a valid fingerprint
config, lockPort, candidatesPort, extractionPort); FingerprintPort mappedFingerprintPort = candidate -> {
if (candidate.uniqueIdentifier().equals("unreadable.pdf")) {
return new FingerprintTechnicalError("Cannot read", null);
}
return new FingerprintSuccess(makeFingerprint(candidate.uniqueIdentifier()));
};
TrackingM4DocumentProcessor m4Processor = new TrackingM4DocumentProcessor();
DefaultBatchRunProcessingUseCase useCase = buildUseCase(
config, lockPort, candidatesPort, extractionPort,
mappedFingerprintPort, m4Processor);
BatchRunContext context = new BatchRunContext(new RunId("mixed"), Instant.now()); BatchRunContext context = new BatchRunContext(new RunId("mixed"), Instant.now());
BatchRunOutcome outcome = useCase.execute(context); BatchRunOutcome outcome = useCase.execute(context);
assertTrue(outcome.isSuccess(), assertTrue(outcome.isSuccess(), "Mixed batch with all outcome types must yield batch SUCCESS");
"Mixed batch with all outcome types must yield batch SUCCESS"); // 5 candidates with successful fingerprint → M4 processor called 5 times
// 1 candidate with fingerprint failure → M4 processor NOT called
assertEquals(5, m4Processor.processCallCount(),
"M4 processor must be called for each candidate with a successful fingerprint");
// Extraction called for 5 candidates (not for fpFailCandidate)
assertEquals(5, extractionPort.callCount(), assertEquals(5, extractionPort.callCount(),
"Extraction must be attempted for each of the 5 candidates"); "Extraction must be attempted for each of the 5 candidates with a valid fingerprint");
} }
@Test @Test
@@ -312,21 +407,35 @@ class BatchRunProcessingUseCaseTest {
PdfExtractionSuccess success = new PdfExtractionSuccess("Invoice content", new PdfPageCount(2)); PdfExtractionSuccess success = new PdfExtractionSuccess("Invoice content", new PdfPageCount(2));
FixedCandidatesPort candidatesPort = new FixedCandidatesPort(candidates); FixedCandidatesPort candidatesPort = new FixedCandidatesPort(candidates);
FixedExtractionPort extractionPort = new FixedExtractionPort(success); FixedExtractionPort extractionPort = new FixedExtractionPort(success);
TrackingM4DocumentProcessor m4Processor = new TrackingM4DocumentProcessor();
DefaultBatchRunProcessingUseCase useCase = new DefaultBatchRunProcessingUseCase( DefaultBatchRunProcessingUseCase useCase = buildUseCase(
config, lockPort, candidatesPort, extractionPort); config, lockPort, candidatesPort, extractionPort,
new AlwaysSuccessFingerprintPort(), m4Processor);
BatchRunContext context = new BatchRunContext(new RunId("multi"), Instant.now()); BatchRunContext context = new BatchRunContext(new RunId("multi"), Instant.now());
BatchRunOutcome outcome = useCase.execute(context); BatchRunOutcome outcome = useCase.execute(context);
assertTrue(outcome.isSuccess(), "All three candidates processed should yield SUCCESS"); assertTrue(outcome.isSuccess(), "All three candidates processed should yield SUCCESS");
assertEquals(3, extractionPort.callCount(), "Extraction should be called once per candidate"); assertEquals(3, extractionPort.callCount(), "Extraction should be called once per candidate");
assertEquals(3, m4Processor.processCallCount(), "M4 processor should be called once per candidate");
} }
// ------------------------------------------------------------------------- // -------------------------------------------------------------------------
// Helpers // Helpers
// ------------------------------------------------------------------------- // -------------------------------------------------------------------------
private static DefaultBatchRunProcessingUseCase buildUseCase(
StartConfiguration config,
RunLockPort lockPort,
SourceDocumentCandidatesPort candidatesPort,
PdfTextExtractionPort extractionPort,
FingerprintPort fingerprintPort,
M4DocumentProcessor m4Processor) {
return new DefaultBatchRunProcessingUseCase(
config, lockPort, candidatesPort, extractionPort, fingerprintPort, m4Processor);
}
private static StartConfiguration buildConfig(Path tempDir) throws Exception { private static StartConfiguration buildConfig(Path tempDir) throws Exception {
Path sourceDir = Files.createDirectories(tempDir.resolve("source")); Path sourceDir = Files.createDirectories(tempDir.resolve("source"));
Path targetDir = Files.createDirectories(tempDir.resolve("target")); Path targetDir = Files.createDirectories(tempDir.resolve("target"));
@@ -357,6 +466,13 @@ class BatchRunProcessingUseCaseTest {
return new SourceDocumentCandidate(filename, 1024L, new SourceDocumentLocator("/tmp/" + filename)); return new SourceDocumentCandidate(filename, 1024L, new SourceDocumentLocator("/tmp/" + filename));
} }
/** Creates a deterministic fake fingerprint from a string (padded to 64 hex chars). */
private static DocumentFingerprint makeFingerprint(String seed) {
String hex = String.format("%064x", Math.abs(seed.hashCode()));
// Ensure exactly 64 lowercase hex chars
return new DocumentFingerprint(hex.substring(0, 64));
}
// ------------------------------------------------------------------------- // -------------------------------------------------------------------------
// Mock / Stub implementations // Mock / Stub implementations
// ------------------------------------------------------------------------- // -------------------------------------------------------------------------
@@ -480,4 +596,88 @@ class BatchRunProcessingUseCaseTest {
int callCount() { return calls; } int callCount() { return calls; }
} }
/**
* Fingerprint port that always returns a deterministic success based on the candidate's
* unique identifier.
*/
private static class AlwaysSuccessFingerprintPort implements FingerprintPort {
@Override
public FingerprintResult computeFingerprint(SourceDocumentCandidate candidate) {
String hex = String.format("%064x", Math.abs(candidate.uniqueIdentifier().hashCode()));
return new FingerprintSuccess(new DocumentFingerprint(hex.substring(0, 64)));
}
}
/**
* No-op M4DocumentProcessor that does nothing (for tests that only care about
* lock/batch lifecycle, not M4 persistence).
*/
private static class NoOpM4DocumentProcessor extends M4DocumentProcessor {
NoOpM4DocumentProcessor() {
super(new NoOpDocumentRecordRepository(), new NoOpProcessingAttemptRepository());
}
}
/**
* Tracking M4DocumentProcessor that counts how many times {@code process()} is called.
*/
private static class TrackingM4DocumentProcessor extends M4DocumentProcessor {
private int processCallCount = 0;
TrackingM4DocumentProcessor() {
super(new NoOpDocumentRecordRepository(), new NoOpProcessingAttemptRepository());
}
@Override
public void process(
de.gecheckt.pdf.umbenenner.domain.model.SourceDocumentCandidate candidate,
de.gecheckt.pdf.umbenenner.domain.model.DocumentFingerprint fingerprint,
de.gecheckt.pdf.umbenenner.domain.model.DocumentProcessingOutcome m3Outcome,
de.gecheckt.pdf.umbenenner.domain.model.BatchRunContext context,
java.time.Instant attemptStart) {
processCallCount++;
// Delegate to super so the real logic runs (with no-op repos)
super.process(candidate, fingerprint, m3Outcome, context, attemptStart);
}
int processCallCount() { return processCallCount; }
}
/** No-op DocumentRecordRepository for use in test M4DocumentProcessor instances. */
private static class NoOpDocumentRecordRepository implements DocumentRecordRepository {
@Override
public DocumentRecordLookupResult findByFingerprint(DocumentFingerprint fingerprint) {
// Return DocumentUnknown so the M4 processor always takes the "new document" path
return new DocumentUnknown();
}
@Override
public void create(DocumentRecord record) {
// No-op
}
@Override
public void update(DocumentRecord record) {
// No-op
}
}
/** No-op ProcessingAttemptRepository for use in test M4DocumentProcessor instances. */
private static class NoOpProcessingAttemptRepository implements ProcessingAttemptRepository {
@Override
public int loadNextAttemptNumber(DocumentFingerprint fingerprint) {
return 1;
}
@Override
public void save(ProcessingAttempt attempt) {
// No-op
}
@Override
public List<ProcessingAttempt> findAllByFingerprint(DocumentFingerprint fingerprint) {
return List.of();
}
}
} }

View File

@@ -5,16 +5,26 @@ import org.apache.logging.log4j.Logger;
import de.gecheckt.pdf.umbenenner.adapter.in.cli.SchedulerBatchCommand; import de.gecheckt.pdf.umbenenner.adapter.in.cli.SchedulerBatchCommand;
import de.gecheckt.pdf.umbenenner.adapter.out.configuration.PropertiesConfigurationPortAdapter; import de.gecheckt.pdf.umbenenner.adapter.out.configuration.PropertiesConfigurationPortAdapter;
import de.gecheckt.pdf.umbenenner.adapter.out.fingerprint.Sha256FingerprintAdapter;
import de.gecheckt.pdf.umbenenner.adapter.out.lock.FilesystemRunLockPortAdapter; import de.gecheckt.pdf.umbenenner.adapter.out.lock.FilesystemRunLockPortAdapter;
import de.gecheckt.pdf.umbenenner.adapter.out.pdfextraction.PdfTextExtractionPortAdapter; import de.gecheckt.pdf.umbenenner.adapter.out.pdfextraction.PdfTextExtractionPortAdapter;
import de.gecheckt.pdf.umbenenner.adapter.out.sourcedocument.SourceDocumentCandidatesPortAdapter; import de.gecheckt.pdf.umbenenner.adapter.out.sourcedocument.SourceDocumentCandidatesPortAdapter;
import de.gecheckt.pdf.umbenenner.adapter.out.sqlite.SqliteDocumentRecordRepositoryAdapter;
import de.gecheckt.pdf.umbenenner.adapter.out.sqlite.SqliteProcessingAttemptRepositoryAdapter;
import de.gecheckt.pdf.umbenenner.adapter.out.sqlite.SqliteSchemaInitializationAdapter;
import de.gecheckt.pdf.umbenenner.application.config.InvalidStartConfigurationException; import de.gecheckt.pdf.umbenenner.application.config.InvalidStartConfigurationException;
import de.gecheckt.pdf.umbenenner.application.config.StartConfiguration; import de.gecheckt.pdf.umbenenner.application.config.StartConfiguration;
import de.gecheckt.pdf.umbenenner.application.config.StartConfigurationValidator; import de.gecheckt.pdf.umbenenner.application.config.StartConfigurationValidator;
import de.gecheckt.pdf.umbenenner.application.port.in.BatchRunOutcome; import de.gecheckt.pdf.umbenenner.application.port.in.BatchRunOutcome;
import de.gecheckt.pdf.umbenenner.application.port.in.BatchRunProcessingUseCase; import de.gecheckt.pdf.umbenenner.application.port.in.BatchRunProcessingUseCase;
import de.gecheckt.pdf.umbenenner.application.port.out.ConfigurationPort; import de.gecheckt.pdf.umbenenner.application.port.out.ConfigurationPort;
import de.gecheckt.pdf.umbenenner.application.port.out.DocumentPersistenceException;
import de.gecheckt.pdf.umbenenner.application.port.out.DocumentRecordRepository;
import de.gecheckt.pdf.umbenenner.application.port.out.FingerprintPort;
import de.gecheckt.pdf.umbenenner.application.port.out.PersistenceSchemaInitializationPort;
import de.gecheckt.pdf.umbenenner.application.port.out.ProcessingAttemptRepository;
import de.gecheckt.pdf.umbenenner.application.port.out.RunLockPort; import de.gecheckt.pdf.umbenenner.application.port.out.RunLockPort;
import de.gecheckt.pdf.umbenenner.application.service.M4DocumentProcessor;
import de.gecheckt.pdf.umbenenner.application.usecase.DefaultBatchRunProcessingUseCase; import de.gecheckt.pdf.umbenenner.application.usecase.DefaultBatchRunProcessingUseCase;
import de.gecheckt.pdf.umbenenner.domain.model.BatchRunContext; import de.gecheckt.pdf.umbenenner.domain.model.BatchRunContext;
import de.gecheckt.pdf.umbenenner.domain.model.RunId; import de.gecheckt.pdf.umbenenner.domain.model.RunId;
@@ -29,20 +39,35 @@ import java.util.UUID;
* <p> * <p>
* Responsibilities: * Responsibilities:
* <ol> * <ol>
* <li>Load and validate the startup configuration</li> * <li>Load and validate the startup configuration.</li>
* <li>Resolve the run-lock file path (with default fallback)</li> * <li>Resolve the run-lock file path (with default fallback).</li>
* <li>Create and wire all ports and adapters</li> * <li>Initialise the SQLite schema (M4: before the batch document loop begins).</li>
* <li>Start the CLI adapter and execute the batch use case</li> * <li>Create and wire all ports and adapters, including the M4 persistence ports.</li>
* <li>Map the batch outcome to a process exit code</li> * <li>Start the CLI adapter and execute the batch use case.</li>
* <li>Map the batch outcome to a process exit code.</li>
* </ol> * </ol>
* <p> *
* Exit code semantics: * <h2>Exit code semantics</h2>
* <ul> * <ul>
* <li>{@code 0}: Batch run executed successfully; individual document failures do not * <li>{@code 0}: Batch run executed successfully; individual document failures do not
* change the exit code as long as the run itself completed without a hard infrastructure error.</li> * change the exit code as long as the run itself completed without a hard
* <li>{@code 1}: Hard start, bootstrap, or configuration failure that prevented the run * infrastructure error.</li>
* from beginning, or a critical infrastructure failure during the run.</li> * <li>{@code 1}: Hard start, bootstrap, configuration, or schema-initialisation failure
* that prevented the run from beginning, or a critical infrastructure failure
* during the run.</li>
* </ul> * </ul>
*
* <h2>M4 wiring</h2>
* <p>
* The production constructor wires the following M4 adapters:
* <ul>
* <li>{@link Sha256FingerprintAdapter} — SHA-256 content fingerprinting.</li>
* <li>{@link SqliteSchemaInitializationAdapter} — schema initialisation at startup.</li>
* <li>{@link SqliteDocumentRecordRepositoryAdapter} — document master record CRUD.</li>
* <li>{@link SqliteProcessingAttemptRepositoryAdapter} — attempt history CRUD.</li>
* </ul>
*
* @since M2 (extended in M4-AP-006)
*/ */
public class BootstrapRunner { public class BootstrapRunner {
@@ -83,7 +108,7 @@ public class BootstrapRunner {
* <p> * <p>
* Receives the already-loaded and validated {@link StartConfiguration} and run lock port. * Receives the already-loaded and validated {@link StartConfiguration} and run lock port.
* The factory is responsible for creating and wiring any additional outbound ports * The factory is responsible for creating and wiring any additional outbound ports
* required by the use case (e.g., source document port, PDF extraction port). * required by the use case (e.g., source document port, PDF extraction port, M4 ports).
*/ */
@FunctionalInterface @FunctionalInterface
public interface UseCaseFactory { public interface UseCaseFactory {
@@ -101,23 +126,41 @@ public class BootstrapRunner {
/** /**
* Creates the BootstrapRunner with default factories for production use. * Creates the BootstrapRunner with default factories for production use.
* <p> * <p>
* Wires the full processing pipeline: * Wires the full M4 processing pipeline:
* <ul> * <ul>
* <li>{@link PropertiesConfigurationPortAdapter} for configuration loading</li> * <li>{@link PropertiesConfigurationPortAdapter} for configuration loading.</li>
* <li>{@link FilesystemRunLockPortAdapter} for exclusive run locking</li> * <li>{@link FilesystemRunLockPortAdapter} for exclusive run locking.</li>
* <li>{@link SourceDocumentCandidatesPortAdapter} for PDF candidate discovery</li> * <li>{@link SourceDocumentCandidatesPortAdapter} for PDF candidate discovery.</li>
* <li>{@link PdfTextExtractionPortAdapter} for PDFBox-based text and page count extraction</li> * <li>{@link PdfTextExtractionPortAdapter} for PDFBox-based text and page count extraction.</li>
* <li>{@link Sha256FingerprintAdapter} for SHA-256 content fingerprinting.</li>
* <li>{@link SqliteDocumentRecordRepositoryAdapter} for document master record CRUD.</li>
* <li>{@link SqliteProcessingAttemptRepositoryAdapter} for attempt history CRUD.</li>
* </ul> * </ul>
* <p>
* Schema initialisation is performed in {@link #run()} before the use case is created,
* using {@link SqliteSchemaInitializationAdapter}.
*/ */
public BootstrapRunner() { public BootstrapRunner() {
this.configPortFactory = PropertiesConfigurationPortAdapter::new; this.configPortFactory = PropertiesConfigurationPortAdapter::new;
this.runLockPortFactory = FilesystemRunLockPortAdapter::new; this.runLockPortFactory = FilesystemRunLockPortAdapter::new;
this.validatorFactory = StartConfigurationValidator::new; this.validatorFactory = StartConfigurationValidator::new;
this.useCaseFactory = (config, lock) -> new DefaultBatchRunProcessingUseCase( this.useCaseFactory = (config, lock) -> {
String jdbcUrl = buildJdbcUrl(config);
FingerprintPort fingerprintPort = new Sha256FingerprintAdapter();
DocumentRecordRepository documentRecordRepository =
new SqliteDocumentRecordRepositoryAdapter(jdbcUrl);
ProcessingAttemptRepository processingAttemptRepository =
new SqliteProcessingAttemptRepositoryAdapter(jdbcUrl);
M4DocumentProcessor m4Processor =
new M4DocumentProcessor(documentRecordRepository, processingAttemptRepository);
return new DefaultBatchRunProcessingUseCase(
config, config,
lock, lock,
new SourceDocumentCandidatesPortAdapter(config.sourceFolder()), new SourceDocumentCandidatesPortAdapter(config.sourceFolder()),
new PdfTextExtractionPortAdapter()); new PdfTextExtractionPortAdapter(),
fingerprintPort,
m4Processor);
};
this.commandFactory = SchedulerBatchCommand::new; this.commandFactory = SchedulerBatchCommand::new;
} }
@@ -145,11 +188,17 @@ public class BootstrapRunner {
/** /**
* Runs the application startup sequence. * Runs the application startup sequence.
* <p> * <p>
* AP-003: Manually wires the object graph and invokes the CLI command. * M4 additions:
* AP-005: Wires ConfigurationPort adapter and passes it to the use case. * <ul>
* AP-006: Validates configuration before allowing processing to start. * <li>Derives the SQLite JDBC URL from the configured {@code sqlite.file} path.</li>
* <li>Initialises the M4 SQLite schema via
* {@link PersistenceSchemaInitializationPort#initializeSchema()} before the
* batch document loop begins. A schema initialisation failure aborts the run
* with exit code&nbsp;1.</li>
* </ul>
* *
* @return exit code: 0 for success, 1 for invalid configuration or unexpected failure * @return exit code: 0 for success, 1 for invalid configuration, schema failure,
* or unexpected bootstrap failure
*/ */
public int run() { public int run() {
LOG.info("Bootstrap flow started."); LOG.info("Bootstrap flow started.");
@@ -160,61 +209,105 @@ public class BootstrapRunner {
// Step 2: Load configuration // Step 2: Load configuration
var config = configPort.loadConfiguration(); var config = configPort.loadConfiguration();
// Step 3: Validate configuration (AP-006) // Step 3: Validate configuration
StartConfigurationValidator validator = validatorFactory.create(); StartConfigurationValidator validator = validatorFactory.create();
validator.validate(config); validator.validate(config);
// Step 4: Resolve lock file path apply default if not configured (AP-006) // Step 4: Resolve lock file path apply default if not configured
Path lockFilePath = config.runtimeLockFile(); Path lockFilePath = config.runtimeLockFile();
if (lockFilePath == null || lockFilePath.toString().isBlank()) { if (lockFilePath == null || lockFilePath.toString().isBlank()) {
lockFilePath = Paths.get("pdf-umbenenner.lock"); lockFilePath = Paths.get("pdf-umbenenner.lock");
LOG.info("runtime.lock.file not configured, using default lock path: {}", lockFilePath.toAbsolutePath()); LOG.info("runtime.lock.file not configured, using default lock path: {}",
lockFilePath.toAbsolutePath());
} }
RunLockPort runLockPort = runLockPortFactory.create(lockFilePath); RunLockPort runLockPort = runLockPortFactory.create(lockFilePath);
// Step 5: Create the batch run context // Step 5 (M4): Initialise the SQLite schema before the batch loop begins.
// Generate a unique run ID and initialize the run context // A failure here is a hard start error → exit code 1.
initializeSchema(config);
// Step 6: Create the batch run context
RunId runId = new RunId(UUID.randomUUID().toString()); RunId runId = new RunId(UUID.randomUUID().toString());
BatchRunContext runContext = new BatchRunContext(runId, Instant.now()); BatchRunContext runContext = new BatchRunContext(runId, Instant.now());
LOG.info("Batch run started. RunId: {}", runId); LOG.info("Batch run started. RunId: {}", runId);
// Step 6: Create the use case with the validated config and run lock (application layer). // Step 7: Create the use case with the validated config and run lock.
// Config is passed directly; the use case does not re-read the properties file. // Config is passed directly; the use case does not re-read the properties file.
// Adapters (source document port, PDF extraction port) are wired by the factory. // Adapters (source document port, PDF extraction port, M4 ports) are wired by the factory.
BatchRunProcessingUseCase useCase = useCaseFactory.create(config, runLockPort); BatchRunProcessingUseCase useCase = useCaseFactory.create(config, runLockPort);
// Step 7: Create the CLI command adapter with the use case // Step 8: Create the CLI command adapter with the use case
SchedulerBatchCommand command = commandFactory.create(useCase); SchedulerBatchCommand command = commandFactory.create(useCase);
// Step 8: Execute the command with the run context and handle the outcome // Step 9: Execute the command with the run context and handle the outcome
BatchRunOutcome outcome = command.run(runContext); BatchRunOutcome outcome = command.run(runContext);
// Mark run as completed (AP-003) // Mark run as completed
runContext.setEndInstant(Instant.now()); runContext.setEndInstant(Instant.now());
if (outcome.isSuccess()) { if (outcome.isSuccess()) {
LOG.info("Batch run completed successfully. RunId: {}", runContext.runId()); LOG.info("Batch run completed successfully. RunId: {}", runContext.runId());
return 0; return 0;
} else if (outcome.isLockUnavailable()) { } else if (outcome.isLockUnavailable()) {
LOG.warn("Batch run aborted: another instance is already running. RunId: {}", runContext.runId()); LOG.warn("Batch run aborted: another instance is already running. RunId: {}",
runContext.runId());
return 1; return 1;
} else { } else {
LOG.error("Batch run failed. RunId: {}", runContext.runId()); LOG.error("Batch run failed. RunId: {}", runContext.runId());
return 1; return 1;
} }
} catch (InvalidStartConfigurationException e) { } catch (InvalidStartConfigurationException e) {
// Controlled failure for invalid configuration - log clearly without stack trace // Controlled failure for invalid configuration log clearly without stack trace
LOG.error("Configuration validation failed: {}", e.getMessage()); LOG.error("Configuration validation failed: {}", e.getMessage());
return 1; return 1;
} catch (IllegalStateException e) { } catch (IllegalStateException e) {
// Configuration loading failed due to missing/invalid required properties // Configuration loading failed due to missing/invalid required properties
// Treat as invalid configuration for controlled failure
LOG.error("Configuration loading failed: {}", e.getMessage()); LOG.error("Configuration loading failed: {}", e.getMessage());
return 1; return 1;
} catch (DocumentPersistenceException e) {
// Schema initialisation failed hard start error
LOG.error("SQLite schema initialisation failed: {}", e.getMessage(), e);
return 1;
} catch (Exception e) { } catch (Exception e) {
LOG.error("Bootstrap failure during startup.", e); LOG.error("Bootstrap failure during startup.", e);
return 1; return 1;
} }
} }
/**
 * Initialises the M4 SQLite schema using the configured SQLite file path.
 * <p>
 * This method is called once at startup, before the batch document loop begins.
 * It uses the production {@link SqliteSchemaInitializationAdapter} directly because
 * schema initialisation is a startup concern, not a per-document concern, and the
 * {@link UseCaseFactory} abstraction is not the right place for it.
 * <p>
 * If the {@code sqlite.file} configuration is null or blank, schema initialisation
 * is skipped with a warning. This allows the existing test infrastructure (which
 * uses the custom {@link UseCaseFactory}) to continue working without a real SQLite
 * file.
 *
 * @param config the validated startup configuration
 * @throws DocumentPersistenceException if schema initialisation fails
 */
private void initializeSchema(StartConfiguration config) {
    // Guard matches the documented contract ("null or blank") and mirrors the
    // null-or-blank check used when resolving runtime.lock.file in run().
    // The original code only tested for null, so a blank sqlite.file would have
    // produced a malformed "jdbc:sqlite:" URL instead of being skipped.
    if (config.sqliteFile() == null || config.sqliteFile().toString().isBlank()) {
        LOG.warn("sqlite.file not configured  skipping schema initialisation.");
        return;
    }
    String jdbcUrl = buildJdbcUrl(config);
    PersistenceSchemaInitializationPort schemaPort = new SqliteSchemaInitializationAdapter(jdbcUrl);
    schemaPort.initializeSchema();
    LOG.info("M4 SQLite schema initialised at: {}", jdbcUrl);
}
/**
 * Builds the JDBC URL for the SQLite database from the configured file path.
 * <p>
 * Backslashes are normalised to forward slashes so that Windows paths also
 * yield a well-formed URL.
 *
 * @param config the startup configuration containing the SQLite file path
 * @return the JDBC URL in the form {@code jdbc:sqlite:/path/to/file.db}
 */
static String buildJdbcUrl(StartConfiguration config) {
    String absolutePath = config.sqliteFile().toAbsolutePath().toString();
    return "jdbc:sqlite:" + absolutePath.replace('\\', '/');
}
} }