From 18f9c33bbb92965fbf677cc833608fe927b8845f Mon Sep 17 00:00:00 2001 From: Marcus van Elst Date: Mon, 4 May 2026 15:08:01 +0200 Subject: [PATCH] #78: NO_USABLE_TEXT (Foto-PDF) finalisiert sofort zu FAILED_FINAL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bisher wurde NO_USABLE_TEXT (kein OCR-Text im PDF) wie alle anderen deterministischen Inhaltsfehler mit der 1-Retry-Regel behandelt und landete beim ersten Auftreten in FAILED_RETRYABLE. Da ein Bild-Scan ohne OCR-Text sich zwischen Läufen nicht verändert, ist ein Wiederholversuch sinnlos – der Status muss sofort FAILED_FINAL sein. Geändert: ProcessingOutcomeTransition erkennt NO_USABLE_TEXT als Sonderfall und liefert ohne Retry-Prüfung FAILED_FINAL. PAGE_LIMIT_EXCEEDED und CONTENT_NOT_EXTRACTABLE behalten die 1-Retry-Regel. Tests angepasst: Bestehende Tests, die FAILED_RETRYABLE für NO_USABLE_TEXT erwarteten, wurden auf das korrekte Verhalten umgestellt oder auf PAGE_LIMIT_EXCEEDED umgeschrieben. Neue Lifecycle-Tests für NO_USABLE_TEXT (sofort FAILED_FINAL → SKIPPED_FINAL_FAILURE) hinzugefügt. Co-Authored-By: Claude Sonnet 4.6 --- .../service/ProcessingOutcomeTransition.java | 20 ++++-- .../DocumentProcessingCoordinatorTest.java | 69 +++++++++++++++++-- .../ProcessingOutcomeTransitionTest.java | 24 +++++-- .../domain/model/PreCheckFailureReason.java | 5 +- 4 files changed, 100 insertions(+), 18 deletions(-) diff --git a/pdf-umbenenner-application/src/main/java/de/gecheckt/pdf/umbenenner/application/service/ProcessingOutcomeTransition.java b/pdf-umbenenner-application/src/main/java/de/gecheckt/pdf/umbenenner/application/service/ProcessingOutcomeTransition.java index 8897f00..223c0bd 100644 --- a/pdf-umbenenner-application/src/main/java/de/gecheckt/pdf/umbenenner/application/service/ProcessingOutcomeTransition.java +++ b/pdf-umbenenner-application/src/main/java/de/gecheckt/pdf/umbenenner/application/service/ProcessingOutcomeTransition.java @@ -6,6 +6,7 @@ import de.gecheckt.pdf.umbenenner.domain.model.AiTechnicalFailure; import de.gecheckt.pdf.umbenenner.domain.model.DocumentProcessingOutcome; import de.gecheckt.pdf.umbenenner.domain.model.NamingProposalReady; import de.gecheckt.pdf.umbenenner.domain.model.PreCheckFailed; +import de.gecheckt.pdf.umbenenner.domain.model.PreCheckFailureReason; import de.gecheckt.pdf.umbenenner.domain.model.ProcessingStatus; import de.gecheckt.pdf.umbenenner.domain.model.TechnicalDocumentError; @@ -26,10 +27,14 @@ import de.gecheckt.pdf.umbenenner.domain.model.TechnicalDocumentError; *
  • Naming proposal ready: Status becomes * {@link ProcessingStatus#PROPOSAL_READY}, counters unchanged, * {@code retryable=false}.
  • - *
  • Pre-check content error (first occurrence): + *
  • Pre-check content error {@link PreCheckFailureReason#NO_USABLE_TEXT}: + * Status becomes {@link ProcessingStatus#FAILED_FINAL} immediately, + * content error counter incremented by 1, {@code retryable=false}. + * Image-only PDFs without OCR text will not yield usable text on retry.
  • + *
  • Pre-check content error (other reason, first occurrence): * Status becomes {@link ProcessingStatus#FAILED_RETRYABLE}, * content error counter incremented by 1, {@code retryable=true}.
  • - *
  • Pre-check content error (second or later occurrence): + *
  • Pre-check content error (other reason, second or later occurrence): * Status becomes {@link ProcessingStatus#FAILED_FINAL}, * content error counter incremented by 1, {@code retryable=false}.
  • *
  • AI functional failure (first occurrence): @@ -112,11 +117,16 @@ final class ProcessingOutcomeTransition { ); } - case PreCheckFailed ignored2 -> { - // Deterministic content error from pre-check: apply the 1-retry rule + case PreCheckFailed preCheckFailed -> { FailureCounters updatedCounters = existingCounters.withIncrementedContentErrorCount(); - boolean isFirstOccurrence = existingCounters.contentErrorCount() == 0; + if (preCheckFailed.failureReason() == PreCheckFailureReason.NO_USABLE_TEXT) { + // Image-only PDFs without OCR text will not change on retry. + yield new ProcessingOutcome(ProcessingStatus.FAILED_FINAL, updatedCounters, false); + } + + // Other deterministic content errors: apply the 1-retry rule + boolean isFirstOccurrence = existingCounters.contentErrorCount() == 0; if (isFirstOccurrence) { yield new ProcessingOutcome(ProcessingStatus.FAILED_RETRYABLE, updatedCounters, true); } else { diff --git a/pdf-umbenenner-application/src/test/java/de/gecheckt/pdf/umbenenner/application/service/DocumentProcessingCoordinatorTest.java b/pdf-umbenenner-application/src/test/java/de/gecheckt/pdf/umbenenner/application/service/DocumentProcessingCoordinatorTest.java index d9e4507..5907807 100644 --- a/pdf-umbenenner-application/src/test/java/de/gecheckt/pdf/umbenenner/application/service/DocumentProcessingCoordinatorTest.java +++ b/pdf-umbenenner-application/src/test/java/de/gecheckt/pdf/umbenenner/application/service/DocumentProcessingCoordinatorTest.java @@ -154,13 +154,36 @@ class DocumentProcessingCoordinatorTest { } @Test - void process_newDocument_firstContentError_persistsFailedRetryable_contentCounterOne() { + void process_newDocument_noUsableText_persistsFailedFinal_contentCounterOne() { + // NO_USABLE_TEXT (image-only PDF) finalises immediately — no retry. recordRepo.setLookupResult(new DocumentUnknown()); DocumentProcessingOutcome outcome = new PreCheckFailed( candidate, PreCheckFailureReason.NO_USABLE_TEXT); processor.process(candidate, fingerprint, outcome, context, attemptStart); + assertEquals(1, attemptRepo.savedAttempts.size()); + ProcessingAttempt attempt = attemptRepo.savedAttempts.get(0); + assertEquals(ProcessingStatus.FAILED_FINAL, attempt.status()); + assertFalse(attempt.retryable()); + + assertEquals(1, recordRepo.createdRecords.size()); + DocumentRecord record = recordRepo.createdRecords.get(0); + assertEquals(ProcessingStatus.FAILED_FINAL, record.overallStatus()); + assertEquals(1, record.failureCounters().contentErrorCount()); + assertEquals(0, record.failureCounters().transientErrorCount()); + assertNotNull(record.lastFailureInstant()); + assertNull(record.lastSuccessInstant()); + } + + @Test + void process_newDocument_firstPageLimitExceeded_persistsFailedRetryable_contentCounterOne() { + recordRepo.setLookupResult(new DocumentUnknown()); + DocumentProcessingOutcome outcome = new PreCheckFailed( + candidate, PreCheckFailureReason.PAGE_LIMIT_EXCEEDED); + + processor.process(candidate, fingerprint, outcome, context, attemptStart); + assertEquals(1, attemptRepo.savedAttempts.size()); ProcessingAttempt attempt = attemptRepo.savedAttempts.get(0); assertEquals(ProcessingStatus.FAILED_RETRYABLE, attempt.status()); @@ -1191,17 +1214,18 @@ class DocumentProcessingCoordinatorTest { // ------------------------------------------------------------------------- @Test - void process_contentErrorLifecycle_firstRunRetryable_secondRunFinal_thirdRunSkipped() { - // Run 1: new document, first deterministic content error → FAILED_RETRYABLE + void process_contentErrorLifecycle_pageLimitExceeded_firstRunRetryable_secondRunFinal_thirdRunSkipped() { + // PAGE_LIMIT_EXCEEDED follows the 1-retry rule: first run → FAILED_RETRYABLE, second → FAILED_FINAL. recordRepo.setLookupResult(new DocumentUnknown()); DocumentProcessingOutcome contentError = new PreCheckFailed( - candidate, PreCheckFailureReason.NO_USABLE_TEXT); + candidate, PreCheckFailureReason.PAGE_LIMIT_EXCEEDED); + // Run 1: new document, first content error → FAILED_RETRYABLE processor.process(candidate, fingerprint, contentError, context, attemptStart); DocumentRecord afterRun1 = recordRepo.createdRecords.get(0); assertEquals(ProcessingStatus.FAILED_RETRYABLE, afterRun1.overallStatus(), - "First content error must yield FAILED_RETRYABLE"); + "First PAGE_LIMIT_EXCEEDED must yield FAILED_RETRYABLE"); assertEquals(1, afterRun1.failureCounters().contentErrorCount()); assertTrue(attemptRepo.savedAttempts.get(0).retryable(), "First content error attempt must be retryable"); @@ -1236,6 +1260,36 @@ class DocumentProcessingCoordinatorTest { "Transient error counter must remain 0 after a SKIPPED_FINAL_FAILURE event"); } + @Test + void process_contentErrorLifecycle_noUsableText_immediatelyFinal_secondRunSkipped() { + // NO_USABLE_TEXT (image-only PDF): first run is immediately FAILED_FINAL, second is skipped. + recordRepo.setLookupResult(new DocumentUnknown()); + DocumentProcessingOutcome noTextError = new PreCheckFailed( + candidate, PreCheckFailureReason.NO_USABLE_TEXT); + + // Run 1: new document → FAILED_FINAL immediately + processor.process(candidate, fingerprint, noTextError, context, attemptStart); + + DocumentRecord afterRun1 = recordRepo.createdRecords.get(0); + assertEquals(ProcessingStatus.FAILED_FINAL, afterRun1.overallStatus(), + "NO_USABLE_TEXT must yield FAILED_FINAL immediately"); + assertEquals(1, afterRun1.failureCounters().contentErrorCount()); + assertFalse(attemptRepo.savedAttempts.get(0).retryable()); + + // Run 2: terminal FAILED_FINAL → SKIPPED_FINAL_FAILURE; counters must not change + recordRepo.setLookupResult(new DocumentTerminalFinalFailure(afterRun1)); + + processor.process(candidate, fingerprint, noTextError, context, attemptStart); + + assertEquals(2, attemptRepo.savedAttempts.size()); + ProcessingAttempt skipAttempt = attemptRepo.savedAttempts.get(1); + assertEquals(ProcessingStatus.SKIPPED_FINAL_FAILURE, skipAttempt.status()); + + DocumentRecord afterRun2 = recordRepo.updatedRecords.get(0); + assertEquals(1, afterRun2.failureCounters().contentErrorCount(), + "Content error counter must remain 1 after SKIPPED_FINAL_FAILURE"); + } + @Test void process_transientErrorLifecycle_maxRetriesTransient2_firstRetryable_secondFinal() { // maxRetriesTransient=2: first transient error → FAILED_RETRYABLE, second → FAILED_FINAL @@ -1594,8 +1648,9 @@ class DocumentProcessingCoordinatorTest { @Test void process_firstContentError_retryDecisionLog_containsFingerprintAndFailedRetryable() { - // Proves that the retry decision log for a first deterministic content error contains + // Proves that the retry decision log for a first retryable content error contains // both the document fingerprint and the FAILED_RETRYABLE classification. + // Uses PAGE_LIMIT_EXCEEDED which follows the 1-retry rule. MessageCapturingProcessingLogger capturingLogger = new MessageCapturingProcessingLogger(); DocumentProcessingCoordinator coordinatorWithCapturing = new DocumentProcessingCoordinator(recordRepo, attemptRepo, unitOfWorkPort, @@ -1604,7 +1659,7 @@ class DocumentProcessingCoordinatorTest { recordRepo.setLookupResult(new DocumentUnknown()); coordinatorWithCapturing.process(candidate, fingerprint, - new PreCheckFailed(candidate, PreCheckFailureReason.NO_USABLE_TEXT), + new PreCheckFailed(candidate, PreCheckFailureReason.PAGE_LIMIT_EXCEEDED), context, attemptStart); assertTrue(capturingLogger.anyWarnContains(FINGERPRINT_HEX), diff --git a/pdf-umbenenner-application/src/test/java/de/gecheckt/pdf/umbenenner/application/service/ProcessingOutcomeTransitionTest.java b/pdf-umbenenner-application/src/test/java/de/gecheckt/pdf/umbenenner/application/service/ProcessingOutcomeTransitionTest.java index 1d59db6..66bc40f 100644 --- a/pdf-umbenenner-application/src/test/java/de/gecheckt/pdf/umbenenner/application/service/ProcessingOutcomeTransitionTest.java +++ b/pdf-umbenenner-application/src/test/java/de/gecheckt/pdf/umbenenner/application/service/ProcessingOutcomeTransitionTest.java @@ -103,13 +103,28 @@ class ProcessingOutcomeTransitionTest { // ------------------------------------------------------------------------- @Test - void forNewDocument_firstPreCheckFailed_returnsFailedRetryable_contentCounterOne() { + void forNewDocument_noUsableText_immediatelyFailedFinal_noRetry() { PreCheckFailed outcome = new PreCheckFailed(candidate(), PreCheckFailureReason.NO_USABLE_TEXT); ProcessingOutcomeTransition.ProcessingOutcome result = ProcessingOutcomeTransition.forNewDocument(outcome, LIMIT_1); - assertEquals(ProcessingStatus.FAILED_RETRYABLE, result.overallStatus()); + assertEquals(ProcessingStatus.FAILED_FINAL, result.overallStatus(), + "NO_USABLE_TEXT must finalise immediately without retry"); + assertFalse(result.retryable()); + assertEquals(1, result.counters().contentErrorCount()); + assertEquals(0, result.counters().transientErrorCount()); + } + + @Test + void forNewDocument_firstPageLimitExceeded_returnsFailedRetryable_contentCounterOne() { + PreCheckFailed outcome = new PreCheckFailed(candidate(), PreCheckFailureReason.PAGE_LIMIT_EXCEEDED); + + ProcessingOutcomeTransition.ProcessingOutcome result = + ProcessingOutcomeTransition.forNewDocument(outcome, LIMIT_1); + + assertEquals(ProcessingStatus.FAILED_RETRYABLE, result.overallStatus(), + "PAGE_LIMIT_EXCEEDED first occurrence must be retryable"); assertTrue(result.retryable()); assertEquals(1, result.counters().contentErrorCount()); assertEquals(0, result.counters().transientErrorCount()); @@ -149,9 +164,10 @@ class ProcessingOutcomeTransitionTest { @Test void forNewDocument_contentError_transientCounterIsIrrelevant() { - PreCheckFailed outcome = new PreCheckFailed(candidate(), PreCheckFailureReason.NO_USABLE_TEXT); + // PAGE_LIMIT_EXCEEDED is used here: it follows the 1-retry rule, and a non-zero + // transient counter must not influence the content-error decision. + PreCheckFailed outcome = new PreCheckFailed(candidate(), PreCheckFailureReason.PAGE_LIMIT_EXCEEDED); - // Counter before: 0 content errors (first occurrence), transient ignored ProcessingOutcomeTransition.ProcessingOutcome result = ProcessingOutcomeTransition.forKnownDocument( outcome, new FailureCounters(0, 5), LIMIT_1); diff --git a/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PreCheckFailureReason.java b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PreCheckFailureReason.java index c4c2351..9423b55 100644 --- a/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PreCheckFailureReason.java +++ b/pdf-umbenenner-domain/src/main/java/de/gecheckt/pdf/umbenenner/domain/model/PreCheckFailureReason.java @@ -21,9 +21,10 @@ public enum PreCheckFailureReason { * The extracted PDF text, after normalization, contains no letters or digits. *

    * This is a deterministic content error: reprocessing the same file in a later run - * will have the same outcome unless the source file is changed. + * will have the same outcome unless the source file is changed (e.g. by adding OCR). *

    - * Retry logic: exactly 1 retry in a later batch run. + * Retry logic: no retry — the document is immediately finalised to + * {@link ProcessingStatus#FAILED_FINAL}. */ NO_USABLE_TEXT("No usable text in extracted PDF content"),