fix: cold-start 599 on DirSize/MD5 — restart task instead of giving up

DSM's DirSize and MD5 background service needs ~6-8 s to initialise
after a period of inactivity. During this cold-start window tasks are
registered but every status poll returns error 599 ("no such task").

Replace the ad-hoc start+_poll_task call in dir_size and get_md5 with
a new _start_and_poll_oneshot helper that:
- polls with exponential backoff (0.2 s → cap 2 s)
- restarts the task after 5 consecutive 599s (up to 6 attempts in total)
- honours a shared 60 s wall-clock budget across all restarts
- returns a clear error if all restart attempts are exhausted

Root cause confirmed by test_dirsize_md5.py: after ~6 s / 2 restarts
the service warms up and the very first poll on the new task succeeds.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-14 14:49:32 +02:00
parent 8b2f07d9c3
commit 451ee7116f
4 changed files with 125 additions and 38 deletions
+29 -1
View File
@@ -1636,7 +1636,7 @@ async def test_dir_size_retries_on_transient_599(config: AppConfig) -> None:
@pytest.mark.asyncio
async def test_dir_size_fails_after_5_consecutive_599(config: AppConfig) -> None:
"""dir_size gives up and returns Error: after 5 consecutive 599 responses."""
"""dir_size gives up and returns Error: after exhausting all restart attempts."""
client = MagicMock()
async def _request(api, method, version=None, params=None, **kwargs):
@@ -1653,6 +1653,34 @@ async def test_dir_size_fails_after_5_consecutive_599(config: AppConfig) -> None
assert result.startswith("Error:")
@pytest.mark.asyncio
async def test_dir_size_cold_start_restart(config: AppConfig) -> None:
    """Check that dir_size restarts its task after a cold-start 599 streak.

    The fake client hands out a fresh task id for every ``start`` request and
    treats every other request as a status poll.  The first five polls raise
    error 599; any later poll reports a finished task.  The test expects the
    rendered result to contain the size summary and exactly two ``start``
    requests to have been issued.
    """
    starts = 0
    polls = 0

    async def _fake_request(api, method, version=None, params=None, **kwargs):
        nonlocal starts, polls
        if method == "start":
            starts += 1
            return {"taskid": f"task_{starts}"}

        polls += 1
        if polls > 5:
            # Past the simulated cold-start window: report completion at once.
            return {
                "finished": True,
                "num_dir": 1,
                "num_file": 5,
                "total_size": 1024,
            }
        # Still inside the simulated cold-start window.
        raise SynologyError("DSM error code 599", code=599)

    client = MagicMock()
    client.request = AsyncMock(side_effect=_fake_request)
    tools = _make_mcp_and_tools(config, client)

    # Patch out sleeping so the poll/backoff loop runs instantly.
    with patch("asyncio.sleep", new_callable=AsyncMock):
        result = await tools["dir_size"](path="/coldstart")

    assert "Total Size" in result
    assert starts == 2  # task was restarted once after cold-start 599s
# ──────────────────────────────────────────────────────────────────────────
# get_md5
# ──────────────────────────────────────────────────────────────────────────