fix: cold-start 599 on DirSize/MD5 — restart task instead of giving up

DSM's DirSize and MD5 background service needs ~6-8 s to initialise
after a period of inactivity. During this cold-start window tasks are
registered but every status poll returns error 599 ("no such task").

Replace the ad-hoc start+_poll_task call in dir_size and get_md5 with
a new _start_and_poll_oneshot helper that:
- polls with exponential backoff (0.2 s → cap 2 s)
- on 5 consecutive 599s: restarts the task (up to 6 attempts total)
- honours a shared 60 s wall-clock budget across all restarts
- returns a clear error if all restart attempts are exhausted

Root cause confirmed by test_dirsize_md5.py: after ~6 s / 2 restarts
the service warms up and the very first poll on the new task succeeds.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-14 14:49:32 +02:00
parent 8b2f07d9c3
commit 451ee7116f
4 changed files with 125 additions and 38 deletions
+1 -1
View File
@@ -1,3 +1,3 @@
"""MCP server for Synology FileStation."""
__version__ = "0.2.8"
__version__ = "0.2.9"
@@ -59,7 +59,88 @@ def register_filestation(
client: FileStationClient for DSM API calls.
"""
# ── internal polling helper ──────────────────────────────────────────
# ── internal polling helpers ──────────────────────────────────────────
async def _start_and_poll_oneshot(
    api: str,
    start_params: dict[str, Any],
    start_version: int,
    poll_version: int,
) -> tuple[bool, dict[str, Any] | str]:
    """Start a one-shot DSM task and poll until finished, restarting on cold-start 599s.

    DirSize and MD5 are "one-shot" tasks: DSM delivers ``finished=True`` exactly
    once, then discards the result. Additionally, the DSM background service for
    these tasks occasionally needs a few seconds to initialise after a period of
    inactivity ("cold start"). During cold start the service registers task IDs
    but returns error 599 on every status poll. The correct recovery is to restart
    the task once the service has had time to wake up.

    Polling uses exponential backoff (0.2 s, doubling, capped at 2 s). After
    five consecutive 599 responses the task is started again, up to six starts
    in total. All attempts share one 60-second budget that is measured with a
    monotonic clock, so time spent waiting on DSM requests counts against it
    as well as time spent sleeping between polls.

    Args:
        api: DSM API name (e.g. "SYNO.FileStation.DirSize").
        start_params: Parameters forwarded to the ``start`` method.
        start_version: API version for the ``start`` call.
        poll_version: API version for the ``status`` call.

    Returns:
        ``(True, status_dict)`` on success, or ``(False, "Error: …")`` on
        DSM error, timeout, or when every restart attempt was used up.
    """
    import time

    from mcp_synology_filestation.client import SynologyError as _SynologyError

    max_restarts = 6
    max_consecutive_599 = 5
    timeout = 60.0
    # Real deadline instead of a sum of sleep delays: request latency must
    # count too, otherwise a slow NAS can exceed the budget unnoticed.
    deadline = time.monotonic() + timeout

    for _attempt in range(max_restarts):
        try:
            start_data = await client.request(
                api, "start", version=start_version, params=start_params
            )
        except _SynologyError as e:
            return False, f"Error: {e}"

        taskid: str = start_data.get("taskid", "")
        if not taskid:
            return False, "Error: DSM did not return a task ID."

        # Poll with exponential backoff; restart on 5 consecutive 599s
        delay = 0.2
        consecutive_599 = 0
        while True:
            try:
                status_data = await client.request(
                    api, "status", version=poll_version, params={"taskid": taskid}
                )
            except _SynologyError as e:
                # Anything other than "no such task" is a genuine failure.
                if e.code != 599:
                    return False, f"Error: {e}"
                consecutive_599 += 1
                if consecutive_599 >= max_consecutive_599:
                    # 5× 599 in a row: either cold start or missed result window.
                    # Restart the task so DSM can re-queue it.
                    break
            else:
                consecutive_599 = 0
                if status_data.get("finished"):
                    return True, status_data
                # Still running — keep polling

            remaining = deadline - time.monotonic()
            if remaining <= 0:
                return (
                    False,
                    "Error: Operation timed out after 60 seconds — check NAS manually.",
                )

            # Never sleep past the deadline.
            await asyncio.sleep(min(delay, remaining))
            delay = min(delay * 2, 2.0)

    return (
        False,
        "Error: DSM did not return results after multiple retries"
        " (service may be starting up — try again in a moment).",
    )
async def _poll_task(
api: str,
@@ -804,27 +885,16 @@ def register_filestation(
async def dir_size(path: str):
"""Get total size, file count and folder count for one or more directories.
path: comma-separated share-relative paths."""
from mcp_synology_filestation.client import SynologyError
paths = [p.strip() for p in path.split(",") if p.strip()]
if not paths:
return "Error: no path provided."
try:
start_data = await client.request(
"SYNO.FileStation.DirSize",
"start",
version=2,
params={"path": json.dumps(paths)},
)
except SynologyError as e:
return f"Error: {e}"
taskid: str = start_data.get("taskid", "")
if not taskid:
return "Error: DSM did not return a task ID."
ok, result = await _poll_task("SYNO.FileStation.DirSize", 1, taskid, initial_delay=0.0)
ok, result = await _start_and_poll_oneshot(
"SYNO.FileStation.DirSize",
start_params={"path": json.dumps(paths)},
start_version=2,
poll_version=1,
)
if not ok:
return result # type: ignore[return-value]
@@ -869,23 +939,12 @@ def register_filestation(
@mcp.tool()
async def get_md5(path: str):
"""Compute the MD5 checksum of a file on the NAS. path: share-relative file path."""
from mcp_synology_filestation.client import SynologyError
try:
start_data = await client.request(
"SYNO.FileStation.MD5",
"start",
version=2,
params={"file_path": json.dumps(path)},
)
except SynologyError as e:
return f"Error: {e}"
taskid: str = start_data.get("taskid", "")
if not taskid:
return "Error: DSM did not return a task ID."
ok, result = await _poll_task("SYNO.FileStation.MD5", 1, taskid, initial_delay=0.0)
ok, result = await _start_and_poll_oneshot(
"SYNO.FileStation.MD5",
start_params={"file_path": json.dumps(path)},
start_version=2,
poll_version=1,
)
if not ok:
return result # type: ignore[return-value]