fix: cold-start 599 on DirSize/MD5 — restart task instead of giving up

DSM's DirSize and MD5 background service needs ~6-8 s to initialise
after a period of inactivity. During this cold-start window tasks are
registered but every status poll returns error 599 ("no such task").

Replace the ad-hoc start+_poll_task call in dir_size and get_md5 with
a new _start_and_poll_oneshot helper that:
- polls with exponential backoff (0.2 s → cap 2 s)
- on 5 consecutive 599s: restarts the task (up to 6 attempts total)
- honours a shared 60 s wall-clock budget across all restarts
- returns a clear error if all restart attempts are exhausted

Root cause confirmed by test_dirsize_md5.py: after ~6 s / 2 restarts
the service warms up and the very first poll on the new task succeeds.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-14 14:49:32 +02:00
parent 8b2f07d9c3
commit 451ee7116f
4 changed files with 125 additions and 38 deletions
+1 -1
View File
@@ -1,6 +1,6 @@
[project] [project]
name = "mcp-synology-filestation" name = "mcp-synology-filestation"
version = "0.2.8" version = "0.2.9"
description = "MCP server for Synology FileStation" description = "MCP server for Synology FileStation"
requires-python = ">=3.12" requires-python = ">=3.12"
dependencies = [ dependencies = [
+1 -1
View File
@@ -1,3 +1,3 @@
"""MCP server for Synology FileStation.""" """MCP server for Synology FileStation."""
__version__ = "0.2.8" __version__ = "0.2.9"
@@ -59,7 +59,88 @@ def register_filestation(
client: FileStationClient for DSM API calls. client: FileStationClient for DSM API calls.
""" """
# ── internal polling helper ────────────────────────────────────────── # ── internal polling helpers ──────────────────────────────────────────
async def _start_and_poll_oneshot(
    api: str,
    start_params: dict[str, Any],
    start_version: int,
    poll_version: int,
) -> tuple[bool, dict[str, Any] | str]:
    """Start a one-shot DSM task and poll until finished, restarting on cold-start 599s.

    DirSize and MD5 are "one-shot" tasks: DSM delivers ``finished=True`` exactly
    once, then discards the result. Additionally, the DSM background service for
    these tasks occasionally needs a few seconds to initialise after a period of
    inactivity ("cold start"). During cold start the service registers task IDs
    but returns error 599 on every status poll. The recovery is to start a fresh
    task once the service has had time to wake up.

    Polling uses exponential backoff (0.2 s doubling up to 2 s). After 5
    consecutive 599 responses the current task is abandoned and a new one is
    started, for at most 6 starts in total. All attempts share one 60 s budget.

    Args:
        api: DSM API name (e.g. "SYNO.FileStation.DirSize").
        start_params: Parameters forwarded to the ``start`` method.
        start_version: API version for the ``start`` call.
        poll_version: API version for the ``status`` call.

    Returns:
        ``(True, status_dict)`` on success, or ``(False, "Error: …")`` on
        DSM error, missing task ID, timeout, or exhausted restarts.
    """
    import time

    from mcp_synology_filestation.client import SynologyError as _SynologyError

    max_restarts = 6
    restart_after_599s = 5
    timeout = 60.0

    # The budget is measured two ways and the larger value wins:
    #   * real elapsed time, so slow ``client.request`` round-trips count
    #     against the limit and not just the back-off sleeps;
    #   * the sum of requested sleeps, so the budget still advances when
    #     ``asyncio.sleep`` returns immediately (e.g. patched out in tests).
    started_at = time.monotonic()
    slept = 0.0

    for _attempt in range(max_restarts):
        try:
            start_data = await client.request(
                api, "start", version=start_version, params=start_params
            )
        except _SynologyError as e:
            return False, f"Error: {e}"

        taskid: str = start_data.get("taskid", "")
        if not taskid:
            return False, "Error: DSM did not return a task ID."

        # Poll with exponential backoff; restart on 5 consecutive 599s.
        delay = 0.2
        consecutive_599 = 0
        while True:
            try:
                status_data = await client.request(
                    api, "status", version=poll_version, params={"taskid": taskid}
                )
            except _SynologyError as e:
                if e.code != 599:
                    return False, f"Error: {e}"
                consecutive_599 += 1
                if consecutive_599 >= restart_after_599s:
                    # 5× 599 in a row: either cold start or missed result window.
                    # Leave the poll loop so the outer loop starts a new task.
                    break
            else:
                consecutive_599 = 0
                if status_data.get("finished"):
                    return True, status_data
                # Still running — keep polling.

            if max(slept, time.monotonic() - started_at) >= timeout:
                return (
                    False,
                    f"Error: Operation timed out after {timeout:.0f} seconds"
                    " — check NAS manually.",
                )

            await asyncio.sleep(delay)
            slept += delay
            delay = min(delay * 2, 2.0)

    return (
        False,
        "Error: DSM did not return results after multiple retries"
        " (service may be starting up — try again in a moment).",
    )
async def _poll_task( async def _poll_task(
api: str, api: str,
@@ -804,27 +885,16 @@ def register_filestation(
async def dir_size(path: str): async def dir_size(path: str):
"""Get total size, file count and folder count for one or more directories. """Get total size, file count and folder count for one or more directories.
path: comma-separated share-relative paths.""" path: comma-separated share-relative paths."""
from mcp_synology_filestation.client import SynologyError
paths = [p.strip() for p in path.split(",") if p.strip()] paths = [p.strip() for p in path.split(",") if p.strip()]
if not paths: if not paths:
return "Error: no path provided." return "Error: no path provided."
try: ok, result = await _start_and_poll_oneshot(
start_data = await client.request( "SYNO.FileStation.DirSize",
"SYNO.FileStation.DirSize", start_params={"path": json.dumps(paths)},
"start", start_version=2,
version=2, poll_version=1,
params={"path": json.dumps(paths)}, )
)
except SynologyError as e:
return f"Error: {e}"
taskid: str = start_data.get("taskid", "")
if not taskid:
return "Error: DSM did not return a task ID."
ok, result = await _poll_task("SYNO.FileStation.DirSize", 1, taskid, initial_delay=0.0)
if not ok: if not ok:
return result # type: ignore[return-value] return result # type: ignore[return-value]
@@ -869,23 +939,12 @@ def register_filestation(
@mcp.tool() @mcp.tool()
async def get_md5(path: str): async def get_md5(path: str):
"""Compute the MD5 checksum of a file on the NAS. path: share-relative file path.""" """Compute the MD5 checksum of a file on the NAS. path: share-relative file path."""
from mcp_synology_filestation.client import SynologyError ok, result = await _start_and_poll_oneshot(
"SYNO.FileStation.MD5",
try: start_params={"file_path": json.dumps(path)},
start_data = await client.request( start_version=2,
"SYNO.FileStation.MD5", poll_version=1,
"start", )
version=2,
params={"file_path": json.dumps(path)},
)
except SynologyError as e:
return f"Error: {e}"
taskid: str = start_data.get("taskid", "")
if not taskid:
return "Error: DSM did not return a task ID."
ok, result = await _poll_task("SYNO.FileStation.MD5", 1, taskid, initial_delay=0.0)
if not ok: if not ok:
return result # type: ignore[return-value] return result # type: ignore[return-value]
+29 -1
View File
@@ -1636,7 +1636,7 @@ async def test_dir_size_retries_on_transient_599(config: AppConfig) -> None:
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_dir_size_fails_after_5_consecutive_599(config: AppConfig) -> None: async def test_dir_size_fails_after_5_consecutive_599(config: AppConfig) -> None:
"""dir_size gives up and returns Error: after 5 consecutive 599 responses.""" """dir_size gives up and returns Error: after exhausting all restart attempts."""
client = MagicMock() client = MagicMock()
async def _request(api, method, version=None, params=None, **kwargs): async def _request(api, method, version=None, params=None, **kwargs):
@@ -1653,6 +1653,34 @@ async def test_dir_size_fails_after_5_consecutive_599(config: AppConfig) -> None
assert result.startswith("Error:") assert result.startswith("Error:")
@pytest.mark.asyncio
async def test_dir_size_cold_start_restart(config: AppConfig) -> None:
    """dir_size starts a second task after five 599 polls and then succeeds.

    The fake client answers the first five ``status`` calls with error 599
    (simulated cold start) and every later one with a finished result, so the
    tool must issue exactly two ``start`` calls.
    """
    cold_polls = 5
    calls = {"start": 0, "status": 0}

    async def fake_request(api, method, version=None, params=None, **kwargs):
        if method == "start":
            calls["start"] += 1
            return {"taskid": f"task_{calls['start']}"}

        calls["status"] += 1
        if calls["status"] > cold_polls:
            # Service is warm: the restarted task reports completion at once.
            return {"finished": True, "num_dir": 1, "num_file": 5, "total_size": 1024}
        raise SynologyError("DSM error code 599", code=599)

    client = MagicMock()
    client.request = AsyncMock(side_effect=fake_request)
    tools = _make_mcp_and_tools(config, client)

    # Patch out sleeping so the back-off does not slow the test down.
    with patch("asyncio.sleep", new_callable=AsyncMock):
        result = await tools["dir_size"](path="/coldstart")

    assert "Total Size" in result
    assert calls["start"] == 2  # task was restarted once after cold-start 599s
# ────────────────────────────────────────────────────────────────────────── # ──────────────────────────────────────────────────────────────────────────
# get_md5 # get_md5
# ────────────────────────────────────────────────────────────────────────── # ──────────────────────────────────────────────────────────────────────────