fix: cold-start 599 on DirSize/MD5 — restart task instead of giving up
DSM's DirSize and MD5 background service needs ~6-8 s to initialise
after a period of inactivity. During this cold-start window tasks are
registered but every status poll returns error 599 ("no such task").
Replace the ad-hoc start+_poll_task call in dir_size and get_md5 with
a new _start_and_poll_oneshot helper that:
- polls with exponential backoff (0.2 s → cap 2 s)
- on 5 consecutive 599s: restarts the task (up to 6 attempts total)
- honours a shared 60 s wall-clock budget across all restarts
- returns a clear error if all restart attempts are exhausted
Root cause confirmed by test_dirsize_md5.py: after ~6 s / 2 restarts
the service warms up and the very first poll on the new task succeeds.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+1
-1
@@ -1,6 +1,6 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "mcp-synology-filestation"
|
name = "mcp-synology-filestation"
|
||||||
version = "0.2.8"
|
version = "0.2.9"
|
||||||
description = "MCP server for Synology FileStation"
|
description = "MCP server for Synology FileStation"
|
||||||
requires-python = ">=3.12"
|
requires-python = ">=3.12"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
|||||||
@@ -1,3 +1,3 @@
|
|||||||
"""MCP server for Synology FileStation."""
|
"""MCP server for Synology FileStation."""
|
||||||
|
|
||||||
__version__ = "0.2.8"
|
__version__ = "0.2.9"
|
||||||
|
|||||||
@@ -59,7 +59,88 @@ def register_filestation(
|
|||||||
client: FileStationClient for DSM API calls.
|
client: FileStationClient for DSM API calls.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# ── internal polling helper ───────────────────────────────────────────
|
# ── internal polling helpers ──────────────────────────────────────────
|
||||||
|
|
||||||
|
async def _start_and_poll_oneshot(
|
||||||
|
api: str,
|
||||||
|
start_params: dict[str, Any],
|
||||||
|
start_version: int,
|
||||||
|
poll_version: int,
|
||||||
|
) -> tuple[bool, dict[str, Any] | str]:
|
||||||
|
"""Start a one-shot DSM task and poll until finished, restarting on cold-start 599s.
|
||||||
|
|
||||||
|
DirSize and MD5 are "one-shot" tasks: DSM delivers ``finished=True`` exactly
|
||||||
|
once, then discards the result. Additionally, the DSM background service for
|
||||||
|
these tasks occasionally needs a few seconds to initialise after a period of
|
||||||
|
inactivity ("cold start"). During cold start the service registers task IDs
|
||||||
|
but returns error 599 on every status poll. The correct recovery is to restart
|
||||||
|
the task once the service has had time to wake up.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
api: DSM API name (e.g. "SYNO.FileStation.DirSize").
|
||||||
|
start_params: Parameters forwarded to the ``start`` method.
|
||||||
|
start_version: API version for the ``start`` call.
|
||||||
|
poll_version: API version for the ``status`` call.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
``(True, status_dict)`` on success, or ``(False, "Error: …")`` on
|
||||||
|
DSM error or timeout.
|
||||||
|
"""
|
||||||
|
from mcp_synology_filestation.client import SynologyError as _SynologyError
|
||||||
|
|
||||||
|
max_restarts = 6
|
||||||
|
timeout = 60.0
|
||||||
|
total_elapsed = 0.0
|
||||||
|
|
||||||
|
for _attempt in range(max_restarts):
|
||||||
|
try:
|
||||||
|
start_data = await client.request(
|
||||||
|
api, "start", version=start_version, params=start_params
|
||||||
|
)
|
||||||
|
except _SynologyError as e:
|
||||||
|
return False, f"Error: {e}"
|
||||||
|
|
||||||
|
taskid: str = start_data.get("taskid", "")
|
||||||
|
if not taskid:
|
||||||
|
return False, "Error: DSM did not return a task ID."
|
||||||
|
|
||||||
|
# Poll with exponential backoff; restart on 5 consecutive 599s
|
||||||
|
delay = 0.2
|
||||||
|
consecutive_599 = 0
|
||||||
|
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
status_data = await client.request(
|
||||||
|
api, "status", version=poll_version, params={"taskid": taskid}
|
||||||
|
)
|
||||||
|
consecutive_599 = 0
|
||||||
|
if status_data.get("finished"):
|
||||||
|
return True, status_data
|
||||||
|
# Still running — keep polling
|
||||||
|
except _SynologyError as e:
|
||||||
|
if e.code != 599:
|
||||||
|
return False, f"Error: {e}"
|
||||||
|
consecutive_599 += 1
|
||||||
|
if consecutive_599 >= 5:
|
||||||
|
# 5× 599 in a row: either cold start or missed result window.
|
||||||
|
# Restart the task so DSM can re-queue it.
|
||||||
|
break
|
||||||
|
|
||||||
|
if total_elapsed >= timeout:
|
||||||
|
return (
|
||||||
|
False,
|
||||||
|
"Error: Operation timed out after 60 seconds — check NAS manually.",
|
||||||
|
)
|
||||||
|
|
||||||
|
await asyncio.sleep(delay)
|
||||||
|
total_elapsed += delay
|
||||||
|
delay = min(delay * 2, 2.0)
|
||||||
|
|
||||||
|
return (
|
||||||
|
False,
|
||||||
|
"Error: DSM did not return results after multiple retries"
|
||||||
|
" (service may be starting up — try again in a moment).",
|
||||||
|
)
|
||||||
|
|
||||||
async def _poll_task(
|
async def _poll_task(
|
||||||
api: str,
|
api: str,
|
||||||
@@ -804,27 +885,16 @@ def register_filestation(
|
|||||||
async def dir_size(path: str):
|
async def dir_size(path: str):
|
||||||
"""Get total size, file count and folder count for one or more directories.
|
"""Get total size, file count and folder count for one or more directories.
|
||||||
path: comma-separated share-relative paths."""
|
path: comma-separated share-relative paths."""
|
||||||
from mcp_synology_filestation.client import SynologyError
|
|
||||||
|
|
||||||
paths = [p.strip() for p in path.split(",") if p.strip()]
|
paths = [p.strip() for p in path.split(",") if p.strip()]
|
||||||
if not paths:
|
if not paths:
|
||||||
return "Error: no path provided."
|
return "Error: no path provided."
|
||||||
|
|
||||||
try:
|
ok, result = await _start_and_poll_oneshot(
|
||||||
start_data = await client.request(
|
"SYNO.FileStation.DirSize",
|
||||||
"SYNO.FileStation.DirSize",
|
start_params={"path": json.dumps(paths)},
|
||||||
"start",
|
start_version=2,
|
||||||
version=2,
|
poll_version=1,
|
||||||
params={"path": json.dumps(paths)},
|
)
|
||||||
)
|
|
||||||
except SynologyError as e:
|
|
||||||
return f"Error: {e}"
|
|
||||||
|
|
||||||
taskid: str = start_data.get("taskid", "")
|
|
||||||
if not taskid:
|
|
||||||
return "Error: DSM did not return a task ID."
|
|
||||||
|
|
||||||
ok, result = await _poll_task("SYNO.FileStation.DirSize", 1, taskid, initial_delay=0.0)
|
|
||||||
if not ok:
|
if not ok:
|
||||||
return result # type: ignore[return-value]
|
return result # type: ignore[return-value]
|
||||||
|
|
||||||
@@ -869,23 +939,12 @@ def register_filestation(
|
|||||||
@mcp.tool()
|
@mcp.tool()
|
||||||
async def get_md5(path: str):
|
async def get_md5(path: str):
|
||||||
"""Compute the MD5 checksum of a file on the NAS. path: share-relative file path."""
|
"""Compute the MD5 checksum of a file on the NAS. path: share-relative file path."""
|
||||||
from mcp_synology_filestation.client import SynologyError
|
ok, result = await _start_and_poll_oneshot(
|
||||||
|
"SYNO.FileStation.MD5",
|
||||||
try:
|
start_params={"file_path": json.dumps(path)},
|
||||||
start_data = await client.request(
|
start_version=2,
|
||||||
"SYNO.FileStation.MD5",
|
poll_version=1,
|
||||||
"start",
|
)
|
||||||
version=2,
|
|
||||||
params={"file_path": json.dumps(path)},
|
|
||||||
)
|
|
||||||
except SynologyError as e:
|
|
||||||
return f"Error: {e}"
|
|
||||||
|
|
||||||
taskid: str = start_data.get("taskid", "")
|
|
||||||
if not taskid:
|
|
||||||
return "Error: DSM did not return a task ID."
|
|
||||||
|
|
||||||
ok, result = await _poll_task("SYNO.FileStation.MD5", 1, taskid, initial_delay=0.0)
|
|
||||||
if not ok:
|
if not ok:
|
||||||
return result # type: ignore[return-value]
|
return result # type: ignore[return-value]
|
||||||
|
|
||||||
|
|||||||
@@ -1636,7 +1636,7 @@ async def test_dir_size_retries_on_transient_599(config: AppConfig) -> None:
|
|||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_dir_size_fails_after_5_consecutive_599(config: AppConfig) -> None:
|
async def test_dir_size_fails_after_5_consecutive_599(config: AppConfig) -> None:
|
||||||
"""dir_size gives up and returns Error: after 5 consecutive 599 responses."""
|
"""dir_size gives up and returns Error: after exhausting all restart attempts."""
|
||||||
client = MagicMock()
|
client = MagicMock()
|
||||||
|
|
||||||
async def _request(api, method, version=None, params=None, **kwargs):
|
async def _request(api, method, version=None, params=None, **kwargs):
|
||||||
@@ -1653,6 +1653,34 @@ async def test_dir_size_fails_after_5_consecutive_599(config: AppConfig) -> None
|
|||||||
assert result.startswith("Error:")
|
assert result.startswith("Error:")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_dir_size_cold_start_restart(config: AppConfig) -> None:
|
||||||
|
"""dir_size restarts the task after 5 consecutive 599s and succeeds on second attempt."""
|
||||||
|
client = MagicMock()
|
||||||
|
start_count = {"n": 0}
|
||||||
|
status_count = {"n": 0}
|
||||||
|
|
||||||
|
async def _request(api, method, version=None, params=None, **kwargs):
|
||||||
|
if method == "start":
|
||||||
|
start_count["n"] += 1
|
||||||
|
return {"taskid": f"task_{start_count['n']}"}
|
||||||
|
status_count["n"] += 1
|
||||||
|
# First 5 status calls → 599 (simulates cold start)
|
||||||
|
if status_count["n"] <= 5:
|
||||||
|
raise SynologyError("DSM error code 599", code=599)
|
||||||
|
# After restart: immediately done
|
||||||
|
return {"finished": True, "num_dir": 1, "num_file": 5, "total_size": 1024}
|
||||||
|
|
||||||
|
client.request = AsyncMock(side_effect=_request)
|
||||||
|
tools = _make_mcp_and_tools(config, client)
|
||||||
|
|
||||||
|
with patch("asyncio.sleep", new_callable=AsyncMock):
|
||||||
|
result = await tools["dir_size"](path="/coldstart")
|
||||||
|
|
||||||
|
assert "Total Size" in result
|
||||||
|
assert start_count["n"] == 2 # task was restarted once after cold-start 599s
|
||||||
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────────────────────────────────
|
# ──────────────────────────────────────────────────────────────────────────
|
||||||
# get_md5
|
# get_md5
|
||||||
# ──────────────────────────────────────────────────────────────────────────
|
# ──────────────────────────────────────────────────────────────────────────
|
||||||
|
|||||||
Reference in New Issue
Block a user