fix: _poll_oneshot for DirSize/MD5 with burst-retry on early 599

Add FileStationClient.start_and_poll_immediately: starts the async task and
immediately makes the first status poll within the same method, with no
intermediate awaits other than the two HTTP calls. This minimises scheduler
latency between start and first poll for one-shot tasks.

_poll_oneshot now accepts the first_status from start_and_poll_immediately:
- finished=True on first poll → return immediately
- finished=False → Phase 2 (exponential backoff, 60 s timeout)
- None (first poll was 599) → burst-retry 10× at 10 ms, then Phase 2
  (Phase 2 keeps polling through 599 until seen_alive, then fails fast)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-14 13:50:28 +02:00
parent 0e8ffaa6df
commit 62f8e41931
5 changed files with 128 additions and 80 deletions
+1 -1
View File
@@ -1,6 +1,6 @@
[project]
name = "mcp-synology-filestation"
version = "0.2.6"
version = "0.2.7"
description = "MCP server for Synology FileStation"
requires-python = ">=3.12"
dependencies = [
+1 -1
View File
@@ -1,3 +1,3 @@
"""MCP server for Synology FileStation."""
__version__ = "0.2.6"
__version__ = "0.2.7"
+46
View File
@@ -307,6 +307,52 @@ class FileStationClient:
raise SynologyError(_error_message(code, api), code=code)
async def start_and_poll_immediately(
self,
api: str,
start_params: dict[str, Any],
poll_version: int,
*,
start_version: int | None = None,
) -> tuple[str, dict[str, Any] | None]:
"""Start a DSM async task and immediately make the first status poll.
Designed for one-shot tasks (DirSize, MD5) where the result window
may close quickly. Both the ``start`` and the first ``status`` request
are issued inside this single method with no intermediate awaits other
than the HTTP calls themselves, minimising scheduler latency.
Args:
api: DSM API name (e.g. "SYNO.FileStation.DirSize").
start_params: Query parameters for the ``start`` call.
poll_version: API version to use for the ``status`` call.
start_version: API version for the ``start`` call (defaults to
``maxVersion`` from the API info cache).
Returns:
``(taskid, status_data)`` where ``status_data`` is ``None`` if
the first status poll returned 599 (task not yet visible).
Raises:
SynologyError: If the ``start`` call fails, the response contains
no task ID, or the ``status`` call fails with a non-599 error.
"""
start_data = await self.request(api, "start", version=start_version, params=start_params)
taskid: str = start_data.get("taskid", "")
if not taskid:
raise SynologyError("DSM did not return a task ID.", code=0)
try:
status_data = await self.request(
api, "status", version=poll_version, params={"taskid": taskid}
)
except SynologyError as e:
if e.code == 599:
return taskid, None
raise
return taskid, status_data
async def download_bytes(self, path: str) -> tuple[str, bytes]:
"""Download a file from the NAS via SYNO.FileStation.Download.
@@ -123,58 +123,53 @@ def register_filestation(
api: str,
version: int,
taskid: str,
first_status: dict[str, Any] | None,
) -> tuple[bool, dict[str, Any] | str]:
"""Poll a one-shot DSM task (DirSize, MD5).
"""Continue polling a one-shot DSM task after the first status poll.
One-shot tasks deliver ``finished=True`` exactly once; after that,
status polls return 599. Two phases:
Called after ``client.start_and_poll_immediately`` has already made
the first status request. Handles three outcomes for ``first_status``:
Phase 1 — burst: polls immediately, then up to 10 times at 50 ms
intervals. This catches tasks that complete in under ~500 ms.
If all burst polls return 599, the result window was missed.
Phase 2 — normal: entered only after receiving ``finished=False``
(task confirmed running). Exponential backoff up to 60 s. A 599 in
this phase means the window closed before we polled — fail fast.
* ``finished=True`` — return immediately (task done on first poll).
* ``finished=False`` — task confirmed running; enter Phase 2
(exponential backoff until ``finished=True`` or 60 s timeout).
* ``None`` (first poll returned 599) — burst-retry 10× at 10 ms,
then enter Phase 2 regardless (large directories will eventually
return ``finished=False``; a 599 after the task was seen alive
means the window closed — fail fast with a retry message).
Returns:
``(True, status_dict)`` on success, or ``(False, "Error: …")``
on DSM error, missed window, or timeout.
on DSM error or timeout.
"""
from mcp_synology_filestation.client import SynologyError as _SynologyError
burst_count = 10
burst_interval = 0.05 # 50 ms between burst retries
seen_alive = False
# ── Phase 1: burst ────────────────────────────────────────────────
for attempt in range(burst_count + 1):
if attempt > 0:
await asyncio.sleep(burst_interval)
try:
status_data = await client.request(
api,
"status",
version=version,
params={"taskid": taskid},
)
except _SynologyError as e:
if e.code != 599:
return False, f"Error: {e}"
continue # 599 — task not visible yet, keep bursting
if status_data.get("finished"):
return True, status_data
break # finished=False — task confirmed running, enter Phase 2
if first_status is not None:
if first_status.get("finished"):
return True, first_status
seen_alive = True # finished=False: task is running
else:
# All burst polls returned 599 — one-shot window was missed
return (
False,
"Error: Could not read task result — the operation finished"
" before the first successful poll. Please retry.",
)
# 599 on the immediate poll: burst-retry (10×, 10 ms apart)
for _ in range(10):
await asyncio.sleep(0.01)
try:
s = await client.request(
api, "status", version=version, params={"taskid": taskid}
)
except _SynologyError as e:
if e.code == 599:
continue
return False, f"Error: {e}"
if s.get("finished"):
return True, s
seen_alive = True
break # finished=False: enter Phase 2
# ── Phase 2: exponential backoff ──────────────────────────────────
# ── Phase 2: exponential backoff until finished or 60 s timeout ──
delay = 0.2
elapsed = burst_count * burst_interval # time already spent in burst
elapsed = 0.0
timeout = 60.0
while True:
@@ -183,23 +178,23 @@ def register_filestation(
delay = min(delay * 2, 2.0)
try:
status_data = await client.request(
api,
"status",
version=version,
params={"taskid": taskid},
)
s = await client.request(api, "status", version=version, params={"taskid": taskid})
except _SynologyError as e:
if e.code == 599:
return (
False,
"Error: Could not read task result — the operation finished"
" before the result was polled. Please retry.",
)
return False, f"Error: {e}"
if status_data.get("finished"):
return True, status_data
if seen_alive:
# Task was running but the one-shot window closed before we read it
return (
False,
"Error: Could not read task result — the operation finished"
" before the result was polled. Please retry.",
)
# Not yet seen alive: large dir still initialising, keep polling
else:
return False, f"Error: {e}"
else:
seen_alive = True
if s.get("finished"):
return True, s
if elapsed >= timeout:
return (
@@ -895,20 +890,16 @@ def register_filestation(
return "Error: no path provided."
try:
start_data = await client.request(
taskid, first_status = await client.start_and_poll_immediately(
"SYNO.FileStation.DirSize",
"start",
version=2,
params={"path": json.dumps(paths)},
start_params={"path": json.dumps(paths)},
poll_version=1,
start_version=2,
)
except SynologyError as e:
return f"Error: {e}"
taskid: str = start_data.get("taskid", "")
if not taskid:
return "Error: DSM did not return a task ID."
ok, result = await _poll_oneshot("SYNO.FileStation.DirSize", 1, taskid)
ok, result = await _poll_oneshot("SYNO.FileStation.DirSize", 1, taskid, first_status)
if not ok:
return result # type: ignore[return-value]
@@ -956,20 +947,16 @@ def register_filestation(
from mcp_synology_filestation.client import SynologyError
try:
start_data = await client.request(
taskid, first_status = await client.start_and_poll_immediately(
"SYNO.FileStation.MD5",
"start",
version=2,
params={"file_path": json.dumps(path)},
start_params={"file_path": json.dumps(path)},
poll_version=1,
start_version=2,
)
except SynologyError as e:
return f"Error: {e}"
taskid: str = start_data.get("taskid", "")
if not taskid:
return "Error: DSM did not return a task ID."
ok, result = await _poll_oneshot("SYNO.FileStation.MD5", 1, taskid)
ok, result = await _poll_oneshot("SYNO.FileStation.MD5", 1, taskid, first_status)
if not ok:
return result # type: ignore[return-value]
+21 -6
View File
@@ -22,8 +22,24 @@ def config() -> AppConfig:
def _make_mcp_and_tools(config: AppConfig, client: MagicMock) -> dict:
"""Register FileStation tools on a mock FastMCP and collect them by name."""
from mcp_synology_filestation.client import FileStationClient
from mcp_synology_filestation.tools.filestation import register_filestation
# Bind the real start_and_poll_immediately so it delegates into the
# already-mocked client.request — no separate mock needed per test.
async def _start_and_poll_immediately(
api: str,
start_params: dict,
poll_version: int,
*,
start_version: int | None = None,
):
return await FileStationClient.start_and_poll_immediately(
client, api, start_params, poll_version, start_version=start_version
)
client.start_and_poll_immediately = _start_and_poll_immediately
registered: dict[str, object] = {}
mcp = MagicMock()
@@ -1635,12 +1651,11 @@ async def test_dir_size_retries_on_transient_599(config: AppConfig) -> None:
@pytest.mark.asyncio
async def test_dir_size_window_timeout_on_persistent_599(config: AppConfig) -> None:
"""dir_size fails fast (window_timeout=3 s) when DSM returns only 599s.
async def test_dir_size_times_out_on_persistent_599(config: AppConfig) -> None:
"""dir_size times out after 60 s when DSM returns only 599s for every poll.
Once the window_timeout elapses without ever seeing the task running
(finished=False), _poll_task returns the "result window missed" error
rather than waiting the full 60 s.
The immediate poll + burst both return 599; Phase 2 keeps polling (large
directories eventually surface) until the 60 s timeout fires.
"""
client = MagicMock()
@@ -1656,7 +1671,7 @@ async def test_dir_size_window_timeout_on_persistent_599(config: AppConfig) -> N
result = await tools["dir_size"](path="/dead")
assert result.startswith("Error:")
assert "retry" in result.lower() or "window" in result.lower() or "poll" in result.lower()
assert "timed out" in result.lower() or "60 seconds" in result
# ──────────────────────────────────────────────────────────────────────────