feat: add read_text tool — extract text from PDF/TXT/MD (v0.4.0)
Adds read_text tool with pypdf integration for PDF text extraction, plain-text decode for TXT/MD/CSV/JSON/etc, page-filter support, max_chars truncation, and 11 mock-based tests. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+2
-1
@@ -1,6 +1,6 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "mcp-synology-filestation"
|
name = "mcp-synology-filestation"
|
||||||
version = "0.3.6"
|
version = "0.4.0"
|
||||||
description = "MCP server for Synology FileStation"
|
description = "MCP server for Synology FileStation"
|
||||||
requires-python = ">=3.12"
|
requires-python = ">=3.12"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
@@ -10,6 +10,7 @@ dependencies = [
|
|||||||
"keyring>=25.0.0",
|
"keyring>=25.0.0",
|
||||||
"click>=8.1.0",
|
"click>=8.1.0",
|
||||||
"rich>=13.0.0",
|
"rich>=13.0.0",
|
||||||
|
"pypdf>=4.0.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
|
|||||||
@@ -1,3 +1,3 @@
|
|||||||
"""MCP server for Synology FileStation."""
|
"""MCP server for Synology FileStation."""
|
||||||
|
|
||||||
__version__ = "0.3.6"
|
__version__ = "0.4.0"
|
||||||
|
|||||||
@@ -1319,6 +1319,109 @@ def register_filestation(
|
|||||||
|
|
||||||
return "\n".join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
@mcp.tool()
async def read_text(path: str, max_chars: int = 50_000, page: int = 0):
    """Extract readable text from a PDF, TXT, or MD file (max 10 MB).

    Args:
        path: Path of the file to read. The file type is chosen from the
            extension (case-insensitive): ``.pdf`` is parsed with pypdf,
            the plain-text extensions listed in the body are decoded as
            UTF-8 with undecodable bytes replaced.
        max_chars: Maximum number of characters of extracted text to
            return. Longer text is cut and a truncation note is appended.
            A value of 0 or less disables truncation.
        page: 1-based PDF page to extract; 0 (the default) extracts every
            page. Ignored for plain-text files.

    Returns:
        A header line (``[name — N chars]``, or
        ``[name — Page P/T — N chars]`` for a single PDF page, where N is
        the length before truncation) followed by the text, or a string
        starting with ``Error:`` explaining why nothing was extracted.
    """
    import io

    from mcp_synology_filestation.client import SynologyError

    max_read_bytes = 10 * 1024 * 1024

    # A tuple so it can be handed to str.endswith() directly.
    plain_text_exts = (
        ".txt",
        ".md",
        ".markdown",
        ".csv",
        ".log",
        ".yaml",
        ".yml",
        ".json",
        ".xml",
        ".html",
        ".htm",
    )

    path_lower = path.lower()
    if path_lower.endswith(".pdf"):
        file_type = "pdf"
    elif path_lower.endswith(plain_text_exts):
        file_type = "text"
    else:
        # Rejected before any download is attempted.
        return (
            "Error: Unsupported file type. Supported: PDF, TXT, MD, CSV, JSON, YAML, XML, HTML"
            " and other plain text formats."
        )

    try:
        # The returned filename is not needed; the header uses the requested path.
        _, content = await client.download_bytes(path)
    except SynologyError as e:
        return f"Error: {e}"

    # The limit is checked on the downloaded bytes, i.e. after the transfer.
    size = len(content)
    if size > max_read_bytes:
        return f"Error: File too large ({size / 1024 / 1024:.1f} MB). Maximum is 10 MB."

    total_pages = 0

    if file_type == "pdf":
        try:
            from pypdf import PdfReader

            reader = PdfReader(io.BytesIO(content))
            total_pages = len(reader.pages)

            if page == 0:
                page_texts = [pg.extract_text() or "" for pg in reader.pages]
                if not any(t.strip() for t in page_texts):
                    return (
                        "Error: No extractable text found. The PDF may be image-only"
                        " (scanned without OCR layer)."
                    )
                # First page is emitted bare; later pages get a separator.
                text = "".join(
                    pg_text if number == 1 else f"\n\n--- Page {number} ---\n\n{pg_text}"
                    for number, pg_text in enumerate(page_texts, start=1)
                )
            else:
                # Reject negative pages too: without the lower bound,
                # pages[page - 1] would silently index from the end.
                if page < 1 or page > total_pages:
                    return (
                        f"Error: Page {page} does not exist — "
                        f"this PDF has {total_pages} page(s)."
                    )
                text = reader.pages[page - 1].extract_text() or ""
                if not text.strip():
                    return (
                        "Error: No extractable text found. The PDF may be image-only"
                        " (scanned without OCR layer)."
                    )

        except SynologyError:
            # Re-raised untouched rather than reported as a parse failure.
            raise
        except Exception as exc:
            return f"Error: Failed to parse PDF: {exc}"
    else:
        text = content.decode("utf-8", errors="replace")

    # The header always reports the length before truncation.
    full_len = len(text)

    if max_chars > 0 and full_len > max_chars:
        text = (
            text[:max_chars]
            + f"\n\n[Truncated: {full_len} total chars, showing first {max_chars}."
            + " Use max_chars parameter to adjust.]"
        )

    display_name = path.rsplit("/", 1)[-1]
    if file_type == "pdf" and page != 0:
        header = f"[{display_name} — Page {page}/{total_pages} — {full_len} chars]"
    else:
        header = f"[{display_name} — {full_len} chars]"

    return f"{header}\n{text}"
|
||||||
|
|
||||||
@mcp.tool()
|
@mcp.tool()
|
||||||
async def list_snapshots(share_path: str, offset: int = 0, limit: int = 100):
|
async def list_snapshots(share_path: str, offset: int = 0, limit: int = 100):
|
||||||
"""List Btrfs snapshots for a shared folder (requires Btrfs volume)."""
|
"""List Btrfs snapshots for a shared folder (requires Btrfs volume)."""
|
||||||
|
|||||||
@@ -0,0 +1,227 @@
|
|||||||
|
"""Tests for read_text tool."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from unittest.mock import AsyncMock, MagicMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from mcp_synology_filestation.client import SynologyError
|
||||||
|
from mcp_synology_filestation.config import AppConfig, ConnectionConfig
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def config() -> AppConfig:
    """Provide a minimal AppConfig whose connection targets a placeholder host."""
    connection = ConnectionConfig(host="nas.example.com")
    return AppConfig(schema_version=1, connection=connection)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_tools(config: AppConfig, client: MagicMock) -> dict:
    """Run register_filestation against a stub server and return the tools by name.

    The stub's ``tool()`` hands back a decorator that records each decorated
    function under its ``__name__`` and returns it unchanged.
    """
    from mcp_synology_filestation.tools.filestation import register_filestation

    captured: dict[str, object] = {}

    def _capture(fn):
        captured[fn.__name__] = fn
        return fn

    server = MagicMock()
    # tool() is invoked with no arguments and must yield the decorator.
    server.tool = lambda: _capture
    register_filestation(server, config, client)
    return captured
|
||||||
|
|
||||||
|
|
||||||
|
def _mock_pdf_reader(pages_text: list[str]) -> MagicMock:
|
||||||
|
"""Build a mock PdfReader whose .pages list returns the given text per page."""
|
||||||
|
mock_pages = []
|
||||||
|
for text in pages_text:
|
||||||
|
page = MagicMock()
|
||||||
|
page.extract_text.return_value = text
|
||||||
|
mock_pages.append(page)
|
||||||
|
reader = MagicMock()
|
||||||
|
reader.pages = mock_pages
|
||||||
|
return reader
|
||||||
|
|
||||||
|
|
||||||
|
# ── TXT / plain text ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_read_text_txt_success(config: AppConfig) -> None:
    """A .txt file is decoded and returned under a header with name and length."""
    nas = MagicMock()
    nas.download_bytes = AsyncMock(return_value=("readme.txt", b"Hello, world!"))

    read_text = _make_tools(config, nas)["read_text"]
    output = await read_text("/docs/readme.txt")

    # "13 chars" is len("Hello, world!") as reported in the header.
    for fragment in ("readme.txt", "Hello, world!", "13 chars"):
        assert fragment in output
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_read_text_md_success(config: AppConfig) -> None:
    """A .md file takes the plain-text path and its content shows up in the output."""
    nas = MagicMock()
    nas.download_bytes = AsyncMock(return_value=("notes.md", b"# Title\n\nBody text."))

    read_text = _make_tools(config, nas)["read_text"]
    output = await read_text("/docs/notes.md")

    for fragment in ("notes.md", "# Title", "Body text."):
        assert fragment in output
|
||||||
|
|
||||||
|
|
||||||
|
# ── PDF extraction ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_read_text_pdf_all_pages(config: AppConfig) -> None:
    """With the default page argument every PDF page is returned, separated by page markers."""
    nas = MagicMock()
    nas.download_bytes = AsyncMock(return_value=("report.pdf", b"%PDF-1.4 stub"))
    read_text = _make_tools(config, nas)["read_text"]
    fake_reader = _mock_pdf_reader(["First page text.", "Second page text."])

    # Patch the class on the pypdf module so the tool receives the fake reader.
    with patch("pypdf.PdfReader", return_value=fake_reader):
        output = await read_text("/docs/report.pdf")

    expected_fragments = (
        "report.pdf",
        "First page text.",
        "Second page text.",
        "--- Page 2 ---",
    )
    for fragment in expected_fragments:
        assert fragment in output
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_read_text_pdf_single_page(config: AppConfig) -> None:
    """Passing page=N yields only that page's text plus an N/total header."""
    nas = MagicMock()
    nas.download_bytes = AsyncMock(return_value=("doc.pdf", b"%PDF-1.4 stub"))
    read_text = _make_tools(config, nas)["read_text"]
    fake_reader = _mock_pdf_reader(["Page one.", "Page two.", "Page three."])

    with patch("pypdf.PdfReader", return_value=fake_reader):
        output = await read_text("/docs/doc.pdf", page=2)

    for present in ("Page 2/3", "Page two."):
        assert present in output
    for absent in ("Page one.", "Page three."):
        assert absent not in output
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_read_text_pdf_page_out_of_range(config: AppConfig) -> None:
    """Requesting a page beyond the PDF page count returns an error.

    The error must name both the requested page and the real page count.
    """
    client = MagicMock()
    client.download_bytes = AsyncMock(return_value=("doc.pdf", b"%PDF-1.4 stub"))

    mock_reader = _mock_pdf_reader(["Only page."])
    tools = _make_tools(config, client)

    with patch("pypdf.PdfReader", return_value=mock_reader):
        result = await tools["read_text"]("/docs/doc.pdf", page=5)

    assert result.startswith("Error:")
    # Match whole phrases: bare "5" / "1" would match almost any message.
    assert "Page 5 does not exist" in result
    assert "1 page(s)" in result
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_read_text_pdf_image_only(config: AppConfig) -> None:
    """A PDF whose pages all extract to empty text produces the image-only error."""
    nas = MagicMock()
    nas.download_bytes = AsyncMock(return_value=("scan.pdf", b"%PDF-1.4 stub"))
    read_text = _make_tools(config, nas)["read_text"]
    # Two pages, neither with any text.
    blank_reader = _mock_pdf_reader(["", ""])

    with patch("pypdf.PdfReader", return_value=blank_reader):
        output = await read_text("/docs/scan.pdf")

    assert output.startswith("Error:")
    mentions_image_only = "image-only" in output.lower()
    mentions_no_text = "No extractable text" in output
    assert mentions_image_only or mentions_no_text
|
||||||
|
|
||||||
|
|
||||||
|
# ── max_chars truncation ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_read_text_max_chars_truncation(config: AppConfig) -> None:
    """Text exceeding max_chars is truncated with a hint."""
    long_text = "A" * 200
    client = MagicMock()
    client.download_bytes = AsyncMock(return_value=("big.txt", long_text.encode()))

    tools = _make_tools(config, client)
    result = await tools["read_text"]("/data/big.txt", max_chars=50)

    assert "Truncated" in result
    assert "200 total chars" in result
    assert "showing first 50" in result
    # The returned content before the truncation note must be exactly 50 'A's.
    # Count them: a substring check would also pass if nothing had been cut.
    assert "A" * 50 in result
    assert result.count("A") == 50
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_read_text_max_chars_zero_no_limit(config: AppConfig) -> None:
    """Passing max_chars=0 switches truncation off even for long text."""
    payload = ("B" * 100_000).encode()
    nas = MagicMock()
    nas.download_bytes = AsyncMock(return_value=("huge.txt", payload))

    read_text = _make_tools(config, nas)["read_text"]
    output = await read_text("/data/huge.txt", max_chars=0)

    assert "Truncated" not in output
    # Spot-check that content is present.
    assert "B" * 100 in output
|
||||||
|
|
||||||
|
|
||||||
|
# ── error cases ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_read_text_unsupported_type(config: AppConfig) -> None:
    """Unknown file extension returns an unsupported-type error.

    The extension is rejected up front, so nothing should be downloaded.
    """
    client = MagicMock()
    client.download_bytes = AsyncMock(return_value=("binary.exe", b"\x00\x01\x02"))

    tools = _make_tools(config, client)
    result = await tools["read_text"]("/bin/binary.exe")

    assert result.startswith("Error:")
    assert "Unsupported file type" in result
    client.download_bytes.assert_not_awaited()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_read_text_file_too_large(config: AppConfig) -> None:
    """Files exceeding 10 MB return a size error.

    The limit is applied to the bytes returned by download_bytes, so the
    download itself still takes place.
    """
    oversized = b"x" * (10 * 1024 * 1024 + 1)
    client = MagicMock()
    client.download_bytes = AsyncMock(return_value=("big.txt", oversized))

    tools = _make_tools(config, client)
    result = await tools["read_text"]("/data/big.txt")

    assert result.startswith("Error:")
    assert "10 MB" in result
    client.download_bytes.assert_awaited_once_with("/data/big.txt")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_read_text_dsm_error(config: AppConfig) -> None:
    """A SynologyError raised by the download comes back as an Error: string."""
    failure = SynologyError(1800, "File not found")
    nas = MagicMock()
    nas.download_bytes = AsyncMock(side_effect=failure)

    read_text = _make_tools(config, nas)["read_text"]
    output = await read_text("/missing/file.txt")

    assert output.startswith("Error:")
|
||||||
Reference in New Issue
Block a user