feat: add read_text tool — extract text from PDF/TXT/MD (v0.4.0)
Adds the read_text tool: pypdf-based text extraction for PDFs, UTF-8 decoding for plain-text files (TXT/MD/CSV/JSON, etc.), optional single-page selection for PDFs, max_chars truncation, and 11 mock-based tests.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,3 +1,3 @@
|
||||
"""MCP server for Synology FileStation."""
|
||||
|
||||
__version__ = "0.3.6"
|
||||
__version__ = "0.4.0"
|
||||
|
||||
@@ -1319,6 +1319,109 @@ def register_filestation(
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
@mcp.tool()
async def read_text(path: str, max_chars: int = 50_000, page: int = 0):
    """Extract readable text from a PDF, TXT, or MD file (max 10 MB).

    Other plain-text formats (Markdown, CSV, LOG, YAML, JSON, XML, HTML) are
    accepted as well. The file type is chosen from the path's extension,
    compared case-insensitively.

    Args:
        path: Path of the file to read, passed to ``client.download_bytes``.
        max_chars: Maximum number of characters of text to return. A value
            of 0 or less disables truncation.
        page: 1-based PDF page to extract; 0 (the default) extracts every
            page. Ignored for plain-text files.

    Returns:
        A ``[name — N chars]`` header line (N is the text length before
        truncation) followed by the extracted text, or a message starting
        with ``Error:`` when the file type is unsupported, the page number
        is invalid, the download fails, the file is too large, or no text
        can be extracted.
    """
    import io

    from mcp_synology_filestation.client import SynologyError

    max_read_bytes = 10 * 1024 * 1024

    # A tuple so the whole collection can be handed to str.endswith() at once.
    plain_text_exts = (
        ".txt",
        ".md",
        ".markdown",
        ".csv",
        ".log",
        ".yaml",
        ".yml",
        ".json",
        ".xml",
        ".html",
        ".htm",
    )
    no_text_error = (
        "Error: No extractable text found. The PDF may be image-only"
        " (scanned without OCR layer)."
    )

    path_lower = path.lower()
    if path_lower.endswith(".pdf"):
        file_type = "pdf"
    elif path_lower.endswith(plain_text_exts):
        file_type = "text"
    else:
        return (
            "Error: Unsupported file type. Supported: PDF, TXT, MD, CSV, JSON, YAML, XML, HTML"
            " and other plain text formats."
        )

    # A negative page would slip past the upper-bound check further down and
    # index reader.pages from the end, returning the wrong page. Reject it
    # up front, before spending a download on the request.
    if file_type == "pdf" and page < 0:
        return (
            f"Error: Invalid page {page} — page numbers start at 1"
            " (use 0 for all pages)."
        )

    try:
        # The returned filename is not used; the display name is derived
        # from the requested path instead.
        _filename, content = await client.download_bytes(path)
    except SynologyError as e:
        return f"Error: {e}"

    # NOTE: the size limit is enforced only after the whole file has been
    # downloaded into memory.
    size = len(content)
    if size > max_read_bytes:
        return f"Error: File too large ({size / 1024 / 1024:.1f} MB). Maximum is 10 MB."

    total_pages = 0

    if file_type == "pdf":
        try:
            # Imported lazily so pypdf is only needed when a PDF is read; a
            # missing pypdf is reported through the generic handler below.
            from pypdf import PdfReader

            reader = PdfReader(io.BytesIO(content))
            total_pages = len(reader.pages)

            if page == 0:
                page_texts = [pg.extract_text() or "" for pg in reader.pages]
                if not any(t.strip() for t in page_texts):
                    return no_text_error
                # The first page is emitted as-is; every later page is
                # preceded by a "--- Page N ---" separator.
                text = "".join(
                    pg_text if i == 0 else f"\n\n--- Page {i + 1} ---\n\n{pg_text}"
                    for i, pg_text in enumerate(page_texts)
                )
            else:
                if page > total_pages:
                    return (
                        f"Error: Page {page} does not exist — "
                        f"this PDF has {total_pages} page(s)."
                    )
                text = reader.pages[page - 1].extract_text() or ""
                if not text.strip():
                    return no_text_error

        except SynologyError:
            # Never disguise a client error as a PDF parse failure.
            raise
        except Exception as exc:
            return f"Error: Failed to parse PDF: {exc}"
    else:
        # Undecodable bytes become U+FFFD rather than failing the request.
        text = content.decode("utf-8", errors="replace")

    # Captured before truncation so the header reports the real length.
    full_len = len(text)

    if max_chars > 0 and full_len > max_chars:
        text = (
            text[:max_chars]
            + f"\n\n[Truncated: {full_len} total chars, showing first {max_chars}."
            + " Use max_chars parameter to adjust.]"
        )

    display_name = path.rsplit("/", 1)[-1]
    if file_type == "pdf" and page != 0:
        header = f"[{display_name} — Page {page}/{total_pages} — {full_len} chars]"
    else:
        header = f"[{display_name} — {full_len} chars]"

    return f"{header}\n{text}"
|
||||
|
||||
@mcp.tool()
|
||||
async def list_snapshots(share_path: str, offset: int = 0, limit: int = 100):
|
||||
"""List Btrfs snapshots for a shared folder (requires Btrfs volume)."""
|
||||
|
||||
Reference in New Issue
Block a user