feat: add read_text tool — extract text from PDF/TXT/MD (v0.4.0)

Adds read_text tool with pypdf integration for PDF text extraction, plain-text decode for TXT/MD/CSV/JSON/etc., page-filter support, max_chars truncation, and 11 mock-based tests.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-16 17:09:32 +02:00
parent 1bb75c9f36
commit 65cb5a44c7
4 changed files with 333 additions and 2 deletions
@@ -1,3 +1,3 @@
 """MCP server for Synology FileStation."""

-__version__ = "0.3.6"
+__version__ = "0.4.0"
@@ -1319,6 +1319,109 @@ def register_filestation(

        return "\n".join(lines)

+    @mcp.tool()
+    async def read_text(path: str, max_chars: int = 50_000, page: int = 0):
+        """Extract readable text from a PDF, TXT, or MD file (max 10 MB)."""
+        import io
+
+        from mcp_synology_filestation.client import SynologyError
+
+        max_read_bytes = 10 * 1024 * 1024
+
+        plain_text_exts = {
+            ".txt",
+            ".md",
+            ".markdown",
+            ".csv",
+            ".log",
+            ".yaml",
+            ".yml",
+            ".json",
+            ".xml",
+            ".html",
+            ".htm",
+        }
+
+        path_lower = path.lower()
+        if path_lower.endswith(".pdf"):
+            file_type = "pdf"
+        elif any(path_lower.endswith(ext) for ext in plain_text_exts):
+            file_type = "text"
+        else:
+            return (
+                "Error: Unsupported file type. Supported: PDF, TXT, MD, CSV, JSON, YAML, XML, HTML"
+                " and other plain text formats."
+            )
+
+        try:
+            filename, content = await client.download_bytes(path)
+        except SynologyError as e:
+            return f"Error: {e}"
+
+        size = len(content)
+        if size > max_read_bytes:
+            return f"Error: File too large ({size / 1024 / 1024:.1f} MB). Maximum is 10 MB."
+
+        total_pages = 0
+
+        if file_type == "pdf":
+            try:
+                from pypdf import PdfReader
+
+                reader = PdfReader(io.BytesIO(content))
+                total_pages = len(reader.pages)
+
+                if page == 0:
+                    page_texts = [pg.extract_text() or "" for pg in reader.pages]
+                    if not any(t.strip() for t in page_texts):
+                        return (
+                            "Error: No extractable text found. The PDF may be image-only"
+                            " (scanned without OCR layer)."
+                        )
+                    parts: list[str] = []
+                    for i, pg_text in enumerate(page_texts):
+                        if i == 0:
+                            parts.append(pg_text)
+                        else:
+                            parts.append(f"\n\n--- Page {i + 1} ---\n\n{pg_text}")
+                    text = "".join(parts)
+                else:
+                    if page > total_pages:
+                        return (
+                            f"Error: Page {page} does not exist — "
+                            f"this PDF has {total_pages} page(s)."
+                        )
+                    text = reader.pages[page - 1].extract_text() or ""
+                    if not text.strip():
+                        return (
+                            "Error: No extractable text found. The PDF may be image-only"
+                            " (scanned without OCR layer)."
+                        )
+
+            except SynologyError:
+                raise
+            except Exception as exc:
+                return f"Error: Failed to parse PDF: {exc}"
+        else:
+            text = content.decode("utf-8", errors="replace")
+
+        full_len = len(text)
+
+        if max_chars > 0 and full_len > max_chars:
+            text = (
+                text[:max_chars]
+                + f"\n\n[Truncated: {full_len} total chars, showing first {max_chars}."
+                + " Use max_chars parameter to adjust.]"
+            )
+
+        display_name = path.rsplit("/", 1)[-1]
+        if file_type == "pdf" and page != 0:
+            header = f"[{display_name} — Page {page}/{total_pages} — {full_len} chars]"
+        else:
+            header = f"[{display_name} — {full_len} chars]"
+
+        return f"{header}\n{text}"
+
    @mcp.tool()
    async def list_snapshots(share_path: str, offset: int = 0, limit: int = 100):
        """List Btrfs snapshots for a shared folder (requires Btrfs volume)."""