feat(file): 添加文件内容提取 API 支持分页读取

- FileController: 新增 fetch API 通过路径获取文本内容 - FileController: one API 支持 with_text 参数提取文本 - ManticoreFile: 实现分页提取 extractFileContentPaginated - TextExtractor: 添加 truncate 参数支持内容截取
2026-01-23 10:28:13 +00:00 · 2026-01-16 01:39:19 +00:00 · 2026-01-16 01:39:19 +00:00 · 23b06327d6
commit 23b06327d6
parent 6c22e373f7
3 changed files with 185 additions and 27 deletions
--- a/app/Http/Controllers/Api/FileController.php
+++ b/app/Http/Controllers/Api/FileController.php
@ -17,6 +17,7 @@ use App\Module\Down;
 use App\Module\Lock;
 use App\Module\Timer;
 use App\Module\Ihttp;
 use App\Module\Manticore\ManticoreFile;
 use Response;
 use Swoole\Coroutine;
 use Carbon\Carbon;
@ -68,6 +69,11 @@ class FileController extends AbstractController
     * @apiParam {String} [with_url]            是否返回文件访问URL
     * - no: 不返回（默认）
     * - yes: 返回content_url字段
     * @apiParam {String} [with_text]           是否提取文件文本内容（用于AI阅读，支持分页）
     * - no: 不提取（默认）
     * - yes: 提取文本内容，支持 docx/xlsx/pptx/pdf/txt 等格式
     * @apiParam {Number} [text_offset]         with_text=yes时有效，文本起始位置（字符数），默认0
     * @apiParam {Number} [text_limit]          with_text=yes时有效，文本获取长度（字符数），默认50000，最大200000
     *
     * @apiSuccess {Number} ret     返回状态码（1正确、0错误）
     * @apiSuccess {String} msg     返回信息（错误描述）
@ -77,6 +83,9 @@ class FileController extends AbstractController
    {
        $id = Request::input('id');
        $with_url = Request::input('with_url', 'no');
        $with_text = Request::input('with_text', 'no');
        $text_offset = intval(Request::input('text_offset', 0));
        $text_limit = intval(Request::input('text_limit', 50000));
        //
        $permission = 0;
        if (Base::isNumber($id)) {
@ -112,9 +121,57 @@ class FileController extends AbstractController
            $array['content_url'] = FileContent::getFileUrl($file->id);
        }
        // 如果请求提取文本内容
        if ($with_text === 'yes') {
            $array['text_content'] = ManticoreFile::extractFileContentPaginated($file, $text_offset, $text_limit);
        }
        return Base::retSuccess('success', $array);
    }
    /**
     * @api {get} api/file/fetch 通过路径获取文件文本内容
     *
     * @apiDescription 用于 MCP/AI 工具通过文件路径获取内容，支持分页获取大文件
     * @apiVersion 1.0.0
     * @apiGroup file
     * @apiName fetch
     *
     * @apiParam {String} path              文件路径（相对于系统根目录，如 uploads/file/...）
     * @apiParam {Number} [offset]          起始位置（字符数），默认0
     * @apiParam {Number} [limit]           获取长度（字符数），默认50000，最大200000
     *
     * @apiSuccess {Number} ret     返回状态码（1正确、0错误）
     * @apiSuccess {String} msg     返回信息（错误描述）
     * @apiSuccess {Object} data    返回数据
     * - content: 文本内容
     * - total_length: 完整内容总长度
     * - offset: 当前起始位置
     * - limit: 本次获取长度
     * - has_more: 是否还有更多内容
     */
    public function fetch()
    {
        User::auth();
        //
        $path = trim(Request::input('path'));
        $offset = intval(Request::input('offset', 0));
        $limit = intval(Request::input('limit', 50000));
        if (empty($path)) {
            return Base::retError('参数错误：path 不能为空');
        }
        // 直接传入路径，ManticoreFile 内部处理 URL 解析
        $result = ManticoreFile::extractFileContentPaginated($path, $offset, $limit);
        if (isset($result['error'])) {
            return Base::retError($result['error']);
        }
        return Base::retSuccess('success', $result);
    }
    /**
     * @api {get} api/file/search 搜索文件列表
     *
--- a/app/Module/Manticore/ManticoreFile.php
+++ b/app/Module/Manticore/ManticoreFile.php
@ -10,7 +10,6 @@ use App\Module\Base;
 use App\Module\TextExtractor;
 use App\Module\AI;
 use Illuminate\Support\Facades\Log;
 use Illuminate\Support\Facades\DB;
 /**
 * Manticore Search 文件搜索类
@ -355,7 +354,85 @@ class ManticoreFile
    }
    /**
-     * 提取文件内容
+     * 提取文件内容（支持分页）
     *
     * @param File|string $fileOrPath 文件模型 或 文件路径/URL
     * @param int $offset 起始位置（字符数），默认 0
     * @param int $limit 获取长度（字符数），默认 50000，最大 200000
     * @return array 包含 content, total_length, offset, limit, has_more, 或 error
     */
    public static function extractFileContentPaginated(File|string $fileOrPath, int $offset = 0, int $limit = 50000): array
    {
        $offset = max(0, $offset);
        $limit = min(max(1, $limit), 200000);
        // 根据参数类型获取完整内容
        if ($fileOrPath instanceof File) {
            if ($fileOrPath->type === 'folder') {
                return ['error' => '文件夹无法提取内容'];
            }
            $fullContent = self::extractFileContent($fileOrPath);
        } else {
            $fullContent = self::extractFileContentFromPath($fileOrPath);
            if (is_array($fullContent)) {
                return $fullContent; // 返回错误信息
            }
        }
        if (empty($fullContent)) {
            return ['error' => '无法提取文件内容'];
        }
        // 分页处理
        $totalLength = mb_strlen($fullContent);
        if ($offset >= $totalLength) {
            return [
                'content' => '',
                'total_length' => $totalLength,
                'offset' => $offset,
                'limit' => $limit,
                'has_more' => false,
            ];
        }
        $content = mb_substr($fullContent, $offset, $limit);
        $hasMore = ($offset + mb_strlen($content)) < $totalLength;
        return [
            'content' => $content,
            'total_length' => $totalLength,
            'offset' => $offset,
            'limit' => $limit,
            'has_more' => $hasMore,
        ];
    }
    /**
     * 通过路径/URL 提取完整内容
     * @return string|array 内容字符串，或错误数组
     */
    private static function extractFileContentFromPath(string $pathOrUrl): string|array
    {
        // 从 URL 中提取相对路径
        if (str_starts_with($pathOrUrl, 'http://') || str_starts_with($pathOrUrl, 'https://')) {
            $parsed = parse_url($pathOrUrl);
            $pathOrUrl = ltrim($parsed['path'] ?? '', '/');
        }
        if (preg_match('/^.*?(uploads\/.*)$/', $pathOrUrl, $matches)) {
            $pathOrUrl = $matches[1];
        }
        // 安全检查：只允许 uploads 目录
        if (!str_starts_with($pathOrUrl, 'uploads/')) {
            return ['error' => '不支持的文件路径'];
        }
        return self::extractFromPath($pathOrUrl);
    }
    /**
     * 提取文件内容（内部使用，返回完整内容）
     *
     * @param File $file 文件模型
     * @return string 文件内容文本
@ -364,37 +441,28 @@ class ManticoreFile
    {
        // 1. 先尝试从 FileContent 的 text 字段获取（已提取的文本内容）
        $fileContent = FileContent::where('fid', $file->id)->orderByDesc('id')->first();
-        if ($fileContent && !empty($fileContent->text)) {
+        if (!$fileContent) {
            return '';
        }
        if (!empty($fileContent->text)) {
            return $fileContent->text;
        }
        // 2. 尝试从 FileContent 的 content 字段获取
-        if ($fileContent && !empty($fileContent->content)) {
+        if (!empty($fileContent->content)) {
            $contentData = Base::json2array($fileContent->content);
            // 2.1 某些文件类型直接存储内容
-            if (!empty($contentData['content'])) {
+            if (!empty($contentData['content']) && is_string($contentData['content'])) {
-                return is_string($contentData['content']) ? $contentData['content'] : '';
+                return $contentData['content'];
            }
-            // 2.2 尝试使用 TextExtractor 提取文件内容
+            // 2.2 通过路径提取
            $filePath = $contentData['url'] ?? null;
            if ($filePath && str_starts_with($filePath, 'uploads/')) {
-                $fullPath = public_path($filePath);
+                $result = self::extractFromPath($filePath);
-                if (file_exists($fullPath)) {
+                if (is_string($result)) {
-                    // 根据文件类型设置不同的大小限制
+                    return $result;
                    $ext = strtolower(pathinfo($fullPath, PATHINFO_EXTENSION));
                    $maxFileSize = self::getMaxFileSizeByExt($ext);
                    $maxContentSize = self::MAX_CONTENT_LENGTH;
                    $result = TextExtractor::extractFile(
                        $fullPath,
                        (int) ($maxFileSize / 1024),     // 转换为 KB
                        (int) ($maxContentSize / 1024)   // 转换为 KB
                    );
                    if (Base::isSuccess($result)) {
                        return $result['data'] ?? '';
                    }
                }
            }
        }
@ -402,6 +470,33 @@ class ManticoreFile
        return '';
    }
    /**
     * 从文件路径提取内容（核心方法）
     * @return string|array 内容字符串，或错误数组
     */
    private static function extractFromPath(string $relativePath): string|array
    {
        $fullPath = public_path($relativePath);
        if (!file_exists($fullPath)) {
            return ['error' => '文件不存在'];
        }
        $ext = strtolower(pathinfo($fullPath, PATHINFO_EXTENSION));
        $maxFileSize = self::getMaxFileSizeByExt($ext);
        $result = TextExtractor::extractFile(
            $fullPath,
            (int) ($maxFileSize / 1024),
            (int) (self::MAX_CONTENT_LENGTH / 1024)
        );
        if (!Base::isSuccess($result)) {
            return ['error' => $result['msg'] ?? '无法提取文件内容'];
        }
        return $result['data'] ?? '';
    }
    /**
     * 构建用于生成向量的内容
     * 包含文件名和文件内容，确保语义搜索能匹配文件名
--- a/app/Module/TextExtractor.php
+++ b/app/Module/TextExtractor.php
@ -233,11 +233,12 @@ class TextExtractor
    /**
     * 获取文件内容
     * @param $filePath
-     * @param int $fileMaxSize      最大文件大小，单位字节，默认1024KB
+     * @param int $fileMaxSize      最大文件大小，单位KB，默认1024KB
-     * @param int $contentMaxSize   最大内容大小，单位字节，默认300KB
+     * @param int $contentMaxSize   最大内容大小，单位KB，默认300KB
     * @param bool $truncate        超过contentMaxSize时是否截取，默认true截取，false返回错误
     * @return array
     */
-    public static function extractFile($filePath, int $fileMaxSize = 1024, int $contentMaxSize = 300): array
+    public static function extractFile($filePath, int $fileMaxSize = 1024, int $contentMaxSize = 300, bool $truncate = true): array
    {
        if (!file_exists($filePath) || !is_file($filePath)) {
            return Base::retError("Failed to read contents of {$filePath}");
@ -248,8 +249,13 @@ class TextExtractor
        try {
            $extractor = new self($filePath);
            $content = $extractor->extractContent();
-            if (strlen($content) > $contentMaxSize * 1024) {
+            $maxBytes = $contentMaxSize * 1024;
-                return Base::retError("Content size exceeds " . Base::readableBytes($contentMaxSize * 1024) . ", unable to display content");
+            if (strlen($content) > $maxBytes) {
                if ($truncate) {
                    $content = mb_substr($content, 0, $maxBytes);
                } else {
                    return Base::retError("Content size exceeds " . Base::readableBytes($maxBytes) . ", unable to display content");
                }
            }
            return Base::retSuccess("success", $content);
        } catch (Exception $e) {