mirror of
https://github.com/kuaifan/dootask.git
synced 2026-01-23 10:28:13 +00:00
feat(file): 添加文件内容提取 API 支持分页读取
- FileController: 新增 fetch API 通过路径获取文本内容 - FileController: one API 支持 with_text 参数提取文本 - ManticoreFile: 实现分页提取 extractFileContentPaginated - TextExtractor: 添加 truncate 参数支持内容截取
This commit is contained in:
parent
6c22e373f7
commit
23b06327d6
@ -17,6 +17,7 @@ use App\Module\Down;
|
|||||||
use App\Module\Lock;
|
use App\Module\Lock;
|
||||||
use App\Module\Timer;
|
use App\Module\Timer;
|
||||||
use App\Module\Ihttp;
|
use App\Module\Ihttp;
|
||||||
|
use App\Module\Manticore\ManticoreFile;
|
||||||
use Response;
|
use Response;
|
||||||
use Swoole\Coroutine;
|
use Swoole\Coroutine;
|
||||||
use Carbon\Carbon;
|
use Carbon\Carbon;
|
||||||
@ -68,6 +69,11 @@ class FileController extends AbstractController
|
|||||||
* @apiParam {String} [with_url] 是否返回文件访问URL
|
* @apiParam {String} [with_url] 是否返回文件访问URL
|
||||||
* - no: 不返回(默认)
|
* - no: 不返回(默认)
|
||||||
* - yes: 返回content_url字段
|
* - yes: 返回content_url字段
|
||||||
|
* @apiParam {String} [with_text] 是否提取文件文本内容(用于AI阅读,支持分页)
|
||||||
|
* - no: 不提取(默认)
|
||||||
|
* - yes: 提取文本内容,支持 docx/xlsx/pptx/pdf/txt 等格式
|
||||||
|
* @apiParam {Number} [text_offset] with_text=yes时有效,文本起始位置(字符数),默认0
|
||||||
|
* @apiParam {Number} [text_limit] with_text=yes时有效,文本获取长度(字符数),默认50000,最大200000
|
||||||
*
|
*
|
||||||
* @apiSuccess {Number} ret 返回状态码(1正确、0错误)
|
* @apiSuccess {Number} ret 返回状态码(1正确、0错误)
|
||||||
* @apiSuccess {String} msg 返回信息(错误描述)
|
* @apiSuccess {String} msg 返回信息(错误描述)
|
||||||
@ -77,6 +83,9 @@ class FileController extends AbstractController
|
|||||||
{
|
{
|
||||||
$id = Request::input('id');
|
$id = Request::input('id');
|
||||||
$with_url = Request::input('with_url', 'no');
|
$with_url = Request::input('with_url', 'no');
|
||||||
|
$with_text = Request::input('with_text', 'no');
|
||||||
|
$text_offset = intval(Request::input('text_offset', 0));
|
||||||
|
$text_limit = intval(Request::input('text_limit', 50000));
|
||||||
//
|
//
|
||||||
$permission = 0;
|
$permission = 0;
|
||||||
if (Base::isNumber($id)) {
|
if (Base::isNumber($id)) {
|
||||||
@ -112,9 +121,57 @@ class FileController extends AbstractController
|
|||||||
$array['content_url'] = FileContent::getFileUrl($file->id);
|
$array['content_url'] = FileContent::getFileUrl($file->id);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 如果请求提取文本内容
|
||||||
|
if ($with_text === 'yes') {
|
||||||
|
$array['text_content'] = ManticoreFile::extractFileContentPaginated($file, $text_offset, $text_limit);
|
||||||
|
}
|
||||||
|
|
||||||
return Base::retSuccess('success', $array);
|
return Base::retSuccess('success', $array);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * @api {get} api/file/fetch Fetch a file's text content by path
 *
 * @apiDescription Lets MCP/AI tools read a file's text content by its path; large files can be read page by page
 * @apiVersion 1.0.0
 * @apiGroup file
 * @apiName fetch
 *
 * @apiParam {String} path          File path (relative to the system root, e.g. uploads/file/...)
 * @apiParam {Number} [offset]      Start position in characters, default 0
 * @apiParam {Number} [limit]       Number of characters to return, default 50000, maximum 200000
 *
 * @apiSuccess {Number} ret     Status code (1 = success, 0 = error)
 * @apiSuccess {String} msg     Message (error description)
 * @apiSuccess {Object} data    Returned data
 * - content: text content
 * - total_length: total length of the full content
 * - offset: start position of this page
 * - limit: length requested for this page
 * - has_more: whether more content follows
 */
public function fetch()
{
    User::auth();
    // NOTE(review): only authentication is performed in this method; nothing here
    // checks whether the current user may read the file behind `path`. Confirm
    // that this is intended, or resolve the path to a File and check permission.
    $path = Request::input('path');
    $offset = intval(Request::input('offset', 0));
    $limit = intval(Request::input('limit', 50000));

    // `path` can be absent (null) or submitted as an array (path[]=...).
    // Normalise it to a string before trimming so both cases end in the
    // parameter error below instead of a deprecation notice or a TypeError.
    $path = is_scalar($path) ? trim((string) $path) : '';
    if ($path === '') {
        return Base::retError('参数错误:path 不能为空');
    }

    // The raw path/URL is passed through; ManticoreFile resolves and validates it.
    $result = ManticoreFile::extractFileContentPaginated($path, $offset, $limit);

    if (isset($result['error'])) {
        return Base::retError($result['error']);
    }

    return Base::retSuccess('success', $result);
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @api {get} api/file/search 搜索文件列表
|
* @api {get} api/file/search 搜索文件列表
|
||||||
*
|
*
|
||||||
|
|||||||
@ -10,7 +10,6 @@ use App\Module\Base;
|
|||||||
use App\Module\TextExtractor;
|
use App\Module\TextExtractor;
|
||||||
use App\Module\AI;
|
use App\Module\AI;
|
||||||
use Illuminate\Support\Facades\Log;
|
use Illuminate\Support\Facades\Log;
|
||||||
use Illuminate\Support\Facades\DB;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Manticore Search 文件搜索类
|
* Manticore Search 文件搜索类
|
||||||
@ -355,7 +354,85 @@ class ManticoreFile
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Extract a file's text content and return a single page of it.
 *
 * @param File|string $fileOrPath File model, or a file path / URL
 * @param int $offset Start position in characters (negative values are clamped to 0)
 * @param int $limit  Page length in characters (clamped to the range 1..200000)
 * @return array On success: content, total_length, offset, limit, has_more.
 *               On failure: an array with an `error` key.
 */
public static function extractFileContentPaginated(File|string $fileOrPath, int $offset = 0, int $limit = 50000): array
{
    $offset = max(0, $offset);
    $limit = min(max(1, $limit), 200000);

    // Obtain the full text, depending on what the caller handed in.
    if ($fileOrPath instanceof File) {
        if ($fileOrPath->type === 'folder') {
            return ['error' => '文件夹无法提取内容'];
        }
        $fullContent = self::extractFileContent($fileOrPath);
    } else {
        $fullContent = self::extractFileContentFromPath($fileOrPath);
        if (is_array($fullContent)) {
            // The path helper reports failures as an error array; pass it on.
            return $fullContent;
        }
    }

    // Compare against the empty string explicitly: empty() would also reject
    // a file whose entire text is "0".
    if (!is_string($fullContent) || $fullContent === '') {
        return ['error' => '无法提取文件内容'];
    }

    // Pagination is character based (mb_*), matching the documented units.
    $totalLength = mb_strlen($fullContent);

    // When $offset is at or past the end, mb_substr() yields '' and has_more
    // evaluates to false, so no separate out-of-range branch is needed.
    $content = mb_substr($fullContent, $offset, $limit);
    $hasMore = ($offset + mb_strlen($content)) < $totalLength;

    return [
        'content' => $content,
        'total_length' => $totalLength,
        'offset' => $offset,
        'limit' => $limit,
        'has_more' => $hasMore,
    ];
}
|
||||||
|
|
||||||
|
/**
 * Extract the full text content of a file addressed by a relative path or URL.
 *
 * The input is reduced to a path starting with `uploads/`; anything else,
 * including paths that try to climb out of that directory, is rejected.
 *
 * @param string $pathOrUrl Relative path or http(s) URL
 * @return string|array Content string, or an array with an `error` key
 */
private static function extractFileContentFromPath(string $pathOrUrl): string|array
{
    // For URLs, keep only the path component.
    if (str_starts_with($pathOrUrl, 'http://') || str_starts_with($pathOrUrl, 'https://')) {
        $urlPath = parse_url($pathOrUrl, PHP_URL_PATH);
        $pathOrUrl = is_string($urlPath) ? ltrim($urlPath, '/') : '';
    }
    // Drop whatever precedes the first "uploads/" occurrence.
    if (preg_match('/^.*?(uploads\/.*)$/', $pathOrUrl, $matches)) {
        $pathOrUrl = $matches[1];
    }

    // Safety check: only the uploads directory is allowed.
    if (!str_starts_with($pathOrUrl, 'uploads/')) {
        return ['error' => '不支持的文件路径'];
    }

    // A prefix check alone is not enough: "uploads/../../.env" starts with
    // "uploads/" yet resolves outside of it. Reject any ".." segment (using
    // either separator) and NUL bytes, which make filesystem calls throw.
    $segments = preg_split('#[/\\\\]#', $pathOrUrl) ?: [];
    if (str_contains($pathOrUrl, "\0") || in_array('..', $segments, true)) {
        return ['error' => '不支持的文件路径'];
    }

    return self::extractFromPath($pathOrUrl);
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 提取文件内容(内部使用,返回完整内容)
|
||||||
*
|
*
|
||||||
* @param File $file 文件模型
|
* @param File $file 文件模型
|
||||||
* @return string 文件内容文本
|
* @return string 文件内容文本
|
||||||
@ -364,37 +441,28 @@ class ManticoreFile
|
|||||||
{
|
{
|
||||||
// 1. 先尝试从 FileContent 的 text 字段获取(已提取的文本内容)
|
// 1. 先尝试从 FileContent 的 text 字段获取(已提取的文本内容)
|
||||||
$fileContent = FileContent::where('fid', $file->id)->orderByDesc('id')->first();
|
$fileContent = FileContent::where('fid', $file->id)->orderByDesc('id')->first();
|
||||||
if ($fileContent && !empty($fileContent->text)) {
|
if (!$fileContent) {
|
||||||
|
return '';
|
||||||
|
}
|
||||||
|
if (!empty($fileContent->text)) {
|
||||||
return $fileContent->text;
|
return $fileContent->text;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2. 尝试从 FileContent 的 content 字段获取
|
// 2. 尝试从 FileContent 的 content 字段获取
|
||||||
if ($fileContent && !empty($fileContent->content)) {
|
if (!empty($fileContent->content)) {
|
||||||
$contentData = Base::json2array($fileContent->content);
|
$contentData = Base::json2array($fileContent->content);
|
||||||
|
|
||||||
// 2.1 某些文件类型直接存储内容
|
// 2.1 某些文件类型直接存储内容
|
||||||
if (!empty($contentData['content'])) {
|
if (!empty($contentData['content']) && is_string($contentData['content'])) {
|
||||||
return is_string($contentData['content']) ? $contentData['content'] : '';
|
return $contentData['content'];
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2.2 尝试使用 TextExtractor 提取文件内容
|
// 2.2 通过路径提取
|
||||||
$filePath = $contentData['url'] ?? null;
|
$filePath = $contentData['url'] ?? null;
|
||||||
if ($filePath && str_starts_with($filePath, 'uploads/')) {
|
if ($filePath && str_starts_with($filePath, 'uploads/')) {
|
||||||
$fullPath = public_path($filePath);
|
$result = self::extractFromPath($filePath);
|
||||||
if (file_exists($fullPath)) {
|
if (is_string($result)) {
|
||||||
// 根据文件类型设置不同的大小限制
|
return $result;
|
||||||
$ext = strtolower(pathinfo($fullPath, PATHINFO_EXTENSION));
|
|
||||||
$maxFileSize = self::getMaxFileSizeByExt($ext);
|
|
||||||
$maxContentSize = self::MAX_CONTENT_LENGTH;
|
|
||||||
|
|
||||||
$result = TextExtractor::extractFile(
|
|
||||||
$fullPath,
|
|
||||||
(int) ($maxFileSize / 1024), // 转换为 KB
|
|
||||||
(int) ($maxContentSize / 1024) // 转换为 KB
|
|
||||||
);
|
|
||||||
if (Base::isSuccess($result)) {
|
|
||||||
return $result['data'] ?? '';
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -402,6 +470,33 @@ class ManticoreFile
|
|||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Extract text content from a file path (core method).
 *
 * @param string $relativePath Path that is resolved through public_path()
 * @return string|array Content string, or an array with an `error` key
 */
private static function extractFromPath(string $relativePath): string|array
{
    $fullPath = public_path($relativePath);
    // is_file() rather than file_exists(): a directory must be reported as
    // "not found" here instead of reaching the extractor, whose failure
    // message contains the absolute path.
    if (!is_file($fullPath)) {
        return ['error' => '文件不存在'];
    }

    $ext = strtolower(pathinfo($fullPath, PATHINFO_EXTENSION));
    $maxFileSize = self::getMaxFileSizeByExt($ext);

    // Both limits are divided by 1024 because extractFile() documents its
    // size parameters in KB.
    $result = TextExtractor::extractFile(
        $fullPath,
        (int) ($maxFileSize / 1024),
        (int) (self::MAX_CONTENT_LENGTH / 1024)
    );

    if (!Base::isSuccess($result)) {
        return ['error' => $result['msg'] ?? '无法提取文件内容'];
    }

    // Callers treat any array result as an error, so never hand back a
    // non-string payload as content.
    $data = $result['data'] ?? '';
    return is_string($data) ? $data : '';
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 构建用于生成向量的内容
|
* 构建用于生成向量的内容
|
||||||
* 包含文件名和文件内容,确保语义搜索能匹配文件名
|
* 包含文件名和文件内容,确保语义搜索能匹配文件名
|
||||||
|
|||||||
@ -233,11 +233,12 @@ class TextExtractor
|
|||||||
/**
|
/**
|
||||||
* 获取文件内容
|
* 获取文件内容
|
||||||
* @param $filePath
|
* @param $filePath
|
||||||
* @param int $fileMaxSize 最大文件大小,单位字节,默认1024KB
|
* @param int $fileMaxSize 最大文件大小,单位KB,默认1024KB
|
||||||
* @param int $contentMaxSize 最大内容大小,单位字节,默认300KB
|
* @param int $contentMaxSize 最大内容大小,单位KB,默认300KB
|
||||||
|
* @param bool $truncate 超过contentMaxSize时是否截取,默认true截取,false返回错误
|
||||||
* @return array
|
* @return array
|
||||||
*/
|
*/
|
||||||
public static function extractFile($filePath, int $fileMaxSize = 1024, int $contentMaxSize = 300): array
|
public static function extractFile($filePath, int $fileMaxSize = 1024, int $contentMaxSize = 300, bool $truncate = true): array
|
||||||
{
|
{
|
||||||
if (!file_exists($filePath) || !is_file($filePath)) {
|
if (!file_exists($filePath) || !is_file($filePath)) {
|
||||||
return Base::retError("Failed to read contents of {$filePath}");
|
return Base::retError("Failed to read contents of {$filePath}");
|
||||||
@ -248,8 +249,13 @@ class TextExtractor
|
|||||||
try {
|
try {
|
||||||
$extractor = new self($filePath);
|
$extractor = new self($filePath);
|
||||||
$content = $extractor->extractContent();
|
$content = $extractor->extractContent();
|
||||||
if (strlen($content) > $contentMaxSize * 1024) {
|
$maxBytes = $contentMaxSize * 1024;
|
||||||
return Base::retError("Content size exceeds " . Base::readableBytes($contentMaxSize * 1024) . ", unable to display content");
|
if (strlen($content) > $maxBytes) {
|
||||||
|
if ($truncate) {
|
||||||
|
$content = mb_substr($content, 0, $maxBytes);
|
||||||
|
} else {
|
||||||
|
return Base::retError("Content size exceeds " . Base::readableBytes($maxBytes) . ", unable to display content");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return Base::retSuccess("success", $content);
|
return Base::retSuccess("success", $content);
|
||||||
} catch (Exception $e) {
|
} catch (Exception $e) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user