perf: 优化AI解析文件

This commit is contained in:
kuaifan 2025-03-19 22:49:03 +08:00
parent 15d3ec9d81
commit 919b652a06
2 changed files with 63 additions and 60 deletions

View File

@ -3,7 +3,6 @@
namespace App\Module;
use Exception;
use Illuminate\Support\Facades\Log;
use PhpOffice\PhpWord\IOFactory;
use Smalot\PdfParser\Parser;
@ -19,25 +18,17 @@ class TextExtractor
public function extractText(string $filePath): string
{
// Extracts text from the file at $filePath, choosing the extraction routine
// from the lower-cased file extension.
//
// NOTE(review): this span looks like a rendered diff with the +/- markers
// stripped, so pre-change and post-change statements appear back to back.
// Read literally, the second `throw` below and the trailing `return match`
// are unreachable - confirm against the real file before relying on either.

// Fail fast when the path does not exist.
if (!file_exists($filePath)) {
throw new Exception("文件不存在: {$filePath}");
// Unreachable as written (follows a throw); presumably the post-change message.
throw new Exception("File does not exist: {$filePath}");
}
// Extension is compared case-insensitively.
$fileExtension = strtolower(pathinfo($filePath, PATHINFO_EXTENSION));
try {
// pdf / docx / ipynb have dedicated extractors; every other extension
// goes to extractFromOtherFile().
return match($fileExtension) {
'pdf' => $this->extractFromPDF($filePath),
'docx' => $this->extractFromDOCX($filePath),
'ipynb' => $this->extractFromIPYNB($filePath),
default => $this->extractFromOtherFile($filePath),
};
} catch (Exception $e) {
// Log the failure with the file path and message, then rethrow unchanged.
Log::error('文本提取失败', [
'file' => $filePath,
'error' => $e->getMessage()
]);
throw $e;
}
// Unreachable as written: the try block above either returns or rethrows.
// Presumably this is the post-change form (same dispatch, no log-and-rethrow wrapper).
return match($fileExtension) {
'pdf' => $this->extractFromPDF($filePath),
'docx' => $this->extractFromDOCX($filePath),
'ipynb' => $this->extractFromIPYNB($filePath),
default => $this->extractFromOtherFile($filePath),
};
}
/**
@ -55,11 +46,7 @@ class TextExtractor
return $pdf->getText();
} catch (Exception $e) {
Log::error('PDF解析失败', [
'file' => $filePath,
'error' => $e->getMessage()
]);
throw new Exception("PDF文本提取失败: " . $e->getMessage());
throw new Exception("PDF text extraction failed: " . $e->getMessage());
}
}
@ -99,7 +86,7 @@ class TextExtractor
$notebook = json_decode($content, true);
if (json_last_error() !== JSON_ERROR_NONE) {
throw new Exception("IPYNB文件解析失败: " . json_last_error_msg());
throw new Exception("IPYNB file parsing failed: " . json_last_error_msg());
}
$extractedText = '';
@ -127,7 +114,7 @@ class TextExtractor
protected function extractFromOtherFile(string $filePath): string
{
if ($this->isBinaryFile($filePath)) {
throw new Exception("无法读取该类型文件的文本内容");
throw new Exception("Unable to read the text content of this type of file");
}
return file_get_contents($filePath);
@ -158,25 +145,21 @@ class TextExtractor
* 获取文件内容
* @param $filePath
* @param float|int $maxSize 最大文件大小单位字节默认300KB
* @return string
* @return array
*/
public static function getFileContent($filePath, float|int $maxSize = 300 * 1024)
{
// Reads the text content of $filePath, refusing paths that are not regular
// files and files larger than $maxSize bytes (default 300 * 1024).
//
// NOTE(review): this span looks like a rendered diff with the +/- markers
// stripped. Each `return "(...)"` string is immediately followed by a
// Base::retError()/Base::retSuccess() variant that is unreachable as written;
// presumably those are the post-change lines. Confirm which return shape
// (plain string vs. Base::ret* result) callers actually receive.

// Path must exist and be a regular file.
if (!file_exists($filePath) || !is_file($filePath)) {
return "(Failed to read contents of {$filePath})";
return Base::retError("Failed to read contents of {$filePath}");
}
// Size guard: the limit is rendered for the message via Base::readableBytes().
if (filesize($filePath) > $maxSize) {
return "(File size exceeds " . Base::readableBytes($maxSize) . ", unable to display content)";
return Base::retError("File size exceeds " . Base::readableBytes($maxSize) . ", unable to display content");
}
$te = new self();
try {
// Binary files are reported with a placeholder message instead of being extracted.
$isBinary = $te->isBinaryFile($filePath);
if ($isBinary) {
return "(Binary file, unable to display content)";
}
return $te->extractText($filePath);
// Unreachable as written (follows a return).
$extractor = new self();
return Base::retSuccess("success", $extractor->extractText($filePath));
} catch (Exception $e) {
// Extraction failures are turned into a returned message rather than propagated.
return "(Failed to read contents of {$filePath}: {$e->getMessage()})";
return Base::retError($e->getMessage());
}
}
}

View File

@ -17,6 +17,7 @@ use App\Module\Doo;
use App\Module\Ihttp;
use App\Module\TextExtractor;
use Carbon\Carbon;
use Exception;
use League\HTMLToMarkdown\HtmlConverter;
use DB;
@ -90,8 +91,16 @@ class BotReceiveMsgTask extends AbstractTask
}
// 提取指令
$command = $this->extractCommand($msg, $botUser->isAiBot(), $this->mention);
if (empty($command)) {
try {
$command = $this->extractCommand($msg, $botUser->isAiBot(), $this->mention);
if (empty($command)) {
return;
}
} catch (Exception $e) {
WebSocketDialogMsg::sendMsg(null, $msg->dialog_id, 'template', [
'type' => 'content',
'content' => $e->getMessage() ?: "指令解析失败。",
], $botUser->userid, false, false, true); // todo 未能在任务end事件来发送任务
return;
}
@ -473,14 +482,20 @@ class BotReceiveMsgTask extends AbstractTask
if ($replyMsg) {
switch ($replyMsg->type) {
case 'text':
$replyCommand = $this->extractCommand($replyMsg, true);
if ($replyCommand) {
$replyCommand = Base::cutStr($replyCommand, 20000);
try {
$replyCommand = $this->extractCommand($replyMsg, true);
} catch (Exception) {
$errorContent = "引用消息解析失败。";
}
break;
case 'file':
$msgData = Base::json2array($replyMsg->getRawOriginal('msg'));
$replyCommand = TextExtractor::getFileContent(public_path($msgData['path']));
$fileResult = TextExtractor::getFileContent(public_path($msgData['path']));
if (Base::isError($fileResult)) {
$errorContent = $fileResult['msg'];
} else {
$replyCommand = $fileResult['data'];
}
break;
}
}
@ -581,6 +596,7 @@ class BotReceiveMsgTask extends AbstractTask
* @param bool $isAiBot
* @param bool $mention
* @return string
* @throws Exception
*/
private function extractCommand(WebSocketDialogMsg $msg, bool $isAiBot = false, bool $mention = false)
{
@ -608,13 +624,12 @@ class BotReceiveMsgTask extends AbstractTask
if (preg_match_all("/<span class=\"mention task\" data-id=\"(\d+)\">(.*?)<\/span>/", $original, $match)) {
$taskIds = Base::newIntval($match[1]);
foreach ($taskIds as $index => $taskId) {
$taskName = addslashes($match[2][$index]) . " (ID:{$taskId})";
$taskContext = "任务状态:不存在或已删除";
$taskInfo = ProjectTask::with(['content'])->whereId($taskId)->first();
if ($taskInfo) {
$taskName = addslashes($taskInfo->name) . " (ID:{$taskId})";
$taskContext = implode("\n", $taskInfo->AIContext());
if (!$taskInfo) {
throw new Exception("任务不存在或已被删除");
}
$taskName = addslashes($taskInfo->name) . " (ID:{$taskId})";
$taskContext = implode("\n", $taskInfo->AIContext());
$contents[] = "<task_content path=\"{$taskName}\">\n{$taskContext}\n</task_content>";
$original = str_replace($match[0][$index], "'{$taskName}' (see below for task_content tag)", $original);
}
@ -628,28 +643,31 @@ class BotReceiveMsgTask extends AbstractTask
$pathContent = null;
// 文件
if (preg_match("/single\/file\/(.*?)$/", $urlPath, $fileMatch)) {
$pathTag = "file_content";
$pathName = addslashes($match[3][$index]);
$pathContent = "文件状态:不存在或已删除";
$fileInfo = FileContent::idOrCodeToContent($fileMatch[1]);
if ($fileInfo && isset($fileInfo->content['url'])) {
$urlPath = public_path($fileInfo->content['url']);
if (file_exists($urlPath)) {
$pathName .= " (ID:{$fileInfo->id})";
$pathContent = TextExtractor::getFileContent($urlPath);
}
if (!$fileInfo || !isset($fileInfo->content['url'])) {
throw new Exception("文件不存在或已被删除");
}
$urlPath = public_path($fileInfo->content['url']);
if (!file_exists($urlPath)) {
throw new Exception("文件不存在或已被删除");
}
$fileResult = TextExtractor::getFileContent($urlPath);
if (Base::isError($fileResult)) {
throw new Exception("文件读取失败:" . $fileResult['msg']);
}
$pathTag = "file_content";
$pathName = addslashes($match[3][$index]) . " (ID:{$fileInfo->id})";
$pathContent = $fileResult['data'];
}
// 报告
elseif (preg_match("/single\/report\/detail\/(.*?)$/", $urlPath, $reportMatch)) {
$pathTag = "report_content";
$pathName = addslashes($match[3][$index]);
$pathContent = "报告状态:不存在或已删除";
$reportInfo = Report::idOrCodeToContent($reportMatch[1]);
if ($reportInfo) {
$pathName .= " (ID:{$reportInfo->id})";
$pathContent = $reportInfo->content;
if (!$reportInfo) {
throw new Exception("报告不存在或已被删除");
}
$pathTag = "report_content";
$pathName = addslashes($match[3][$index]) . " (ID:{$reportInfo->id})";
$pathContent = $reportInfo->content;
}
if ($pathTag) {
$contents[] = "<{$pathTag} path=\"{$pathName}\">\n{$pathContent}\n</{$pathTag}>";
@ -662,7 +680,9 @@ class BotReceiveMsgTask extends AbstractTask
try {
$converter = new HtmlConverter();
$original = $converter->convert($original);
} catch (\Exception) { }
} catch (\Exception) {
throw new Exception("Failed to convert HTML to Markdown");
}
}
if ($contents) {
// 添加tag内容