mirror of
https://github.com/kuaifan/dootask.git
synced 2025-12-12 03:01:12 +00:00
perf: 优化AI解析文件
This commit is contained in:
parent
15d3ec9d81
commit
919b652a06
@ -3,7 +3,6 @@
|
|||||||
namespace App\Module;
|
namespace App\Module;
|
||||||
|
|
||||||
use Exception;
|
use Exception;
|
||||||
use Illuminate\Support\Facades\Log;
|
|
||||||
use PhpOffice\PhpWord\IOFactory;
|
use PhpOffice\PhpWord\IOFactory;
|
||||||
use Smalot\PdfParser\Parser;
|
use Smalot\PdfParser\Parser;
|
||||||
|
|
||||||
@ -19,25 +18,17 @@ class TextExtractor
|
|||||||
public function extractText(string $filePath): string
|
public function extractText(string $filePath): string
|
||||||
{
|
{
|
||||||
if (!file_exists($filePath)) {
|
if (!file_exists($filePath)) {
|
||||||
throw new Exception("文件不存在: {$filePath}");
|
throw new Exception("File does not exist: {$filePath}");
|
||||||
}
|
}
|
||||||
|
|
||||||
$fileExtension = strtolower(pathinfo($filePath, PATHINFO_EXTENSION));
|
$fileExtension = strtolower(pathinfo($filePath, PATHINFO_EXTENSION));
|
||||||
|
|
||||||
try {
|
|
||||||
return match($fileExtension) {
|
return match($fileExtension) {
|
||||||
'pdf' => $this->extractFromPDF($filePath),
|
'pdf' => $this->extractFromPDF($filePath),
|
||||||
'docx' => $this->extractFromDOCX($filePath),
|
'docx' => $this->extractFromDOCX($filePath),
|
||||||
'ipynb' => $this->extractFromIPYNB($filePath),
|
'ipynb' => $this->extractFromIPYNB($filePath),
|
||||||
default => $this->extractFromOtherFile($filePath),
|
default => $this->extractFromOtherFile($filePath),
|
||||||
};
|
};
|
||||||
} catch (Exception $e) {
|
|
||||||
Log::error('文本提取失败', [
|
|
||||||
'file' => $filePath,
|
|
||||||
'error' => $e->getMessage()
|
|
||||||
]);
|
|
||||||
throw $e;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -55,11 +46,7 @@ class TextExtractor
|
|||||||
|
|
||||||
return $pdf->getText();
|
return $pdf->getText();
|
||||||
} catch (Exception $e) {
|
} catch (Exception $e) {
|
||||||
Log::error('PDF解析失败', [
|
throw new Exception("PDF text extraction failed: " . $e->getMessage());
|
||||||
'file' => $filePath,
|
|
||||||
'error' => $e->getMessage()
|
|
||||||
]);
|
|
||||||
throw new Exception("PDF文本提取失败: " . $e->getMessage());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -99,7 +86,7 @@ class TextExtractor
|
|||||||
$notebook = json_decode($content, true);
|
$notebook = json_decode($content, true);
|
||||||
|
|
||||||
if (json_last_error() !== JSON_ERROR_NONE) {
|
if (json_last_error() !== JSON_ERROR_NONE) {
|
||||||
throw new Exception("IPYNB文件解析失败: " . json_last_error_msg());
|
throw new Exception("IPYNB file parsing failed: " . json_last_error_msg());
|
||||||
}
|
}
|
||||||
|
|
||||||
$extractedText = '';
|
$extractedText = '';
|
||||||
@ -127,7 +114,7 @@ class TextExtractor
|
|||||||
protected function extractFromOtherFile(string $filePath): string
|
protected function extractFromOtherFile(string $filePath): string
|
||||||
{
|
{
|
||||||
if ($this->isBinaryFile($filePath)) {
|
if ($this->isBinaryFile($filePath)) {
|
||||||
throw new Exception("无法读取该类型文件的文本内容");
|
throw new Exception("Unable to read the text content of this type of file");
|
||||||
}
|
}
|
||||||
|
|
||||||
return file_get_contents($filePath);
|
return file_get_contents($filePath);
|
||||||
@ -158,25 +145,21 @@ class TextExtractor
|
|||||||
* 获取文件内容
|
* 获取文件内容
|
||||||
* @param $filePath
|
* @param $filePath
|
||||||
* @param float|int $maxSize 最大文件大小,单位字节,默认300KB
|
* @param float|int $maxSize 最大文件大小,单位字节,默认300KB
|
||||||
* @return string
|
* @return array
|
||||||
*/
|
*/
|
||||||
public static function getFileContent($filePath, float|int $maxSize = 300 * 1024)
|
public static function getFileContent($filePath, float|int $maxSize = 300 * 1024)
|
||||||
{
|
{
|
||||||
if (!file_exists($filePath) || !is_file($filePath)) {
|
if (!file_exists($filePath) || !is_file($filePath)) {
|
||||||
return "(Failed to read contents of {$filePath})";
|
return Base::retError("Failed to read contents of {$filePath}");
|
||||||
}
|
}
|
||||||
if (filesize($filePath) > $maxSize) {
|
if (filesize($filePath) > $maxSize) {
|
||||||
return "(File size exceeds " . Base::readableBytes($maxSize) . ", unable to display content)";
|
return Base::retError("File size exceeds " . Base::readableBytes($maxSize) . ", unable to display content");
|
||||||
}
|
}
|
||||||
$te = new self();
|
|
||||||
try {
|
try {
|
||||||
$isBinary = $te->isBinaryFile($filePath);
|
$extractor = new self();
|
||||||
if ($isBinary) {
|
return Base::retSuccess("success", $extractor->extractText($filePath));
|
||||||
return "(Binary file, unable to display content)";
|
|
||||||
}
|
|
||||||
return $te->extractText($filePath);
|
|
||||||
} catch (Exception $e) {
|
} catch (Exception $e) {
|
||||||
return "(Failed to read contents of {$filePath}: {$e->getMessage()})";
|
return Base::retError($e->getMessage());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -17,6 +17,7 @@ use App\Module\Doo;
|
|||||||
use App\Module\Ihttp;
|
use App\Module\Ihttp;
|
||||||
use App\Module\TextExtractor;
|
use App\Module\TextExtractor;
|
||||||
use Carbon\Carbon;
|
use Carbon\Carbon;
|
||||||
|
use Exception;
|
||||||
use League\HTMLToMarkdown\HtmlConverter;
|
use League\HTMLToMarkdown\HtmlConverter;
|
||||||
use DB;
|
use DB;
|
||||||
|
|
||||||
@ -90,10 +91,18 @@ class BotReceiveMsgTask extends AbstractTask
|
|||||||
}
|
}
|
||||||
|
|
||||||
// 提取指令
|
// 提取指令
|
||||||
|
try {
|
||||||
$command = $this->extractCommand($msg, $botUser->isAiBot(), $this->mention);
|
$command = $this->extractCommand($msg, $botUser->isAiBot(), $this->mention);
|
||||||
if (empty($command)) {
|
if (empty($command)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
} catch (Exception $e) {
|
||||||
|
WebSocketDialogMsg::sendMsg(null, $msg->dialog_id, 'template', [
|
||||||
|
'type' => 'content',
|
||||||
|
'content' => $e->getMessage() ?: "指令解析失败。",
|
||||||
|
], $botUser->userid, false, false, true); // todo 未能在任务end事件来发送任务
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// 查询会话
|
// 查询会话
|
||||||
$dialog = WebSocketDialog::find($msg->dialog_id);
|
$dialog = WebSocketDialog::find($msg->dialog_id);
|
||||||
@ -473,14 +482,20 @@ class BotReceiveMsgTask extends AbstractTask
|
|||||||
if ($replyMsg) {
|
if ($replyMsg) {
|
||||||
switch ($replyMsg->type) {
|
switch ($replyMsg->type) {
|
||||||
case 'text':
|
case 'text':
|
||||||
|
try {
|
||||||
$replyCommand = $this->extractCommand($replyMsg, true);
|
$replyCommand = $this->extractCommand($replyMsg, true);
|
||||||
if ($replyCommand) {
|
} catch (Exception) {
|
||||||
$replyCommand = Base::cutStr($replyCommand, 20000);
|
$errorContent = "引用消息解析失败。";
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 'file':
|
case 'file':
|
||||||
$msgData = Base::json2array($replyMsg->getRawOriginal('msg'));
|
$msgData = Base::json2array($replyMsg->getRawOriginal('msg'));
|
||||||
$replyCommand = TextExtractor::getFileContent(public_path($msgData['path']));
|
$fileResult = TextExtractor::getFileContent(public_path($msgData['path']));
|
||||||
|
if (Base::isError($fileResult)) {
|
||||||
|
$errorContent = $fileResult['msg'];
|
||||||
|
} else {
|
||||||
|
$replyCommand = $fileResult['data'];
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -581,6 +596,7 @@ class BotReceiveMsgTask extends AbstractTask
|
|||||||
* @param bool $isAiBot
|
* @param bool $isAiBot
|
||||||
* @param bool $mention
|
* @param bool $mention
|
||||||
* @return string
|
* @return string
|
||||||
|
* @throws Exception
|
||||||
*/
|
*/
|
||||||
private function extractCommand(WebSocketDialogMsg $msg, bool $isAiBot = false, bool $mention = false)
|
private function extractCommand(WebSocketDialogMsg $msg, bool $isAiBot = false, bool $mention = false)
|
||||||
{
|
{
|
||||||
@ -608,13 +624,12 @@ class BotReceiveMsgTask extends AbstractTask
|
|||||||
if (preg_match_all("/<span class=\"mention task\" data-id=\"(\d+)\">(.*?)<\/span>/", $original, $match)) {
|
if (preg_match_all("/<span class=\"mention task\" data-id=\"(\d+)\">(.*?)<\/span>/", $original, $match)) {
|
||||||
$taskIds = Base::newIntval($match[1]);
|
$taskIds = Base::newIntval($match[1]);
|
||||||
foreach ($taskIds as $index => $taskId) {
|
foreach ($taskIds as $index => $taskId) {
|
||||||
$taskName = addslashes($match[2][$index]) . " (ID:{$taskId})";
|
|
||||||
$taskContext = "任务状态:不存在或已删除";
|
|
||||||
$taskInfo = ProjectTask::with(['content'])->whereId($taskId)->first();
|
$taskInfo = ProjectTask::with(['content'])->whereId($taskId)->first();
|
||||||
if ($taskInfo) {
|
if (!$taskInfo) {
|
||||||
|
throw new Exception("任务不存在或已被删除");
|
||||||
|
}
|
||||||
$taskName = addslashes($taskInfo->name) . " (ID:{$taskId})";
|
$taskName = addslashes($taskInfo->name) . " (ID:{$taskId})";
|
||||||
$taskContext = implode("\n", $taskInfo->AIContext());
|
$taskContext = implode("\n", $taskInfo->AIContext());
|
||||||
}
|
|
||||||
$contents[] = "<task_content path=\"{$taskName}\">\n{$taskContext}\n</task_content>";
|
$contents[] = "<task_content path=\"{$taskName}\">\n{$taskContext}\n</task_content>";
|
||||||
$original = str_replace($match[0][$index], "'{$taskName}' (see below for task_content tag)", $original);
|
$original = str_replace($match[0][$index], "'{$taskName}' (see below for task_content tag)", $original);
|
||||||
}
|
}
|
||||||
@ -628,28 +643,31 @@ class BotReceiveMsgTask extends AbstractTask
|
|||||||
$pathContent = null;
|
$pathContent = null;
|
||||||
// 文件
|
// 文件
|
||||||
if (preg_match("/single\/file\/(.*?)$/", $urlPath, $fileMatch)) {
|
if (preg_match("/single\/file\/(.*?)$/", $urlPath, $fileMatch)) {
|
||||||
$pathTag = "file_content";
|
|
||||||
$pathName = addslashes($match[3][$index]);
|
|
||||||
$pathContent = "文件状态:不存在或已删除";
|
|
||||||
$fileInfo = FileContent::idOrCodeToContent($fileMatch[1]);
|
$fileInfo = FileContent::idOrCodeToContent($fileMatch[1]);
|
||||||
if ($fileInfo && isset($fileInfo->content['url'])) {
|
if (!$fileInfo || !isset($fileInfo->content['url'])) {
|
||||||
|
throw new Exception("文件不存在或已被删除");
|
||||||
|
}
|
||||||
$urlPath = public_path($fileInfo->content['url']);
|
$urlPath = public_path($fileInfo->content['url']);
|
||||||
if (file_exists($urlPath)) {
|
if (!file_exists($urlPath)) {
|
||||||
$pathName .= " (ID:{$fileInfo->id})";
|
throw new Exception("文件不存在或已被删除");
|
||||||
$pathContent = TextExtractor::getFileContent($urlPath);
|
|
||||||
}
|
}
|
||||||
|
$fileResult = TextExtractor::getFileContent($urlPath);
|
||||||
|
if (Base::isError($fileResult)) {
|
||||||
|
throw new Exception("文件读取失败:" . $fileResult['msg']);
|
||||||
}
|
}
|
||||||
|
$pathTag = "file_content";
|
||||||
|
$pathName = addslashes($match[3][$index]) . " (ID:{$fileInfo->id})";
|
||||||
|
$pathContent = $fileResult['data'];
|
||||||
}
|
}
|
||||||
// 报告
|
// 报告
|
||||||
elseif (preg_match("/single\/report\/detail\/(.*?)$/", $urlPath, $reportMatch)) {
|
elseif (preg_match("/single\/report\/detail\/(.*?)$/", $urlPath, $reportMatch)) {
|
||||||
$pathTag = "report_content";
|
|
||||||
$pathName = addslashes($match[3][$index]);
|
|
||||||
$pathContent = "报告状态:不存在或已删除";
|
|
||||||
$reportInfo = Report::idOrCodeToContent($reportMatch[1]);
|
$reportInfo = Report::idOrCodeToContent($reportMatch[1]);
|
||||||
if ($reportInfo) {
|
if (!$reportInfo) {
|
||||||
$pathName .= " (ID:{$reportInfo->id})";
|
throw new Exception("报告不存在或已被删除");
|
||||||
$pathContent = $reportInfo->content;
|
|
||||||
}
|
}
|
||||||
|
$pathTag = "report_content";
|
||||||
|
$pathName = addslashes($match[3][$index]) . " (ID:{$reportInfo->id})";
|
||||||
|
$pathContent = $reportInfo->content;
|
||||||
}
|
}
|
||||||
if ($pathTag) {
|
if ($pathTag) {
|
||||||
$contents[] = "<{$pathTag} path=\"{$pathName}\">\n{$pathContent}\n</{$pathTag}>";
|
$contents[] = "<{$pathTag} path=\"{$pathName}\">\n{$pathContent}\n</{$pathTag}>";
|
||||||
@ -662,7 +680,9 @@ class BotReceiveMsgTask extends AbstractTask
|
|||||||
try {
|
try {
|
||||||
$converter = new HtmlConverter();
|
$converter = new HtmlConverter();
|
||||||
$original = $converter->convert($original);
|
$original = $converter->convert($original);
|
||||||
} catch (\Exception) { }
|
} catch (\Exception) {
|
||||||
|
throw new Exception("Failed to convert HTML to Markdown");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if ($contents) {
|
if ($contents) {
|
||||||
// 添加tag内容
|
// 添加tag内容
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user