perf: 优化AI解析文件

2026-02-05 12:45:41 +00:00 · 2025-03-19 22:49:03 +08:00 · 2025-03-19 22:49:03 +08:00 · 919b652a06
commit 919b652a06
parent 15d3ec9d81
2 changed files with 63 additions and 60 deletions
--- a/app/Module/TextExtractor.php
+++ b/app/Module/TextExtractor.php
@@ -3,7 +3,6 @@
 namespace App\Module;
 use Exception;
-use Illuminate\Support\Facades\Log;
 use PhpOffice\PhpWord\IOFactory;
 use Smalot\PdfParser\Parser;
@@ -19,25 +18,17 @@ class TextExtractor
    public function extractText(string $filePath): string
    {
        if (!file_exists($filePath)) {
-            throw new Exception("文件不存在: {$filePath}");
+            throw new Exception("File does not exist: {$filePath}");
        }
        $fileExtension = strtolower(pathinfo($filePath, PATHINFO_EXTENSION));
-        try {
-            return match($fileExtension) {
-                'pdf'   => $this->extractFromPDF($filePath),
-                'docx'  => $this->extractFromDOCX($filePath),
-                'ipynb' => $this->extractFromIPYNB($filePath),
-                default => $this->extractFromOtherFile($filePath),
-            };
-        } catch (Exception $e) {
-            Log::error('文本提取失败', [
-                'file' => $filePath,
-                'error' => $e->getMessage()
-            ]);
-            throw $e;
-        }
+        return match($fileExtension) {
+            'pdf'   => $this->extractFromPDF($filePath),
+            'docx'  => $this->extractFromDOCX($filePath),
+            'ipynb' => $this->extractFromIPYNB($filePath),
+            default => $this->extractFromOtherFile($filePath),
+        };
    }
    /**
@@ -55,11 +46,7 @@ class TextExtractor
            return $pdf->getText();
        } catch (Exception $e) {
-            Log::error('PDF解析失败', [
-                'file' => $filePath,
-                'error' => $e->getMessage()
-            ]);
-            throw new Exception("PDF文本提取失败: " . $e->getMessage());
+            throw new Exception("PDF text extraction failed: " . $e->getMessage());
        }
    }
@@ -99,7 +86,7 @@ class TextExtractor
        $notebook = json_decode($content, true);
        if (json_last_error() !== JSON_ERROR_NONE) {
-            throw new Exception("IPYNB文件解析失败: " . json_last_error_msg());
+            throw new Exception("IPYNB file parsing failed: " . json_last_error_msg());
        }
        $extractedText = '';
@@ -127,7 +114,7 @@ class TextExtractor
    protected function extractFromOtherFile(string $filePath): string
    {
        if ($this->isBinaryFile($filePath)) {
-            throw new Exception("无法读取该类型文件的文本内容");
+            throw new Exception("Unable to read the text content of this type of file");
        }
        return file_get_contents($filePath);
@@ -158,25 +145,21 @@ class TextExtractor
     * 获取文件内容
     * @param $filePath
     * @param float|int $maxSize 最大文件大小，单位字节，默认300KB
-     * @return string
+     * @return array
     */
    public static function getFileContent($filePath, float|int $maxSize = 300 * 1024)
    {
        if (!file_exists($filePath) || !is_file($filePath)) {
-            return "(Failed to read contents of {$filePath})";
+            return Base::retError("Failed to read contents of {$filePath}");
        }
        if (filesize($filePath) > $maxSize) {
-            return "(File size exceeds " . Base::readableBytes($maxSize) . ", unable to display content)";
+            return Base::retError("File size exceeds " . Base::readableBytes($maxSize) . ", unable to display content");
        }
-        $te = new self();
        try {
-            $isBinary = $te->isBinaryFile($filePath);
-            if ($isBinary) {
-                return "(Binary file, unable to display content)";
-            }
-            return $te->extractText($filePath);
+            $extractor = new self();
+            return Base::retSuccess("success", $extractor->extractText($filePath));
        } catch (Exception $e) {
-            return "(Failed to read contents of {$filePath}: {$e->getMessage()})";
+            return Base::retError($e->getMessage());
        }
    }
 }
--- a/app/Tasks/BotReceiveMsgTask.php
+++ b/app/Tasks/BotReceiveMsgTask.php
@@ -17,6 +17,7 @@ use App\Module\Doo;
 use App\Module\Ihttp;
 use App\Module\TextExtractor;
 use Carbon\Carbon;
+use Exception;
 use League\HTMLToMarkdown\HtmlConverter;
 use DB;
@@ -90,10 +91,18 @@ class BotReceiveMsgTask extends AbstractTask
        }
        // 提取指令
-        $command = $this->extractCommand($msg, $botUser->isAiBot(), $this->mention);
-        if (empty($command)) {
+        try {
+            $command = $this->extractCommand($msg, $botUser->isAiBot(), $this->mention);
+            if (empty($command)) {
+                return;
+            }
+        } catch (Exception $e) {
+            WebSocketDialogMsg::sendMsg(null, $msg->dialog_id, 'template', [
+                'type' => 'content',
+                'content' => $e->getMessage() ?: "指令解析失败。",
+            ], $botUser->userid, false, false, true);    // todo 未能在任务end事件来发送任务
            return;
        }
        // 查询会话
        $dialog = WebSocketDialog::find($msg->dialog_id);
@@ -473,14 +482,20 @@ class BotReceiveMsgTask extends AbstractTask
                if ($replyMsg) {
                    switch ($replyMsg->type) {
                        case 'text':
-                            $replyCommand = $this->extractCommand($replyMsg, true);
-                            if ($replyCommand) {
-                                $replyCommand = Base::cutStr($replyCommand, 20000);
+                            try {
+                                $replyCommand = $this->extractCommand($replyMsg, true);
+                            } catch (Exception) {
+                                $errorContent = "引用消息解析失败。";
                            }
                            break;
                        case 'file':
                            $msgData = Base::json2array($replyMsg->getRawOriginal('msg'));
-                            $replyCommand = TextExtractor::getFileContent(public_path($msgData['path']));
+                            $fileResult = TextExtractor::getFileContent(public_path($msgData['path']));
+                            if (Base::isError($fileResult)) {
+                                $errorContent = $fileResult['msg'];
+                            } else {
+                                $replyCommand = $fileResult['data'];
+                            }
                            break;
                    }
                }
@@ -581,6 +596,7 @@ class BotReceiveMsgTask extends AbstractTask
     * @param bool $isAiBot
     * @param bool $mention
     * @return string
+     * @throws Exception
     */
    private function extractCommand(WebSocketDialogMsg $msg, bool $isAiBot = false, bool $mention = false)
    {
@@ -608,13 +624,12 @@ class BotReceiveMsgTask extends AbstractTask
        if (preg_match_all("/<span class=\"mention task\" data-id=\"(\d+)\">(.*?)<\/span>/", $original, $match)) {
            $taskIds = Base::newIntval($match[1]);
            foreach ($taskIds as $index => $taskId) {
-                $taskName = addslashes($match[2][$index]) . " (ID:{$taskId})";
-                $taskContext = "任务状态：不存在或已删除";
                $taskInfo = ProjectTask::with(['content'])->whereId($taskId)->first();
-                if ($taskInfo) {
-                    $taskName = addslashes($taskInfo->name) . " (ID:{$taskId})";
-                    $taskContext = implode("\n", $taskInfo->AIContext());
+                if (!$taskInfo) {
+                    throw new Exception("任务不存在或已被删除");
                }
+                $taskName = addslashes($taskInfo->name) . " (ID:{$taskId})";
+                $taskContext = implode("\n", $taskInfo->AIContext());
                $contents[] = "<task_content path=\"{$taskName}\">\n{$taskContext}\n</task_content>";
                $original = str_replace($match[0][$index], "'{$taskName}' (see below for task_content tag)", $original);
            }
@@ -628,28 +643,31 @@ class BotReceiveMsgTask extends AbstractTask
                $pathContent = null;
                // 文件
                if (preg_match("/single\/file\/(.*?)$/", $urlPath, $fileMatch)) {
-                    $pathTag = "file_content";
-                    $pathName = addslashes($match[3][$index]);
-                    $pathContent = "文件状态：不存在或已删除";
                    $fileInfo = FileContent::idOrCodeToContent($fileMatch[1]);
-                    if ($fileInfo && isset($fileInfo->content['url'])) {
-                        $urlPath = public_path($fileInfo->content['url']);
-                        if (file_exists($urlPath)) {
-                            $pathName .= " (ID:{$fileInfo->id})";
-                            $pathContent = TextExtractor::getFileContent($urlPath);
-                        }
+                    if (!$fileInfo || !isset($fileInfo->content['url'])) {
+                        throw new Exception("文件不存在或已被删除");
+                    }
+                    $urlPath = public_path($fileInfo->content['url']);
+                    if (!file_exists($urlPath)) {
+                        throw new Exception("文件不存在或已被删除");
                    }
+                    $fileResult = TextExtractor::getFileContent($urlPath);
+                    if (Base::isError($fileResult)) {
+                        throw new Exception("文件读取失败：" . $fileResult['msg']);
+                    }
+                    $pathTag = "file_content";
+                    $pathName = addslashes($match[3][$index]) . " (ID:{$fileInfo->id})";
+                    $pathContent = $fileResult['data'];
                }
                // 报告
                elseif (preg_match("/single\/report\/detail\/(.*?)$/", $urlPath, $reportMatch)) {
-                    $pathTag = "report_content";
-                    $pathName = addslashes($match[3][$index]);
-                    $pathContent = "报告状态：不存在或已删除";
                    $reportInfo = Report::idOrCodeToContent($reportMatch[1]);
-                    if ($reportInfo) {
-                        $pathName .= " (ID:{$reportInfo->id})";
-                        $pathContent = $reportInfo->content;
+                    if (!$reportInfo) {
+                        throw new Exception("报告不存在或已被删除");
                    }
+                    $pathTag = "report_content";
+                    $pathName = addslashes($match[3][$index]) . " (ID:{$reportInfo->id})";
+                    $pathContent = $reportInfo->content;
                }
                if ($pathTag) {
                    $contents[] = "<{$pathTag} path=\"{$pathName}\">\n{$pathContent}\n</{$pathTag}>";
@@ -662,7 +680,9 @@ class BotReceiveMsgTask extends AbstractTask
            try {
                $converter = new HtmlConverter();
                $original = $converter->convert($original);
-            } catch (\Exception) { }
+            } catch (\Exception) {
+                throw new Exception("Failed to convert HTML to Markdown");
+            }
        }
        }
        if ($contents) {
            // 添加tag内容