From 4818409329b7779144d5f0f94ce895f16b588d05 Mon Sep 17 00:00:00 2001 From: kuaifan Date: Wed, 19 Mar 2025 23:33:17 +0800 Subject: [PATCH] =?UTF-8?q?perf:=20=E4=BC=98=E5=8C=96AI=E8=A7=A3=E6=9E=90?= =?UTF-8?q?=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/Module/TextExtractor.php | 197 ++++++++++++++++++++------------ app/Tasks/BotReceiveMsgTask.php | 4 +- 2 files changed, 129 insertions(+), 72 deletions(-) diff --git a/app/Module/TextExtractor.php b/app/Module/TextExtractor.php index 31f225ec3..0b014a82c 100644 --- a/app/Module/TextExtractor.php +++ b/app/Module/TextExtractor.php @@ -3,8 +3,10 @@ namespace App\Module; use Exception; -use PhpOffice\PhpWord\IOFactory; -use Smalot\PdfParser\Parser; +use PhpOffice\PhpWord\IOFactory as WordIOFactory; +use PhpOffice\PhpSpreadsheet\IOFactory as SpreadsheetIOFactory; +use PhpOffice\PhpPresentation\IOFactory as PresentationIOFactory; + class TextExtractor { @@ -15,7 +17,7 @@ class TextExtractor * @return string * @throws Exception */ - public function extractText(string $filePath): string + public function extractContent(string $filePath): string { if (!file_exists($filePath)) { throw new Exception("File does not exist: {$filePath}"); @@ -23,49 +25,46 @@ class TextExtractor $fileExtension = strtolower(pathinfo($filePath, PATHINFO_EXTENSION)); - return match($fileExtension) { - 'pdf' => $this->extractFromPDF($filePath), - 'docx' => $this->extractFromDOCX($filePath), - 'ipynb' => $this->extractFromIPYNB($filePath), - default => $this->extractFromOtherFile($filePath), + return match ($fileExtension) { + // Word documents + 'docx' => $this->parseWordDocument($filePath), + + // Spreadsheet files + 'xlsx', 'xls', 'csv' => $this->parseSpreadsheet($filePath), + + // Presentation files + 'ppt', 'pptx' => $this->parsePresentation($filePath), + + // PDF files (requires additional library) + 'pdf' => $this->parsePdf($filePath), + + // RTF files + 'rtf' => $this->parseRtf($filePath), + + // Default case + default => $this->parseOther($filePath), }; } /** - * 从PDF文件中提取文本 - * - * @param string $filePath - * @return string - * @throws Exception + * Parse Word documents (.doc, .docx) */ - protected function extractFromPDF(string $filePath): string + private function parseWordDocument(string $filePath): string { - try { - $parser = new Parser(); - $pdf = $parser->parseFile($filePath); - - return $pdf->getText(); - } catch (Exception $e) { - throw new Exception("PDF text extraction failed: " . $e->getMessage()); - } - } - - /** - * 从DOCX文件中提取文本 - * - * @param string $filePath - * @return string - * @throws Exception - */ - protected function extractFromDOCX(string $filePath): string - { - $phpWord = IOFactory::load($filePath); + $phpWord = WordIOFactory::load($filePath); $text = ''; + // Extract text from each section foreach ($phpWord->getSections() as $section) { foreach ($section->getElements() as $element) { if (method_exists($element, 'getText')) { $text .= $element->getText() . "\n"; + } elseif (method_exists($element, 'getElements')) { + foreach ($element->getElements() as $childElement) { + if (method_exists($childElement, 'getText')) { + $text .= $childElement->getText() . "\n"; + } + } } } } @@ -74,67 +73,125 @@ class TextExtractor } /** - * 从Jupyter Notebook文件中提取文本 - * - * @param string $filePath - * @return string - * @throws Exception + * Parse spreadsheet files (.xlsx, .xls, .csv) */ - protected function extractFromIPYNB(string $filePath): string + private function parseSpreadsheet(string $filePath): string { - $content = file_get_contents($filePath); - $notebook = json_decode($content, true); + $spreadsheet = SpreadsheetIOFactory::load($filePath); + $text = ''; - if (json_last_error() !== JSON_ERROR_NONE) { - throw new Exception("IPYNB file parsing failed: " . json_last_error_msg()); - } + // Extract text from all worksheets + foreach ($spreadsheet->getWorksheetIterator() as $worksheet) { + $text .= 'Worksheet: ' . $worksheet->getTitle() . "\n"; - $extractedText = ''; + foreach ($worksheet->getRowIterator() as $row) { + $cellIterator = $row->getCellIterator(); + $cellIterator->setIterateOnlyExistingCells(false); + $rowText = ''; - foreach ($notebook['cells'] ?? [] as $cell) { - if (in_array($cell['cell_type'] ?? '', ['markdown', 'code']) && isset($cell['source'])) { - $source = $cell['source']; - $extractedText .= is_array($source) - ? implode("\n", $source) - : $source; - $extractedText .= "\n"; + foreach ($cellIterator as $cell) { + $value = $cell->getValue(); + if (!empty($value)) { + $rowText .= $value . "\t"; + } + } + + if (!empty(trim($rowText))) { + $text .= trim($rowText) . "\n"; + } } + + $text .= "\n"; } - return $extractedText; + return $text; } /** - * 从其他类型文件中提取文本 - * - * @param string $filePath - * @return string + * Parse presentation files (.ppt, .pptx) * @throws Exception */ - protected function extractFromOtherFile(string $filePath): string + private function parsePresentation(string $filePath): string { - if ($this->isBinaryFile($filePath)) { - throw new Exception("Unable to read the text content of this type of file"); + $presentation = PresentationIOFactory::load($filePath); + $text = ''; + + // Extract text from all slides + foreach ($presentation->getAllSlides() as $slide) { + foreach ($slide->getShapeCollection() as $shape) { + if ($shape instanceof \PhpOffice\PhpPresentation\Shape\RichText) { + foreach ($shape->getParagraphs() as $paragraph) { + foreach ($paragraph->getRichTextElements() as $element) { + $text .= $element->getText(); + } + $text .= "\n"; + } + } + } + $text .= "\n"; } - return file_get_contents($filePath); + return $text; } /** - * 检查文件是否为二进制文件 - * - * @param string $filePath - * @return bool + * Parse PDF files (requires additional library like Smalot\PdfParser) + * @throws Exception */ - protected function isBinaryFile(string $filePath): bool + private function parsePdf(string $filePath): string + { + // You'll need to install the Smalot PDF Parser: composer require smalot/pdfparser + if (!class_exists('\Smalot\PdfParser\Parser')) { + throw new \Exception("PDF Parser not available. Install with: composer require smalot/pdfparser"); + } + + $parser = new \Smalot\PdfParser\Parser(); + $pdf = $parser->parseFile($filePath); + return $pdf->getText(); + } + + /** + * Parse RTF files + */ + private function parseRtf(string $filePath): string + { + // Simple RTF to text conversion + $content = file_get_contents($filePath); + + // Remove RTF control words and groups + $content = preg_replace('/\\\\([a-z]{1,32})(-?[0-9]{1,10})?[ ]?/i', '', $content); + $content = preg_replace('/\\\\([^a-z]|[a-z]{33,})/i', '', $content); + $content = preg_replace('/\{\*?\\\\[^{}]*\}/', '', $content); + $content = preg_replace('/\{[\r\n]*\}/', '', $content); + + // Convert special characters + $content = preg_replace('/\\\\\'([0-9a-f]{2})/i', '', $content); + + // Remove remaining curly braces + $content = str_replace(['{', '}'], '', $content); + + return $content ?: ''; + } + + /** + * Parse Other(text) files + * @throws Exception + */ + private function parseOther(string $filePath): string { $finfo = finfo_open(FILEINFO_MIME); $mimeType = finfo_file($finfo, $filePath); finfo_close($finfo); - return !str_contains($mimeType, 'text/') + $isBinary = !str_contains($mimeType, 'text/') && !str_contains($mimeType, 'application/json') && !str_contains($mimeType, 'application/xml'); + + if ($isBinary) { + throw new Exception("Unable to read the text content of this type of file"); + } + + return file_get_contents($filePath); } /** ********************************************************************* */ @@ -147,7 +204,7 @@ class TextExtractor * @param float|int $maxSize 最大文件大小,单位字节,默认300KB * @return array */ - public static function getFileContent($filePath, float|int $maxSize = 300 * 1024) + public static function extractFile($filePath, float|int $maxSize = 300 * 1024): array { if (!file_exists($filePath) || !is_file($filePath)) { return Base::retError("Failed to read contents of {$filePath}"); @@ -157,7 +214,7 @@ class TextExtractor } try { $extractor = new self(); - return Base::retSuccess("success", $extractor->extractText($filePath)); + return Base::retSuccess("success", $extractor->extractContent($filePath)); } catch (Exception $e) { return Base::retError($e->getMessage()); } diff --git a/app/Tasks/BotReceiveMsgTask.php b/app/Tasks/BotReceiveMsgTask.php index 8b78ba68d..4edc38dd6 100644 --- a/app/Tasks/BotReceiveMsgTask.php +++ b/app/Tasks/BotReceiveMsgTask.php @@ -490,7 +490,7 @@ class BotReceiveMsgTask extends AbstractTask break; case 'file': $msgData = Base::json2array($replyMsg->getRawOriginal('msg')); - $fileResult = TextExtractor::getFileContent(public_path($msgData['path'])); + $fileResult = TextExtractor::extractFile(public_path($msgData['path'])); if (Base::isError($fileResult)) { $errorContent = $fileResult['msg']; } else { @@ -651,7 +651,7 @@ class BotReceiveMsgTask extends AbstractTask if (!file_exists($urlPath)) { throw new Exception("文件不存在或已被删除"); } - $fileResult = TextExtractor::getFileContent($urlPath); + $fileResult = TextExtractor::extractFile($urlPath); if (Base::isError($fileResult)) { throw new Exception("文件读取失败:" . $fileResult['msg']); }