diff --git a/app/Module/AiBot/TextExtractor.php b/app/Module/AiBot/TextExtractor.php new file mode 100644 index 000000000..e6039a607 --- /dev/null +++ b/app/Module/AiBot/TextExtractor.php @@ -0,0 +1,183 @@ + $this->extractFromPDF($filePath), + 'docx' => $this->extractFromDOCX($filePath), + 'ipynb' => $this->extractFromIPYNB($filePath), + default => $this->extractFromOtherFile($filePath), + }; + } catch (Exception $e) { + Log::error('文本提取失败', [ + 'file' => $filePath, + 'error' => $e->getMessage() + ]); + throw $e; + } + } + + /** + * Extract the text of a PDF file. + * + * Parses the file with Parser::parseFile() and returns the result of getText(). + * Any Exception is logged and then rethrown as a new Exception that carries a + * prefixed copy of the original message. + * NOTE(review): the original exception is not chained as previous, so its type + * and stack trace are lost to callers. + * NOTE(review): Parser is presumably Smalot\PdfParser\Parser (smalot/pdfparser is + * added to composer.json in this change); the import is not visible here. + * + * @param string $filePath Path of the PDF file to parse. + * @return string Text returned by getText(). + * @throws Exception When parsing or text extraction fails. + */ + protected function extractFromPDF(string $filePath): string + { + try { + $parser = new Parser(); + $pdf = $parser->parseFile($filePath); + + return $pdf->getText(); + } catch (Exception $e) { + Log::error('PDF解析失败', [ + 'file' => $filePath, + 'error' => $e->getMessage() + ]); + throw new Exception("PDF文本提取失败: " . $e->getMessage()); + } + } + + /** + * Extract plain text from a DOCX file. + * + * Loads the document with IOFactory::load() and walks the top-level elements of + * every section. Only elements exposing a getText() method contribute; each + * contribution is followed by a newline. Elements without getText() are skipped, + * so their content is not part of the result. + * NOTE(review): IOFactory is presumably PhpOffice\PhpWord\IOFactory + * (phpoffice/phpword is added in this change); confirm against the imports. + * NOTE(review): unlike extractFromPDF(), nothing is caught or logged here, so + * loader failures propagate to the caller unchanged. + * + * @param string $filePath Path of the DOCX file. + * @return string Collected element text, or an empty string when no element matched. + * @throws Exception Declared by the original docblock; this method throws nothing explicitly. + */ + protected function extractFromDOCX(string $filePath): string + { + $phpWord = IOFactory::load($filePath); + $text = ''; + + foreach ($phpWord->getSections() as $section) { + foreach ($section->getElements() as $element) { + if (method_exists($element, 'getText')) { + $text .= $element->getText() . "\n"; + } + } + } + + return $text; + } + + /** + * Extract text from a Jupyter Notebook (.ipynb) file. + * + * Reads the file, decodes it as JSON into an associative array and appends the + * source of every cell whose cell_type is markdown or code. An array source is + * joined with newlines, and one newline is appended after each cell. Cells of + * other types or without a source key are skipped; a missing cells key yields + * an empty string. + * NOTE(review): the result of file_get_contents() is not checked before decoding. + * NOTE(review): if the entries of an array source already end with a newline, + * joining them with another newline doubles the line breaks; confirm against + * the notebook format. + * + * @param string $filePath Path of the notebook file. + * @return string Concatenated cell sources. + * @throws Exception When the content is not valid JSON. + */ + protected function extractFromIPYNB(string $filePath): string + { + $content = file_get_contents($filePath); + $notebook = json_decode($content, true); + + if (json_last_error() !== JSON_ERROR_NONE) { + throw new Exception("IPYNB文件解析失败: " . json_last_error_msg()); + } + + $extractedText = ''; + + foreach ($notebook['cells'] ?? [] as $cell) { + if (in_array($cell['cell_type'] ?? 
'', ['markdown', 'code']) && isset($cell['source'])) { + $source = $cell['source']; + $extractedText .= is_array($source) + ? implode("\n", $source) + : $source; + $extractedText .= "\n"; + } + } + + return $extractedText; + } + + /** + * Return the raw content of a file that has no dedicated extractor. + * + * Rejects files that isBinaryFile() classifies as binary; otherwise returns the + * result of file_get_contents() as is. + * NOTE(review): file_get_contents() returns false on failure, which does not match + * the declared string return type; confirm how read failures should surface. + * + * @param string $filePath Path of the file to read. + * @return string File content. + * @throws Exception When isBinaryFile() reports the file as binary. + */ + protected function extractFromOtherFile(string $filePath): string + { + if ($this->isBinaryFile($filePath)) { + throw new Exception("无法读取该类型文件的文本内容"); + } + + return file_get_contents($filePath); + } + + /** + * Decide from the MIME string whether a file is treated as binary. + * + * Queries fileinfo with FILEINFO_MIME and returns true unless the reported string + * contains one of the markers text/, application/json or application/xml. + * NOTE(review): the results of finfo_open() and finfo_file() are not checked; the + * PHP manual documents false as their failure value. + * + * @param string $filePath Path of the file to inspect. + * @return bool True when none of the three markers is present. + */ + protected function isBinaryFile(string $filePath): bool + { + $finfo = finfo_open(FILEINFO_MIME); + $mimeType = finfo_file($finfo, $filePath); + finfo_close($finfo); + + return !str_contains($mimeType, 'text/') + && !str_contains($mimeType, 'application/json') + && !str_contains($mimeType, 'application/xml'); + } + + /** ********************************************************************* */ + /** ********************************************************************* */ + /** ********************************************************************* */ + + /** + * Placeholder that is not implemented yet. + * + * The body holds only TODO comments, so the argument is unused and the call + * evaluates to null. + * + * @param mixed $filePath Currently ignored. + * @return null + */ + public static function parsePaths($filePath) + { + // todo + // (see below for file content) + // \n${content}\n + + // (see below for site content) + // \n${result}\n + } + + /** + * Return the text of a file for display, or a parenthesised placeholder message. + * + * A path that does not exist or is not a regular file, a file reported as binary + * by isBinaryFile(), and any Exception raised during extraction are each mapped + * to a placeholder string instead of being thrown. + * NOTE(review): isBinaryFile() runs before extractText(), so every file whose MIME + * string lacks text/, application/json and application/xml gets the binary + * placeholder and never reaches extractText(). PDF and DOCX files presumably + * fall into that group, which would make their extractors unreachable from + * here; confirm. + * + * @param string $filePath Path of the file to read (the signature is untyped). + * @return string Extracted text or a placeholder message. + */ + public static function getFileContent($filePath) + { + if (!file_exists($filePath) || !is_file($filePath)) { + return "(Failed to read contents of {$filePath})"; + } + $te = new self(); + try { + $isBinary = $te->isBinaryFile($filePath); + if ($isBinary) { + return "(Binary file, unable to display content)"; + } + return $te->extractText($filePath); + } catch (Exception $e) { + return "(Failed to read contents of {$filePath}: {$e->getMessage()})"; + } + } +} diff --git a/composer.json b/composer.json index 280cc7dba..b49914321 100644 --- a/composer.json +++ b/composer.json @@ -11,6 +11,7 @@ "php": "^8.0", "ext-curl": "*", "ext-dom": "*", + 
"ext-fileinfo": "*", "ext-gd": "*", "ext-imagick": "*", "ext-json": "*", @@ -35,7 +36,9 @@ "mews/captcha": "^3.2.6", "orangehill/iseed": "^3.0.1", "overtrue/pinyin": "^4.0", + "phpoffice/phpword": "^1.3", "predis/predis": "^1.1.7", + "smalot/pdfparser": "^2.11", "symfony/mailer": "^6.0" }, "require-dev": { diff --git a/composer.lock b/composer.lock index d2a291e3b..feb4b3395 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "4bfa166220b4b329f40e62394c6336a0", + "content-hash": "ef4844086b8b7fde7050b10a9d4e436a", "packages": [ { "name": "asm89/stack-cors", @@ -3679,6 +3679,58 @@ ], "time": "2023-04-27T10:17:12+00:00" }, + { + "name": "phpoffice/math", + "version": "0.2.0", + "source": { + "type": "git", + "url": "https://github.com/PHPOffice/Math.git", + "reference": "fc2eb6d1a61b058d5dac77197059db30ee3c8329" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/PHPOffice/Math/zipball/fc2eb6d1a61b058d5dac77197059db30ee3c8329", + "reference": "fc2eb6d1a61b058d5dac77197059db30ee3c8329", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "ext-xml": "*", + "php": "^7.1|^8.0" + }, + "require-dev": { + "phpstan/phpstan": "^0.12.88 || ^1.0.0", + "phpunit/phpunit": "^7.0 || ^9.0" + }, + "type": "library", + "autoload": { + "psr-4": { + "PhpOffice\\Math\\": "src/Math/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Progi1984", + "homepage": "https://lefevre.dev" + } + ], + "description": "Math - Manipulate Math Formula", + "homepage": "https://phpoffice.github.io/Math/", + "keywords": [ + "MathML", + "officemathml", + "php" + ], + "support": { + "issues": "https://github.com/PHPOffice/Math/issues", + "source": "https://github.com/PHPOffice/Math/tree/0.2.0" + }, + "time": "2024-08-12T07:30:45+00:00" + }, { 
"name": "phpoffice/phpspreadsheet", "version": "1.29.5", @@ -3784,6 +3836,115 @@ }, "time": "2024-11-22T05:57:44+00:00" }, + { + "name": "phpoffice/phpword", + "version": "1.3.0", + "source": { + "type": "git", + "url": "https://github.com/PHPOffice/PHPWord.git", + "reference": "8392134ce4b5dba65130ba956231a1602b848b7f" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/PHPOffice/PHPWord/zipball/8392134ce4b5dba65130ba956231a1602b848b7f", + "reference": "8392134ce4b5dba65130ba956231a1602b848b7f", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "ext-json": "*", + "ext-xml": "*", + "php": "^7.1|^8.0", + "phpoffice/math": "^0.2" + }, + "require-dev": { + "dompdf/dompdf": "^2.0", + "ext-gd": "*", + "ext-libxml": "*", + "ext-zip": "*", + "friendsofphp/php-cs-fixer": "^3.3", + "mpdf/mpdf": "^8.1", + "phpmd/phpmd": "^2.13", + "phpstan/phpstan-phpunit": "@stable", + "phpunit/phpunit": ">=7.0", + "symfony/process": "^4.4 || ^5.0", + "tecnickcom/tcpdf": "^6.5" + }, + "suggest": { + "dompdf/dompdf": "Allows writing PDF", + "ext-gd2": "Allows adding images", + "ext-xmlwriter": "Allows writing OOXML and ODF", + "ext-xsl": "Allows applying XSL style sheet to headers, to main document part, and to footers of an OOXML template", + "ext-zip": "Allows writing OOXML and ODF" + }, + "type": "library", + "autoload": { + "psr-4": { + "PhpOffice\\PhpWord\\": "src/PhpWord" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "LGPL-3.0" + ], + "authors": [ + { + "name": "Mark Baker" + }, + { + "name": "Gabriel Bull", + "email": "me@gabrielbull.com", + "homepage": "http://gabrielbull.com/" + }, + { + "name": "Franck Lefevre", + "homepage": "https://rootslabs.net/blog/" + }, + { + "name": "Ivan Lanin", + "homepage": "http://ivan.lanin.org" + }, + { + "name": "Roman Syroeshko", + "homepage": "http://ru.linkedin.com/pub/roman-syroeshko/34/a53/994/" + }, + { + "name": "Antoine de Troostembergh" + } + ], + "description": "PHPWord - A 
pure PHP library for reading and writing word processing documents (OOXML, ODF, RTF, HTML, PDF)", + "homepage": "https://phpoffice.github.io/PHPWord/", + "keywords": [ + "ISO IEC 29500", + "OOXML", + "Office Open XML", + "OpenDocument", + "OpenXML", + "PhpOffice", + "PhpWord", + "Rich Text Format", + "WordprocessingML", + "doc", + "docx", + "html", + "odf", + "odt", + "office", + "pdf", + "php", + "reader", + "rtf", + "template", + "template processor", + "word", + "writer" + ], + "support": { + "issues": "https://github.com/PHPOffice/PHPWord/issues", + "source": "https://github.com/PHPOffice/PHPWord/tree/1.3.0" + }, + "time": "2024-08-30T18:03:42+00:00" + }, { "name": "phpoption/phpoption", "version": "1.9.3", @@ -4637,6 +4798,57 @@ ], "time": "2024-04-27T21:32:50+00:00" }, + { + "name": "smalot/pdfparser", + "version": "v2.11.0", + "source": { + "type": "git", + "url": "https://github.com/smalot/pdfparser.git", + "reference": "ac8e6678b0940e4b2ccd5caadd3fb18e68093be6" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/smalot/pdfparser/zipball/ac8e6678b0940e4b2ccd5caadd3fb18e68093be6", + "reference": "ac8e6678b0940e4b2ccd5caadd3fb18e68093be6", + "shasum": "" + }, + "require": { + "ext-iconv": "*", + "ext-zlib": "*", + "php": ">=7.1", + "symfony/polyfill-mbstring": "^1.18" + }, + "type": "library", + "autoload": { + "psr-0": { + "Smalot\\PdfParser\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "LGPL-3.0" + ], + "authors": [ + { + "name": "Sebastien MALOT", + "email": "sebastien@malot.fr" + } + ], + "description": "Pdf parser library. 
Can read and extract information from pdf file.", + "homepage": "https://www.pdfparser.org", + "keywords": [ + "extract", + "parse", + "parser", + "pdf", + "text" + ], + "support": { + "issues": "https://github.com/smalot/pdfparser/issues", + "source": "https://github.com/smalot/pdfparser/tree/v2.11.0" + }, + "time": "2024-08-16T06:48:03+00:00" + }, { "name": "swiftmailer/swiftmailer", "version": "v6.3.0", @@ -10705,6 +10917,8 @@ "platform": { "php": "^8.0", "ext-curl": "*", + "ext-dom": "*", + "ext-fileinfo": "*", "ext-gd": "*", "ext-imagick": "*", "ext-json": "*",