diff --git a/app/Module/AiBot/TextExtractor.php b/app/Module/AiBot/TextExtractor.php
new file mode 100644
index 000000000..e6039a607
--- /dev/null
+++ b/app/Module/AiBot/TextExtractor.php
@@ -0,0 +1,183 @@
+ $this->extractFromPDF($filePath),
+ 'docx' => $this->extractFromDOCX($filePath),
+ 'ipynb' => $this->extractFromIPYNB($filePath),
+ default => $this->extractFromOtherFile($filePath),
+ };
+ } catch (Exception $e) {
+ Log::error('文本提取失败', [
+ 'file' => $filePath,
+ 'error' => $e->getMessage()
+ ]);
+ throw $e;
+ }
+ }
+
+ /**
+  * Extract the text content of a PDF file.
+  *
+  * @param string $filePath Path of the PDF file to parse.
+  * @return string Text reported by the PDF parser for the document.
+  * @throws Exception When parsing fails; the parser's own exception is
+  *                   attached as the previous exception.
+  */
+ protected function extractFromPDF(string $filePath): string
+ {
+     try {
+         $parser = new Parser();
+         $pdf = $parser->parseFile($filePath);
+
+         return $pdf->getText();
+     } catch (Exception $e) {
+         Log::error('PDF解析失败', [
+             'file' => $filePath,
+             'error' => $e->getMessage()
+         ]);
+         // Chain the original exception so its type and stack trace stay
+         // available to whoever handles the wrapped one.
+         throw new Exception("PDF文本提取失败: " . $e->getMessage(), 0, $e);
+     }
+ }
+
+ /**
+  * Extract the text content of a DOCX file.
+  *
+  * Walks every section of the loaded document and appends the text of each
+  * element, one line per text-bearing element. Elements that expose no text
+  * of their own (tables, nested containers) are descended into so their
+  * content is not silently dropped.
+  *
+  * @param string $filePath Path of the DOCX file to load.
+  * @return string Concatenated text, each text element followed by "\n".
+  * @throws Exception
+  */
+ protected function extractFromDOCX(string $filePath): string
+ {
+     $phpWord = IOFactory::load($filePath);
+     $text = '';
+
+     foreach ($phpWord->getSections() as $section) {
+         foreach ($section->getElements() as $element) {
+             $text .= $this->collectDocxElementText($element);
+         }
+     }
+
+     return $text;
+ }
+
+ /**
+  * Recursively collect the text held by a single document element.
+  *
+  * Uses duck typing (method_exists) rather than concrete element classes so
+  * it works for any element exposing getText(), getRows()/getCells() or
+  * getElements().
+  *
+  * @param object $element Document element to inspect.
+  * @return string Text of the element, or an empty string when it holds none.
+  */
+ private function collectDocxElementText(object $element): string
+ {
+     if (method_exists($element, 'getText')) {
+         $value = $element->getText();
+         // getText() may itself hand back a text object (PHPWord declares
+         // Title::getText() as string|TextRun); unwrap it instead of
+         // letting the string concatenation raise an Error.
+         while (is_object($value) && method_exists($value, 'getText')) {
+             $value = $value->getText();
+         }
+         if (is_object($value)) {
+             $value = method_exists($value, '__toString') ? (string) $value : '';
+         }
+
+         return (is_scalar($value) ? (string) $value : '') . "\n";
+     }
+
+     $collected = '';
+
+     // Table-like element: rows -> cells, each cell being a container.
+     if (method_exists($element, 'getRows')) {
+         foreach ($element->getRows() as $row) {
+             if (!is_object($row) || !method_exists($row, 'getCells')) {
+                 continue;
+             }
+             foreach ($row->getCells() as $cell) {
+                 if (is_object($cell)) {
+                     $collected .= $this->collectDocxElementText($cell);
+                 }
+             }
+         }
+
+         return $collected;
+     }
+
+     // Generic container without its own getText(): descend into children.
+     if (method_exists($element, 'getElements')) {
+         foreach ($element->getElements() as $child) {
+             if (is_object($child)) {
+                 $collected .= $this->collectDocxElementText($child);
+             }
+         }
+     }
+
+     return $collected;
+ }
+
+ /**
+  * Extract the text of a Jupyter Notebook (.ipynb) file.
+  *
+  * Only "markdown" and "code" cells contribute; the source of each such
+  * cell is appended followed by a single "\n".
+  *
+  * @param string $filePath Path of the notebook file.
+  * @return string Concatenated cell sources.
+  * @throws Exception When the file cannot be read or is not valid JSON.
+  */
+ protected function extractFromIPYNB(string $filePath): string
+ {
+     $content = file_get_contents($filePath);
+     if ($content === false) {
+         throw new Exception("IPYNB文件读取失败: " . $filePath);
+     }
+
+     $notebook = json_decode($content, true);
+
+     if (json_last_error() !== JSON_ERROR_NONE) {
+         throw new Exception("IPYNB文件解析失败: " . json_last_error_msg());
+     }
+
+     // Valid JSON is not necessarily a notebook object; fall back to "no cells".
+     $cells = is_array($notebook) ? ($notebook['cells'] ?? []) : [];
+     if (!is_array($cells)) {
+         $cells = [];
+     }
+
+     $extractedText = '';
+
+     foreach ($cells as $cell) {
+         if (!is_array($cell) || !isset($cell['source'])) {
+             continue;
+         }
+         if (!in_array($cell['cell_type'] ?? '', ['markdown', 'code'], true)) {
+             continue;
+         }
+
+         $source = $cell['source'];
+         // A multi-line source stored as a list already carries its line
+         // endings, so the pieces are joined with '' (joining with "\n"
+         // would double every line break).
+         $extractedText .= is_array($source) ? implode('', $source) : $source;
+         $extractedText .= "\n";
+     }
+
+     return $extractedText;
+ }
+
+ /**
+  * Extract the content of a file that has no dedicated extractor.
+  *
+  * The file is returned verbatim, provided isBinaryFile() does not classify
+  * it as binary.
+  *
+  * @param string $filePath Path of the file to read.
+  * @return string Raw file content.
+  * @throws Exception When the file is classified as binary or cannot be read.
+  */
+ protected function extractFromOtherFile(string $filePath): string
+ {
+     if ($this->isBinaryFile($filePath)) {
+         throw new Exception("无法读取该类型文件的文本内容");
+     }
+
+     $content = file_get_contents($filePath);
+     // file_get_contents() signals failure with false, which must not leak
+     // through the string return type.
+     if ($content === false) {
+         throw new Exception("文件读取失败: " . $filePath);
+     }
+
+     return $content;
+ }
+
+ /**
+  * Decide whether a file should be treated as binary.
+  *
+  * The decision is based on the MIME string reported by fileinfo: a file
+  * counts as text when that string contains "text/", "application/json" or
+  * "application/xml"; everything else is treated as binary. When the MIME
+  * type cannot be determined at all, the file is also treated as binary.
+  *
+  * @param string $filePath Path of the file to inspect.
+  * @return bool True when the file is considered binary.
+  */
+ protected function isBinaryFile(string $filePath): bool
+ {
+     $finfo = finfo_open(FILEINFO_MIME);
+     if ($finfo === false) {
+         // No fileinfo database available: be conservative.
+         return true;
+     }
+
+     $mimeType = finfo_file($finfo, $filePath);
+     finfo_close($finfo);
+
+     // finfo_file() returns false on failure; never feed that to str_contains().
+     if (!is_string($mimeType)) {
+         return true;
+     }
+
+     foreach (['text/', 'application/json', 'application/xml'] as $textualMarker) {
+         if (str_contains($mimeType, $textualMarker)) {
+             return false;
+         }
+     }
+
+     return true;
+ }
+
+ /** ********************************************************************* */
+ /** ********************************************************************* */
+ /** ********************************************************************* */
+
+ /**
+  * Placeholder that is not implemented yet.
+  *
+  * The body contains only notes, so the method currently returns nothing
+  * (null) and ignores its argument.
+  *
+  * @param mixed $filePath Currently unused.
+  */
+ public static function parsePaths($filePath)
+ {
+ // todo
+ // NOTE(review): the lines below look like sketches of the intended output
+ // templates for file and site content — confirm before implementing.
+ // (see below for file content)
+ // \n${content}\n
+
+ // (see below for site content)
+ // \n${result}\n
+ }
+
+ /**
+  * Return the textual content of a file, or a parenthesised notice when it
+  * cannot be provided.
+  *
+  * This method does not let an Exception escape: missing files, binary
+  * files and extraction failures are all reported through the returned
+  * string.
+  *
+  * @param string $filePath Path of the file to read.
+  * @return string Extracted text, or a "(...)" notice describing the failure.
+  */
+ public static function getFileContent($filePath)
+ {
+     if (!file_exists($filePath) || !is_file($filePath)) {
+         return "(Failed to read contents of {$filePath})";
+     }
+     $te = new self();
+     try {
+         // PDF and DOCX files are binary by MIME type but have dedicated
+         // extractors; running the binary check on them first would make
+         // those extractors unreachable from this entry point.
+         // NOTE(review): the list mirrors the PDF/DOCX extractors that
+         // extractText() dispatches to; its extension handling is outside
+         // this view — keep both in sync.
+         $extension = strtolower((string) pathinfo($filePath, PATHINFO_EXTENSION));
+         $hasDedicatedExtractor = in_array($extension, ['pdf', 'docx'], true);
+
+         if (!$hasDedicatedExtractor && $te->isBinaryFile($filePath)) {
+             return "(Binary file, unable to display content)";
+         }
+         return $te->extractText($filePath);
+     } catch (Exception $e) {
+         return "(Failed to read contents of {$filePath}: {$e->getMessage()})";
+     }
+ }
+}
diff --git a/composer.json b/composer.json
index 280cc7dba..b49914321 100644
--- a/composer.json
+++ b/composer.json
@@ -11,6 +11,7 @@
"php": "^8.0",
"ext-curl": "*",
"ext-dom": "*",
+ "ext-fileinfo": "*",
"ext-gd": "*",
"ext-imagick": "*",
"ext-json": "*",
@@ -35,7 +36,9 @@
"mews/captcha": "^3.2.6",
"orangehill/iseed": "^3.0.1",
"overtrue/pinyin": "^4.0",
+ "phpoffice/phpword": "^1.3",
"predis/predis": "^1.1.7",
+ "smalot/pdfparser": "^2.11",
"symfony/mailer": "^6.0"
},
"require-dev": {
diff --git a/composer.lock b/composer.lock
index d2a291e3b..feb4b3395 100644
--- a/composer.lock
+++ b/composer.lock
@@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
- "content-hash": "4bfa166220b4b329f40e62394c6336a0",
+ "content-hash": "ef4844086b8b7fde7050b10a9d4e436a",
"packages": [
{
"name": "asm89/stack-cors",
@@ -3679,6 +3679,58 @@
],
"time": "2023-04-27T10:17:12+00:00"
},
+ {
+ "name": "phpoffice/math",
+ "version": "0.2.0",
+ "source": {
+ "type": "git",
+ "url": "https://github.com/PHPOffice/Math.git",
+ "reference": "fc2eb6d1a61b058d5dac77197059db30ee3c8329"
+ },
+ "dist": {
+ "type": "zip",
+ "url": "https://api.github.com/repos/PHPOffice/Math/zipball/fc2eb6d1a61b058d5dac77197059db30ee3c8329",
+ "reference": "fc2eb6d1a61b058d5dac77197059db30ee3c8329",
+ "shasum": ""
+ },
+ "require": {
+ "ext-dom": "*",
+ "ext-xml": "*",
+ "php": "^7.1|^8.0"
+ },
+ "require-dev": {
+ "phpstan/phpstan": "^0.12.88 || ^1.0.0",
+ "phpunit/phpunit": "^7.0 || ^9.0"
+ },
+ "type": "library",
+ "autoload": {
+ "psr-4": {
+ "PhpOffice\\Math\\": "src/Math/"
+ }
+ },
+ "notification-url": "https://packagist.org/downloads/",
+ "license": [
+ "MIT"
+ ],
+ "authors": [
+ {
+ "name": "Progi1984",
+ "homepage": "https://lefevre.dev"
+ }
+ ],
+ "description": "Math - Manipulate Math Formula",
+ "homepage": "https://phpoffice.github.io/Math/",
+ "keywords": [
+ "MathML",
+ "officemathml",
+ "php"
+ ],
+ "support": {
+ "issues": "https://github.com/PHPOffice/Math/issues",
+ "source": "https://github.com/PHPOffice/Math/tree/0.2.0"
+ },
+ "time": "2024-08-12T07:30:45+00:00"
+ },
{
"name": "phpoffice/phpspreadsheet",
"version": "1.29.5",
@@ -3784,6 +3836,115 @@
},
"time": "2024-11-22T05:57:44+00:00"
},
+ {
+ "name": "phpoffice/phpword",
+ "version": "1.3.0",
+ "source": {
+ "type": "git",
+ "url": "https://github.com/PHPOffice/PHPWord.git",
+ "reference": "8392134ce4b5dba65130ba956231a1602b848b7f"
+ },
+ "dist": {
+ "type": "zip",
+ "url": "https://api.github.com/repos/PHPOffice/PHPWord/zipball/8392134ce4b5dba65130ba956231a1602b848b7f",
+ "reference": "8392134ce4b5dba65130ba956231a1602b848b7f",
+ "shasum": ""
+ },
+ "require": {
+ "ext-dom": "*",
+ "ext-json": "*",
+ "ext-xml": "*",
+ "php": "^7.1|^8.0",
+ "phpoffice/math": "^0.2"
+ },
+ "require-dev": {
+ "dompdf/dompdf": "^2.0",
+ "ext-gd": "*",
+ "ext-libxml": "*",
+ "ext-zip": "*",
+ "friendsofphp/php-cs-fixer": "^3.3",
+ "mpdf/mpdf": "^8.1",
+ "phpmd/phpmd": "^2.13",
+ "phpstan/phpstan-phpunit": "@stable",
+ "phpunit/phpunit": ">=7.0",
+ "symfony/process": "^4.4 || ^5.0",
+ "tecnickcom/tcpdf": "^6.5"
+ },
+ "suggest": {
+ "dompdf/dompdf": "Allows writing PDF",
+ "ext-gd2": "Allows adding images",
+ "ext-xmlwriter": "Allows writing OOXML and ODF",
+ "ext-xsl": "Allows applying XSL style sheet to headers, to main document part, and to footers of an OOXML template",
+ "ext-zip": "Allows writing OOXML and ODF"
+ },
+ "type": "library",
+ "autoload": {
+ "psr-4": {
+ "PhpOffice\\PhpWord\\": "src/PhpWord"
+ }
+ },
+ "notification-url": "https://packagist.org/downloads/",
+ "license": [
+ "LGPL-3.0"
+ ],
+ "authors": [
+ {
+ "name": "Mark Baker"
+ },
+ {
+ "name": "Gabriel Bull",
+ "email": "me@gabrielbull.com",
+ "homepage": "http://gabrielbull.com/"
+ },
+ {
+ "name": "Franck Lefevre",
+ "homepage": "https://rootslabs.net/blog/"
+ },
+ {
+ "name": "Ivan Lanin",
+ "homepage": "http://ivan.lanin.org"
+ },
+ {
+ "name": "Roman Syroeshko",
+ "homepage": "http://ru.linkedin.com/pub/roman-syroeshko/34/a53/994/"
+ },
+ {
+ "name": "Antoine de Troostembergh"
+ }
+ ],
+ "description": "PHPWord - A pure PHP library for reading and writing word processing documents (OOXML, ODF, RTF, HTML, PDF)",
+ "homepage": "https://phpoffice.github.io/PHPWord/",
+ "keywords": [
+ "ISO IEC 29500",
+ "OOXML",
+ "Office Open XML",
+ "OpenDocument",
+ "OpenXML",
+ "PhpOffice",
+ "PhpWord",
+ "Rich Text Format",
+ "WordprocessingML",
+ "doc",
+ "docx",
+ "html",
+ "odf",
+ "odt",
+ "office",
+ "pdf",
+ "php",
+ "reader",
+ "rtf",
+ "template",
+ "template processor",
+ "word",
+ "writer"
+ ],
+ "support": {
+ "issues": "https://github.com/PHPOffice/PHPWord/issues",
+ "source": "https://github.com/PHPOffice/PHPWord/tree/1.3.0"
+ },
+ "time": "2024-08-30T18:03:42+00:00"
+ },
{
"name": "phpoption/phpoption",
"version": "1.9.3",
@@ -4637,6 +4798,57 @@
],
"time": "2024-04-27T21:32:50+00:00"
},
+ {
+ "name": "smalot/pdfparser",
+ "version": "v2.11.0",
+ "source": {
+ "type": "git",
+ "url": "https://github.com/smalot/pdfparser.git",
+ "reference": "ac8e6678b0940e4b2ccd5caadd3fb18e68093be6"
+ },
+ "dist": {
+ "type": "zip",
+ "url": "https://api.github.com/repos/smalot/pdfparser/zipball/ac8e6678b0940e4b2ccd5caadd3fb18e68093be6",
+ "reference": "ac8e6678b0940e4b2ccd5caadd3fb18e68093be6",
+ "shasum": ""
+ },
+ "require": {
+ "ext-iconv": "*",
+ "ext-zlib": "*",
+ "php": ">=7.1",
+ "symfony/polyfill-mbstring": "^1.18"
+ },
+ "type": "library",
+ "autoload": {
+ "psr-0": {
+ "Smalot\\PdfParser\\": "src/"
+ }
+ },
+ "notification-url": "https://packagist.org/downloads/",
+ "license": [
+ "LGPL-3.0"
+ ],
+ "authors": [
+ {
+ "name": "Sebastien MALOT",
+ "email": "sebastien@malot.fr"
+ }
+ ],
+ "description": "Pdf parser library. Can read and extract information from pdf file.",
+ "homepage": "https://www.pdfparser.org",
+ "keywords": [
+ "extract",
+ "parse",
+ "parser",
+ "pdf",
+ "text"
+ ],
+ "support": {
+ "issues": "https://github.com/smalot/pdfparser/issues",
+ "source": "https://github.com/smalot/pdfparser/tree/v2.11.0"
+ },
+ "time": "2024-08-16T06:48:03+00:00"
+ },
{
"name": "swiftmailer/swiftmailer",
"version": "v6.3.0",
@@ -10705,6 +10917,8 @@
"platform": {
"php": "^8.0",
"ext-curl": "*",
+ "ext-dom": "*",
+ "ext-fileinfo": "*",
"ext-gd": "*",
"ext-imagick": "*",
"ext-json": "*",