mirror of
https://github.com/kuaifan/dootask.git
synced 2025-12-12 11:19:56 +00:00
no message
This commit is contained in:
parent
368fae5f32
commit
e6167119e0
183
app/Module/AiBot/TextExtractor.php
Normal file
183
app/Module/AiBot/TextExtractor.php
Normal file
@ -0,0 +1,183 @@
|
||||
<?php
|
||||
|
||||
namespace App\Module\AiBot;
|
||||
|
||||
use Exception;
|
||||
use Illuminate\Support\Facades\Log;
|
||||
use PhpOffice\PhpWord\IOFactory;
|
||||
use Smalot\PdfParser\Parser;
|
||||
|
||||
/**
 * Extracts plain text from files so their content can be handed to the AI bot.
 *
 * PDF and DOCX files are delegated to third-party parsers, Jupyter notebooks
 * are decoded as JSON, and everything else is read verbatim as long as its
 * MIME type looks textual.
 */
class TextExtractor
{
    /**
     * Extensions handled by a dedicated parser. These formats are binary
     * containers, so a MIME-based "is this binary?" check must not be used
     * to reject them.
     */
    private const PARSER_BACKED_EXTENSIONS = ['pdf', 'docx'];

    /** MIME substrings that mark a file as readable text. */
    private const TEXT_MIME_MARKERS = ['text/', 'application/json', 'application/xml'];

    /**
     * Extract the text of a file, dispatching on its (lower-cased) extension.
     *
     * @param string $filePath Path of the file to read.
     * @return string The extracted text.
     * @throws Exception When the file does not exist or extraction fails.
     */
    public function extractText(string $filePath): string
    {
        if (!file_exists($filePath)) {
            throw new Exception("文件不存在: {$filePath}");
        }

        $fileExtension = strtolower(pathinfo($filePath, PATHINFO_EXTENSION));

        try {
            return match ($fileExtension) {
                'pdf' => $this->extractFromPDF($filePath),
                'docx' => $this->extractFromDOCX($filePath),
                'ipynb' => $this->extractFromIPYNB($filePath),
                default => $this->extractFromOtherFile($filePath),
            };
        } catch (\Throwable $e) {
            // Throwable rather than Exception: parser libraries can raise
            // Error subclasses (e.g. TypeError) on malformed documents, and
            // those failures should be logged too before being rethrown.
            Log::error('文本提取失败', [
                'file' => $filePath,
                'error' => $e->getMessage(),
            ]);
            throw $e;
        }
    }

    /**
     * Extract the text of a PDF file.
     *
     * Failures are not logged here; extractText() logs every failure once.
     *
     * @param string $filePath
     * @return string
     * @throws Exception When the parser fails; the original exception is
     *                   attached as the previous exception.
     */
    protected function extractFromPDF(string $filePath): string
    {
        try {
            $parser = new Parser();

            return $parser->parseFile($filePath)->getText();
        } catch (Exception $e) {
            throw new Exception("PDF文本提取失败: " . $e->getMessage(), 0, $e);
        }
    }

    /**
     * Extract the text of a DOCX file.
     *
     * Each top-level element of each section that yields text contributes
     * one line terminated by "\n".
     *
     * @param string $filePath
     * @return string
     * @throws Exception
     */
    protected function extractFromDOCX(string $filePath): string
    {
        $document = IOFactory::load($filePath);
        $text = '';

        foreach ($document->getSections() as $section) {
            foreach ($section->getElements() as $element) {
                $elementText = $this->collectElementText($element);
                if ($elementText !== null) {
                    $text .= $elementText . "\n";
                }
            }
        }

        return $text;
    }

    /**
     * Recursively gather the text of a document element by duck-typing.
     *
     * Handles three shapes: elements exposing getText() (whose result may
     * itself be an element object rather than a string), table-like elements
     * exposing getRows()/getCells(), and containers exposing getElements().
     *
     * @param object $element
     * @return string|null The element's text, or null when the element
     *                     exposes none of the supported accessors.
     */
    private function collectElementText(object $element): ?string
    {
        if (method_exists($element, 'getText')) {
            $value = $element->getText();
            if (is_object($value)) {
                // Concatenating an object would raise an Error, so unwrap it.
                return $this->collectElementText($value) ?? '';
            }

            return (string) $value;
        }

        if (method_exists($element, 'getRows')) {
            $cellTexts = [];
            foreach ($element->getRows() as $row) {
                foreach ($row->getCells() as $cell) {
                    $cellText = $this->collectElementText($cell);
                    if ($cellText !== null && $cellText !== '') {
                        $cellTexts[] = $cellText;
                    }
                }
            }

            return implode("\n", $cellTexts);
        }

        if (method_exists($element, 'getElements')) {
            $childTexts = [];
            foreach ($element->getElements() as $child) {
                $childText = $this->collectElementText($child);
                if ($childText !== null) {
                    $childTexts[] = $childText;
                }
            }

            return implode("\n", $childTexts);
        }

        return null;
    }

    /**
     * Extract the source of the markdown and code cells of a Jupyter notebook.
     *
     * Each matching cell contributes its source followed by "\n". Cells of
     * any other type, and cells without a source, are skipped.
     *
     * @param string $filePath
     * @return string
     * @throws Exception When the file cannot be read or is not valid JSON.
     */
    protected function extractFromIPYNB(string $filePath): string
    {
        $content = file_get_contents($filePath);
        if ($content === false) {
            throw new Exception("IPYNB文件读取失败: {$filePath}");
        }

        try {
            $notebook = json_decode($content, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            throw new Exception("IPYNB文件解析失败: " . $e->getMessage(), 0, $e);
        }

        $cells = is_array($notebook) ? ($notebook['cells'] ?? []) : [];
        if (!is_array($cells)) {
            $cells = [];
        }

        $extractedText = '';

        foreach ($cells as $cell) {
            if (!is_array($cell) || !isset($cell['source'])) {
                continue;
            }
            if (!in_array($cell['cell_type'] ?? '', ['markdown', 'code'], true)) {
                continue;
            }

            $source = $cell['source'];
            // In list form every entry already ends with its own newline
            // (except possibly the last), so join without a separator;
            // joining with "\n" would double every line break.
            $extractedText .= is_array($source) ? implode('', $source) : (string) $source;
            $extractedText .= "\n";
        }

        return $extractedText;
    }

    /**
     * Read a file of any other type verbatim, provided it looks like text.
     *
     * @param string $filePath
     * @return string
     * @throws Exception When the file is binary or cannot be read.
     */
    protected function extractFromOtherFile(string $filePath): string
    {
        if ($this->isBinaryFile($filePath)) {
            throw new Exception("无法读取该类型文件的文本内容");
        }

        $content = file_get_contents($filePath);
        if ($content === false) {
            // Without this check `false` would be coerced to "" by the
            // string return type and the read failure would go unnoticed.
            throw new Exception("文件读取失败: {$filePath}");
        }

        return $content;
    }

    /**
     * Tell whether a file should be treated as binary, based on its MIME type.
     *
     * A file counts as text when its detected MIME string contains one of
     * TEXT_MIME_MARKERS. A zero-byte file is reported as non-binary so that
     * it reads back as an empty string instead of being rejected (zero-byte
     * files are typically reported with an "x-empty" type, which matches
     * none of the markers).
     *
     * @param string $filePath
     * @return bool True when the file should not be read as text.
     */
    protected function isBinaryFile(string $filePath): bool
    {
        if (is_file($filePath) && filesize($filePath) === 0) {
            return false;
        }

        $mimeType = (new \finfo(FILEINFO_MIME))->file($filePath);
        if (!is_string($mimeType)) {
            // Detection failed: stay on the safe side and refuse to read it.
            return true;
        }

        foreach (self::TEXT_MIME_MARKERS as $marker) {
            if (str_contains($mimeType, $marker)) {
                return false;
            }
        }

        return true;
    }

    /* ********************************************************************* */

    /**
     * Placeholder, not implemented yet: the body is empty and the method
     * returns null.
     *
     * NOTE(review): judging by the notes below, this is presumably meant to
     * expand file/URL mentions into the templates shown — confirm intent.
     *
     * @param mixed $filePath
     */
    public static function parsePaths($filePath)
    {
        // todo
        // (see below for file content)
        // <file_content path="${mentionPath}">\n${content}\n</file_content>

        // (see below for site content)
        // <site_content url="${mention}">\n${result}\n</site_content>
    }

    /**
     * Return the text of a file, or a parenthesised English explanation when
     * the content cannot be produced. This method never throws.
     *
     * @param mixed $filePath Path of the file to read.
     * @return string The extracted text or a "(...)" diagnostic message.
     */
    public static function getFileContent($filePath)
    {
        if (!file_exists($filePath) || !is_file($filePath)) {
            return "(Failed to read contents of {$filePath})";
        }

        $te = new self();

        try {
            $extension = strtolower(pathinfo($filePath, PATHINFO_EXTENSION));
            $hasDedicatedParser = in_array($extension, self::PARSER_BACKED_EXTENSIONS, true);

            // PDF and DOCX are binary on disk but are handled by dedicated
            // parsers, so the binary check only applies to formats that
            // would be read as raw text.
            if (!$hasDedicatedParser && $te->isBinaryFile($filePath)) {
                return "(Binary file, unable to display content)";
            }

            return $te->extractText($filePath);
        } catch (\Throwable $e) {
            return "(Failed to read contents of {$filePath}: {$e->getMessage()})";
        }
    }
}
|
||||
@ -11,6 +11,7 @@
|
||||
"php": "^8.0",
|
||||
"ext-curl": "*",
|
||||
"ext-dom": "*",
|
||||
"ext-fileinfo": "*",
|
||||
"ext-gd": "*",
|
||||
"ext-imagick": "*",
|
||||
"ext-json": "*",
|
||||
@ -35,7 +36,9 @@
|
||||
"mews/captcha": "^3.2.6",
|
||||
"orangehill/iseed": "^3.0.1",
|
||||
"overtrue/pinyin": "^4.0",
|
||||
"phpoffice/phpword": "^1.3",
|
||||
"predis/predis": "^1.1.7",
|
||||
"smalot/pdfparser": "^2.11",
|
||||
"symfony/mailer": "^6.0"
|
||||
},
|
||||
"require-dev": {
|
||||
|
||||
216
composer.lock
generated
216
composer.lock
generated
@ -4,7 +4,7 @@
|
||||
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
|
||||
"This file is @generated automatically"
|
||||
],
|
||||
"content-hash": "4bfa166220b4b329f40e62394c6336a0",
|
||||
"content-hash": "ef4844086b8b7fde7050b10a9d4e436a",
|
||||
"packages": [
|
||||
{
|
||||
"name": "asm89/stack-cors",
|
||||
@ -3679,6 +3679,58 @@
|
||||
],
|
||||
"time": "2023-04-27T10:17:12+00:00"
|
||||
},
|
||||
{
|
||||
"name": "phpoffice/math",
|
||||
"version": "0.2.0",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/PHPOffice/Math.git",
|
||||
"reference": "fc2eb6d1a61b058d5dac77197059db30ee3c8329"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/PHPOffice/Math/zipball/fc2eb6d1a61b058d5dac77197059db30ee3c8329",
|
||||
"reference": "fc2eb6d1a61b058d5dac77197059db30ee3c8329",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
"ext-dom": "*",
|
||||
"ext-xml": "*",
|
||||
"php": "^7.1|^8.0"
|
||||
},
|
||||
"require-dev": {
|
||||
"phpstan/phpstan": "^0.12.88 || ^1.0.0",
|
||||
"phpunit/phpunit": "^7.0 || ^9.0"
|
||||
},
|
||||
"type": "library",
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"PhpOffice\\Math\\": "src/Math/"
|
||||
}
|
||||
},
|
||||
"notification-url": "https://packagist.org/downloads/",
|
||||
"license": [
|
||||
"MIT"
|
||||
],
|
||||
"authors": [
|
||||
{
|
||||
"name": "Progi1984",
|
||||
"homepage": "https://lefevre.dev"
|
||||
}
|
||||
],
|
||||
"description": "Math - Manipulate Math Formula",
|
||||
"homepage": "https://phpoffice.github.io/Math/",
|
||||
"keywords": [
|
||||
"MathML",
|
||||
"officemathml",
|
||||
"php"
|
||||
],
|
||||
"support": {
|
||||
"issues": "https://github.com/PHPOffice/Math/issues",
|
||||
"source": "https://github.com/PHPOffice/Math/tree/0.2.0"
|
||||
},
|
||||
"time": "2024-08-12T07:30:45+00:00"
|
||||
},
|
||||
{
|
||||
"name": "phpoffice/phpspreadsheet",
|
||||
"version": "1.29.5",
|
||||
@ -3784,6 +3836,115 @@
|
||||
},
|
||||
"time": "2024-11-22T05:57:44+00:00"
|
||||
},
|
||||
{
|
||||
"name": "phpoffice/phpword",
|
||||
"version": "1.3.0",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/PHPOffice/PHPWord.git",
|
||||
"reference": "8392134ce4b5dba65130ba956231a1602b848b7f"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/PHPOffice/PHPWord/zipball/8392134ce4b5dba65130ba956231a1602b848b7f",
|
||||
"reference": "8392134ce4b5dba65130ba956231a1602b848b7f",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
"ext-dom": "*",
|
||||
"ext-json": "*",
|
||||
"ext-xml": "*",
|
||||
"php": "^7.1|^8.0",
|
||||
"phpoffice/math": "^0.2"
|
||||
},
|
||||
"require-dev": {
|
||||
"dompdf/dompdf": "^2.0",
|
||||
"ext-gd": "*",
|
||||
"ext-libxml": "*",
|
||||
"ext-zip": "*",
|
||||
"friendsofphp/php-cs-fixer": "^3.3",
|
||||
"mpdf/mpdf": "^8.1",
|
||||
"phpmd/phpmd": "^2.13",
|
||||
"phpstan/phpstan-phpunit": "@stable",
|
||||
"phpunit/phpunit": ">=7.0",
|
||||
"symfony/process": "^4.4 || ^5.0",
|
||||
"tecnickcom/tcpdf": "^6.5"
|
||||
},
|
||||
"suggest": {
|
||||
"dompdf/dompdf": "Allows writing PDF",
|
||||
"ext-gd2": "Allows adding images",
|
||||
"ext-xmlwriter": "Allows writing OOXML and ODF",
|
||||
"ext-xsl": "Allows applying XSL style sheet to headers, to main document part, and to footers of an OOXML template",
|
||||
"ext-zip": "Allows writing OOXML and ODF"
|
||||
},
|
||||
"type": "library",
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"PhpOffice\\PhpWord\\": "src/PhpWord"
|
||||
}
|
||||
},
|
||||
"notification-url": "https://packagist.org/downloads/",
|
||||
"license": [
|
||||
"LGPL-3.0"
|
||||
],
|
||||
"authors": [
|
||||
{
|
||||
"name": "Mark Baker"
|
||||
},
|
||||
{
|
||||
"name": "Gabriel Bull",
|
||||
"email": "me@gabrielbull.com",
|
||||
"homepage": "http://gabrielbull.com/"
|
||||
},
|
||||
{
|
||||
"name": "Franck Lefevre",
|
||||
"homepage": "https://rootslabs.net/blog/"
|
||||
},
|
||||
{
|
||||
"name": "Ivan Lanin",
|
||||
"homepage": "http://ivan.lanin.org"
|
||||
},
|
||||
{
|
||||
"name": "Roman Syroeshko",
|
||||
"homepage": "http://ru.linkedin.com/pub/roman-syroeshko/34/a53/994/"
|
||||
},
|
||||
{
|
||||
"name": "Antoine de Troostembergh"
|
||||
}
|
||||
],
|
||||
"description": "PHPWord - A pure PHP library for reading and writing word processing documents (OOXML, ODF, RTF, HTML, PDF)",
|
||||
"homepage": "https://phpoffice.github.io/PHPWord/",
|
||||
"keywords": [
|
||||
"ISO IEC 29500",
|
||||
"OOXML",
|
||||
"Office Open XML",
|
||||
"OpenDocument",
|
||||
"OpenXML",
|
||||
"PhpOffice",
|
||||
"PhpWord",
|
||||
"Rich Text Format",
|
||||
"WordprocessingML",
|
||||
"doc",
|
||||
"docx",
|
||||
"html",
|
||||
"odf",
|
||||
"odt",
|
||||
"office",
|
||||
"pdf",
|
||||
"php",
|
||||
"reader",
|
||||
"rtf",
|
||||
"template",
|
||||
"template processor",
|
||||
"word",
|
||||
"writer"
|
||||
],
|
||||
"support": {
|
||||
"issues": "https://github.com/PHPOffice/PHPWord/issues",
|
||||
"source": "https://github.com/PHPOffice/PHPWord/tree/1.3.0"
|
||||
},
|
||||
"time": "2024-08-30T18:03:42+00:00"
|
||||
},
|
||||
{
|
||||
"name": "phpoption/phpoption",
|
||||
"version": "1.9.3",
|
||||
@ -4637,6 +4798,57 @@
|
||||
],
|
||||
"time": "2024-04-27T21:32:50+00:00"
|
||||
},
|
||||
{
|
||||
"name": "smalot/pdfparser",
|
||||
"version": "v2.11.0",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/smalot/pdfparser.git",
|
||||
"reference": "ac8e6678b0940e4b2ccd5caadd3fb18e68093be6"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/smalot/pdfparser/zipball/ac8e6678b0940e4b2ccd5caadd3fb18e68093be6",
|
||||
"reference": "ac8e6678b0940e4b2ccd5caadd3fb18e68093be6",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
"ext-iconv": "*",
|
||||
"ext-zlib": "*",
|
||||
"php": ">=7.1",
|
||||
"symfony/polyfill-mbstring": "^1.18"
|
||||
},
|
||||
"type": "library",
|
||||
"autoload": {
|
||||
"psr-0": {
|
||||
"Smalot\\PdfParser\\": "src/"
|
||||
}
|
||||
},
|
||||
"notification-url": "https://packagist.org/downloads/",
|
||||
"license": [
|
||||
"LGPL-3.0"
|
||||
],
|
||||
"authors": [
|
||||
{
|
||||
"name": "Sebastien MALOT",
|
||||
"email": "sebastien@malot.fr"
|
||||
}
|
||||
],
|
||||
"description": "Pdf parser library. Can read and extract information from pdf file.",
|
||||
"homepage": "https://www.pdfparser.org",
|
||||
"keywords": [
|
||||
"extract",
|
||||
"parse",
|
||||
"parser",
|
||||
"pdf",
|
||||
"text"
|
||||
],
|
||||
"support": {
|
||||
"issues": "https://github.com/smalot/pdfparser/issues",
|
||||
"source": "https://github.com/smalot/pdfparser/tree/v2.11.0"
|
||||
},
|
||||
"time": "2024-08-16T06:48:03+00:00"
|
||||
},
|
||||
{
|
||||
"name": "swiftmailer/swiftmailer",
|
||||
"version": "v6.3.0",
|
||||
@ -10705,6 +10917,8 @@
|
||||
"platform": {
|
||||
"php": "^8.0",
|
||||
"ext-curl": "*",
|
||||
"ext-dom": "*",
|
||||
"ext-fileinfo": "*",
|
||||
"ext-gd": "*",
|
||||
"ext-imagick": "*",
|
||||
"ext-json": "*",
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user