perf: 优化AI支持文件类型

This commit is contained in:
kuaifan 2025-03-20 15:34:31 +08:00
parent 919289c5ca
commit 88b995ca9c

View File

@ -6,52 +6,83 @@ use Exception;
use PhpOffice\PhpWord\IOFactory as WordIOFactory; use PhpOffice\PhpWord\IOFactory as WordIOFactory;
use PhpOffice\PhpSpreadsheet\IOFactory as SpreadsheetIOFactory; use PhpOffice\PhpSpreadsheet\IOFactory as SpreadsheetIOFactory;
use PhpOffice\PhpPresentation\IOFactory as PresentationIOFactory; use PhpOffice\PhpPresentation\IOFactory as PresentationIOFactory;
use Illuminate\Support\Facades\File as FileFacade;
class TextExtractor class TextExtractor
{ {
private string $filePath;
private string $fileMimeType;
private string $fileExtension;
/** /**
* 从文件中提取文本 * @param string $filePath
*
* @param string $filePath 文件路径
* @return string
* @throws Exception * @throws Exception
*/ */
public function extractContent(string $filePath): string public function __construct(string $filePath)
{ {
if (!file_exists($filePath)) { if (!file_exists($filePath)) {
throw new Exception("File does not exist: {$filePath}"); throw new Exception("File does not exist: {$filePath}");
} }
$this->filePath = $filePath;
$this->fileMimeType = FileFacade::mimeType($filePath);
$this->fileExtension = $this->detectFileType();
}
$fileExtension = strtolower(pathinfo($filePath, PATHINFO_EXTENSION)); /**
* 从文件中提取文本
* @return string
* @throws Exception
*/
public function extractContent(): string
{
return match ($this->fileExtension) {
// Word文档
'docx' => $this->parseWordDocument(),
return match ($fileExtension) { // Excel文档
// Word documents 'xlsx', 'xls', 'csv' => $this->parseSpreadsheet(),
'docx' => $this->parseWordDocument($filePath),
// Spreadsheet files // PowerPoint文档
'xlsx', 'xls', 'csv' => $this->parseSpreadsheet($filePath), 'ppt', 'pptx' => $this->parsePresentation(),
// Presentation files // PDF文档
'ppt', 'pptx' => $this->parsePresentation($filePath), 'pdf' => $this->parsePdf(),
// PDF files (requires additional library) // RTF文档
'pdf' => $this->parsePdf($filePath), 'rtf' => $this->parseRtf(),
// RTF files // 其他文本文件
'rtf' => $this->parseRtf($filePath), default => $this->parseOther(),
};
}
// Default case /**
default => $this->parseOther($filePath), * 获取文件类型
* @return string
*/
private function detectFileType(): string
{
return match ($this->fileMimeType) {
'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => 'docx',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' => 'xlsx',
'application/vnd.ms-excel' => 'xls',
'text/csv', 'application/csv' => 'csv',
'application/vnd.ms-powerpoint' => 'ppt',
'application/vnd.openxmlformats-officedocument.presentationml.presentation' => 'pptx',
'application/pdf' => 'pdf',
'application/rtf', 'text/rtf' => 'rtf',
default => strtolower(pathinfo($this->filePath, PATHINFO_EXTENSION)),
}; };
} }
/** /**
* Parse Word documents (.doc, .docx) * Parse Word documents (.doc, .docx)
* @return string
*/ */
private function parseWordDocument(string $filePath): string private function parseWordDocument(): string
{ {
$phpWord = WordIOFactory::load($filePath); $phpWord = WordIOFactory::load($this->filePath);
$text = ''; $text = '';
// Extract text from each section // Extract text from each section
@ -74,10 +105,11 @@ class TextExtractor
/** /**
* Parse spreadsheet files (.xlsx, .xls, .csv) * Parse spreadsheet files (.xlsx, .xls, .csv)
* @return string
*/ */
private function parseSpreadsheet(string $filePath): string private function parseSpreadsheet(): string
{ {
$spreadsheet = SpreadsheetIOFactory::load($filePath); $spreadsheet = SpreadsheetIOFactory::load($this->filePath);
$text = ''; $text = '';
// Extract text from all worksheets // Extract text from all worksheets
@ -109,11 +141,12 @@ class TextExtractor
/** /**
* Parse presentation files (.ppt, .pptx) * Parse presentation files (.ppt, .pptx)
* @return string
* @throws Exception * @throws Exception
*/ */
private function parsePresentation(string $filePath): string private function parsePresentation(): string
{ {
$presentation = PresentationIOFactory::load($filePath); $presentation = PresentationIOFactory::load($this->filePath);
$text = ''; $text = '';
// Extract text from all slides // Extract text from all slides
@ -136,9 +169,10 @@ class TextExtractor
/** /**
* Parse PDF files (requires additional library like Smalot\PdfParser) * Parse PDF files (requires additional library like Smalot\PdfParser)
* @return string
* @throws Exception * @throws Exception
*/ */
private function parsePdf(string $filePath): string private function parsePdf(): string
{ {
// You'll need to install the Smalot PDF Parser: composer require smalot/pdfparser // You'll need to install the Smalot PDF Parser: composer require smalot/pdfparser
if (!class_exists('\Smalot\PdfParser\Parser')) { if (!class_exists('\Smalot\PdfParser\Parser')) {
@ -146,17 +180,18 @@ class TextExtractor
} }
$parser = new \Smalot\PdfParser\Parser(); $parser = new \Smalot\PdfParser\Parser();
$pdf = $parser->parseFile($filePath); $pdf = $parser->parseFile($this->filePath);
return $pdf->getText(); return $pdf->getText();
} }
/** /**
* Parse RTF files * Parse RTF files
* @return string
*/ */
private function parseRtf(string $filePath): string private function parseRtf(): string
{ {
// Simple RTF to text conversion // Simple RTF to text conversion
$content = file_get_contents($filePath); $content = file_get_contents($this->filePath);
// Remove RTF control words and groups // Remove RTF control words and groups
$content = preg_replace('/\\\\([a-z]{1,32})(-?[0-9]{1,10})?[ ]?/i', '', $content); $content = preg_replace('/\\\\([a-z]{1,32})(-?[0-9]{1,10})?[ ]?/i', '', $content);
@ -175,23 +210,20 @@ class TextExtractor
/** /**
* Parse Other(text) files * Parse Other(text) files
* @return string
* @throws Exception * @throws Exception
*/ */
private function parseOther(string $filePath): string private function parseOther(): string
{ {
$finfo = finfo_open(FILEINFO_MIME); $isBinary = !str_contains($this->fileMimeType, 'text/')
$mimeType = finfo_file($finfo, $filePath); && !str_contains($this->fileMimeType, 'application/json')
finfo_close($finfo); && !str_contains($this->fileMimeType, 'application/xml');
$isBinary = !str_contains($mimeType, 'text/')
&& !str_contains($mimeType, 'application/json')
&& !str_contains($mimeType, 'application/xml');
if ($isBinary) { if ($isBinary) {
throw new Exception("Unable to read the text content of this type of file"); throw new Exception("Unable to read the text content of this type of file");
} }
return file_get_contents($filePath); return file_get_contents($this->filePath);
} }
/** ********************************************************************* */ /** ********************************************************************* */
@ -201,20 +233,25 @@ class TextExtractor
/** /**
* 获取文件内容 * 获取文件内容
* @param $filePath * @param $filePath
* @param float|int $maxSize 最大文件大小单位字节默认300KB * @param int $fileMaxSize 最大文件大小单位字节默认1024KB
* @param int $contentMaxSize 最大内容大小单位字节默认300KB
* @return array * @return array
*/ */
public static function extractFile($filePath, float|int $maxSize = 300 * 1024): array public static function extractFile($filePath, int $fileMaxSize = 1024, int $contentMaxSize = 300): array
{ {
if (!file_exists($filePath) || !is_file($filePath)) { if (!file_exists($filePath) || !is_file($filePath)) {
return Base::retError("Failed to read contents of {$filePath}"); return Base::retError("Failed to read contents of {$filePath}");
} }
if (filesize($filePath) > $maxSize) { if (filesize($filePath) > $fileMaxSize * 1024) {
return Base::retError("File size exceeds " . Base::readableBytes($maxSize) . ", unable to display content"); return Base::retError("File size exceeds " . Base::readableBytes($fileMaxSize * 1024) . ", unable to display content");
} }
try { try {
$extractor = new self(); $extractor = new self($filePath);
return Base::retSuccess("success", $extractor->extractContent($filePath)); $content = $extractor->extractContent();
if (strlen($content) > $contentMaxSize * 1024) {
return Base::retError("Content size exceeds " . Base::readableBytes($contentMaxSize * 1024) . ", unable to display content");
}
return Base::retSuccess("success", $content);
} catch (Exception $e) { } catch (Exception $e) {
return Base::retError($e->getMessage()); return Base::retError($e->getMessage());
} }