mirror of
https://github.com/kuaifan/dootask.git
synced 2025-12-12 19:35:50 +00:00
perf: 优化AI支持文件类型
This commit is contained in:
parent
919289c5ca
commit
88b995ca9c
@ -6,52 +6,83 @@ use Exception;
|
|||||||
use PhpOffice\PhpWord\IOFactory as WordIOFactory;
|
use PhpOffice\PhpWord\IOFactory as WordIOFactory;
|
||||||
use PhpOffice\PhpSpreadsheet\IOFactory as SpreadsheetIOFactory;
|
use PhpOffice\PhpSpreadsheet\IOFactory as SpreadsheetIOFactory;
|
||||||
use PhpOffice\PhpPresentation\IOFactory as PresentationIOFactory;
|
use PhpOffice\PhpPresentation\IOFactory as PresentationIOFactory;
|
||||||
|
use Illuminate\Support\Facades\File as FileFacade;
|
||||||
|
|
||||||
|
|
||||||
class TextExtractor
|
class TextExtractor
|
||||||
{
|
{
|
||||||
|
private string $filePath;
|
||||||
|
private string $fileMimeType;
|
||||||
|
private string $fileExtension;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 从文件中提取文本
|
* @param string $filePath
|
||||||
*
|
|
||||||
* @param string $filePath 文件路径
|
|
||||||
* @return string
|
|
||||||
* @throws Exception
|
* @throws Exception
|
||||||
*/
|
*/
|
||||||
public function extractContent(string $filePath): string
|
public function __construct(string $filePath)
|
||||||
{
|
{
|
||||||
if (!file_exists($filePath)) {
|
if (!file_exists($filePath)) {
|
||||||
throw new Exception("File does not exist: {$filePath}");
|
throw new Exception("File does not exist: {$filePath}");
|
||||||
}
|
}
|
||||||
|
$this->filePath = $filePath;
|
||||||
|
$this->fileMimeType = FileFacade::mimeType($filePath);
|
||||||
|
$this->fileExtension = $this->detectFileType();
|
||||||
|
}
|
||||||
|
|
||||||
$fileExtension = strtolower(pathinfo($filePath, PATHINFO_EXTENSION));
|
/**
|
||||||
|
* 从文件中提取文本
|
||||||
|
* @return string
|
||||||
|
* @throws Exception
|
||||||
|
*/
|
||||||
|
public function extractContent(): string
|
||||||
|
{
|
||||||
|
return match ($this->fileExtension) {
|
||||||
|
// Word文档
|
||||||
|
'docx' => $this->parseWordDocument(),
|
||||||
|
|
||||||
return match ($fileExtension) {
|
// Excel文档
|
||||||
// Word documents
|
'xlsx', 'xls', 'csv' => $this->parseSpreadsheet(),
|
||||||
'docx' => $this->parseWordDocument($filePath),
|
|
||||||
|
|
||||||
// Spreadsheet files
|
// PowerPoint文档
|
||||||
'xlsx', 'xls', 'csv' => $this->parseSpreadsheet($filePath),
|
'ppt', 'pptx' => $this->parsePresentation(),
|
||||||
|
|
||||||
// Presentation files
|
// PDF文档
|
||||||
'ppt', 'pptx' => $this->parsePresentation($filePath),
|
'pdf' => $this->parsePdf(),
|
||||||
|
|
||||||
// PDF files (requires additional library)
|
// RTF文档
|
||||||
'pdf' => $this->parsePdf($filePath),
|
'rtf' => $this->parseRtf(),
|
||||||
|
|
||||||
// RTF files
|
// 其他文本文件
|
||||||
'rtf' => $this->parseRtf($filePath),
|
default => $this->parseOther(),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
// Default case
|
/**
|
||||||
default => $this->parseOther($filePath),
|
* 获取文件类型
|
||||||
|
* @return string
|
||||||
|
*/
|
||||||
|
private function detectFileType(): string
|
||||||
|
{
|
||||||
|
return match ($this->fileMimeType) {
|
||||||
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => 'docx',
|
||||||
|
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' => 'xlsx',
|
||||||
|
'application/vnd.ms-excel' => 'xls',
|
||||||
|
'text/csv', 'application/csv' => 'csv',
|
||||||
|
'application/vnd.ms-powerpoint' => 'ppt',
|
||||||
|
'application/vnd.openxmlformats-officedocument.presentationml.presentation' => 'pptx',
|
||||||
|
'application/pdf' => 'pdf',
|
||||||
|
'application/rtf', 'text/rtf' => 'rtf',
|
||||||
|
default => strtolower(pathinfo($this->filePath, PATHINFO_EXTENSION)),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse Word documents (.doc, .docx)
|
* Parse Word documents (.doc, .docx)
|
||||||
|
* @return string
|
||||||
*/
|
*/
|
||||||
private function parseWordDocument(string $filePath): string
|
private function parseWordDocument(): string
|
||||||
{
|
{
|
||||||
$phpWord = WordIOFactory::load($filePath);
|
$phpWord = WordIOFactory::load($this->filePath);
|
||||||
$text = '';
|
$text = '';
|
||||||
|
|
||||||
// Extract text from each section
|
// Extract text from each section
|
||||||
@ -74,10 +105,11 @@ class TextExtractor
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse spreadsheet files (.xlsx, .xls, .csv)
|
* Parse spreadsheet files (.xlsx, .xls, .csv)
|
||||||
|
* @return string
|
||||||
*/
|
*/
|
||||||
private function parseSpreadsheet(string $filePath): string
|
private function parseSpreadsheet(): string
|
||||||
{
|
{
|
||||||
$spreadsheet = SpreadsheetIOFactory::load($filePath);
|
$spreadsheet = SpreadsheetIOFactory::load($this->filePath);
|
||||||
$text = '';
|
$text = '';
|
||||||
|
|
||||||
// Extract text from all worksheets
|
// Extract text from all worksheets
|
||||||
@ -109,11 +141,12 @@ class TextExtractor
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse presentation files (.ppt, .pptx)
|
* Parse presentation files (.ppt, .pptx)
|
||||||
|
* @return string
|
||||||
* @throws Exception
|
* @throws Exception
|
||||||
*/
|
*/
|
||||||
private function parsePresentation(string $filePath): string
|
private function parsePresentation(): string
|
||||||
{
|
{
|
||||||
$presentation = PresentationIOFactory::load($filePath);
|
$presentation = PresentationIOFactory::load($this->filePath);
|
||||||
$text = '';
|
$text = '';
|
||||||
|
|
||||||
// Extract text from all slides
|
// Extract text from all slides
|
||||||
@ -136,9 +169,10 @@ class TextExtractor
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse PDF files (requires additional library like Smalot\PdfParser)
|
* Parse PDF files (requires additional library like Smalot\PdfParser)
|
||||||
|
* @return string
|
||||||
* @throws Exception
|
* @throws Exception
|
||||||
*/
|
*/
|
||||||
private function parsePdf(string $filePath): string
|
private function parsePdf(): string
|
||||||
{
|
{
|
||||||
// You'll need to install the Smalot PDF Parser: composer require smalot/pdfparser
|
// You'll need to install the Smalot PDF Parser: composer require smalot/pdfparser
|
||||||
if (!class_exists('\Smalot\PdfParser\Parser')) {
|
if (!class_exists('\Smalot\PdfParser\Parser')) {
|
||||||
@ -146,17 +180,18 @@ class TextExtractor
|
|||||||
}
|
}
|
||||||
|
|
||||||
$parser = new \Smalot\PdfParser\Parser();
|
$parser = new \Smalot\PdfParser\Parser();
|
||||||
$pdf = $parser->parseFile($filePath);
|
$pdf = $parser->parseFile($this->filePath);
|
||||||
return $pdf->getText();
|
return $pdf->getText();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse RTF files
|
* Parse RTF files
|
||||||
|
* @return string
|
||||||
*/
|
*/
|
||||||
private function parseRtf(string $filePath): string
|
private function parseRtf(): string
|
||||||
{
|
{
|
||||||
// Simple RTF to text conversion
|
// Simple RTF to text conversion
|
||||||
$content = file_get_contents($filePath);
|
$content = file_get_contents($this->filePath);
|
||||||
|
|
||||||
// Remove RTF control words and groups
|
// Remove RTF control words and groups
|
||||||
$content = preg_replace('/\\\\([a-z]{1,32})(-?[0-9]{1,10})?[ ]?/i', '', $content);
|
$content = preg_replace('/\\\\([a-z]{1,32})(-?[0-9]{1,10})?[ ]?/i', '', $content);
|
||||||
@ -175,23 +210,20 @@ class TextExtractor
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse Other(text) files
|
* Parse Other(text) files
|
||||||
|
* @return string
|
||||||
* @throws Exception
|
* @throws Exception
|
||||||
*/
|
*/
|
||||||
private function parseOther(string $filePath): string
|
private function parseOther(): string
|
||||||
{
|
{
|
||||||
$finfo = finfo_open(FILEINFO_MIME);
|
$isBinary = !str_contains($this->fileMimeType, 'text/')
|
||||||
$mimeType = finfo_file($finfo, $filePath);
|
&& !str_contains($this->fileMimeType, 'application/json')
|
||||||
finfo_close($finfo);
|
&& !str_contains($this->fileMimeType, 'application/xml');
|
||||||
|
|
||||||
$isBinary = !str_contains($mimeType, 'text/')
|
|
||||||
&& !str_contains($mimeType, 'application/json')
|
|
||||||
&& !str_contains($mimeType, 'application/xml');
|
|
||||||
|
|
||||||
if ($isBinary) {
|
if ($isBinary) {
|
||||||
throw new Exception("Unable to read the text content of this type of file");
|
throw new Exception("Unable to read the text content of this type of file");
|
||||||
}
|
}
|
||||||
|
|
||||||
return file_get_contents($filePath);
|
return file_get_contents($this->filePath);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** ********************************************************************* */
|
/** ********************************************************************* */
|
||||||
@ -201,20 +233,25 @@ class TextExtractor
|
|||||||
/**
|
/**
|
||||||
* 获取文件内容
|
* 获取文件内容
|
||||||
* @param $filePath
|
* @param $filePath
|
||||||
* @param float|int $maxSize 最大文件大小,单位字节,默认300KB
|
* @param int $fileMaxSize 最大文件大小,单位字节,默认1024KB
|
||||||
|
* @param int $contentMaxSize 最大内容大小,单位字节,默认300KB
|
||||||
* @return array
|
* @return array
|
||||||
*/
|
*/
|
||||||
public static function extractFile($filePath, float|int $maxSize = 300 * 1024): array
|
public static function extractFile($filePath, int $fileMaxSize = 1024, int $contentMaxSize = 300): array
|
||||||
{
|
{
|
||||||
if (!file_exists($filePath) || !is_file($filePath)) {
|
if (!file_exists($filePath) || !is_file($filePath)) {
|
||||||
return Base::retError("Failed to read contents of {$filePath}");
|
return Base::retError("Failed to read contents of {$filePath}");
|
||||||
}
|
}
|
||||||
if (filesize($filePath) > $maxSize) {
|
if (filesize($filePath) > $fileMaxSize * 1024) {
|
||||||
return Base::retError("File size exceeds " . Base::readableBytes($maxSize) . ", unable to display content");
|
return Base::retError("File size exceeds " . Base::readableBytes($fileMaxSize * 1024) . ", unable to display content");
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
$extractor = new self();
|
$extractor = new self($filePath);
|
||||||
return Base::retSuccess("success", $extractor->extractContent($filePath));
|
$content = $extractor->extractContent();
|
||||||
|
if (strlen($content) > $contentMaxSize * 1024) {
|
||||||
|
return Base::retError("Content size exceeds " . Base::readableBytes($contentMaxSize * 1024) . ", unable to display content");
|
||||||
|
}
|
||||||
|
return Base::retSuccess("success", $content);
|
||||||
} catch (Exception $e) {
|
} catch (Exception $e) {
|
||||||
return Base::retError($e->getMessage());
|
return Base::retError($e->getMessage());
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user