dootask/app/Module/TextExtractor.php
2025-03-20 15:34:31 +08:00

260 lines
8.3 KiB
PHP
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
namespace App\Module;
use Exception;
use PhpOffice\PhpWord\IOFactory as WordIOFactory;
use PhpOffice\PhpSpreadsheet\IOFactory as SpreadsheetIOFactory;
use PhpOffice\PhpPresentation\IOFactory as PresentationIOFactory;
use Illuminate\Support\Facades\File as FileFacade;
class TextExtractor
{
private string $filePath;
private string $fileMimeType;
private string $fileExtension;
/**
* @param string $filePath
* @throws Exception
*/
public function __construct(string $filePath)
{
if (!file_exists($filePath)) {
throw new Exception("File does not exist: {$filePath}");
}
$this->filePath = $filePath;
$this->fileMimeType = FileFacade::mimeType($filePath);
$this->fileExtension = $this->detectFileType();
}
/**
* 从文件中提取文本
* @return string
* @throws Exception
*/
public function extractContent(): string
{
return match ($this->fileExtension) {
// Word文档
'docx' => $this->parseWordDocument(),
// Excel文档
'xlsx', 'xls', 'csv' => $this->parseSpreadsheet(),
// PowerPoint文档
'ppt', 'pptx' => $this->parsePresentation(),
// PDF文档
'pdf' => $this->parsePdf(),
// RTF文档
'rtf' => $this->parseRtf(),
// 其他文本文件
default => $this->parseOther(),
};
}
/**
* 获取文件类型
* @return string
*/
private function detectFileType(): string
{
return match ($this->fileMimeType) {
'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => 'docx',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' => 'xlsx',
'application/vnd.ms-excel' => 'xls',
'text/csv', 'application/csv' => 'csv',
'application/vnd.ms-powerpoint' => 'ppt',
'application/vnd.openxmlformats-officedocument.presentationml.presentation' => 'pptx',
'application/pdf' => 'pdf',
'application/rtf', 'text/rtf' => 'rtf',
default => strtolower(pathinfo($this->filePath, PATHINFO_EXTENSION)),
};
}
/**
* Parse Word documents (.doc, .docx)
* @return string
*/
private function parseWordDocument(): string
{
$phpWord = WordIOFactory::load($this->filePath);
$text = '';
// Extract text from each section
foreach ($phpWord->getSections() as $section) {
foreach ($section->getElements() as $element) {
if (method_exists($element, 'getText')) {
$text .= $element->getText() . "\n";
} elseif (method_exists($element, 'getElements')) {
foreach ($element->getElements() as $childElement) {
if (method_exists($childElement, 'getText')) {
$text .= $childElement->getText() . "\n";
}
}
}
}
}
return $text;
}
/**
* Parse spreadsheet files (.xlsx, .xls, .csv)
* @return string
*/
private function parseSpreadsheet(): string
{
$spreadsheet = SpreadsheetIOFactory::load($this->filePath);
$text = '';
// Extract text from all worksheets
foreach ($spreadsheet->getWorksheetIterator() as $worksheet) {
$text .= 'Worksheet: ' . $worksheet->getTitle() . "\n";
foreach ($worksheet->getRowIterator() as $row) {
$cellIterator = $row->getCellIterator();
$cellIterator->setIterateOnlyExistingCells(false);
$rowText = '';
foreach ($cellIterator as $cell) {
$value = $cell->getValue();
if (!empty($value)) {
$rowText .= $value . "\t";
}
}
if (!empty(trim($rowText))) {
$text .= trim($rowText) . "\n";
}
}
$text .= "\n";
}
return $text;
}
/**
* Parse presentation files (.ppt, .pptx)
* @return string
* @throws Exception
*/
private function parsePresentation(): string
{
$presentation = PresentationIOFactory::load($this->filePath);
$text = '';
// Extract text from all slides
foreach ($presentation->getAllSlides() as $slide) {
foreach ($slide->getShapeCollection() as $shape) {
if ($shape instanceof \PhpOffice\PhpPresentation\Shape\RichText) {
foreach ($shape->getParagraphs() as $paragraph) {
foreach ($paragraph->getRichTextElements() as $element) {
$text .= $element->getText();
}
$text .= "\n";
}
}
}
$text .= "\n";
}
return $text;
}
/**
* Parse PDF files (requires additional library like Smalot\PdfParser)
* @return string
* @throws Exception
*/
private function parsePdf(): string
{
// You'll need to install the Smalot PDF Parser: composer require smalot/pdfparser
if (!class_exists('\Smalot\PdfParser\Parser')) {
throw new \Exception("PDF Parser not available. Install with: composer require smalot/pdfparser");
}
$parser = new \Smalot\PdfParser\Parser();
$pdf = $parser->parseFile($this->filePath);
return $pdf->getText();
}
/**
* Parse RTF files
* @return string
*/
private function parseRtf(): string
{
// Simple RTF to text conversion
$content = file_get_contents($this->filePath);
// Remove RTF control words and groups
$content = preg_replace('/\\\\([a-z]{1,32})(-?[0-9]{1,10})?[ ]?/i', '', $content);
$content = preg_replace('/\\\\([^a-z]|[a-z]{33,})/i', '', $content);
$content = preg_replace('/\{\*?\\\\[^{}]*\}/', '', $content);
$content = preg_replace('/\{[\r\n]*\}/', '', $content);
// Convert special characters
$content = preg_replace('/\\\\\'([0-9a-f]{2})/i', '', $content);
// Remove remaining curly braces
$content = str_replace(['{', '}'], '', $content);
return $content ?: '';
}
/**
* Parse Other(text) files
* @return string
* @throws Exception
*/
private function parseOther(): string
{
$isBinary = !str_contains($this->fileMimeType, 'text/')
&& !str_contains($this->fileMimeType, 'application/json')
&& !str_contains($this->fileMimeType, 'application/xml');
if ($isBinary) {
throw new Exception("Unable to read the text content of this type of file");
}
return file_get_contents($this->filePath);
}
/** ********************************************************************* */
/** ********************************************************************* */
/** ********************************************************************* */
/**
* 获取文件内容
* @param $filePath
* @param int $fileMaxSize 最大文件大小单位字节默认1024KB
* @param int $contentMaxSize 最大内容大小单位字节默认300KB
* @return array
*/
public static function extractFile($filePath, int $fileMaxSize = 1024, int $contentMaxSize = 300): array
{
if (!file_exists($filePath) || !is_file($filePath)) {
return Base::retError("Failed to read contents of {$filePath}");
}
if (filesize($filePath) > $fileMaxSize * 1024) {
return Base::retError("File size exceeds " . Base::readableBytes($fileMaxSize * 1024) . ", unable to display content");
}
try {
$extractor = new self($filePath);
$content = $extractor->extractContent();
if (strlen($content) > $contentMaxSize * 1024) {
return Base::retError("Content size exceeds " . Base::readableBytes($contentMaxSize * 1024) . ", unable to display content");
}
return Base::retSuccess("success", $content);
} catch (Exception $e) {
return Base::retError($e->getMessage());
}
}
}