$this->extractFromPDF($filePath),
                'docx' => $this->extractFromDOCX($filePath),
                'ipynb' => $this->extractFromIPYNB($filePath),
                default => $this->extractFromOtherFile($filePath),
            };
        } catch (Exception $e) {
            Log::error('文本提取失败', [
                'file' => $filePath,
                'error' => $e->getMessage()
            ]);
            throw $e;
        }
    }

    /**
     * Extract the text of a PDF file.
     *
     * Parses the file with Parser and returns getText() of the parsed document.
     * Any Exception raised while parsing is logged and rethrown as a new
     * Exception that keeps the original one attached as its previous exception.
     *
     * @param string $filePath
     * @return string
     * @throws Exception
     */
    protected function extractFromPDF(string $filePath): string
    {
        try {
            $parser = new Parser();
            $pdf = $parser->parseFile($filePath);

            return $pdf->getText();
        } catch (Exception $e) {
            Log::error('PDF解析失败', [
                'file' => $filePath,
                'error' => $e->getMessage(),
            ]);

            // Chain the original exception so its type and stack trace are not lost.
            throw new Exception("PDF文本提取失败: " . $e->getMessage(), 0, $e);
        }
    }

    /**
     * Extract the text of a DOCX file.
     *
     * Loads the document through IOFactory and walks the direct child elements
     * of every section. Each element that exposes getText() contributes one
     * line (terminated by "\n"); elements without getText() are ignored.
     *
     * @param string $filePath
     * @return string
     * @throws Exception presumably raised by IOFactory::load() on unreadable input — TODO confirm
     */
    protected function extractFromDOCX(string $filePath): string
    {
        $phpWord = IOFactory::load($filePath);
        $text = '';

        foreach ($phpWord->getSections() as $section) {
            foreach ($section->getElements() as $element) {
                if (!method_exists($element, 'getText')) {
                    continue;
                }

                $value = $element->getText();

                // NOTE(review): some elements (PhpWord titles, reportedly) return a
                // nested text-run object instead of a string — confirm. Unwrap one
                // level so the concatenation below cannot raise an Error.
                if (is_object($value) && method_exists($value, 'getText')) {
                    $value = $value->getText();
                }

                // Scalars and null keep the original behaviour (null yields an empty
                // line); anything else cannot be rendered as text and is skipped.
                if ($value === null || is_scalar($value)) {
                    $text .= $value . "\n";
                }
            }
        }

        return $text;
    }

    /**
     * Extract the text of a Jupyter Notebook (.ipynb) file.
     *
     * Decodes the notebook JSON and concatenates the `source` of every
     * `markdown` and `code` cell, one "\n" after each cell. Other cell types
     * and cells without a `source` are skipped.
     *
     * @param string $filePath
     * @return string
     * @throws Exception when the file cannot be read or is not valid JSON
     */
    protected function extractFromIPYNB(string $filePath): string
    {
        $content = file_get_contents($filePath);
        if ($content === false) {
            throw new Exception("IPYNB文件读取失败: {$filePath}");
        }

        $notebook = json_decode($content, true);
        if (json_last_error() !== JSON_ERROR_NONE) {
            throw new Exception("IPYNB文件解析失败: " . json_last_error_msg());
        }

        // A JSON document without a usable `cells` list simply has no text.
        $cells = is_array($notebook) ? ($notebook['cells'] ?? []) : [];
        if (!is_array($cells)) {
            $cells = [];
        }

        $extractedText = '';
        foreach ($cells as $cell) {
            if (!is_array($cell) || !isset($cell['source'])) {
                continue;
            }
            if (!in_array($cell['cell_type'] ?? '', ['markdown', 'code'], true)) {
                continue;
            }

            $source = $cell['source'];
            if (is_array($source)) {
                // Notebook files normally store every line with its own trailing
                // newline; strip a single terminator per entry so that joining
                // with "\n" does not insert a blank line between all lines.
                $lines = array_map(
                    static function ($line): string {
                        $line = (string) $line;
                        if (str_ends_with($line, "\r\n")) {
                            return substr($line, 0, -2);
                        }
                        if (str_ends_with($line, "\n")) {
                            return substr($line, 0, -1);
                        }

                        return $line;
                    },
                    $source
                );
                $extractedText .= implode("\n", $lines);
            } else {
                $extractedText .= $source;
            }
            $extractedText .= "\n";
        }

        return $extractedText;
    }

    /**
     * Extract the text of any other file type by reading it verbatim.
     *
     * Files that isBinaryFile() classifies as binary are rejected.
     *
     * @param string $filePath
     * @return string
     * @throws Exception when the file is binary or cannot be read
     */
    protected function extractFromOtherFile(string $filePath): string
    {
        if ($this->isBinaryFile($filePath)) {
            throw new Exception("无法读取该类型文件的文本内容");
        }

        $content = file_get_contents($filePath);
        if ($content === false) {
            // file_get_contents() signals failure with false, which does not
            // satisfy the declared string return type.
            throw new Exception("文件读取失败: {$filePath}");
        }

        return $content;
    }

    /**
     * Check whether a file should be treated as binary.
     *
     * The decision is based on the MIME string reported by fileinfo: the file
     * is considered text when that string contains `text/`, `application/json`,
     * `application/xml`, or marks the file as empty (`x-empty`). If the MIME
     * type cannot be determined at all, the file is treated as binary.
     *
     * @param string $filePath
     * @return bool true when the file is considered binary
     */
    protected function isBinaryFile(string $filePath): bool
    {
        $finfo = finfo_open(FILEINFO_MIME);
        if ($finfo === false) {
            return true;
        }

        $mimeType = finfo_file($finfo, $filePath);
        finfo_close($finfo);

        // finfo_file() returns false on failure; never hand that to str_contains().
        if (!is_string($mimeType)) {
            return true;
        }

        // `x-empty` covers zero-byte files, which have no content to be binary.
        foreach (['text/', 'application/json', 'application/xml', 'x-empty'] as $textMarker) {
            if (str_contains($mimeType, $textMarker)) {
                return false;
            }
        }

        return true;
    }

    /** ********************************************************************* */
    /** ********************************************************************* */
    /** ********************************************************************* */

    /**
     * Not implemented yet: the body contains only TODO notes, so the method
     * currently returns null for every input.
     *
     * @param mixed $filePath
     * @return void
     */
    public static function parsePaths($filePath)
    {
        // todo
        // (see below for file content)
        // \n${content}\n
        // (see below for site content)
        // \n${result}\n
    }

    /**
     * Return the text content of a file, or a parenthesised placeholder message.
     *
     * Never throws an Exception to the caller: a missing path, a binary file
     * and any Exception raised during extraction are all reported as a
     * human-readable string instead.
     *
     * NOTE(review): the binary check runs before extractText(), so files that
     * fileinfo reports as binary (presumably including PDF and DOCX) get the
     * "Binary file" placeholder and never reach their dedicated extractors —
     * confirm whether that is intended.
     *
     * @param mixed $filePath path of the file to read
     * @return string
     */
    public static function getFileContent($filePath)
    {
        if (!file_exists($filePath) || !is_file($filePath)) {
            return "(Failed to read contents of {$filePath})";
        }

        $te = new self();
        try {
            $isBinary = $te->isBinaryFile($filePath);
            if ($isBinary) {
                return "(Binary file, unable to display content)";
            }

            return $te->extractText($filePath);
        } catch (Exception $e) {
            return "(Failed to read contents of {$filePath}: {$e->getMessage()})";
        }
    }
}