issues/9402 解决文档向量化，文件名中文乱码导致失败问题

2026-05-01 22:28:10 +00:00 · 2026-03-09 21:00:52 +08:00 · 2026-03-09 21:00:52 +08:00 · 8f867fc9e4
commit 8f867fc9e4
parent 254c388f65
1 changed file with 3 additions and 4 deletions
--- a/jeecg-boot/jeecg-boot-module/jeecg-boot-module-airag/src/main/java/org/jeecg/modules/airag/llm/document/TikaDocumentParser.java
+++ b/jeecg-boot/jeecg-boot-module/jeecg-boot-module-airag/src/main/java/org/jeecg/modules/airag/llm/document/TikaDocumentParser.java
@@ -30,7 +30,6 @@ import org.jeecg.common.util.AssertUtils;
 import org.xml.sax.ContentHandler;

 import java.io.*;
-import java.nio.file.Files;
 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.List;
@@ -73,8 +72,8 @@ public class TikaDocumentParser {
    public Document parse(File file) {
        AssertUtils.assertNotEmpty("请选择文件", file);
        try {
-            // 用于解析
-            InputStream isForParsing = Files.newInputStream(file.toPath());
+            // 用于解析(使用FileInputStream避免file.toPath()在Linux非UTF-8环境下中文文件名报错)
+            InputStream isForParsing = new FileInputStream(file);
            // 使用 Tika 自动检测 MIME 类型
            String fileName = file.getName().toLowerCase();
            //后缀
@@ -102,7 +101,7 @@ public class TikaDocumentParser {
     */
    public Document parseDocExcelPdfUsingApachePoi(File file) {
        AssertUtils.assertNotEmpty("请选择文件", file);
-        try (InputStream inputStream = Files.newInputStream(file.toPath())) {
+        try (InputStream inputStream = new FileInputStream(file)) {
            ApachePoiDocumentParser parser = new ApachePoiDocumentParser();
            Document document = parser.parse(inputStream);
            if (document == null || Utils.isNullOrBlank(document.text())) {