someone-oa/analyze_docx.py
2025-12-11 15:21:16 +08:00

116 lines
3.7 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
使用标准库分析 DOCX 文件的脚本
DOCX 文件实际上是 ZIP 压缩的 XML 文件集合
"""
import zipfile
import xml.etree.ElementTree as ET
import os
import sys
import re
def get_text_from_element(element):
    """Return the text found in *element* and its descendants as one string.

    Fragments are collected in document order: the element's own ``text``,
    then the result for each child (recursively), then the element's
    ``tail``. Each fragment is stripped and the non-empty ones are joined
    with single spaces.

    Args:
        element: An ``xml.etree.ElementTree.Element``.

    Returns:
        The space-joined text, or ``''`` when there is no non-blank text.
    """
    text_parts = []

    # Strip before testing: indentation-only text (e.g. "\n  " in
    # pretty-printed XML) is truthy but empty once stripped, and would
    # otherwise add stray or doubled spaces to the joined result.
    leading = (element.text or '').strip()
    if leading:
        text_parts.append(leading)

    # Recurse into children; each child contributes its own tail.
    for child in element:
        child_text = get_text_from_element(child)
        if child_text:
            text_parts.append(child_text)

    # The tail is the text that follows this element's closing tag.
    trailing = (element.tail or '').strip()
    if trailing:
        text_parts.append(trailing)

    return ' '.join(text_parts)
def _run_text(node, namespaces):
    """Concatenate the text of every ``w:t`` under *node* and strip the result."""
    pieces = [t.text for t in node.findall('.//w:t', namespaces) if t.text]
    return ''.join(pieces).strip()


def _cell_text(cell, namespaces):
    """Return a table cell's text: runs concatenated, paragraphs space-separated."""
    w_uri = namespaces['w']
    para_tag = f'{{{w_uri}}}p'
    text_tag = f'{{{w_uri}}}t'

    # Walk the cell once in document order, starting a new segment at every
    # paragraph so that runs of one paragraph are glued together with no
    # separator (a word may be split across several runs).
    segments = []
    for node in cell.iter():
        if node.tag == para_tag:
            segments.append([])
        elif node.tag == text_tag and node.text:
            if not segments:
                segments.append([])
            segments[-1].append(node.text)

    joined = (''.join(segment).strip() for segment in segments)
    return ' '.join(text for text in joined if text)


def _print_banner(title):
    """Print *title* between two 60-character rules, followed by a blank line."""
    print("=" * 60)
    print(title)
    print("=" * 60)
    print()


def analyze_docx(file_path):
    """Print the paragraphs and tables of a DOCX file and return the paragraphs.

    The file is opened as a ZIP archive, ``word/document.xml`` is parsed, and
    a report is printed to standard output: file name, paragraph count, each
    non-empty paragraph, then each table with one line per row and cells
    separated by `` | ``.

    Args:
        file_path: Path to the ``.docx`` file.

    Returns:
        A list with the stripped text of every non-empty ``w:p`` element
        (paragraphs inside tables are included), or ``None`` if any error
        occurred; in that case the error and a traceback are printed.
    """
    # WordprocessingML main namespace used by all element lookups below.
    namespaces = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    }
    try:
        # Only the main document part is needed, so the archive can be
        # closed as soon as its bytes have been read.
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            document_xml = zip_ref.read('word/document.xml')
        root = ET.fromstring(document_xml)

        _print_banner(f"文件: {os.path.basename(file_path)}")

        # Collect every paragraph that still has text after stripping.
        candidates = (
            _run_text(para, namespaces)
            for para in root.findall('.//w:p', namespaces)
        )
        paragraphs = [text for text in candidates if text]

        print(f"总段落数: {len(paragraphs)}")
        print()
        _print_banner("文档内容:")
        for i, para in enumerate(paragraphs, 1):
            print(f"[段落 {i}]")
            print(para)
            print()

        # Tables are reported after the paragraphs. The './/' searches are
        # recursive, so rows and cells of nested tables are also matched.
        tables = root.findall('.//w:tbl', namespaces)
        if tables:
            _print_banner(f"表格数量: {len(tables)}")
            for table_idx, table in enumerate(tables, 1):
                print(f"表格 {table_idx}:")
                print("-" * 60)
                for row in table.findall('.//w:tr', namespaces):
                    row_data = [
                        _cell_text(cell, namespaces)
                        for cell in row.findall('.//w:tc', namespaces)
                    ]
                    print(" | ".join(row_data))
                print()
        return paragraphs
    except Exception as e:
        # Deliberately broad: this is the script's reporting boundary, and a
        # missing file, bad archive, missing part or malformed XML should
        # all be reported rather than crash the caller.
        print(f"错误: {str(e)}")
        import traceback
        traceback.print_exc()
        return None
if __name__ == "__main__":
    # Analyze the document named on the command line; with no argument,
    # fall back to the original default file name so existing usage works.
    file_path = sys.argv[1] if len(sys.argv) > 1 else "OA系统功能定制方案(2).docx"
    if not os.path.exists(file_path):
        print(f"文件不存在: {file_path}")
        sys.exit(1)
    # analyze_docx returns None after reporting an error; reflect that in
    # the exit status so callers of the script can detect the failure.
    if analyze_docx(file_path) is None:
        sys.exit(1)