116 lines
3.7 KiB
Python
116 lines
3.7 KiB
Python
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Analyze a DOCX file using only the Python standard library.

A DOCX file is a ZIP archive containing a collection of XML files.
|
|
"""
|
|
import zipfile
|
|
import xml.etree.ElementTree as ET
|
|
import os
|
|
import sys
|
|
import re
|
|
|
|
def get_text_from_element(element):
    """Return all text contained in an XML element as a single string.

    The element's own text, the text of its descendants, and the tail text
    following each descendant are collected in document order. Every piece
    is stripped of surrounding whitespace, pieces that end up empty are
    dropped, and the remainder is joined with single spaces.

    The tail of ``element`` itself is not included: it comes after the
    element's closing tag and therefore belongs to the parent's content.

    Args:
        element: An ``xml.etree.ElementTree.Element``.

    Returns:
        The joined text, or an empty string when the element contains no
        non-whitespace text.
    """
    # itertext() yields inner text and descendant tails in document order
    # and never yields the tail of the element it is called on.
    pieces = (piece.strip() for piece in element.itertext())
    # Dropping empty pieces keeps indentation-only text nodes from turning
    # into stray or doubled spaces in the result.
    return ' '.join(piece for piece in pieces if piece)
|
|
|
|
# Namespace map for WordprocessingML element lookups (prefix ``w``).
_WORDML_NS = {
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
}


def _paragraph_text(para):
    """Concatenate the text of every ``w:t`` run found under *para*."""
    runs = para.iterfind('.//w:t', _WORDML_NS)
    return ''.join(run.text for run in runs if run.text).strip()


def _cell_text(cell):
    """Return a table cell's text with one space between its paragraphs."""
    texts = (_paragraph_text(p) for p in cell.iterfind('.//w:p', _WORDML_NS))
    return ' '.join(text for text in texts if text)


def analyze_docx(file_path):
    """Print the paragraphs and tables of a DOCX file and return its text.

    Reads ``word/document.xml`` from the archive, prints a header with the
    file's base name, every non-empty paragraph, and then every table row
    with cells separated by `` | ``.

    Runs inside one paragraph are concatenated without a separator, both
    for the paragraph listing and for table cells, so a word that is split
    across several runs is printed intact. Paragraphs inside one cell are
    separated by a single space.

    Args:
        file_path: Path to the ``.docx`` file.

    Returns:
        A list with the stripped text of each non-empty paragraph in
        document order (paragraphs inside tables included), or ``None``
        when the file cannot be read or parsed.
    """
    # Deliberately broad handler: this is the script's top-level boundary,
    # so any failure is reported with a traceback and signalled as None.
    try:
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            # The main document body lives in word/document.xml.
            document_xml = zip_ref.read('word/document.xml')

        root = ET.fromstring(document_xml)

        print("=" * 60)
        print(f"文件: {os.path.basename(file_path)}")
        print("=" * 60)
        print()

        # Collect every paragraph that has visible text.
        paragraphs = []
        for para in root.iterfind('.//w:p', _WORDML_NS):
            para_text = _paragraph_text(para)
            if para_text:
                paragraphs.append(para_text)

        print(f"总段落数: {len(paragraphs)}")
        print()
        print("=" * 60)
        print("文档内容:")
        print("=" * 60)
        print()

        for i, para_text in enumerate(paragraphs, 1):
            print(f"[段落 {i}]")
            print(para_text)
            print()

        # Dump tables, one printed line per row.
        tables = root.findall('.//w:tbl', _WORDML_NS)
        if tables:
            print("=" * 60)
            print(f"表格数量: {len(tables)}")
            print("=" * 60)
            print()

            for table_idx, table in enumerate(tables, 1):
                print(f"表格 {table_idx}:")
                print("-" * 60)
                for row in table.iterfind('.//w:tr', _WORDML_NS):
                    row_data = [
                        _cell_text(cell)
                        for cell in row.iterfind('.//w:tc', _WORDML_NS)
                    ]
                    print(" | ".join(row_data))
                print()

        return paragraphs

    except Exception as e:
        print(f"错误: {e}")
        import traceback
        traceback.print_exc()
        return None
|
|
|
|
if __name__ == "__main__":
    # Analyze the file named by the first command-line argument; with no
    # argument, fall back to the sample document this script was written for.
    file_path = sys.argv[1] if len(sys.argv) > 1 else "OA系统功能定制方案(2).docx"
    if not os.path.exists(file_path):
        print(f"文件不存在: {file_path}")
        sys.exit(1)

    analyze_docx(file_path)
|