#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Analyze a DOCX file using only the standard library.

A DOCX file is a ZIP archive containing a collection of XML files; the main
document body is stored in ``word/document.xml``.
"""

import zipfile
import xml.etree.ElementTree as ET
import os
import sys
import re
import traceback

# Office Open XML WordprocessingML namespace, bound to the ``w`` prefix used
# in every XPath query below.
WORD_NAMESPACES = {
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
}

# Document analyzed when the script is run without a command-line argument.
DEFAULT_DOCX_PATH = "OA系统功能定制方案(2).docx"

_BANNER = "=" * 60


def get_text_from_element(element):
    """Return the text of *element* and all its descendants, space-joined.

    Each piece of text (the element's own text, the recursively collected
    text of every child, and the element's tail) is stripped of surrounding
    whitespace. Pieces that are empty after stripping are skipped so that
    whitespace-only nodes do not introduce stray spaces in the result.

    Args:
        element: An ``xml.etree.ElementTree.Element``.

    Returns:
        The collected text as a single string; ``''`` if there is none.
    """
    text_parts = []

    # Strip before testing: whitespace-only text must not contribute an
    # empty piece to the join.
    own_text = (element.text or '').strip()
    if own_text:
        text_parts.append(own_text)

    for child in element:
        child_text = get_text_from_element(child)
        if child_text:
            text_parts.append(child_text)

    # The tail is the text that follows this element's closing tag; it is
    # collected here so that a parent's recursion picks up text located
    # between its children.
    tail_text = (element.tail or '').strip()
    if tail_text:
        text_parts.append(tail_text)

    return ' '.join(text_parts)


def _paragraph_text(para):
    """Return the stripped concatenation of every ``w:t`` run under *para*."""
    # Runs are joined without a separator: a single word may be split
    # across several runs.
    return ''.join(
        text_elem.text
        for text_elem in para.findall('.//w:t', WORD_NAMESPACES)
        if text_elem.text
    ).strip()


def _extract_paragraphs(root):
    """Return the text of every non-empty ``w:p`` paragraph under *root*."""
    texts = (
        _paragraph_text(para)
        for para in root.findall('.//w:p', WORD_NAMESPACES)
    )
    return [text for text in texts if text]


def _cell_text(cell):
    """Return the text of a ``w:tc`` cell, one space between paragraphs."""
    # Runs inside a paragraph are concatenated directly (consistent with
    # the paragraph listing); only separate paragraphs get a space.
    texts = (
        _paragraph_text(para)
        for para in cell.findall('.//w:p', WORD_NAMESPACES)
    )
    return ' '.join(text for text in texts if text)


def _print_tables(root):
    """Print every ``w:tbl`` table under *root*, one line per row."""
    tables = root.findall('.//w:tbl', WORD_NAMESPACES)
    if not tables:
        return

    print(_BANNER)
    print(f"表格数量: {len(tables)}")
    print(_BANNER)
    print()

    for table_idx, table in enumerate(tables, 1):
        print(f"表格 {table_idx}:")
        print("-" * 60)
        # './/' is a descendant search, so rows and cells at any depth
        # below the table are included.
        for row in table.findall('.//w:tr', WORD_NAMESPACES):
            row_data = [
                _cell_text(cell)
                for cell in row.findall('.//w:tc', WORD_NAMESPACES)
            ]
            print(" | ".join(row_data))
        print()


def analyze_docx(file_path):
    """Print the paragraphs and tables of a DOCX file and return its text.

    The archive member ``word/document.xml`` is read and parsed; every
    non-empty paragraph is printed with a running index, followed by each
    table rendered as `` | ``-separated rows.

    Args:
        file_path: Path to the ``.docx`` file.

    Returns:
        A list with the text of every non-empty paragraph, or ``None`` if
        the file could not be read or parsed. In that case the error
        message is printed and the traceback is written to stderr.
    """
    try:
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            document_xml = zip_ref.read('word/document.xml')

        root = ET.fromstring(document_xml)

        print(_BANNER)
        print(f"文件: {os.path.basename(file_path)}")
        print(_BANNER)
        print()

        paragraphs = _extract_paragraphs(root)

        print(f"总段落数: {len(paragraphs)}")
        print()
        print(_BANNER)
        print("文档内容:")
        print(_BANNER)
        print()

        for i, para in enumerate(paragraphs, 1):
            print(f"[段落 {i}]")
            print(para)
            print()

        _print_tables(root)

        return paragraphs

    except Exception as e:
        # Deliberately broad: this is the script's reporting boundary, and
        # callers rely on a None return instead of an exception.
        print(f"错误: {str(e)}")
        traceback.print_exc()
        return None


def main(argv=None):
    """Analyze the DOCX file named on the command line.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``. The first item, if present, is the file to
            analyze; otherwise ``DEFAULT_DOCX_PATH`` is used.

    Exits with status 1 if the file does not exist.
    """
    args = sys.argv[1:] if argv is None else argv
    file_path = args[0] if args else DEFAULT_DOCX_PATH

    if not os.path.exists(file_path):
        print(f"文件不存在: {file_path}")
        sys.exit(1)

    analyze_docx(file_path)


if __name__ == "__main__":
    main()