someone-oa/docx_to_md.py
2025-12-11 15:21:16 +08:00

429 lines
16 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
将 DOCX 文件转换为 Markdown 格式
"""
import zipfile
import xml.etree.ElementTree as ET
import os
import sys
import re
def get_paragraph_text(para, namespaces):
    """Return the paragraph's text with surrounding whitespace removed.

    The text of every ``w:t`` descendant of ``para`` is concatenated in
    document order; ``w:t`` elements with no text are ignored.

    Args:
        para: The paragraph element to read.
        namespaces: Prefix-to-URI mapping that must define the ``w`` prefix.

    Returns:
        The concatenated, stripped text (an empty string when there is none).
    """
    fragments = (node.text for node in para.iterfind('.//w:t', namespaces))
    return ''.join(fragment for fragment in fragments if fragment).strip()
def has_image(para, namespaces):
    """Tell whether the paragraph contains a picture container.

    Both ``w:drawing`` (current format) and ``w:pict`` (legacy format)
    elements are searched for anywhere below ``para``.

    Args:
        para: The paragraph element to inspect.
        namespaces: Prefix-to-URI mapping that must define the ``w`` prefix.

    Returns:
        True when at least one such element exists, otherwise False.
    """
    picture_paths = ('.//w:drawing', './/w:pict')
    return any(
        para.find(path, namespaces) is not None
        for path in picture_paths
    )
def get_image_id(para, namespaces):
    """Return the relationship id of a picture referenced by the paragraph.

    Lookup order:

    1. ``a:blip`` elements: the ``r:embed`` attribute (current format).
    2. Any attribute whose name contains "embed" on an element inside a
       ``w:pict`` container.
    3. The ``r:id`` attribute of an ``imagedata`` element inside a
       ``w:pict`` container, which is how legacy VML pictures point at
       their image part.

    Args:
        para: The paragraph element to inspect.
        namespaces: Prefix-to-URI mapping that must define ``w`` and ``a``.

    Returns:
        The first non-empty relationship id found, or None.
    """
    rel_ns = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
    embed_attr = '{' + rel_ns + '}embed'
    id_attr = '{' + rel_ns + '}id'

    for blip in para.iterfind('.//a:blip', namespaces):
        embed = blip.get(embed_attr)
        if embed:
            return embed

    pict_elements = para.findall('.//w:pict', namespaces)

    # Legacy containers: accept any attribute that looks like an embed id.
    for pict in pict_elements:
        for elem in pict.iter():
            for attr_name, attr_value in elem.attrib.items():
                if 'embed' in attr_name.lower() and attr_value:
                    return attr_value

    # VML stores the reference as <v:imagedata r:id="...">. Only imagedata
    # is consulted so that r:id on unrelated children is not mistaken for
    # a picture.
    for pict in pict_elements:
        for elem in pict.iter():
            if elem.tag.rsplit('}', 1)[-1] == 'imagedata':
                rel_id = elem.get(id_attr)
                if rel_id:
                    return rel_id

    return None
def detect_heading_level(text):
    """Guess the Markdown heading level of a paragraph from its text.

    Rules, applied in order:

    * one of the three fixed document titles -> level 1;
    * text starting with ``2.x`` / ``2.x.y`` / ``2.x.y.z`` -> level 2 / 3 / 4
      (deeper numbering is still level 4);
    * any other text starting with ``<digits>.<digits>`` -> level 2;
    * short text (< 30 chars) that does not end in a colon, does not start
      with a list-style number, and contains one of a few known keywords
      -> level 3.

    Args:
        text: Paragraph text; surrounding whitespace is ignored.

    Returns:
        The heading level (1-4), or 0 when the text is not a heading.
    """
    text = text.strip()

    if text in ("OA系统功能定制方案", "总体需求", "功能建设"):
        return 1

    # Chapter-2 numbering: the number of ".N" components (capped at three)
    # decides the depth, e.g. "2.4" -> 2, "2.4.1" -> 3, "2.4.1.1" -> 4.
    chapter = re.match(r'^2((?:\.\d+){1,3})', text)
    if chapter:
        return chapter.group(1).count('.') + 1

    # Numbering such as "1.1管理要求".
    if re.match(r'^\d+\.\d+', text):
        return 2

    # The original test here was ``text.endswith('')``, which is true for
    # every string and made this branch unreachable. Both colon widths are
    # now rejected instead.
    # NOTE(review): the full-width colon is the presumed original
    # character - confirm against the upstream file.
    if len(text) < 30 and not text.endswith((':', ':')):
        # Numbered list items are not headings.
        if not re.match(r'^[\d一二三四五六七八九十]+[、.))]', text):
            if any(keyword in text for keyword in ('主要业务流程图', '主流程', '功能说明')):
                return 3

    return 0
def is_list_item(text):
    """Tell whether a paragraph starts with a list-style number.

    Recognised prefixes:

    * Arabic or Chinese numerals followed by ``、``, ``.`` or a closing
      bracket, e.g. ``1)``, ``1.``, ``一、``;
    * a numeral wrapped in brackets, e.g. ``(1)``.

    Brackets are accepted in both ASCII and full-width form, since
    Chinese-language documents normally use the full-width characters.

    Args:
        text: Paragraph text; surrounding whitespace is ignored.

    Returns:
        True when the text starts with one of the prefixes above.
    """
    text = text.strip()
    numbered = r'^(?:\d+|[一二三四五六七八九十]+)[、.))]'
    bracketed = r'^[((][\d一二三四五六七八九十]+[))]'
    return bool(re.match(numbered, text) or re.match(bracketed, text))
def format_list_item(text):
    """Format a paragraph as a top-level Markdown bullet.

    ``1)`` and ``(1)`` prefixes (ASCII or full-width brackets) are
    rewritten to ``- 1. ``. Text that already starts with ``- `` is
    returned unchanged. Everything else, including items numbered with
    Chinese numerals, is prefixed with ``- ``.

    Args:
        text: Paragraph text; surrounding whitespace is ignored.

    Returns:
        The Markdown bullet line.
    """
    text = text.strip()

    # "1)" first, then "(1)"; each pattern is anchored, so at most one
    # replacement can occur.
    for pattern in (r'^(\d+)[))]', r'^[((](\d+)[))]'):
        converted, hits = re.subn(pattern, r'- \1. ', text, count=1)
        if hits:
            return converted

    if text.startswith('- '):
        return text
    return "- " + text
def format_nested_list_item(text):
    """Format a paragraph as a nested (second-level) Markdown bullet.

    ``(1)`` and ``1)`` prefixes (ASCII or full-width brackets) are
    rewritten to ``1. ``; the result is emitted as an indented bullet.

    The bullet is indented by two spaces: a child item has to be indented
    at least to the content column of its ``- `` parent to be nested, which
    a single leading space does not achieve.

    Args:
        text: Paragraph text; surrounding whitespace is ignored.

    Returns:
        The indented Markdown bullet line.
    """
    indent = "  "
    text = text.strip()

    for pattern in (r'^[((](\d+)[))]', r'^(\d+)[))]'):
        converted, hits = re.subn(pattern, r'- \1. ', text, count=1)
        if hits:
            return indent + converted

    return indent + "- " + text
def _load_image_relations(zip_ref):
    """Map relationship ids to image file names from ``document.xml.rels``."""
    relations = {}
    try:
        rels_xml = zip_ref.read('word/_rels/document.xml.rels')
    except KeyError:
        # The archive has no relationship part; carry on without a mapping.
        return relations

    try:
        rels_root = ET.fromstring(rels_xml)
    except (ET.ParseError, zipfile.BadZipFile) as e:
        # Best effort: a broken relationship part must not abort conversion.
        print(f" 警告: 读取图片关系时出错: {e}")
        return relations

    rels_namespace = {'r': 'http://schemas.openxmlformats.org/package/2006/relationships'}
    for rel in rels_root.findall('.//r:Relationship', rels_namespace):
        if 'image' not in rel.get('Type', '').lower():
            continue
        rel_id = rel.get('Id')
        target = rel.get('Target')
        if rel_id and target:
            # Only the file name is kept; images are extracted flat.
            image_filename = os.path.basename(target)
            relations[rel_id] = image_filename
            print(f" 图片关系映射: {rel_id} -> {image_filename}")
    return relations


def _extract_images(zip_ref, images_dir):
    """Write every ``word/media/`` member into ``images_dir``; return name -> path."""
    extracted = {}
    for member in zip_ref.namelist():
        if not member.startswith('word/media/'):
            continue
        filename = os.path.basename(member)
        if not filename:
            # Directory entry, nothing to write.
            continue
        image_path = os.path.join(images_dir, filename)
        with open(image_path, 'wb') as img_file:
            img_file.write(zip_ref.read(member))
        extracted[filename] = image_path
        print(f" 提取图片: {filename}")
    return extracted


def _image_tag(image_path):
    """Return the HTML ``<img>`` tag used to embed a picture in the output."""
    return f'<img src="{image_path}" alt="图片" style="max-width: 100%; height: auto;" />'


def _table_to_markdown(table, namespaces):
    """Render one ``w:tbl`` element as Markdown table lines (empty if no rows)."""
    rows = table.findall('.//w:tr', namespaces)
    if not rows:
        return []

    lines = ["", "### 表格", ""]
    for row_idx, row in enumerate(rows):
        row_data = []
        for cell in row.findall('.//w:tc', namespaces):
            cell_text = ' '.join(
                node.text.strip()
                for node in cell.findall('.//w:t', namespaces)
                if node.text
            ).strip()
            # A literal pipe would otherwise be read as a column separator.
            cell_text = cell_text.replace('|', '\\|')
            row_data.append(cell_text if cell_text else ' ')
        if row_data:
            lines.append("| " + " | ".join(row_data) + " |")
            if row_idx == 0:
                # The first row is treated as the header row.
                lines.append("| " + " | ".join(["---"] * len(row_data)) + " |")
    lines.append("")
    return lines


def _collapse_blank_lines(lines):
    """Drop blank lines that directly follow another blank line."""
    cleaned = []
    prev_empty = False
    for line in lines:
        is_empty = not line.strip()
        if is_empty and prev_empty:
            continue
        cleaned.append(line)
        prev_empty = is_empty
    return cleaned


def docx_to_markdown(file_path, output_path=None):
    """Convert a DOCX file to a Markdown file.

    Pictures stored under ``word/media/`` are written to an ``images``
    directory and referenced with HTML ``<img>`` tags. Paragraphs are
    classified as headings, "注:" notes, list items, bold labels or plain
    text. All tables are appended after the paragraphs, each under a
    ``### 表格`` heading.

    NOTE(review): the ``images`` directory is created relative to the
    current working directory, not next to ``output_path`` - confirm that
    this is intended.
    NOTE(review): ``.//w:p`` also matches paragraphs inside tables, so table
    cell text appears both as paragraphs and again in the appended table.

    Args:
        file_path: Path of the DOCX file to read.
        output_path: Path of the Markdown file to write. Defaults to
            ``file_path`` with its extension replaced by ``.md``.

    Returns:
        ``output_path`` on success, or None when conversion failed (the
        error and its traceback are printed).
    """
    if output_path is None:
        output_path = os.path.splitext(file_path)[0] + ".md"

    images_dir = "images"
    os.makedirs(images_dir, exist_ok=True)

    try:
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            document_xml = zip_ref.read('word/document.xml')
            image_relations = _load_image_relations(zip_ref)
            image_files = _extract_images(zip_ref, images_dir)

        root = ET.fromstring(document_xml)
        namespaces = {
            'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
            'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
            'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
        }

        md_lines = []
        in_note = False        # inside a "注:" block
        in_list = False        # the previously emitted line was a list item
        note_item_count = 0    # numbering for implicit first items of a note
        used_images = set()    # image paths already placed in the output

        paragraphs = root.findall('.//w:p', namespaces)
        for i, para in enumerate(paragraphs):
            # Resolve the picture of this paragraph, if there is one.
            image_path = None
            if has_image(para, namespaces):
                image_id = get_image_id(para, namespaces)
                if image_id in image_relations:
                    image_filename = image_relations[image_id]
                    if image_filename in image_files:
                        image_path = image_files[image_filename].replace('\\', '/')
                    else:
                        # Not extracted: point at where the file would be.
                        image_path = os.path.join(images_dir, image_filename).replace('\\', '/')
                    used_images.add(image_path)
                else:
                    # No usable relationship: fall back to the first
                    # extracted image that has not been placed yet.
                    for path in image_files.values():
                        candidate = path.replace('\\', '/')
                        if candidate not in used_images:
                            image_path = candidate
                            used_images.add(candidate)
                            break

            text = get_paragraph_text(para, namespaces)

            # Paragraphs without text: emit the picture if one was resolved,
            # otherwise there is nothing to output.
            if not text:
                if image_path:
                    if md_lines and md_lines[-1]:
                        md_lines.append("")
                    md_lines.append(_image_tag(image_path))
                    md_lines.append("")
                continue

            # Headings are checked first because they may start with digits
            # that would otherwise look like list numbering.
            heading_level = detect_heading_level(text)
            if heading_level > 0:
                in_note = False
                in_list = False
                note_item_count = 0
                if md_lines and md_lines[-1]:
                    md_lines.append("")
                md_lines.append("#" * heading_level + " " + text)
                md_lines.append("")
                if image_path:
                    md_lines.append(_image_tag(image_path))
                    md_lines.append("")
                continue

            # Start of a "注:" note (ASCII or full-width colon).
            if text.startswith(('注:', '注:')):
                in_note = True
                note_item_count = 0
                if md_lines and md_lines[-1]:
                    md_lines.append("")
                md_lines.append("**" + text + "**")
                md_lines.append("")
                in_list = False
                continue

            if is_list_item(text):
                in_note = False  # an explicit list item ends the note state
                note_item_count = 0
                # Bracketed numbers such as "(1)" are treated as nested items.
                if re.match(r'^[((][\d一二三四五六七八九十]+[))]', text):
                    md_lines.append(format_nested_list_item(text))
                else:
                    md_lines.append(format_list_item(text))
                in_list = True
                continue

            # Inside a note, an unnumbered paragraph directly followed by a
            # list item is taken to be the first item of that list.
            if in_note:
                next_is_item = (
                    i + 1 < len(paragraphs)
                    and is_list_item(get_paragraph_text(paragraphs[i + 1], namespaces))
                )
                if next_is_item:
                    note_item_count += 1
                    md_lines.append(f"- {note_item_count}. {text}")
                    in_list = True
                else:
                    md_lines.append(text)
                    md_lines.append("")
                    in_list = False
                continue

            # Stand-alone labels are rendered in bold.
            if text in ('主流程:', '主流程:', '主流程', '功能说明:', '功能说明:', '功能说明'):
                if md_lines and md_lines[-1]:
                    md_lines.append("")
                md_lines.append("**" + text + "**")
                md_lines.append("")
                in_list = False
                continue

            # Plain paragraph.
            if md_lines and md_lines[-1] and not in_list:
                md_lines.append("")
            md_lines.append(text)
            if image_path:
                md_lines.append("")
                md_lines.append(_image_tag(image_path))
            if not in_list:
                md_lines.append("")
            in_list = False
            in_note = False
            note_item_count = 0

        # Tables are appended after all paragraphs.
        for table in root.findall('.//w:tbl', namespaces):
            md_lines.extend(_table_to_markdown(table, namespaces))

        # Runs of blank lines are reduced to a single blank line.
        md_content = "\n".join(_collapse_blank_lines(md_lines))
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(md_content)

        print("✓ 转换成功!")
        print(f" 输入文件: {file_path}")
        print(f" 输出文件: {output_path}")
        return output_path
    except Exception as e:
        # Top-level boundary: report the failure and signal it with None.
        print(f"✗ 错误: {str(e)}")
        import traceback
        traceback.print_exc()
        return None
if __name__ == "__main__":
    # Usage: docx_to_md.py [input.docx [output.md]]
    # Without arguments the original fixed file names are used. With only
    # an input file, the output name is derived from it.
    cli_args = sys.argv[1:]
    if cli_args:
        input_file = cli_args[0]
        output_file = cli_args[1] if len(cli_args) > 1 else None
    else:
        input_file = "OA系统功能定制方案(2).docx"
        output_file = "需求.md"

    if not os.path.exists(input_file):
        print(f"✗ 文件不存在: {input_file}")
        sys.exit(1)

    # docx_to_markdown returns None on failure; reflect that in the exit code.
    if docx_to_markdown(input_file, output_file) is None:
        sys.exit(1)