429 lines
16 KiB
Python
429 lines
16 KiB
Python
#!/usr/bin/env python
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
将 DOCX 文件转换为 Markdown 格式
|
||
"""
|
||
import zipfile
|
||
import xml.etree.ElementTree as ET
|
||
import os
|
||
import sys
|
||
import re
|
||
|
||
def get_paragraph_text(para, namespaces):
    """Return the visible text of a paragraph element.

    Collects the text of every ``w:t`` descendant of *para* (resolved through
    the *namespaces* prefix map), skips nodes without text, concatenates the
    pieces in document order and strips leading/trailing whitespace.
    """
    text_nodes = para.findall('.//w:t', namespaces)
    pieces = [node.text for node in text_nodes if node.text]
    return ''.join(pieces).strip()
|
||
|
||
def has_image(para, namespaces):
    """Tell whether a paragraph element carries a picture.

    Returns True when *para* has at least one ``w:drawing`` descendant
    (DrawingML) or at least one ``w:pict`` descendant (legacy format),
    otherwise False.
    """
    for picture_path in ('.//w:drawing', './/w:pict'):
        if para.findall(picture_path, namespaces):
            return True
    return False
|
||
|
||
def get_image_id(para, namespaces):
    """Return the relationship id of the first picture found in *para*.

    Lookup order:

    1. DrawingML: the ``r:embed`` attribute of any ``a:blip`` descendant.
    2. Legacy ``w:pict``: any attribute whose name contains ``embed``
       (kept from the previous implementation).
    3. Legacy ``w:pict``: the ``r:id`` attribute of a VML ``v:imagedata``
       element, which is where VML pictures store their relationship id.

    Args:
        para: paragraph element to inspect.
        namespaces: prefix map; must define the ``w`` and ``a`` prefixes.

    Returns:
        The relationship id string, or None when no picture reference exists.
    """
    rel_ns = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
    vml_imagedata = '{urn:schemas-microsoft-com:vml}imagedata'

    # New format: <a:blip r:embed="rIdN"/>
    for blip in para.findall('.//a:blip', namespaces):
        embed = blip.get(f'{{{rel_ns}}}embed')
        if embed:
            return embed

    pict_elements = para.findall('.//w:pict', namespaces)

    # Old format, first pass: any attribute named like "embed".
    for pict in pict_elements:
        for elem in pict.iter():
            for attr_name, attr_value in elem.attrib.items():
                if 'embed' in attr_name.lower() and attr_value:
                    return attr_value

    # Old format, second pass: <v:imagedata r:id="rIdN"/>. Done after the
    # embed scan so every result the first pass used to produce is unchanged.
    for pict in pict_elements:
        for imagedata in pict.iter(vml_imagedata):
            rel_id = imagedata.get(f'{{{rel_ns}}}id')
            if rel_id:
                return rel_id

    return None
|
||
|
||
def detect_heading_level(text):
    """Guess the Markdown heading level of a paragraph from its text.

    Rules, checked in order:

    * Level 1: one of the three fixed document/chapter titles.
    * Levels 2-4: text starting with a chapter-2 section number; ``2.x`` is
      level 2, ``2.x.y`` level 3, ``2.x.y.z`` (or deeper) level 4.
    * Level 2: any other text starting with ``<digits>.<digits>``
      (for example ``1.1管理要求``).
    * Level 3: a short label (< 30 characters) that does not end with a
      colon, is not a list item, and contains one of the known keywords.

    Both the ASCII and the full-width forms of ``:`` and ``)`` are recognised.

    Returns:
        The heading level 1-4, or 0 when the text is not a heading.
    """
    text = text.strip()

    # Level 1: main document titles.
    if text in ("OA系统功能定制方案", "总体需求", "功能建设"):
        return 1

    # Chapter-2 sections: depth is the number of ".<digits>" groups after "2".
    section = re.match(r'^2((?:\.\d+)+)', text)
    if section:
        depth = section.group(1).count('.')
        return min(depth + 1, 4)

    # Other numbered sections such as "1.1管理要求".
    if re.match(r'^\d+\.\d+', text):
        return 2

    # Short keyword labels; a trailing colon marks an inline label instead.
    is_short_label = len(text) < 30 and not text.endswith((':', '\uff1a'))
    if is_short_label:
        looks_like_list_item = re.match(
            r'^[\d一二三四五六七八九十]+[、.)\uff09]', text)
        if not looks_like_list_item:
            if any(keyword in text for keyword in ['主要业务流程图', '主流程', '功能说明']):
                return 3

    return 0
|
||
|
||
def is_list_item(text):
    """Return True when *text* starts with a list marker.

    Recognised markers (ASCII and full-width brackets alike):

    * Arabic number followed by ``、``, ``.`` or a closing bracket: ``1)``, ``1.``
    * Chinese numeral followed by the same delimiters: ``一、``
    * Bracketed Arabic or Chinese number: ``(1)``, ``(一)``
    """
    text = text.strip()
    marker_patterns = (
        r'^\d+[、.)\uff09]',
        r'^[一二三四五六七八九十]+[、.)\uff09]',
        r'^[(\uff08][\d一二三四五六七八九十]+[)\uff09]',
    )
    return any(re.match(pattern, text) for pattern in marker_patterns)
|
||
|
||
def format_list_item(text):
    """Format *text* as a top-level Markdown bullet.

    * ``1)`` and ``(1)`` markers (ASCII or full-width brackets) are rewritten
      to ``- 1. ``.
    * Text that already starts with ``- `` is returned unchanged.
    * Anything else, including Chinese-numeral items such as ``一、``, is
      prefixed with ``- ``.
    """
    text = text.strip()

    # One anchored substitution covers both "1)" and "(1)".
    formatted, replaced = re.subn(r'^[(\uff08]?(\d+)[)\uff09]', r'- \1. ', text)
    if replaced:
        return formatted

    if text.startswith('- '):
        return text

    return "- " + text
|
||
|
||
def format_nested_list_item(text):
    """Format *text* as a nested (two-space indented) Markdown bullet.

    ``(1)`` and ``1)`` markers (ASCII or full-width brackets) are rewritten to
    ``  - 1. ``; any other text is prefixed with ``  - `` unchanged.
    """
    text = text.strip()

    # One anchored substitution covers both "(1)" and "1)".
    formatted, replaced = re.subn(r'^[(\uff08]?(\d+)[)\uff09]', r'  - \1. ', text)
    if replaced:
        return formatted

    return "  - " + text
|
||
|
||
# Name of the image folder, created next to the Markdown output and used as
# the relative link prefix inside the generated document.
_IMAGES_SUBDIR = "images"


def _image_tag(image_path):
    """Return the HTML <img> element that embeds *image_path* in the output."""
    return f'<img src="{image_path}" alt="图片" style="max-width: 100%; height: auto;" />'


def _ensure_blank_line(md_lines):
    """Append an empty line unless the output is empty or already ends with one."""
    if md_lines and md_lines[-1]:
        md_lines.append("")


def _read_image_relations(zip_ref):
    """Map relationship ids to image file names from document.xml.rels."""
    image_relations = {}
    try:
        rels_xml = zip_ref.read('word/_rels/document.xml.rels')
        rels_root = ET.fromstring(rels_xml)
        rels_namespace = {'r': 'http://schemas.openxmlformats.org/package/2006/relationships'}
        for rel in rels_root.findall('.//r:Relationship', rels_namespace):
            if 'image' not in rel.get('Type', '').lower():
                continue
            rel_id = rel.get('Id')
            target = rel.get('Target')
            if rel_id and target:
                # Only the file name matters; media files are extracted flat.
                image_filename = os.path.basename(target)
                image_relations[rel_id] = image_filename
                print(f"  图片关系映射: {rel_id} -> {image_filename}")
    except KeyError:
        pass  # The archive has no relationship part; carry on without it.
    except Exception as e:
        # Deliberately best-effort: images are optional for the conversion.
        print(f"  警告: 读取图片关系时出错: {e}")
    return image_relations


def _extract_media(zip_ref, images_dir):
    """Write every word/media/ entry into *images_dir*.

    Returns a dict mapping file name to the relative link path used in the
    Markdown output.
    """
    image_files = {}
    for entry in zip_ref.namelist():
        if not entry.startswith('word/media/'):
            continue
        filename = os.path.basename(entry)
        if not filename:  # directory entry
            continue
        with open(os.path.join(images_dir, filename), 'wb') as img_file:
            img_file.write(zip_ref.read(entry))
        image_files[filename] = f"{_IMAGES_SUBDIR}/{filename}"
        print(f"  提取图片: {filename}")
    return image_files


def _append_tables(root, namespaces, md_lines):
    """Append every table in the document to *md_lines* as a Markdown table."""
    for table in root.findall('.//w:tbl', namespaces):
        rows = table.findall('.//w:tr', namespaces)
        if not rows:
            continue
        md_lines.extend(["", "### 表格", ""])

        for row_idx, row in enumerate(rows):
            row_data = []
            for cell in row.findall('.//w:tc', namespaces):
                cell_text = ' '.join(
                    text_elem.text.strip()
                    for text_elem in cell.findall('.//w:t', namespaces)
                    if text_elem.text
                ).strip()
                # A raw "|" would be read as a column separator.
                row_data.append(cell_text.replace('|', '\\|') if cell_text else ' ')

            if row_data:
                md_lines.append("| " + " | ".join(row_data) + " |")
                if row_idx == 0:
                    # The first row is treated as the header row.
                    md_lines.append("| " + " | ".join(["---"] * len(row_data)) + " |")

        md_lines.append("")


def _collapse_blank_lines(lines):
    """Drop blank lines that directly follow another blank line."""
    cleaned_lines = []
    prev_empty = False
    for line in lines:
        is_empty = not line.strip()
        if is_empty and prev_empty:
            continue
        cleaned_lines.append(line)
        prev_empty = is_empty
    return cleaned_lines


def docx_to_markdown(file_path, output_path=None):
    """Convert a DOCX file to Markdown.

    Embedded pictures are extracted into an ``images`` directory created next
    to the output file, so the relative ``images/<name>`` links written into
    the Markdown resolve wherever the output is placed. Body paragraphs are
    classified as headings, "注:" notes, list items, bold labels or plain
    text. All tables are appended at the end under a "### 表格" heading.

    NOTE(review): ``.//w:p`` also matches paragraphs inside table cells, so
    cell text is emitted by the body pass as well as in the table section.
    Pictures inside list-item or note paragraphs are not emitted.

    Args:
        file_path: path of the .docx file to read.
        output_path: path of the Markdown file to write; defaults to
            *file_path* with its extension replaced by ``.md``.

    Returns:
        The output path on success, or None when the conversion failed (the
        error and traceback are printed).
    """
    if output_path is None:
        output_path = os.path.splitext(file_path)[0] + ".md"

    # Keep the images beside the Markdown file so relative links stay valid.
    images_dir = os.path.join(os.path.dirname(output_path), _IMAGES_SUBDIR)
    os.makedirs(images_dir, exist_ok=True)

    try:
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            document_xml = zip_ref.read('word/document.xml')
            image_relations = _read_image_relations(zip_ref)
            image_files = _extract_media(zip_ref, images_dir)

        root = ET.fromstring(document_xml)
        namespaces = {
            'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
            'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
            'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
        }

        md_lines = []
        in_note = False
        in_list = False
        note_item_count = 0
        used_images = set()  # link paths already placed in the output

        paragraphs = root.findall('.//w:p', namespaces)

        for i, para in enumerate(paragraphs):
            # Resolve the picture attached to this paragraph, if any.
            image_path = None
            if has_image(para, namespaces):
                image_id = get_image_id(para, namespaces)
                if image_id and image_id in image_relations:
                    image_filename = image_relations[image_id]
                    image_path = image_files.get(
                        image_filename, f"{_IMAGES_SUBDIR}/{image_filename}")
                    used_images.add(image_path)
                else:
                    # No usable relationship: fall back to the first
                    # extracted picture that has not been placed yet.
                    for candidate in image_files.values():
                        if candidate not in used_images:
                            image_path = candidate
                            used_images.add(candidate)
                            break

            text = get_paragraph_text(para, namespaces)

            # Paragraph without text: emit its picture when one was resolved,
            # otherwise there is nothing to write.
            if not text:
                if image_path:
                    _ensure_blank_line(md_lines)
                    md_lines.append(_image_tag(image_path))
                    md_lines.append("")
                continue

            # Headings are checked first because they may start with digits
            # that would otherwise look like list markers.
            heading_level = detect_heading_level(text)
            if heading_level > 0:
                in_note = False
                in_list = False
                note_item_count = 0
                _ensure_blank_line(md_lines)
                md_lines.append("#" * heading_level + " " + text)
                md_lines.append("")
                if image_path:
                    md_lines.append(_image_tag(image_path))
                    md_lines.append("")
                continue

            # Start of a "注:" note (ASCII or full-width colon).
            if text.startswith(('注:', '注\uff1a')):
                in_note = True
                note_item_count = 0
                _ensure_blank_line(md_lines)
                md_lines.append("**" + text + "**")
                md_lines.append("")
                in_list = False
                continue

            if is_list_item(text):
                in_note = False  # an explicit list item ends the note state
                note_item_count = 0
                # Bracketed numbers are treated as nested items.
                if re.match(r'^[(\uff08][\d一二三四五六七八九十]+[)\uff09]', text):
                    md_lines.append(format_nested_list_item(text))
                else:
                    md_lines.append(format_list_item(text))
                in_list = True
                continue

            # Inside a note, an unnumbered line directly followed by a list
            # item is taken to be the first item of that list.
            if in_note:
                has_next_item = (
                    i + 1 < len(paragraphs)
                    and is_list_item(get_paragraph_text(paragraphs[i + 1], namespaces))
                )
                if has_next_item:
                    note_item_count += 1
                    md_lines.append(f"- {note_item_count}. {text}")
                    in_list = True
                else:
                    md_lines.append(text)
                    md_lines.append("")
                    in_list = False
                continue

            # Stand-alone labels are rendered in bold.
            if text in ('主流程:', '主流程\uff1a', '主流程',
                        '功能说明:', '功能说明\uff1a', '功能说明'):
                _ensure_blank_line(md_lines)
                md_lines.append("**" + text + "**")
                md_lines.append("")
                in_list = False
                continue

            # Ordinary paragraph.
            if not in_list:
                _ensure_blank_line(md_lines)
            md_lines.append(text)
            if image_path:
                md_lines.append("")
                md_lines.append(_image_tag(image_path))
            if not in_list:
                md_lines.append("")
            in_list = False
            in_note = False
            note_item_count = 0

        _append_tables(root, namespaces, md_lines)

        # Consecutive blank lines are collapsed into a single one.
        md_content = "\n".join(_collapse_blank_lines(md_lines))
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(md_content)

        print(f"✓ 转换成功!")
        print(f"  输入文件: {file_path}")
        print(f"  输出文件: {output_path}")
        return output_path

    except Exception as e:
        # Top-level boundary: report the failure and signal it with None.
        print(f"✗ 错误: {str(e)}")
        import traceback
        traceback.print_exc()
        return None
|
||
|
||
if __name__ == "__main__":
    # Usage: script.py [input.docx [output.md]]
    # Without arguments the original fixed file names are used. With only an
    # input argument the output path is derived from the input file name.
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
        output_file = sys.argv[2] if len(sys.argv) > 2 else None
    else:
        input_file = "OA系统功能定制方案(2).docx"
        output_file = "需求.md"

    if not os.path.exists(input_file):
        print(f"✗ 文件不存在: {input_file}")
        sys.exit(1)

    # docx_to_markdown returns None on failure; reflect that in the exit code.
    if docx_to_markdown(input_file, output_file) is None:
        sys.exit(1)
|