#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Convert a DOCX file to Markdown.

The converter reads ``word/document.xml`` straight out of the DOCX zip
container, extracts embedded images into an ``images`` directory next to the
Markdown output, and applies a few document-specific heuristics (numbered
section headings, "注:" note blocks, enumerated list items) to produce
Markdown text.
"""

import html
import os
import re
import sys
import xml.etree.ElementTree as ET
import zipfile

# XML namespaces used by WordprocessingML documents.
_REL_NS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
_NAMESPACES = {
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
    'r': _REL_NS,
}
_PACKAGE_RELS_NS = {
    'r': 'http://schemas.openxmlformats.org/package/2006/relationships',
}

# Name of the image directory; also the link prefix used in the Markdown.
_IMAGES_DIR_NAME = "images"

# Defaults used when the script is started without arguments.
_DEFAULT_INPUT = "OA系统功能定制方案(2).docx"
_DEFAULT_OUTPUT = "需求.md"

# Exact texts that are always rendered as level-1 headings.
_TITLE_HEADINGS = ("OA系统功能定制方案", "总体需求", "功能建设")
# Keywords that turn a short, non-list line into a level-3 heading.
_HEADING_KEYWORDS = ('主要业务流程图', '主流程', '功能说明')
# Labels rendered in bold when they are not picked up as headings.
_BOLD_LABELS = ('主流程:', '主流程:', '主流程', '功能说明:', '功能说明:', '功能说明')
# Prefixes that open a note block.
_NOTE_PREFIXES = ('注:', '注:')
# Both ASCII and full-width colons mark a line as "label", not heading.
_COLONS = (':', ':')

# Enumeration markers.  Every pattern accepts ASCII and full-width
# punctuation, because Chinese documents routinely use the full-width forms.
_SECTION_RE = re.compile(r'^2\.\d+(\.\d+)?(\.\d+)?')
_GENERIC_SECTION_RE = re.compile(r'^\d+\.\d+')
_ANY_MARKER_RE = re.compile(r'^[\d一二三四五六七八九十]+[、.))]')
_DIGIT_MARKER_RE = re.compile(r'^\d+[、.))]')
_CN_MARKER_RE = re.compile(r'^[一二三四五六七八九十]+[、.))]')
_PAREN_MARKER_RE = re.compile(r'^[((][\d一二三四五六七八九十]+[))]')
_DIGIT_PAREN_RE = re.compile(r'^(\d+)[))]')
_PAREN_DIGIT_RE = re.compile(r'^[((](\d+)[))]')


def get_paragraph_text(para, namespaces):
    """Return the concatenated, stripped text of all ``w:t`` runs in *para*."""
    pieces = (node.text for node in para.findall('.//w:t', namespaces))
    return ''.join(piece for piece in pieces if piece).strip()


def has_image(para, namespaces):
    """Return True if *para* contains a ``w:drawing`` or legacy ``w:pict``."""
    return (
        para.find('.//w:drawing', namespaces) is not None
        or para.find('.//w:pict', namespaces) is not None
    )


def get_image_id(para, namespaces):
    """Return the relationship id of the first image in *para*, or None.

    DrawingML pictures store the id in the ``r:embed`` attribute of
    ``a:blip``.  Legacy ``w:pict`` content is searched for any attribute
    whose name contains "embed" and, for VML pictures, for the ``r:id``
    attribute of an ``imagedata`` element.
    """
    embed_attr = '{%s}embed' % _REL_NS
    for blip in para.findall('.//a:blip', namespaces):
        embed = blip.get(embed_attr)
        if embed:
            return embed

    rel_id_attr = '{%s}id' % _REL_NS
    for pict in para.findall('.//w:pict', namespaces):
        for elem in pict.iter():
            for attr_name, attr_value in elem.attrib.items():
                if attr_value and 'embed' in attr_name.lower():
                    return attr_value
            # VML pictures reference their image part through r:id.
            if elem.tag.rsplit('}', 1)[-1] == 'imagedata':
                rel_id = elem.get(rel_id_attr)
                if rel_id:
                    return rel_id
    return None


def detect_heading_level(text):
    """Return the Markdown heading level for *text*, or 0 if not a heading.

    Rules, in order:
    * one of the fixed document titles -> 1;
    * "2.x" -> 2, "2.x.y" -> 3, "2.x.y.z" (or deeper) -> 4;
    * any other "N.M" prefix -> 2;
    * a line shorter than 30 characters that does not end with a colon, is
      not an enumerated item and contains a heading keyword -> 3.
    """
    text = text.strip()

    if text in _TITLE_HEADINGS:
        return 1

    section = _SECTION_RE.match(text)
    if section:
        # Each additional ".N" group matched adds one heading level.
        return 2 + sum(part is not None for part in section.groups())

    if _GENERIC_SECTION_RE.match(text):
        return 2

    looks_like_label = (
        len(text) < 30
        and not text.endswith(_COLONS)
        and not _ANY_MARKER_RE.match(text)
    )
    if looks_like_label and any(word in text for word in _HEADING_KEYWORDS):
        return 3

    return 0


def is_list_item(text):
    """Return True if *text* starts with an enumeration marker.

    Recognised markers: ``1)`` / ``1.`` / ``1、``, the same with Chinese
    numerals, and parenthesised numbers such as ``(1)`` or ``(一)``.
    """
    text = text.strip()
    return bool(
        _DIGIT_MARKER_RE.match(text)
        or _CN_MARKER_RE.match(text)
        or _PAREN_MARKER_RE.match(text)
    )


def format_list_item(text):
    """Format *text* as a top-level Markdown list item.

    ``1)`` and ``(1)`` markers are rewritten to ``- 1. ``; text that already
    starts with ``- `` is returned unchanged; everything else (including
    Chinese-numeral markers) is simply prefixed with ``- ``.
    """
    text = text.strip()
    for pattern in (_DIGIT_PAREN_RE, _PAREN_DIGIT_RE):
        if pattern.match(text):
            return pattern.sub(r'- \1. ', text)
    if text.startswith('- '):
        return text
    return "- " + text


def format_nested_list_item(text):
    """Format *text* as a nested (second-level) Markdown list item.

    The item is indented by two spaces so that it nests under a preceding
    ``- `` item; a smaller indent would be rendered as a sibling item.
    """
    text = text.strip()
    for pattern in (_PAREN_DIGIT_RE, _DIGIT_PAREN_RE):
        if pattern.match(text):
            return pattern.sub(r'  - \1. ', text)
    return "  - " + text


def _image_tag(link):
    """Return an HTML img tag for *link* that scales to the page width."""
    src = html.escape(link, quote=True)
    return f'<img src="{src}" alt="图片" style="max-width: 100%; height: auto;" />'


def _ensure_blank_line(md_lines):
    """Append a blank line unless *md_lines* is empty or already ends in one."""
    if md_lines and md_lines[-1]:
        md_lines.append("")


def _append_image(md_lines, link):
    """Append an image block (blank line, img tag, blank line)."""
    _ensure_blank_line(md_lines)
    md_lines.append(_image_tag(link))
    md_lines.append("")


def _read_image_relations(zip_ref):
    """Return ``{relationship id: image file name}`` from the document rels.

    Reading the relationships is best-effort: a missing part is ignored and
    any other problem is reported as a warning, because the conversion can
    still proceed without the mapping.
    """
    relations = {}
    try:
        rels_root = ET.fromstring(zip_ref.read('word/_rels/document.xml.rels'))
        for rel in rels_root.findall('.//r:Relationship', _PACKAGE_RELS_NS):
            if 'image' not in rel.get('Type', '').lower():
                continue
            rel_id = rel.get('Id')
            target = rel.get('Target')
            if rel_id and target:
                image_filename = os.path.basename(target)
                relations[rel_id] = image_filename
                print(f" 图片关系映射: {rel_id} -> {image_filename}")
    except KeyError:
        pass  # No relationships part in the archive.
    except Exception as e:  # best-effort; see docstring
        print(f" 警告: 读取图片关系时出错: {e}")
    return relations


def _extract_images(zip_ref, images_dir):
    """Write every ``word/media/*`` member into *images_dir*.

    Returns ``{file name: Markdown link}`` where the link is relative to the
    Markdown file (``images/<file name>``).
    """
    links = {}
    for member in zip_ref.namelist():
        if not member.startswith('word/media/'):
            continue
        filename = os.path.basename(member)
        if not filename:  # directory entry
            continue
        with open(os.path.join(images_dir, filename), 'wb') as img_file:
            img_file.write(zip_ref.read(member))
        links[filename] = f"{_IMAGES_DIR_NAME}/{filename}"
        print(f" 提取图片: {filename}")
    return links


def _resolve_image_link(para, relations, links, used_links):
    """Return the Markdown link for the image in *para*, or None.

    The relationship id is used when it can be resolved; otherwise the first
    extracted image that has not been referenced yet is used as a fallback.
    The returned link is recorded in *used_links*.
    """
    image_id = get_image_id(para, _NAMESPACES)
    if image_id and image_id in relations:
        filename = relations[image_id]
        link = links.get(filename, f"{_IMAGES_DIR_NAME}/{filename}")
        used_links.add(link)
        return link

    for link in links.values():
        if link not in used_links:
            used_links.add(link)
            return link
    return None


def _body_paragraphs(root):
    """Return all paragraphs in document order, excluding table content.

    Paragraphs inside ``w:tbl`` are rendered by the table pass; including
    them here as well would emit every cell text twice.
    """
    in_table = {
        id(para)
        for table in root.findall('.//w:tbl', _NAMESPACES)
        for para in table.findall('.//w:p', _NAMESPACES)
    }
    return [
        para for para in root.findall('.//w:p', _NAMESPACES)
        if id(para) not in in_table
    ]


def _paragraphs_to_markdown(root, relations, links):
    """Convert the non-table paragraphs of *root* into Markdown lines."""
    md_lines = []
    in_note = False
    in_list = False
    note_item_count = 0
    used_links = set()
    paragraphs = _body_paragraphs(root)

    for index, para in enumerate(paragraphs):
        image_link = None
        if has_image(para, _NAMESPACES):
            image_link = _resolve_image_link(para, relations, links, used_links)
        text = get_paragraph_text(para, _NAMESPACES)

        if not text:
            # Image-only paragraph, or nothing usable at all.
            if image_link:
                _append_image(md_lines, image_link)
            continue

        # Headings are checked first because they may start with digits
        # that would otherwise look like list markers.
        heading_level = detect_heading_level(text)
        if heading_level > 0:
            in_note = False
            in_list = False
            note_item_count = 0
            _ensure_blank_line(md_lines)
            md_lines.append("#" * heading_level + " " + text)
            md_lines.append("")
        elif text.startswith(_NOTE_PREFIXES):
            in_note = True
            in_list = False
            note_item_count = 0
            _ensure_blank_line(md_lines)
            md_lines.append("**" + text + "**")
            md_lines.append("")
        elif is_list_item(text):
            in_note = False
            note_item_count = 0
            # Parenthesised numbers are treated as second-level items.
            if _PAREN_MARKER_RE.match(text):
                md_lines.append(format_nested_list_item(text))
            else:
                md_lines.append(format_list_item(text))
            in_list = True
        elif in_note:
            # Inside a note, an unnumbered line directly followed by a list
            # item is taken to be the first item of that list.
            next_text = ""
            if index + 1 < len(paragraphs):
                next_text = get_paragraph_text(paragraphs[index + 1], _NAMESPACES)
            if is_list_item(next_text):
                note_item_count += 1
                md_lines.append(f"- {note_item_count}. {text}")
                in_list = True
            else:
                md_lines.append(text)
                md_lines.append("")
                in_list = False
        elif text in _BOLD_LABELS:
            _ensure_blank_line(md_lines)
            md_lines.append("**" + text + "**")
            md_lines.append("")
            in_list = False
        else:
            # Ordinary paragraph; directly after a list item it is kept
            # adjacent to the list instead of being separated by blanks.
            if not in_list:
                _ensure_blank_line(md_lines)
            md_lines.append(text)
            if not in_list:
                md_lines.append("")
            in_list = False
            in_note = False
            note_item_count = 0

        if image_link:
            _append_image(md_lines, image_link)

    return md_lines


def _tables_to_markdown(root):
    """Render every table of *root* as a Markdown table under "### 表格"."""
    md_lines = []
    for table in root.findall('.//w:tbl', _NAMESPACES):
        rows = table.findall('.//w:tr', _NAMESPACES)
        if not rows:
            continue
        md_lines.extend(["", "### 表格", ""])
        header_done = False
        for row in rows:
            row_data = []
            for cell in row.findall('.//w:tc', _NAMESPACES):
                parts = [
                    node.text.strip()
                    for node in cell.findall('.//w:t', _NAMESPACES)
                    if node.text
                ]
                cell_text = ' '.join(parts).strip()
                # A literal pipe would otherwise start a new column.
                row_data.append(cell_text.replace('|', '\\|') if cell_text else ' ')
            if not row_data:
                continue
            md_lines.append("| " + " | ".join(row_data) + " |")
            if not header_done:
                md_lines.append("| " + " | ".join(["---"] * len(row_data)) + " |")
                header_done = True
        md_lines.append("")
    return md_lines


def _collapse_blank_lines(lines):
    """Return *lines* with every run of blank lines reduced to a single one."""
    cleaned = []
    prev_empty = False
    for line in lines:
        is_empty = not line.strip()
        if is_empty and prev_empty:
            continue
        cleaned.append(line)
        prev_empty = is_empty
    return cleaned


def docx_to_markdown(file_path, output_path=None):
    """Convert the DOCX at *file_path* to a Markdown file.

    Args:
        file_path: Path of the DOCX file to read.
        output_path: Path of the Markdown file to write.  Defaults to
            *file_path* with its extension replaced by ``.md``.

    Returns:
        The output path on success, or None if the conversion failed (the
        error and its traceback are printed).

    Embedded images are written to an ``images`` directory located next to
    the Markdown file, so the relative ``images/...`` links in the output
    resolve regardless of the current working directory.
    """
    if output_path is None:
        output_path = os.path.splitext(file_path)[0] + ".md"

    images_dir = os.path.join(os.path.dirname(output_path), _IMAGES_DIR_NAME)
    os.makedirs(images_dir, exist_ok=True)

    try:
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            document_xml = zip_ref.read('word/document.xml')
            relations = _read_image_relations(zip_ref)
            links = _extract_images(zip_ref, images_dir)

        root = ET.fromstring(document_xml)
        md_lines = _paragraphs_to_markdown(root, relations, links)
        md_lines.extend(_tables_to_markdown(root))

        md_content = "\n".join(_collapse_blank_lines(md_lines))
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(md_content)

        print("✓ 转换成功!")
        print(f" 输入文件: {file_path}")
        print(f" 输出文件: {output_path}")
        return output_path
    except Exception as e:  # top-level boundary: report and signal failure
        print(f"✗ 错误: {str(e)}")
        import traceback
        traceback.print_exc()
        return None


def main(argv=None):
    """Command-line entry point.

    Usage: ``script [input.docx [output.md]]``.  Without arguments the
    built-in default input and output names are used.  Returns the process
    exit code.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if args:
        input_file = args[0]
        output_file = args[1] if len(args) > 1 else None
    else:
        input_file = _DEFAULT_INPUT
        output_file = _DEFAULT_OUTPUT

    if not os.path.exists(input_file):
        print(f"✗ 文件不存在: {input_file}")
        return 1
    return 0 if docx_to_markdown(input_file, output_file) else 1


if __name__ == "__main__":
    sys.exit(main())