# someone-oa/docx_to_md.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
将 DOCX 文件转换为 Markdown 格式
"""
import zipfile
import xml.etree.ElementTree as ET
import os
import sys
import re

def get_paragraph_text(para, namespaces):
    """Return the concatenated text of every ``w:t`` run inside a paragraph.

    Runs with no text are ignored; the joined result has leading and
    trailing whitespace removed.
    """
    pieces = (run.text for run in para.findall('.//w:t', namespaces) if run.text)
    return ''.join(pieces).strip()

def has_image(para, namespaces):
    """Tell whether a paragraph contains a picture element.

    Both the current ``w:drawing`` container and the legacy ``w:pict``
    container count as pictures.
    """
    return any(
        para.find(f'.//w:{tag}', namespaces) is not None
        for tag in ('drawing', 'pict')
    )

def get_image_id(para, namespaces):
    """Return the relationship ID of the first image referenced by a paragraph.

    Lookup order:

    1. DrawingML pictures: the ``r:embed`` attribute of an ``a:blip`` element.
    2. Legacy ``w:pict`` content: any attribute whose name contains "embed".
    3. Legacy VML pictures: the ``r:id`` attribute of an ``imagedata``
       element inside ``w:pict``.

    Args:
        para: Paragraph element (``w:p``).
        namespaces: Prefix map that must define the ``w`` and ``a`` prefixes.

    Returns:
        The relationship ID string, or ``None`` when no image reference is
        found.
    """
    rel_ns = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'

    # Current format: <a:blip r:embed="rIdN"/>.
    for blip in para.findall('.//a:blip', namespaces):
        embed = blip.get(f'{{{rel_ns}}}embed')
        if embed:
            return embed

    pict_elements = para.findall('.//w:pict', namespaces)

    # Legacy format, first pass: accept any non-empty "embed"-like attribute.
    for pict in pict_elements:
        for elem in pict.iter():
            for attr_name, attr_value in elem.attrib.items():
                if 'embed' in attr_name.lower() and attr_value:
                    return attr_value

    # Legacy format, second pass: VML stores the image reference as r:id on
    # an imagedata element, which the "embed" scan above cannot see. Only
    # imagedata is inspected so that r:id on other VML children is not
    # mistaken for the picture itself.
    for pict in pict_elements:
        for elem in pict.iter():
            if isinstance(elem.tag, str) and elem.tag.rsplit('}', 1)[-1] == 'imagedata':
                rel_id = elem.get(f'{{{rel_ns}}}id')
                if rel_id:
                    return rel_id

    return None

def detect_heading_level(text):
    """Guess the Markdown heading level of a paragraph from its text.

    Returns 1-4 for a recognised heading and 0 when the text is not a
    heading.
    """
    text = text.strip()

    # Level 1: the fixed top-level titles of the document.
    if text in ("OA系统功能定制方案", "总体需求", "功能建设"):
        return 1

    # Numbered sections, deepest pattern first so that "2.4.1.1" is not
    # claimed by the shallower "2.4" pattern. The last entry covers other
    # chapters written like "1.1管理要求".
    numbered_sections = (
        (r'^2\.\d+\.\d+\.\d+', 4),
        (r'^2\.\d+\.\d+', 3),
        (r'^2\.\d+', 2),
        (r'^\d+\.\d+', 2),
    )
    for pattern, level in numbered_sections:
        if re.match(pattern, text):
            return level

    # Keyword headings must be short, must not end with a colon and must not
    # look like a list item.
    if len(text) >= 30 or text.endswith(('：', ':')):
        return 0
    if re.match(r'^[\d一二三四五六七八九十]+[、.)）]', text):
        return 0
    for keyword in ('主要业务流程图', '主流程', '功能说明'):
        if keyword in text:
            return 3

    return 0

def is_list_item(text):
    """Tell whether the text starts with a list marker.

    Recognised markers: Arabic digits or Chinese numerals followed by one of
    ``、 . ) ）``, and digits or Chinese numerals wrapped in parentheses.
    """
    candidate = text.strip()
    marker_patterns = (
        r'^[\d]+[、.)）]',                      # 1)  1.  1、
        r'^[一二三四五六七八九十]+[、.)）]',       # Chinese numeral markers
        r'^[（(][\d一二三四五六七八九十]+[）)]',   # (1)  （一）
    )
    return any(re.match(pattern, candidate) for pattern in marker_patterns)

def format_list_item(text):
    """Render a paragraph as a top-level Markdown bullet.

    A leading ``1)`` or ``(1)`` marker (ASCII or full-width brackets) becomes
    ``- 1. ``. Text already starting with ``- `` is returned as is. Everything
    else, including Chinese-numeral markers, is prefixed with ``- ``.
    """
    item = text.strip()

    # "1)" first, then "(1)": same order as the markers are recognised.
    for pattern in (r'^(\d+)[）)]', r'^[（(](\d+)[）)]'):
        marker = re.match(pattern, item)
        if marker:
            return f"- {marker.group(1)}. " + item[marker.end():]

    if item.startswith('- '):
        return item

    # Chinese-numeral markers and any other text keep their content unchanged.
    return "- " + item

def format_nested_list_item(text):
    """Render a paragraph as a nested (two-space indented) Markdown bullet.

    A leading ``(1)`` or ``1)`` marker (ASCII or full-width brackets) becomes
    ``  - 1. ``; any other text is prefixed with ``  - ``.
    """
    item = text.strip()

    # "(1)" first, then "1)".
    for pattern in (r'^[（(](\d+)[）)]', r'^(\d+)[）)]'):
        marker = re.match(pattern, item)
        if marker:
            return f"  - {marker.group(1)}. " + item[marker.end():]

    return "  - " + item

# Name of the directory, relative to the Markdown file, that receives the
# images extracted from the DOCX archive.
_IMAGES_DIR_NAME = "images"


def _image_tag(image_path):
    """Return the HTML ``<img>`` snippet used to embed one image."""
    return f'<img src="{image_path}" alt="图片" style="max-width: 100%; height: auto;" />'


def _read_image_relations(zip_ref):
    """Map relationship IDs to image file names from ``document.xml.rels``.

    Returns an empty dict when the archive has no relationships part. Any
    other failure is printed as a warning and the entries collected so far
    are returned.
    """
    image_relations = {}
    try:
        rels_root = ET.fromstring(zip_ref.read('word/_rels/document.xml.rels'))
        rels_namespace = {'r': 'http://schemas.openxmlformats.org/package/2006/relationships'}
        for rel in rels_root.findall('.//r:Relationship', rels_namespace):
            if 'image' not in rel.get('Type', '').lower():
                continue
            rel_id = rel.get('Id')
            target = rel.get('Target')
            if rel_id and target:
                # Only the file name matters: extracted images are stored flat.
                image_filename = os.path.basename(target)
                image_relations[rel_id] = image_filename
                print(f"  图片关系映射: {rel_id} -> {image_filename}")
    except KeyError:
        pass  # No relationships part in the archive; carry on without it.
    except Exception as e:
        print(f"  警告: 读取图片关系时出错: {e}")
    return image_relations


def _extract_images(zip_ref, images_dir):
    """Write every ``word/media/`` entry into *images_dir*.

    Returns a dict mapping each image file name to the link path used in the
    Markdown output (``images/<name>``, relative to the Markdown file).
    """
    image_files = {}
    for entry in zip_ref.namelist():
        if not entry.startswith('word/media/'):
            continue
        filename = os.path.basename(entry)
        if not filename:  # Directory entry, nothing to write.
            continue
        with open(os.path.join(images_dir, filename), 'wb') as img_file:
            img_file.write(zip_ref.read(entry))
        image_files[filename] = f"{_IMAGES_DIR_NAME}/{filename}"
        print(f"  提取图片: {filename}")
    return image_files


def _table_to_markdown(table, namespaces):
    """Render one ``w:tbl`` element as Markdown table lines (no heading)."""
    lines = []
    header_done = False
    for row in table.findall('.//w:tr', namespaces):
        row_data = []
        for cell in row.findall('.//w:tc', namespaces):
            cell_text = ' '.join(
                run.text.strip()
                for run in cell.findall('.//w:t', namespaces)
                if run.text
            ).strip()
            # A literal pipe would be read as a column separator.
            cell_text = cell_text.replace('|', '\\|')
            row_data.append(cell_text if cell_text else ' ')
        if not row_data:
            continue
        lines.append("| " + " | ".join(row_data) + " |")
        if not header_done:
            # The separator follows the first row that actually has cells.
            lines.append("| " + " | ".join(["---"] * len(row_data)) + " |")
            header_done = True
    return lines


def _collapse_blank_lines(lines):
    """Return *lines* with every run of blank lines reduced to a single one."""
    cleaned = []
    prev_empty = False
    for line in lines:
        is_empty = not line.strip()
        if is_empty and prev_empty:
            continue
        cleaned.append(line)
        prev_empty = is_empty
    return cleaned


def docx_to_markdown(file_path, output_path=None):
    """Convert a DOCX file to Markdown.

    Images stored in the archive are extracted into an ``images`` directory
    next to the output file, so that the relative ``images/...`` links written
    into the Markdown resolve from the Markdown file's location.

    Args:
        file_path: Path of the DOCX file to read.
        output_path: Path of the Markdown file to write. Defaults to
            *file_path* with its extension replaced by ``.md``.

    Returns:
        *output_path* on success. On any error during reading, conversion or
        writing, the error and traceback are printed and ``None`` is returned.
    """
    if output_path is None:
        output_path = os.path.splitext(file_path)[0] + ".md"

    images_dir = os.path.join(os.path.dirname(output_path), _IMAGES_DIR_NAME)
    os.makedirs(images_dir, exist_ok=True)

    try:
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            document_xml = zip_ref.read('word/document.xml')
            image_relations = _read_image_relations(zip_ref)
            image_files = _extract_images(zip_ref, images_dir)

        root = ET.fromstring(document_xml)
        namespaces = {
            'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
            'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
            'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
        }

        md_lines = []
        in_note = False
        in_list = False
        note_item_count = 0
        used_images = set()  # Link paths already placed in the output.

        # NOTE(review): './/w:p' also matches paragraphs inside table cells,
        # so cell text appears here as body text and again in the table
        # section appended below. Left as is because dropping those
        # paragraphs would also drop images placed inside table cells.
        paragraphs = root.findall('.//w:p', namespaces)

        for i, para in enumerate(paragraphs):
            # Resolve the image (if any) attached to this paragraph.
            image_path = None
            if has_image(para, namespaces):
                image_id = get_image_id(para, namespaces)
                if image_id and image_id in image_relations:
                    image_filename = image_relations[image_id]
                    image_path = image_files.get(
                        image_filename, f"{_IMAGES_DIR_NAME}/{image_filename}"
                    )
                    used_images.add(image_path)
                else:
                    # No usable relationship: fall back to the first
                    # extracted image that has not been placed yet.
                    for candidate in image_files.values():
                        if candidate not in used_images:
                            image_path = candidate
                            used_images.add(candidate)
                            break

            text = get_paragraph_text(para, namespaces)

            # Paragraph without text: emit its image if one was resolved,
            # otherwise skip it so it neither adds blank output nor resets
            # the list/note state.
            if not text:
                if image_path:
                    if md_lines and md_lines[-1]:
                        md_lines.append("")
                    md_lines.append(_image_tag(image_path))
                    md_lines.append("")
                continue

            # Headings are checked first because heading text may start with
            # digits that would otherwise look like a list marker.
            heading_level = detect_heading_level(text)
            if heading_level > 0:
                in_note = False
                in_list = False
                note_item_count = 0
                if md_lines and md_lines[-1]:
                    md_lines.append("")
                md_lines.append("#" * heading_level + " " + text)
                md_lines.append("")
                if image_path:
                    md_lines.append(_image_tag(image_path))
                    md_lines.append("")
                continue

            # Start of a "注：" (note) paragraph.
            if text.startswith(('注：', '注:')):
                in_note = True
                note_item_count = 0
                if md_lines and md_lines[-1]:
                    md_lines.append("")
                md_lines.append("**" + text + "**")
                md_lines.append("")
                in_list = False
                continue

            # List items; a parenthesised marker is treated as a nested item.
            if is_list_item(text):
                in_note = False  # A list item ends the note state.
                note_item_count = 0
                if re.match(r'^[（(][\d一二三四五六七八九十]+[）)]', text):
                    md_lines.append(format_nested_list_item(text))
                else:
                    md_lines.append(format_list_item(text))
                in_list = True
                continue

            # Inside a note, an unnumbered line directly followed by a list
            # item is treated as the first item of that list.
            if in_note:
                next_is_item = False
                if i + 1 < len(paragraphs):
                    next_text = get_paragraph_text(paragraphs[i + 1], namespaces)
                    next_is_item = is_list_item(next_text)

                if next_is_item:
                    note_item_count += 1
                    md_lines.append(f"- {note_item_count}. {text}")
                    in_list = True
                else:
                    md_lines.append(text)
                    md_lines.append("")
                    in_list = False
                continue

            # Label lines such as "主流程：" are rendered in bold.
            if text in ['主流程：', '主流程', '功能说明：', '功能说明']:
                if md_lines and md_lines[-1]:
                    md_lines.append("")
                md_lines.append("**" + text + "**")
                md_lines.append("")
                in_list = False
                continue

            # Ordinary paragraph, with its image (if any) placed after the text.
            if md_lines and md_lines[-1] and not in_list:
                md_lines.append("")
            md_lines.append(text)
            if image_path:
                md_lines.append("")
                md_lines.append(_image_tag(image_path))
            if not in_list:
                md_lines.append("")
            in_list = False
            in_note = False
            note_item_count = 0

        # Tables are appended after the body, each under its own heading.
        for table in root.findall('.//w:tbl', namespaces):
            if not table.findall('.//w:tr', namespaces):
                continue
            md_lines.append("")
            md_lines.append("### 表格")
            md_lines.append("")
            md_lines.extend(_table_to_markdown(table, namespaces))
            md_lines.append("")

        md_content = "\n".join(_collapse_blank_lines(md_lines))
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(md_content)

        print("✓ 转换成功！")
        print(f"  输入文件: {file_path}")
        print(f"  输出文件: {output_path}")
        return output_path

    except Exception as e:
        # Top-level boundary of the conversion: report and signal failure
        # through the return value.
        print(f"✗ 错误: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

if __name__ == "__main__":
    # Usage: docx_to_md.py [input.docx [output.md]]
    # Without arguments the original fixed file names are used. With only an
    # input file, the output name is derived from it by docx_to_markdown.
    cli_args = sys.argv[1:]
    if cli_args:
        input_file = cli_args[0]
        output_file = cli_args[1] if len(cli_args) > 1 else None
    else:
        input_file = "OA系统功能定制方案(2).docx"
        output_file = "需求.md"

    if not os.path.exists(input_file):
        print(f"✗ 文件不存在: {input_file}")
        sys.exit(1)

    # docx_to_markdown returns None on failure; report that through the exit
    # status instead of always exiting with 0.
    if docx_to_markdown(input_file, output_file) is None:
        sys.exit(1)