obsidian一键转hexo脚本

一键实现markdown的obsidian格式转为hexo格式的脚本。

功能基本实现，后续在使用中再进行优化，主要功能包括：

添加markdown头部信息，即 title，tags，categories等信息。
把obsidian的图片引用转换为hexo的图片引用。
把obsidian的图片复制到hexo文件夹“imgs”中（我的obsidian的图片也都放在了各级目录的“imgs”文件夹中）。
把obsidian的其他文章内容引用转换为hexo的其他文章内容引用格式。hexo的格式为：
{% include_custom "reference_blogs/conceptAI.md" "##Softmax求梯度的公式推导" %}
需和hexo中引用其他文档片段的设置一起服用。
把obsidian中引用的文章复制到hexo文件夹“reference_blogs”中。
在标题前添加空行。

其中utils_是我封装的工具函数，用到了utils_.get_file_paths_r(imgs_dir)，主要功能是递归得到该文件夹中所有文件的路径。
工具函数：

def is_except_pattern(name: str, except_pattern: Dict[str, Union[str, list]]) -> bool:
    """Return True when ``name`` matches at least one exclusion pattern.

    Args:
        name: The string to test.
        except_pattern: Mapping whose recognised keys are ``'start'``,
            ``'mid'`` and ``'end'``. Each value is either one literal string
            or an iterable of literal strings. Other keys are ignored.

    Returns:
        True if any literal matches at its position, otherwise False.
    """
    # One regex template per position. The literal is escaped before it is
    # inserted, so it is always matched verbatim. Note that 'start' and 'end'
    # require at least one extra character after/before the literal.
    templates = {
        "start": "^{}.+",
        "mid": ".*{}.*",
        "end": ".+{}$",
    }
    for flag, template in templates.items():
        if flag not in except_pattern:
            continue
        literals = except_pattern[flag]
        if isinstance(literals, str):
            literals = [literals]
        for literal in literals:
            if re.match(template.format(re.escape(literal)), name):
                return True
    # Explicit False instead of falling off the end and returning None.
    return False
            
def get_file_paths_r(
    file_dir: str,
    except_pattern: Union[Dict[str, Union[str, list]], None] = None,
) -> list:
    """Collect the paths of all files below ``file_dir``, recursively.

    Args:
        file_dir: The directory to scan. A missing directory yields an empty
            list because ``os.walk`` produces nothing for it.
        except_pattern: Optional exclusion mapping forwarded unchanged to
            ``is_except_pattern`` together with each bare file name; files
            for which that call is truthy are skipped. ``None`` or an empty
            mapping disables filtering.

    Returns:
        A list of ``os.path.join(root, name)`` paths. The tree is walked
        bottom-up (``topdown=False``), so files in sub-directories come
        before files in their parent directory.
    """
    origin_path_list = []
    for root, _, files in os.walk(file_dir, topdown=False):
        for name in files:
            # The filter sees the bare file name, not the joined path.
            if except_pattern and is_except_pattern(name, except_pattern):
                continue
            origin_path_list.append(os.path.join(root, name))
    return origin_path_list

obsidian一键转 hexo脚本：

import re
import os
import sys
import shutil
# Location of the author's `utils_` helper module, read from the `utils_dir`
# environment variable. `os.environ.get` returns None when it is unset.
utils_dir = os.environ.get('utils_dir')
print(utils_dir)
# Extend the module search path so the two `utils_` imports below resolve.
# The imports therefore have to stay after this line.
sys.path.append(utils_dir)
import utils_
from utils_ import datetime_now

# Extensions (without the dot) that mark an `![[...]]` embed as media rather
# than a note reference.
# NOTE(review): convert_media_embeds also treats '.webm' and '.ogg' as video,
# but they are not listed here — confirm whether they should be.
image_extensions = ['png', 'jpg', 'jpeg', 'gif', 'svg', 'bmp', 'tiff', 'webp', 'mp4']

def convert_front_matter(content):
    """Rewrite inline ``tags: [a, b]`` in the first front-matter block.

    The first ``---`` ... ``---`` block found in ``content`` is treated as
    the YAML front matter. Inside it, an inline tag list is expanded to the
    multi-line form::

        tags:
          - a
          - b

    Args:
        content: Full Markdown text.

    Returns:
        The text with only the first front-matter block rewritten. Content
        without such a block, or with an empty tag list, is returned
        unchanged.
    """
    block = re.search(r'---\n(.*?)\n---', content, re.DOTALL)
    if block is None:
        return content

    def expand_tags(tag_match):
        # Split on bare commas and strip, so "[a,b]" and "[a, b]" both work.
        tags = [tag.strip() for tag in tag_match.group(1).split(',')]
        tags = [tag for tag in tags if tag]
        if not tags:
            # Keep "tags: []" as is rather than emitting an empty list item.
            return tag_match.group(0)
        return 'tags:\n  - ' + '\n  - '.join(tags)

    front_matter = re.sub(r'tags: \[(.*?)\]', expand_tags, block.group(1))
    # Splice by position instead of using the front matter as an re.sub
    # replacement template: backslashes in it would be parsed as escapes,
    # and later '---' blocks would be overwritten with this one.
    return content[:block.start(1)] + front_matter + content[block.end(1):]


def extract_and_add_tags(content, existing_tags):
    """Move ``#tag1 #tag2`` tags from the first line into ``existing_tags``.

    The first line counts as a tag line only when it has at least one
    whitespace-separated token and every token starts with ``#``.

    Args:
        content: Markdown text whose first line may be a tag line.
        existing_tags: List that receives the extracted tags. It is extended
            in place and also returned.

    Returns:
        A ``(content, existing_tags)`` tuple. When a tag line was found it is
        removed from the returned content; otherwise the content is returned
        unchanged.
    """
    first_line, _, remainder = content.partition('\n')
    tokens = first_line.split()
    # all() over an empty token list is vacuously True, so without the
    # emptiness check a blank first line would be dropped.
    if not tokens or not all(token.startswith('#') for token in tokens):
        return content, existing_tags
    # Skip tokens that are a bare '#' with no tag name.
    existing_tags.extend(token[1:] for token in tokens if token[1:])
    return remainder, existing_tags


def convert_internal_links(content):
    """Convert same-note heading links to standard Markdown links.

    ``[[#My Section]]`` becomes ``[My Section](#My-Section)`` and
    ``[[#My Section|label]]`` becomes ``[label](#My-Section)``. Aliased block
    references such as ``[[#^764e2c|label]]`` are left untouched.

    Args:
        content: Markdown text.

    Returns:
        The text with heading links rewritten.
    """
    def replace_link(match):
        # Split off an optional "|label" so it does not leak into the anchor.
        section_name, _, alias = match.group(1).partition('|')
        # Only spaces are replaced; the case of the heading is preserved.
        anchor = section_name.replace(' ', '-')
        return f'[{alias or section_name}](#{anchor})'

    # The negative lookahead skips "^xxxxxx|..." aliased block references.
    return re.sub(r'\[\[#(?!\^[\w]{6}\|.*?)(.*?)\]\]', replace_link, content)

def convert_media_embeds(content, file_path):
    """Convert Obsidian media embeds to HTML tags served from ``/imgs/``.

    * ``![[image.png]]`` becomes ``<img src="/imgs/image.png">``.
    * ``![[image.png|200]]`` becomes
      ``<img src="/imgs/image.png" alt="" width="200px">``.
    * ``![[image.png|100x50]]`` sets both ``width`` and ``height``.
    * Names ending in ``.mp4``, ``.webm`` or ``.ogg`` become a ``<video>``
      tag whose ``type`` matches the extension.

    Args:
        content: Markdown text.
        file_path: Unused; kept so existing callers keep working.

    Returns:
        The text with every ``![[...]]`` embed replaced.
    """
    video_types = {'.mp4': 'video/mp4', '.webm': 'video/webm', '.ogg': 'video/ogg'}

    def replace_media_embed(match):
        # First segment is the file name and the last is the size. Indexing
        # the parts avoids a ValueError when there is more than one '|'.
        parts = match.group(1).split('|')
        media_name = parts[0]
        media_size = parts[-1] if len(parts) > 1 else None

        lowered = media_name.lower()
        for suffix, mime_type in video_types.items():
            if lowered.endswith(suffix):
                return f'<video controls width="640" height="360"><source src="/imgs/{media_name}" type="{mime_type}"></video>'

        if not media_size:
            return f'<img src="/imgs/{media_name}">'
        dimensions = re.fullmatch(r'(\d+)x(\d+)', media_size)
        if dimensions:
            width, height = dimensions.groups()
            return f'<img src="/imgs/{media_name}" alt="" width="{width}px" height="{height}px">'
        return f'<img src="/imgs/{media_name}" alt="" width="{media_size}px">'

    return re.sub(r'!\[\[(.*?)\]\]', replace_media_embed, content)


# def convert_custom_references(content):

#     """

#     Convert Obsidian custom references to Hexo custom include tags,

#     excluding a broader range of image references and handling additional info in references.

#     """

#     def replace_reference(match):

#         file_section = match.group(1)

#         # 分割 file_section 以处理额外信息，如 "lw206.jpeg|475"

#         file_name_with_optional_info = file_section.split('|', 1)[0]

#         if '.' in file_name_with_optional_info and file_name_with_optional_info.split('.')[-1].lower() in image_extensions:

#             # It's an image reference, return it unchanged

#             return f'{% include_custom "reference_blogs/{file_section}.md" %}'

#         if '#' in file_name_with_optional_info:

#             file_name, section_name = file_name_with_optional_info.split('#', 1)

#             file_name = file_name.strip()

#             section_name = section_name.strip()

#             return f'{{% include_custom reference_blogs/{file_name}.md "{section_name}" %}}'

#         else:

#             # No section name provided

#             return f'{{% include_custom reference_blogs/{file_name_with_optional_info}.md %}}'


#     image_extensions = ['png', 'jpg', 'jpeg', 'gif', 'svg', 'bmp', 'tiff', 'webp']

#     # 调整正则表达式以排除更多类型的图片文件

#     image_extensions_pattern = '|'.join(image_extensions)

#     pattern = rf'!\[\[((?!.*\.({image_extensions_pattern})$).*?)\]\]'

#     return re.sub(pattern, replace_reference, content)

def convert_custom_references(content):
    """Convert Obsidian note embeds to Hexo ``include_custom`` tags.

    * ``![[note]]`` becomes ``{% include_custom "reference_blogs/note.md" %}``.
    * ``![[note#Heading]]`` becomes
      ``{% include_custom "reference_blogs/note.md" "#Heading" %}``. The
      section keeps a leading ``#`` so the heading level stays visible.
    * Embeds whose target ends in one of ``image_extensions`` are returned
      unchanged.

    Anything after a ``|`` (an alias or a size) is ignored.

    Args:
        content: Markdown text.

    Returns:
        The text with note embeds rewritten.
    """
    def replace_reference(match):
        # Drop optional extra info such as the "|475" in "lw206.jpeg|475".
        target = match.group(1).split('|', 1)[0]
        extension = target.rsplit('.', 1)[-1].lower() if '.' in target else ''
        if extension in image_extensions:
            # Media embed: return the original text untouched.
            return match.group(0)
        if '#' in target:
            file_name, section_name = target.split('#', 1)
            file_name = file_name.strip()
            section_name = '#' + section_name.strip()
            return f'{{% include_custom "reference_blogs/{file_name}.md" "{section_name}" %}}'
        # No section name provided.
        return f'{{% include_custom "reference_blogs/{target}.md" %}}'

    # Match every embed and decide inside the callback. The former lookahead
    # inspected the rest of the document rather than the embed itself.
    return re.sub(r'!\[\[(.*?)\]\]', replace_reference, content)


def find_and_replace_custom_links(content, base_dir='/Users/jc/Documents/CodeFlying/notebook/hexo-blog/public/blog/'):
    """Replace ``[[note]]`` / ``[[note|label]]`` with links to generated pages.

    For each wiki link, spaces in the note name are replaced by hyphens and
    ``<name>.html`` is looked up anywhere below ``base_dir``. When found, the
    link becomes ``[label](/blog/<path relative to base_dir>)``. Otherwise a
    message is printed and the original text is kept. Embeds (``![[...]]``)
    are not touched.

    Args:
        content: Markdown text.
        base_dir: Root of the generated blog HTML tree. Defaults to the
            author's local Hexo output directory.

    Returns:
        The text with resolvable wiki links rewritten.
    """
    html_index = None  # file name -> first directory containing it

    def lookup(html_name):
        nonlocal html_index
        if html_index is None:
            # Walk the tree once, on first use, not once per link.
            html_index = {}
            for root, _, files in os.walk(base_dir):
                for entry in files:
                    if entry.endswith('.html'):
                        html_index.setdefault(entry, root)
        root = html_index.get(html_name)
        if root is None:
            return None
        relative = os.path.relpath(os.path.join(root, html_name), base_dir)
        # URLs always use forward slashes, whatever the OS separator is.
        return relative.replace(os.sep, '/')

    def replace_custom_link(match):
        file_name, link_text = match.group(1), match.group(2)
        if not link_text:
            link_text = file_name

        # Sanitize the file name to match the HTML file naming.
        sanitized_file_name = file_name.replace(' ', '-')
        found_path = lookup(f'{sanitized_file_name}.html')
        if found_path:
            return f'[{link_text}](/blog/{found_path})'
        print(f"File {sanitized_file_name}.html not found in {base_dir}.")
        return match.group(0)

    pattern = r'(?<!!)\[\[([^|\]]+)\|?([^]]*)?\]\]'
    return re.sub(pattern, replace_custom_link, content)



def convert_tags(content):
    """Remove every ``#word`` token (a ``#`` followed by word characters).

    Args:
        content: Markdown text.

    Returns:
        The text with each match deleted. Surrounding whitespace is kept.
    """
    tag_pattern = re.compile(r'#(\w+)')
    return tag_pattern.sub('', content)

def add_header(content, header):
    """Return ``content`` with ``header`` placed in front of it.

    Args:
        content: Body text.
        header: Text to prepend. No separator is inserted.

    Returns:
        The concatenation ``header + content``.
    """
    prefixed = header + content
    return prefixed

def get_imgs_dir():
    """Return the path of every directory named ``imgs`` below ``BASE_DIR``.

    Returns:
        A list of ``os.path.join(root, 'imgs')`` paths in ``os.walk`` order.
    """
    return [
        os.path.join(parent, child)
        for parent, children, _ in os.walk(BASE_DIR)
        for child in children
        if child == 'imgs'
    ]

def ecopy_image2hexo(content, file_path):
    """Copy every file embedded via ``![[...]]`` into ``HEXO_DIR/imgs``.

    For each embed, the name before an optional ``|size`` suffix is looked up
    by file name, first in the ``imgs`` directory next to ``file_path`` and
    then in every ``imgs`` directory returned by ``get_imgs_dir()``. The
    first hit is copied. Names that cannot be found are reported and skipped.

    Args:
        content: Markdown text to scan for embeds.
        file_path: Path of the Markdown file being converted.
    """
    image_names = [embed.split('|')[0] for embed in re.findall(r'!\[\[(.*?)\]\]', content)]
    if not image_names:
        return

    dest_dir = os.path.join(HEXO_DIR, 'imgs')
    os.makedirs(dest_dir, exist_ok=True)

    def index_by_name(paths, index):
        # Keep the first path seen for each file name.
        for path in paths:
            index.setdefault(os.path.basename(path), path)
        return index

    md_dir = os.path.dirname(file_path)
    # Embeds carry bare file names, so they are compared against base names
    # rather than against the full paths returned by get_file_paths_r.
    local_index = index_by_name(utils_.get_file_paths_r(utils_.ospj(md_dir, 'imgs')), {})
    vault_index = None  # built lazily: scanning every imgs directory is slow

    for img_name in image_names:
        source = local_index.get(img_name)
        if source is None:
            if vault_index is None:
                vault_index = {}
                for imgs_dir in get_imgs_dir():
                    index_by_name(utils_.get_file_paths_r(imgs_dir), vault_index)
            source = vault_index.get(img_name)
        if source is None:
            print(f"Image {img_name} not found, skipped.")
            continue
        # shutil.copy avoids building a shell command from file names.
        shutil.copy(source, dest_dir)
        print(f"Copy {img_name} to {HEXO_DIR}/imgs/")

def ecopy_markdown2hexo(content, file_path, header):
    """Copy every note embedded via ``![[note]]`` into ``reference_blogs``.

    Each referenced note is looked up as ``<name>.md``, first next to
    ``file_path`` and then anywhere below ``BASE_DIR`` outside paths
    containing ``imgs``. The first hit is written to
    ``HEXO_DIR/reference_blogs/<name>.md`` with ``header`` prepended.

    Args:
        content: Markdown text to scan for embeds.
        file_path: Path of the Markdown file being converted.
        header: Text prepended to each copied note. ``None`` is treated as
            an empty string.
    """
    def referenced_notes():
        # Decide per embed whether it is media, instead of a regex lookahead
        # that scanned the rest of the line for an image extension.
        names = []
        for raw in re.findall(r'!\[\[([^\]\[]+?)\]\]', content):
            target = raw.split('|', 1)[0]           # drop "|alias" / "|size"
            name = target.split('#', 1)[0].strip()  # drop "#section"
            extension = name.rsplit('.', 1)[-1].lower() if '.' in name else ''
            if name and extension not in image_extensions and name not in names:
                names.append(name)
        return names

    md_dir = os.path.dirname(file_path)

    def locate(md_name):
        sibling = utils_.ospj(md_dir, md_name + '.md')
        if os.path.exists(sibling):
            return sibling
        for root, _, files in os.walk(BASE_DIR):
            if 'imgs' not in root and md_name + '.md' in files:
                return os.path.join(root, md_name + '.md')
        return None

    prefix = header or ''
    for md_name in referenced_notes():
        src_path = locate(md_name)
        if src_path is None:
            print(f"Referenced note {md_name}.md not found, skipped.")
            continue
        with open(src_path, 'r', encoding='utf-8') as src_file:
            md_content = prefix + src_file.read()
        hexo_md_path = os.path.join(HEXO_DIR, 'reference_blogs', md_name + '.md')
        # The target directory may not exist yet.
        os.makedirs(os.path.dirname(hexo_md_path), exist_ok=True)
        with open(hexo_md_path, 'w', encoding='utf-8') as hexo_md_file:
            hexo_md_file.write(md_content)
        print(f"Copied and added header to {src_path} to {hexo_md_path}")


def add_spaceLine_before_title(content):
    """Insert a blank line before every heading line.

    Any line after the first that starts with ``#`` gets an empty line
    inserted in front of it, whatever the heading level. Lines inside fenced
    code blocks (delimited by ``` or ~~~) are left alone, so ``#`` comments
    in code samples are not pulled apart.

    Args:
        content: Markdown text.

    Returns:
        The text with the blank lines added.
    """
    fence_markers = ('```', '~~~')
    lines = content.split('\n')
    spaced = lines[:1]
    in_fence = lines[0].lstrip().startswith(fence_markers)
    for line in lines[1:]:
        if line.lstrip().startswith(fence_markers):
            in_fence = not in_fence
        elif not in_fence and line.startswith('#'):
            spaced.append('')
        spaced.append(line)
    return '\n'.join(spaced)


def convert_custom_ids(content):
    """Turn block ids such as ``^764e2c`` into HTML anchors.

    An id is a ``^`` followed by exactly six ASCII letters or digits. It must
    be preceded by whitespace or the start of the text, and followed by
    whitespace or the end of the text. It is replaced by
    ``<a id="^764e2c"></a>`` and the surrounding whitespace is kept.

    Args:
        content: Markdown text.

    Returns:
        The text with block ids replaced.
    """
    block_id = re.compile(r'(^|\s)\^([a-zA-Z0-9]{6})(\s|$)')

    def to_anchor(found):
        before, ident, after = found.groups()
        return f'{before}<a id="^{ident}"></a>{after}'

    return block_id.sub(to_anchor, content)

def convert_custom_links(content):
    """Convert aliased block links to Markdown links.

    ``[[#^4c8bbb|2D VS 3D]]`` becomes ``[2D VS 3D](#^4c8bbb)``. The id must
    be exactly six ASCII letters or digits, and a ``|label`` is required.

    Args:
        content: Markdown text.

    Returns:
        The text with aliased block links rewritten.
    """
    aliased_block_link = re.compile(r'\[\[#\^([a-zA-Z0-9]{6})\|([^\]]+)\]\]')

    def to_markdown(found):
        block_id, label = found.group(1), found.group(2)
        return f'[{label}](#^{block_id})'

    return aliased_block_link.sub(to_markdown, content)


def find_and_replace_external_section_links(content, base_dir='/Users/jc/Documents/CodeFlying/notebook/hexo-blog/public/blog/'):
    """Replace ``[[note#Section]]`` / ``[[note#Section|label]]`` with page links.

    Spaces and underscores in the note name are replaced by hyphens and
    ``<name>.html`` is looked up anywhere below ``base_dir``. When found, the
    link becomes ``[label](/blog/<relative path>#<Section>)``, with spaces in
    the section replaced by hyphens. Without a label, ``note#Section`` is
    used as the link text. Unresolvable links are reported and kept as is.
    Embeds (``![[...]]``) are not touched.

    Args:
        content: Markdown text.
        base_dir: Root of the generated blog HTML tree. Defaults to the
            author's local Hexo output directory.

    Returns:
        The text with resolvable section links rewritten.
    """
    html_index = None  # file name -> first directory containing it

    def lookup(html_name):
        nonlocal html_index
        if html_index is None:
            # Walk the tree once, on first use, not once per link.
            html_index = {}
            for root, _, files in os.walk(base_dir):
                for entry in files:
                    if entry.endswith('.html'):
                        html_index.setdefault(entry, root)
        root = html_index.get(html_name)
        if root is None:
            return None
        relative = os.path.relpath(os.path.join(root, html_name), base_dir)
        # URLs always use forward slashes, whatever the OS separator is.
        return relative.replace(os.sep, '/')

    def replace_external_section_link(match):
        file_name, section_name, link_text = match.group(1), match.group(2), match.group(3)
        if not link_text:
            link_text = f"{file_name}#{section_name}"

        # Sanitize the file name to match the HTML file naming.
        sanitized_file_name = file_name.replace(' ', '-').replace('_', '-')
        found_path = lookup(f'{sanitized_file_name}.html')
        if found_path:
            # A literal space would end the URL of a Markdown link.
            anchor = section_name.replace(' ', '-')
            return f'[{link_text}](/blog/{found_path}#{anchor})'
        print(f"File {sanitized_file_name}.html not found in {base_dir}.")
        return match.group(0)

    # The file-name group excludes brackets and newlines, so one match cannot
    # start in an earlier "[[...]]" and swallow the text in between.
    pattern = r'(?<!!)\[\[([^\|\#\[\]\n]+)\#([^\|\]\n]+)\|?([^\]]*)?\]\]'
    return re.sub(pattern, replace_external_section_link, content)


def convert_headings_to_html(content):
    """Replace Markdown headings with ``<hN>`` tags carrying numbered ids.

    A heading is a line of one to six ``#`` characters, one space, then the
    text. Headings are numbered from 1 in order of appearance and the number
    goes into the ``id`` attribute. The heading text is stripped of
    surrounding whitespace.

    Args:
        content: Markdown text.

    Returns:
        The text with every heading line converted.
    """
    heading_re = re.compile(r'^(#{1,6}) (.*)$', flags=re.MULTILINE)
    pieces = []
    cursor = 0
    for number, found in enumerate(heading_re.finditer(content), start=1):
        level = len(found.group(1))
        heading_text = found.group(2).strip()
        heading_id = f"标题 {number}"
        # Copy the untouched text before this heading, then the new tag.
        pieces.append(content[cursor:found.start()])
        pieces.append(f'<h{level} id="{heading_id}">{heading_text}</h{level}>')
        cursor = found.end()
    pieces.append(content[cursor:])
    return ''.join(pieces)


# Inline tag list on its own line of the header, e.g. "tags: ['a', 'b']".
_HEADER_TAGS_RE = re.compile(r'^tags:[ \t]*\[([^\]\n]*)\][ \t]*$', re.MULTILINE)


def _parse_header_tags(header):
    """Return the tags of the header's inline ``tags: [...]`` line, unquoted."""
    found = _HEADER_TAGS_RE.search(header)
    if found is None:
        return []
    tags = [tag.strip().strip('\'"') for tag in found.group(1).split(',')]
    return [tag for tag in tags if tag]


def _set_header_tags(header, tags):
    """Return ``header`` with its tags line set to ``tags``."""
    tags_line = f"tags: {tags}"
    found = _HEADER_TAGS_RE.search(header)
    if found:
        return header[:found.start()] + tags_line + header[found.end():]
    if re.search(r'^tags:', header, re.MULTILINE):
        # Some other tags syntax (e.g. a multi-line list): leave it alone
        # rather than risk corrupting the YAML.
        return header
    # No tags line at all: insert one before the closing '---'.
    front = re.match(r'---\n.*?\n(?=---)', header, re.DOTALL)
    if front and tags:
        return header[:front.end()] + tags_line + '\n' + header[front.end():]
    return header


def convert_obsidian_to_hexo(file_path, header=None):
    """Convert one Obsidian Markdown file and write it to ``HEXO_DIR/_posts``.

    Steps, in order: merge ``#tag`` tokens from the note's first line into
    the header's tag list and prepend the header; rewrite same-note heading
    links; copy embedded notes; rewrite note embeds and wiki links; copy
    embedded images; add blank lines before headings; rewrite media embeds,
    block ids and aliased block links. The result is written under the
    source file's base name.

    Args:
        file_path: Path of the Obsidian Markdown file.
        header: Optional front-matter text to prepend. When it is ``None``
            or empty, nothing is prepended and the first line of the note is
            left untouched.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    # content = convert_front_matter(content)

    if header:
        # Locate the tags line by pattern rather than by a fixed line index,
        # and unquote the tags so that "'Tag'" is not re-quoted on output.
        header_tags = _parse_header_tags(header)
        content, header_tags = extract_and_add_tags(content, header_tags)
        header_tags = list(dict.fromkeys(header_tags))  # drop duplicates, keep order
        header = _set_header_tags(header, header_tags)
        content = add_header(content, header)

    content = convert_internal_links(content)
    ecopy_markdown2hexo(content, file_path, header or '')

    content = convert_custom_references(content)
    content = find_and_replace_custom_links(content)
    content = find_and_replace_external_section_links(content)
    ecopy_image2hexo(content, file_path)
    content = add_spaceLine_before_title(content)
    content = convert_media_embeds(content, file_path)
    # content = convert_tags(content)
    content = convert_custom_ids(content)
    content = convert_custom_links(content)

    # content = convert_headings_to_html(content)

    hexo_file_path = utils_.ospj(HEXO_DIR, '_posts', os.path.basename(file_path))
    os.makedirs(os.path.dirname(hexo_file_path), exist_ok=True)
    with open(hexo_file_path, 'w', encoding='utf-8') as file:
        file.write(content)

    print(f"Converted {file_path} to {hexo_file_path}")

# Root of the Obsidian vault. The functions above walk it to find `imgs`
# directories and referenced notes.
BASE_DIR = '/Users/jc/Documents/CodeFlying/notebook/notebooks'
# Hexo `source` directory. Output goes to its `_posts`, `imgs` and
# `reference_blogs` sub-directories.
HEXO_DIR = '/Users/jc/Documents/CodeFlying/notebook/hexo-blog/source'
# Timestamp formatted as "YYYY-MM-DD HH:MM:SS".
# NOTE(review): `current_time` is not referenced anywhere in the visible
# code; the `update:` field below is hard-coded — confirm whether it was
# meant to use this value.
current_time = datetime_now.strftime("%Y-%m-%d %H:%M:%S")
# Front matter and excerpt prepended to the converted post. The string is an
# f-string but contains no placeholders.
header = f"""---
title: 人体姿态估计&动作识别
date: 2023-08-15 20:21:36
update: 2024-07-05 23:54:48
categories: ActionDetection
tags: ['ActionDetection', 'TemporalActionDetection']
mathjax: true
---
The detail of TriDet.

<!--more -->

---
"""
# Note to convert. The conversion runs as soon as this module is executed.
ob_md = '/Users/jc/Documents/CodeFlying/notebook/notebooks/AI/otherTech/humanRepresentation/action_detection/PoseEstimation_TAD.md'
convert_obsidian_to_hexo(ob_md, header=header)

# convert_obsidian_to_hexo(ob_md)