0%

obsidian一键转hexo脚本

一键实现markdown的obsidian格式转为hexo格式的脚本。


功能基本实现,后续在使用中再进行优化,主要功能包括:

  1. 添加markdown头部信息,即 title,tags,categories等信息。
  2. 把obsidian的图片引用转换为hexo的图片引用。
  3. 把obsidian的图片复制到hexo文件夹“imgs”中(我的obsidian的图片也都放在了各级目录的“imgs”文件夹中)。
  4. 把obsidian的其他文章内容引用转换为hexo的其他文章内容引用格式。hexo的格式为:

    1
    {% include_custom "reference_blogs/conceptAI.md" "##Softmax求梯度的公式推导" %}

    需和hexo中引用其他文档片段的设置 一起服用。

  5. 把obsidian中被引用的文章复制到hexo文件夹“reference_blogs”中。
  6. 对标题前加空格。

其中utils_是我封装的工具模块,脚本用到了utils_.get_file_paths_r(imgs_dir),其主要功能是递归得到该文件夹中所有文件的路径。
工具函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def is_except_pattern(name: str, except_pattern: Dict[str, Union[str, list]]) -> bool:
    """Return True if ``name`` matches at least one exclusion pattern.

    Args:
        name: the string to test (``get_file_paths_r`` passes a file name).
        except_pattern: maps a position flag -- ``'start'``, ``'mid'`` or
            ``'end'`` -- to one literal pattern or a list of literal
            patterns. Keys other than those three are ignored.

    Returns:
        True as soon as one pattern matches, otherwise False (the previous
        implementation fell off the end and returned None).

    Patterns are treated as literal text (they go through ``re.escape``).
    A ``'start'`` pattern requires at least one character after it and an
    ``'end'`` pattern at least one character before it, so a name that is
    exactly equal to the pattern does not match.
    """
    flag_l = ("start", "mid", "end")

    def build_pattern(flag: str, pattern: str) -> str:
        # Internal invariant: only flags taken from flag_l ever reach here.
        assert flag in flag_l, "flag is illegal!"
        escaped = re.escape(pattern)
        if flag == "start":
            return "^" + escaped + ".+"
        if flag == "end":
            return ".+" + escaped + "$"
        return ".*" + escaped + ".*"

    for flag in flag_l:
        if flag not in except_pattern:
            continue
        patterns = except_pattern[flag]
        # Accept a single string as shorthand for a one-element list.
        if isinstance(patterns, str):
            patterns = [patterns]
        if any(re.match(build_pattern(flag, p), name) for p in patterns):
            return True
    return False

def get_file_paths_r(file_dir: str, except_pattern: Union[Dict[str, Union[str, list]], None] = None) -> list:
    """Collect the paths of all files below ``file_dir``, recursively.

    Args:
        file_dir: the directory to walk. A directory that does not exist
            yields an empty list (``os.walk`` reports nothing for it).
        except_pattern: optional exclusion rules in the form expected by
            ``is_except_pattern`` -- a dict keyed by ``'start'``, ``'mid'``
            and/or ``'end'``. Files whose *name* (not full path) matches
            are skipped. ``None`` or an empty dict disables filtering.

    Returns:
        A list of file paths, each built as ``os.path.join(root, name)``.
        The walk is bottom-up (``topdown=False``), so files in
        sub-directories are listed before files in their parents.
    """
    origin_path_list = []
    for root, _, files in os.walk(file_dir, topdown=False):
        origin_path_list.extend(
            os.path.join(root, name)
            for name in files
            if not (except_pattern and is_except_pattern(name, except_pattern))
        )
    return origin_path_list

obsidian一键转 hexo脚本:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
import re
import os
import sys
import shutil
utils_dir = os.environ.get('utils_dir')
print(utils_dir)
sys.path.append(utils_dir)
import utils_
from utils_ import datetime_now

image_extensions = ['png', 'jpg', 'jpeg', 'gif', 'svg', 'bmp', 'tiff', 'webp', 'mp4']

def convert_front_matter(content):
    """Convert the YAML front matter from Obsidian style to Hexo style.

    Only the first ``---`` ... ``---`` block is treated as front matter.
    Inside it an inline list such as ``tags: [a, b]`` is expanded to the
    multi-line form::

        tags:
         - a
         - b

    Args:
        content: the full Markdown text.

    Returns:
        The text with its front matter rewritten; unchanged when there is
        no front matter block.
    """
    def expand_tags(match):
        tags = [tag.strip() for tag in match.group(1).split(',') if tag.strip()]
        if not tags:
            # Nothing to expand; keep the original text instead of emitting
            # a dangling " - " entry.
            return match.group(0)
        return 'tags:\n - ' + '\n - '.join(tags)

    def rewrite_front_matter(match):
        body = re.sub(r'tags: \[(.*?)\]', expand_tags, match.group(1))
        return f'---\n{body}\n---'

    # A callable replacement keeps backslashes in the front matter literal
    # (a replacement *string* would be parsed for escapes), and count=1
    # stops later horizontal rules from being overwritten with a copy of
    # the front matter.
    return re.sub(r'---\n(.*?)\n---', rewrite_front_matter, content,
                  count=1, flags=re.DOTALL)


def extract_and_add_tags(content, existing_tags):
    """Move ``#tag`` tokens from the first line of ``content`` into a tag list.

    The first line counts as a tag line only when it is non-empty and every
    whitespace-separated token on it starts with ``#`` (e.g. ``#ai #notes``).
    A Markdown heading such as ``# Title`` is therefore left alone, and so
    is a blank first line.

    Args:
        content: the Markdown text.
        existing_tags: list of tags collected so far. It is extended in
            place and also returned; tags already present are not added
            a second time.

    Returns:
        ``(content, existing_tags)`` where ``content`` has its first line
        removed if that line was a tag line.
    """
    first_line, _, remainder = content.partition('\n')
    tokens = first_line.split()
    # An empty token list would make all() vacuously true and swallow a
    # blank first line, so it is rejected explicitly.
    if not tokens or not all(token.startswith('#') for token in tokens):
        return content, existing_tags

    for token in tokens:
        tag = token[1:]
        if tag and tag not in existing_tags:
            existing_tags.append(tag)
    return remainder, existing_tags


def convert_internal_links(content):
    """Convert Obsidian same-note heading links to standard Markdown links.

    ``[[#My Section]]`` becomes ``[My Section](#My-Section)`` and
    ``[[#My Section|label]]`` becomes ``[label](#My-Section)``.

    Aliased block references of the form ``[[#^xxxxxx|label]]`` (six word
    characters after ``^``) are deliberately left untouched by the negative
    lookahead in the pattern.

    Args:
        content: the Markdown text.

    Returns:
        The text with the heading links rewritten.
    """
    def replace_link(match):
        # Split off an optional "|alias" so it does not leak into the anchor.
        section_name, _, alias = match.group(1).partition('|')
        # Spaces become hyphens in the anchor; case is kept as written.
        anchor = section_name.replace(' ', '-')
        return f'[{alias or section_name}](#{anchor})'

    return re.sub(r'\[\[#(?!\^[\w]{6}\|.*?)(.*?)\]\]', replace_link, content)

def convert_media_embeds(content, file_path):
    """Convert Obsidian media embeds to HTML ``<img>`` / ``<video>`` tags.

    * ``![[image.png]]`` becomes ``<img src="/imgs/image.png">``.
    * ``![[image.png|200]]`` becomes
      ``<img src="/imgs/image.png" alt="" width="200px">``.
    * Names ending in ``.mp4``, ``.webm`` or ``.ogg`` become a ``<video>``
      element whose ``type`` matches the extension.

    Every ``![[...]]`` still present in ``content`` is rewritten, so note
    embeds must have been converted before this is called.

    Args:
        content: the Markdown text.
        file_path: path of the note being converted; currently unused and
            kept for call compatibility.

    Returns:
        The text with the embeds replaced by HTML.
    """
    video_mime_types = {
        '.mp4': 'video/mp4',
        '.webm': 'video/webm',
        '.ogg': 'video/ogg',
    }

    def replace_media_embed(match):
        # partition() never raises, unlike a two-name unpack of split('|')
        # when the embed contains more than one pipe.
        media_name, _, media_size = match.group(1).partition('|')

        lowered = media_name.lower()
        for suffix, mime_type in video_mime_types.items():
            if lowered.endswith(suffix):
                return (f'<video controls width="640" height="360">'
                        f'<source src="/imgs/{media_name}" type="{mime_type}"></video>')

        if media_size:
            return f'<img src="/imgs/{media_name}" alt="" width="{media_size}px">'
        return f'<img src="/imgs/{media_name}">'

    return re.sub(r'!\[\[(.*?)\]\]', replace_media_embed, content)


# def convert_custom_references(content):

# """

# Convert Obsidian custom references to Hexo custom include tags,

# excluding a broader range of image references and handling additional info in references.

# """

# def replace_reference(match):

# file_section = match.group(1)

# # 分割 file_section 以处理额外信息,如 "lw206.jpeg|475"

# file_name_with_optional_info = file_section.split('|', 1)[0]

# if '.' in file_name_with_optional_info and file_name_with_optional_info.split('.')[-1].lower() in image_extensions:

# # It's an image reference, return it unchanged

# return f'{% include_custom "reference_blogs/{file_section}.md" %}'

# if '#' in file_name_with_optional_info:

# file_name, section_name = file_name_with_optional_info.split('#', 1)

# file_name = file_name.strip()

# section_name = section_name.strip()

# return f'{{% include_custom reference_blogs/{file_name}.md "{section_name}" %}}'

# else:

# # No section name provided

# return f'{{% include_custom reference_blogs/{file_name_with_optional_info}.md %}}'


# image_extensions = ['png', 'jpg', 'jpeg', 'gif', 'svg', 'bmp', 'tiff', 'webp']

# # 调整正则表达式以排除更多类型的图片文件

# image_extensions_pattern = '|'.join(image_extensions)

# pattern = rf'!\[\[((?!.*\.({image_extensions_pattern})$).*?)\]\]'

# return re.sub(pattern, replace_reference, content)

def convert_custom_references(content):
    """Convert Obsidian note embeds to Hexo ``include_custom`` tags.

    * ``![[note]]`` becomes
      ``{% include_custom "reference_blogs/note.md" %}``.
    * ``![[note#Section]]`` becomes
      ``{% include_custom "reference_blogs/note.md" "#Section" %}``; the
      leading ``#`` characters of the section are kept because they encode
      the heading level.
    * Embeds whose target ends in one of the module-level
      ``image_extensions`` (e.g. ``![[pic.jpeg|475]]``) are returned
      unchanged so that the media conversion can handle them later.

    Args:
        content: the Markdown text.

    Returns:
        The text with note embeds replaced by include tags.
    """
    def replace_reference(match):
        # Drop an optional "|extra" part, e.g. the 475 in "lw206.jpeg|475".
        target = match.group(1).split('|', 1)[0]

        if '.' in target and target.split('.')[-1].lower() in image_extensions:
            # Image/media embed: leave the original text exactly as it was.
            return match.group(0)

        if '#' in target:
            file_name, section_name = target.split('#', 1)
            file_name = file_name.strip()
            # Keep a leading "#" so the heading level stays identifiable.
            section_name = '#' + section_name.strip()
            return f'{{% include_custom "reference_blogs/{file_name}.md" "{section_name}" %}}'

        # No section name provided.
        return f'{{% include_custom "reference_blogs/{target}.md" %}}'

    # Images are filtered inside replace_reference. The previous
    # "$"-anchored lookahead never identified an image embed but could skip
    # a note embed when the last line of the text ended in an image
    # extension.
    return re.sub(r'!\[\[(.*?)\]\]', replace_reference, content)


def find_and_replace_custom_links(content, base_dir='/Users/jc/Documents/CodeFlying/notebook/hexo-blog/public/blog/'):
    """Rewrite ``[[Note]]`` / ``[[Note|label]]`` links to generated blog pages.

    For each wiki link the generated site under ``base_dir`` is searched
    for ``<Note>.html`` (spaces replaced by hyphens; if that fails, with
    underscores replaced by hyphens as well). When found, the link becomes
    ``[label](/blog/<relative path>/<Note>.html)``; otherwise a message is
    printed and the link is left as it was.

    Embeds (``![[...]]``) are not touched.

    Args:
        content: the Markdown text.
        base_dir: root of the generated ``blog`` directory to search.
            Defaults to the author's local Hexo ``public/blog`` folder.

    Returns:
        The text with resolvable links rewritten.
    """
    html_index = None  # file name -> URL path relative to base_dir, built lazily

    def build_index():
        index = {}
        for root, _, files in os.walk(base_dir):
            relative_dir = os.path.relpath(root, base_dir)
            for file in files:
                if not file.endswith('.html'):
                    continue
                if relative_dir == '.':
                    url_path = file
                else:
                    # URLs always use "/", whatever the OS separator is.
                    url_path = '/'.join(relative_dir.split(os.sep) + [file])
                # setdefault keeps the first hit in walk order.
                index.setdefault(file, url_path)
        return index

    def replace_custom_link(match):
        nonlocal html_index
        full_text = match.group(0)
        file_name, link_text = match.group(1), match.group(2)
        if not link_text:
            link_text = file_name

        # Sanitize the file name to match the HTML file naming.
        sanitized_file_name = file_name.replace(' ', '-')
        candidates = [sanitized_file_name, sanitized_file_name.replace('_', '-')]

        if html_index is None:
            # One directory walk serves every link in the document.
            html_index = build_index()

        for candidate in candidates:
            found_path = html_index.get(f'{candidate}.html')
            if found_path:
                return f'[{link_text}](/blog/{found_path})'

        print(f"File {sanitized_file_name}.html not found in {base_dir}.")
        return full_text

    pattern = r'(?<!!)\[\[([^|\]]+)\|?([^]]*)?\]\]'
    return re.sub(pattern, replace_custom_link, content)



def convert_tags(content):
    """Strip inline Obsidian tags from the text.

    Every ``#`` that is directly followed by one or more word characters is
    removed together with those characters. Nothing is added to the front
    matter here; the matched text is simply dropped.
    """
    inline_tag = re.compile(r'#(\w+)')
    stripped = inline_tag.sub('', content)
    return stripped

def add_header(content, header):
    """Return ``content`` with ``header`` placed directly in front of it.

    No separator is inserted; the header is expected to carry its own
    trailing newline if one is wanted.
    """
    combined = header + content
    return combined

def get_imgs_dir():
    """Return the paths of all directories named exactly ``imgs`` under ``BASE_DIR``.

    The module-level ``BASE_DIR`` tree is walked top-down and every
    sub-directory whose name equals ``'imgs'`` is reported, in walk order.
    """
    return [
        os.path.join(parent, child)
        for parent, children, _ in os.walk(BASE_DIR)
        for child in children
        if child == 'imgs'
    ]

def ecopy_image2hexo(content, file_path):
    """Copy every file embedded with ``![[...]]`` into the Hexo ``imgs`` folder.

    For each embed the file is looked up by base name, first in the
    ``imgs`` directory next to the note and then in every ``imgs``
    directory returned by ``get_imgs_dir()``. The first match is copied to
    ``HEXO_DIR + '/imgs/'``; names that cannot be found are reported and
    skipped.

    Args:
        content: the Markdown text to scan for embeds.
        file_path: path of the note; its directory locates the local
            ``imgs`` folder.
    """
    # "name|size" -> "name"; dict.fromkeys drops repeats but keeps order.
    image_names = list(dict.fromkeys(
        embed.split('|')[0] for embed in re.findall(r'!\[\[(.*?)\]\]', content)
    ))
    if not image_names:
        return

    def index_by_basename(paths, index):
        for path in paths:
            # First occurrence wins.
            index.setdefault(os.path.basename(path), path)

    md_dir = os.path.dirname(file_path)
    local_index = {}
    index_by_basename(utils_.get_file_paths_r(utils_.ospj(md_dir, 'imgs')), local_index)

    vault_index = None  # built on first miss, so the vault is walked at most once

    for img_name in image_names:
        src_path = local_index.get(img_name)
        if src_path is None:
            if vault_index is None:
                vault_index = {}
                for imgs_dir in get_imgs_dir():
                    index_by_basename(utils_.get_file_paths_r(imgs_dir), vault_index)
            src_path = vault_index.get(img_name)

        if src_path is None:
            print(f"Image {img_name} not found in any imgs directory, skipped.")
            continue

        # shutil.copy avoids the shell, so names with spaces or shell
        # metacharacters are safe.
        shutil.copy(src_path, HEXO_DIR + '/imgs/')
        print(f"Copy {img_name} to {HEXO_DIR}/imgs/")

def ecopy_markdown2hexo(content, file_path, header):
    """Copy every note embedded with ``![[note]]`` into Hexo's ``reference_blogs``.

    Embeds whose target ends in one of the module-level
    ``image_extensions`` are ignored. For the remaining embeds the optional
    ``|alias`` and ``#section`` parts are stripped to obtain the note name.
    ``<name>.md`` is looked up next to the current note first and then
    anywhere under ``BASE_DIR`` (directories named ``imgs`` excluded); the
    first match is written to ``HEXO_DIR/reference_blogs/<name>.md`` with
    ``header`` prepended.

    Args:
        content: the Markdown text to scan for embeds.
        file_path: path of the note being converted.
        header: text to prepend to every copied note; ``None`` is treated
            as an empty string.
    """
    prefix = header or ''

    def is_image(target):
        return '.' in target and target.rsplit('.', 1)[-1].lower() in image_extensions

    def copy_with_header(src_path, md_name):
        # Read the source note and write it, header first, into reference_blogs.
        with open(src_path, 'r', encoding='utf-8') as src_file:
            src_content = src_file.read()
        hexo_md_path = os.path.join(HEXO_DIR, 'reference_blogs', md_name + '.md')
        os.makedirs(os.path.dirname(hexo_md_path), exist_ok=True)
        with open(hexo_md_path, 'w', encoding='utf-8') as hexo_md_file:
            hexo_md_file.write(prefix + src_content)
        print(f"Copied and added header to {src_path} to {hexo_md_path}")

    def find_in_vault(md_name):
        # Return the first "<md_name>.md" under BASE_DIR outside imgs folders.
        wanted = md_name + '.md'
        for root, _, files in os.walk(BASE_DIR):
            if 'imgs' in root.split(os.sep):
                continue
            if wanted in files:
                return os.path.join(root, wanted)
        return None

    # Each embed is classified on its own, so an image that appears later
    # on the same line no longer hides a note embed.
    md_names = []
    for embed in re.findall(r'!\[\[([^\]\[]+)\]\]', content):
        target = embed.split('|', 1)[0]
        if is_image(target):
            continue
        md_name = target.split('#', 1)[0].strip()
        if md_name and md_name not in md_names:
            md_names.append(md_name)

    md_dir = os.path.dirname(file_path)
    for md_name in md_names:
        md_path = utils_.ospj(md_dir, md_name + '.md')
        if not os.path.exists(md_path):
            md_path = find_in_vault(md_name)
        if md_path is None:
            print(f"Referenced note {md_name}.md not found, skipped.")
            continue
        copy_with_header(md_path, md_name)


def add_spaceLine_before_title(content):
    """Make sure every Markdown heading is preceded by a blank line.

    Works for all heading levels (``#`` to ``######``). Only real ATX
    headings are considered: one to six ``#`` followed by whitespace or the
    end of the line. Lines inside fenced code blocks (``` or ~~~) are left
    alone, so ``#`` comments in code samples are not pushed apart. A
    heading that already follows a blank line, or that is the very first
    line, gets no extra blank line, which makes the function safe to run
    repeatedly.

    Args:
        content: the Markdown text.

    Returns:
        The text with blank lines inserted where needed.
    """
    heading_re = re.compile(r'#{1,6}(?:\s|$)')
    fence_re = re.compile(r'\s*(?:```|~~~)')

    lines = content.split('\n')
    result = []
    in_fence = False
    for index, line in enumerate(lines):
        if fence_re.match(line):
            # Opening or closing fence: toggle and keep the line as is.
            in_fence = not in_fence
        elif (not in_fence and index > 0 and heading_re.match(line)
              and lines[index - 1].strip()):
            result.append('')
        result.append(line)
    return '\n'.join(result)


def convert_custom_ids(content):
    """Turn Obsidian block ids such as ``^764e2c`` into HTML anchors.

    A block id is ``^`` followed by exactly six ASCII letters or digits,
    standing alone: it must be preceded by whitespace or the start of the
    text and followed by whitespace or the end of the text. Each one is
    replaced by ``<a id="^764e2c"></a>``.

    Lookarounds are used for the surrounding whitespace so that it is not
    consumed by the match; two ids separated by a single space are
    therefore both converted.

    Args:
        content: the Markdown text.

    Returns:
        The text with block ids replaced by anchors.
    """
    return re.sub(r'(?<!\S)\^([a-zA-Z0-9]{6})(?!\S)', r'<a id="^\1"></a>', content)

def convert_custom_links(content):
    """Rewrite aliased block-reference links as Markdown anchor links.

    ``[[#^4c8bbb|2D VS 3D]]`` becomes ``[2D VS 3D](#^4c8bbb)``. The id must
    be exactly six ASCII letters or digits and the label must be non-empty.
    """
    block_link = re.compile(r'\[\[#\^([a-zA-Z0-9]{6})\|([^\]]+)\]\]')
    return block_link.sub(r'[\2](#^\1)', content)


def find_and_replace_external_section_links(content, base_dir='/Users/jc/Documents/CodeFlying/notebook/hexo-blog/public/blog/'):
    """Rewrite ``[[Note#Section]]`` / ``[[Note#Section|label]]`` links.

    The generated site under ``base_dir`` is searched for ``<Note>.html``
    (spaces and underscores in the note name replaced by hyphens). When
    found, the link becomes
    ``[label](/blog/<relative path>/<Note>.html#<Section>)`` with spaces in
    the section replaced by hyphens so the Markdown link destination stays
    valid. Without a label, ``Note#Section`` is used as the link text. When
    the page is not found a message is printed and the link is left as is.

    Embeds (``![[...]]``) are not touched.

    Args:
        content: the Markdown text.
        base_dir: root of the generated ``blog`` directory to search.
            Defaults to the author's local Hexo ``public/blog`` folder.

    Returns:
        The text with resolvable links rewritten.
    """
    html_index = None  # file name -> URL path relative to base_dir, built lazily

    def build_index():
        index = {}
        for root, _, files in os.walk(base_dir):
            relative_dir = os.path.relpath(root, base_dir)
            for file in files:
                if not file.endswith('.html'):
                    continue
                if relative_dir == '.':
                    url_path = file
                else:
                    # URLs always use "/", whatever the OS separator is.
                    url_path = '/'.join(relative_dir.split(os.sep) + [file])
                # setdefault keeps the first hit in walk order.
                index.setdefault(file, url_path)
        return index

    def replace_external_section_link(match):
        nonlocal html_index
        full_text = match.group(0)
        file_name, section_name, link_text = match.group(1), match.group(2), match.group(3)
        if not link_text:
            link_text = f"{file_name}#{section_name}"

        # Sanitize the file name to match the HTML file naming.
        sanitized_file_name = file_name.replace(' ', '-').replace('_', '-')

        if html_index is None:
            # One directory walk serves every link in the document.
            html_index = build_index()

        found_path = html_index.get(f'{sanitized_file_name}.html')
        if found_path:
            anchor = section_name.replace(' ', '-')
            return f'[{link_text}](/blog/{found_path}#{anchor})'

        print(f"File {sanitized_file_name}.html not found in {base_dir}.")
        return full_text

    # The note-name group excludes brackets and newlines so a match cannot
    # start at an earlier, unrelated "[[...]]" and swallow the real link.
    pattern = r'(?<!!)\[\[([^\|\#\[\]\n]+)\#([^\|\]]+)\|?([^\]]*)?\]\]'
    return re.sub(pattern, replace_external_section_link, content)


def convert_headings_to_html(content):
    """Replace Markdown headings with ``<hN>`` elements carrying numbered ids.

    Every line made of one to six ``#``, a single space and the heading
    text is turned into ``<hN id="标题 K">text</hN>`` where ``N`` is the
    number of ``#`` and ``K`` counts the headings from 1 in document order.
    The heading text is stripped of surrounding whitespace.
    """
    converted = []  # one entry per heading handled so far; its length drives the id

    def to_html(found):
        hashes, raw_text = found.group(1), found.group(2)
        converted.append(raw_text)
        depth = len(hashes)
        heading_id = f"标题 {len(converted)}"
        return f'<h{depth} id="{heading_id}">{raw_text.strip()}</h{depth}>'

    heading_line = re.compile(r'^(#{1,6}) (.*)$', flags=re.MULTILINE)
    return heading_line.sub(to_html, content)


def convert_obsidian_to_hexo(file_path, header=None):
    """Convert one Obsidian Markdown note to a Hexo post.

    The note is read, optionally given a front-matter header, run through
    the link/embed/heading conversions of this module, and written to
    ``HEXO_DIR/_posts/<same file name>``. Referenced notes and images are
    copied into the Hexo source tree along the way.

    Args:
        file_path: path of the Obsidian note.
        header: optional front-matter text to prepend. When it contains a
            ``tags: [...]`` line, ``#tag`` tokens found on the first line
            of the note are merged into that list and the first line is
            removed from the note. With no header (``None`` or empty) the
            note text is left as it is and nothing is prepended.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    # content = convert_front_matter(content)

    if header:
        header_lines = header.split('\n')
        # Locate the tags line by content rather than by a fixed index.
        tags_index = next(
            (i for i, line in enumerate(header_lines) if line.startswith('tags:')),
            None,
        )
        header_tags = []
        if tags_index is not None:
            listed = re.findall(r'tags: \[([^\]]+)\]', header_lines[tags_index])
            if listed:
                # Strip the quotes of each item; otherwise they would be
                # quoted a second time when the list is written back.
                header_tags = [
                    tag.strip().strip('\'"')
                    for tag in listed[0].split(',')
                    if tag.strip()
                ]

        # Extract and add tags from the first line of the note.
        content, header_tags = extract_and_add_tags(content, header_tags)

        if tags_index is not None:
            header_lines[tags_index] = f"tags: {header_tags}"
            header = '\n'.join(header_lines)
        elif header_tags:
            print(f"Header has no 'tags:' line; tags {header_tags} were not written.")

        content = add_header(content, header)

    content = convert_internal_links(content)
    # Referenced notes get the same header; pass '' when there is none.
    ecopy_markdown2hexo(content, file_path, header or '')

    content = convert_custom_references(content)
    content = find_and_replace_custom_links(content)
    content = find_and_replace_external_section_links(content)
    # Images are copied before the embeds are rewritten to HTML, because
    # the copy step finds them through the ![[...]] syntax.
    ecopy_image2hexo(content, file_path)
    content = add_spaceLine_before_title(content)
    content = convert_media_embeds(content, file_path)
    # content = convert_tags(content)
    content = convert_custom_ids(content)
    content = convert_custom_links(content)

    # content = convert_headings_to_html(content)

    hexo_file_path = utils_.ospj(HEXO_DIR, '_posts', os.path.basename(file_path))
    with open(hexo_file_path, 'w', encoding='utf-8') as file:
        file.write(content)

    print(f"Converted {file_path} to {hexo_file_path}")

# Root of the Obsidian vault. get_imgs_dir() and ecopy_markdown2hexo() walk
# this tree when a referenced image or note is not next to the current note.
BASE_DIR = '/Users/jc/Documents/CodeFlying/notebook/notebooks'
# Hexo "source" directory. Posts are written to <HEXO_DIR>/_posts, images
# to <HEXO_DIR>/imgs and embedded notes to <HEXO_DIR>/reference_blogs.
HEXO_DIR = '/Users/jc/Documents/CodeFlying/notebook/hexo-blog/source'
# Formatted timestamp taken from utils_.datetime_now; not referenced by any
# other line of this listing.
current_time = datetime_now.strftime("%Y-%m-%d %H:%M:%S")
# Front matter plus excerpt prepended to the converted post (and to every
# copied reference note). The "tags:" line is read and rewritten by
# convert_obsidian_to_hexo. The f-prefix has no effect here because the
# string contains no placeholders.
header = f"""---
title: 人体姿态估计&动作识别
date: 2023-08-15 20:21:36
update: 2024-07-05 23:54:48
categories: ActionDetection
tags: ['ActionDetection', 'TemporalActionDetection']
mathjax: true
---
The detail of TriDet.

<!--more -->

---
"""
# The note to convert. The conversion runs as soon as this module is executed
# or imported.
ob_md = '/Users/jc/Documents/CodeFlying/notebook/notebooks/AI/otherTech/humanRepresentation/action_detection/PoseEstimation_TAD.md'
convert_obsidian_to_hexo(ob_md, header=header)

# convert_obsidian_to_hexo(ob_md)