PracticeDev/migrate-blog/migrate_from_hexo_to_wordpress.py

131 lines
4.3 KiB
Python
Raw Normal View History

2024-09-04 16:20:19 +08:00
import os
import re
import yaml
import markdown
from wordpress_xmlrpc import Client, WordPressPost
from wordpress_xmlrpc.methods import posts
from datetime import datetime
# WordPress connection settings.
# Each value can be overridden through an environment variable so that
# credentials do not have to be edited into this file; the literals are only
# the fallbacks used when the corresponding variable is unset.
# NOTE(review): a real-looking password is committed here - rotate it and
# supply WP_PASSWORD from the environment instead.
wp_url = os.environ.get("WP_XMLRPC_URL", "https://blog.lnf1.skybyte.me/xmlrpc.php")
wp_username = os.environ.get("WP_USERNAME", "songtianlun")
wp_password = os.environ.get("WP_PASSWORD", "sotilu,WP2024")
# Hexo article directory: the tree that main() walks looking for *.md posts.
hexo_root_dir = os.environ.get("HEXO_SOURCE_DIR", "/home/songtianlun/Sync/Develop/frytea/source")
def _parse_front_matter_fallback(front_matter_str):
    """Parse front matter line by line when PyYAML rejects it.

    Lines of the form ``key: value`` start a new key (an empty value is
    stored as ``None``). Any other non-blank line is attached to the most
    recent key: ``- item`` lines are collected into a list with the list
    marker removed, and plain continuation lines are appended to the
    existing value.
    """
    front_matter = {}
    current_key = None
    for raw_line in front_matter_str.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        is_list_item = line.startswith('-')
        if ':' in line and not is_list_item:
            key, value = line.split(':', 1)
            current_key = key.strip()
            front_matter[current_key] = value.strip() or None
            continue

        if not current_key:
            # Nothing to attach this line to yet.
            continue

        # Drop the YAML list marker so values are "tag", not "- tag".
        item = line[1:].strip() if is_list_item else line
        if not item:
            continue

        existing = front_matter[current_key]
        if isinstance(existing, list):
            existing.append(item)
        elif existing is None:
            # A first "- item" under an empty key starts a list.
            front_matter[current_key] = [item] if is_list_item else item
        else:
            front_matter[current_key] = [existing, item]
    return front_matter


def parse_hexo_post(file_path):
    """Split a Hexo Markdown file into its front matter and its body.

    Args:
        file_path: Path of the Markdown file to read.

    Returns:
        A ``(front_matter, post_content)`` tuple. ``front_matter`` is always
        a dict (empty when the file has no usable front matter) and
        ``post_content`` is the Markdown text that follows it.
    """
    # "utf-8-sig" also accepts plain UTF-8 but drops a leading BOM, which
    # would otherwise stop the front-matter pattern from matching at all.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        content = f.read()

    # Extract the front matter, allowing flexible delimiters (runs of 3+
    # '-' or '*').
    front_matter_match = re.match(r'^[-*]{3,}\s+(.*?)\s+[-*]{3,}\s*(.*)', content, re.DOTALL)
    if not front_matter_match:
        return {}, content

    front_matter_str = front_matter_match.group(1)
    post_content = front_matter_match.group(2).strip()

    # Parse the front matter with PyYAML; fall back to the manual parser if
    # the YAML is malformed.
    try:
        front_matter = yaml.safe_load(front_matter_str)
    except yaml.YAMLError:
        front_matter = _parse_front_matter_fallback(front_matter_str)

    if front_matter is None:
        # Valid but empty YAML (e.g. only comments): no metadata.
        return {}, post_content
    if not isinstance(front_matter, dict):
        # The delimited text was not a mapping, so it was not front matter
        # (for example a post that opens with a horizontal rule). Keep the
        # whole file as the body instead of discarding that text.
        return {}, content
    return front_matter, post_content
def get_categories_from_path(file_path, root_dir):
    """Derive category names from the directories between root_dir and the file.

    The file's directory, taken relative to ``root_dir``, is split on the OS
    path separator. Empty segments and the ``_posts`` folder are dropped;
    the remaining folder names are returned in path order.
    """
    parent_dir, _ = os.path.split(os.path.relpath(file_path, root_dir))
    found = []
    for segment in parent_dir.split(os.sep):
        if not segment or segment == '_posts':
            continue
        found.append(segment)
    return found
def migrate_to_wordpress(title, content, categories, tags, date, author):
    """Create one published WordPress post through XML-RPC.

    A new client is opened for every call using the module-level ``wp_url``,
    ``wp_username`` and ``wp_password``. The Markdown ``content`` is rendered
    with ``markdown.markdown`` before upload, and ``categories`` / ``tags``
    are attached as ``category`` / ``post_tag`` term names.

    Args:
        title: Post title.
        content: Post body as Markdown text.
        categories: Category names for the post.
        tags: Tag names for the post.
        date: Publication date; only set on the post when truthy.
        author: Accepted for the caller's convenience but currently unused.

    Returns:
        The result of the ``posts.NewPost`` call.
    """
    wp_client = Client(wp_url, wp_username, wp_password)

    new_post = WordPressPost()
    new_post.title = title
    new_post.content = markdown.markdown(content)
    new_post.post_status = 'publish'
    new_post.terms_names = {'category': categories, 'post_tag': tags}
    if date:
        new_post.date = date

    return wp_client.call(posts.NewPost(new_post))
def parse_date(date_value):
    """Convert a front-matter date value into a ``datetime``.

    Args:
        date_value: A ``datetime`` (returned unchanged), a ``date`` (as
            produced by YAML for date-only values such as ``2024-09-04``;
            promoted to midnight of that day), or a string in one of the
            supported formats. Surrounding whitespace in strings is ignored.

    Returns:
        The parsed ``datetime``, or ``None`` when the value is of another
        type or matches none of the supported formats.
    """
    # Local import: the module only imports the ``datetime`` class, and
    # ``datetime.date`` on that class is a method, not the ``date`` type.
    from datetime import date as date_type

    if isinstance(date_value, datetime):
        return date_value
    if isinstance(date_value, date_type):
        # datetime is a subclass of date, so this branch only sees pure dates.
        return datetime(date_value.year, date_value.month, date_value.day)
    if not isinstance(date_value, str):
        return None

    text = date_value.strip()
    date_formats = (
        "%Y-%m-%d %H:%M:%S",
        "%Y-%m-%d %H:%M:%S%z",
        "%Y-%m-%d %H:%M:%S %z",
        "%Y-%m-%dT%H:%M:%S",
        "%Y-%m-%dT%H:%M:%S%z",
        "%Y-%m-%d",
    )
    for fmt in date_formats:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue
    return None
def _normalize_terms(value, split_commas=False):
    """Coerce a front-matter tags/categories value into a flat list of strings.

    ``None`` becomes an empty list, a string becomes a one-element list (or
    is split on commas when ``split_commas`` is true), nested lists are
    flattened, and any other scalar is converted with ``str``. Blank entries
    are dropped.
    """
    if value is None:
        return []
    if isinstance(value, str):
        parts = value.split(',') if split_commas else [value]
        return [part.strip() for part in parts if part.strip()]
    if isinstance(value, (list, tuple)):
        terms = []
        for item in value:
            terms.extend(_normalize_terms(item))
        return terms
    return [str(value)]


def main():
    """Walk the Hexo source tree and publish every Markdown file to WordPress.

    For each ``*.md`` file under ``hexo_root_dir`` the front matter is parsed,
    categories from the front matter are merged with those implied by the
    file's directory, and the post is uploaded. A short summary is printed
    for every migrated post.
    """
    for root, _dirs, files in os.walk(hexo_root_dir):
        for filename in files:
            if not filename.endswith('.md'):
                continue

            file_path = os.path.join(root, filename)
            front_matter, content = parse_hexo_post(file_path)

            # "title:" with no value parses to None, and numeric titles parse
            # to numbers; neither has .strip(), so normalise before stripping.
            raw_title = front_matter.get('title')
            title = 'Untitled' if raw_title is None else str(raw_title).strip()

            tags = _normalize_terms(front_matter.get('tags'), split_commas=True)
            date = parse_date(front_matter.get('date'))
            author = front_matter.get('author')

            dir_categories = get_categories_from_path(file_path, hexo_root_dir)
            categories = _normalize_terms(front_matter.get('categories'))
            # De-duplicate while keeping a stable, first-seen order.
            categories = list(dict.fromkeys(categories + dir_categories))

            post_id = migrate_to_wordpress(title, content, categories, tags, date, author)
            print(f"Migrated post: {title} (ID: {post_id})")
            print(f" filename: {filename}")
            print(f" Categories: {categories}")
            print(f" Tags: {tags}")
            print(f" Date: {date}")


if __name__ == "__main__":
    main()