Skip to content

Instantly share code, notes, and snippets.

@CFM880
Created June 13, 2025 07:53
Show Gist options
  • Save CFM880/b6986f60e5c7a5d4d9ede044f96e0f89 to your computer and use it in GitHub Desktop.
Save CFM880/b6986f60e5c7a5d4d9ede044f96e0f89 to your computer and use it in GitHub Desktop.
wiki.osdev.org offline files: append .html extension
import os
import shutil
from bs4 import BeautifulSoup
import re
def copy_files_without_extension(source_dir='wiki.osdev.org',
                                 html_dir='html_resources',
                                 other_dir='other_resources'):
    """Split a mirrored site tree into HTML candidates and other resources.

    Walks ``source_dir`` and copies every file, keeping its path relative
    to ``source_dir``, into one of two output trees:

    * ``html_dir`` -- files whose name contains no ``.`` at all, or whose
      name ends with ``.html``;
    * ``other_dir`` -- every other file.

    Both output directories are created even when nothing is copied into
    them. Files are copied with ``shutil.copy2``, so an existing file at
    the target path is overwritten.

    Args:
        source_dir: Root of the mirrored tree to read.
        html_dir: Destination root for HTML candidates.
        other_dir: Destination root for all remaining files.
    """
    os.makedirs(html_dir, exist_ok=True)
    os.makedirs(other_dir, exist_ok=True)

    for root, _dirs, files in os.walk(source_dir):
        for name in files:
            source_path = os.path.join(root, name)
            rel_path = os.path.relpath(source_path, source_dir)

            # A name without any dot has no extension; together with
            # explicit ``.html`` files these are the HTML candidates.
            is_html_candidate = '.' not in name or name.endswith('.html')
            target_root = html_dir if is_html_candidate else other_dir

            target_path = os.path.join(target_root, rel_path)
            os.makedirs(os.path.dirname(target_path), exist_ok=True)
            shutil.copy2(source_path, target_path)
# Link targets that must never be rewritten: absolute / protocol-relative
# URLs and non-navigational schemes. Compared against the lower-cased href.
_SKIPPED_HREF_PREFIXES = (
    'http://', 'https://', 'ftp://', '//',
    'mailto:', 'javascript:', 'data:', 'tel:',
)


def _rewrite_local_href(href):
    """Return *href* adjusted to the renamed on-disk file, or unchanged.

    Only the path part (everything before the first ``#``) is touched:
    ``:`` becomes ``_`` and ``.html`` is appended when the last path
    segment has no ``.`` in it. The fragment is preserved verbatim.
    """
    if href.lower().startswith(_SKIPPED_HREF_PREFIXES):
        return href

    path, hash_sep, fragment = href.partition('#')
    if not path:
        # Pure in-page anchor ("#section"): there is no file name to fix.
        return href

    path = path.replace(':', '_')

    # Only extension-less names are renamed to ``*.html`` later on, so only
    # those links need the suffix. An empty last segment (trailing "/")
    # and names that already carry an extension are left as they are.
    last_segment = path.rsplit('/', 1)[-1]
    if last_segment and '.' not in last_segment:
        path += '.html'

    return path + hash_sep + fragment


def _move_to_not_html(file_path, html_dir, not_html_dir):
    """Move *file_path* out of *html_dir* to the same relative path in *not_html_dir*."""
    target_path = os.path.join(not_html_dir, os.path.relpath(file_path, html_dir))
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    shutil.move(file_path, target_path)


def process_html_files(html_dir='html_resources',
                       not_html_dir='not_html_resources'):
    """Rewrite local links in every HTML file and set non-HTML files aside.

    Each file under ``html_dir`` is read as UTF-8 and parsed with
    BeautifulSoup's ``html.parser``. If the parse yields at least one tag,
    the ``href`` of every ``<a>`` element is passed through
    ``_rewrite_local_href`` and the document is written back in place.

    A file that yields no tag at all, or whose reading, parsing or writing
    raises, is moved to the same relative path under ``not_html_dir``.
    This is deliberately best-effort: one bad file must not abort the run.

    Args:
        html_dir: Tree of HTML candidates, modified in place.
        not_html_dir: Destination tree for files that are not usable HTML;
            created if missing.
    """
    os.makedirs(not_html_dir, exist_ok=True)

    for root, _dirs, files in os.walk(html_dir):
        for name in files:
            file_path = os.path.join(root, name)

            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                soup = BeautifulSoup(content, 'html.parser')

                # No tag at all means the content is not HTML.
                is_html = soup.find() is not None
                if is_html:
                    for a_tag in soup.find_all('a'):
                        href = a_tag.get('href')
                        if not href:
                            continue
                        new_href = _rewrite_local_href(href)
                        if new_href != href:
                            a_tag['href'] = new_href

                    # Save the modified document.
                    with open(file_path, 'w', encoding='utf-8') as f:
                        f.write(str(soup))
            except Exception:
                # Typically a UnicodeDecodeError from a binary file that
                # merely lacks an extension; treat any failure as non-HTML.
                is_html = False

            if not is_html:
                # Kept outside the try so a failing move is reported
                # instead of being retried by the handler.
                _move_to_not_html(file_path, html_dir, not_html_dir)
def add_html_extension(html_dir='html_resources',
                       not_html_dir='not_html_resources'):
    """Append ``.html`` to every file in ``html_dir`` that lacks it.

    A file is left untouched when any of the following holds:

    * its name already ends with ``.html``;
    * a file with the same relative path exists under ``not_html_dir``;
    * the ``.html`` target name already exists. ``os.rename`` would
      otherwise silently overwrite it on POSIX and raise on Windows, so
      the collision is skipped and both files are kept.

    Args:
        html_dir: Tree whose files are renamed in place.
        not_html_dir: Tree listing the relative paths that must not be
            renamed.
    """
    # Relative paths of everything that was classified as non-HTML.
    not_html_files = set()
    for root, _dirs, files in os.walk(not_html_dir):
        for name in files:
            not_html_files.add(
                os.path.relpath(os.path.join(root, name), not_html_dir)
            )

    for root, _dirs, files in os.walk(html_dir):
        for name in files:
            if name.endswith('.html'):
                continue

            file_path = os.path.join(root, name)
            if os.path.relpath(file_path, html_dir) in not_html_files:
                continue

            new_path = file_path + '.html'
            if os.path.exists(new_path):
                # Never clobber an existing page with the same final name.
                continue
            os.rename(file_path, new_path)
def merge_resources(html_dir='html_resources',
                    source_dirs=('other_resources', 'not_html_resources')):
    """Copy every file from each source tree into ``html_dir``.

    Relative paths are preserved and missing target directories are
    created. Files are copied with ``shutil.copy2``, so a file already
    present at the target path is overwritten; when several source trees
    contain the same relative path, the one listed last wins.

    Args:
        html_dir: Destination tree that receives the merged files.
        source_dirs: Iterable of source tree roots, processed in order.
    """
    for source_dir in source_dirs:
        for root, _dirs, files in os.walk(source_dir):
            for name in files:
                source_path = os.path.join(root, name)
                rel_path = os.path.relpath(source_path, source_dir)
                target_path = os.path.join(html_dir, rel_path)

                # Make sure the destination directory exists before copying.
                os.makedirs(os.path.dirname(target_path), exist_ok=True)
                shutil.copy2(source_path, target_path)
def main():
    """Run the whole conversion pipeline from the current working directory.

    Steps, in order: remove the output directories left by a previous
    run, split the ``wiki.osdev.org`` tree, rewrite links in the HTML
    files, add the ``.html`` suffix, and merge the remaining resources
    back into ``html_resources``.

    Raises:
        SystemExit: If the ``wiki.osdev.org`` directory does not exist.
            The check happens before any previous output is deleted.
    """
    source_dir = 'wiki.osdev.org'
    if not os.path.isdir(source_dir):
        # Without the mirror there is nothing to convert; stop before
        # wiping the results of an earlier successful run.
        raise SystemExit(f"未找到源目录: {source_dir}")

    for stale_dir in ('html_resources', 'other_resources', 'not_html_resources'):
        if os.path.exists(stale_dir):
            shutil.rmtree(stale_dir)

    print("开始复制文件...")
    copy_files_without_extension()
    print("开始处理HTML文件...")
    process_html_files()
    print("开始添加.html后缀...")
    add_html_extension()
    print("开始合并资源...")
    merge_resources()
    print("处理完成!")


if __name__ == "__main__":
    main()
beautifulsoup4==4.12.2
@CFM880
Copy link
Author

CFM880 commented Jun 13, 2025

image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment