Created
June 13, 2025 07:53
-
-
Save CFM880/b6986f60e5c7a5d4d9ede044f96e0f89 to your computer and use it in GitHub Desktop.
wiki.osdev.org offline files: append .html extension
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import shutil | |
from bs4 import BeautifulSoup | |
import re | |
def copy_files_without_extension():
    """Split the ``wiki.osdev.org`` tree into two parallel output trees.

    A file whose name contains no dot, or whose name ends in ``.html``, is
    copied into ``html_resources``; every other file is copied into
    ``other_resources``. The path relative to ``wiki.osdev.org`` is kept
    in both trees, and copies are made with ``shutil.copy2``.
    """
    mirror_root = 'wiki.osdev.org'
    html_root = 'html_resources'
    other_root = 'other_resources'

    # Both output roots exist afterwards even when nothing is copied.
    os.makedirs(html_root, exist_ok=True)
    os.makedirs(other_root, exist_ok=True)

    for current_dir, _subdirs, filenames in os.walk(mirror_root):
        for filename in filenames:
            src = os.path.join(current_dir, filename)
            relative = os.path.relpath(src, mirror_root)

            # No dot at all, or an explicit .html suffix, selects the HTML tree.
            goes_to_html = filename.endswith('.html') or '.' not in filename
            dest_root = html_root if goes_to_html else other_root

            dest = os.path.join(dest_root, relative)
            os.makedirs(os.path.dirname(dest), exist_ok=True)
            shutil.copy2(src, dest)
# Link prefixes that are left exactly as written: absolute web/mail URLs,
# protocol-relative URLs ("//host/...") and in-page anchors ("#section").
_UNTOUCHED_HREF_PREFIXES = ('http://', 'https://', 'mailto:', '//', '#')


def _rewrite_href(href):
    """Return ``href`` rewritten to point at a local ``.html`` file.

    Empty/missing values and values starting with one of
    ``_UNTOUCHED_HREF_PREFIXES`` are returned unchanged. Otherwise every
    ``:`` is replaced by ``_`` and ``.html`` is appended to the part before
    the first ``#`` unless that part already ends in ``.html``.
    """
    if not href or href.startswith(_UNTOUCHED_HREF_PREFIXES):
        return href
    href = href.replace(':', '_')
    # partition keeps a fragment (and an empty trailing '#') intact.
    page, hash_sign, fragment = href.partition('#')
    if not page.endswith('.html'):
        page += '.html'
    return page + hash_sign + fragment


def _move_to_not_html(file_path):
    """Move ``file_path`` from ``html_resources`` to the same relative path under ``not_html_resources``."""
    target_path = os.path.join(
        'not_html_resources', os.path.relpath(file_path, 'html_resources')
    )
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    shutil.move(file_path, target_path)


def process_html_files():
    """Rewrite ``<a href>`` links in every file under ``html_resources``.

    Each file is read as UTF-8 and parsed with BeautifulSoup's
    ``html.parser``. A file in which no tag is found, or whose processing
    raises any exception (for example a UTF-8 decode error), is moved to
    ``not_html_resources``. For the remaining files every ``<a>`` tag's
    ``href`` is passed through ``_rewrite_href`` and the document is
    written back in place.

    Differences from the previous behaviour: pure in-page anchors
    (``#section``) and protocol-relative URLs (``//host/...``) are no longer
    turned into ``.html#section`` / ``//host/....html``, and the document is
    serialised before the file is reopened for writing so that a
    serialisation error cannot leave a truncated file behind.
    """
    os.makedirs('not_html_resources', exist_ok=True)

    for root, _dirs, files in os.walk('html_resources'):
        for file in files:
            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                soup = BeautifulSoup(content, 'html.parser')

                # No tag at all: treat the file as non-HTML.
                if not soup.find():
                    _move_to_not_html(file_path)
                    continue

                for a_tag in soup.find_all('a'):
                    href = a_tag.get('href')
                    new_href = _rewrite_href(href)
                    if new_href != href:
                        a_tag['href'] = new_href

                # Serialise first; opening with 'w' truncates the file.
                rendered = str(soup)
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(rendered)
            except Exception:
                # Deliberate best-effort: anything that cannot be processed
                # is set aside instead of aborting the whole run.
                _move_to_not_html(file_path)
def add_html_extension():
    """Append ``.html`` to files under ``html_resources`` that lack it.

    A file is renamed to ``<name>.html`` unless its name already ends in
    ``.html`` or a file with the same relative path exists under
    ``not_html_resources``.
    """
    rejected_root = 'not_html_resources'
    pages_root = 'html_resources'

    # Relative paths of everything that was set aside as non-HTML.
    rejected = {
        os.path.relpath(os.path.join(folder, name), rejected_root)
        for folder, _subdirs, names in os.walk(rejected_root)
        for name in names
    }

    for folder, _subdirs, names in os.walk(pages_root):
        for name in names:
            if name.endswith('.html'):
                continue
            current = os.path.join(folder, name)
            if os.path.relpath(current, pages_root) in rejected:
                continue
            os.rename(current, current + '.html')
def merge_resources():
    """Copy ``other_resources`` and ``not_html_resources`` into ``html_resources``.

    Every file keeps its path relative to its source tree. The trees are
    processed in that order, so a file from ``not_html_resources`` replaces
    one copied earlier to the same destination. Source trees are left in
    place; copies are made with ``shutil.copy2``.
    """
    destination_root = 'html_resources'

    for origin in ('other_resources', 'not_html_resources'):
        for folder, _subdirs, names in os.walk(origin):
            for name in names:
                src = os.path.join(folder, name)
                dest = os.path.join(destination_root, os.path.relpath(src, origin))
                # The destination sub-directory may not exist yet.
                os.makedirs(os.path.dirname(dest), exist_ok=True)
                shutil.copy2(src, dest)
def main():
    """Run the whole conversion: clean, copy, rewrite links, rename, merge.

    Any existing ``html_resources``, ``other_resources`` and
    ``not_html_resources`` directories are deleted first, then the four
    steps run in order, each preceded by a progress message on stdout.
    """
    # Remove output of a previous run so results are not mixed.
    for stale_dir in ('html_resources', 'other_resources', 'not_html_resources'):
        if os.path.exists(stale_dir):
            shutil.rmtree(stale_dir)

    pipeline = (
        ("开始复制文件...", copy_files_without_extension),
        ("开始处理HTML文件...", process_html_files),
        ("开始添加.html后缀...", add_html_extension),
        ("开始合并资源...", merge_resources),
    )
    for announcement, step in pipeline:
        print(announcement)
        step()

    print("处理完成!")
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
beautifulsoup4==4.12.2 |
Author
CFM880
commented
Jun 13, 2025
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment