CFM880 · June 13, 2025 07:53 · CFM880 · Jun 13, 2025
diff --git a/main.py b/main.py
 import os
 import shutil
 from bs4 import BeautifulSoup
 import re


 def copy_files_without_extension():
    """Split the mirrored wiki tree into page candidates and other resources.

    Every file under ``wiki.osdev.org`` is copied, keeping its relative path,
    into ``html_resources`` when its name contains no dot or ends with
    ``.html``, and into ``other_resources`` otherwise.
    """
    src_root = 'wiki.osdev.org'
    html_root = 'html_resources'
    other_root = 'other_resources'

    # Both output roots exist afterwards even when nothing is copied.
    for out_root in (html_root, other_root):
        os.makedirs(out_root, exist_ok=True)

    for current_dir, _subdirs, names in os.walk(src_root):
        for name in names:
            origin = os.path.join(current_dir, name)
            relative = os.path.relpath(origin, src_root)

            # No dot at all, or an explicit .html suffix, marks a page candidate.
            looks_like_page = name.endswith('.html') or '.' not in name
            bucket = html_root if looks_like_page else other_root

            destination = os.path.join(bucket, relative)
            os.makedirs(os.path.dirname(destination), exist_ok=True)
            shutil.copy2(origin, destination)

 def _rewrite_local_href(href):
    """Return ``href`` adjusted to point at the renamed ``.html`` page.

    Empty hrefs, external links (``http://``, ``https://``, ``ftp://``,
    protocol-relative ``//``, ``mailto:``, ``javascript:``, ``tel:``,
    ``data:``) and pure same-page anchors (``#section``) are returned
    unchanged.  For any other href, the part before ``#`` has every ``:``
    replaced by ``_`` and gets ``.html`` appended unless it already ends
    with it; the fragment after ``#`` is kept verbatim.
    """
    # Matched case-sensitively on purpose: hrefs such as "Category:Foo" also
    # contain a colon and must still be rewritten.
    external_prefixes = (
        'http://', 'https://', 'ftp://', '//',
        'mailto:', 'javascript:', 'tel:', 'data:',
    )
    if not href or href.startswith(external_prefixes):
        return href

    base, separator, anchor = href.partition('#')
    if not base:
        # Same-page anchor: there is no file name to rewrite.
        return href

    base = base.replace(':', '_')
    if not base.endswith('.html'):
        base += '.html'
    return f"{base}{separator}{anchor}"

 def _set_aside_non_html(file_path):
    """Move ``file_path`` to the same relative path under ``not_html_resources``."""
    target_path = os.path.join('not_html_resources', os.path.relpath(file_path, 'html_resources'))
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    shutil.move(file_path, target_path)

 def process_html_files():
    """Rewrite page links in every file under ``html_resources``.

    Each file is read as UTF-8 and parsed with BeautifulSoup's
    ``html.parser``.  A file that cannot be decoded, that raises while being
    processed, or that contains no tag at all is moved to the same relative
    path under ``not_html_resources``.  In every other file each ``<a href>``
    is passed through ``_rewrite_local_href`` and the document is written
    back in place.
    """
    os.makedirs('not_html_resources', exist_ok=True)

    for root, dirs, files in os.walk('html_resources'):
        for file in files:
            file_path = os.path.join(root, file)
            processed = False

            try:
                # Read everything and close the handle first: moving or
                # rewriting a file that is still open fails on some
                # platforms (e.g. Windows).
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                soup = BeautifulSoup(content, 'html.parser')

                if soup.find() is not None:
                    for a_tag in soup.find_all('a'):
                        href = a_tag.get('href')
                        if not href:
                            continue
                        new_href = _rewrite_local_href(href)
                        if new_href != href:
                            a_tag['href'] = new_href

                    # Serialize before reopening the file so a failure here
                    # cannot leave a truncated page behind.
                    rendered = str(soup)
                    with open(file_path, 'w', encoding='utf-8') as f:
                        f.write(rendered)
                    processed = True
            except UnicodeDecodeError:
                # Not UTF-8 text, so it is handled as a non-HTML resource.
                pass
            except Exception as e:
                # Still best-effort (set the file aside and carry on), but
                # report the reason instead of hiding it.
                print(f"处理 {file_path} 时出错，将移至 not_html_resources: {e}")

            if not processed:
                _set_aside_non_html(file_path)

 def add_html_extension():
    """Append ``.html`` to the extension-less files left in ``html_resources``.

    A file is left untouched when its name already ends with ``.html``, when
    the same relative path exists under ``not_html_resources``, or when the
    ``.html`` name is already taken by another file.
    """
    # Relative paths of everything that was set aside as non-HTML.
    not_html_files = {
        os.path.relpath(os.path.join(root, file), 'not_html_resources')
        for root, dirs, files in os.walk('not_html_resources')
        for file in files
    }

    for root, dirs, files in os.walk('html_resources'):
        for file in files:
            if file.endswith('.html'):
                continue

            file_path = os.path.join(root, file)
            if os.path.relpath(file_path, 'html_resources') in not_html_files:
                continue

            new_path = file_path + '.html'
            if os.path.exists(new_path):
                # os.rename would silently replace the existing page on POSIX
                # and raise FileExistsError on Windows; keep both files instead.
                print(f"跳过 {file_path}：{new_path} 已存在")
                continue
            os.rename(file_path, new_path)

 def merge_resources():
    """Overlay the non-page resources onto ``html_resources``.

    Files from ``other_resources`` and then ``not_html_resources`` are copied
    to the same relative path under ``html_resources``; a later copy replaces
    an earlier file at the same path.
    """
    destination_root = 'html_resources'

    for origin_root in ('other_resources', 'not_html_resources'):
        for folder, _subfolders, names in os.walk(origin_root):
            for name in names:
                origin = os.path.join(folder, name)
                destination = os.path.join(
                    destination_root,
                    os.path.relpath(origin, origin_root),
                )

                # Parent folders are created on demand, one file at a time.
                os.makedirs(os.path.dirname(destination), exist_ok=True)
                shutil.copy2(origin, destination)

 def main():
    """Run the whole conversion from a clean slate.

    Removes ``html_resources``, ``other_resources`` and ``not_html_resources``
    if they exist, then runs the copy, link-rewrite, rename and merge steps
    in that order, printing a progress line before each one.
    """
    # Drop whatever an earlier run left behind.
    for stale_dir in ('html_resources', 'other_resources', 'not_html_resources'):
        if os.path.exists(stale_dir):
            shutil.rmtree(stale_dir)

    pipeline = (
        ("开始复制文件...", copy_files_without_extension),
        ("开始处理HTML文件...", process_html_files),
        ("开始添加.html后缀...", add_html_extension),
        ("开始合并资源...", merge_resources),
    )
    for banner, step in pipeline:
        print(banner)
        step()

    print("处理完成！")

 # Run the pipeline only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()
diff --git a/requirements.txt b/requirements.txt
 beautifulsoup4==4.12.2
	import os
	import shutil
	from bs4 import BeautifulSoup
	import re


	def copy_files_without_extension():
	    """Split the mirrored wiki tree into page candidates and other resources.

	    Every file under ``wiki.osdev.org`` is copied, keeping its relative path,
	    into ``html_resources`` when its name contains no dot or ends with
	    ``.html``, and into ``other_resources`` otherwise.
	    """
	    src_root = 'wiki.osdev.org'
	    html_root = 'html_resources'
	    other_root = 'other_resources'

	    # Both output roots exist afterwards even when nothing is copied.
	    for out_root in (html_root, other_root):
	        os.makedirs(out_root, exist_ok=True)

	    for current_dir, _subdirs, names in os.walk(src_root):
	        for name in names:
	            origin = os.path.join(current_dir, name)
	            relative = os.path.relpath(origin, src_root)

	            # No dot at all, or an explicit .html suffix, marks a page candidate.
	            looks_like_page = name.endswith('.html') or '.' not in name
	            bucket = html_root if looks_like_page else other_root

	            destination = os.path.join(bucket, relative)
	            os.makedirs(os.path.dirname(destination), exist_ok=True)
	            shutil.copy2(origin, destination)

	def _rewrite_local_href(href):
	    """Return ``href`` adjusted to point at the renamed ``.html`` page.

	    Empty hrefs, external links (``http://``, ``https://``, ``ftp://``,
	    protocol-relative ``//``, ``mailto:``, ``javascript:``, ``tel:``,
	    ``data:``) and pure same-page anchors (``#section``) are returned
	    unchanged.  For any other href, the part before ``#`` has every ``:``
	    replaced by ``_`` and gets ``.html`` appended unless it already ends
	    with it; the fragment after ``#`` is kept verbatim.
	    """
	    # Matched case-sensitively on purpose: hrefs such as "Category:Foo" also
	    # contain a colon and must still be rewritten.
	    external_prefixes = (
	        'http://', 'https://', 'ftp://', '//',
	        'mailto:', 'javascript:', 'tel:', 'data:',
	    )
	    if not href or href.startswith(external_prefixes):
	        return href

	    base, separator, anchor = href.partition('#')
	    if not base:
	        # Same-page anchor: there is no file name to rewrite.
	        return href

	    base = base.replace(':', '_')
	    if not base.endswith('.html'):
	        base += '.html'
	    return f"{base}{separator}{anchor}"

	def _set_aside_non_html(file_path):
	    """Move ``file_path`` to the same relative path under ``not_html_resources``."""
	    target_path = os.path.join('not_html_resources', os.path.relpath(file_path, 'html_resources'))
	    os.makedirs(os.path.dirname(target_path), exist_ok=True)
	    shutil.move(file_path, target_path)

	def process_html_files():
	    """Rewrite page links in every file under ``html_resources``.

	    Each file is read as UTF-8 and parsed with BeautifulSoup's
	    ``html.parser``.  A file that cannot be decoded, that raises while being
	    processed, or that contains no tag at all is moved to the same relative
	    path under ``not_html_resources``.  In every other file each ``<a href>``
	    is passed through ``_rewrite_local_href`` and the document is written
	    back in place.
	    """
	    os.makedirs('not_html_resources', exist_ok=True)

	    for root, dirs, files in os.walk('html_resources'):
	        for file in files:
	            file_path = os.path.join(root, file)
	            processed = False

	            try:
	                # Read everything and close the handle first: moving or
	                # rewriting a file that is still open fails on some
	                # platforms (e.g. Windows).
	                with open(file_path, 'r', encoding='utf-8') as f:
	                    content = f.read()
	                soup = BeautifulSoup(content, 'html.parser')

	                if soup.find() is not None:
	                    for a_tag in soup.find_all('a'):
	                        href = a_tag.get('href')
	                        if not href:
	                            continue
	                        new_href = _rewrite_local_href(href)
	                        if new_href != href:
	                            a_tag['href'] = new_href

	                    # Serialize before reopening the file so a failure here
	                    # cannot leave a truncated page behind.
	                    rendered = str(soup)
	                    with open(file_path, 'w', encoding='utf-8') as f:
	                        f.write(rendered)
	                    processed = True
	            except UnicodeDecodeError:
	                # Not UTF-8 text, so it is handled as a non-HTML resource.
	                pass
	            except Exception as e:
	                # Still best-effort (set the file aside and carry on), but
	                # report the reason instead of hiding it.
	                print(f"处理 {file_path} 时出错，将移至 not_html_resources: {e}")

	            if not processed:
	                _set_aside_non_html(file_path)

	def add_html_extension():
	    """Append ``.html`` to the extension-less files left in ``html_resources``.

	    A file is left untouched when its name already ends with ``.html``, when
	    the same relative path exists under ``not_html_resources``, or when the
	    ``.html`` name is already taken by another file.
	    """
	    # Relative paths of everything that was set aside as non-HTML.
	    not_html_files = {
	        os.path.relpath(os.path.join(root, file), 'not_html_resources')
	        for root, dirs, files in os.walk('not_html_resources')
	        for file in files
	    }

	    for root, dirs, files in os.walk('html_resources'):
	        for file in files:
	            if file.endswith('.html'):
	                continue

	            file_path = os.path.join(root, file)
	            if os.path.relpath(file_path, 'html_resources') in not_html_files:
	                continue

	            new_path = file_path + '.html'
	            if os.path.exists(new_path):
	                # os.rename would silently replace the existing page on POSIX
	                # and raise FileExistsError on Windows; keep both files instead.
	                print(f"跳过 {file_path}：{new_path} 已存在")
	                continue
	            os.rename(file_path, new_path)

	def merge_resources():
	    """Overlay the non-page resources onto ``html_resources``.

	    Files from ``other_resources`` and then ``not_html_resources`` are copied
	    to the same relative path under ``html_resources``; a later copy replaces
	    an earlier file at the same path.
	    """
	    destination_root = 'html_resources'

	    for origin_root in ('other_resources', 'not_html_resources'):
	        for folder, _subfolders, names in os.walk(origin_root):
	            for name in names:
	                origin = os.path.join(folder, name)
	                destination = os.path.join(
	                    destination_root,
	                    os.path.relpath(origin, origin_root),
	                )

	                # Parent folders are created on demand, one file at a time.
	                os.makedirs(os.path.dirname(destination), exist_ok=True)
	                shutil.copy2(origin, destination)

	def main():
	    """Run the whole conversion from a clean slate.

	    Removes ``html_resources``, ``other_resources`` and ``not_html_resources``
	    if they exist, then runs the copy, link-rewrite, rename and merge steps
	    in that order, printing a progress line before each one.
	    """
	    # Drop whatever an earlier run left behind.
	    for stale_dir in ('html_resources', 'other_resources', 'not_html_resources'):
	        if os.path.exists(stale_dir):
	            shutil.rmtree(stale_dir)

	    pipeline = (
	        ("开始复制文件...", copy_files_without_extension),
	        ("开始处理HTML文件...", process_html_files),
	        ("开始添加.html后缀...", add_html_extension),
	        ("开始合并资源...", merge_resources),
	    )
	    for banner, step in pipeline:
	        print(banner)
	        step()

	    print("处理完成！")

	# Run the pipeline only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()