Created
April 1, 2025 12:53
-
-
Save WalBeh/cd923564f3e1a9802ede1e541f61f8fa to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script | |
# requires-python = ">=3.12" | |
# dependencies = [ | |
# "beautifulsoup4", | |
# "requests", | |
# ] | |
# /// | |
""" | |
Generate a JSON file of CrateDB cluster settings scraped from the online reference documentation.
""" | |
import requests | |
from bs4 import BeautifulSoup | |
import json | |
import re | |
import requests | |
from bs4 import BeautifulSoup | |
import json | |
import re | |
def _new_setting_info(raw_description=""):
    """Return a fresh settings record with every structured field at its empty default."""
    return {
        "raw_description": raw_description,
        "runtime_configurable": False,
        "default_value": "",
        "type": "",
        "purpose": "",
        "constraints": "",
        "related_settings": [],
        "deprecated": False
    }


def _setting_key_from(element):
    """Derive a setting key from a tag: prefer its first <code> child, collapse whitespace, cut at the first colon."""
    code_elem = element.find('code')
    text = (code_elem if code_elem else element).get_text().strip()
    text = re.sub(r'\s+', ' ', text)
    return text.split(':', 1)[0]


def _description_marks_runtime(desc_lower):
    """Return True when a lower-cased description states the setting is changeable at runtime."""
    return (
        ("runtime configurable" in desc_lower and "not runtime configurable" not in desc_lower)
        or "runtime: yes" in desc_lower
        or "dynamic: true" in desc_lower
    )


def _first_matching_index(headers, keywords):
    """Return the index of the first header containing any keyword, or None."""
    return next(
        (i for i, header in enumerate(headers) if any(keyword in header for keyword in keywords)),
        None,
    )


def _collect_from_tables(soup, settings):
    """Add one record per row of every table that has a setting/name/property column."""
    tables = soup.find_all('table')
    print(f"Found {len(tables)} tables")
    for table in tables:
        # The first row is treated as the header row
        headers_row = table.find('tr')
        if not headers_row:
            continue
        headers = [cell.get_text().strip().lower() for cell in headers_row.find_all(['th', 'td'])]
        print(f"Table headers: {headers}")

        # A table without a setting/name/property column is not a settings table
        setting_idx = _first_matching_index(headers, ['setting', 'name', 'property'])
        if setting_idx is None:
            continue
        desc_idx = _first_matching_index(headers, ['description'])
        runtime_idx = _first_matching_index(headers, ['runtime', 'dynamic'])
        default_idx = _first_matching_index(headers, ['default'])
        type_idx = _first_matching_index(headers, ['type'])
        print(f"Processing table with setting index: {setting_idx}, description index: {desc_idx}, runtime index: {runtime_idx}")

        for row in table.find_all('tr')[1:]:  # Skip header row
            cells = row.find_all(['td', 'th'])
            if len(cells) <= setting_idx:
                continue
            setting_key = _setting_key_from(cells[setting_idx])
            # Skip if this doesn't look like a setting
            if not setting_key or setting_key.startswith('#') or len(setting_key) > 100:
                continue

            setting_info = _new_setting_info()
            if desc_idx is not None and desc_idx < len(cells):
                setting_info["raw_description"] = cells[desc_idx].get_text().strip()
            if default_idx is not None and default_idx < len(cells):
                setting_info["default_value"] = cells[default_idx].get_text().strip()
            if type_idx is not None and type_idx < len(cells):
                setting_info["type"] = cells[type_idx].get_text().strip()

            # Dedicated runtime/dynamic column, if the table has one
            if runtime_idx is not None and runtime_idx < len(cells):
                runtime_text = cells[runtime_idx].get_text().strip().lower()
                print(f"Runtime text for {setting_key}: '{runtime_text}'")
                # Check for various ways "yes" might be represented
                setting_info["runtime_configurable"] = (
                    'yes' in runtime_text or
                    'true' in runtime_text or
                    '✓' in runtime_text or
                    runtime_text == 'y' or
                    'runtime: yes' in runtime_text or
                    'dynamic: true' in runtime_text
                )
                if setting_info["runtime_configurable"]:
                    print(f"Setting {setting_key} IS runtime configurable based on: '{runtime_text}'")
                else:
                    print(f"Setting {setting_key} is NOT runtime configurable based on: '{runtime_text}'")

            # Fall back to the description when the column did not say yes
            if not setting_info["runtime_configurable"]:
                desc_lower = setting_info["raw_description"].lower()
                if "runtime configurable" in desc_lower and "not runtime configurable" not in desc_lower:
                    setting_info["runtime_configurable"] = True
                    print(f"Setting {setting_key} IS runtime configurable based on description")
                elif "runtime: yes" in desc_lower:
                    setting_info["runtime_configurable"] = True
                    print(f"Setting {setting_key} IS runtime configurable based on 'Runtime: yes' in description")

            parse_description(setting_info)
            settings[setting_key] = setting_info
            print(f"Added setting from table: {setting_key}")


def _collect_from_definition_lists(soup, settings):
    """Add one record per <dt> term, described by the <dd> that follows it."""
    dl_elements = soup.find_all('dl')
    print(f"Found {len(dl_elements)} definition lists")
    # Walk the terms themselves and pair each with its own following <dd>.
    # Pairing dt[i] with dd[i] over recursive find_all results goes wrong as
    # soon as a term has zero or several <dd>s, or lists are nested.
    for dt in soup.find_all('dt'):
        if dt.find_parent('dl') is None:
            continue
        dd = dt.find_next_sibling('dd')
        if dd is None:
            continue
        setting_key = _setting_key_from(dt)
        # Skip if doesn't look like a valid setting
        if not setting_key or len(setting_key) > 100:
            continue

        setting_info = _new_setting_info(dd.get_text().strip())
        setting_info["runtime_configurable"] = _description_marks_runtime(
            setting_info["raw_description"].lower()
        )
        if setting_info["runtime_configurable"]:
            print(f"Setting {setting_key} IS runtime configurable based on description")

        parse_description(setting_info)
        settings[setting_key] = setting_info
        print(f"Added setting from dl: {setting_key}")


def _collect_from_headers(sections, settings):
    """Add settings that only appear as h2/h3/h4 headings and were not found by earlier passes."""
    for section in sections:
        for header in section.find_all(['h2', 'h3', 'h4']):
            header_text = header.get_text().strip()
            # Only headings made of letters, digits, '_' and '.' look like setting names
            if not re.search(r'^[a-z0-9_.]+$', header_text, re.IGNORECASE) or header_text in settings:
                continue
            setting_info = _new_setting_info()
            # The first paragraph or div after the heading serves as the description
            next_elem = header.find_next(['p', 'div'])
            if next_elem:
                setting_info["raw_description"] = next_elem.get_text().strip()
            setting_info["runtime_configurable"] = _description_marks_runtime(
                setting_info["raw_description"].lower()
            )

            parse_description(setting_info)
            settings[header_text] = setting_info
            print(f"Added setting from header: {header_text}")


def _apply_runtime_patterns(settings):
    """Final pass: flag settings whose description matches a looser runtime pattern."""
    runtime_patterns = [
        r"runtime:\s*yes",
        r"dynamic:\s*true",
        r"runtime\s+configurable",
        r"can be changed at runtime"
    ]
    for key, info in settings.items():
        if info["runtime_configurable"]:
            continue
        desc_lower = info["raw_description"].lower()
        for pattern in runtime_patterns:
            if re.search(pattern, desc_lower) and "not runtime configurable" not in desc_lower:
                info["runtime_configurable"] = True
                print(f"Updated setting {key} to runtime configurable based on pattern match")
                break


def extract_cratedb_settings():
    """Scrape the CrateDB cluster-settings reference page into a dictionary.

    The page is fetched over HTTP and searched in three passes: tables,
    definition lists, then section headings. A later pass overwrites an
    earlier record for the same key, except the heading pass, which only
    adds keys not seen before.

    Returns:
        A dict mapping setting name to a record with the keys
        ``raw_description``, ``runtime_configurable``, ``default_value``,
        ``type``, ``purpose``, ``constraints``, ``related_settings`` and
        ``deprecated``. An empty dict is returned when the page cannot be
        fetched or the server does not answer with status 200.
    """
    url = "https://cratedb.com/docs/crate/reference/en/latest/config/cluster.html"

    # Fetch the page content
    print(f"Fetching {url}...")
    try:
        # Without a timeout requests waits indefinitely on a stalled server
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch the page: {exc}")
        return {}
    if response.status_code != 200:
        print(f"Failed to fetch the page. Status code: {response.status_code}")
        return {}
    print("Successfully fetched the page. Parsing content...")

    soup = BeautifulSoup(response.text, 'html.parser')
    settings = {}

    # Candidate containers for the heading pass
    sections = soup.find_all(['div', 'section'], class_=['section', 'doc-content'])
    print(f"Found {len(sections)} potential sections to process")

    # Tables first, as they contain the most structured settings
    _collect_from_tables(soup, settings)
    _collect_from_definition_lists(soup, settings)
    _collect_from_headers(sections, settings)
    _apply_runtime_patterns(settings)

    print(f"Total settings found: {len(settings)}")
    return settings
def parse_description(setting_info):
    """Fill the structured fields of *setting_info* from its free-text description.

    Reads ``setting_info["raw_description"]`` and updates the record in place:

    * ``default_value`` and ``type`` are filled from the text only when they
      are still empty, so values already taken from a dedicated table column
      are not replaced by a regex guess.
    * ``deprecated`` is True when the text mentions "deprecated".
    * ``constraints`` receives the first constraint-like phrase found.
    * ``related_settings`` receives the dotted names mentioned in the text,
      de-duplicated in order of first appearance.
    * ``purpose`` receives the first one or two sentences left after default,
      type, constraint, deprecation and runtime remarks are stripped.

    Args:
        setting_info: dict with at least the key ``raw_description``.

    Returns:
        The same dict, for convenience.
    """
    desc = setting_info["raw_description"]

    # A value ends at a quote, newline or sentence punctuation, but a dot that
    # is followed by a word character (as in "0.85") belongs to the value.
    value_token = r'(?:[^\'"`\n.,;]|\.(?=\w))+'

    # Longer phrases come first: with a bare "default" listed first,
    # "Defaults to 5s" was captured as "s to 5s".
    default_match = re.search(
        r'(?:defaults to|defaulting to|default value is|default is|default value'
        r'|defaults|default|value is|initial value)'
        r':?\s*[\'"`]?(' + value_token + r')[\'"`]?',
        desc,
        re.IGNORECASE,
    )
    if default_match and not setting_info.get("default_value"):
        found_default = default_match.group(1).strip()
        if found_default:
            setting_info["default_value"] = found_default

    # Extract type information (first matching pattern wins)
    type_patterns = [
        r'(type|data type|value type):?\s*([a-zA-Z0-9_\- ]+)',
        r'(string|integer|boolean|float|double|time|list|array|enum) (setting|value|type)'
    ]
    for pattern in type_patterns:
        type_match = re.search(pattern, desc, re.IGNORECASE)
        if type_match:
            if not setting_info.get("type"):
                # For the "type: X" form the type is group 2; for the
                # "X setting/value/type" form it is group 1.
                if 'type' in type_match.group(1).lower():
                    setting_info["type"] = type_match.group(2).strip()
                else:
                    setting_info["type"] = type_match.group(1).strip()
            break

    # Check for deprecated status
    setting_info["deprecated"] = "deprecated" in desc.lower()

    # Extract constraints (first matching pattern wins)
    constraint_patterns = [
        r'(must be|valid values|valid range|range is|range of|between|min|max|maximum|minimum).{1,100}',
        r'(only|strictly).{1,50}(positive|negative|greater than|less than|non-negative).{1,50}'
    ]
    for pattern in constraint_patterns:
        constraint_match = re.search(pattern, desc, re.IGNORECASE)
        if constraint_match:
            setting_info["constraints"] = constraint_match.group(0).strip()
            break

    # Extract related settings. The character class includes '.', so a name at
    # the end of a sentence is matched with its trailing period; strip it and
    # keep only names that still contain a dot.
    mentioned = (name.rstrip('.') for name in re.findall(r'([a-z][a-z0-9_\-.]+\.[a-z0-9_\-.]+)', desc))
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # output is the same on every run (set order is not).
    related_settings = list(dict.fromkeys(name for name in mentioned if '.' in name))
    if related_settings:
        setting_info["related_settings"] = related_settings

    # Extract purpose: strip remarks about defaults, types, constraints,
    # deprecation and runtime, then keep the first one or two sentences.
    purpose_text = desc
    patterns_to_remove = [
        # The separator after "default" is optional so "Default: 5s" is removed too
        r'defaults?(?:\s+(?:is|are|to|value))?:?\s*[\'"`]?' + value_token + r'[\'"`]?',
        r'type:?\s*[a-zA-Z0-9_\- ]+',
        r'(must be|valid values|valid range).{1,100}',
        r'(deprecated).{1,100}',
        r'runtime configurable',
        r'runtime:\s*yes',
        r'dynamic:\s*true'
    ]
    for pattern in patterns_to_remove:
        purpose_text = re.sub(pattern, '', purpose_text, flags=re.IGNORECASE)

    sentences = re.split(r'(?<=[.!?])\s+', purpose_text)
    if sentences:
        setting_info["purpose"] = ' '.join(sentences[:2]).strip()

    return setting_info
def _build_set_statement(setting_key, default_value):
    """Return a CrateDB ``SET GLOBAL`` statement assigning *default_value* to *setting_key*."""
    # Plain numbers are emitted bare; everything else becomes a string literal
    if re.match(r'^-?\d+(\.\d+)?$', default_value):
        return f'SET GLOBAL "{setting_key}" = {default_value}'
    # A value that already carries its own surrounding quotes is used as-is.
    # The length check keeps a lone "'" from being treated as a quoted value.
    if len(default_value) >= 2 and default_value.startswith("'") and default_value.endswith("'"):
        return f'SET GLOBAL "{setting_key}" = {default_value}'
    # Double embedded single quotes so the literal stays well-formed
    escaped = default_value.replace("'", "''")
    return f'SET GLOBAL "{setting_key}" = \'{escaped}\''


def main():
    """Scrape the settings, attach SQL statements, write the JSON file and print a summary.

    Every runtime-configurable setting gets a ``stmt`` entry holding a
    ``SET GLOBAL`` statement that assigns its default value. The complete
    dictionary is written to ``cratedb_settings.json`` in the current
    directory, followed by counts and up to five sample records on stdout.
    """
    settings = extract_cratedb_settings()

    # Add SQL statements for runtime configurable settings
    for setting_key, setting_info in settings.items():
        if setting_info["runtime_configurable"]:
            stmt = _build_set_statement(setting_key, setting_info["default_value"])
            setting_info["stmt"] = stmt
            print(f"Added statement for {setting_key}: {stmt}")

    # Save to JSON file
    with open('cratedb_settings.json', 'w', encoding='utf-8') as f:
        json.dump(settings, f, indent=2, ensure_ascii=False)
    print(f"Extracted {len(settings)} settings to cratedb_settings.json")

    # Count runtime configurable settings
    runtime_count = sum(1 for info in settings.values() if info["runtime_configurable"])
    print(f"Found {runtime_count} runtime configurable settings out of {len(settings)} total")
    print(f"Added SQL statements for {runtime_count} runtime configurable settings")

    if not settings:
        print("No settings were extracted. Please check the script or the webpage structure.")
        return

    # Print a few sample settings to verify
    print("\nSample settings extracted:")
    for key, value in list(settings.items())[:5]:
        print(f"Setting: {key}")
        print(f"Raw description: {value['raw_description'][:100]}...")
        print(f"Purpose: {value['purpose'][:100]}...")
        print(f"Type: {value['type']}")
        print(f"Default: {value['default_value']}")
        print(f"Constraints: {value['constraints']}")
        print(f"Runtime configurable: {value['runtime_configurable']}")
        # 'stmt' exists exactly for the runtime-configurable settings
        if value["runtime_configurable"]:
            print(f"Statement: {value['stmt']}")
        print(f"Related settings: {value['related_settings']}")
        print(f"Deprecated: {value['deprecated']}")
        print()
# Run the scraper only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment