Skip to content

Instantly share code, notes, and snippets.

@WalBeh
Created April 1, 2025 12:53
Show Gist options
  • Save WalBeh/cd923564f3e1a9802ede1e541f61f8fa to your computer and use it in GitHub Desktop.
Save WalBeh/cd923564f3e1a9802ede1e541f61f8fa to your computer and use it in GitHub Desktop.
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "beautifulsoup4",
# "requests",
# ]
# ///
"""
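Scrape the CrateDB cluster-settings reference page and write the extracted
settings (description, default, type, runtime configurability, ...) to
cratedb_settings.json.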
"""
import requests
from bs4 import BeautifulSoup
import json
import re
import requests
from bs4 import BeautifulSoup
import json
import re
_CRATEDB_CLUSTER_SETTINGS_URL = (
    "https://cratedb.com/docs/crate/reference/en/latest/config/cluster.html"
)


def _new_setting_info(raw_description=""):
    """Return a fresh settings record with every structured field empty."""
    return {
        "raw_description": raw_description,
        "runtime_configurable": False,
        "default_value": "",
        "type": "",
        "purpose": "",
        "constraints": "",
        "related_settings": [],
        "deprecated": False,
    }


def _description_marks_runtime(description):
    """Tell whether free-text *description* says the setting is changeable at runtime."""
    text = description.lower()
    # An explicit "Runtime: yes" / "Dynamic: true" field is the strongest signal.
    if re.search(r"runtime:\s*yes|dynamic:\s*true", text):
        return True
    # Prose mentions only count when they are not negated.
    if "not runtime configurable" in text:
        return False
    return bool(re.search(r"runtime\s+configurable|can be changed at runtime", text))


def _clean_setting_key(element):
    """Return the setting name held by a table cell or ``<dt>`` element."""
    # Prefer the <code> child: it holds the bare name without decoration.
    code_elem = element.find('code')
    source = code_elem if code_elem else element
    key = re.sub(r'\s+', ' ', source.get_text().strip())
    # Anything after a colon is a value/type annotation, not part of the name.
    return key.split(':', 1)[0].strip()


def _find_column(headers, keywords):
    """Return the index of the first header containing any keyword, or None."""
    return next(
        (i for i, header in enumerate(headers) if any(k in header for k in keywords)),
        None,
    )


def _cell_text(cells, index):
    """Return the stripped text of ``cells[index]``, or "" if the column is absent."""
    if index is None or index >= len(cells):
        return ""
    return cells[index].get_text().strip()


def _collect_from_tables(soup, settings):
    """Add every setting listed in a settings-style HTML table to *settings*."""
    tables = soup.find_all('table')
    print(f"Found {len(tables)} tables")
    for table in tables:
        rows = table.find_all('tr')
        if not rows:
            continue
        headers = [
            cell.get_text().strip().lower() for cell in rows[0].find_all(['th', 'td'])
        ]
        print(f"Table headers: {headers}")

        # A table without a setting/name/property column is not a settings table.
        setting_idx = _find_column(headers, ('setting', 'name', 'property'))
        if setting_idx is None:
            continue
        desc_idx = _find_column(headers, ('description',))
        runtime_idx = _find_column(headers, ('runtime', 'dynamic'))
        default_idx = _find_column(headers, ('default',))
        type_idx = _find_column(headers, ('type',))
        print(
            f"Processing table with setting index: {setting_idx}, "
            f"description index: {desc_idx}, runtime index: {runtime_idx}"
        )

        for row in rows[1:]:  # rows[0] is the header row
            cells = row.find_all(['td', 'th'])
            if len(cells) <= setting_idx:
                continue
            setting_key = _clean_setting_key(cells[setting_idx])
            if not setting_key or setting_key.startswith('#') or len(setting_key) > 100:
                continue

            info = _new_setting_info(_cell_text(cells, desc_idx))
            info["default_value"] = _cell_text(cells, default_idx)
            info["type"] = _cell_text(cells, type_idx)

            runtime_text = _cell_text(cells, runtime_idx).lower()
            info["runtime_configurable"] = (
                any(marker in runtime_text for marker in ('yes', 'true', '✓'))
                or runtime_text == 'y'
                or _description_marks_runtime(info["raw_description"])
            )
            if info["runtime_configurable"]:
                print(f"Setting {setting_key} IS runtime configurable")

            parse_description(info)
            settings[setting_key] = info
            print(f"Added setting from table: {setting_key}")


def _collect_from_definition_lists(soup, settings):
    """Add every ``<dt>``/``<dd>`` pair on the page to *settings*."""
    dl_elements = soup.find_all('dl')
    print(f"Found {len(dl_elements)} definition lists")
    for dl in dl_elements:
        for dt in dl.find_all('dt'):
            # Nested lists show up in dl_elements themselves; only handle the
            # terms that belong directly to this list so each is paired once.
            if dt.find_parent('dl') is not dl:
                continue
            # Pair the term with the description that follows it instead of
            # relying on positional indexes, which drift as soon as one term
            # has no <dd> or several terms share one.
            dd = dt.find_next_sibling('dd')
            if dd is None:
                continue
            setting_key = _clean_setting_key(dt)
            if not setting_key or len(setting_key) > 100:
                continue

            info = _new_setting_info(dd.get_text().strip())
            info["runtime_configurable"] = _description_marks_runtime(
                info["raw_description"]
            )
            if info["runtime_configurable"]:
                print(f"Setting {setting_key} IS runtime configurable based on description")

            parse_description(info)
            settings[setting_key] = info
            print(f"Added setting from dl: {setting_key}")


def _collect_from_headings(sections, settings):
    """Add settings that only appear as a heading followed by a paragraph."""
    for section in sections:
        for header in section.find_all(['h2', 'h3', 'h4']):
            header_text = header.get_text().strip()
            # Only headings that look like a dotted setting name qualify, and
            # entries already found in tables or lists take precedence.
            if header_text in settings:
                continue
            if not re.fullmatch(r'[a-z0-9_.]+', header_text, re.IGNORECASE):
                continue

            next_elem = header.find_next(['p', 'div'])
            info = _new_setting_info(next_elem.get_text().strip() if next_elem else "")
            info["runtime_configurable"] = _description_marks_runtime(
                info["raw_description"]
            )

            parse_description(info)
            settings[header_text] = info
            print(f"Added setting from header: {header_text}")


def extract_cratedb_settings(url=_CRATEDB_CLUSTER_SETTINGS_URL, timeout=30):
    """Scrape the CrateDB cluster-settings page into a dict of setting records.

    Settings are collected from three places, in this order: HTML tables,
    definition lists, and headings that look like a setting name. A later
    source overwrites an earlier one for tables and lists; headings only fill
    in names not seen before.

    Args:
        url: Page to scrape. Defaults to the CrateDB cluster settings reference.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict mapping setting name to a record with the keys
        ``raw_description``, ``runtime_configurable``, ``default_value``,
        ``type``, ``purpose``, ``constraints``, ``related_settings`` and
        ``deprecated``. Empty if the page could not be fetched.
    """
    print(f"Fetching {url}...")
    try:
        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch the page: {exc}")
        return {}
    if response.status_code != 200:
        print(f"Failed to fetch the page. Status code: {response.status_code}")
        return {}
    print("Successfully fetched the page. Parsing content...")

    soup = BeautifulSoup(response.text, 'html.parser')
    settings = {}

    sections = soup.find_all(['div', 'section'], class_=['section', 'doc-content'])
    print(f"Found {len(sections)} potential sections to process")

    # Tables first: they carry the most structured data.
    _collect_from_tables(soup, settings)
    _collect_from_definition_lists(soup, settings)
    _collect_from_headings(sections, settings)

    print(f"Total settings found: {len(settings)}")
    return settings
# A value runs up to a quote, newline, comma, semicolon or a sentence-ending
# period. Periods inside a token (0.5, 0.0.0.0) are part of the value.
_VALUE_FRAGMENT = r'[\'"`]?([^\'"`\n,;]+?)(?=[\'"`\n,;]|\.(?:\s|$)|$)[\'"`]?'

# Longer keywords come first so "defaults to" is not consumed as "default".
_DEFAULT_VALUE_RE = re.compile(
    r'\b(?:defaults to|defaulting to|defaults?(?: value)?(?: is)?'
    r'|initial value(?: is)?|value is)\b:?\s*' + _VALUE_FRAGMENT,
    re.IGNORECASE,
)

# Same idea, restricted to explicit "default ..." clauses; used to strip the
# clause from the purpose text.
_DEFAULT_CLAUSE_RE = re.compile(
    r'\b(?:defaults to|defaulting to|defaults?(?: value)?(?: is| are)?)\b:?\s*'
    + _VALUE_FRAGMENT,
    re.IGNORECASE,
)

# Each pattern exposes the type name as group 1.
_TYPE_PATTERNS = (
    re.compile(
        r'\b(?:data ?type|value type|type)\b:?\s*([a-zA-Z0-9_\- ]+)', re.IGNORECASE
    ),
    re.compile(
        r'\b(string|integer|boolean|float|double|time|list|array|enum) '
        r'(?:setting|value|type)\b',
        re.IGNORECASE,
    ),
)

# Word boundaries keep "min"/"max" from matching inside e.g. "minutes".
_CONSTRAINT_PATTERNS = (
    re.compile(
        r'\b(must be|valid values|valid range|range is|range of|between'
        r'|minimum|maximum|min|max)\b.{1,100}',
        re.IGNORECASE,
    ),
    re.compile(
        r'\b(only|strictly)\b.{1,50}'
        r'(positive|negative|greater than|less than|non-negative).{1,50}',
        re.IGNORECASE,
    ),
)

_RELATED_SETTING_RE = re.compile(r'([a-z][a-z0-9_\-.]+\.[a-z0-9_\-.]+)')

# Clauses that describe defaults/types/constraints rather than the purpose.
_PURPOSE_NOISE = (
    _DEFAULT_CLAUSE_RE,
    re.compile(r'\btype:?\s*[a-zA-Z0-9_\- ]+', re.IGNORECASE),
    re.compile(r'(must be|valid values|valid range).{1,100}', re.IGNORECASE),
    re.compile(r'(deprecated).{1,100}', re.IGNORECASE),
    re.compile(r'runtime configurable', re.IGNORECASE),
    re.compile(r'runtime:\s*(?:yes|no)\b', re.IGNORECASE),
    re.compile(r'dynamic:\s*(?:true|false)\b', re.IGNORECASE),
)


def parse_description(setting_info):
    """Fill the structured fields of *setting_info* from its raw description.

    Reads ``setting_info["raw_description"]`` and derives ``default_value``,
    ``type``, ``deprecated``, ``constraints``, ``related_settings`` and
    ``purpose`` with regex heuristics. ``default_value`` and ``type`` are only
    filled in when they are still empty, so values already taken from a
    dedicated table column are not replaced by a guess.

    Args:
        setting_info: Settings record (dict); modified in place.

    Returns:
        The same ``setting_info`` dict, for convenience.
    """
    desc = setting_info["raw_description"]

    # Default value: keep an explicit column value if the caller supplied one.
    default_match = _DEFAULT_VALUE_RE.search(desc)
    if default_match and not setting_info.get("default_value"):
        setting_info["default_value"] = default_match.group(1).strip()

    # Type: first pattern that matches wins; explicit column value is kept.
    if not setting_info.get("type"):
        for pattern in _TYPE_PATTERNS:
            type_match = pattern.search(desc)
            if type_match:
                setting_info["type"] = type_match.group(1).strip()
                break

    setting_info["deprecated"] = "deprecated" in desc.lower()

    for pattern in _CONSTRAINT_PATTERNS:
        constraint_match = pattern.search(desc)
        if constraint_match:
            setting_info["constraints"] = constraint_match.group(0).strip()
            break

    # Dotted identifiers mentioned in the text. A match at the end of a
    # sentence swallows the period, so trim it; dict.fromkeys de-duplicates
    # while keeping first-seen order so the JSON output is reproducible.
    related = (name.rstrip('.') for name in _RELATED_SETTING_RE.findall(desc))
    related = [name for name in dict.fromkeys(related) if '.' in name]
    if related:
        setting_info["related_settings"] = related

    # Purpose: what is left once the metadata clauses are removed, limited to
    # the first two sentences.
    purpose_text = desc
    for pattern in _PURPOSE_NOISE:
        purpose_text = pattern.sub('', purpose_text)
    # Removing clauses leaves stray newlines and double spaces behind.
    purpose_text = ' '.join(purpose_text.split())
    sentences = re.split(r'(?<=[.!?])\s+', purpose_text)
    setting_info["purpose"] = ' '.join(sentences[:2]).strip()

    return setting_info
def _sql_value_literal(value):
    """Render *value* as a SQL literal: bare when numeric, single-quoted otherwise."""
    if re.fullmatch(r'-?\d+(\.\d+)?', value):
        return value
    # A value that already arrives wrapped in quotes is passed through as is.
    if len(value) >= 2 and value.startswith("'") and value.endswith("'"):
        return value
    # Double embedded single quotes so the literal stays well-formed.
    return "'" + value.replace("'", "''") + "'"


def main():
    """Extract the settings, attach SQL statements, and write the JSON file.

    Every runtime-configurable setting gets a ``stmt`` entry holding a
    statement that sets it to its default value. The result is written to
    ``cratedb_settings.json`` in the current directory, followed by a short
    summary and up to five sample records on stdout.
    """
    settings = extract_cratedb_settings()

    # NOTE(review): CrateDB documents "SET GLOBAL ..." for cluster settings;
    # confirm that consumers of this file really expect "SET CLUSTER".
    for setting_key, setting_info in settings.items():
        if not setting_info["runtime_configurable"]:
            continue
        literal = _sql_value_literal(setting_info["default_value"])
        stmt = f'SET CLUSTER "{setting_key}" = {literal}'
        setting_info["stmt"] = stmt
        print(f"Added statement for {setting_key}: {stmt}")

    with open('cratedb_settings.json', 'w', encoding='utf-8') as f:
        json.dump(settings, f, indent=2, ensure_ascii=False)
    print(f"Extracted {len(settings)} settings to cratedb_settings.json")

    runtime_count = sum(1 for info in settings.values() if info["runtime_configurable"])
    print(f"Found {runtime_count} runtime configurable settings out of {len(settings)} total")
    print(f"Added SQL statements for {runtime_count} runtime configurable settings")

    if not settings:
        print("No settings were extracted. Please check the script or the webpage structure.")
        return

    # Print a few records so the extraction can be checked by eye.
    print("\nSample settings extracted:")
    for key, value in list(settings.items())[:5]:
        print(f"Setting: {key}")
        print(f"Raw description: {value['raw_description'][:100]}...")
        print(f"Purpose: {value['purpose'][:100]}...")
        print(f"Type: {value['type']}")
        print(f"Default: {value['default_value']}")
        print(f"Constraints: {value['constraints']}")
        print(f"Runtime configurable: {value['runtime_configurable']}")
        if value["runtime_configurable"]:
            print(f"Statement: {value['stmt']}")
        print(f"Related settings: {value['related_settings']}")
        print(f"Deprecated: {value['deprecated']}")
        print()


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment