Last active
July 7, 2023 07:00
-
-
Save omc8db/47a6759b6ed06671e1e77c0c7e6ed84f to your computer and use it in GitHub Desktop.
@spacenerd — Today at 1:59 PM "Could you make an awk script to print out the lyrics from WAP by only grouping 3 or more letters together at a time when parsing through the federal budget proposal?"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
import sys, re

# Number of leading target letters every match must contain (MIN_MATCH) and
# the largest number of target letters a single match may consume (MAX_MATCH).
MIN_MATCH = 3
MAX_MATCH = 9

# The lyrics of "Wet Ass Pussy" by Cardi B
# Read inside a context manager so the file handle is closed right away.
with open(sys.argv[1]) as target_file:
    target = target_file.read()

# United States House Resolution 2617, Omnibus spending bill for FY2023
# Lowercased once up front; the search patterns are built from lowercase text.
with open(sys.argv[2]) as reference_file:
    reference = reference_file.read().lower()
def sanitize(s: str) -> str:
    """Return s lowercased with non-word characters and underscores removed.

    Digits and non-ASCII letters count as word characters for the regex
    engine, so they are kept.
    """
    # Raw string: a plain '\W' literal is an invalid escape sequence and
    # triggers a warning on newer Python versions.
    return re.sub(r'[\W_]+', '', s.lower())
# Sanitize the input: lowercase it and drop whitespace, punctuation and
# underscores (digits are kept). Reuses the helper instead of repeating
# its regex inline.
target = sanitize(target)

# Modifiers that can be added to regex letters.
# "Noise" is any run of characters that is neither a lowercase letter nor a
# newline, so a match may skip spaces/punctuation but never crosses a line.
OPTIONAL_WITH_NOISE = '?[^a-z\n]*'
WITH_NOISE = '[^a-z\n]*'
def regex_modify(s: str, option: str) -> str:
    """Return s with option inserted after every lowercase ASCII letter.

    Characters other than a-z are copied through unchanged.

    Args:
        s: Text whose letters should receive the modifier.
        option: Regex fragment appended after each letter.
    """
    # A function replacement inserts option verbatim. Concatenating it into a
    # replacement template would make re.sub interpret any backslash in it.
    return re.sub('[a-z]', lambda m: m.group(0) + option, s)
# ANSI escape sequences used to decorate the terminal output.
# Every code starts with the same ESC + '[' introducer.
_CSI = '\033['
BOLD = _CSI + '1m'
PURPLE = _CSI + '95m'
UNDERLINE = _CSI + '4m'
END = _CSI + '0m'

# Running character position added to each reported match span.
offset = 0
def longest_match_regex(s: str) -> str:
    """Return a regex that matches any number of leading letters from s.

    The pattern is a chain of nested optional groups, one per character, so
    the engine consumes as long a prefix of s as it can. Each character may
    be followed by characters that are neither lowercase letters nor
    newlines. An empty s yields an empty pattern.
    """
    # Escape each character so a regex metacharacter in s is matched
    # literally instead of corrupting the pattern.
    opening = "".join(f"({re.escape(c)}[^a-z\n]*" for c in s)
    # Close every group opened above and make it optional.
    return opening + ")?" * len(s)
# Walk through the reference text, consuming the target a few letters at a
# time: each iteration finds a line containing the next MIN_MATCH..MAX_MATCH
# target letters, prints it with the match highlighted, then continues
# searching after that match.
while reference and target:
    # The first MIN_MATCH letters are mandatory; up to MAX_MATCH - MIN_MATCH
    # further letters are matched opportunistically.
    searchstr = regex_modify(target[:MIN_MATCH], WITH_NOISE)
    searchstr += longest_match_regex(target[MIN_MATCH:MAX_MATCH])
    line = re.search(f"\n.*({searchstr}).*\n", reference, re.IGNORECASE)
    if line is None:
        break
    # flags must be passed by keyword: the fourth positional parameter of
    # re.sub is count, so a positional re.IGNORECASE silently meant count=2.
    bolded_line = re.sub(
        f'({searchstr})',
        BOLD + PURPLE + UNDERLINE + r'\1' + END,
        line.group(0),
        flags=re.IGNORECASE,
    ).strip()
    # Find boundaries of word match within the line
    mstart, mend = line.span(1)
    # Only letters count as consumed target characters; skipped noise does not.
    # NOTE(review): a digit in target would be matched but not counted here.
    matched_chars = len(re.sub('[^a-zA-Z]', '', line.group(1)))
    print(f"Omnibus spending bill, characters {offset + mstart} to {offset + mend}")
    print("\t" + bolded_line)
    target = target[matched_chars:]
    reference = reference[mend:]
    # reference was cut at mend, so the absolute position of its new start
    # advances by mend (not mstart).
    offset += mend
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment