Last active
September 25, 2021 04:27
-
-
Save afiaka87/e9b73930d9b1c46baf17f08185cb5f45 to your computer and use it in GitHub Desktop.
Clean and filter Crawling@Home samples by caption length, aspect ratio, image size, and detected language
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Patterns removed from captions by tokenize(); compiled once at import time.
number_regex = re.compile(r'[0-9]{5,}')  # runs of five or more digits
date_regex = re.compile(r'[0-9]{4}-[0-9]{2}-[0-9]{2}')  # NNNN-NN-NN shaped dates
url_regex = re.compile(r'https?://[^\s]+')  # http(s) URL up to the next whitespace
@lru_cache(maxsize=32)
def tokenize(s):
    """Decode, clean and tokenize a single raw caption.

    ``s`` is decoded as UTF-8 and lower-cased; long digit runs, NNNN-NN-NN
    dates and URLs are then removed (in that order) before the text is handed
    to ``tokenizer.tokenize``. The leading dimension of the tokenizer output
    is squeezed away. Up to 32 results are memoized by ``lru_cache``.
    """
    text = s.decode('utf-8').lower()
    # Order matters: keep the same number -> date -> URL sequence.
    for pattern in (number_regex, date_regex, url_regex):
        text = pattern.sub('', text)
    tokens = tokenizer.tokenize(
        text,
        TEXT_SEQ_LEN,
        truncate_text=args.truncate_captions,
    )
    return tokens.squeeze(0)
# WebDataset only: filtering options
# Caption length bounds, compared against the decoded caption's len().
MINIMUM_CAP_LEN = 1
MAXIMUM_CAP_LEN = 5000
# Accepted range for original_width / original_height.
MIN_ASPECT_RATIO = 0.5
MAX_ASPECT_RATIO = 2.0
# When enabled, only captions detected as FILTER_LANGUAGE_CODE are kept.
FILTER_LANGUAGE = False
FILTER_LANGUAGE_CODE = 'en'

if FILTER_LANGUAGE:
    print(f'Filtering language: {FILTER_LANGUAGE_CODE} using cld3')
if ENABLE_WEBDATASET:
    # You need to set a nominal length for the Dataset in order to avoid
    # warnings from DataLoader.
    DATASET_SIZE = int(1e9)

    myimg, mycap = WEBDATASET_IMAGE_TEXT_COLUMNS

    # Per-key transforms applied with ``map_dict`` once a sample has passed
    # ``filter_dataset``.
    image_text_mapping = {
        myimg: imagetransform,
        mycap: tokenize,
    }
    image_mapping = {
        myimg: imagepreproc,
    }

    def filter_dataset(item):
        """Return True when a raw sample should be kept.

        A sample is rejected when any of the following holds:

        * the image, caption or ``json`` metadata entry is missing
          (e.g. C@H, which rarely has no caption available);
        * the metadata or caption cannot be decoded, or the original
          width/height are missing or non-numeric;
        * either original dimension is smaller than ``IMAGE_SIZE``;
        * width / height falls outside
          ``[MIN_ASPECT_RATIO, MAX_ASPECT_RATIO]``;
        * the caption length falls outside
          ``[MINIMUM_CAP_LEN, MAXIMUM_CAP_LEN]``;
        * ``FILTER_LANGUAGE`` is set and cld3 does not detect
          ``FILTER_LANGUAGE_CODE``.

        Malformed samples are dropped rather than raised, because an
        exception in this predicate would otherwise propagate out of
        ``select`` and stop iteration.
        """
        if mycap not in item or myimg not in item or 'json' not in item:
            return False

        try:
            metadata = json.loads(item['json'].decode('utf-8'))
            original_width = float(metadata['original_width'])
            original_height = float(metadata['original_height'])
            caption = item[mycap].decode('utf-8')
        except (KeyError, TypeError, ValueError):
            # ValueError also covers JSONDecodeError and UnicodeDecodeError.
            return False

        # image size
        if original_width < IMAGE_SIZE or original_height < IMAGE_SIZE:
            return False

        # aspect ratio
        if original_height <= 0:
            # Only reachable when IMAGE_SIZE is non-positive; avoids a
            # ZeroDivisionError below.
            return False
        aspect_ratio = original_width / original_height
        if aspect_ratio < MIN_ASPECT_RATIO or aspect_ratio > MAX_ASPECT_RATIO:
            return False

        # caption length
        if not MINIMUM_CAP_LEN <= len(caption) <= MAXIMUM_CAP_LEN:
            return False

        # language detection -- only run the detector when it is needed.
        if FILTER_LANGUAGE:
            # (you may ignore the linter warning about this)
            detected = cld3.get_language(caption)
            # No prediction (None) is treated as "not the wanted language".
            if detected is None or detected.language != FILTER_LANGUAGE_CODE:
                return False

        return True

    w_dataset = wds.WebDataset(DATASET, handler=wds.warn_and_continue)
    filtered_dataset = w_dataset.select(filter_dataset)
    ds = (
        filtered_dataset
        .map_dict(**image_text_mapping)
        .map_dict(**image_mapping)
        .to_tuple(mycap, myimg)
        .batched(BATCH_SIZE, partial=True)
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment