This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from spacy import load as spacy_load

# This loads the large English model (`en_core_web_lg`), which must be
# downloaded separately from the package installation. Other models are
# available.
nlp = spacy_load('en_core_web_lg')
def doc_to_spans(list_of_texts, join_string=' ||| '): | |
all_docs = nlp(' ||| '.join(list_of_texts)) | |
split_inds = [i for i, token in enumerate(all_docs) if token.text == '|||'] + [len(all_docs)] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Regular expression to find floats. Match groups are the whole string, the
# whole coefficient, the decimal part of the coefficient, and the exponent
# part. The optional sign is placed outside the non-capturing alternation so
# that it applies to both coefficient forms ("1.5" and ".5"); otherwise a
# string such as "-.5" would only match from the "." onward.
_float_re = re.compile(r'(([+-]?(?:\d+(\.\d*)?|\.\d+))([eE][+-]?\d+)?)')


def valid_float_string(string):
    """Return True if ``string`` is, in its entirety, a float literal.

    The accepted form is an optional sign, a coefficient written either as
    digits with an optional fractional part ("3", "3.", "3.14") or as a
    leading-dot fraction (".14"), and an optional exponent ("e10", "E-3").
    Surrounding whitespace or any other extra characters make the string
    invalid.

    Args:
        string: The text to check.

    Returns:
        bool: True when the whole string matches the float pattern,
        False otherwise (including for the empty string).
    """
    # fullmatch() anchors the pattern at both ends, so there is no need to
    # search and then compare the matched text against the input.
    return _float_re.fullmatch(string) is not None