Last active
July 16, 2025 10:23
-
-
Save barseghyanartur/52d6a1c8ee140bfb75db8aff25a2cb3a to your computer and use it in GitHub Desktop.
Get text around passage
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script | |
# requires-python = ">=3.12" | |
# dependencies = [ | |
# "flair", | |
# "IPython", | |
# ] | |
# /// | |
from flair.splitter import SegtokSentenceSplitter | |
# Build the Segtok-based sentence splitter once at import time;
# get_sentence_context_flair reuses this single instance on every call
# instead of constructing a new splitter each time.
splitter = SegtokSentenceSplitter()
def get_sentence_context_flair(
    text: str,
    passage: str,
    window: int = 4,
) -> tuple[list[str], list[str], list[str]]:
    """Return the sentences surrounding the first occurrence of `passage`.

    `text` is split with the module-level Flair `splitter`. Each resulting
    sentence is then located inside `text` so that the character range of the
    passage can be mapped onto sentence indices.

    Args:
        text: Full text to search and to split into sentences.
        passage: Substring to locate; only its first occurrence is used.
        window: Maximum number of sentences to return on each side of the
            target. Negative values are treated as 0.

    Returns:
        A tuple `(before, target, after)` of sentence lists. `target` holds
        every sentence the passage overlaps (a single sentence in the common
        case, several when the splitter breaks inside the passage). `before`
        and `after` hold up to `window` sentences on either side of `target`.

    Raises:
        ValueError: If `passage` does not occur in `text`, or if no sentence
            ends after the position where the passage starts.
    """
    # Validate first so that a missing passage fails before any splitting work.
    start_char = text.find(passage)
    if start_char < 0:
        raise ValueError("Passage not found in text.")
    end_char = start_char + len(passage)

    # A negative window would turn the slices below into wrap-around slices.
    window = max(window, 0)

    # Split text into Sentence objects, then extract their text.
    sentences = [sent.text for sent in splitter.split(text)]

    # Locate every sentence in the original text instead of assuming that
    # sentences are separated by exactly one character: runs of spaces,
    # newlines or leading whitespace would otherwise shift all later offsets.
    spans: list[tuple[int, int]] = []
    cursor = 0
    for sentence in sentences:
        begin = text.find(sentence, cursor)
        if begin < 0:
            # The sentence text is not a verbatim substring (e.g. it was
            # normalized); estimate its start by skipping whitespace.
            begin = cursor
            while begin < len(text) and text[begin].isspace():
                begin += 1
        cursor = begin + len(sentence)
        spans.append((begin, cursor))

    # First sentence that ends after the passage start. This also covers a
    # passage that begins in the whitespace between two sentences.
    first_idx = next(
        (i for i, (_, stop) in enumerate(spans) if stop > start_char), None
    )
    if first_idx is None:
        raise ValueError("Could not map passage to a sentence.")

    # Extend the target over every following sentence that starts before the
    # passage ends, so a passage crossing sentence boundaries stays whole.
    last_idx = first_idx
    while last_idx + 1 < len(spans) and spans[last_idx + 1][0] < end_char:
        last_idx += 1

    # Slicing clamps at the end of the list, so `target` is never emptied when
    # the passage lies close to the end of the text.
    before = sentences[max(0, first_idx - window) : first_idx]
    target = sentences[first_idx : last_idx + 1]
    after = sentences[last_idx + 1 : last_idx + 1 + window]
    return before, target, after
# Example usage | |
if __name__ == "__main__":
    # Sample text: the opening of "Alice's Adventures in Wonderland".
    full_text = (
        "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do. "
        "Once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, "
        "'and what is the use of a book,' thought Alice 'without pictures or conversation?' "
        "So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), "
        "whether the pleasure of making a daisy-chain would be worth the trouble. "
        "Suddenly a White Rabbit with pink eyes ran close by her. "
        "There was nothing so very remarkable in that; nor did Alice think it so very much out of the way to hear the Rabbit say "
        "to itself 'Oh dear! Oh dear! I shall be late!' (when she thought it over afterwards, it occurred to her that she ought to "
        "have wondered at this, but at the time it all seemed quite natural); but when the Rabbit actually took a watch out of its "
        "waistcoat-pocket, and looked at it, and hurried on, Alice started to her feet, for it flashed across her mind that she had "
        "never before seen a rabbit with either a waistcoat-pocket, or a watch to take out of it. "
        "With nothing more than a curious glance back at her sister, who was still sitting on the bank, and was getting very "
        "tired of waiting by now, Alice ran across the field after the Rabbit, and fortunately was just in time to see it pop down "
        "a large rabbit-hole under the hedge. "
        "In another moment down went Alice after it, never once considering how in the world she was to get out again. "
        "The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not "
        "a moment to think about stopping herself before she found herself falling down a very deep well. "
        "Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to "
        "wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to "
        "see anything; she felt that she was dozing off, and had just begun to dream that she was walking hand in hand with Dinah, and "
        "saying to her 'Now, Dinah, tell me the truth: did you ever eat a bat?' when suddenly, thump! Down she came upon a heap of sticks "
        "and dry leaves, and the fall was over."
    )
    passage = "Oh dear! Oh dear! I shall be late!"
    window = 4

    before, target, after = get_sentence_context_flair(
        full_text, passage, window=window
    )

    # Print the three groups in reading order, each under its own heading.
    sections = (
        (f"=== {window} sentences BEFORE ===", before),
        ("\n=== TARGET sentence ===", target),
        (f"\n=== {window} sentences AFTER ===", after),
    )
    for heading, group in sections:
        print(heading)
        for s in group:
            print(f" • {s}")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script | |
# requires-python = ">=3.12" | |
# dependencies = [ | |
# "sentence-splitter", | |
# "IPython", | |
# ] | |
# /// | |
from sentence_splitter import SentenceSplitter | |
# Build the English sentence splitter once at import time;
# get_sentence_context_splitter reuses this single instance on every call.
splitter = SentenceSplitter(language="en")
def get_sentence_context_splitter(
    text: str,
    passage: str,
    window: int = 4,
) -> tuple[list[str], list[str], list[str]]:
    """Return the sentences surrounding the first occurrence of `passage`.

    `text` is split with the module-level sentence-splitter `splitter`. Each
    resulting sentence is then located inside `text` so that the character
    range of the passage can be mapped onto sentence indices.

    Args:
        text: Full text to search and to split into sentences.
        passage: Substring to locate; only its first occurrence is used.
        window: Maximum number of sentences to return on each side of the
            target. Negative values are treated as 0.

    Returns:
        A tuple `(before, target, after)` of sentence lists. `target` holds
        every sentence the passage overlaps (a single sentence in the common
        case, several when the splitter breaks inside the passage). `before`
        and `after` hold up to `window` sentences on either side of `target`.

    Raises:
        ValueError: If `passage` does not occur in `text`, or if no sentence
            ends after the position where the passage starts.
    """
    # Validate first so that a missing passage fails before any splitting work.
    start_char = text.find(passage)
    if start_char < 0:
        raise ValueError("Passage not found in text.")
    end_char = start_char + len(passage)

    # A negative window would turn the slices below into wrap-around slices.
    window = max(window, 0)

    # Split the full text into sentences.
    sentences = splitter.split(text)

    # Locate every sentence in the original text instead of assuming that
    # sentences are separated by exactly one character: runs of spaces,
    # newlines or leading whitespace would otherwise shift all later offsets.
    spans: list[tuple[int, int]] = []
    cursor = 0
    for sentence in sentences:
        begin = text.find(sentence, cursor)
        if begin < 0:
            # The sentence text is not a verbatim substring (e.g. whitespace
            # was normalized); estimate its start by skipping whitespace.
            begin = cursor
            while begin < len(text) and text[begin].isspace():
                begin += 1
        cursor = begin + len(sentence)
        spans.append((begin, cursor))

    # First sentence that ends after the passage start. This also covers a
    # passage that begins in the whitespace between two sentences.
    first_idx = next(
        (i for i, (_, stop) in enumerate(spans) if stop > start_char), None
    )
    if first_idx is None:
        raise ValueError("Could not map passage to a sentence.")

    # Extend the target over every following sentence that starts before the
    # passage ends, so a passage crossing sentence boundaries stays whole.
    last_idx = first_idx
    while last_idx + 1 < len(spans) and spans[last_idx + 1][0] < end_char:
        last_idx += 1

    # Slicing clamps at the end of the list, so `target` is never emptied when
    # the passage lies close to the end of the text.
    before = sentences[max(0, first_idx - window) : first_idx]
    target = sentences[first_idx : last_idx + 1]
    after = sentences[last_idx + 1 : last_idx + 1 + window]
    return before, target, after
if __name__ == "__main__":
    # Sample text and the passage whose surrounding sentences are wanted.
    full_text = (
        "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do. "
        "Once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, "
        "'and what is the use of a book,' thought Alice 'without pictures or conversation?' "
        "So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), "
        "whether the pleasure of making a daisy-chain would be worth the trouble. "
        "Suddenly a White Rabbit with pink eyes ran close by her. "
        "There was nothing so very remarkable in that; nor did Alice think it so very much out of the way to hear the Rabbit say "
        "to itself 'Oh dear! Oh dear! I shall be late!' (when she thought it over afterwards, it occurred to her that she ought to "
        "have wondered at this, but at the time it all seemed quite natural); but when the Rabbit actually took a watch out of its "
        "waistcoat-pocket, and looked at it, and hurried on, Alice started to her feet, for it flashed across her mind that she had "
        "never before seen a rabbit with either a waistcoat-pocket, or a watch to take out of it. "
        "With nothing more than a curious glance back at her sister, who was still sitting on the bank, and was getting very "
        "tired of waiting by now, Alice ran across the field after the Rabbit, and fortunately was just in time to see it pop down "
        "a large rabbit-hole under the hedge. "
        "In another moment down went Alice after it, never once considering how in the world she was to get out again. "
        "The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not "
        "a moment to think about stopping herself before she found herself falling down a very deep well. "
        "Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to "
        "wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to "
        "see anything; she felt that she was dozing off, and had just begun to dream that she was walking hand in hand with Dinah, and "
        "saying to her 'Now, Dinah, tell me the truth: did you ever eat a bat?' when suddenly, thump! Down she came upon a heap of sticks "
        "and dry leaves, and the fall was over."
    )
    passage = "Oh dear! Oh dear! I shall be late!"
    window = 4

    before, target, after = get_sentence_context_splitter(
        full_text, passage, window=window
    )

    # Print the three groups in reading order, each under its own heading.
    sections = (
        (f"=== {window} sentences BEFORE ===", before),
        ("\n=== TARGET sentence ===", target),
        (f"\n=== {window} sentences AFTER ===", after),
    )
    for heading, group in sections:
        print(heading)
        for s in group:
            print(f" • {s}")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script | |
# requires-python = ">=3.12" | |
# dependencies = [ | |
# "stanza", | |
# "IPython", | |
# ] | |
# /// | |
import stanza | |
# Download the English models once before first use: stanza.download('en')
# Only the "tokenize" processor is requested for this pipeline. The instance
# is built once at import time and reused by get_sentence_context_stanza.
nlp = stanza.Pipeline(lang="en", processors="tokenize")
def get_sentence_context_stanza(
    text: str,
    passage: str,
    window: int = 4,
) -> tuple[list[str], list[str], list[str]]:
    """Return the sentences surrounding the first occurrence of `passage`.

    `text` is processed with the module-level Stanza pipeline `nlp`. Each
    resulting sentence is then located inside `text` so that the character
    range of the passage can be mapped onto sentence indices.

    Args:
        text: Full text to search and to split into sentences.
        passage: Substring to locate; only its first occurrence is used.
        window: Maximum number of sentences to return on each side of the
            target. Negative values are treated as 0.

    Returns:
        A tuple `(before, target, after)` of sentence lists. `target` holds
        every sentence the passage overlaps (a single sentence in the common
        case, several when the pipeline breaks inside the passage). `before`
        and `after` hold up to `window` sentences on either side of `target`.

    Raises:
        ValueError: If `passage` does not occur in `text`, or if no sentence
            ends after the position where the passage starts.
    """
    # Validate first so that a missing passage fails before the pipeline runs.
    start_char = text.find(passage)
    if start_char < 0:
        raise ValueError("Passage not found in text.")
    end_char = start_char + len(passage)

    # A negative window would turn the slices below into wrap-around slices.
    window = max(window, 0)

    # Process the full text and extract the sentence texts.
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sentences]

    # Locate every sentence in the original text instead of assuming that
    # sentences are separated by exactly one character: runs of spaces,
    # newlines or leading whitespace would otherwise shift all later offsets.
    spans: list[tuple[int, int]] = []
    cursor = 0
    for sentence in sentences:
        begin = text.find(sentence, cursor)
        if begin < 0:
            # The sentence text is not a verbatim substring of `text`;
            # estimate its start by skipping whitespace.
            begin = cursor
            while begin < len(text) and text[begin].isspace():
                begin += 1
        cursor = begin + len(sentence)
        spans.append((begin, cursor))

    # First sentence that ends after the passage start. This also covers a
    # passage that begins in the whitespace between two sentences.
    first_idx = next(
        (i for i, (_, stop) in enumerate(spans) if stop > start_char), None
    )
    if first_idx is None:
        raise ValueError("Could not map passage to a sentence.")

    # Extend the target over every following sentence that starts before the
    # passage ends, so a passage crossing sentence boundaries stays whole.
    last_idx = first_idx
    while last_idx + 1 < len(spans) and spans[last_idx + 1][0] < end_char:
        last_idx += 1

    # Slicing clamps at the end of the list, so `target` is never emptied when
    # the passage lies close to the end of the text.
    before = sentences[max(0, first_idx - window) : first_idx]
    target = sentences[first_idx : last_idx + 1]
    after = sentences[last_idx + 1 : last_idx + 1 + window]
    return before, target, after
if __name__ == "__main__":
    # Sample text and the passage whose surrounding sentences are wanted.
    full_text = (
        "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do. "
        "Once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, "
        "'and what is the use of a book,' thought Alice 'without pictures or conversation?' "
        "So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), "
        "whether the pleasure of making a daisy-chain would be worth the trouble. "
        "Suddenly a White Rabbit with pink eyes ran close by her. "
        "There was nothing so very remarkable in that; nor did Alice think it so very much out of the way to hear the Rabbit say "
        "to itself 'Oh dear! Oh dear! I shall be late!' (when she thought it over afterwards, it occurred to her that she ought to "
        "have wondered at this, but at the time it all seemed quite natural); but when the Rabbit actually took a watch out of its "
        "waistcoat-pocket, and looked at it, and hurried on, Alice started to her feet, for it flashed across her mind that she had "
        "never before seen a rabbit with either a waistcoat-pocket, or a watch to take out of it. "
        "With nothing more than a curious glance back at her sister, who was still sitting on the bank, and was getting very "
        "tired of waiting by now, Alice ran across the field after the Rabbit, and fortunately was just in time to see it pop down "
        "a large rabbit-hole under the hedge. "
        "In another moment down went Alice after it, never once considering how in the world she was to get out again. "
        "The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not "
        "a moment to think about stopping herself before she found herself falling down a very deep well. "
        "Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to "
        "wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to "
        "see anything; she felt that she was dozing off, and had just begun to dream that she was walking hand in hand with Dinah, and "
        "saying to her 'Now, Dinah, tell me the truth: did you ever eat a bat?' when suddenly, thump! Down she came upon a heap of sticks "
        "and dry leaves, and the fall was over."
    )
    passage = "Oh dear! Oh dear! I shall be late!"
    window = 4

    before, target, after = get_sentence_context_stanza(
        full_text, passage, window=window
    )

    # Print the three groups in reading order, each under its own heading.
    sections = (
        (f"=== {window} sentences BEFORE ===", before),
        ("\n=== TARGET sentence ===", target),
        (f"\n=== {window} sentences AFTER ===", after),
    )
    for heading, group in sections:
        print(heading)
        for s in group:
            print(f" • {s}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment