Created December 1, 2023 02:02
Save masta-g3/f1ce2fd033af41c440c5e225137c278f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## IPython magics: auto-reload edited modules between cell executions.
## NOTE(review): valid only inside IPython/Jupyter, not plain Python.
%load_ext autoreload
%autoreload 2
# Summarizer
from langchain.chat_models import ChatOpenAI, AzureChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import ChatPromptTemplate
from langchain.document_loaders import ArxivLoader
from langchain.chains import LLMChain
from langchain.callbacks import get_openai_callback
import tiktoken
import os
import re
import json
import arxiv
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
from dotenv import load_dotenv
warnings.filterwarnings("ignore")
## Load credentials (presumably OPENAI_API_KEY / Azure settings) from a local
## .env file — TODO confirm which variables AzureChatOpenAI expects.
load_dotenv()
## Tokenizer used throughout to measure prompt/summary sizes in tokens.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
## Character-based splitter: ~5000-char chunks with 100-char overlap so
## section boundaries are not cut mid-sentence without any shared context.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 5000,
    chunk_overlap = 100,
    length_function = len,
    is_separator_regex = False,
)
def summarize_by_segments(paper_title: str, document: str) -> str:
    """Summarize a paper chunk by chunk, carrying notes forward.

    The document is split into overlapping chunks; each chunk is summarized
    by the module-level LLM ``chain`` together with the notes accumulated so
    far, so later sections can build on (and avoid repeating) earlier ones.

    Args:
        paper_title: Title of the paper, interpolated into the prompt.
        document: Full text of the paper.

    Returns:
        Accumulated bullet-list notes covering the whole document.
    """
    doc_chunks = text_splitter.create_documents([document])
    summary_notes = ""
    for chunk in tqdm(doc_chunks):
        ## On the first chunk there are no notes yet; tell the model so
        ## instead of feeding it an empty string.
        previous_notes = summary_notes if summary_notes else "*(No notes, just starting to read.)*"
        summary_notes += numbered_to_bullet_list(
            chain.run({"paper_title": paper_title,
                       "previous_notes": previous_notes,
                       "content": chunk.page_content}) + "\n")
    return summary_notes
def numbered_to_bullet_list(list_str: str):
    """Rewrite a numbered list ("1. item") as a bullet list ("- item")."""
    numbered_item = re.compile(r'^\d+\.', re.MULTILINE)
    return numbered_item.sub('-', list_str)
## LLM Chain Setup
## Underlying LLM.
llm = AzureChatOpenAI(deployment_name="gpt-35-turbo-16k", temperature=0.1)
## Create prompt.
## Note-taking prompt: the model sees the running notes plus the next paper
## section and extends the notes without repeating itself.
## Fix: "constributions" typo in the original prompt text.
system_message = """You are an applied AI researcher specialized in the field of Large Language Models (LLMs), and you are currently reviewing the academic paper "{paper_title}". Your goal is to analyze the paper, identify the main contributions and most interesting findings, and write a summary of it in your own words. This summary will serve as reference for future LLM researchers within your organization, so it is very important that you are able to convey the main ideas in a clear, complete and concise manner.
You have already read through some of the initial sections and taken some notes:
{previous_notes}
Now you must read over the following section and continue expanding on your notes (without repeating information).
{content}
## Guidelines
- Make sure to identify connections between the paper segments and the notes you have already taken. Avoid duplicate comments and ensure that your summary is coherent.
- Focus on the bigger picture and the main ideas, rather than on the details.
- If a table is presented just report back the main findings.
- Be sure to explain any new concept or term you introduce. Explain how things work, and be precise when discussing metrics and results.
- If examples are provided you can include them in your notes, as long as they help clarify the main ideas.
- Take your notes in the form of a numbered list. Do not include headers or any other elements.
- Do not include more than 5 items in your list.
- Do not repeat information that is already present in your previous notes.
- Your summary must be shorter than the original paper.
"""
prompt = ChatPromptTemplate.from_messages([("system", system_message)])
chain = LLMChain(llm=llm, prompt=prompt)
## Summarize Paper
paper_names = [
    " Chain-of-verification reduces hallucination in large language models",
    "Discovering language model behaviors with model-written evaluations",
    "Measuring and narrowing the compositionality gap in language models.",
    "Text Rendering Strategies for Pixel Language Models",
    "Cappy: Outperforming and Boosting Large Multi-Task LMs with a Small Scorer",
    "GPQA: A Graduate-Level Google-Proof Q&A Benchmark",
    "System 2 Attention (is something you might need too)",
    "Plan-and-Solve Prompting: Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models",
]
print(len(paper_names))

def preprocess(title: str) -> str:
    """Normalize a paper title into an arXiv search query.

    Fix: `preprocess` was called below but never defined anywhere in the
    file, which raises NameError at runtime. Trimming stray whitespace
    (note the leading space in the first title) is the minimal sane
    normalization; extend here if fancier query cleanup is needed.
    """
    return title.strip()

## Fetch the first matching arXiv paper for the selected title.
docs = ArxivLoader(query=preprocess(paper_names[0]), load_max_docs=1).load()
## Extract data.
paper_content = docs[0].page_content
paper_title = docs[0].metadata["Title"]
## Iteratively compress the paper: keep re-summarizing until the notes fit
## in <=500 tokens, or an extra pass stops paying off (shrinks <=200 tokens).
with get_openai_callback() as cb:
    token_count = 999999999
    token_diff = 999999999
    i = 1
    ori_token_count = len(encoding.encode(paper_content))
    print(f"Starting tokens: {ori_token_count}")
    while token_count > 500 and token_diff > 200:
        print("------------------------")
        print(f"Summarization iteration {i}...")
        paper_content = summarize_by_segments(paper_title, paper_content)
        ## Encode once per pass and reuse the result; the original re-encoded
        ## the (possibly long) text three times per iteration plus the full
        ## original paper every pass for the compression denominator.
        new_token_count = len(encoding.encode(paper_content))
        token_diff = token_count - new_token_count
        token_count = new_token_count
        frac = token_count / ori_token_count
        i += 1
        print(f"Total tokens: {token_count}")
        print(f"Compression: {frac:.2f}")
    print("==================")
    print("Done! Usage stats:")
    print(cb)
print(paper_content)
## Narrative Form
## Second stage: turn the accumulated bullet notes into a single flowing
## paragraph, without adding claims that are not in the notes.
narrative_system_msg = """You are an expert New York Times technology writer tasked with writing a summary of "{paper_title}". Your task is to read the following set of notes and convert it into an engaging paragraph. You must not alter the meaning of the notes, but you can reorganize and rephrase in order to improve the flow of the paragraph. You should also abstain from making unwarranted inferences and avoid bombastic language.
{previous_notes}
"""
narrative_prompt = ChatPromptTemplate.from_messages([("system", narrative_system_msg)])
narrative_chain = LLMChain(llm=llm, prompt=narrative_prompt)
with get_openai_callback() as cb:
    narrative = narrative_chain.run({"paper_title": paper_title,
                                     "previous_notes": paper_content})
    print(narrative)
print("==================")
print("Done! Usage stats:")
print(cb)
## NOTE(review): `narrative` is printed twice (here and inside the `with`
## block above) — looks like a leftover from notebook iteration; confirm
## before removing either.
print(narrative)
print(len(encoding.encode(narrative)))
## Copywriting
## Final polish stage: tighten the narrative paragraph while preserving
## its meaning; output stays a single paragraph.
copywriting_system_msg = """You are a copywriter tasked with reviewing the following summary of "{paper_title}" and improving it. Your goal is to make the summary more engaging and readable, remove duplicate content, and preserve the meaning of the original text. You can reorganize and rephrase the text as you see fit, but you must not alter the meaning of the text. Your output should be a single paragraph. | |
{previous_summary}
"""
copywriting_prompt = ChatPromptTemplate.from_messages([("system", copywriting_system_msg)])
copywriting_chain = LLMChain(llm=llm, prompt=copywriting_prompt)
with get_openai_callback() as cb:
    copywriting = copywriting_chain.run({"paper_title": paper_title,
                                         "previous_summary": narrative})
    print(copywriting)
print("==================")
print("Done! Usage stats:")
print(cb)
## NOTE(review): `copywriting` is printed twice (here and inside the `with`
## block above) — same duplication pattern as the narrative stage.
print(copywriting)
print(len(encoding.encode(copywriting)))
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.