Created January 13, 2023 10:17
-
-
Save hursh-desai/61ea24bca576055f649e3b8706e2dc03 to your computer and use it in GitHub Desktop.
obsidian + langchain
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
import faiss | |
from langchain import FAISS | |
import obsidiantools.api as otools | |
from langchain.embeddings.openai import OpenAIEmbeddings | |
from langchain.chains.qa_with_sources import load_qa_with_sources_chain | |
from langchain.llms import OpenAI | |
# Prefer an API key that is already exported in the environment. The literal
# below is only a placeholder fallback; do not commit a real secret here.
os.environ.setdefault("OPENAI_API_KEY", 'sk-********')

# Root directory of the Obsidian vault to index.
dirpath = '/Users/hursh/<vault_name>'

# Open the vault and load its notes so their text can be read further down.
# NOTE(review): presumably gather() is what makes get_source_text() usable;
# confirm against the obsidiantools documentation.
vault = otools.Vault(dirpath).connect().gather()

# Embedding model used to vectorise every note section.
embeddings = OpenAIEmbeddings()  # type: ignore
def markdown_to_dict(markdown_text):
    """Split a markdown note into a mapping of header title -> section text.

    A line is treated as a header when it starts with one or more ``#``
    followed by whitespace. Every other line is appended (with a trailing
    newline) to the section of the most recent header.

    Args:
        markdown_text: Full text of a markdown document.

    Returns:
        A dict whose keys are header titles with the leading ``#`` markers
        (and an optional closing ``#`` run) removed, and whose values are the
        section bodies with ``<...>`` tags stripped. Text that appears before
        the first header, or the whole document when it has no headers, is
        stored under the key ``None``. Sections sharing the same title are
        concatenated in document order.
    """
    header_re = re.compile(r'^#+\s')

    # Collect (title, body_lines) pairs in document order. The first pair is
    # always the preamble, i.e. whatever precedes the first header.
    sections = []
    title, body = None, []
    for line in markdown_text.split('\n'):
        if header_re.match(line):
            sections.append((title, body))
            # Strip only the leading marker and an optional closing run so a
            # '#' inside the title itself (e.g. "C# tips") is preserved.
            title = re.sub(r'^#+\s+', '', line)
            title = re.sub(r'(?:^|\s+)#+\s*$', '', title).strip()
            body = []
        else:
            body.append(line + '\n')
    sections.append((title, body))

    has_headers = len(sections) > 1
    markdown_dict = {}
    for title, body in sections:
        # Joining once avoids quadratic string concatenation on long notes.
        text = re.sub(r'<.*?>', '', ''.join(body))
        # A blank preamble in a note that has headers carries no content.
        if title is None and has_headers and not text.strip():
            continue
        # Append rather than overwrite so repeated titles do not lose text.
        markdown_dict[title] = markdown_dict.get(title, '') + text
    return markdown_dict
# Metadata table for the vault; its index values are what get passed to
# vault.get_source_text() below.
df = vault.get_note_metadata()

all_text = []
all_metadata = []

# Only rows with a relative file path are read. Walk the index directly
# because no column values are needed.
for index in df.index[df['rel_filepath'].notna()]:
    note = vault.get_source_text(index)
    for key, value in markdown_to_dict(note).items():
        # A header with no body yields blank text; embedding it adds nothing
        # retrievable to the index, so skip it.
        if not value.strip():
            continue
        all_text.append(value)
        # 'source' is the note identifier plus section title; the QA chain
        # reports it back alongside the answer.
        all_metadata.append({'source': f'{index}-{key}'})

# Embed every section and build the similarity index.
docsearch = FAISS.from_texts(all_text, embeddings, metadatas=all_metadata)

# temperature=0 keeps the answers as deterministic as the model allows.
chain = load_qa_with_sources_chain(OpenAI(temperature=0))
def print_answer(question, *, k=4):
    """Print the QA-with-sources chain's answer to ``question``.

    Args:
        question: Natural-language question to ask about the indexed notes.
        k: Number of most-similar note sections to retrieve and pass to the
            chain as context. Defaults to 4.
    """
    relevant_docs = docsearch.similarity_search(question, k=k)
    result = chain(
        {
            "input_documents": relevant_docs,
            "question": question,
        },
        return_only_outputs=True,
    )
    print(result["output_text"])


print_answer('What is the meaning of life?')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.