@jamiekt
Created July 13, 2025 21:44
Building an offline personal RAG AI
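
The gist is two scripts: an indexing script (data_processing.py below) that chunks Markdown notes and embeds them into a local Chroma store, and an interactive query script that answers questions over that store with a local LLM. Before running either one, a preflight check along these lines (a minimal sketch, assuming the same base URL and model names used in the scripts) confirms that Ollama is reachable and that the required models have been pulled.

# Preflight sketch: fail fast with a connection or "model not found" error
# if Ollama is not running or a model has not been pulled yet.
from langchain_ollama import OllamaEmbeddings, OllamaLLM

OLLAMA_BASE_URL = "http://host.docker.internal:11434"

print(len(OllamaEmbeddings(model="nomic-embed-text", base_url=OLLAMA_BASE_URL).embed_query("ping")))
print(OllamaLLM(model="llama3", base_url=OLLAMA_BASE_URL).invoke("Reply with the word pong"))
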
# data_processing.py (referenced by the query script below): load Markdown notes,
# split them into chunks and embed them into a local Chroma vector store.
import os
import time

from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma

OLLAMA_BASE_URL = "http://host.docker.internal:11434"
EMBEDDING_MODEL_NAME = "nomic-embed-text"


def load_and_process_notes(notes_dir="../notes"):
    loader = DirectoryLoader(
        notes_dir,
        glob="**/*.md",
        loader_cls=TextLoader,  # Simple text loader for Markdown
        loader_kwargs={"autodetect_encoding": True}  # Helps with various encodings
    )
    documents = loader.load()
    print(f"Loaded {len(documents)} documents.")

    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,   # Max characters per chunk
        chunk_overlap=200  # Overlap to maintain context between chunks
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split into {len(chunks)} chunks.")
    return chunks


def create_embeddings_and_vectorstore(chunks, persist_directory="./chroma_db", batch_size=100, retry_delay=5):
    # Create embeddings using a local Ollama embedding model.
    # Ensure 'nomic-embed-text' is pulled in Ollama (e.g., docker exec -it ollama ollama pull nomic-embed-text).
    embeddings_model = OllamaEmbeddings(model=EMBEDDING_MODEL_NAME, base_url=OLLAMA_BASE_URL)

    # Initialize ChromaDB. Constructing Chroma with a persist_directory loads the
    # existing collection if one is already on disk, or creates an empty one
    # otherwise; chunks are then added incrementally in both cases.
    if os.path.exists(persist_directory) and os.listdir(persist_directory):
        print(f"Loading existing vector store from {persist_directory}...")
    else:
        print(f"Creating new vector store at {persist_directory}...")
    vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings_model)

    num_chunks = len(chunks)
    print(f"Starting to embed and add {num_chunks} chunks in batches of {batch_size}...")
    total_time_elapsed = 0.0

    for i in range(0, num_chunks, batch_size):
        batch = chunks[i:i + batch_size]
        print(f"Processing batch {i // batch_size + 1}/{(num_chunks + batch_size - 1) // batch_size} ({len(batch)} chunks)...")

        retries = 3
        for attempt in range(retries):
            try:
                start_time = time.time()
                # Use add_documents to embed the batch and add it to the vector store
                vectorstore.add_documents(batch)
                end_time = time.time()
                batch_time = end_time - start_time
                total_time_elapsed += batch_time
                print(f"Successfully added {len(batch)} chunks in {batch_time:.2f} seconds. Total processed: {i + len(batch)}")
                print_time_elapsed(total_time_elapsed)
                break  # Break out of the retry loop on success
            except Exception as e:
                print(f"Error processing batch {i // batch_size + 1} on attempt {attempt + 1}: {e}")
                print(f"Waiting {retry_delay} seconds before retrying...")
                time.sleep(retry_delay)
                if attempt == retries - 1:
                    print(f"Failed to process batch {i // batch_size + 1} after {retries} attempts. Skipping this batch.")
                    # Optionally, log the failed chunks to a file for later review:
                    # with open("failed_chunks.log", "a") as f:
                    #     for chunk in batch:
                    #         f.write(f"Failed: {chunk.page_content[:100]}...\n")

    print(f"Embedding and vector store update complete. Total time: {total_time_elapsed:.2f} seconds.")
    return vectorstore


def print_time_elapsed(total_time_elapsed_in_seconds: float):
    # Print the total time elapsed in a friendly format
    if total_time_elapsed_in_seconds < 60:
        print(f"Total time elapsed: {total_time_elapsed_in_seconds:.2f} seconds.")
    elif total_time_elapsed_in_seconds < 3600:
        minutes = total_time_elapsed_in_seconds // 60
        seconds = total_time_elapsed_in_seconds % 60
        print(f"Total time elapsed: {int(minutes)} min {int(seconds)} sec.")
    else:
        hours = total_time_elapsed_in_seconds // 3600
        minutes = (total_time_elapsed_in_seconds % 3600) // 60
        seconds = total_time_elapsed_in_seconds % 60
        print(f"Total time elapsed: {int(hours)} hr {int(minutes)} min {int(seconds)} sec.")


if __name__ == "__main__":
    # This assumes your 'notes' directory is a sibling of the directory containing this file.
    # Adjust the path as necessary based on your project structure.
    notes_repo_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'notes'))
    print(f"Starting data processing for notes in: {notes_repo_path}")

    # Ensure your Ollama container is running and the 'nomic-embed-text' model is pulled
    note_chunks = load_and_process_notes(notes_dir=notes_repo_path)
    vector_db = create_embeddings_and_vectorstore(note_chunks, persist_directory="./chroma_db", batch_size=100)
    print("Data processing complete. Your notes are ready to be queried!")
# Query script: load the persisted vector store built by data_processing.py and
# answer questions about the notes with a local LLM, citing source files.
import os

from langchain_ollama import OllamaLLM, OllamaEmbeddings
from langchain_chroma import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Configuration constants
PERSIST_DIRECTORY = "./chroma_db"
OLLAMA_BASE_URL = "http://host.docker.internal:11434"  # Default Ollama API address
LLM_MODEL_NAME = "llama3"  # The LLM you pulled with Ollama


def setup_rag_chain():
    # Load the embedding model (must match the one used for indexing)
    embeddings = OllamaEmbeddings(model="nomic-embed-text", base_url=OLLAMA_BASE_URL)

    # Load the persistent vector store
    if not os.path.exists(PERSIST_DIRECTORY):
        print(f"Error: Vector store not found at {PERSIST_DIRECTORY}.")
        print("Please run data_processing.py first to create the vector store.")
        return None
    vectorstore = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings)
    print(f"Loaded vector store from {PERSIST_DIRECTORY}.")

    # Set up the retriever to fetch relevant documents.
    # k=3 means it will retrieve the top 3 most relevant chunks.
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

    # Initialize the local LLM (ensure it's pulled in Ollama)
    llm = OllamaLLM(model=LLM_MODEL_NAME, base_url=OLLAMA_BASE_URL)
    print(f"Initialized LLM: {LLM_MODEL_NAME}")

    # Define a custom prompt template for RAG.
    # This guides the LLM on how to use the retrieved context.
    prompt_template = """Use the following pieces of context from my notes to answer the user's question.
If you don't know the answer based *only* on the provided context, just say that you don't know, don't try to make up an answer.
Cite the source filenames of the notes if possible.

Context:
{context}

Question: {question}

Helpful Answer:"""
    QA_CHAIN_PROMPT = PromptTemplate.from_template(prompt_template)

    # Create the RAG chain
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=retriever,
        return_source_documents=True,  # So we can show the original notes used
        chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
    )
    return qa_chain


if __name__ == "__main__":
    qa_chain = setup_rag_chain()
    if qa_chain:
        print("RAG system ready. Ask a question about your notes (type 'exit' to quit):")
        while True:
            query = input("\nYour question: ")
            if query.lower() == 'exit':
                break
            try:
                result = qa_chain.invoke({"query": query})
                print("\nAnswer:", result["result"])

                # Print sources if available
                if result.get("source_documents"):
                    print("\nSources (from your notes):")
                    for doc in result["source_documents"]:
                        # LangChain adds metadata like 'source' (filename)
                        source_path = doc.metadata.get('source', 'Unknown Source')
                        # Extract just the filename for cleaner output
                        filename = os.path.basename(source_path)
                        print(f"- {filename}")
                else:
                    print("\nNo specific sources found in your notes for this answer.")
            except Exception as e:
                print(f"An error occurred: {e}")
                print("Make sure your Ollama container is running and the specified LLM/embedding models are pulled.")