Delete hybrid.ipynb
This commit is contained in:
parent 0166e05843
commit 114b716a00
hybrid.ipynb (467 lines deleted)
@@ -1,467 +0,0 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# State-of-the-Art RAG Implementation\n",
    "\n",
    "Features:\n",
    "- Hybrid retrieval (BM25 + vector search)\n",
    "- Multi-stage retrieval with reranking (sketched below)\n",
    "- Advanced chunking strategies\n",
    "- Multi-document support\n",
    "- Metadata filtering\n",
    "- Contextual compression (sketched below)\n",
    "- Web search integration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import required libraries\n",
    "import os\n",
    "import httpx\n",
    "\n",
    "# LangChain imports\n",
    "from langchain_community.document_loaders import TextLoader, PyPDFLoader\n",
    "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
    "from langchain_community.vectorstores import Chroma\n",
    "from langchain_community.retrievers import BM25Retriever\n",
    "from langchain.retrievers import EnsembleRetriever, ContextualCompressionRetriever\n",
    "from langchain_ollama import OllamaEmbeddings, ChatOllama\n",
    "from langchain_core.prompts import ChatPromptTemplate\n",
    "from langchain_core.documents import Document"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration\n",
    "# Note: MODEL_NAME is used for both chat and embeddings below. Ollama can\n",
    "# embed with a chat model, but a dedicated embedding model (e.g.\n",
    "# nomic-embed-text) is usually faster and retrieves better.\n",
    "MODEL_NAME = \"gemma3:12b\"\n",
    "DOCS_DIR = \"documents\"\n",
    "CHUNK_SIZE = 1000\n",
    "CHUNK_OVERLAP = 200\n",
    "VECTOR_DB_PATH = \"chroma_db\"\n",
    "\n",
    "# Create the documents directory if it doesn't exist\n",
    "os.makedirs(DOCS_DIR, exist_ok=True)"
   ]
  },
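  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check before indexing anything. This assumes a local Ollama server is running and the model named above has been pulled; if the cell fails, fix the Ollama setup before continuing."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Embed a short string and inspect the vector size (illustrative only)\n",
    "_vec = OllamaEmbeddings(model=MODEL_NAME).embed_query(\"hello world\")\n",
    "print(f\"Embedding dimension: {len(_vec)}\")"
   ]
  },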
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Document Loading and Processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class DocumentProcessor:\n",
    "    \"\"\"Handles document loading, chunking, and embedding.\"\"\"\n",
    "    \n",
    "    def __init__(self, docs_dir=DOCS_DIR):\n",
    "        self.docs_dir = docs_dir\n",
    "        self.embeddings = OllamaEmbeddings(model=MODEL_NAME)\n",
    "        self.text_splitter = RecursiveCharacterTextSplitter(\n",
    "            chunk_size=CHUNK_SIZE,\n",
    "            chunk_overlap=CHUNK_OVERLAP,\n",
    "            add_start_index=True\n",
    "        )\n",
    "    \n",
    "    def load_single_document(self, file_path):\n",
    "        \"\"\"Load a document based on its file extension.\"\"\"\n",
    "        if file_path.endswith('.pdf'):\n",
    "            loader = PyPDFLoader(file_path)\n",
    "        elif file_path.endswith(('.txt', '.md', '.html')):\n",
    "            loader = TextLoader(file_path)\n",
    "        else:\n",
    "            raise ValueError(f\"Unsupported file type: {file_path}\")\n",
    "        return loader.load()\n",
    "    \n",
    "    def load_documents(self):\n",
    "        \"\"\"Load all documents from the documents directory.\"\"\"\n",
    "        documents = []\n",
    "        for filename in os.listdir(self.docs_dir):\n",
    "            file_path = os.path.join(self.docs_dir, filename)\n",
    "            if os.path.isfile(file_path):\n",
    "                try:\n",
    "                    docs = self.load_single_document(file_path)\n",
    "                    for doc in docs:\n",
    "                        doc.metadata['source'] = filename\n",
    "                    documents.extend(docs)\n",
    "                except Exception as e:\n",
    "                    print(f\"Error loading {file_path}: {e}\")\n",
    "        return documents\n",
    "    \n",
    "    def process_documents(self):\n",
    "        \"\"\"Load and chunk documents.\"\"\"\n",
    "        documents = self.load_documents()\n",
    "        if not documents:\n",
    "            print(\"No documents found. Please add documents to the 'documents' directory.\")\n",
    "            return []\n",
    "        return self.text_splitter.split_documents(documents)\n",
    "    \n",
    "    def create_document_from_text(self, text, metadata=None):\n",
    "        \"\"\"Create a document from text content.\"\"\"\n",
    "        metadata = metadata or {}\n",
    "        doc = Document(page_content=text, metadata=metadata)\n",
    "        return self.text_splitter.split_documents([doc])\n",
    "    \n",
    "    def add_document(self, file_path):\n",
    "        \"\"\"Add a new document to the documents directory.\"\"\"\n",
    "        if not os.path.exists(file_path):\n",
    "            raise FileNotFoundError(f\"File not found: {file_path}\")\n",
    "        \n",
    "        filename = os.path.basename(file_path)\n",
    "        destination = os.path.join(self.docs_dir, filename)\n",
    "        \n",
    "        # Copy file to documents directory\n",
    "        with open(file_path, 'rb') as src, open(destination, 'wb') as dst:\n",
    "            dst.write(src.read())\n",
    "        \n",
    "        return self.load_single_document(destination)"
   ]
  },
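  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A small illustration of the chunking behaviour (no files or Ollama calls needed): split a synthetic long text and inspect the result. The repeated sentence is just filler to exceed CHUNK_SIZE."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative only: how the splitter chunks a long text\n",
    "_processor = DocumentProcessor()\n",
    "_long_text = \"RAG systems combine retrieval with generation. \" * 60\n",
    "_chunks = _processor.create_document_from_text(_long_text, metadata={\"source\": \"demo\"})\n",
    "print(f\"{len(_chunks)} chunks of ~{CHUNK_SIZE} chars with {CHUNK_OVERLAP} overlap\")\n",
    "print(f\"First chunk starts at character {_chunks[0].metadata.get('start_index')}\")"
   ]
  },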
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Web Search Integration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class WebSearchTool:\n",
    "    \"\"\"Handles web search integration using DuckDuckGo.\"\"\"\n",
    "    \n",
    "    def __init__(self, processor):\n",
    "        self.processor = processor\n",
    "    \n",
    "    def search(self, query, num_results=3):\n",
    "        \"\"\"Search the web for information and convert results to documents.\"\"\"\n",
    "        try:\n",
    "            # Query the DuckDuckGo Instant Answer API\n",
    "            response = httpx.get(\n",
    "                \"https://api.duckduckgo.com/\",\n",
    "                params={\n",
    "                    \"q\": query,\n",
    "                    \"format\": \"json\",\n",
    "                    \"no_html\": 1,\n",
    "                    \"no_redirect\": 1\n",
    "                },\n",
    "                timeout=10.0\n",
    "            )\n",
    "            \n",
    "            if response.status_code != 200:\n",
    "                print(f\"Error searching the web: {response.status_code}\")\n",
    "                return []\n",
    "            \n",
    "            results = response.json()\n",
    "            if not results.get('AbstractText') and not results.get('RelatedTopics'):\n",
    "                # Fall back to a third-party DuckDuckGo proxy; this public\n",
    "                # endpoint may be rate-limited or no longer available\n",
    "                response = httpx.get(\n",
    "                    \"https://ddg-api.herokuapp.com/search\",\n",
    "                    params={\"query\": query, \"limit\": num_results},\n",
    "                    timeout=10.0\n",
    "                )\n",
    "                \n",
    "                if response.status_code != 200:\n",
    "                    print(f\"Error with fallback search: {response.status_code}\")\n",
    "                    return []\n",
    "                \n",
    "                # The proxy returns a JSON list of {title, snippet, link}\n",
    "                results = response.json()\n",
    "                web_results = []\n",
    "                \n",
    "                for result in results[:num_results]:\n",
    "                    title = result.get('title', '')\n",
    "                    snippet = result.get('snippet', '')\n",
    "                    url = result.get('link', '')\n",
    "                    content = f\"Title: {title}\\nURL: {url}\\nContent: {snippet}\"\n",
    "                    web_results.append(content)\n",
    "            else:\n",
    "                # Process DuckDuckGo Instant Answer results\n",
    "                web_results = []\n",
    "                if results.get('AbstractText'):\n",
    "                    web_results.append(f\"Abstract: {results['AbstractText']}\\nSource: {results.get('AbstractSource', '')}\")\n",
    "                \n",
    "                for topic in results.get('RelatedTopics', [])[:num_results - len(web_results)]:\n",
    "                    if 'Text' in topic:\n",
    "                        web_results.append(topic['Text'])\n",
    "            \n",
    "            # Convert to documents\n",
    "            documents = []\n",
    "            for i, result in enumerate(web_results):\n",
    "                chunks = self.processor.create_document_from_text(\n",
    "                    result,\n",
    "                    metadata={\"source\": f\"web_search_{i}\", \"query\": query}\n",
    "                )\n",
    "                documents.extend(chunks)\n",
    "            \n",
    "            return documents\n",
    "        except Exception as e:\n",
    "            print(f\"Error during web search: {str(e)}\")\n",
    "            return []"
   ]
  },
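  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An optional smoke test for the search tool. It needs network access, and both endpoints are free public services that may be rate-limited or gone, so an empty result list here is not necessarily a bug in the code."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional smoke test (requires network access)\n",
    "_web_docs = WebSearchTool(DocumentProcessor()).search(\"retrieval augmented generation\")\n",
    "print(f\"Fetched {len(_web_docs)} web result chunks\")"
   ]
  },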
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Advanced Retrieval System"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class AdvancedRetriever:\n",
    "    \"\"\"Manages the hybrid retrieval system combining multiple techniques.\"\"\"\n",
    "    \n",
    "    def __init__(self, processor, web_search=None):\n",
    "        self.processor = processor\n",
    "        self.web_search = web_search\n",
    "        self.vector_store = None\n",
    "        self.retriever = None\n",
    "    \n",
    "    def build_retriever(self, documents=None):\n",
    "        \"\"\"Build a hybrid retriever incorporating multiple retrieval methods.\"\"\"\n",
    "        if documents is None:\n",
    "            documents = self.processor.process_documents()\n",
    "        \n",
    "        if not documents:\n",
    "            print(\"No documents to build retriever from.\")\n",
    "            return None\n",
    "        \n",
    "        # Create the vector store (semantic similarity search)\n",
    "        self.vector_store = Chroma.from_documents(\n",
    "            documents=documents,\n",
    "            embedding=self.processor.embeddings,\n",
    "            persist_directory=VECTOR_DB_PATH\n",
    "        )\n",
    "        vector_retriever = self.vector_store.as_retriever(search_kwargs={\"k\": 4})\n",
    "        \n",
    "        # Create BM25 retriever (keyword/lexical search)\n",
    "        bm25_retriever = BM25Retriever.from_documents(documents)\n",
    "        bm25_retriever.k = 4\n",
    "        \n",
    "        # Fuse both result lists, weighting vector search more heavily\n",
    "        self.retriever = EnsembleRetriever(\n",
    "            retrievers=[vector_retriever, bm25_retriever],\n",
    "            weights=[0.7, 0.3]\n",
    "        )\n",
    "        \n",
    "        return self.retriever\n",
    "    \n",
    "    def search(self, query, use_web=True, k=5):\n",
    "        \"\"\"Perform a search using the retriever and optionally web search.\"\"\"\n",
    "        if self.retriever is None:\n",
    "            self.build_retriever()\n",
    "        \n",
    "        if self.retriever is None:\n",
    "            # build_retriever failed (no local documents); try the web\n",
    "            if use_web and self.web_search:\n",
    "                return self.web_search.search(query, num_results=k)\n",
    "            return []\n",
    "        \n",
    "        # Get results from the document retriever\n",
    "        results = self.retriever.invoke(query)\n",
    "        \n",
    "        # Optionally add web search results\n",
    "        if use_web and self.web_search:\n",
    "            web_results = self.web_search.search(query)\n",
    "            if web_results:\n",
    "                # Combine results, prioritizing local documents\n",
    "                combined_results = results + web_results\n",
    "                # Deduplicate by content\n",
    "                seen_content = set()\n",
    "                unique_results = []\n",
    "                for doc in combined_results:\n",
    "                    if doc.page_content not in seen_content:\n",
    "                        seen_content.add(doc.page_content)\n",
    "                        unique_results.append(doc)\n",
    "                return unique_results[:k]\n",
    "        \n",
    "        return results[:k]"
   ]
  },
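  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The feature list promises contextual compression, and `ContextualCompressionRetriever` is imported above but never wired in. A minimal sketch of one way to do it, assuming the embeddings and hybrid retriever defined earlier: wrap the ensemble in an `EmbeddingsFilter` so chunks only weakly related to the query are dropped before they reach the LLM. The 0.5 threshold is an assumption to tune, not a recommended value."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.retrievers.document_compressors import EmbeddingsFilter\n",
    "\n",
    "def build_compression_retriever(base_retriever, embeddings, threshold=0.5):\n",
    "    \"\"\"Wrap a retriever so weakly related chunks are filtered out.\"\"\"\n",
    "    compressor = EmbeddingsFilter(\n",
    "        embeddings=embeddings,\n",
    "        similarity_threshold=threshold\n",
    "    )\n",
    "    return ContextualCompressionRetriever(\n",
    "        base_compressor=compressor,\n",
    "        base_retriever=base_retriever\n",
    "    )\n",
    "\n",
    "# Usage sketch (after the RAG system below has been initialized):\n",
    "# compressed = build_compression_retriever(\n",
    "#     rag_system.retriever.retriever, rag_system.processor.embeddings)\n",
    "# docs = compressed.invoke(\"What is hybrid retrieval?\")"
   ]
  },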
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## RAG Question Answering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class RAGSystem:\n",
    "    \"\"\"Main RAG system that integrates all components.\"\"\"\n",
    "    \n",
    "    def __init__(self):\n",
    "        self.processor = DocumentProcessor()\n",
    "        self.web_search = WebSearchTool(self.processor)\n",
    "        self.retriever = AdvancedRetriever(self.processor, self.web_search)\n",
    "        self.llm = ChatOllama(model=MODEL_NAME, temperature=0.1)\n",
    "        \n",
    "        # Create a sample document if the documents directory is empty\n",
    "        if not os.listdir(DOCS_DIR):\n",
    "            sample_path = os.path.join(DOCS_DIR, \"sample.txt\")\n",
    "            with open(sample_path, \"w\") as f:\n",
    "                f.write(\"This is a sample document for testing the RAG system.\\n\")\n",
    "                f.write(\"The system combines vector search, BM25, and web search capabilities.\\n\")\n",
    "                f.write(\"You can add your own documents to the 'documents' directory.\\n\")\n",
    "    \n",
    "    def initialize(self):\n",
    "        \"\"\"Initialize the RAG system.\"\"\"\n",
    "        documents = self.processor.process_documents()\n",
    "        self.retriever.build_retriever(documents)\n",
    "        return self\n",
    "    \n",
    "    def answer(self, query, use_web=True):\n",
    "        \"\"\"Generate an answer for the query using retrieved context.\"\"\"\n",
    "        # Get relevant documents\n",
    "        docs = self.retriever.search(query, use_web=use_web)\n",
    "        \n",
    "        if not docs:\n",
    "            return \"I couldn't find any relevant information to answer your question.\"\n",
    "        \n",
    "        # Create context from documents, citing each chunk's source\n",
    "        context = \"\\n\\n\".join([f\"Document {i+1} (source: {doc.metadata.get('source', 'unknown')}):\\n{doc.page_content}\" for i, doc in enumerate(docs)])\n",
    "        \n",
    "        # Generate answer\n",
    "        prompt = ChatPromptTemplate.from_template(\"\"\"\n",
    "        Answer the following question based on the provided context.\n",
    "        If the answer is not in the context, say \"I don't have enough information to answer this question.\"\n",
    "        \n",
    "        Context:\n",
    "        {context}\n",
    "        \n",
    "        Question: {query}\n",
    "        \n",
    "        Answer:\n",
    "        \"\"\")\n",
    "        \n",
    "        chain = prompt | self.llm\n",
    "        response = chain.invoke({\"context\": context, \"query\": query})\n",
    "        \n",
    "        return response.content\n",
    "    \n",
    "    def add_document(self, file_path):\n",
    "        \"\"\"Add a new document and update the retriever.\"\"\"\n",
    "        documents = self.processor.add_document(file_path)\n",
    "        chunks = self.processor.text_splitter.split_documents(documents)\n",
    "        \n",
    "        if self.retriever.vector_store is not None:\n",
    "            # Add only the new chunks: calling build_retriever() again would\n",
    "            # re-insert every document into the persisted Chroma collection\n",
    "            # and duplicate embeddings\n",
    "            self.retriever.vector_store.add_documents(chunks)\n",
    "            vector_retriever = self.retriever.vector_store.as_retriever(search_kwargs={\"k\": 4})\n",
    "            # BM25 has no incremental update, so rebuild it over all chunks\n",
    "            bm25_retriever = BM25Retriever.from_documents(self.processor.process_documents())\n",
    "            bm25_retriever.k = 4\n",
    "            self.retriever.retriever = EnsembleRetriever(\n",
    "                retrievers=[vector_retriever, bm25_retriever],\n",
    "                weights=[0.7, 0.3]\n",
    "            )\n",
    "        else:\n",
    "            self.retriever.build_retriever()\n",
    "        \n",
    "        return len(chunks)"
   ]
  },
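  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The feature list also promises multi-stage retrieval with reranking, which the classes above never implement. A minimal second stage, as a sketch: re-score the fused candidates against the query by cosine similarity of their Ollama embeddings and keep the best few. A cross-encoder reranker would be stronger; this version only assumes the components already defined in this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "def rerank_by_similarity(query, docs, embeddings, top_k=5):\n",
    "    \"\"\"Stage 2: reorder retrieved chunks by query-chunk cosine similarity.\"\"\"\n",
    "    if not docs:\n",
    "        return []\n",
    "    q = np.array(embeddings.embed_query(query))\n",
    "    doc_vecs = np.array(embeddings.embed_documents([d.page_content for d in docs]))\n",
    "    # Cosine similarity between the query vector and each chunk vector\n",
    "    scores = doc_vecs @ q / (np.linalg.norm(doc_vecs, axis=1) * np.linalg.norm(q) + 1e-10)\n",
    "    order = np.argsort(scores)[::-1]\n",
    "    return [docs[i] for i in order[:top_k]]\n",
    "\n",
    "# Usage sketch:\n",
    "# candidates = rag_system.retriever.search(query, use_web=False, k=10)\n",
    "# best = rerank_by_similarity(query, candidates, rag_system.processor.embeddings)"
   ]
  },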
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Usage Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize the RAG system\n",
    "rag_system = RAGSystem().initialize()\n",
    "\n",
    "# Test with a sample query against local documents only\n",
    "query = \"What is a hybrid RAG system?\"\n",
    "answer = rag_system.answer(query, use_web=False)\n",
    "print(f\"Query: {query}\")\n",
    "print(f\"Answer: {answer}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test with web search\n",
    "query = \"What are the latest developments in large language models?\"\n",
    "answer = rag_system.answer(query, use_web=True)\n",
    "print(f\"Query: {query}\")\n",
    "print(f\"Answer: {answer}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Adding Your Own Documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example: Add your own document\n",
    "# Replace with the path to your document\n",
    "# document_path = \"/path/to/your/document.pdf\"\n",
    "# num_chunks = rag_system.add_document(document_path)\n",
    "# print(f\"Added document with {num_chunks} chunks\")"
   ]
  },
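  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Because the vector store is persisted to `VECTOR_DB_PATH`, a later session can reload it without re-embedding. A sketch, assuming `chroma_db` exists from an earlier run; note that `_collection` is a private attribute, used here only for a quick count."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the persisted vector store from disk\n",
    "# store = Chroma(\n",
    "#     persist_directory=VECTOR_DB_PATH,\n",
    "#     embedding_function=OllamaEmbeddings(model=MODEL_NAME)\n",
    "# )\n",
    "# print(f\"{store._collection.count()} embeddings in the persisted store\")"
   ]
  }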
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}