From a994adcfa5debfa5b81c8e14421a233cf0f16557 Mon Sep 17 00:00:00 2001 From: Masih Moafi <132553157+MasihMoafi@users.noreply.github.com> Date: Sun, 2 Feb 2025 12:54:04 +0330 Subject: [PATCH] Add files via upload --- RAG + AGENT.ipynb | 522 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 522 insertions(+) create mode 100644 RAG + AGENT.ipynb diff --git a/RAG + AGENT.ipynb b/RAG + AGENT.ipynb new file mode 100644 index 0000000..5e7c569 --- /dev/null +++ b/RAG + AGENT.ipynb @@ -0,0 +1,522 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "6a2d4624-4d76-4c52-a0f8-b353b6848549", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "#Test Dorna\n", + "response: ChatResponse = chat(model='llama3.2', messages=[\n", + " {\n", + " 'role': 'user',\n", + " 'content': 'چرا اینترنت قطع میشه؟',\n", + " },\n", + "])\n", + "print(response['message']['content'])\n", + "print(response.message.content)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a312f740-400a-49a5-a79e-41195aa49746", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!pip install faiss-cpu sentence-transformers ollama numpy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9100712e-4b06-4168-b8ac-36f0c2865e42", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# MANUALLY RAG\n", + "from ollama import chat\n", + "import numpy as np\n", + "import faiss\n", + "from sentence_transformers import SentenceTransformer\n", + "import torch\n", + "import os\n", + "import re\n", + "\n", + "DOCUMENT_PATHS = [\n", + " r'/home/masih/rag_data/Hamrah.txt', #replace path\n", + " r'/home/masih/rag_data/vape.txt',\n", + " r'/home/masih/rag_data/Shah.txt',\n", + " r'/home/masih/rag_data/Khalife.txt',\n", + " r'/home/masih/rag_data/carbon.txt',\n", + " r'/home/masih/rag_data/takapoo.txt'\n", + "]\n", + "\n", + "EMBEDDING_MODEL = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'\n", + "LLM_MODEL = 'llama3.2'\n", + "CHUNK_SIZE = 1000\n", + "OVERLAP = 200\n", + "INDEX_PATH = r'C:\\Users\\ASUS\\Downloads\\doc_index.faiss'\n", + "CHUNK_MAP_PATH = r'C:\\Users\\ASUS\\Downloads\\chunk_map.npy'\n", + "\n", + "class AdvancedRAG:\n", + " def __init__(self):\n", + " self.encoder = SentenceTransformer(EMBEDDING_MODEL, device='cuda' if torch.cuda.is_available() else 'cpu')\n", + " self.index = None\n", + " self.chunk_map = []\n", + " \n", + " def create_index(self):\n", + " \"\"\"Create FAISS index with cosine similarity and document mapping\"\"\"\n", + " all_chunks = []\n", + " doc_mapping = []\n", + " \n", + " # Process via CHUNKING (REQ 4 RAG)\n", + " for doc_idx, path in enumerate(DOCUMENT_PATHS):\n", + " with open(path, 'r', encoding='utf-8') as f:\n", + " text = re.sub(r'\\s+', ' ', f.read()).strip()\n", + " chunks = [text[i:i+CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE - OVERLAP)]\n", + " all_chunks.extend(chunks)\n", + " doc_mapping.extend([doc_idx] * len(chunks))\n", + " \n", + " # Normalized embeddings (REQ 4 cosine similarity)\n", + " embeddings = self.encoder.encode(all_chunks, convert_to_tensor=True, device='cuda' if torch.cuda.is_available() else 'cpu')\n", + " embeddings = embeddings.cpu().numpy() # Move back to CPU for FAISS\n", + " \n", + " faiss.normalize_L2(embeddings) \n", + " \n", + " # FAISS index & Mapping\n", + " self.index = faiss.IndexFlatIP(embeddings.shape[1])\n", + " self.index.add(embeddings.astype(np.float32))\n", + " self.chunk_map = np.array(doc_mapping)\n", + " \n", + " # Index \n", + " faiss.write_index(self.index, INDEX_PATH)\n", + " # Mapping \n", + " np.save(CHUNK_MAP_PATH, self.chunk_map)\n", + " \n", + " def load_index(self):\n", + " \"\"\"LOAD EXISTING DATA\"\"\"\n", + " self.index = faiss.read_index(INDEX_PATH)\n", + " self.chunk_map = np.load(CHUNK_MAP_PATH, allow_pickle=True)\n", + " \n", + " def query(self, question, doc_index, top_k=5):\n", + " \"\"\"DOCUMENT-SPECIFIC QUERY WITH COSINE SIMILARITY \"\"\"\n", + " # Encode \n", + " query_embed = self.encoder.encode([question], convert_to_tensor=True, device='cuda' if torch.cuda.is_available() else 'cpu')\n", + " query_embed = query_embed.cpu().numpy() # Move back to CPU for FAISS\n", + " \n", + " # Normalize \n", + " faiss.normalize_L2(query_embed)\n", + " \n", + " distances, indices = self.index.search(query_embed.astype(np.float32), top_k*3)\n", + " \n", + " relevant_chunks = []\n", + " for idx in indices[0]:\n", + " if self.chunk_map[idx] == doc_index:\n", + " relevant_chunks.append(idx)\n", + " if len(relevant_chunks) >= top_k:\n", + " break\n", + " \n", + " return relevant_chunks\n", + "\n", + "class AnswerGenerator:\n", + " def __init__(self, rag_system):\n", + " self.rag = rag_system\n", + " self.chunks = [] \n", + " \n", + " def get_answer(self, question, doc_index):\n", + " \"\"\"GENERATING CONTEXT-AWARE ANSWER\"\"\"\n", + " if not self.chunks:\n", + " self._load_chunks()\n", + " \n", + " chunk_indices = self.rag.query(question, doc_index)\n", + " context = \"\\n\".join([self.chunks[idx] for idx in chunk_indices])\n", + " \n", + " prompt = f\"\"\"با استفاده از متن زیر به سوال پاسخ دهید:\n", + "{context}\n", + "\n", + "اگر پاسخ در متن وجود ندارد عبارت 'پاسخی یافت نشد' را برگردانید\n", + "\n", + "سوال: {question}\n", + "پاسخ:\"\"\"\n", + " \n", + " response = chat(model=LLM_MODEL, messages=[{'role': 'user', 'content': prompt}])\n", + " return response['message']['content']\n", + " \n", + " def _load_chunks(self):\n", + " \"\"\"LOAD ALL CHUNKS(LAZY)\"\"\"\n", + " self.chunks = []\n", + " for path in DOCUMENT_PATHS:\n", + " with open(path, 'r', encoding='utf-8') as f:\n", + " text = re.sub(r'\\s+', ' ', f.read()).strip()\n", + " self.chunks.extend([text[i:i+CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE - OVERLAP)])\n", + "\n", + "# MAIN EXE of RAG\n", + "if __name__ == \"__main__\":\n", + " # RAG init\n", + " rag = AdvancedRAG()\n", + " \n", + " if not os.path.exists(INDEX_PATH):\n", + " print(\"Building optimized index...\")\n", + " rag.create_index()\n", + " else:\n", + " print(\"Loading existing index...\")\n", + " rag.load_index()\n", + " # Answer Generator init\n", + " generator = AnswerGenerator(rag)\n", + " \n", + " queries = [\n", + " (\"چرا اینترنت همراه اول گوشی وصل نمیشود؟\", 0),\n", + " (\"چطوری ویپ مورد نظرمو پیدا کنم؟\", 1),\n", + " (\"شاه عباس که بود؟\", 2),\n", + " (\"خلیفه سلطان که بود و چه کرد؟\", 3),\n", + " (\"کربن اکتیو و کربن بلک چه هستند و چه تفاوتی دارند و برای چه استفاده میشن؟\", 4),\n", + " (\"شرکت تکاپو صنعت نامی چه محصولاتی ارایه میدهد؟ چه چیزی این شرکت را منحصر به فرد میسازد؟ سهام این شرکت صعودی است یا نزولی؟\", 5)\n", + " ]\n", + " \n", + " with open(r'C:\\Users\\ASUS\\Downloads\\representation.txt', 'w', encoding='utf-8') as f: #replace path\n", + " for q_idx, (query, doc_idx) in enumerate(queries):\n", + " answer = generator.get_answer(query, doc_idx)\n", + " f.write(f\"سوال {q_idx+1} ({doc_idx+1}):\\n{query}\\n\\nپاسخ:\\n{answer}\\n\\n{'='*50}\\n\\n\")\n", + " print(f\"پردازش سوال {q_idx+1}/{len(queries)} تکمیل شد\")\n", + "\n", + "print(\"تمامی سوالات با موفقیت پردازش شدند!\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1acad95b-6ae6-480a-98cc-29b0e38d2646", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install langchain chromadb sentence-transformers ollama" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5fd21a3-d820-4e6e-aef4-9ce4955ce2ff", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!pip install -U langchain-community" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "960bc8ae-5d3d-401c-93ae-85b45fe9adee", + "metadata": {}, + "outputs": [], + "source": [ + "# CHROMOA\n", + "from langchain.vectorstores import Chroma\n", + "from langchain.embeddings import SentenceTransformerEmbeddings\n", + "from langchain.schema import Document\n", + "from ollama import chat\n", + "import os\n", + "import re\n", + "\n", + "DOCUMENT_PATHS = [\n", + " r'/home/masih/rag_data/Hamrah.txt', #replace path\n", + " r'/home/masih/rag_data/vape.txt',\n", + " r'/home/masih/rag_data/Shah.txt',\n", + " r'/home/masih/rag_data/Khalife.txt',\n", + " r'/home/masih/rag_data/carbon.txt',\n", + " r'/home/masih/rag_data/takapoo.txt'\n", + "]\n", + "\n", + "EMBEDDING_MODEL = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'\n", + "LLM_MODEL = 'llama3.2'\n", + "CHUNK_SIZE = 1000\n", + "OVERLAP = 200\n", + "CHROMA_PERSIST_DIR = r'\\home\\Masih\\chroma_db\\chroma_db' \n", + "\n", + "class ChromaRAGSystem:\n", + " def __init__(self):\n", + " # Init embedding model\n", + " self.embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)\n", + " # Vector store instance\n", + " self.vector_db = None\n", + " \n", + " def build_vector_store(self):\n", + " \"\"\"Process documents and create Chroma vector store\"\"\"\n", + " all_docs = []\n", + " \n", + "\n", + " for doc_idx, path in enumerate(DOCUMENT_PATHS):\n", + " with open(path, 'r', encoding='utf-8') as f:\n", + " text = re.sub(r'\\s+', ' ', f.read()).strip()\n", + " # sliding window chunking\n", + " chunks = [\n", + " text[i:i+CHUNK_SIZE] \n", + " for i in range(0, len(text), CHUNK_SIZE - OVERLAP)\n", + " ]\n", + " # LangChain documents with metadata\n", + " for chunk in chunks:\n", + " all_docs.append(Document(\n", + " page_content=chunk,\n", + " metadata={\"source_doc\": doc_idx}\n", + " ))\n", + " \n", + " # Chroma vector store\n", + " self.vector_db = Chroma.from_documents(\n", + " documents=all_docs,\n", + " embedding=self.embeddings,\n", + " persist_directory=CHROMA_PERSIST_DIR\n", + " )\n", + " self.vector_db.persist()\n", + " \n", + " def load_vector_store(self):\n", + " \"\"\"Load existing Chroma vector store\"\"\"\n", + " self.vector_db = Chroma(\n", + " persist_directory=CHROMA_PERSIST_DIR,\n", + " embedding_function=self.embeddings\n", + " )\n", + " \n", + " def document_query(self, query, doc_index, top_k=5):\n", + " \"\"\"Retrieve context from specific document\"\"\"\n", + " # Chroma metadata filtering\n", + " results = self.vector_db.similarity_search(\n", + " query=query,\n", + " k=top_k,\n", + " filter={\"source_doc\": doc_index}\n", + " )\n", + " return [doc.page_content for doc in results]\n", + "\n", + "class AnswerGenerator:\n", + " def __init__(self, rag_system):\n", + " self.rag = rag_system\n", + " \n", + " def generate_response(self, question, doc_index):\n", + " \"\"\"Generate context-aware answer using LLM\"\"\"\n", + " # Retrieve relevant context\n", + " context_chunks = self.rag.document_query(question, doc_index)\n", + " context = \"\\n\".join(context_chunks)\n", + " \n", + " prompt = f\"\"\"با استفاده از متن زیر به سوال پاسخ دهید:\n", + "{context}\n", + "\n", + "اگر پاسخ در متن وجود ندارد عبارت 'پاسخی یافت نشد' را برگردانید\n", + "\n", + "سوال: {question}\n", + "پاسخ:\"\"\"\n", + " \n", + " response = chat(model=LLM_MODEL, messages=[{'role': 'user', 'content': prompt}])\n", + " return response['message']['content']\n", + "\n", + "if __name__ == \"__main__\":\n", + " rag_system = ChromaRAGSystem()\n", + " \n", + " # Init vector store\n", + " if not os.path.exists(CHROMA_PERSIST_DIR):\n", + " print(\"Creating new vector store...\")\n", + " rag_system.build_vector_store()\n", + " else:\n", + " print(\"Loading existing vector store...\")\n", + " rag_system.load_vector_store()\n", + " \n", + " # Init answer generator\n", + " answer_engine = AnswerGenerator(rag_system)\n", + "\n", + " queries = [\n", + " (\"چرا اینترنت همراه اول گوشی وصل نمیشود؟\", 0),\n", + " (\"چطوری ویپ مورد نظرمو پیدا کنم؟\", 1),\n", + " (\"شاه عباس که بود؟\", 2),\n", + " (\"خلیفه سلطان که بود و چه کرد؟\", 3),\n", + " (\"کربن اکتیو و کربن بلک چه هستند و چه تفاوتی دارند و برای چه استفاده میشن؟\", 4),\n", + " (\"شرکت تکاپو صنعت نامی چه محصولاتی ارایه میدهد؟ چه چیزی این شرکت را منحصر به فرد میسازد؟ سهام این شرکت صعودی است یا نزولی؟\", 5)\n", + " ]\n", + " \n", + " with open( r'/home/masih/rag_data/response.txt', 'w', encoding='utf-8') as output_file: #repalce path\n", + " for q_num, (query, doc_idx) in enumerate(queries):\n", + " answer = answer_engine.generate_response(query, doc_idx)\n", + " output_file.write(f\"سوال {q_num+1} ({doc_idx+1}):\\n{query}\\n\\nپاسخ:\\n{answer}\\n\\n{'='*50}\\n\\n\")\n", + " print(f\"پردازش سوال {q_num+1}/{len(queries)} تکمیل شد\")\n", + "\n", + "print(\"تمامی سوالات با موفقیت پردازش شدند!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8858c90-1d8d-4b7b-9c25-d94370930f04", + "metadata": {}, + "outputs": [], + "source": [ + "# AGENT\n", + "from langchain.agents import Tool, initialize_agent, AgentType\n", + "from langchain.memory import ConversationBufferMemory\n", + "from langchain_community.llms import Ollama\n", + "import requests\n", + "from bs4 import BeautifulSoup\n", + "\n", + "# 1. Add web browsing tool\n", + "def web_browser_tool(url: str) -> str:\n", + " \"\"\"Fetch webpage content (5000 character limit)\"\"\"\n", + " try:\n", + " response = requests.get(url, timeout=10)\n", + " soup = BeautifulSoup(response.text, 'html.parser')\n", + " \n", + " # Clean HTML\n", + " for element in soup(['script', 'style', 'header', 'footer', 'nav']):\n", + " element.decompose()\n", + " \n", + " text = soup.get_text(separator='\\n', strip=True)\n", + " return text[:5000] # Prevent token overflow\n", + " except Exception as e:\n", + " return f\"Error accessing website: {str(e)}\"\n", + "\n", + "# 2. Agent \n", + "class AgentEnhancedGenerator:\n", + " def __init__(self, rag_system):\n", + " self.rag = rag_system # Use RAG \n", + " self.llm = Ollama(model=LLM_MODEL)\n", + " self.memory = ConversationBufferMemory(memory_key=\"chat_history\", return_messages=True)\n", + " \n", + " # Define tools\n", + " self.tools = [\n", + " Tool(\n", + " name=\"Document_Search\",\n", + " func=self._document_search, # Directly use RAG\n", + " description=\"For questions about mobile networks, historical figures, materials, or companies\"\n", + " ),\n", + " Tool(\n", + " name=\"Web_Browser\",\n", + " func=web_browser_tool,\n", + " description=\"For live web data or current information\"\n", + " )\n", + " ]\n", + " \n", + " # Init agent\n", + " self.agent = initialize_agent(\n", + " tools=self.tools,\n", + " llm=self.llm,\n", + " agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,\n", + " memory=self.memory,\n", + " verbose=True,\n", + " handle_parsing_errors=True,\n", + " max_iterations=3 # Prevent infinite loops\n", + " )\n", + " \n", + " def _document_search(self, query: str) -> str:\n", + " \"\"\"Search across ALL documents in your existing RAG\"\"\"\n", + " results = self.rag.vector_db.similarity_search(query, k=5)\n", + " return \"\\n\".join([doc.page_content for doc in results])\n", + " \n", + " def generate_response(self, question):\n", + " \"\"\"Generate answer using agent system\"\"\"\n", + " try:\n", + " response = self.agent.run(question)\n", + " return response\n", + " except Exception as e:\n", + " return f\"خطا در پردازش: {str(e)}\"\n", + "\n", + "# 3. Example\n", + "if __name__ == \"__main__\":\n", + " # Init RAG\n", + " rag_system = ChromaRAGSystem()\n", + " \n", + " if not os.path.exists(CHROMA_PERSIST_DIR):\n", + " rag_system.build_vector_store()\n", + " else:\n", + " rag_system.load_vector_store()\n", + " \n", + " # Create agent-enhanced generator\n", + " enhanced_agent = AgentEnhancedGenerator(rag_system)\n", + " \n", + " # Test questions (mix of document and web queries)\n", + " test_questions = [\n", + " \"آخرین اخبار درباره شرکت تکاپو صنعت چیست؟\", # Will use web browser\n", + " \"تفاوت کربن اکتیو و کربن بلک چیست؟\", # Uses document search\n", + " \"آیا شاه عباس با خلیفه سلطان همکاری داشت؟\" # Uses conversation memory\n", + " ]\n", + " # Run queries\n", + " with open(r'~\\Desktop\\agent_results.txt', 'w', encoding='utf-8') as output_file:\n", + " for idx, question in enumerate(test_questions):\n", + " answer = enhanced_agent.generate_response(question)\n", + " output_file.write(f\"سوال {idx+1}:\\n{question}\\n\\nپاسخ:\\n{answer}\\n\\n{'='*50}\\n\\n\")\n", + " print(f\"پردازش سوال {idx+1}/{len(test_questions)} تکمیل شد\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbb77f3d-6aec-4414-ae74-4c849eda1ffc", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef439935-9a7a-43fb-a314-8adde514551a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c37998a3-c371-4dcd-a8b8-c7ee2b574266", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe3a2891-f668-4105-901e-808d28ba7657", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ecdd22e-d9eb-4d37-8222-d90a4aa8ea3c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a05aea00-3ad5-4b67-8924-87adbb935e25", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:base] *", + "language": "python", + "name": "conda-base-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}