{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "intro-md",
   "metadata": {},
   "source": [
    "# RAG over a scraped web article (Chroma + Ollama)\n",
    "\n",
    "Scrapes an article, splits it into overlapping chunks, embeds the chunks into a\n",
    "persisted Chroma vector store, and answers a Persian-language question with a\n",
    "local `gemma2:9b` model served by Ollama."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "imports-cell",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.vectorstores import Chroma\n",
    "from langchain.embeddings import SentenceTransformerEmbeddings\n",
    "from langchain.schema import Document\n",
    "from ollama import chat\n",
    "import os\n",
    "import re\n",
    "import requests\n",
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "config-cell",
   "metadata": {},
   "outputs": [],
   "source": [
    "EMBEDDING_MODEL = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'\n",
    "LLM_MODEL = 'gemma2:9b'\n",
    "CHUNK_SIZE = 1000     # characters per chunk\n",
    "OVERLAP = 200         # characters shared between consecutive chunks\n",
    "REQUEST_TIMEOUT = 30  # seconds allowed for the HTTP fetch\n",
    "\n",
    "# Fix: the original value was r'\\\\home\\\\Masih\\\\chroma_db\\\\chroma_db' -- a raw\n",
    "# string of literal backslashes that is not a valid POSIX path. Resolve the\n",
    "# directory under the current user's home instead of hardcoding another\n",
    "# user's absolute path (identical location when run as that user).\n",
    "CHROMA_PERSIST_DIR = os.path.expanduser(os.path.join('~', 'chroma_db', 'chroma_db'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "rag-system-cell",
   "metadata": {},
   "outputs": [],
   "source": [
    "class ChromaRAGSystem:\n",
    "    \"\"\"Thin wrapper around a persisted Chroma vector store.\"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        # Multilingual embedding model (handles the Persian article text).\n",
    "        self.embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)\n",
    "        # Populated by build_vector_store() or load_vector_store().\n",
    "        self.vector_db = None\n",
    "\n",
    "    def build_vector_store(self, documents):\n",
    "        \"\"\"Embed `documents` and persist them as a new Chroma collection.\"\"\"\n",
    "        self.vector_db = Chroma.from_documents(\n",
    "            documents=documents,\n",
    "            embedding=self.embeddings,\n",
    "            persist_directory=CHROMA_PERSIST_DIR,\n",
    "        )\n",
    "        self.vector_db.persist()\n",
    "\n",
    "    def load_vector_store(self):\n",
    "        \"\"\"Re-open a previously persisted Chroma collection.\"\"\"\n",
    "        self.vector_db = Chroma(\n",
    "            persist_directory=CHROMA_PERSIST_DIR,\n",
    "            embedding_function=self.embeddings,\n",
    "        )\n",
    "\n",
    "    def document_query(self, query, top_k=5):\n",
    "        \"\"\"Return the page_content of the `top_k` chunks most similar to `query`.\n",
    "\n",
    "        Raises:\n",
    "            RuntimeError: if no vector store has been built or loaded yet\n",
    "                (the original code failed here with an opaque AttributeError).\n",
    "        \"\"\"\n",
    "        if self.vector_db is None:\n",
    "            raise RuntimeError(\n",
    "                'Vector store not initialised; call build_vector_store() '\n",
    "                'or load_vector_store() first.'\n",
    "            )\n",
    "        results = self.vector_db.similarity_search(query=query, k=top_k)\n",
    "        return [doc.page_content for doc in results]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "answer-generator-cell",
   "metadata": {},
   "outputs": [],
   "source": [
    "class AnswerGenerator:\n",
    "    \"\"\"Answers questions by stuffing retrieved document chunks into an LLM prompt.\"\"\"\n",
    "\n",
    "    def __init__(self, rag_system):\n",
    "        # A ChromaRAGSystem instance that provides document_query().\n",
    "        self.rag = rag_system\n",
    "\n",
    "    def generate_response(self, question):\n",
    "        \"\"\"Retrieve context for `question` and ask the LLM to answer from it only.\"\"\"\n",
    "        context_chunks = self.rag.document_query(question)\n",
    "        context = \"\\n\".join(context_chunks)\n",
    "\n",
    "        # Persian prompt, kept byte-identical: \"answer the question using the\n",
    "        # text below; if the answer is not in the text, return 'no answer found'\".\n",
    "        prompt = f\"\"\"با استفاده از متن زیر به سوال پاسخ دهید:\n",
    "{context}\n",
    "\n",
    "اگر پاسخ در متن وجود ندارد عبارت 'پاسخی یافت نشد' را برگردانید\n",
    "\n",
    "سوال: {question}\n",
    "پاسخ:\"\"\"\n",
    "\n",
    "        response = chat(model=LLM_MODEL, messages=[{'role': 'user', 'content': prompt}])\n",
    "        return response['message']['content']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "scraping-cell",
   "metadata": {},
   "outputs": [],
   "source": [
    "def scrape_url(url):\n",
    "    \"\"\"Fetch `url` and return the concatenated text of all <p> tags.\n",
    "\n",
    "    Adds a timeout and an HTTP status check the original lacked, so a hung\n",
    "    server or an error page fails loudly instead of silently yielding\n",
    "    empty or garbage content.\n",
    "    \"\"\"\n",
    "    response = requests.get(url, timeout=REQUEST_TIMEOUT)\n",
    "    response.raise_for_status()\n",
    "    soup = BeautifulSoup(response.content, 'html.parser')\n",
    "    # Adjust the selector if the target page structures its article differently.\n",
    "    paragraphs = soup.find_all('p')\n",
    "    return \"\\n\".join(para.get_text() for para in paragraphs)\n",
    "\n",
    "\n",
    "def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=OVERLAP):\n",
    "    \"\"\"Split `text` into chunks of `chunk_size` chars overlapping by `overlap`.\"\"\"\n",
    "    if overlap >= chunk_size:\n",
    "        raise ValueError('overlap must be smaller than chunk_size')\n",
    "    step = chunk_size - overlap\n",
    "    return [text[i:i + chunk_size] for i in range(0, len(text), step)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "main-cell",
   "metadata": {},
   "outputs": [],
   "source": [
    "if __name__ == \"__main__\":\n",
    "    url = \"https://tosinso.com/articles/40596\"\n",
    "    article_content = scrape_url(url)\n",
    "\n",
    "    # Index overlapping chunks of the article in a fresh vector store.\n",
    "    rag_system = ChromaRAGSystem()\n",
    "    documents = [Document(page_content=chunk) for chunk in chunk_text(article_content)]\n",
    "    rag_system.build_vector_store(documents)\n",
    "\n",
    "    # Ask a question against the indexed article and print the answer.\n",
    "    answer_engine = AnswerGenerator(rag_system)\n",
    "    query = \"تفاوت زیروکلاینت و تین کلاینت با PC در چیست؟\"\n",
    "    answer = answer_engine.generate_response(query)\n",
    "    print(answer)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:base] *",
   "language": "python",
   "name": "conda-base-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}