{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "f9327343-3e11-4a88-b798-95ff4644e2a5", "metadata": {}, "outputs": [], "source": [ "from langchain.vectorstores import Chroma\n", "from langchain.embeddings import SentenceTransformerEmbeddings\n", "from langchain.schema import Document\n", "from ollama import chat\n", "import os\n", "import re\n", "import requests\n", "from bs4 import BeautifulSoup\n", "\n", "EMBEDDING_MODEL = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'\n", "LLM_MODEL = 'gemma2:9b'\n", "CHUNK_SIZE = 1000\n", "OVERLAP = 200\n", "CHROMA_PERSIST_DIR = r'\\home\\Masih\\chroma_db\\chroma_db'\n", "\n", "class ChromaRAGSystem:\n", " def __init__(self):\n", " # Init embedding model\n", " self.embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)\n", " # Vector store instance\n", " self.vector_db = None\n", " \n", " def build_vector_store(self, documents):\n", " \"\"\"Create Chroma vector store from documents\"\"\"\n", " self.vector_db = Chroma.from_documents(\n", " documents=documents,\n", " embedding=self.embeddings,\n", " persist_directory=CHROMA_PERSIST_DIR\n", " )\n", " self.vector_db.persist()\n", " \n", " def load_vector_store(self):\n", " \"\"\"Load existing Chroma vector store\"\"\"\n", " self.vector_db = Chroma(\n", " persist_directory=CHROMA_PERSIST_DIR,\n", " embedding_function=self.embeddings\n", " )\n", " \n", " def document_query(self, query, top_k=5):\n", " \"\"\"Retrieve context from documents based on query\"\"\"\n", " # Perform similarity search across all documents\n", " results = self.vector_db.similarity_search(query=query, k=top_k)\n", " return [doc.page_content for doc in results]\n", "\n", "class AnswerGenerator:\n", " def __init__(self, rag_system):\n", " self.rag = rag_system\n", " \n", " def generate_response(self, question):\n", " \"\"\"Generate context-aware answer using LLM\"\"\"\n", " # Retrieve relevant context from the best matching documents\n", " context_chunks = self.rag.document_query(question)\n", " context = \"\\n\".join(context_chunks)\n", " \n", " prompt = f\"\"\"با استفاده از متن زیر به سوال پاسخ دهید:\n", "{context}\n", "\n", "اگر پاسخ در متن وجود ندارد عبارت 'پاسخی یافت نشد' را برگردانید\n", "\n", "سوال: {question}\n", "پاسخ:\"\"\"\n", " \n", " response = chat(model=LLM_MODEL, messages=[{'role': 'user', 'content': prompt}])\n", " return response['message']['content']\n", "\n", "def scrape_url(url):\n", " \"\"\"Scrape the content from a given URL\"\"\"\n", " response = requests.get(url)\n", " soup = BeautifulSoup(response.content, 'html.parser')\n", "\n", " # Extract the article text (adjust this as per the specific page's structure)\n", " paragraphs = soup.find_all('p')\n", " article_text = \"\\n\".join([para.get_text() for para in paragraphs])\n", "\n", " return article_text\n", "\n", "if __name__ == \"__main__\":\n", " url = \"https://tosinso.com/articles/40596\"\n", " article_content = scrape_url(url)\n", "\n", " # Process the scraped content and create a vector store\n", " rag_system = ChromaRAGSystem()\n", "\n", " # Chunk the article content\n", " chunks = [article_content[i:i+CHUNK_SIZE] for i in range(0, len(article_content), CHUNK_SIZE - OVERLAP)]\n", " documents = [Document(page_content=chunk) for chunk in chunks]\n", "\n", " # Build vector store\n", " rag_system.build_vector_store(documents)\n", "\n", " # Init answer generator\n", " answer_engine = AnswerGenerator(rag_system)\n", "\n", " # The query to be answered\n", " query = \"تفاوت زیروکلاینت و تین کلاینت با PC در 
چیست؟\"\n", "\n", " # Generate and print the response\n", " answer = answer_engine.generate_response(query)\n", " print(answer)" ] }, { "cell_type": "code", "execution_count": null, "id": "cc9f2290-d16f-4722-857a-7996d4722857", "metadata": {}, "outputs": [], "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "\n", "def search_internet(query):\n", " \"\"\"Search the web for the given query and return a relevant snippet.\"\"\"\n", " query = query.replace(\" \", \"+\") # Format the query for URLs\n", " url = f\"https://www.google.com/search?q={query}\"\n", " \n", " # Send a GET request to Google (NOTE: scraping Google directly can get blocked)\n", " headers = {\n", " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36\"\n", " }\n", " response = requests.get(url, headers=headers)\n", "\n", " if response.status_code != 200:\n", " return \"Error: Unable to retrieve data from the internet.\"\n", " \n", " soup = BeautifulSoup(response.content, 'html.parser')\n", "\n", " # Scrape content from search result snippets (extract the first result snippet)\n", " search_results = soup.find_all('div', {'class': 'BNeawe iBp4i AP7Wnd'})\n", " if search_results:\n", " return search_results[0].get_text()\n", " \n", " return \"No relevant information found on the web.\"\n", "\n", "def generate_answer(query):\n", " \"\"\"Generate an answer by first checking Wikipedia and then searching the internet.\"\"\"\n", " # First, check Wikipedia for Persian content\n", " wikipedia_answer = search_wikipedia(query)\n", " if wikipedia_answer and \"Error\" not in wikipedia_answer:\n", " return wikipedia_answer\n", " \n", " # If not found in Wikipedia, search the web\n", " internet_answer = search_internet(query)\n", " return internet_answer\n", "\n", "if __name__ == \"__main__\":\n", " query = \"شاه عباس صفوی که بود و چه کرد؟\"\n", " \n", " # Get the answer from Wikipedia and Internet search\n", " answer = generate_answer(query)\n", " \n", " # Print the answer\n", " print(f\"Answer: {answer}\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.7" } }, "nbformat": 4, "nbformat_minor": 5 }