diff --git a/5. URL-Scraper Agent.ipynb b/5. URL-Scraper Agent.ipynb
index 62d6964..8b13789 100644
--- a/5. URL-Scraper Agent.ipynb
+++ b/5. URL-Scraper Agent.ipynb
@@ -1,116 +1 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f9327343-3e11-4a88-b798-95ff4644e2a5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.vectorstores import Chroma\n",
-    "from langchain.embeddings import SentenceTransformerEmbeddings\n",
-    "from langchain.schema import Document\n",
-    "from ollama import chat\n",
-    "import os\n",
-    "import re\n",
-    "import requests\n",
-    "from bs4 import BeautifulSoup\n",
-    "\n",
-    "EMBEDDING_MODEL = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'\n",
-    "LLM_MODEL = 'gemma2:9b'\n",
-    "CHUNK_SIZE = 1000\n",
-    "OVERLAP = 200\n",
-    "CHROMA_PERSIST_DIR = r'\home\Masih\chroma_db\chroma_db'\n",
-    "\n",
-    "class ChromaRAGSystem:\n",
-    "    def __init__(self):\n",
-    "        # Init embedding model\n",
-    "        self.embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)\n",
-    "        # Vector store instance\n",
-    "        self.vector_db = None\n",
-    "    \n",
-    "    def build_vector_store(self, documents):\n",
-    "        \"\"\"Create Chroma vector store from documents\"\"\"\n",
-    "        self.vector_db = Chroma.from_documents(\n",
-    "            documents=documents,\n",
-    "            embedding=self.embeddings,\n",
-    "            persist_directory=CHROMA_PERSIST_DIR\n",
-    "        )\n",
-    "        self.vector_db.persist()\n",
-    "    \n",
-    "    def load_vector_store(self):\n",
-    "        \"\"\"Load existing Chroma vector store\"\"\"\n",
-    "        self.vector_db = Chroma(\n",
-    "            persist_directory=CHROMA_PERSIST_DIR,\n",
-    "            embedding_function=self.embeddings\n",
-    "        )\n",
-    "    \n",
-    "    def document_query(self, query, top_k=5):\n",
-    "        \"\"\"Retrieve context from documents based on query\"\"\"\n",
-    "        # Perform similarity search across all documents\n",
-    "        results = self.vector_db.similarity_search(query=query, k=top_k)\n",
-    "        return [doc.page_content for doc in results]\n",
-    "\n",
-    "class AnswerGenerator:\n",
-    "    def __init__(self, rag_system):\n",
-    "        self.rag = rag_system\n",
-    "    \n",
-    "    def generate_response(self, question):\n",
-    "        \"\"\"Generate context-aware answer using LLM\"\"\"\n",
-    "        # Retrieve relevant context from the best matching documents\n",
-    "        context_chunks = self.rag.document_query(question)\n",
-    "        context = \"\\n\".join(context_chunks)\n",
-    "        \n",
-    "        prompt = f\"\"\"با استفاده از متن زیر به سوال پاسخ دهید:\n",
-    "{context}\n",
-    "\n",
-    "اگر پاسخ در متن وجود ندارد عبارت 'پاسخی یافت نشد' را برگردانید\n",
-    "\n",
-    "سوال: {question}\n",
-    "پاسخ:\"\"\"\n",
-    "        \n",
-    "        response = chat(model=LLM_MODEL, messages=[{'role': 'user', 'content': prompt}])\n",
-    "        return response['message']['content']\n",
-    "\n",
-    "def scrape_url(url):\n",
-    "    \"\"\"Scrape the content from a given URL\"\"\"\n",
-    "    response = requests.get(url)\n",
-    "    soup = BeautifulSoup(response.content, 'html.parser')\n",
-    "\n",
-    "    # Extract the article text (adjust this as per the specific page's structure)\n",
-    "    paragraphs = soup.find_all('p')\n",
-    "    article_text = \"\\n\".join([para.get_text() for para in paragraphs])\n",
-    "\n",
-    "    return article_text\n",
-    "\n",
-    "if __name__ == \"__main__\":\n",
-    "    url = \"https://tosinso.com/articles/40596\"\n",
-    "    article_content = scrape_url(url)\n",
-    "\n",
-    "    # Process the scraped content and create a vector store\n",
-    "    rag_system = ChromaRAGSystem()\n",
-    "\n",
-    "    # Chunk the article content\n",
-    "    chunks = [article_content[i:i+CHUNK_SIZE] for i in range(0, len(article_content), CHUNK_SIZE - OVERLAP)]\n",
-    "    documents = [Document(page_content=chunk) for chunk in chunks]\n",
-    "\n",
-    "    # Build vector store\n",
-    "    rag_system.build_vector_store(documents)\n",
-    "\n",
-    "    # Init answer generator\n",
-    "    answer_engine = AnswerGenerator(rag_system)\n",
-    "\n",
-    "    # The query to be answered\n",
-    "    query = \"تفاوت زیروکلاینت و تین کلاینت با PC در چیست؟\"\n",
-    "\n",
-    "    # Generate and print the response\n",
-    "    answer = answer_engine.generate_response(query)\n",
-    "    print(answer)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "cc9f2290-d16f-4722-857a-7996d4722857",
-   "metadata": {},
-   "outputs": []
+