"""Scrape an article URL, index its text in a persisted Chroma vector store,
and answer a (Persian) question about it with a local Ollama LLM — a minimal
retrieval-augmented-generation (RAG) pipeline.

Pipeline: scrape_url -> chunk_text -> ChromaRAGSystem (embed + store) ->
AnswerGenerator (retrieve context + prompt the LLM).
"""

from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.schema import Document
from ollama import chat
import os
import re
import requests
from bs4 import BeautifulSoup

EMBEDDING_MODEL = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
LLM_MODEL = 'gemma2:9b'
CHUNK_SIZE = 1000   # characters per chunk
OVERLAP = 200       # characters shared between consecutive chunks
# FIX: the original used a raw string with backslashes (r'\home\Masih\...'),
# which is neither a valid POSIX path nor a valid Windows path (no drive
# letter). The intended location is clearly a Linux home directory.
CHROMA_PERSIST_DIR = '/home/Masih/chroma_db/chroma_db'


class ChromaRAGSystem:
    """Thin wrapper around a persisted Chroma vector store."""

    def __init__(self):
        # Multilingual embedding model (the corpus and queries are Persian).
        self.embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
        # Populated by build_vector_store() or load_vector_store().
        self.vector_db = None

    def build_vector_store(self, documents):
        """Create a Chroma vector store from *documents* and persist it to disk.

        Args:
            documents: non-empty list of langchain ``Document`` objects.

        Raises:
            ValueError: if *documents* is empty (Chroma cannot index nothing).
        """
        if not documents:
            raise ValueError("build_vector_store() requires a non-empty document list")
        self.vector_db = Chroma.from_documents(
            documents=documents,
            embedding=self.embeddings,
            persist_directory=CHROMA_PERSIST_DIR,
        )
        self.vector_db.persist()

    def load_vector_store(self):
        """Load an existing Chroma vector store from CHROMA_PERSIST_DIR."""
        self.vector_db = Chroma(
            persist_directory=CHROMA_PERSIST_DIR,
            embedding_function=self.embeddings,
        )

    def document_query(self, query, top_k=5):
        """Return the *top_k* most similar stored chunks for *query*.

        Raises:
            RuntimeError: if no vector store has been built or loaded yet
                (the original dereferenced ``None`` here and crashed with an
                opaque AttributeError).
        """
        if self.vector_db is None:
            raise RuntimeError(
                "Vector store not initialised; call build_vector_store() "
                "or load_vector_store() first."
            )
        results = self.vector_db.similarity_search(query=query, k=top_k)
        return [doc.page_content for doc in results]


class AnswerGenerator:
    """Generates context-grounded answers using the RAG system's retriever."""

    def __init__(self, rag_system):
        self.rag = rag_system

    def generate_response(self, question):
        """Retrieve relevant chunks for *question* and ask the LLM to answer
        strictly from that context (prompt instructs it to reply
        'پاسخی یافت نشد' when the answer is absent)."""
        context_chunks = self.rag.document_query(question)
        context = "\n".join(context_chunks)

        prompt = f"""با استفاده از متن زیر به سوال پاسخ دهید:
{context}

اگر پاسخ در متن وجود ندارد عبارت 'پاسخی یافت نشد' را برگردانید

سوال: {question}
پاسخ:"""

        response = chat(model=LLM_MODEL, messages=[{'role': 'user', 'content': prompt}])
        return response['message']['content']


def scrape_url(url, timeout=30):
    """Fetch *url* and return the concatenated text of its ``<p>`` elements.

    Args:
        url: page to scrape.
        timeout: seconds before the HTTP request is aborted (the original
            call had no timeout and could hang indefinitely).

    Raises:
        requests.HTTPError: on a non-2xx response (the original silently
            parsed error pages).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # NOTE(review): <p>-only extraction is site-specific; adjust the selector
    # for pages that put article text elsewhere.
    paragraphs = soup.find_all('p')
    return "\n".join(para.get_text() for para in paragraphs)


def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=OVERLAP):
    """Split *text* into overlapping character chunks.

    Consecutive chunks share *overlap* characters so sentences that straddle
    a boundary still appear whole in at least one chunk. Returns [] for
    empty input. *overlap* must be smaller than *chunk_size* (the step
    ``chunk_size - overlap`` must stay positive).
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    step = chunk_size - overlap
    return [text[i:i + chunk_size] for i in range(0, len(text), step)]


if __name__ == "__main__":
    url = "https://tosinso.com/articles/40596"
    article_content = scrape_url(url)
    if not article_content:
        raise SystemExit(f"No paragraph text extracted from {url}")

    # Index the scraped article.
    rag_system = ChromaRAGSystem()
    documents = [Document(page_content=chunk) for chunk in chunk_text(article_content)]
    rag_system.build_vector_store(documents)

    # Ask the question and print the grounded answer.
    answer_engine = AnswerGenerator(rag_system)
    query = "تفاوت زیروکلاینت و تین کلاینت با PC در چیست؟"
    print(answer_engine.generate_response(query))