LLMs/5. URL-Scraper Agent.ipynb
from langchain.vectorstores import Chroma  # note: langchain_community.vectorstores in LangChain >= 0.2
from langchain.embeddings import SentenceTransformerEmbeddings  # note: langchain_community.embeddings in LangChain >= 0.2
from langchain.schema import Document
from ollama import chat
import requests
from bs4 import BeautifulSoup

EMBEDDING_MODEL = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
LLM_MODEL = 'gemma2:9b'
CHUNK_SIZE = 1000
OVERLAP = 200
# Forward slashes: the original backslash-separated path would not resolve on Linux
CHROMA_PERSIST_DIR = '/home/Masih/chroma_db/chroma_db'
class ChromaRAGSystem:
    def __init__(self):
        # Initialise the multilingual embedding model
        self.embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
        # Vector store instance (set by build_vector_store or load_vector_store)
        self.vector_db = None
    def build_vector_store(self, documents):
        """Create a Chroma vector store from documents and persist it to disk"""
        self.vector_db = Chroma.from_documents(
            documents=documents,
            embedding=self.embeddings,
            persist_directory=CHROMA_PERSIST_DIR,
        )
        self.vector_db.persist()
    def load_vector_store(self):
        """Load an existing Chroma vector store from CHROMA_PERSIST_DIR"""
        self.vector_db = Chroma(
            persist_directory=CHROMA_PERSIST_DIR,
            embedding_function=self.embeddings,
        )
    def document_query(self, query, top_k=5):
        """Retrieve the top_k chunks most similar to the query"""
        results = self.vector_db.similarity_search(query=query, k=top_k)
        return [doc.page_content for doc in results]
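
# A hedged aside: the langchain Chroma wrapper also exposes
# similarity_search_with_score, which returns (Document, distance) pairs and is
# useful for sanity-checking how relevant the retrieved chunks actually are.
# Minimal sketch, assuming rag_system is a ChromaRAGSystem whose store has
# already been built or loaded:
#
#   hits = rag_system.vector_db.similarity_search_with_score("some question", k=5)
#   for doc, score in hits:
#       print(round(score, 3), doc.page_content[:80])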
class AnswerGenerator:
    def __init__(self, rag_system):
        self.rag = rag_system

    def generate_response(self, question):
        """Generate a context-aware answer with the LLM"""
        # Retrieve relevant context from the best-matching chunks
        context_chunks = self.rag.document_query(question)
        context = "\n".join(context_chunks)
        # Persian prompt; in English: "Answer the question using the text below:
        # {context}. If the answer is not in the text, return 'No answer found'.
        # Question: {question} Answer:"
        prompt = f"""با استفاده از متن زیر به سوال پاسخ دهید:
{context}
اگر پاسخ در متن وجود ندارد عبارت 'پاسخی یافت نشد' را برگردانید
سوال: {question}
پاسخ:"""
        response = chat(model=LLM_MODEL, messages=[{'role': 'user', 'content': prompt}])
        return response['message']['content']
def scrape_url(url):
    """Scrape the paragraph text from a given URL"""
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # fail fast on HTTP errors
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the article text (adjust the selector to the target page's structure)
    paragraphs = soup.find_all('p')
    return "\n".join(para.get_text() for para in paragraphs)
if __name__ == "__main__":
    url = "https://tosinso.com/articles/40596"
    article_content = scrape_url(url)

    # Process the scraped content and create a vector store
    rag_system = ChromaRAGSystem()

    # Chunk the article into overlapping windows (stride = CHUNK_SIZE - OVERLAP)
    chunks = [article_content[i:i + CHUNK_SIZE]
              for i in range(0, len(article_content), CHUNK_SIZE - OVERLAP)]
    documents = [Document(page_content=chunk) for chunk in chunks]
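
    # A hedged alternative to the manual slicing above: LangChain's
    # RecursiveCharacterTextSplitter splits on natural boundaries (paragraphs,
    # sentences) rather than fixed character offsets. Drop-in sketch with the
    # same size/overlap settings:
    #
    #   from langchain.text_splitter import RecursiveCharacterTextSplitter
    #   splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=OVERLAP)
    #   documents = splitter.create_documents([article_content])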
    # Build the vector store
    rag_system.build_vector_store(documents)

    # Initialise the answer generator
    answer_engine = AnswerGenerator(rag_system)

    # Query (Persian): "What is the difference between zero clients and thin
    # clients versus a PC?"
    query = "تفاوت زیروکلاینت و تین کلاینت با PC در چیست؟"

    # Generate and print the response
    answer = answer_engine.generate_response(query)
    print(answer)
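
# load_vector_store is defined above but never exercised. A minimal sketch of
# reusing the persisted store on a later run, so the article does not have to
# be re-scraped and re-embedded (assumes CHROMA_PERSIST_DIR already holds a
# store from a previous run):
#
#   rag_system = ChromaRAGSystem()
#   rag_system.load_vector_store()
#   answer_engine = AnswerGenerator(rag_system)
#   print(answer_engine.generate_response("another question about the article"))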