Update 5. URL-Scraper Agent.ipynb

Masih Moafi 2025-03-03 15:46:32 +03:30 committed by GitHub
parent d81c610125
commit fb52875428


@@ -1 +1,99 @@
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
# (newer LangChain releases import these two from langchain_community instead)
from langchain.schema import Document
from ollama import chat
import requests
from bs4 import BeautifulSoup

# Multilingual embedding model, chosen because the scraped article is Persian
EMBEDDING_MODEL = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
LLM_MODEL = 'gemma2:9b'
CHUNK_SIZE = 1000  # characters per chunk
OVERLAP = 200      # characters shared between consecutive chunks
CHROMA_PERSIST_DIR = '/home/Masih/chroma_db/chroma_db'
class ChromaRAGSystem:
    def __init__(self):
        # Init embedding model
        self.embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
        # Vector store instance (built or loaded later)
        self.vector_db = None

    def build_vector_store(self, documents):
        """Create a Chroma vector store from documents and persist it to disk."""
        self.vector_db = Chroma.from_documents(
            documents=documents,
            embedding=self.embeddings,
            persist_directory=CHROMA_PERSIST_DIR
        )
        self.vector_db.persist()

    def load_vector_store(self):
        """Load an existing persisted Chroma vector store."""
        self.vector_db = Chroma(
            persist_directory=CHROMA_PERSIST_DIR,
            embedding_function=self.embeddings
        )

    def document_query(self, query, top_k=5):
        """Retrieve the top_k most similar chunks for a query."""
        results = self.vector_db.similarity_search(query=query, k=top_k)
        return [doc.page_content for doc in results]
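
    def document_query_scored(self, query, top_k=5, max_distance=1.0):
        """Sketch only, not part of the original pipeline: Chroma also
        exposes similarity_search_with_score, which returns
        (Document, distance) pairs where a lower distance means a closer
        match. The max_distance threshold here is an illustrative value,
        not something tuned for this corpus."""
        scored = self.vector_db.similarity_search_with_score(query, k=top_k)
        return [doc.page_content for doc, distance in scored if distance <= max_distance]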
class AnswerGenerator:
    def __init__(self, rag_system):
        self.rag = rag_system

    def generate_response(self, question):
        """Generate a context-aware answer using the LLM."""
        # Retrieve the most relevant chunks and join them into one context block
        context_chunks = self.rag.document_query(question)
        context = "\n".join(context_chunks)
        # Persian prompt; in English: "Answer the question using the text
        # below: {context}. If the answer is not in the text, return the
        # phrase 'پاسخی یافت نشد' (no answer found). Question: ... Answer:"
        prompt = f"""با استفاده از متن زیر به سوال پاسخ دهید:
{context}
اگر پاسخ در متن وجود ندارد عبارت 'پاسخی یافت نشد' را برگردانید
سوال: {question}
پاسخ:"""
        response = chat(model=LLM_MODEL, messages=[{'role': 'user', 'content': prompt}])
        return response['message']['content']
def scrape_url(url):
    """Scrape the paragraph text from a given URL."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the article text (adjust the selector to the specific page's structure)
    paragraphs = soup.find_all('p')
    return "\n".join(para.get_text() for para in paragraphs)
if __name__ == "__main__":
    url = "https://tosinso.com/articles/40596"
    article_content = scrape_url(url)

    # Process the scraped content: chunk it into overlapping character windows
    rag_system = ChromaRAGSystem()
    chunks = [article_content[i:i + CHUNK_SIZE]
              for i in range(0, len(article_content), CHUNK_SIZE - OVERLAP)]
    documents = [Document(page_content=chunk) for chunk in chunks]
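
    # Alternative chunking sketch (not used above): LangChain's
    # RecursiveCharacterTextSplitter prefers paragraph and sentence
    # boundaries over fixed character offsets, which avoids splitting
    # words mid-stream; same CHUNK_SIZE/OVERLAP settings assumed.
    #
    #     from langchain.text_splitter import RecursiveCharacterTextSplitter
    #     splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE,
    #                                               chunk_overlap=OVERLAP)
    #     documents = splitter.create_documents([article_content])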
    # Build vector store
    rag_system.build_vector_store(documents)

    # Init answer generator
    answer_engine = AnswerGenerator(rag_system)

    # The query to be answered; in English: "How do zero clients and
    # thin clients differ from a PC?"
    query = "تفاوت زیروکلاینت و تین کلاینت با PC در چیست؟"

    # Generate and print the response
    answer = answer_engine.generate_response(query)
    print(answer)
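
    # Reuse sketch: on later runs the persisted index can be loaded
    # instead of re-scraping and re-embedding (assumes CHROMA_PERSIST_DIR
    # already holds a built store):
    #
    #     rag_system = ChromaRAGSystem()
    #     rag_system.load_vector_store()
    #     answer_engine = AnswerGenerator(rag_system)
    #     print(answer_engine.generate_response(query))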