Update 5. URL-Scraper Agent.ipynb

Masih Moafi 2025-03-03 15:46:32 +03:30 committed by GitHub
parent d81c610125
commit fb52875428


@@ -1 +1,99 @@
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
# (newer LangChain releases import these two from langchain_community instead)
from langchain.schema import Document
from ollama import chat
import requests
from bs4 import BeautifulSoup

# Multilingual embedding model, chosen because the scraped article is Persian
EMBEDDING_MODEL = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
LLM_MODEL = 'gemma2:9b'
CHUNK_SIZE = 1000  # characters per chunk
OVERLAP = 200      # characters shared between consecutive chunks
CHROMA_PERSIST_DIR = '/home/Masih/chroma_db/chroma_db'
class ChromaRAGSystem:
    def __init__(self):
        # Init embedding model
        self.embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
        # Vector store instance (built or loaded later)
        self.vector_db = None

    def build_vector_store(self, documents):
        """Create a Chroma vector store from documents and persist it to disk."""
        self.vector_db = Chroma.from_documents(
            documents=documents,
            embedding=self.embeddings,
            persist_directory=CHROMA_PERSIST_DIR
        )
        self.vector_db.persist()

    def load_vector_store(self):
        """Load an existing persisted Chroma vector store."""
        self.vector_db = Chroma(
            persist_directory=CHROMA_PERSIST_DIR,
            embedding_function=self.embeddings
        )

    def document_query(self, query, top_k=5):
        """Retrieve the top_k most similar chunks for a query."""
        results = self.vector_db.similarity_search(query=query, k=top_k)
        return [doc.page_content for doc in results]
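
    def document_query_scored(self, query, top_k=5, max_distance=1.0):
        """Sketch only, not part of the original pipeline: Chroma also
        exposes similarity_search_with_score, which returns
        (Document, distance) pairs where a lower distance means a closer
        match. The max_distance threshold here is an illustrative value,
        not something tuned for this corpus."""
        scored = self.vector_db.similarity_search_with_score(query, k=top_k)
        return [doc.page_content for doc, distance in scored if distance <= max_distance]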
class AnswerGenerator:
    def __init__(self, rag_system):
        self.rag = rag_system

    def generate_response(self, question):
        """Generate a context-aware answer using the LLM."""
        # Retrieve the most relevant chunks and join them into one context block
        context_chunks = self.rag.document_query(question)
        context = "\n".join(context_chunks)
        # Persian prompt; in English: "Answer the question using the text
        # below: {context}. If the answer is not in the text, return the
        # phrase 'پاسخی یافت نشد' (no answer found). Question: ... Answer:"
        prompt = f"""با استفاده از متن زیر به سوال پاسخ دهید:
{context}
اگر پاسخ در متن وجود ندارد عبارت 'پاسخی یافت نشد' را برگردانید
سوال: {question}
پاسخ:"""
        response = chat(model=LLM_MODEL, messages=[{'role': 'user', 'content': prompt}])
        return response['message']['content']
def scrape_url(url):
    """Scrape the paragraph text from a given URL."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the article text (adjust the selector to the specific page's structure)
    paragraphs = soup.find_all('p')
    return "\n".join(para.get_text() for para in paragraphs)
if __name__ == "__main__":
    url = "https://tosinso.com/articles/40596"
    article_content = scrape_url(url)

    # Process the scraped content: chunk it into overlapping character windows
    rag_system = ChromaRAGSystem()
    chunks = [article_content[i:i + CHUNK_SIZE]
              for i in range(0, len(article_content), CHUNK_SIZE - OVERLAP)]
    documents = [Document(page_content=chunk) for chunk in chunks]
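
    # Alternative chunking sketch (not used above): LangChain's
    # RecursiveCharacterTextSplitter prefers paragraph and sentence
    # boundaries over fixed character offsets, which avoids splitting
    # words mid-stream; same CHUNK_SIZE/OVERLAP settings assumed.
    #
    #     from langchain.text_splitter import RecursiveCharacterTextSplitter
    #     splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE,
    #                                               chunk_overlap=OVERLAP)
    #     documents = splitter.create_documents([article_content])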
    # Build vector store
    rag_system.build_vector_store(documents)

    # Init answer generator
    answer_engine = AnswerGenerator(rag_system)

    # The query to be answered; in English: "How do zero clients and
    # thin clients differ from a PC?"
    query = "تفاوت زیروکلاینت و تین کلاینت با PC در چیست؟"

    # Generate and print the response
    answer = answer_engine.generate_response(query)
    print(answer)
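
    # Reuse sketch: on later runs the persisted index can be loaded
    # instead of re-scraping and re-embedding (assumes CHROMA_PERSIST_DIR
    # already holds a built store):
    #
    #     rag_system = ChromaRAGSystem()
    #     rag_system.load_vector_store()
    #     answer_engine = AnswerGenerator(rag_system)
    #     print(answer_engine.generate_response(query))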