LLMs/5. URL-Scraper Agent.ipynb
from langchain.vectorstores import Chroma  # note: langchain_community.vectorstores in LangChain >= 0.2
from langchain.embeddings import SentenceTransformerEmbeddings  # note: langchain_community.embeddings in LangChain >= 0.2
from langchain.schema import Document
from ollama import chat
import requests
from bs4 import BeautifulSoup

EMBEDDING_MODEL = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
LLM_MODEL = 'gemma2:9b'
CHUNK_SIZE = 1000
OVERLAP = 200
# Forward slashes: the original backslash-separated path would not resolve on Linux
CHROMA_PERSIST_DIR = '/home/Masih/chroma_db/chroma_db'
class ChromaRAGSystem:
    def __init__(self):
        # Initialise the multilingual embedding model
        self.embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
        # Vector store instance (set by build_vector_store or load_vector_store)
        self.vector_db = None
    def build_vector_store(self, documents):
        """Create a Chroma vector store from documents and persist it to disk"""
        self.vector_db = Chroma.from_documents(
            documents=documents,
            embedding=self.embeddings,
            persist_directory=CHROMA_PERSIST_DIR,
        )
        self.vector_db.persist()
    def load_vector_store(self):
        """Load an existing Chroma vector store from CHROMA_PERSIST_DIR"""
        self.vector_db = Chroma(
            persist_directory=CHROMA_PERSIST_DIR,
            embedding_function=self.embeddings,
        )
    def document_query(self, query, top_k=5):
        """Retrieve the top_k chunks most similar to the query"""
        results = self.vector_db.similarity_search(query=query, k=top_k)
        return [doc.page_content for doc in results]
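
# A hedged aside: the langchain Chroma wrapper also exposes
# similarity_search_with_score, which returns (Document, distance) pairs and is
# useful for sanity-checking how relevant the retrieved chunks actually are.
# Minimal sketch, assuming rag_system is a ChromaRAGSystem whose store has
# already been built or loaded:
#
#   hits = rag_system.vector_db.similarity_search_with_score("some question", k=5)
#   for doc, score in hits:
#       print(round(score, 3), doc.page_content[:80])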
class AnswerGenerator:
    def __init__(self, rag_system):
        self.rag = rag_system

    def generate_response(self, question):
        """Generate a context-aware answer with the LLM"""
        # Retrieve relevant context from the best-matching chunks
        context_chunks = self.rag.document_query(question)
        context = "\n".join(context_chunks)
        # Persian prompt; in English: "Answer the question using the text below:
        # {context}. If the answer is not in the text, return 'No answer found'.
        # Question: {question} Answer:"
        prompt = f"""با استفاده از متن زیر به سوال پاسخ دهید:
{context}
اگر پاسخ در متن وجود ندارد عبارت 'پاسخی یافت نشد' را برگردانید
سوال: {question}
پاسخ:"""
        response = chat(model=LLM_MODEL, messages=[{'role': 'user', 'content': prompt}])
        return response['message']['content']
def scrape_url(url):
    """Scrape the paragraph text from a given URL"""
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # fail fast on HTTP errors
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the article text (adjust the selector to the target page's structure)
    paragraphs = soup.find_all('p')
    return "\n".join(para.get_text() for para in paragraphs)
if __name__ == "__main__":
    url = "https://tosinso.com/articles/40596"
    article_content = scrape_url(url)

    # Process the scraped content and create a vector store
    rag_system = ChromaRAGSystem()

    # Chunk the article into overlapping windows (stride = CHUNK_SIZE - OVERLAP)
    chunks = [article_content[i:i + CHUNK_SIZE]
              for i in range(0, len(article_content), CHUNK_SIZE - OVERLAP)]
    documents = [Document(page_content=chunk) for chunk in chunks]
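
    # A hedged alternative to the manual slicing above: LangChain's
    # RecursiveCharacterTextSplitter splits on natural boundaries (paragraphs,
    # sentences) rather than fixed character offsets. Drop-in sketch with the
    # same size/overlap settings:
    #
    #   from langchain.text_splitter import RecursiveCharacterTextSplitter
    #   splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=OVERLAP)
    #   documents = splitter.create_documents([article_content])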
    # Build the vector store
    rag_system.build_vector_store(documents)

    # Initialise the answer generator
    answer_engine = AnswerGenerator(rag_system)

    # Query (Persian): "What is the difference between zero clients and thin
    # clients versus a PC?"
    query = "تفاوت زیروکلاینت و تین کلاینت با PC در چیست؟"

    # Generate and print the response
    answer = answer_engine.generate_response(query)
    print(answer)
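
# load_vector_store is defined above but never exercised. A minimal sketch of
# reusing the persisted store on a later run, so the article does not have to
# be re-scraped and re-embedded (assumes CHROMA_PERSIST_DIR already holds a
# store from a previous run):
#
#   rag_system = ChromaRAGSystem()
#   rag_system.load_vector_store()
#   answer_engine = AnswerGenerator(rag_system)
#   print(answer_engine.generate_response("another question about the article"))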