Update 5. URL-Scraper Agent.ipynb
This commit is contained in:
parent
d81c610125
commit
fb52875428
@ -1 +1,99 @@
import os
import re

import requests
from bs4 import BeautifulSoup
from ollama import chat
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.schema import Document
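# Note: the langchain imports above match the pre-0.1 package layout used in
# this notebook. Newer LangChain releases moved these classes into the
# langchain_community package; the lines below are a sketch of the equivalent
# imports under that assumption, not part of the original code:
#   from langchain_community.vectorstores import Chroma
#   from langchain_community.embeddings import SentenceTransformerEmbeddings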

# Multilingual sentence-embedding model (handles the Persian article text)
EMBEDDING_MODEL = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
# Local model served by Ollama
LLM_MODEL = 'gemma2:9b'
# Sliding-window chunking: 1000-character chunks advancing 800 characters,
# so consecutive chunks share a 200-character overlap
CHUNK_SIZE = 1000
OVERLAP = 200
CHROMA_PERSIST_DIR = '/home/Masih/chroma_db/chroma_db'

class ChromaRAGSystem:
    def __init__(self):
        # Init embedding model
        self.embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
        # Vector store instance
        self.vector_db = None

    def build_vector_store(self, documents):
        """Create a Chroma vector store from documents and persist it to disk."""
        self.vector_db = Chroma.from_documents(
            documents=documents,
            embedding=self.embeddings,
            persist_directory=CHROMA_PERSIST_DIR
        )
        self.vector_db.persist()

    def load_vector_store(self):
        """Load an existing Chroma vector store from the persist directory."""
        self.vector_db = Chroma(
            persist_directory=CHROMA_PERSIST_DIR,
            embedding_function=self.embeddings
        )

    def document_query(self, query, top_k=5):
        """Retrieve the top_k most similar chunks for a query."""
        # Perform similarity search across all stored documents
        results = self.vector_db.similarity_search(query=query, k=top_k)
        return [doc.page_content for doc in results]

class AnswerGenerator:
    def __init__(self, rag_system):
        self.rag = rag_system

    def generate_response(self, question):
        """Generate a context-aware answer using the LLM."""
        # Retrieve relevant context from the best-matching documents
        context_chunks = self.rag.document_query(question)
        context = "\n".join(context_chunks)

        # Persian prompt; in English: "Answer the question using the text
        # below: {context}. If the answer is not in the text, return the
        # phrase 'No answer was found'. Question: {question} Answer:"
        prompt = f"""با استفاده از متن زیر به سوال پاسخ دهید:
{context}

اگر پاسخ در متن وجود ندارد عبارت 'پاسخی یافت نشد' را برگردانید

سوال: {question}
پاسخ:"""

        response = chat(model=LLM_MODEL, messages=[{'role': 'user', 'content': prompt}])
        return response['message']['content']

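# For long answers, ollama-python's chat() also accepts stream=True and yields
# incremental chunks with the same ['message']['content'] shape. A minimal
# sketch of that variant (not part of the original notebook):
#
#   for part in chat(model=LLM_MODEL,
#                    messages=[{'role': 'user', 'content': prompt}],
#                    stream=True):
#       print(part['message']['content'], end='', flush=True)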
def scrape_url(url):
    """Scrape the paragraph text from a given URL."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the article text (adjust this to the specific page's structure)
    paragraphs = soup.find_all('p')
    article_text = "\n".join(para.get_text() for para in paragraphs)

    return article_text

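# scrape_url() grabs every <p> on the page, which can also pull in navigation
# and footer text. A hedged variant that scopes the search to an article
# container first (the 'article' tag is an assumption; inspect the target
# page for its real wrapper element):
#
#   def scrape_article_body(url):
#       response = requests.get(url, timeout=30)
#       response.raise_for_status()
#       soup = BeautifulSoup(response.content, 'html.parser')
#       container = soup.find('article') or soup
#       return "\n".join(p.get_text() for p in container.find_all('p'))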
if __name__ == "__main__":
    url = "https://tosinso.com/articles/40596"
    article_content = scrape_url(url)

    # Process the scraped content and create a vector store
    rag_system = ChromaRAGSystem()

    # Chunk the article content: each chunk is CHUNK_SIZE characters, and the
    # window advances by CHUNK_SIZE - OVERLAP so neighbouring chunks overlap
    chunks = [article_content[i:i + CHUNK_SIZE]
              for i in range(0, len(article_content), CHUNK_SIZE - OVERLAP)]
    documents = [Document(page_content=chunk) for chunk in chunks]

    # Build vector store
    rag_system.build_vector_store(documents)

    # Init answer generator
    answer_engine = AnswerGenerator(rag_system)

    # The query to be answered (Persian: "What is the difference between a
    # zero client, a thin client, and a PC?")
    query = "تفاوت زیروکلاینت و تین کلاینت با PC در چیست؟"

    # Generate and print the response
    answer = answer_engine.generate_response(query)
    print(answer)
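# load_vector_store() is defined above but never exercised. On a later run the
# persisted index could be reused instead of re-scraping and re-embedding; a
# minimal sketch under that assumption:
#
#   rag_system = ChromaRAGSystem()
#   rag_system.load_vector_store()
#   answer_engine = AnswerGenerator(rag_system)
#   print(answer_engine.generate_response(query))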