Delete 5. URL-Scraper Agent.ipynb
This commit is contained in: parent fb52875428 · commit 991520f126
@@ -1,99 +0,0 @@
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.schema import Document
from ollama import chat
import os
import re
import requests
from bs4 import BeautifulSoup

EMBEDDING_MODEL = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
LLM_MODEL = 'gemma2:9b'
CHUNK_SIZE = 1000
OVERLAP = 200
CHROMA_PERSIST_DIR = '/home/Masih/chroma_db/chroma_db'  # directory where the Chroma index is persisted

class ChromaRAGSystem:

    def __init__(self):
        # Init the multilingual embedding model (supports Persian)
        self.embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
        # Vector store instance, built or loaded on demand
        self.vector_db = None

    def build_vector_store(self, documents):
        """Create a Chroma vector store from documents and persist it."""
        self.vector_db = Chroma.from_documents(
            documents=documents,
            embedding=self.embeddings,
            persist_directory=CHROMA_PERSIST_DIR
        )
        self.vector_db.persist()

    def load_vector_store(self):
        """Load an existing Chroma vector store from disk."""
        self.vector_db = Chroma(
            persist_directory=CHROMA_PERSIST_DIR,
            embedding_function=self.embeddings
        )

    def document_query(self, query, top_k=5):
        """Retrieve the top_k chunks most similar to the query."""
        # Perform a similarity search across all stored documents
        results = self.vector_db.similarity_search(query=query, k=top_k)
        return [doc.page_content for doc in results]
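
A usage note: load_vector_store is defined but never called in this notebook. A minimal sketch of reusing an index persisted by an earlier run, assuming one already exists at CHROMA_PERSIST_DIR (the query string is only an example):

# Sketch: reuse a previously persisted index instead of re-embedding
rag = ChromaRAGSystem()
rag.load_vector_store()
hits = rag.document_query("زیروکلاینت چیست؟", top_k=3)  # "What is a zero client?"
for hit in hits:
    print(hit[:80])  # preview the first 80 characters of each chunk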

class AnswerGenerator:

    def __init__(self, rag_system):
        self.rag = rag_system

    def generate_response(self, question):
        """Generate a context-aware answer with the LLM."""
        # Retrieve the most relevant context chunks for the question
        context_chunks = self.rag.document_query(question)
        context = "\n".join(context_chunks)

        # Persian prompt; in English: "Answer the question using the text below:
        # {context} / If the answer is not in the text, return 'No answer found'.
        # Question: {question} / Answer:"
        prompt = f"""با استفاده از متن زیر به سوال پاسخ دهید:
{context}

اگر پاسخ در متن وجود ندارد عبارت 'پاسخی یافت نشد' را برگردانید

سوال: {question}
پاسخ:"""

        response = chat(model=LLM_MODEL, messages=[{'role': 'user', 'content': prompt}])
        return response['message']['content']
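
For long answers it can help to print tokens as they arrive rather than waiting for the full reply. A minimal sketch, assuming the installed ollama Python package accepts the stream flag; the question string is a placeholder:

# Sketch: streaming variant of the chat call above
question = "یک سوال نمونه"  # "a sample question"
stream = chat(model=LLM_MODEL,
              messages=[{'role': 'user', 'content': question}],
              stream=True)
for chunk in stream:
    print(chunk['message']['content'], end='', flush=True)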

def scrape_url(url):
    """Scrape the paragraph text from a given URL."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # fail fast on HTTP errors
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the article text from <p> tags (adjust to the specific page's structure)
    paragraphs = soup.find_all('p')
    article_text = "\n".join(para.get_text() for para in paragraphs)

    return article_text
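
Grabbing every <p> on the page also collects navigation, comments, and footer text. One hedged refinement is to scope the search to the article container first; scrape_url_scoped and the article-tag fallback below are illustrative, and the right selector has to be read off the target page:

def scrape_url_scoped(url):
    """Sketch: restrict extraction to the article container when one exists."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    container = soup.find('article') or soup  # fall back to the whole page
    return "\n".join(p.get_text() for p in container.find_all('p'))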

if __name__ == "__main__":
    url = "https://tosinso.com/articles/40596"
    article_content = scrape_url(url)

    # Process the scraped content and create a vector store
    rag_system = ChromaRAGSystem()

    # Chunk the article with a sliding window: each chunk starts
    # CHUNK_SIZE - OVERLAP = 800 characters after the previous one,
    # so consecutive chunks share 200 characters of context
    chunks = [article_content[i:i + CHUNK_SIZE]
              for i in range(0, len(article_content), CHUNK_SIZE - OVERLAP)]
    documents = [Document(page_content=chunk) for chunk in chunks]

    # Build the vector store
    rag_system.build_vector_store(documents)

    # Init the answer generator
    answer_engine = AnswerGenerator(rag_system)

    # The query to answer; in English: "How do zero clients and thin clients differ from a PC?"
    query = "تفاوت زیروکلاینت و تین کلاینت با PC در چیست؟"

    # Generate and print the response
    answer = answer_engine.generate_response(query)
    print(answer)
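
The manual slicing above can split words and sentences mid-stream. A sketch of the same 1000/200 chunking done with LangChain's boundary-aware splitter, assuming a LangChain version that still ships langchain.text_splitter:

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Sketch: same size and overlap as above, but splits on paragraph and
# sentence boundaries before falling back to raw character offsets
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=OVERLAP)
documents = splitter.create_documents([article_content])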