"""URL-scraper RAG pipeline.

Scrapes an article from a URL, chunks it, indexes the chunks in a persisted
Chroma vector store using a multilingual sentence-transformer embedding, then
answers a (Persian) question with a local Ollama model grounded on the
retrieved chunks.

Recovered and cleaned up from a deleted notebook export.
"""

from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.schema import Document
from ollama import chat
import requests
from bs4 import BeautifulSoup

# Multilingual model so Persian text and queries embed meaningfully.
EMBEDDING_MODEL = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
LLM_MODEL = 'gemma2:9b'
CHUNK_SIZE = 1000   # characters per chunk
OVERLAP = 200       # characters shared between consecutive chunks
# BUG FIX: the original used r'\home\Masih\chroma_db\chroma_db' — backslashes
# in an obviously POSIX home path, which is not a valid directory on Linux.
CHROMA_PERSIST_DIR = '/home/Masih/chroma_db/chroma_db'


class ChromaRAGSystem:
    """Thin wrapper around a persisted Chroma vector store."""

    def __init__(self):
        # Embedding model is loaded eagerly; the store itself is created
        # lazily by build_vector_store() or load_vector_store().
        self.embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
        self.vector_db = None

    def build_vector_store(self, documents):
        """Create and persist a Chroma vector store from `documents`.

        Args:
            documents: iterable of langchain `Document` objects to index.
        """
        self.vector_db = Chroma.from_documents(
            documents=documents,
            embedding=self.embeddings,
            persist_directory=CHROMA_PERSIST_DIR,
        )
        self.vector_db.persist()

    def load_vector_store(self):
        """Load a previously persisted Chroma vector store from disk."""
        self.vector_db = Chroma(
            persist_directory=CHROMA_PERSIST_DIR,
            embedding_function=self.embeddings,
        )

    def document_query(self, query, top_k=5):
        """Return the text of the `top_k` chunks most similar to `query`.

        Raises:
            RuntimeError: if neither build_vector_store() nor
                load_vector_store() has been called yet (the original code
                failed with an opaque AttributeError on None here).
        """
        if self.vector_db is None:
            raise RuntimeError(
                "Vector store not initialized: call build_vector_store() "
                "or load_vector_store() first."
            )
        results = self.vector_db.similarity_search(query=query, k=top_k)
        return [doc.page_content for doc in results]


class AnswerGenerator:
    """Generates context-grounded answers via the configured Ollama model."""

    def __init__(self, rag_system):
        self.rag = rag_system

    def generate_response(self, question):
        """Retrieve context for `question` and ask the LLM to answer from it.

        The prompt instructs the model (in Persian) to answer only from the
        retrieved context and to reply 'پاسخی یافت نشد' when the answer is
        not present.
        """
        context_chunks = self.rag.document_query(question)
        context = "\n".join(context_chunks)

        prompt = f"""با استفاده از متن زیر به سوال پاسخ دهید:
{context}

اگر پاسخ در متن وجود ندارد عبارت 'پاسخی یافت نشد' را برگردانید

سوال: {question}
پاسخ:"""

        response = chat(model=LLM_MODEL, messages=[{'role': 'user', 'content': prompt}])
        return response['message']['content']


def scrape_url(url, timeout=30):
    """Fetch `url` and return the concatenated text of its <p> elements.

    Args:
        url: page to scrape.
        timeout: request timeout in seconds (new, backward-compatible; the
            original had none and could hang indefinitely).

    Raises:
        requests.HTTPError: on a non-2xx response (the original silently
            scraped error pages).
        requests.RequestException: on network failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # NOTE(review): <p>-only extraction is site-specific; adjust the
    # selectors for pages with a different structure.
    paragraphs = soup.find_all('p')
    return "\n".join(para.get_text() for para in paragraphs)


def _chunk_text(text, chunk_size=CHUNK_SIZE, overlap=OVERLAP):
    """Split `text` into overlapping character chunks.

    Consecutive chunks share `overlap` characters so sentences cut at a
    boundary still appear whole in at least one chunk.
    """
    step = chunk_size - overlap
    return [text[i:i + chunk_size] for i in range(0, len(text), step)]


if __name__ == "__main__":
    url = "https://tosinso.com/articles/40596"
    article_content = scrape_url(url)

    # Index the scraped article.
    rag_system = ChromaRAGSystem()
    documents = [Document(page_content=chunk) for chunk in _chunk_text(article_content)]
    rag_system.build_vector_store(documents)

    # Answer the question grounded on the indexed chunks.
    answer_engine = AnswerGenerator(rag_system)
    query = "تفاوت زیروکلاینت و تین کلاینت با PC در چیست؟"
    print(answer_engine.generate_response(query))