"""URL-scraper RAG agent.

Scrapes an article from a URL, chunks it, indexes the chunks in a
persistent Chroma vector store using multilingual sentence-transformer
embeddings, and answers a (Persian) question with an Ollama-hosted LLM
grounded on the retrieved context.

Extracted from ``5. URL-Scraper Agent.ipynb``.
"""
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.schema import Document
from ollama import chat
import os
import re
import requests
from bs4 import BeautifulSoup

EMBEDDING_MODEL = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
LLM_MODEL = 'gemma2:9b'
CHUNK_SIZE = 1000   # characters per chunk
OVERLAP = 200       # characters shared between consecutive chunks
# BUG FIX: the original value was r'\home\Masih\chroma_db\chroma_db' — a raw
# Windows-style path.  On Linux that is a *relative* filename containing
# literal backslashes, so the store would never persist where intended.
CHROMA_PERSIST_DIR = '/home/Masih/chroma_db/chroma_db'

# Seconds before an HTTP request is abandoned (requests has no default timeout).
REQUEST_TIMEOUT = 30


class ChromaRAGSystem:
    """Thin wrapper around a persistent Chroma vector store.

    Call :meth:`build_vector_store` (fresh index) or
    :meth:`load_vector_store` (existing index) before querying.
    """

    def __init__(self):
        # Multilingual embedding model — required because the scraped
        # article and the queries are in Persian.
        self.embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
        # Populated by build_vector_store() / load_vector_store().
        self.vector_db = None

    def build_vector_store(self, documents):
        """Create and persist a new Chroma vector store from *documents*.

        Args:
            documents: iterable of ``langchain.schema.Document`` objects.
        """
        self.vector_db = Chroma.from_documents(
            documents=documents,
            embedding=self.embeddings,
            persist_directory=CHROMA_PERSIST_DIR,
        )
        # Explicit persist — older langchain Chroma wrappers do not flush
        # to disk automatically.
        self.vector_db.persist()

    def load_vector_store(self):
        """Load a previously persisted Chroma vector store from disk."""
        self.vector_db = Chroma(
            persist_directory=CHROMA_PERSIST_DIR,
            embedding_function=self.embeddings,
        )

    def document_query(self, query, top_k=5):
        """Return the *top_k* most similar chunk texts for *query*.

        Raises:
            RuntimeError: if no vector store has been built or loaded yet
                (previously this surfaced as an opaque ``AttributeError``).
        """
        if self.vector_db is None:
            raise RuntimeError(
                "Vector store not initialized; call build_vector_store() "
                "or load_vector_store() first."
            )
        results = self.vector_db.similarity_search(query=query, k=top_k)
        return [doc.page_content for doc in results]


class AnswerGenerator:
    """Generates context-grounded answers via the Ollama chat API."""

    def __init__(self, rag_system):
        # The retrieval backend supplying context chunks.
        self.rag = rag_system

    def generate_response(self, question):
        """Retrieve context for *question* and ask the LLM for an answer.

        Returns the model's answer text; the (Persian) prompt instructs the
        model to reply with a fixed "no answer found" phrase when the
        context does not contain the answer.
        """
        context_chunks = self.rag.document_query(question)
        context = "\n".join(context_chunks)

        # NOTE: the prompt is intentionally in Persian — it is runtime
        # behavior, not documentation, and must not be translated.
        prompt = f"""با استفاده از متن زیر به سوال پاسخ دهید:
{context}

اگر پاسخ در متن وجود ندارد عبارت 'پاسخی یافت نشد' را برگردانید

سوال: {question}
پاسخ:"""

        response = chat(model=LLM_MODEL, messages=[{'role': 'user', 'content': prompt}])
        return response['message']['content']


def scrape_url(url):
    """Fetch *url* and return the concatenated text of its <p> elements.

    Raises:
        requests.HTTPError: on a non-2xx response (previously error pages
            were silently parsed as if they were the article).
        requests.Timeout: if the server does not respond within
            REQUEST_TIMEOUT seconds (previously the call could hang forever).
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Paragraph tags approximate the article body; adjust the selector if a
    # specific site needs more precise extraction.
    paragraphs = soup.find_all('p')
    return "\n".join(para.get_text() for para in paragraphs)


def _chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
    """Split *text* into fixed-size character chunks with overlap.

    Consecutive chunks share *overlap* characters so that sentences cut at
    a boundary still appear whole in at least one chunk.

    Raises:
        ValueError: if overlap >= size (the stride would be <= 0 and the
            original inline loop would raise or loop unusably).
    """
    stride = size - overlap
    if stride <= 0:
        raise ValueError("OVERLAP must be smaller than CHUNK_SIZE")
    return [text[i:i + size] for i in range(0, len(text), stride)]


if __name__ == "__main__":
    url = "https://tosinso.com/articles/40596"
    article_content = scrape_url(url)

    # Build a fresh vector store over the scraped article.
    rag_system = ChromaRAGSystem()
    chunks = _chunk_text(article_content)
    documents = [Document(page_content=chunk) for chunk in chunks]
    rag_system.build_vector_store(documents)

    # Answer a single hard-coded question about the article.
    answer_engine = AnswerGenerator(rag_system)
    query = "تفاوت زیروکلاینت و تین کلاینت با PC در چیست؟"
    answer = answer_engine.generate_response(query)
    print(answer)