from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.schema import Document
from ollama import chat
import requests
from bs4 import BeautifulSoup

EMBEDDING_MODEL = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
LLM_MODEL = 'gemma2:9b'
CHUNK_SIZE = 1000
OVERLAP = 200
# Unix-style path: use forward slashes (the original raw string used backslashes)
CHROMA_PERSIST_DIR = '/home/Masih/chroma_db/chroma_db'


class ChromaRAGSystem:
    def __init__(self):
        # Init embedding model
        self.embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
        # Vector store instance
        self.vector_db = None

    def build_vector_store(self, documents):
        """Create Chroma vector store from documents"""
        self.vector_db = Chroma.from_documents(
            documents=documents,
            embedding=self.embeddings,
            persist_directory=CHROMA_PERSIST_DIR
        )
        self.vector_db.persist()

    def load_vector_store(self):
        """Load existing Chroma vector store"""
        self.vector_db = Chroma(
            persist_directory=CHROMA_PERSIST_DIR,
            embedding_function=self.embeddings
        )

    def document_query(self, query, top_k=5):
        """Retrieve context from documents based on query"""
        # Perform similarity search across all documents
        results = self.vector_db.similarity_search(query=query, k=top_k)
        return [doc.page_content for doc in results]


class AnswerGenerator:
    def __init__(self, rag_system):
        self.rag = rag_system

    def generate_response(self, question):
        """Generate context-aware answer using LLM"""
        # Retrieve relevant context from the best matching documents
        context_chunks = self.rag.document_query(question)
        context = "\n".join(context_chunks)

        # Persian prompt: "Answer the question using the text below. If the
        # answer is not in the text, return the phrase 'No answer was found'."
        prompt = f"""با استفاده از متن زیر به سوال پاسخ دهید:
{context}

اگر پاسخ در متن وجود ندارد عبارت 'پاسخی یافت نشد' را برگردانید

سوال: {question}
پاسخ:"""

        response = chat(model=LLM_MODEL, messages=[{'role': 'user', 'content': prompt}])
        return response['message']['content']


def scrape_url(url):
    """Scrape the content from a given URL"""
    response = requests.get(url)
    response.raise_for_status()  # Fail early on HTTP errors
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the article text (adjust this as per the specific page's structure)
    paragraphs = soup.find_all('p')
    article_text = "\n".join([para.get_text() for para in paragraphs])
    return article_text


if __name__ == "__main__":
    url = "https://tosinso.com/articles/40596"
    article_content = scrape_url(url)

    # Process the scraped content and create a vector store
    rag_system = ChromaRAGSystem()

    # Chunk the article content into windows of CHUNK_SIZE characters that
    # overlap by OVERLAP characters
    chunks = [article_content[i:i + CHUNK_SIZE]
              for i in range(0, len(article_content), CHUNK_SIZE - OVERLAP)]
    documents = [Document(page_content=chunk) for chunk in chunks]

    # Build vector store
    rag_system.build_vector_store(documents)

    # Init answer generator
    answer_engine = AnswerGenerator(rag_system)

    # The query to be answered (Persian: "What is the difference between
    # zero clients and thin clients versus a PC?")
    query = "تفاوت زیروکلاینت و تین کلاینت با PC در چیست؟"

    # Generate and print the response
    answer = answer_engine.generate_response(query)
    print(answer)
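
# Note: the script above rebuilds the index on every run. A minimal sketch of
# reusing the persisted store on later runs via load_vector_store() instead
# (assumes CHROMA_PERSIST_DIR already contains an index from a previous run):
#
#     rag_system = ChromaRAGSystem()
#     rag_system.load_vector_store()
#     answer_engine = AnswerGenerator(rag_system)
#     print(answer_engine.generate_response(query))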