In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.schema import Document
from ollama import chat
import os
import re
import requests
from bs4 import BeautifulSoup

# Model name handed to SentenceTransformerEmbeddings when the RAG system is created.
EMBEDDING_MODEL = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
# Model identifier passed to ollama's chat() when generating answers.
LLM_MODEL = 'gemma2:9b'
# Length, in characters, of each slice taken from the scraped article text.
CHUNK_SIZE = 1000
# Characters shared by consecutive slices; the slicing stride is CHUNK_SIZE - OVERLAP.
OVERLAP = 200
# Directory given to Chroma as persist_directory for both building and loading.
# NOTE(review): the path uses backslash separators; on POSIX systems a backslash
# is not a path separator, so this would not resolve to /home/Masih/... --
# confirm the intended platform.
CHROMA_PERSIST_DIR = r'\home\Masih\chroma_db\chroma_db'

class ChromaRAGSystem:
    """Wrapper around a Chroma vector store persisted at CHROMA_PERSIST_DIR.

    The store must be created with build_vector_store() or opened with
    load_vector_store() before document_query() is called.
    """

    def __init__(self):
        # Init embedding model
        self.embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
        # Vector store instance; stays None until a store is built or loaded
        self.vector_db = None

    def build_vector_store(self, documents):
        """Create the Chroma vector store from *documents* and persist it.

        Args:
            documents: Documents to embed and index.
        """
        self.vector_db = Chroma.from_documents(
            documents=documents,
            embedding=self.embeddings,
            persist_directory=CHROMA_PERSIST_DIR,
        )
        self.vector_db.persist()

    def load_vector_store(self):
        """Open the existing Chroma vector store from CHROMA_PERSIST_DIR."""
        self.vector_db = Chroma(
            persist_directory=CHROMA_PERSIST_DIR,
            embedding_function=self.embeddings,
        )

    def document_query(self, query, top_k=5):
        """Return the text of the stored chunks most similar to *query*.

        Args:
            query: Text to search for.
            top_k: Number of results requested from the similarity search.

        Returns:
            A list with the ``page_content`` of each result, in the order
            returned by the vector store.

        Raises:
            RuntimeError: If no vector store has been built or loaded yet.
        """
        # Fail with an actionable message instead of an AttributeError on None.
        if self.vector_db is None:
            raise RuntimeError(
                "Vector store is not initialised; call build_vector_store() "
                "or load_vector_store() before querying."
            )

        # Perform similarity search across all documents
        results = self.vector_db.similarity_search(query=query, k=top_k)
        return [doc.page_content for doc in results]

class AnswerGenerator:
    """Answers questions with the LLM, using chunks retrieved from a RAG system."""

    def __init__(self, rag_system):
        # Object that provides document_query(question) for context retrieval
        self.rag = rag_system

    def generate_response(self, question):
        """Return the LLM's answer to *question*, given the retrieved context.

        The retrieved chunks are joined with newlines and embedded in a
        Persian prompt that tells the model to answer from the given text
        and to reply with a fixed "no answer found" phrase otherwise.
        """
        retrieved = self.rag.document_query(question)
        context = "\n".join(retrieved)

        # Prompt layout: instruction, context, fallback instruction, question.
        prompt = f"""با استفاده از متن زیر به سوال پاسخ دهید:
{context}

اگر پاسخ در متن وجود ندارد عبارت 'پاسخی یافت نشد' را برگردانید

سوال: {question}
پاسخ:"""

        messages = [{'role': 'user', 'content': prompt}]
        reply = chat(model=LLM_MODEL, messages=messages)
        return reply['message']['content']

def scrape_url(url, timeout=10):
    """Download *url* and return the text of all of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the call indefinitely.

    Returns:
        The text of every ``<p>`` element, joined with newlines. The string
        is empty when the page contains no paragraphs.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For connection failures and timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Do not treat an error page's markup as article content.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the article text (adjust this as per the specific page's structure)
    paragraphs = soup.find_all('p')
    return "\n".join(para.get_text() for para in paragraphs)

if __name__ == "__main__":
    url = "https://tosinso.com/articles/40596"
    article_content = scrape_url(url)

    # With no usable text the chunk list would be empty or whitespace-only,
    # so stop here with a clear message instead of indexing nothing.
    if not article_content.strip():
        raise SystemExit(f"No paragraph text could be extracted from {url}")

    # Process the scraped content and create a vector store
    rag_system = ChromaRAGSystem()

    # Chunk the article content: fixed-size character windows whose start
    # positions advance by CHUNK_SIZE - OVERLAP, so neighbours share OVERLAP
    # characters.
    step = CHUNK_SIZE - OVERLAP
    chunks = [
        article_content[i:i + CHUNK_SIZE]
        for i in range(0, len(article_content), step)
    ]
    documents = [Document(page_content=chunk) for chunk in chunks]

    # Build vector store
    rag_system.build_vector_store(documents)

    # Init answer generator
    answer_engine = AnswerGenerator(rag_system)

    # The query to be answered
    query = "تفاوت زیروکلاینت و تین کلاینت با PC در چیست؟"

    # Generate and print the response
    answer = answer_engine.generate_response(query)
    print(answer)

In [None]:
import requests
from bs4 import BeautifulSoup

def search_internet(query, timeout=10):
    """Search Google for *query* and return the text of the first matching snippet.

    Args:
        query: Free-text search query.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The text of the first result snippet; an ``"Error: ..."`` message if
        the request fails or the response status is not 200; or a "no
        relevant information" message when no snippet element is present.
    """
    # Send a GET request to Google (NOTE: scraping Google directly can get blocked)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        # Passing the query via ``params`` lets requests URL-encode it, so
        # characters such as '&', '#' or '+' cannot corrupt the query string.
        response = requests.get(
            "https://www.google.com/search",
            params={"q": query},
            headers=headers,
            timeout=timeout,
        )
    except requests.RequestException:
        # Report network failures the same way as a bad status code.
        return "Error: Unable to retrieve data from the internet."

    if response.status_code != 200:
        return "Error: Unable to retrieve data from the internet."

    soup = BeautifulSoup(response.content, 'html.parser')

    # Scrape content from search result snippets (extract the first result snippet)
    search_results = soup.find_all('div', {'class': 'BNeawe iBp4i AP7Wnd'})
    if search_results:
        return search_results[0].get_text()

    return "No relevant information found on the web."

def generate_answer(query):
    """Answer *query* from Wikipedia when possible, otherwise from a web search.

    The Wikipedia result is used only if it is non-empty and does not
    contain the substring "Error"; in every other case the web search
    result is returned.
    """
    wiki_result = search_wikipedia(query)

    # Fall back to the web search when Wikipedia gave nothing usable.
    if not wiki_result or "Error" in wiki_result:
        return search_internet(query)

    return wiki_result

if __name__ == "__main__":
    # Sample Persian question used to exercise the lookup pipeline
    query = "شاه عباس صفوی که بود و چه کرد؟"

    # Resolve the question via generate_answer and show the result
    answer = generate_answer(query)
    print(f"Answer: {answer}")