Update enhanced_combined.py

This commit is contained in:
MasihMoafi 2025-05-02 11:02:36 +00:00
parent 96ccd3d78d
commit 6b0442c791


@@ -1,460 +1,340 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import pickle
import json
import nltk
import requests
import time
from bs4 import BeautifulSoup
from urllib.parse import quote
from langchain_community.document_loaders import PDFPlumberLoader, WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document

# Make sure the NLTK 'punkt' tokenizer is available
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
class ModularRAG:
    def __init__(self):
        self.storage_path = "./rag_data"

        if not os.path.exists(self.storage_path):
            os.makedirs(self.storage_path)
            os.makedirs(os.path.join(self.storage_path, "documents"))
            os.makedirs(os.path.join(self.storage_path, "web_results"))

        self.documents = []
        self.web_results = []

        # Web crawler settings
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        self.num_search_results = 10
        self.max_depth = 2
        self.max_links_per_page = 5
        self.max_paragraphs = 5

        self._load_saved_data()

    def _load_saved_data(self):
        doc_path = os.path.join(self.storage_path, "documents", "docs.pkl")
        web_path = os.path.join(self.storage_path, "web_results", "web.json")

        if os.path.exists(doc_path):
            try:
                with open(doc_path, 'rb') as f:
                    self.documents = pickle.load(f)
            except Exception as e:
                print(f"خطا در بارگیری اسناد: {e}")

        if os.path.exists(web_path):
            try:
                with open(web_path, 'r', encoding='utf-8') as f:
                    self.web_results = json.load(f)
            except Exception as e:
                print(f"خطا در بارگیری نتایج وب: {e}")

    def _save_documents(self):
        doc_path = os.path.join(self.storage_path, "documents", "docs.pkl")
        try:
            with open(doc_path, 'wb') as f:
                pickle.dump(self.documents, f)
        except Exception as e:
            print(f"خطا در ذخیره‌سازی اسناد: {e}")
def _save_web_results(self):
web_path = os.path.join(self.storage_path, "web_results", "web.json")
try:
with open(web_path, 'w', encoding='utf-8') as f:
json.dump(self.web_results, f, ensure_ascii=False, indent=2)
except Exception as e:
print(f"خطا در ذخیره‌سازی نتایج وب: {e}")
def load_pdf(self, file_path):
if not os.path.exists(file_path):
raise FileNotFoundError(f"فایل یافت نشد: {file_path}")
try:
loader = PDFPlumberLoader(file_path)
documents = loader.load()
if documents:
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=200,
add_start_index=True
)
chunked_docs = text_splitter.split_documents(documents)
self.documents.extend(chunked_docs)
self._save_documents()
return len(chunked_docs)
return 0
except Exception as e:
raise Exception(f"خطا در بارگیری PDF: {e}")
def search_duckduckgo(self, query, num_results=None):
if num_results is None:
num_results = self.num_search_results
try:
search_url = f"https://html.duckduckgo.com/html/?q={quote(query)}"
response = requests.get(search_url, headers=self.headers, timeout=10)
if response.status_code != 200:
print(f"خطا در جستجوی وب: HTTP {response.status_code}")
return []
soup = BeautifulSoup(response.text, 'html.parser')
results = []
for element in soup.select('.result__url, .result__a'):
href = element.get('href') if 'href' in element.attrs else None
if href and not href.startswith('/') and (href.startswith('http://') or href.startswith('https://')):
results.append(href)
elif not href and element.find('a') and 'href' in element.find('a').attrs:
href = element.find('a')['href']
if href and not href.startswith('/'):
results.append(href)
unique_results = list(set(results))
return unique_results[:num_results]
except Exception as e:
print(f"خطا در جستجوی DuckDuckGo: {e}")
            return []

    def crawl_page(self, url, depth=0):
        if depth > self.max_depth:
            return None, []

        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            title = soup.title.string if soup.title else "بدون عنوان"

            paragraphs = []
            for p in soup.find_all('p'):
                text = p.get_text(strip=True)
                if len(text) > 50:
                    paragraphs.append(text)
                if len(paragraphs) >= self.max_paragraphs:
                    break

            links = []
            for a in soup.find_all('a', href=True):
                href = a['href']
                if href.startswith('http') and href != url:
                    links.append(href)
                if len(links) >= self.max_links_per_page:
                    break

            content = {
                "url": url,
                "title": title,
                "paragraphs": paragraphs
            }

            return content, links
        except Exception as e:
            print(f"خطا در خزش صفحه {url}: {e}")
            return None, []

    def crawl_website(self, start_url, max_pages=10):
        visited = set()
        to_visit = [start_url]
        contents = []

        while to_visit and len(visited) < max_pages:
            current_url = to_visit.pop(0)

            if current_url in visited:
                continue

            content, links = self.crawl_page(current_url)
            visited.add(current_url)

            if content and content["paragraphs"]:
                contents.append(content)

            for link in links:
                if link not in visited and link not in to_visit:
                    to_visit.append(link)

            time.sleep(1)

        return contents
def crawl_web(self, query):
urls = self.search_duckduckgo(query)
if not urls:
print("هیچ نتیجه‌ای یافت نشد.")
            return [], []
all_results = []
for url in urls[:3]: # Limit to first 3 URLs for efficiency
content, links = self.crawl_page(url)
if content and content["paragraphs"]:
all_results.append(content)
# Follow links from the main page (recursive crawling)
for link in links[:2]: # Limit to first 2 links
sub_content, _ = self.crawl_page(link, depth=1)
if sub_content and sub_content["paragraphs"]:
all_results.append(sub_content)
time.sleep(1)
time.sleep(1)
self.web_results = all_results
self._save_web_results()
        # Convert web results into Document objects so BM25Retriever can index them
        web_docs = []
        for result in all_results:
            text = f"[{result['title']}]\n" + "\n".join(result['paragraphs'])
            web_docs.append(Document(page_content=text, metadata={"source": result['url']}))
return all_results, web_docs
def build_retriever(self, documents):
if not documents:
return None
# Create BM25 retriever
bm25_retriever = BM25Retriever.from_documents(documents)
bm25_retriever.k = 3 # Return top 3 results
return bm25_retriever
def get_relevant_documents(self, query, documents):
retriever = self.build_retriever(documents)
if not retriever:
return []
return retriever.get_relevant_documents(query)
def extract_context_from_documents(self, query):
if not self.documents:
return None
relevant_docs = self.get_relevant_documents(query, self.documents)
if not relevant_docs:
return None
context = "\n\n".join([doc.page_content for doc in relevant_docs])
return context
def extract_context_from_web(self, web_results, web_docs, query):
if not web_results or not web_docs:
return None, []
# Try to use the retriever for better results
if web_docs:
relevant_docs = self.get_relevant_documents(query, web_docs)
if relevant_docs:
context = "\n\n".join([doc.page_content for doc in relevant_docs])
sources = [doc.metadata.get("source", "") for doc in relevant_docs if "source" in doc.metadata]
return context, sources
# Fall back to simple extraction if retriever fails
contexts = []
sources = []
for doc in web_results:
context_text = "\n".join(doc["paragraphs"])
contexts.append(f"[{doc['title']}] {context_text}")
sources.append(doc['url'])
context = "\n\n".join(contexts)
return context, sources
def get_context(query, crawl_params=None):
"""
سیستم RAG مدولار برای پاسخگویی به سوالات با استفاده از اسناد و جستجوی وب
پارامترها:
query (str): سوال به زبان فارسی
crawl_params (dict, optional): پارامترهای خزش وب
- max_depth: حداکثر عمق خزش
- max_links_per_page: حداکثر تعداد لینکهای استخراج شده از هر صفحه
- max_paragraphs: حداکثر تعداد پاراگرافهای استخراج شده از هر صفحه
- num_search_results: تعداد نتایج جستجو
خروجی:
dict: نتیجه جستجو شامل متن و منابع
"""
rag = ModularRAG()
# Configure crawling parameters if provided
if crawl_params:
if 'max_depth' in crawl_params:
rag.max_depth = crawl_params['max_depth']
if 'max_links_per_page' in crawl_params:
rag.max_links_per_page = crawl_params['max_links_per_page']
if 'max_paragraphs' in crawl_params:
rag.max_paragraphs = crawl_params['max_paragraphs']
if 'num_search_results' in crawl_params:
rag.num_search_results = crawl_params['num_search_results']
# First try to get context from documents
doc_context = rag.extract_context_from_documents(query)
if doc_context:
return {
"has_context": True,
"context": doc_context,
"source": "documents",
"language": "fa"
}
# Fall back to web search
web_results, web_docs = rag.crawl_web(query)
if web_results:
web_context, sources = rag.extract_context_from_web(web_results, web_docs, query)
return {
"has_context": True,
"context": web_context,
"source": "web",
"sources": sources,
"language": "fa"
}
# No context found
return {
"has_context": False,
"context": "متأسفانه اطلاعاتی در مورد سوال شما یافت نشد.",
"source": "none",
"language": "fa"
}
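
A minimal usage sketch of the new get_context entry point, assuming the module is imported as enhanced_combined; the query text and the crawl_params values are illustrative only:

    from enhanced_combined import get_context

    result = get_context(
        "تاریخچه هوش مصنوعی چیست؟",  # any Persian question
        crawl_params={
            "max_depth": 1,
            "max_links_per_page": 3,
            "max_paragraphs": 5,
            "num_search_results": 5,
        },
    )

    if result["has_context"]:
        print(result["source"])            # "documents" or "web"
        print(result["context"][:300])
        print(result.get("sources", []))   # URLs when the answer came from the web
    else:
        print(result["context"])           # fallback message when nothing was found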