Update enhanced_combined.py

MasihMoafi 2025-05-02 11:02:36 +00:00
parent 96ccd3d78d
commit 6b0442c791


@@ -1,460 +1,340 @@

# ----- enhanced_combined.py before this commit -----
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
import json
import ssl
import argparse
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
import traceback

# Disable SSL warnings and proxy settings
ssl._create_default_https_context = ssl._create_unverified_context
requests.packages.urllib3.disable_warnings()

def clear_proxy_settings():
"""Remove proxy environment variables that might cause connection issues."""
for var in ["HTTP_PROXY", "HTTPS_PROXY", "ALL_PROXY", "http_proxy", "https_proxy", "all_proxy"]:
if var in os.environ:
print(f"Removing proxy env var: {var}")
del os.environ[var]
# Run at module load time
clear_proxy_settings()
# Configuration
DOCUMENT_PATHS = [
r'doc1.txt',
r'doc2.txt',
r'doc3.txt',
r'doc4.txt',
r'doc5.txt',
r'doc6.txt'
]
EMBEDDING_MODEL = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
LLM_MODEL = 'gemma3'
CHUNK_SIZE = 1000
OVERLAP = 200
CHROMA_PERSIST_DIR = 'chroma_db'
# Confidence thresholds
THRESHOLDS = {
'direct_answer': 0.7,
'rag_confidence': 0.6,
'web_search': 0.5
}
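
# Illustrative sketch (not part of the original file): how the thresholds above gate the
# answer cascade that AgenticQASystem.get_answer() implements further down. The helper
# name _sketch_pick_tier is hypothetical.
def _sketch_pick_tier(direct_conf, rag_conf, web_conf):
    # Return which source the cascade would use for the given confidence scores.
    if direct_conf >= THRESHOLDS['direct_answer']:
        return 'llm'
    if rag_conf >= THRESHOLDS['rag_confidence']:
        return 'rag'
    if web_conf >= THRESHOLDS['web_search']:
        return 'web'
    return 'llm_low_confidence'

# Example: _sketch_pick_tier(0.55, 0.65, 0.4) -> 'rag'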
def query_llm(prompt, model='gemma3'):
"""Query the LLM model directly using Ollama API."""
try:
ollama_endpoint = "http://localhost:11434/api/generate"
payload = {
"model": model,
"prompt": prompt,
"stream": False
}
response = requests.post(ollama_endpoint, json=payload)
if response.status_code == 200:
result = response.json()
return result.get('response', '')
else:
print(f"Ollama API error: {response.status_code}")
return f"Error calling Ollama API: {response.status_code}"
except Exception as e:
print(f"Error querying LLM: {e}")
return f"Error: {str(e)}"
class BM25Retriever:
"""BM25 retriever implementation for text similarity search"""
@classmethod
def from_documents(cls, documents):
"""Create a BM25 retriever from documents"""
retriever = cls()
retriever.documents = documents
retriever.k = 4
return retriever
def get_relevant_documents(self, query):
"""Get relevant documents using BM25 algorithm"""
# Simple BM25-like implementation
scores = []
query_terms = set(re.findall(r'\b\w+\b', query.lower()))
for doc in self.documents:
doc_terms = set(re.findall(r'\b\w+\b', doc.page_content.lower()))
# Calculate term overlap as a simple approximation of BM25
overlap = len(query_terms.intersection(doc_terms))
scores.append((doc, overlap))
# Sort by score and return top k
sorted_docs = [doc for doc, score in sorted(scores, key=lambda x: x[1], reverse=True)]
return sorted_docs[:self.k]
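
# Usage sketch (not part of the original file): the term-overlap retriever above ranks
# documents by how many query words they share.
def _demo_bm25_retriever():
    docs = [
        Document(page_content="بازیابی اطلاعات با BM25", metadata={"source": "a"}),
        Document(page_content="آشپزی ایرانی", metadata={"source": "b"}),
    ]
    retriever = BM25Retriever.from_documents(docs)
    top = retriever.get_relevant_documents("BM25 بازیابی")
    print(top[0].metadata["source"])  # expected: "a"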
class HybridRetriever:
"""Hybrid retriever combining BM25 and vector search with configurable weights"""
def __init__(self, vector_retriever, bm25_retriever, vector_weight=0.3):
"""Initialize with separate retrievers and weights"""
self._vector_retriever = vector_retriever
self._bm25_retriever = bm25_retriever
self._vector_weight = vector_weight
self._bm25_weight = 1.0 - vector_weight
def get_relevant_documents(self, query):
"""Get relevant documents using weighted combination of retrievers"""
try:
# Get results from both retrievers
vector_docs = self._vector_retriever.get_relevant_documents(query)
bm25_docs = self._bm25_retriever.get_relevant_documents(query)
# Create dictionary to track unique documents and their scores
doc_dict = {}
# Add vector docs with their weights
for i, doc in enumerate(vector_docs):
# Score based on position (inverse rank)
score = (len(vector_docs) - i) * self._vector_weight
doc_id = doc.page_content[:50] # Use first 50 chars as a simple ID
if doc_id in doc_dict:
doc_dict[doc_id]["score"] += score
else:
doc_dict[doc_id] = {"doc": doc, "score": score}
# Add BM25 docs with their weights
for i, doc in enumerate(bm25_docs):
# Score based on position (inverse rank)
score = (len(bm25_docs) - i) * self._bm25_weight
doc_id = doc.page_content[:50] # Use first 50 chars as a simple ID
if doc_id in doc_dict:
doc_dict[doc_id]["score"] += score
else:
doc_dict[doc_id] = {"doc": doc, "score": score}
# Sort by combined score (highest first)
sorted_docs = sorted(doc_dict.values(), key=lambda x: x["score"], reverse=True)
# Return just the document objects
return [item["doc"] for item in sorted_docs]
except Exception as e:
print(f"Error in hybrid retrieval: {e}")
return []
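
# Usage sketch (not part of the original file): HybridRetriever only needs two objects
# exposing get_relevant_documents(), so a second BM25Retriever stands in here for the
# Chroma vector retriever to keep the example dependency-free.
def _demo_hybrid_retriever():
    docs = [
        Document(page_content="قانون مدنی ایران", metadata={"source": "law.txt"}),
        Document(page_content="دستور پخت قورمه سبزی", metadata={"source": "food.txt"}),
    ]
    lexical = BM25Retriever.from_documents(docs)
    stand_in_vector = BM25Retriever.from_documents(docs)
    hybrid = HybridRetriever(stand_in_vector, lexical, vector_weight=0.3)
    for doc in hybrid.get_relevant_documents("قانون مدنی"):
        print(doc.metadata["source"])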
class AgenticQASystem:
"""QA system implementing the specified architecture"""
def __init__(self):
"""Initialize the QA system with retrievers"""
# Load embeddings
self.embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
# Load documents and retrievers
self.documents = self.load_documents()
self.retriever = self.initialize_retriever()
def load_documents(self):
"""Load documents from configured paths with sliding window chunking"""
print("Loading documents...")
docs = []
for path in DOCUMENT_PATHS:
try:
with open(path, 'r', encoding='utf-8') as f:
text = re.sub(r'\s+', ' ', f.read()).strip()
# Sliding window chunking
chunks = [text[i:i+CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE - OVERLAP)]
for chunk in chunks:
docs.append(Document(
page_content=chunk,
metadata={"source": os.path.basename(path)}
))
except Exception as e:
print(f"Error loading document {path}: {e}")
print(f"Loaded {len(docs)} document chunks")
return docs
def initialize_retriever(self):
"""Initialize the hybrid retriever with BM25 and direct Chroma queries"""
if not self.documents:
print("No documents loaded, retriever initialization failed")
return None
try:
# Create BM25 retriever
bm25_retriever = BM25Retriever.from_documents(self.documents)
bm25_retriever.k = 4  # Top k results to retrieve
# Initialize vector store with KNN search
import shutil
if os.path.exists(CHROMA_PERSIST_DIR):
print("Removing existing Chroma DB to prevent dimension mismatch")
shutil.rmtree(CHROMA_PERSIST_DIR)
# Create vector store directly from Chroma
print("Creating vector store...")
vector_store = Chroma.from_documents(
documents=self.documents,
embedding=self.embeddings,
persist_directory=CHROMA_PERSIST_DIR
)
vector_retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 4})
print(f"Vector retriever created: {type(vector_retriever)}")
# Create hybrid retriever - BM25 (70%) and Vector (30%)
print("Creating hybrid retriever")
hybrid_retriever = HybridRetriever(vector_retriever, bm25_retriever, vector_weight=0.3)
print("Hybrid retriever initialized successfully")
return hybrid_retriever
except Exception as e:
print(f"Error initializing retriever: {e}")
traceback.print_exc()
return None
def estimate_confidence(self, text, query, context=None):
"""Estimate confidence of response"""
# Start with baseline confidence
confidence = 0.5
# Check for uncertainty markers
uncertainty_phrases = [
"نمی‌دانم", "مطمئن نیستم", "ممکن است", "شاید", "احتمالاً",
"فکر می‌کنم", "به نظر می‌رسد"
]
if any(phrase in text.lower() for phrase in uncertainty_phrases):
confidence -= 0.2
# Check for question relevance
query_words = set(re.findall(r'\b\w+\b', query.lower()))
text_words = set(re.findall(r'\b\w+\b', text.lower()))
# Calculate overlap between query and response
if query_words:
overlap_ratio = len(query_words.intersection(text_words)) / len(query_words)
if overlap_ratio > 0.5:
confidence += 0.2
elif overlap_ratio < 0.2:
confidence -= 0.2
# If context provided, check context relevance
if context:
context_words = set(re.findall(r'\b\w+\b', context.lower()))
if context_words:
context_overlap = len(context_words.intersection(text_words)) / len(context_words)
if context_overlap > 0.3:
confidence += 0.2
else:
confidence -= 0.1
# Ensure confidence is within [0,1]
return max(0.0, min(1.0, confidence))
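# Worked example of the scoring above (illustrative): the baseline is 0.5; a hedge word
# such as "شاید" subtracts 0.2; query/response word overlap above 0.5 adds 0.2 (below 0.2
# subtracts 0.2); with context, overlap above 0.3 adds another 0.2, otherwise 0.1 is
# subtracted. So an on-topic, well-grounded answer peaks at 0.5 + 0.2 + 0.2 = 0.9, while a
# hedged, off-topic, poorly grounded one bottoms out at 0.5 - 0.2 - 0.2 - 0.1 = 0.0.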
def check_direct_knowledge(self, query):
"""Check if the LLM can answer directly from its knowledge"""
print("Checking LLM's direct knowledge...")
prompt = f"""به این سوال با استفاده از دانش خود پاسخ دهید. فقط به زبان فارسی پاسخ دهید.
سوال: {query}
پاسخ فارسی:"""
response = query_llm(prompt, model=LLM_MODEL)
confidence = self.estimate_confidence(response, query)
print(f"LLM direct knowledge confidence: {confidence:.2f}")
return response, confidence
def rag_query(self, query):
"""Use RAG to retrieve and generate answer"""
if not self.retriever:
print("Retriever not initialized, skipping RAG")
return None, 0.0
print("Retrieving documents for RAG...")
# Retrieve relevant documents
docs = self.retriever.get_relevant_documents(query)
if not docs:
print("No relevant documents found")
return None, 0.0
print(f"Retrieved {len(docs)} relevant documents")
# Prepare context
context = "\n\n".join([doc.page_content for doc in docs])
sources = [doc.metadata.get("source", "Unknown") for doc in docs]
# Query LLM with context
prompt = f"""با توجه به اطلاعات زیر، به سوال پاسخ دهید. فقط به زبان فارسی پاسخ دهید.
اطلاعات:
{context}
سوال: {query}
پاسخ فارسی:"""
response = query_llm(prompt, model=LLM_MODEL)
confidence = self.estimate_confidence(response, query, context)
print(f"RAG confidence: {confidence:.2f}")
return {
"response": response,
"confidence": confidence,
"sources": list(set(sources))
}, confidence
def web_search(self, query):
"""Search the web for an answer"""
print("Searching web for answer...")
# Search DuckDuckGo
search_url = f"https://html.duckduckgo.com/html/?q={quote(query)}"
response = requests.get(search_url, verify=False, timeout=10)
if response.status_code != 200:
print(f"Error searching web: HTTP {response.status_code}")
return None, 0.0
# Parse results
soup = BeautifulSoup(response.text, 'html.parser')
results = []
for element in soup.select('.result__url, .result__a')[:4]:
href = element.get('href') if 'href' in element.attrs else None
if href and not href.startswith('/') and (href.startswith('http://') or href.startswith('https://')):
results.append(href)
elif not href and element.find('a') and 'href' in element.find('a').attrs:
href = element.find('a')['href']
if href and not href.startswith('/'):
results.append(href)
if not results:
print("No web results found")
return None, 0.0
# Crawl top results
web_content = []
for url in results[:3]:
try:
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
page = requests.get(url, headers=headers, timeout=10, verify=False)
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')
# Remove non-content elements
for tag in ['script', 'style', 'nav', 'footer', 'header']:
for element in soup.find_all(tag):
element.decompose()
# Get paragraphs
paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')
if len(p.get_text(strip=True)) > 20]
if paragraphs:
web_content.append(f"[Source: {url}] " + " ".join(paragraphs[:5]))
except Exception as e:
print(f"Error crawling {url}: {e}")
if not web_content:
print("No useful content found from web results")
return None, 0.0
# Query LLM with web content
context = "\n\n".join(web_content)
prompt = f"""با توجه به اطلاعات زیر که از وب بدست آمده، به سوال پاسخ دهید. فقط به زبان فارسی پاسخ دهید.
اطلاعات:
{context}
سوال: {query}
پاسخ فارسی:"""
response = query_llm(prompt, model=LLM_MODEL)
confidence = self.estimate_confidence(response, query, context)
print(f"Web search confidence: {confidence:.2f}")
return {
"response": response,
"confidence": confidence,
"sources": results[:3]
}, confidence
def get_answer(self, query):
"""Main method to get an answer following the specified architecture"""
print(f"Processing query: {query}")
# STEP 1: Try direct LLM knowledge
direct_response, direct_confidence = self.check_direct_knowledge(query)
if direct_confidence >= THRESHOLDS['direct_answer']:
print("Using direct LLM knowledge (high confidence)")
return f"{direct_response}\n\n[Source: LLM Knowledge, Confidence: {direct_confidence:.2f}]"
# STEP 2: Try RAG with local documents
rag_result, rag_confidence = self.rag_query(query)
if rag_result and rag_confidence >= THRESHOLDS['rag_confidence']:
print("Using RAG response (sufficient confidence)")
sources_text = ", ".join(rag_result["sources"][:3])
return f"{rag_result['response']}\n\n[Source: Local Documents, Confidence: {rag_confidence:.2f}, Sources: {sources_text}]"
# STEP 3: Try web search
web_result, web_confidence = self.web_search(query)
if web_result and web_confidence >= THRESHOLDS['web_search']:
print("Using web search response (sufficient confidence)")
sources_text = ", ".join(web_result["sources"])
return f"{web_result['response']}\n\n[Source: Web Search, Confidence: {web_confidence:.2f}, Sources: {sources_text}]"
# STEP 4: Fall back to direct response with warning
print("No high-confidence source found, using direct response with warning")
return f"{direct_response}\n\n[Warning: Low confidence ({direct_confidence:.2f}). Please verify information.]"
# Simple API functions
def get_answer(query):
"""Get an answer for a query"""
system = AgenticQASystem()
return system.get_answer(query)

# Main entry point
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="QA System")
mode_group = parser.add_mutually_exclusive_group(required=True)
mode_group.add_argument("--query", "-q", help="Query to answer")
mode_group.add_argument("--interactive", "-i", action="store_true", help="Run in interactive chat mode")
mode_group.add_argument("--test", "-t", action="store_true", help="Run tests")
args = parser.parse_args()
if args.interactive:
# Simple interactive mode without memory
qa_system = AgenticQASystem()
print("=== QA System ===")
print("Type 'exit' or 'quit' to end")
while True:
user_input = input("\nYou: ")
if not user_input.strip():
continue
if user_input.lower() in ['exit', 'quit', 'خروج']:
break
response = qa_system.get_answer(user_input)
print(f"\nBot: {response}")
elif args.query:
qa_system = AgenticQASystem()
print(qa_system.get_answer(args.query))
elif args.test:
print("Running tests...")

# ----- enhanced_combined.py after this commit -----

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import pickle
import json
import nltk
import requests
import time
from bs4 import BeautifulSoup
from urllib.parse import quote
from langchain_community.document_loaders import PDFPlumberLoader, WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document  # used in crawl_web to wrap pages for retrieval

try:
nltk.data.find('tokenizers/punkt')
except LookupError:
nltk.download('punkt')

class ModularRAG:

def __init__(self):
self.storage_path = "./rag_data"
if not os.path.exists(self.storage_path):
os.makedirs(self.storage_path)
os.makedirs(os.path.join(self.storage_path, "documents"))
os.makedirs(os.path.join(self.storage_path, "web_results"))
self.documents = []
self.web_results = []
# Web crawler settings
self.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
self.num_search_results = 10
self.max_depth = 2
self.max_links_per_page = 5
self.max_paragraphs = 5
self._load_saved_data()

def _load_saved_data(self):
doc_path = os.path.join(self.storage_path, "documents", "docs.pkl")
web_path = os.path.join(self.storage_path, "web_results", "web.json")
if os.path.exists(doc_path):
try:
with open(doc_path, 'rb') as f:
self.documents = pickle.load(f)
except Exception as e:
print(f"خطا در بارگیری اسناد: {e}")
if os.path.exists(web_path):
try:
with open(web_path, 'r', encoding='utf-8') as f:
self.web_results = json.load(f)
except Exception as e:
print(f"خطا در بارگیری نتایج وب: {e}")

def _save_documents(self):
doc_path = os.path.join(self.storage_path, "documents", "docs.pkl")
try:
with open(doc_path, 'wb') as f:
pickle.dump(self.documents, f)
except Exception as e:
print(f"خطا در ذخیره‌سازی اسناد: {e}")

def _save_web_results(self):
web_path = os.path.join(self.storage_path, "web_results", "web.json")
try:
with open(web_path, 'w', encoding='utf-8') as f:
json.dump(self.web_results, f, ensure_ascii=False, indent=2)
except Exception as e:
print(f"خطا در ذخیره‌سازی نتایج وب: {e}")

def load_pdf(self, file_path):
if not os.path.exists(file_path):
raise FileNotFoundError(f"فایل یافت نشد: {file_path}")
try:
loader = PDFPlumberLoader(file_path)
documents = loader.load()
if documents:
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=200,
add_start_index=True
)
chunked_docs = text_splitter.split_documents(documents)
self.documents.extend(chunked_docs)
self._save_documents()
return len(chunked_docs)
return 0
except Exception as e:
raise Exception(f"خطا در بارگیری PDF: {e}")

def search_duckduckgo(self, query, num_results=None):
if num_results is None:
num_results = self.num_search_results
try:
search_url = f"https://html.duckduckgo.com/html/?q={quote(query)}"
response = requests.get(search_url, headers=self.headers, timeout=10)
if response.status_code != 200:
print(f"خطا در جستجوی وب: HTTP {response.status_code}")
return []
soup = BeautifulSoup(response.text, 'html.parser')
results = []
for element in soup.select('.result__url, .result__a'):
href = element.get('href') if 'href' in element.attrs else None
if href and not href.startswith('/') and (href.startswith('http://') or href.startswith('https://')):
results.append(href)
elif not href and element.find('a') and 'href' in element.find('a').attrs:
href = element.find('a')['href']
if href and not href.startswith('/'):
results.append(href)
unique_results = list(set(results))
return unique_results[:num_results]
except Exception as e:
print(f"خطا در جستجوی DuckDuckGo: {e}")
return []

def crawl_page(self, url, depth=0):
if depth > self.max_depth:
return None, []
try:
response = requests.get(url, headers=self.headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
title = soup.title.string if soup.title else "بدون عنوان"
paragraphs = []
for p in soup.find_all('p'):
text = p.get_text(strip=True)
if len(text) > 50:
paragraphs.append(text)
if len(paragraphs) >= self.max_paragraphs:
break
links = []
for a in soup.find_all('a', href=True):
href = a['href']
if href.startswith('http') and href != url:
links.append(href)
if len(links) >= self.max_links_per_page:
break
content = {
"url": url,
"title": title,
"paragraphs": paragraphs
}
return content, links
except Exception as e:
print(f"خطا در خزش صفحه {url}: {e}")
return None, []

def crawl_website(self, start_url, max_pages=10):
visited = set()
to_visit = [start_url]
contents = []
while to_visit and len(visited) < max_pages:
current_url = to_visit.pop(0)
if current_url in visited:
continue
content, links = self.crawl_page(current_url)
visited.add(current_url)
if content and content["paragraphs"]:
contents.append(content)
for link in links:
if link not in visited and link not in to_visit:
to_visit.append(link)
time.sleep(1)
return contents

def crawl_web(self, query):
urls = self.search_duckduckgo(query)
if not urls:
print("هیچ نتیجه‌ای یافت نشد.")
return []
all_results = []
for url in urls[:3]:  # Limit to first 3 URLs for efficiency
content, links = self.crawl_page(url)
if content and content["paragraphs"]:
all_results.append(content)
# Follow links from the main page (recursive crawling)
for link in links[:2]:  # Limit to first 2 links
sub_content, _ = self.crawl_page(link, depth=1)
if sub_content and sub_content["paragraphs"]:
all_results.append(sub_content)
time.sleep(1)
time.sleep(1)
self.web_results = all_results
self._save_web_results()
# Convert web results to documents for RAG; wrap each one as a Document so
# BM25Retriever and the metadata lookups in extract_context_from_web can use it
web_docs = []
for result in all_results:
text = f"[{result['title']}]\n" + "\n".join(result['paragraphs'])
web_docs.append(Document(page_content=text, metadata={"source": result['url']}))
return all_results, web_docs
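# Data-shape note (illustrative): crawl_web returns (all_results, web_docs), where
# all_results is the list of {"url", "title", "paragraphs"} dicts persisted to web.json
# and web_docs carries the same text as Document objects for the retriever below.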
def build_retriever(self, documents):
if not documents:
return None
# Create BM25 retriever
bm25_retriever = BM25Retriever.from_documents(documents)
bm25_retriever.k = 3 # Return top 3 results
return bm25_retriever
def get_relevant_documents(self, query, documents):
retriever = self.build_retriever(documents)
if not retriever:
return []
return retriever.get_relevant_documents(query)
def extract_context_from_documents(self, query):
if not self.documents:
return None
relevant_docs = self.get_relevant_documents(query, self.documents)
if not relevant_docs:
return None
context = "\n\n".join([doc.page_content for doc in relevant_docs])
return context
def extract_context_from_web(self, web_results, web_docs, query):
if not web_results or not web_docs:
return None, []
# Try to use the retriever for better results
if web_docs:
relevant_docs = self.get_relevant_documents(query, web_docs)
if relevant_docs:
context = "\n\n".join([doc.page_content for doc in relevant_docs])
sources = [doc.metadata.get("source", "") for doc in relevant_docs if "source" in doc.metadata]
return context, sources
# Fall back to simple extraction if retriever fails
contexts = []
sources = []
for doc in web_results:
context_text = "\n".join(doc["paragraphs"])
contexts.append(f"[{doc['title']}] {context_text}")
sources.append(doc['url'])
context = "\n\n".join(contexts)
return context, sources
def get_context(query, crawl_params=None):
"""
سیستم RAG مدولار برای پاسخگویی به سوالات با استفاده از اسناد و جستجوی وب
پارامترها:
query (str): سوال به زبان فارسی
crawl_params (dict, optional): پارامترهای خزش وب
- max_depth: حداکثر عمق خزش
- max_links_per_page: حداکثر تعداد لینکهای استخراج شده از هر صفحه
- max_paragraphs: حداکثر تعداد پاراگرافهای استخراج شده از هر صفحه
- num_search_results: تعداد نتایج جستجو
خروجی:
dict: نتیجه جستجو شامل متن و منابع
"""
rag = ModularRAG()
# Configure crawling parameters if provided
if crawl_params:
if 'max_depth' in crawl_params:
rag.max_depth = crawl_params['max_depth']
if 'max_links_per_page' in crawl_params:
rag.max_links_per_page = crawl_params['max_links_per_page']
if 'max_paragraphs' in crawl_params:
rag.max_paragraphs = crawl_params['max_paragraphs']
if 'num_search_results' in crawl_params:
rag.num_search_results = crawl_params['num_search_results']
# First try to get context from documents
doc_context = rag.extract_context_from_documents(query)
if doc_context:
return {
"has_context": True,
"context": doc_context,
"source": "documents",
"language": "fa"
}
# Fall back to web search
web_results, web_docs = rag.crawl_web(query)
if web_results:
web_context, sources = rag.extract_context_from_web(web_results, web_docs, query)
return {
"has_context": True,
"context": web_context,
"source": "web",
"sources": sources,
"language": "fa"
}
# No context found
return {
"has_context": False,
"context": "متأسفانه اطلاعاتی در مورد سوال شما یافت نشد.",
"source": "none",
"language": "fa"
}
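
# Usage sketch (not part of the original commit): an end-to-end call into the module.
# The PDF path is hypothetical; the web fallback needs network access, and BM25Retriever
# needs the rank_bm25 package installed.
def _demo_get_context():
    # Optionally index a local PDF first; its chunks are persisted under ./rag_data and
    # picked up by the ModularRAG instance that get_context creates.
    # ModularRAG().load_pdf("sample.pdf")
    result = get_context(
        "قانون اساسی ایران چیست؟",
        crawl_params={"max_depth": 1, "max_links_per_page": 3, "num_search_results": 5},
    )
    print(result["source"], result["has_context"])
    print(result["context"][:300])

if __name__ == "__main__":
    _demo_get_context()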