{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f9327343-3e11-4a88-b798-95ff4644e2a5",
   "metadata": {},
   "outputs": [],
   "source": [
"from langchain.vectorstores import Chroma\n",
|
|
"from langchain.embeddings import SentenceTransformerEmbeddings\n",
|
|
"from langchain.schema import Document\n",
|
|
"from ollama import chat\n",
|
|
"import os\n",
|
|
"import re\n",
|
|
"import requests\n",
|
|
"from bs4 import BeautifulSoup\n",
|
|
"\n",
|
|
"EMBEDDING_MODEL = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'\n",
|
|
"LLM_MODEL = 'gemma2:9b'\n",
|
|
"CHUNK_SIZE = 1000\n",
|
|
"OVERLAP = 200\n",
|
|
"CHROMA_PERSIST_DIR = r'\\home\\Masih\\chroma_db\\chroma_db'\n",
|
|
"\n",
|
|
"class ChromaRAGSystem:\n",
|
|
" def __init__(self):\n",
|
|
" # Init embedding model\n",
|
|
" self.embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)\n",
|
|
" # Vector store instance\n",
|
|
" self.vector_db = None\n",
|
|
" \n",
|
|
" def build_vector_store(self, documents):\n",
|
|
" \"\"\"Create Chroma vector store from documents\"\"\"\n",
|
|
" self.vector_db = Chroma.from_documents(\n",
|
|
" documents=documents,\n",
|
|
" embedding=self.embeddings,\n",
|
|
" persist_directory=CHROMA_PERSIST_DIR\n",
|
|
" )\n",
|
|
" self.vector_db.persist()\n",
|
|
" \n",
|
|
" def load_vector_store(self):\n",
|
|
" \"\"\"Load existing Chroma vector store\"\"\"\n",
|
|
" self.vector_db = Chroma(\n",
|
|
" persist_directory=CHROMA_PERSIST_DIR,\n",
|
|
" embedding_function=self.embeddings\n",
|
|
" )\n",
|
|
" \n",
|
|
" def document_query(self, query, top_k=5):\n",
|
|
" \"\"\"Retrieve context from documents based on query\"\"\"\n",
|
|
" # Perform similarity search across all documents\n",
|
|
" results = self.vector_db.similarity_search(query=query, k=top_k)\n",
|
|
" return [doc.page_content for doc in results]\n",
|
|
"\n",
|
|
"class AnswerGenerator:\n",
|
|
" def __init__(self, rag_system):\n",
|
|
" self.rag = rag_system\n",
|
|
" \n",
|
|
" def generate_response(self, question):\n",
|
|
" \"\"\"Generate context-aware answer using LLM\"\"\"\n",
|
|
" # Retrieve relevant context from the best matching documents\n",
|
|
" context_chunks = self.rag.document_query(question)\n",
|
|
" context = \"\\n\".join(context_chunks)\n",
|
|
" \n",
|
|
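    "        # Persian prompt (in English): answer the question using the text below;\n",
    "        # if the answer is not in the text, return the phrase 'پاسخی یافت نشد' (no answer was found)\n",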
" prompt = f\"\"\"با استفاده از متن زیر به سوال پاسخ دهید:\n",
|
|
"{context}\n",
|
|
"\n",
|
|
"اگر پاسخ در متن وجود ندارد عبارت 'پاسخی یافت نشد' را برگردانید\n",
|
|
"\n",
|
|
"سوال: {question}\n",
|
|
"پاسخ:\"\"\"\n",
|
|
" \n",
|
|
" response = chat(model=LLM_MODEL, messages=[{'role': 'user', 'content': prompt}])\n",
|
|
" return response['message']['content']\n",
|
|
"\n",
|
|
"def scrape_url(url):\n",
|
|
" \"\"\"Scrape the content from a given URL\"\"\"\n",
|
|
" response = requests.get(url)\n",
|
|
" soup = BeautifulSoup(response.content, 'html.parser')\n",
|
|
"\n",
|
|
" # Extract the article text (adjust this as per the specific page's structure)\n",
|
|
" paragraphs = soup.find_all('p')\n",
|
|
" article_text = \"\\n\".join([para.get_text() for para in paragraphs])\n",
|
|
"\n",
|
|
" return article_text\n",
|
|
"\n",
|
|
"if __name__ == \"__main__\":\n",
|
|
" url = \"https://tosinso.com/articles/40596\"\n",
|
|
" article_content = scrape_url(url)\n",
|
|
"\n",
|
|
" # Process the scraped content and create a vector store\n",
|
|
" rag_system = ChromaRAGSystem()\n",
|
|
"\n",
|
|
" # Chunk the article content\n",
|
|
" chunks = [article_content[i:i+CHUNK_SIZE] for i in range(0, len(article_content), CHUNK_SIZE - OVERLAP)]\n",
|
|
" documents = [Document(page_content=chunk) for chunk in chunks]\n",
|
|
"\n",
|
|
" # Build vector store\n",
|
|
" rag_system.build_vector_store(documents)\n",
|
|
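    "    # On later runs the persisted index could be reloaded with rag_system.load_vector_store()\n",
    "    # instead of re-scraping and re-embedding the article.\n",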
"\n",
|
|
" # Init answer generator\n",
|
|
" answer_engine = AnswerGenerator(rag_system)\n",
|
|
"\n",
|
|
" # The query to be answered\n",
|
|
" query = \"تفاوت زیروکلاینت و تین کلاینت با PC در چیست؟\"\n",
|
|
"\n",
|
|
" # Generate and print the response\n",
|
|
" answer = answer_engine.generate_response(query)\n",
|
|
" print(answer)"
|
|
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc9f2290-d16f-4722-857a-7996d4722857",
   "metadata": {},
   "outputs": [],
   "source": [
"import requests\n",
|
|
"from bs4 import BeautifulSoup\n",
|
|
"\n",
|
|
"def search_internet(query):\n",
|
|
" \"\"\"Search the web for the given query and return a relevant snippet.\"\"\"\n",
|
|
" query = query.replace(\" \", \"+\") # Format the query for URLs\n",
|
|
" url = f\"https://www.google.com/search?q={query}\"\n",
|
|
" \n",
|
|
" # Send a GET request to Google (NOTE: scraping Google directly can get blocked)\n",
|
|
" headers = {\n",
|
|
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36\"\n",
|
|
" }\n",
|
|
" response = requests.get(url, headers=headers)\n",
|
|
"\n",
|
|
" if response.status_code != 200:\n",
|
|
" return \"Error: Unable to retrieve data from the internet.\"\n",
|
|
" \n",
|
|
" soup = BeautifulSoup(response.content, 'html.parser')\n",
|
|
"\n",
|
|
" # Scrape content from search result snippets (extract the first result snippet)\n",
|
|
" search_results = soup.find_all('div', {'class': 'BNeawe iBp4i AP7Wnd'})\n",
|
|
" if search_results:\n",
|
|
" return search_results[0].get_text()\n",
|
|
" \n",
|
|
" return \"No relevant information found on the web.\"\n",
|
|
"\n",
|
|
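    "def search_wikipedia(query):\n",
    "    \"\"\"Return a short summary for the query from Persian Wikipedia.\"\"\"\n",
    "    # generate_answer() below calls search_wikipedia(), but the notebook never defines it;\n",
    "    # this is a minimal sketch (an assumption, not the original implementation) that queries\n",
    "    # the Wikimedia REST summary endpoint on fa.wikipedia.org, treating the query as a page title.\n",
    "    title = query.replace(\" \", \"_\")\n",
    "    wiki_url = f\"https://fa.wikipedia.org/api/rest_v1/page/summary/{title}\"\n",
    "    response = requests.get(wiki_url, timeout=10)\n",
    "    if response.status_code != 200:\n",
    "        return \"Error: no matching Wikipedia page found.\"\n",
    "    # The summary endpoint returns JSON with an 'extract' field holding the page summary\n",
    "    return response.json().get(\"extract\", \"Error: summary not available.\")\n",
    "\n",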
"def generate_answer(query):\n",
|
|
" \"\"\"Generate an answer by first checking Wikipedia and then searching the internet.\"\"\"\n",
|
|
" # First, check Wikipedia for Persian content\n",
|
|
" wikipedia_answer = search_wikipedia(query)\n",
|
|
" if wikipedia_answer and \"Error\" not in wikipedia_answer:\n",
|
|
" return wikipedia_answer\n",
|
|
" \n",
|
|
" # If not found in Wikipedia, search the web\n",
|
|
" internet_answer = search_internet(query)\n",
|
|
" return internet_answer\n",
|
|
"\n",
|
|
"if __name__ == \"__main__\":\n",
|
|
" query = \"شاه عباس صفوی که بود و چه کرد؟\"\n",
|
|
" \n",
|
|
" # Get the answer from Wikipedia and Internet search\n",
|
|
" answer = generate_answer(query)\n",
|
|
" \n",
|
|
" # Print the answer\n",
|
|
" print(f\"Answer: {answer}\")"
|
|
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}