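"""search_help.py — web-search helpers: query the Google Custom Search API,
scrape result pages, and rank scraped sentences against the query with
sentence-embedding similarity."""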
import requests
from bs4 import BeautifulSoup
import re
from transformers import AutoTokenizer, AutoModel
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import json
import os
import torch
# Initialize NLP models and tools
nltk.download('punkt', quiet=True)  # sentence tokenizer data for sent_tokenize
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
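# all-MiniLM-L6-v2 is a compact sentence-embedding model; compute_embeddings
# below mean-pools its token states into one 384-dimensional vector per text.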
def perform_web_search(query, api_key=None, cse_id=None, num_results=6):
    """Return result URLs from the Google Custom Search JSON API."""
    # Credentials are read from the environment rather than hard-coded in
    # source control; the variable names are this module's convention.
    api_key = api_key or os.environ["GOOGLE_API_KEY"]
    cse_id = cse_id or os.environ["GOOGLE_CSE_ID"]
    api_url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "q": query,
        "cx": cse_id,
        "key": api_key,
        "num": num_results
    }
    response = requests.get(api_url, params=params, timeout=10)
    response.raise_for_status()
    search_results = response.json()
    urls = [item['link'] for item in search_results.get('items', [])]
    return urls
def is_finance_related(query):
    """Heuristically detect finance/market queries via keyword matching."""
    finance_keywords = ['stock', 'shares', 'price', 'finance', 'market', 'crypto',
                        'bitcoin', 'ethereum', 'forex', 'exchange rate']
    return any(keyword in query.lower() for keyword in finance_keywords)
def clean_text(text):
"""
Clean the text by removing unnecessary tags, unrecognized characters, and common irrelevant sections.
"""
text = re.sub('<.*?>', '', text)
text = re.sub(r'[^\x00-\x7F]+', ' ', text)
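    # Drop short lines and obvious navigation/boilerplate, then rejoin and
    # collapse whitespace.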
lines = text.split('\n')
filtered_lines = [line for line in lines if len(line) > 30 and not line.lower().startswith(
("menu", "privacy", "terms", "sign in", "subscribe", "footer", "header"))]
filtered_text = ' '.join(filtered_lines).strip()
filtered_text = re.sub(r'\s+', ' ', filtered_text)
return filtered_text
def parse_web_content(url):
    """Fetch a URL and return its cleaned text, or the quoted price for Google Finance pages."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Google Finance renders the quote inside this obfuscated (and brittle) class.
        price_div = soup.find('div', class_='YMlKec fxKbKc')
        if price_div:
            return price_div.get_text(strip=True)
        return clean_text(soup.get_text())
    except Exception:
        return None
def compute_embeddings(texts):
    """Mean-pool token embeddings into one vector per input text."""
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=512)
    with torch.no_grad():
        model_output = model(**encoded_input)
    # Mask out padding tokens so they do not skew the mean.
    mask = encoded_input['attention_mask'].unsqueeze(-1).float()
    summed = (model_output.last_hidden_state * mask).sum(dim=1)
    embeddings = summed / mask.sum(dim=1).clamp(min=1e-9)
    return embeddings
def compute_similarity(query, contents):
    """Cosine similarity between the query embedding and each content embedding."""
    embeddings = compute_embeddings([query] + contents)
    query_embedding = embeddings[0].reshape(1, -1)
    content_embeddings = embeddings[1:]  # everything after the query
    cos_similarities = cosine_similarity(query_embedding, content_embeddings).flatten()
    return cos_similarities
def find_most_relevant_content(query, content, top_k=3):
    """Return the top_k sentences from `content` most similar to the query."""
    sentences = sent_tokenize(content)
    if not sentences:
        return ''
    similarity_scores = compute_similarity(query, sentences)
    sorted_indices = similarity_scores.argsort()[::-1][:top_k]
    top_sentences = [sentences[i] for i in sorted_indices]
    return ' '.join(top_sentences)
def run_web_search_python(query):
    """Search the web for `query` and return the most relevant scraped snippets."""
    if is_finance_related(query):
        # Bias finance queries toward Google Finance quote pages.
        query += " site:google.com/finance/quote"
    search_results_urls = perform_web_search(query)
    successful_scrapes = 0
    relevant_contents = []
    for url in search_results_urls:
        if successful_scrapes >= 1:
            # Stop after the first page that scrapes successfully.
            break
        content = parse_web_content(url)
        if content:
            relevant_content = find_most_relevant_content(query, content)
            relevant_contents.append(relevant_content)
            successful_scrapes += 1
    return {
        "query": query,
        "relevant_contents": relevant_contents
    }
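
# Minimal usage sketch (assumes GOOGLE_API_KEY and GOOGLE_CSE_ID are exported;
# those variable names are this module's convention, not a Google requirement).
if __name__ == "__main__":
    result = run_web_search_python("current price of bitcoin")
    print(json.dumps(result, indent=2))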