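"""Web search helper: query Google Custom Search, scrape the resulting pages,
and return the sentences most similar to the query using MiniLM sentence
embeddings."""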

import os
import requests
from bs4 import BeautifulSoup
import re
from transformers import AutoTokenizer, AutoModel
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import json
import torch

# Initialize NLP models and tools
nltk.download('punkt', quiet=True)  # sentence tokenizer data (newer NLTK releases may also need 'punkt_tab')
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


def perform_web_search(query, api_key=None, cse_id=None, num_results=6):
    # Credentials are read from the environment rather than hard-coded;
    # set GOOGLE_API_KEY and GOOGLE_CSE_ID (variable names assumed here) before calling.
    api_key = api_key or os.environ.get("GOOGLE_API_KEY")
    cse_id = cse_id or os.environ.get("GOOGLE_CSE_ID")
    api_url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "q": query,
        "cx": cse_id,
        "key": api_key,
        "num": num_results
    }
    response = requests.get(api_url, params=params)
    response.raise_for_status()

    search_results = response.json()
    urls = [item['link'] for item in search_results.get('items', [])]
    return urls

def is_finance_related(query):
    finance_keywords = ['stock', 'shares', 'price', 'finance', 'market', 'crypto',
                        'bitcoin', 'ethereum', 'forex', 'exchange rate']
    return any(keyword in query.lower() for keyword in finance_keywords)

def clean_text(text):
    """
    Clean the text by removing leftover HTML tags, non-ASCII characters, and
    common boilerplate sections such as menus and footers.
    """
    text = re.sub(r'<.*?>', '', text)           # strip any residual HTML tags
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # drop non-ASCII characters
    lines = text.split('\n')
    # Keep only substantial lines that do not look like site navigation/boilerplate
    filtered_lines = [line for line in lines if len(line) > 30 and not line.lower().startswith(
        ("menu", "privacy", "terms", "sign in", "subscribe", "footer", "header"))]
    filtered_text = ' '.join(filtered_lines).strip()
    filtered_text = re.sub(r'\s+', ' ', filtered_text)  # collapse runs of whitespace
    return filtered_text

def parse_web_content(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Google Finance renders the quote price in this div; the class name is
        # brittle and may break whenever Google changes its markup.
        price_div = soup.find('div', class_='YMlKec fxKbKc')
        if price_div:
            return price_div.get_text(strip=True)
        return clean_text(soup.get_text())
    except Exception:
        return None


def compute_embeddings(texts):
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=512)
    with torch.no_grad():
        model_output = model(**encoded_input)
    # Mean-pool token embeddings weighted by the attention mask so that padding
    # tokens (added when batching texts of different lengths) do not dilute the result.
    mask = encoded_input['attention_mask'].unsqueeze(-1).float()
    embeddings = (model_output.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    return embeddings


def compute_similarity(query, contents):
    embeddings = compute_embeddings([query] + contents)
    query_embedding = embeddings[0].reshape(1, -1).numpy()
    # Everything after index 0 (the query) is a content embedding; slicing with
    # [-1:] here would compare against only the last content item.
    content_embeddings = embeddings[1:].numpy()

    cos_similarities = cosine_similarity(query_embedding, content_embeddings).flatten()
    return cos_similarities

def find_most_relevant_content(query, content, top_k=3):
    sentences = sent_tokenize(content)
    if not sentences:
        return ''
    similarity_scores = compute_similarity(query, sentences)
    # Take the top_k sentences with the highest cosine similarity to the query
    sorted_indices = similarity_scores.argsort()[::-1][:top_k]
    top_sentences = [sentences[i] for i in sorted_indices]

    return ' '.join(top_sentences)

def run_web_search_python(query):
    # Bias finance queries toward Google Finance quote pages, which we can parse reliably
    if is_finance_related(query):
        query += " site:google.com/finance/quote"

    search_results_urls = perform_web_search(query)
    successful_scrapes = 0
    relevant_contents = []

    # Try each result URL in order and stop after the first successful scrape
    for url in search_results_urls:
        if successful_scrapes >= 1:
            break
        content = parse_web_content(url)
        if content:
            relevant_content = find_most_relevant_content(query, content)
            relevant_contents.append(relevant_content)
            successful_scrapes += 1

    return {
        "query": query,
        "relevant_contents": relevant_contents
    }
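

# Minimal smoke test, assuming GOOGLE_API_KEY and GOOGLE_CSE_ID are set in the
# environment; the query below is only an illustrative example.
if __name__ == "__main__":
    result = run_web_search_python("current price of bitcoin")
    print(json.dumps(result, indent=2))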
