# File: /home/ubuntu/codegamaai-test/efimarket_bot/src/token_count.py

import os
import json
# from transformers import GPT2Tokenizer
from PyPDF2 import PdfReader
import docx
import pandas as pd
from src.utils import *
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

def count_tokens_in_text(text):
    """Return the number of tokens in *text*.

    Uses the module-level GPT-2 ``tokenizer`` when one has been initialised
    (the import/initialisation at the top of this file is currently commented
    out); otherwise falls back to a whitespace word count as a rough,
    deterministic approximation.

    Bug fixed: the original referenced ``tokenizer`` unconditionally, which
    raised NameError on every call because the tokenizer initialisation is
    commented out.

    Args:
        text: The text to tokenize. Empty/None-ish input counts as 0 tokens.

    Returns:
        int: token count.
    """
    if not text:
        return 0
    # Pick up a GPT-2 tokenizer only if the commented-out initialisation
    # above has been restored; do not crash when it hasn't.
    tok = globals().get("tokenizer")
    if tok is not None:
        return len(tok.encode(text, add_special_tokens=False))
    # Fallback: approximate one token per whitespace-separated word.
    return len(text.split())

def read_and_count_tokens_in_folder(user_id, bot_id):
    """Download a bot's data files from S3 and count tokens per file.

    Pulls the files for (*user_id*, *bot_id*) via
    ``download_files_from_s3_bucket_LLM_data`` (from ``src.utils``), then
    walks ``$DB_DIR/<user_id>/<bot_id>/data`` and counts tokens in each
    supported file (.txt, .md, .docx, .pdf, .csv — matched
    case-insensitively).

    Fixes vs. original:
      - ``str(user_id)``/``str(bot_id)`` so ``os.path.join`` does not raise
        TypeError when callers pass ints (the commented-out path line showed
        this was intended).
      - ``extract_text() or ''`` — PyPDF2 can return None per page, which
        crashed the string concatenation.
      - per-file error handling so one unreadable file no longer aborts the
        whole folder; the error is reported in that file's result instead.
      - removed the no-op ``'\\n'.join(....split('\\n'))`` on the CSV branch.

    Args:
        user_id: owner id (int or str).
        bot_id: bot id (int or str).

    Returns:
        dict: file name -> int token count, or a string describing why the
        file could not be counted. Empty dict when the S3 download fails.
    """
    results = {}
    status = download_files_from_s3_bucket_LLM_data(user_id, bot_id)
    if status != "Download successfully":
        return results

    folder_path = os.path.join(os.environ['DB_DIR'], str(user_id), str(bot_id), 'data')
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        ext = os.path.splitext(file_name)[1].lower()
        try:
            if ext in ('.txt', '.md'):
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read()
            elif ext == '.docx':
                doc = docx.Document(file_path)
                content = '\n'.join(paragraph.text for paragraph in doc.paragraphs)
            elif ext == '.pdf':
                pdf_reader = PdfReader(file_path)
                # extract_text() may return None for image-only pages.
                content = ''.join(page.extract_text() or '' for page in pdf_reader.pages)
            elif ext == '.csv':
                df = pd.read_csv(file_path)
                content = df.to_string(index=False, header=False)
            else:
                results[file_name] = "Unsupported file format"
                continue
            results[file_name] = count_tokens_in_text(content)
        except Exception as exc:
            # Keep counting the remaining files; surface the failure per file.
            results[file_name] = f"Error reading file: {exc}"
    return results

# # Example usage (the function takes user_id and bot_id, not a folder path;
# # it resolves $DB_DIR/<user_id>/<bot_id>/data itself after the S3 download)
# results = read_and_count_tokens_in_folder(58, 179)
# print(results)

# Back to Directory File Manager  (web file-manager UI residue — not code)