# /home/ubuntu/codegamaai-test/efimarket_bot/src/token_count.py
import os

import docx
import pandas as pd
from PyPDF2 import PdfReader
from transformers import GPT2Tokenizer

from src.utils import *  # provides download_files_from_s3_bucket_LLM_data

# Load the GPT-2 tokenizer once at import time; count_tokens_in_text depends on it.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
def count_tokens_in_text(text):
    """Return the number of GPT-2 tokens in `text`."""
    # Tokenize with the GPT-2 tokenizer; special tokens are excluded so the
    # count reflects only the document's own content.
    tokens = tokenizer.encode(text, add_special_tokens=False)
    return len(tokens)
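# A hypothetical alternative, not used by the rest of this module: the same
# GPT-2 BPE count can be computed with the lighter-weight tiktoken package,
# which avoids loading the full transformers tokenizer. A minimal sketch:
#
# import tiktoken
#
# _gpt2_encoding = tiktoken.get_encoding("gpt2")
#
# def count_tokens_in_text_tiktoken(text):
#     # tiktoken returns token ids directly; no special tokens are added
#     return len(_gpt2_encoding.encode(text))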
def read_and_count_tokens_in_folder(user_id, bot_id):
    """Download a bot's data files from S3 and return a mapping of
    file name -> token count (or an 'Unsupported file format' note)."""
    results = {}
    status = download_files_from_s3_bucket_LLM_data(user_id, bot_id)
    if status == "Download successfully":
        folder_path = os.path.join(os.environ['DB_DIR'], user_id, bot_id, 'data')
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            if file_path.endswith(('.txt', '.md')):
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read()
                results[file_name] = count_tokens_in_text(content)
            elif file_path.endswith('.docx'):
                doc = docx.Document(file_path)
                content = '\n'.join(paragraph.text for paragraph in doc.paragraphs)
                results[file_name] = count_tokens_in_text(content)
            elif file_path.endswith('.pdf'):
                pdf_reader = PdfReader(file_path)
                content = ''.join(page.extract_text() for page in pdf_reader.pages)
                results[file_name] = count_tokens_in_text(content)
            elif file_path.endswith('.csv'):
                df = pd.read_csv(file_path)
                # to_string yields newline-separated rows without index/header
                content = df.to_string(index=False, header=False)
                results[file_name] = count_tokens_in_text(content)
            else:
                results[file_name] = "Unsupported file format"
    return results
# # Example usage (the IDs mirror the sample path Local_DB/58/179/data)
# results = read_and_count_tokens_in_folder("58", "179")
# print(results)
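# A minimal runnable entry point, sketched here as a convenience and not part
# of the original module: it assumes DB_DIR is set in the environment and that
# the S3 helper in src.utils can reach the bucket. The IDs are the sample
# values from the example above.
if __name__ == "__main__":
    for name, count in read_and_count_tokens_in_folder("58", "179").items():
        print(f"{name}: {count}")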