import os
import tempfile

import fitz  # PyMuPDF
from docx import Document
from dotenv import load_dotenv
from openai import OpenAI
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
# Load the OpenAI API key from .env and create a single client used for all calls.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

app = FastAPI()

# A local Hugging Face pipeline could be used instead of the OpenAI API, e.g.:
# sentiment_pipeline = pipeline("sentiment-analysis")
def analyze_sentiment(text):
    """Ask the model for a BERT-style sentiment label and score, then parse the reply."""
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a helpful assistant that provides a sentiment score for the given "
                    "content, formatted like a BERT classifier, e.g. "
                    "'Sentiment: NEGATIVE, Sentiment Score: 0.9917946457862854'."
                ),
            },
            {"role": "user", "content": f"Provide a sentiment score for this document: {text}"},
        ],
    )
    response_content = response.choices[0].message.content
    # Expected format: "Sentiment: <LABEL>, Sentiment Score: <float>"
    try:
        parts = response_content.split(', ')
        sentiment_label = parts[0].split(': ')[1]
        sentiment_score = float(parts[1].split(': ')[1])
    except (IndexError, ValueError):
        # Fall back to a neutral result if the model strays from the requested format.
        sentiment_label, sentiment_score = "NEUTRAL", 0.0
    return {'label': sentiment_label, 'score': sentiment_score}
def extract_text_from_pdf(pdf_path):
    """Concatenate the text of every page in the PDF."""
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
    return text
def extract_text_from_docx_path(docx_path):
    """Join the text of every paragraph in the DOCX file."""
    doc = Document(docx_path)
    full_text = [para.text for para in doc.paragraphs]
    return '\n'.join(full_text)
def generate_summary(text):
    """Summarize the document in under 200 words."""
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Summarize this document in fewer than 200 words: {text}"},
        ],
    )
    return response.choices[0].message.content.strip()
def extract_insight(text):
    """Ask the model for the key insights in the document."""
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Extract insights from this document: {text}"},
        ],
    )
    return response.choices[0].message.content.strip()
def process_document(file_path):
    """Extract text from a PDF or DOCX file and return its sentiment, summary, and insights."""
    file_type = file_path.split('.')[-1].lower()
    if file_type == "pdf":
        text = extract_text_from_pdf(file_path)
    elif file_type == "docx":
        text = extract_text_from_docx_path(file_path)
    else:
        raise ValueError("Unsupported file type")

    MAX_LENGTH = 1500
    if len(text) > MAX_LENGTH:
        # If the text exceeds MAX_LENGTH, summarize and analyze it in chunks.
        parts = [text[i:i + MAX_LENGTH] for i in range(0, len(text), MAX_LENGTH)]
        summaries = [generate_summary(part) for part in parts]
        insights = [extract_insight(part) for part in parts]
        # Simple aggregation: concatenate the per-chunk results.
        summary = " ".join(summaries)
        insight = " ".join(insights)
    else:
        # Process the entire text at once if it fits within MAX_LENGTH.
        summary = generate_summary(text)
        insight = extract_insight(text)

    sentiment_result = analyze_sentiment(text)
    return sentiment_result, summary, insight
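
# Local usage sketch (illustrative only; assumes a file named "sample.pdf" exists
# next to this script):
#   sentiment, summary, insight = process_document("sample.pdf")
#   print(sentiment, summary, insight, sep="\n")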
@app.post("/analyze-document/")
async def analyze_document(file: UploadFile = File(...)):
    file_extension = file.filename.split('.')[-1].lower()
    if file_extension not in ["pdf", "docx"]:
        raise HTTPException(status_code=400, detail="Unsupported file type. Please upload a PDF or DOCX file.")

    # Write the upload to a unique temporary file so concurrent requests don't collide.
    file_content = await file.read()
    with tempfile.NamedTemporaryFile(suffix=f".{file_extension}", delete=False) as temp_file:
        temp_file.write(file_content)
        file_path = temp_file.name

    try:
        sentiment_result, summary, insights = process_document(file_path)
    finally:
        os.remove(file_path)  # Clean up the temporary file

    result = {
        "Sentiment": sentiment_result['label'],
        "Sentiment Score": sentiment_result['score'],
        "Summary": summary,
        "Insights": insights,
    }
    return JSONResponse(content=result)
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8007, ssl_keyfile="privkey.pem", ssl_certfile="fullchain.pem")
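
# Example client call (a minimal sketch; it assumes the server above is running on
# https://localhost:8007 and that "report.pdf" is an existing local file, both of
# which are illustrative assumptions):
#
#   import requests
#   with open("report.pdf", "rb") as f:
#       resp = requests.post(
#           "https://localhost:8007/analyze-document/",
#           files={"file": ("report.pdf", f, "application/pdf")},
#           verify="fullchain.pem",
#       )
#   print(resp.json())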