# File Manager

# Viewing File: /home/ubuntu/combine_ai/tts_stt/main.py

from fastapi import FastAPI, HTTPException, File, UploadFile, Form, Body
from fastapi.responses import FileResponse
from google.cloud import texttospeech, speech
import os
import boto3
from botocore.exceptions import NoCredentialsError
from fastapi.responses import JSONResponse
from dotenv import load_dotenv


# FastAPI application object; the route decorators further down register on it.
app = FastAPI()

# Path of the Google service-account key file, exported through the
# environment before the Text-to-Speech client below is created.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = (
    "/home/ubuntu/combine_ai/tts_stt/GOOGLE_APPLICATION_CREDENTIALS.json"
)

# load_dotenv() runs before the AWS settings are read from the environment.
load_dotenv()

# AWS settings taken from the environment; each one is None when unset.
access_key, secret_key, region, bucket_name = (
    os.getenv(variable)
    for variable in (
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "AWS_DEFAULT_REGION",
        "AWS_BUCKET_NAME",
    )
)

# Text-to-Speech client created once at import time and reused by the
# /text-to-speech/ handler.
tts_client = texttospeech.TextToSpeechClient()

# Lookup table: human-readable language description -> language code sent to
# the Google API. Entries keep their original order.
language_codes = dict(
    [
        ("English (United States)", "en-US"),
        ("English (United Kingdom)", "en-GB"),
        ("Spanish (Spain)", "es-ES"),
        ("Spanish (Latin America)", "es-US"),
        ("French (France)", "fr-FR"),
        ("German (Germany)", "de-DE"),
        ("Italian (Italy)", "it-IT"),
        ("Japanese (Japan)", "ja-JP"),
        ("Korean (South Korea)", "ko-KR"),
        ("Portuguese (Brazil)", "pt-BR"),
        ("Russian (Russia)", "ru-RU"),
        ("Chinese (Mandarin, Simplified)", "zh-CN"),
        ("Hindi (India)", "hi-IN"),
        ("Arabic (Saudi Arabia)", "ar-SA"),
    ]
)

# Lookup table: gender label supplied by the caller -> the matching
# texttospeech.SsmlVoiceGender member.
gender_map = dict(
    Male=texttospeech.SsmlVoiceGender.MALE,
    Female=texttospeech.SsmlVoiceGender.FEMALE,
)

def upload_to_s3(file_name, bucket_name, folder_name):
    """Upload a local file to S3 and return the URL of the stored object.

    The object key is ``<folder_name>/<basename of file_name>``.

    Args:
        file_name: Path of the local file to upload.
        bucket_name: Name of the destination bucket.
        folder_name: Key prefix placed in front of the file's base name.

    Returns:
        The virtual-hosted-style ``https://`` URL of the object on success.
        To stay compatible with existing callers, two failures are reported
        as plain strings instead of being raised: ``"The file was not found"``
        (local file missing) and ``"Credentials not available"`` (boto3 found
        no credentials). Any other error propagates to the caller.
    """
    s3 = boto3.client('s3')
    object_name = f"{folder_name}/{os.path.basename(file_name)}"
    try:
        s3.upload_file(file_name, bucket_name, object_name)
        location = s3.get_bucket_location(Bucket=bucket_name)['LocationConstraint']
    except FileNotFoundError:
        return "The file was not found"
    except NoCredentialsError:
        return "Credentials not available"

    # GetBucketLocation reports None for buckets in us-east-1 and the legacy
    # value "EU" for eu-west-1; map both to real region names so the URL does
    # not end up containing "None" or "EU".
    region_name = {None: "us-east-1", "EU": "eu-west-1"}.get(location, location)

    # Use the dot-separated regional endpoint (s3.<region>); the dash form
    # (s3-<region>) is a legacy alias that only exists for older regions.
    return f"https://{bucket_name}.s3.{region_name}.amazonaws.com/{object_name}"


@app.post("/text-to-speech/")
async def text_to_speech(
    text: str = Body(...),
    language: str = Body(...),
    gender: str = Body(...),
    folder_name: str = Body(default="audio"),
):
    """Synthesize ``text`` to MP3, upload it to S3 and return its URL.

    Args:
        text: Text to synthesize.
        language: Human-readable language description (a key of
            ``language_codes``); unknown values fall back to ``"en-US"``.
        gender: ``"Male"`` or ``"Female"``; any other value selects
            ``SsmlVoiceGender.NEUTRAL``.
        folder_name: S3 key prefix for the uploaded file.

    Returns:
        A JSON response of the form ``{"url": <object URL>}``.

    Raises:
        HTTPException: 500 when the upload helper does not return an
            ``https://`` URL (it reports some failures as plain text).
    """
    language_code = language_codes.get(language, "en-US")
    voice_gender = gender_map.get(gender, texttospeech.SsmlVoiceGender.NEUTRAL)

    synthesis_input = texttospeech.SynthesisInput(text=text)
    voice = texttospeech.VoiceSelectionParams(language_code=language_code, ssml_gender=voice_gender)
    audio_config = texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3)
    response = tts_client.synthesize_speech(input=synthesis_input, voice=voice, audio_config=audio_config)

    # Random name so concurrent requests never share a temporary file; its
    # base name also becomes the S3 object name.
    output_filename = f"temp_{os.urandom(24).hex()}.mp3"
    try:
        with open(output_filename, "wb") as out:
            out.write(response.audio_content)
        url = upload_to_s3(output_filename, bucket_name, folder_name)
    finally:
        # Clean up the local copy. If the file was never created (open()
        # failed), do not let the cleanup error hide the original exception.
        try:
            os.remove(output_filename)
        except FileNotFoundError:
            pass

    # upload_to_s3 returns an error message instead of raising for some
    # failures; surface that as a server error rather than a 200 response
    # whose "url" field holds the error text.
    if not isinstance(url, str) or not url.startswith("https://"):
        raise HTTPException(status_code=500, detail=f"Upload to S3 failed: {url}")

    return JSONResponse(content={"url": url})

@app.post("/speech-to-text/")
async def speech_to_text(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file with Google Speech-to-Text.

    The audio is recognized as ``en-US``. The encoding is left unspecified
    and no sample rate is sent.

    Args:
        file: Uploaded audio file.

    Returns:
        ``{"transcript": <text>}`` where the text is the top alternative of
        every result, concatenated in order.

    Raises:
        HTTPException: 400 when the uploaded file is empty.
    """
    content = await file.read()
    if not content:
        # Reject early instead of forwarding an empty payload to the API.
        raise HTTPException(status_code=400, detail="Uploaded audio file is empty")

    audio = speech.RecognitionAudio(content=content)
    # sample_rate_hertz is deliberately omitted: with an unspecified encoding
    # the service takes the rate from the WAV/FLAC header, and a hard-coded
    # 44100 is rejected for files recorded at any other rate.
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        language_code="en-US",
    )
    client = speech.SpeechClient()
    response = client.recognize(config=config, audio=audio)
    # Skip results without alternatives so indexing [0] cannot fail.
    transcript = "".join(
        result.alternatives[0].transcript
        for result in response.results
        if result.alternatives
    )
    return {"transcript": transcript}

if __name__ == "__main__":
    # When run as a script, start uvicorn with the TLS key and certificate
    # files named below (paths are relative to the working directory).
    import uvicorn

    server_options = {
        "host": "0.0.0.0",
        "port": 8010,
        "ssl_keyfile": "privkey.pem",
        "ssl_certfile": "fullchain.pem",
    }
    uvicorn.run(app, **server_options)
# Back to Directory File Manager