- from collections.abc import Generator
- import concurrent.futures
- from functools import reduce
- from io import BytesIO
- from typing import Optional
- from openai import OpenAI
- from pydub import AudioSegment
- from dify_plugin import TTSModel
- from dify_plugin.errors.model import (
- CredentialsValidateFailedError,
- InvokeBadRequestError,
- )
- from ..common_openai import _CommonOpenAI
class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
    """
    Model class for the OpenAI text-to-speech model.
    """

    def _invoke(
        self,
        model: str,
        credentials: dict,
        content_text: str,
        voice: str,
        user: Optional[str] = None,
    ) -> bytes | Generator[bytes, None, None]:
        """
        Invoke the text2speech model.

        :param model: model name
        :param credentials: model credentials
        :param content_text: text content to be translated
        :param voice: model timbre
        :param user: unique user id
        :return: synthesized audio, streamed as a generator of byte chunks
        :raises InvokeBadRequestError: if the model exposes no voices
        """
        voices = self.get_tts_model_voices(model=model, credentials=credentials)
        if not voices:
            raise InvokeBadRequestError("No voices found for the model")
        # Fall back to the model's default voice when the requested one is
        # missing or not among the advertised voice values.
        if not voice or voice not in [d["value"] for d in voices]:
            voice = self._get_model_default_voice(model, credentials)
        return self._tts_invoke_streaming(
            model=model, credentials=credentials, content_text=content_text, voice=voice
        )

    def validate_credentials(
        self, model: str, credentials: dict, user: Optional[str] = None
    ) -> None:
        """
        Validate credentials for the text2speech model by synthesizing a short sample.

        :param model: model name
        :param credentials: model credentials
        :param user: unique user id
        :raises CredentialsValidateFailedError: if the test synthesis fails
        """
        try:
            self._tts_invoke(
                model=model,
                credentials=credentials,
                content_text="Hello Dify!",
                voice=self._get_model_default_voice(model, credentials),
            )
        except Exception as ex:
            # Chain the cause so the original failure is preserved in tracebacks.
            raise CredentialsValidateFailedError(str(ex)) from ex

    def _tts_invoke(
        self, model: str, credentials: dict, content_text: str, voice: str
    ) -> bytes:
        """
        Synthesize content_text into a single audio blob (non-streaming).

        The text is split into sentence-sized chunks, synthesized concurrently
        in a thread pool, and the resulting segments are concatenated with pydub.

        :param model: model name
        :param credentials: model credentials
        :param content_text: text content to be translated
        :param voice: model timbre
        :return: combined audio bytes in the model's configured audio format
        :raises InvokeBadRequestError: on any synthesis failure or empty result
        """
        audio_type = self._get_model_audio_type(model, credentials)
        word_limit = self._get_model_word_limit(model, credentials) or 500
        max_workers = self._get_model_workers_limit(model, credentials)
        try:
            sentences = list(
                self._split_text_into_sentences(
                    org_text=content_text, max_length=word_limit
                )
            )
            audio_bytes_list = []
            # Create a thread pool and synthesize each sentence concurrently.
            with concurrent.futures.ThreadPoolExecutor(
                max_workers=max_workers
            ) as executor:
                futures = [
                    executor.submit(
                        self._process_sentence,
                        sentence=sentence,
                        model=model,
                        voice=voice,
                        credentials=credentials,
                    )
                    for sentence in sentences
                ]
                for future in futures:
                    try:
                        # Fetch the result once instead of calling result() twice.
                        audio_bytes = future.result()
                        if audio_bytes:
                            audio_bytes_list.append(audio_bytes)
                    except Exception as ex:
                        raise InvokeBadRequestError(str(ex))
            if not audio_bytes_list:
                raise InvokeBadRequestError("No audio bytes found")
            audio_segments = [
                AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type)
                for audio_bytes in audio_bytes_list
                if audio_bytes
            ]
            combined_segment = reduce(lambda x, y: x + y, audio_segments)
            buffer: BytesIO = BytesIO()
            combined_segment.export(buffer, format=audio_type)
            buffer.seek(0)
            return buffer.read()
        except Exception as ex:
            raise InvokeBadRequestError(str(ex))

    def _tts_invoke_streaming(
        self, model: str, credentials: dict, content_text: str, voice: str
    ) -> Generator[bytes, None, None]:
        """
        Stream synthesized audio for content_text as mp3 byte chunks.

        doc: https://platform.openai.com/docs/guides/text-to-speech

        :param model: model name
        :param credentials: model credentials
        :param content_text: text content to be translated
        :param voice: model timbre
        :return: generator yielding mp3 byte chunks (1024 bytes each)
        :raises InvokeBadRequestError: on any synthesis failure
        """
        try:
            credentials_kwargs = self._to_credential_kwargs(credentials)
            client = OpenAI(**credentials_kwargs)
            voices = self.get_tts_model_voices(model=model, credentials=credentials)
            if not voices:
                raise InvokeBadRequestError("No voices found for the model")
            # BUG FIX: voices is a list of dicts, so the original plain
            # membership test (`voice not in voices`) always failed and silently
            # forced the default voice. Compare against the "value" field,
            # consistent with _invoke.
            if not voice or voice not in [d["value"] for d in voices]:
                voice = self._get_model_default_voice(model, credentials)
            word_limit = self._get_model_word_limit(model, credentials) or 500
            if len(content_text) > word_limit:
                # _split_text_into_sentences may yield lazily; materialize so
                # len() is safe (the non-streaming path already does this).
                sentences = list(
                    self._split_text_into_sentences(
                        content_text, max_length=word_limit
                    )
                )
                # Use context managers so the pool is shut down and each
                # streaming HTTP response is closed (the original leaked both
                # by calling __enter__() without ever exiting).
                with concurrent.futures.ThreadPoolExecutor(
                    max_workers=min(3, len(sentences))
                ) as executor:
                    futures = [
                        executor.submit(
                            client.audio.speech.with_streaming_response.create,
                            model=model,
                            response_format="mp3",
                            input=sentence,
                            voice=voice,  # type: ignore
                        )
                        for sentence in sentences
                    ]
                    for future in futures:
                        with future.result() as response:
                            yield from response.iter_bytes(1024)
            else:
                with client.audio.speech.with_streaming_response.create(
                    model=model,
                    voice=voice,  # type: ignore
                    response_format="mp3",
                    input=content_text.strip(),
                ) as response:
                    yield from response.iter_bytes(1024)
        except Exception as ex:
            raise InvokeBadRequestError(str(ex))

    def _process_sentence(self, sentence: str, model: str, voice, credentials: dict):
        """
        Synthesize a single sentence via the OpenAI speech API.

        :param sentence: text content to be translated
        :param model: model name
        :param voice: model timbre
        :param credentials: model credentials
        :return: audio bytes, or None if the API returned non-bytes data
        """
        # Transform credentials to kwargs for the client instance.
        credentials_kwargs = self._to_credential_kwargs(credentials)
        client = OpenAI(**credentials_kwargs)
        response = client.audio.speech.create(
            model=model, voice=voice, input=sentence.strip()
        )
        # BUG FIX: the original called response.read() twice; a second read of a
        # consumed HTTP body can return empty bytes. Read once and reuse.
        data = response.read()
        if isinstance(data, bytes):
            return data