| 
					
				 | 
			
			
				@@ -1,11 +1,7 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 import concurrent.futures 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-from functools import reduce 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-from io import BytesIO 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from typing import Optional 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-from flask import Response 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from openai import OpenAI 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-from pydub import AudioSegment 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from core.model_runtime.errors.invoke import InvokeBadRequestError 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 from core.model_runtime.errors.validate import CredentialsValidateFailedError 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -32,7 +28,8 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         :return: text translated to audio file 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if not voice or voice not in [d['value'] for d in 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                                      self.get_tts_model_voices(model=model, credentials=credentials)]: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             voice = self._get_model_default_voice(model, credentials) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         # if streaming: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         return self._tts_invoke_streaming(model=model, 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -50,7 +47,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         :return: text translated to audio file 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            self._tts_invoke( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            self._tts_invoke_streaming( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 model=model, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 credentials=credentials, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 content_text='Hello Dify!', 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -59,46 +56,6 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         except Exception as ex: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             raise CredentialsValidateFailedError(str(ex)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        _tts_invoke text2speech model 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        :param model: model name 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        :param credentials: model credentials 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        :param content_text: text content to be translated 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        :param voice: model timbre 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        :return: text translated to audio file 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        audio_type = self._get_model_audio_type(model, credentials) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        word_limit = self._get_model_word_limit(model, credentials) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        max_workers = self._get_model_workers_limit(model, credentials) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            audio_bytes_list = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            # Create a thread pool and map the function to the list of sentences 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                futures = [executor.submit(self._process_sentence, sentence=sentence, model=model, voice=voice, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                                           credentials=credentials) for sentence in sentences] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                for future in futures: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                    try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                        if future.result(): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                            audio_bytes_list.append(future.result()) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                    except Exception as ex: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                        raise InvokeBadRequestError(str(ex)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            if len(audio_bytes_list) > 0: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                                  audio_bytes_list if audio_bytes] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                combined_segment = reduce(lambda x, y: x + y, audio_segments) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                buffer: BytesIO = BytesIO() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                combined_segment.export(buffer, format=audio_type) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                buffer.seek(0) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-                return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        except Exception as ex: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            raise InvokeBadRequestError(str(ex)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                               voice: str) -> any: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         """ 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -114,7 +71,8 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             # doc: https://platform.openai.com/docs/guides/text-to-speech 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             credentials_kwargs = self._to_credential_kwargs(credentials) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             client = OpenAI(**credentials_kwargs) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            model_support_voice = [x.get("value") for x in self.get_tts_model_voices(model=model, credentials=credentials)] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            model_support_voice = [x.get("value") for x in 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                                   self.get_tts_model_voices(model=model, credentials=credentials)] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             if not voice or voice not in model_support_voice: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				                 voice = self._get_model_default_voice(model, credentials) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             word_limit = self._get_model_word_limit(model, credentials) 
			 |