mirror of
https://github.com/langgenius/dify.git
synced 2024-11-16 11:42:29 +08:00
Add tts document&fix bug (#2156)
Co-authored-by: luowei <glpat-EjySCyNjWiLqAED-YmwM> Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com> Co-authored-by: Yeuoly <45712896+Yeuoly@users.noreply.github.com>
This commit is contained in:
parent
ad71386adf
commit
fd5c45ae10
|
@ -32,6 +32,7 @@ class ChatAudioApi(InstalledAppResource):
|
|||
response = AudioService.transcript_asr(
|
||||
tenant_id=app_model.tenant_id,
|
||||
file=file,
|
||||
end_user=None
|
||||
)
|
||||
|
||||
return response
|
||||
|
|
|
@ -66,6 +66,7 @@ class TextApi(AppApiResource):
|
|||
parser = reqparse.RequestParser()
|
||||
parser.add_argument('text', type=str, required=True, nullable=False, location='json')
|
||||
parser.add_argument('user', type=str, required=True, nullable=False, location='json')
|
||||
parser.add_argument('streaming', type=bool, required=False, nullable=False, location='json')
|
||||
args = parser.parse_args()
|
||||
|
||||
try:
|
||||
|
@ -73,7 +74,7 @@ class TextApi(AppApiResource):
|
|||
tenant_id=app_model.tenant_id,
|
||||
text=args['text'],
|
||||
end_user=args['user'],
|
||||
streaming=False
|
||||
streaming=args['streaming']
|
||||
)
|
||||
|
||||
return response
|
||||
|
|
|
@ -31,6 +31,7 @@ class AudioApi(WebApiResource):
|
|||
response = AudioService.transcript_asr(
|
||||
tenant_id=app_model.tenant_id,
|
||||
file=file,
|
||||
end_user=end_user
|
||||
)
|
||||
|
||||
return response
|
||||
|
|
|
@ -13,6 +13,7 @@ This module provides the interface for invoking and authenticating various model
|
|||
- `Text Embedding Model` - Text Embedding, pre-computed tokens capability
|
||||
- `Rerank Model` - Segment Rerank capability
|
||||
- `Speech-to-text Model` - Speech to text capability
|
||||
- `Text-to-speech Model` - Text to speech capability
|
||||
- `Moderation` - Moderation capability
|
||||
|
||||
- Model provider display
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
- `Text Embedding Model` - 文本 Embedding ,预计算 tokens 能力
|
||||
- `Rerank Model` - 分段 Rerank 能力
|
||||
- `Speech-to-text Model` - 语音转文本能力
|
||||
- `Text-to-speech Model` - 文本转语音能力
|
||||
- `Moderation` - Moderation 能力
|
||||
|
||||
- 模型供应商展示
|
||||
|
|
|
@ -299,9 +299,7 @@ Inherit the `__base.speech2text_model.Speech2TextModel` base class and implement
|
|||
- Invoke Invocation
|
||||
|
||||
```python
|
||||
def _invoke(self, model: str, credentials: dict,
|
||||
file: IO[bytes], user: Optional[str] = None) \
|
||||
-> str:
|
||||
def _invoke(self, model: str, credentials: dict, file: IO[bytes], user: Optional[str] = None) -> str:
|
||||
"""
|
||||
Invoke large language model
|
||||
|
||||
|
@ -331,6 +329,46 @@ Inherit the `__base.speech2text_model.Speech2TextModel` base class and implement
|
|||
|
||||
The string after speech-to-text conversion.
|
||||
|
||||
### Text2speech
|
||||
|
||||
Inherit the `__base.text2speech_model.Text2SpeechModel` base class and implement the following interfaces:
|
||||
|
||||
- Invoke Invocation
|
||||
|
||||
```python
|
||||
def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None):
|
||||
"""
|
||||
Invoke text-to-speech model
|
||||
|
||||
:param model: model name
|
||||
:param credentials: model credentials
|
||||
:param content_text: text content to be converted to speech
|
||||
:param streaming: output is streaming
|
||||
:param user: unique user id
|
||||
:return: converted audio file
|
||||
"""
|
||||
```
|
||||
|
||||
- Parameters:
|
||||
|
||||
- `model` (string) Model name
|
||||
|
||||
- `credentials` (object) Credential information
|
||||
|
||||
The parameters of credential information are defined by either the `provider_credential_schema` or `model_credential_schema` in the provider's YAML configuration file. Inputs such as `api_key` are included.
|
||||
|
||||
- `content_text` (string) The text content that needs to be converted
|
||||
|
||||
- `streaming` (bool) Whether to stream output
|
||||
|
||||
- `user` (string) [optional] Unique identifier of the user
|
||||
|
||||
This can help the provider monitor and detect abusive behavior.
|
||||
|
||||
- Returns:
|
||||
|
||||
The speech stream converted from the text.
|
||||
|
||||
### Moderation
|
||||
|
||||
Inherit the `__base.moderation_model.ModerationModel` base class and implement the following interfaces:
|
||||
|
|
|
@ -94,6 +94,7 @@ The currently supported model types are as follows:
|
|||
- `text_embedding` Text Embedding model
|
||||
- `rerank` Rerank model
|
||||
- `speech2text` Speech to text
|
||||
- `tts` Text to speech
|
||||
- `moderation` Moderation
|
||||
|
||||
Continuing with `Anthropic` as an example, since `Anthropic` only supports LLM, we create a `module` named `llm` in `model_providers.anthropic`.
|
||||
|
|
|
@ -47,6 +47,10 @@
|
|||
- `max_chunks` (int) Maximum number of chunks (available for model types `text-embedding`, `moderation`)
|
||||
- `file_upload_limit` (int) Maximum file upload limit, in MB (available for model type `speech2text`)
|
||||
- `supported_file_extensions` (string) Supported file extension formats, e.g., mp3, mp4 (available for model type `speech2text`)
|
||||
- `default_voice` (string) Default voice, e.g., alloy, echo, fable, onyx, nova, shimmer (available for model type `tts`)
|
||||
- `word_limit` (int) Word limit for a single conversion; text is split by paragraph by default (available for model type `tts`)
|
||||
- `audio_type` (string) Supported audio file extension format, e.g., mp3, wav (available for model type `tts`)
|
||||
- `max_workers` (int) Number of concurrent workers for text-to-audio conversion (available for model type `tts`)
|
||||
- `max_characters_per_chunk` (int) Maximum characters per chunk (available for model type `moderation`)
|
||||
- `parameter_rules` (array[[ParameterRule](#ParameterRule)]) [optional] Model invocation parameter rules
|
||||
- `pricing` ([PriceConfig](#PriceConfig)) [optional] Pricing information
|
||||
|
@ -58,6 +62,7 @@
|
|||
- `text-embedding` Text Embedding model
|
||||
- `rerank` Rerank model
|
||||
- `speech2text` Speech to text
|
||||
- `tts` Text to speech
|
||||
- `moderation` Moderation
|
||||
|
||||
### ConfigurateMethod
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
- `text_embedding` 文本 Embedding 模型
|
||||
- `rerank` Rerank 模型
|
||||
- `speech2text` 语音转文字
|
||||
- `tts` 文字转语音
|
||||
- `moderation` 审查
|
||||
|
||||
`Xinference`支持`LLM`和`Text Embedding`和Rerank,那么我们开始编写`xinference.yaml`。
|
||||
|
|
|
@ -369,6 +369,46 @@ class XinferenceProvider(Provider):
|
|||
|
||||
语音转换后的字符串。
|
||||
|
||||
### Text2speech
|
||||
|
||||
继承 `__base.text2speech_model.Text2SpeechModel` 基类,实现以下接口:
|
||||
|
||||
- Invoke 调用
|
||||
|
||||
```python
|
||||
def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None):
|
||||
"""
|
||||
Invoke text-to-speech model
|
||||
|
||||
:param model: model name
|
||||
:param credentials: model credentials
|
||||
:param content_text: text content to be converted to speech
|
||||
:param streaming: output is streaming
|
||||
:param user: unique user id
|
||||
:return: converted audio file
|
||||
"""
|
||||
```
|
||||
|
||||
- 参数:
|
||||
|
||||
- `model` (string) 模型名称
|
||||
|
||||
- `credentials` (object) 凭据信息
|
||||
|
||||
凭据信息的参数由供应商 YAML 配置文件的 `provider_credential_schema` 或 `model_credential_schema` 定义,传入如:`api_key` 等。
|
||||
|
||||
- `content_text` (string) 需要转换的文本内容
|
||||
|
||||
- `streaming` (bool) 是否进行流式输出
|
||||
|
||||
- `user` (string) [optional] 用户的唯一标识符
|
||||
|
||||
可以帮助供应商监控和检测滥用行为。
|
||||
|
||||
- 返回:
|
||||
|
||||
文本转换后的语音流。
|
||||
|
||||
### Moderation
|
||||
|
||||
继承 `__base.moderation_model.ModerationModel` 基类,实现以下接口:
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
- `text_embedding` 文本 Embedding 模型
|
||||
- `rerank` Rerank 模型
|
||||
- `speech2text` 语音转文字
|
||||
- `tts` 文字转语音
|
||||
- `moderation` 审查
|
||||
|
||||
依旧以 `Anthropic` 为例,`Anthropic` 仅支持 LLM,因此在 `model_providers.anthropic` 创建一个 `llm` 为名称的 `module`。
|
||||
|
|
|
@ -48,6 +48,10 @@
|
|||
- `max_chunks` (int) 最大分块数量 (模型类型 `text-embedding` `moderation` 可用)
|
||||
- `file_upload_limit` (int) 文件最大上传限制,单位:MB。(模型类型 `speech2text` 可用)
|
||||
- `supported_file_extensions` (string) 支持文件扩展格式,如:mp3,mp4(模型类型 `speech2text` 可用)
|
||||
- `default_voice` (string) 缺省音色,可选:alloy,echo,fable,onyx,nova,shimmer(模型类型 `tts` 可用)
|
||||
- `word_limit` (int) 单次转换字数限制,默认按段落分段(模型类型 `tts` 可用)
|
||||
- `audio_type` (string) 支持音频文件扩展格式,如:mp3,wav(模型类型 `tts` 可用)
|
||||
- `max_workers` (int) 支持文字音频转换并发任务数(模型类型 `tts` 可用)
|
||||
- `max_characters_per_chunk` (int) 每块最大字符数 (模型类型 `moderation` 可用)
|
||||
- `parameter_rules` (array[[ParameterRule](#ParameterRule)]) [optional] 模型调用参数规则
|
||||
- `pricing` ([PriceConfig](#PriceConfig)) [optional] 价格信息
|
||||
|
@ -59,6 +63,7 @@
|
|||
- `text-embedding` 文本 Embedding 模型
|
||||
- `rerank` Rerank 模型
|
||||
- `speech2text` 语音转文字
|
||||
- `tts` 文字转语音
|
||||
- `moderation` 审查
|
||||
|
||||
### ConfigurateMethod
|
||||
|
|
|
@ -5,3 +5,8 @@ model_properties:
|
|||
word_limit: 120
|
||||
audio_type: 'mp3'
|
||||
max_workers: 5
|
||||
pricing:
|
||||
input: '1'
|
||||
output: '0'
|
||||
unit: '0.0001'
|
||||
currency: RMB
|
||||
|
|
|
@ -62,7 +62,6 @@ bs4~=0.0.1
|
|||
markdown~=3.5.1
|
||||
google-generativeai~=0.3.2
|
||||
httpx[socks]~=0.24.1
|
||||
pydub~=0.25.1
|
||||
matplotlib~=3.8.2
|
||||
yfinance~=0.2.35
|
||||
pydub~=0.25.1
|
||||
|
|
|
@ -56,7 +56,6 @@ class AudioService:
|
|||
raise ProviderNotSupportTextToSpeechServiceError()
|
||||
|
||||
try:
|
||||
audio_response = model_instance.invoke_tts(content_text=text.strip(), user=end_user, streaming=streaming)
|
||||
return audio_response
|
||||
return model_instance.invoke_tts(content_text=text.strip(), user=end_user, streaming=streaming)
|
||||
except Exception as e:
|
||||
raise e
|
||||
|
|
Loading…
Reference in New Issue
Block a user