Feat/tool-D-ID (#6278)

This commit is contained in:
Matri 2024-08-09 11:05:33 +08:00 committed by GitHub
parent 633808de06
commit 4dfa8eedb8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 476 additions and 0 deletions

View File

@ -0,0 +1,14 @@
<svg viewBox="0 0 40 40" xmlns="http://www.w3.org/2000/svg">
<rect x="0" y="0" width="40" height="40" style="fill: #0d0a08;"></rect>
<g clip-path="url(#clip0_269_13)" transform="matrix(0.429227, 0, 0, 0.429227, 6.326543, 9.593137)" style="background-color: f4f3f2">
<path d="M6.05159 7.04111H0.5V44.0227H6.05159C13.5 44.0227 16.6023 42.1692 16.6023 34.1718V16.8831C16.6023 8.791 13.503 7.03223 6.05159 7.03223V7.03815V7.04111ZM11.9755 34.1718C11.9755 38.7019 10.5898 39.3948 6.09591 39.3948H5.12091V11.6601H6.09591C10.5839 11.6601 11.9755 12.353 11.9755 16.8831V34.1718Z" fill="white"></path>
<path d="M18.9834 26.2188V29.9169H25.9207V26.2188H18.9834Z" fill="white"></path>
<path d="M28.562 13.9783V44.0225H33.1888V13.9783H28.562Z" fill="white"></path>
<path d="M41.3822 7.04111H35.8306V44.0227H41.3822C48.8306 44.0227 51.9358 42.1692 51.9358 34.1718V16.8831C51.9358 8.791 48.8365 7.03223 41.3822 7.03223V7.03815V7.04111ZM47.306 34.1718C47.306 38.7019 45.9203 39.3948 41.4265 39.3948H40.4515V11.6601H41.4265C45.9144 11.6601 47.306 12.353 47.306 16.8831V34.1718Z" fill="white"></path>
<path d="M30.8758 11.2278C32.2775 11.2278 33.4138 10.0917 33.4138 8.69032C33.4138 7.2889 32.2775 6.15283 30.8758 6.15283C29.4742 6.15283 28.3379 7.2889 28.3379 8.69032C28.3379 10.0917 29.4742 11.2278 30.8758 11.2278Z" fill="#FF882E"></path>
<path d="M36.191 4.02677C36.9621 4.02677 37.5885 3.40202 37.5885 2.62923C37.5885 1.85644 36.9621 1.23169 36.191 1.23169C35.4198 1.23169 34.7935 1.85644 34.7935 2.62923C34.7935 3.40202 35.4198 4.02677 36.191 4.02677Z" fill="#FF882E"></path>
<path d="M42.1978 2.09631C42.7769 2.09631 43.2467 1.62553 43.2467 1.04816C43.2467 0.470782 42.7769 0 42.1978 0C41.6187 0 41.1489 0.470782 41.1489 1.04816C41.1489 1.62553 41.6187 2.09631 42.1978 2.09631Z" fill="#FF882E"></path>
<path d="M47.8467 3.14734C48.4258 3.14734 48.8956 2.67656 48.8956 2.09918C48.8956 1.52181 48.4258 1.05103 47.8467 1.05103C47.2676 1.05103 46.7979 1.52181 46.7979 2.09918C46.7979 2.67656 47.2676 3.14734 47.8467 3.14734Z" fill="#FF882E"></path>
<path d="M55.9065 53C54.7276 53 53.729 52.6239 53.0081 52.3515L52.7422 52.2538C51.5367 51.8156 50.3726 51.3774 49.2854 50.951C48.6826 50.7142 48.3842 50.0332 48.6206 49.4291C48.857 48.8251 49.5395 48.529 50.1422 48.7659C51.2117 49.1863 52.3581 49.6157 53.5488 50.048C53.6433 50.0835 53.7408 50.119 53.8383 50.1575C54.6449 50.4625 55.5608 50.8089 56.5654 50.5839C57.4635 50.3825 58.0219 50.0391 58.3144 49.5091C58.5892 49.0117 58.5035 48.6593 58.3144 48.0227C58.1549 47.4897 57.9599 46.8265 58.214 46.107C58.4976 45.3016 59.0738 44.9078 59.4963 44.6206C59.8833 44.3542 59.9631 44.2831 60.0074 44.0581C60.1049 43.5606 59.8272 43.3001 59.7297 43.2261C59.2895 43.0662 58.9763 42.6516 58.9585 42.1661C58.9379 41.5916 59.3338 41.0883 59.8951 40.9728C59.9956 40.9521 60.2999 40.8899 60.5451 40.6412C60.6722 40.5139 60.8908 40.2474 60.9115 39.8891C60.9292 39.5605 60.7549 39.291 60.4683 38.8913C60.1492 38.4501 59.5554 37.627 60.1049 36.7032C60.5392 35.9719 61.4581 35.5899 62.1967 35.282C62.3504 35.2168 62.4892 35.1606 62.5956 35.1103C63.0388 34.8911 63.2338 34.6484 63.1392 33.8696C63.1097 33.6357 63.0004 33.4492 62.9295 33.3485C61.9456 32.0813 61.0297 30.8081 60.1315 29.4579C59.3397 28.2617 58.8079 27.3823 58.601 26.1328C58.3913 24.8804 59.0472 22.5916 59.124 22.334C59.907 19.6692 59.9424 17.641 58.0367 13.321C56.5979 10.064 54.376 7.8345 52.7658 6.53762C52.2606 6.13198 52.1808 5.39176 52.5885 4.88841C52.9963 4.38209 53.7349 4.30215 54.2401 4.71075C56.8401 6.8041 58.8935 9.4541 60.1847 12.3735C62.1435 16.8119 62.4331 19.3938 61.3783 22.9943C61.1006 23.9388 60.8465 25.3008 60.9204 25.7479C61.0415 26.4792 61.3192 26.9915 62.0904 28.161C62.959 29.4668 63.8454 30.6985 64.7997 31.9243C64.8085 31.9362 64.8174 31.9451 64.8233 31.9569C65.0685 32.2944 65.3788 32.8511 65.4704 33.5824C65.7244 35.7084 64.6135 36.7299 63.6385 37.2125C63.4967 37.2835 63.3076 37.3635 63.1008 37.4494C62.9531 37.5115 62.7226 37.6063 62.5129 37.707C62.8645 38.2103 63.3195 38.9742 63.2604 40.0165C63.2131 40.8603 62.8408 
41.6716 62.2115 42.2993C62.1613 42.3496 62.1081 42.4 62.052 42.4473C62.3622 43.0721 62.4567 43.7857 62.3179 44.5022C62.0845 45.6954 61.2956 46.2343 60.8258 46.5541C60.6249 46.6903 60.492 46.7851 60.4476 46.8561C60.4565 46.9597 60.5245 47.1818 60.5717 47.3476C60.7845 48.0612 61.139 49.2574 60.3767 50.6372C59.7533 51.7682 58.6454 52.5173 57.0883 52.8667C56.6806 52.9585 56.2876 52.997 55.9124 52.997L55.9065 53Z" fill="#FF882E"></path>
</g>
</svg>

After

Width:  |  Height:  |  Size: 4.4 KiB

View File

@ -0,0 +1,21 @@
from core.tools.errors import ToolProviderCredentialValidationError
from core.tools.provider.builtin.did.tools.talks import TalksTool
from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
class DIDProvider(BuiltinToolProviderController):
    """Built-in tool provider for D-ID."""

    def _validate_credentials(self, credentials: dict) -> None:
        """Check the supplied credentials by invoking the talks tool once.

        The talks tool is run with a fixed sample image URL and a short text
        script, using ``credentials`` as its runtime credentials.

        Args:
            credentials: Provider credentials to validate.

        Raises:
            ToolProviderCredentialValidationError: If the sample invocation
                raises any exception; the message is ``str()`` of that
                exception and the original is attached as ``__cause__``.
        """
        sample_parameters = {
            "source_url": "https://www.d-id.com/wp-content/uploads/2023/11/Hero-image-1.png",
            "text_input": "Hello, welcome to use D-ID tool in Dify",
        }
        try:
            tool = TalksTool().fork_tool_runtime(runtime={"credentials": credentials})
            tool.invoke(user_id='', tool_parameters=sample_parameters)
        except Exception as e:
            # Chain explicitly so the root cause stays visible in tracebacks.
            raise ToolProviderCredentialValidationError(str(e)) from e

View File

@ -0,0 +1,28 @@
identity:
author: Matri Qi
name: did
label:
en_US: D-ID
description:
en_US: D-ID is a tool enabling the creation of high-quality, custom videos of Digital Humans from a single image.
icon: icon.svg
tags:
- videos
credentials_for_provider:
did_api_key:
type: secret-input
required: true
label:
en_US: D-ID API Key
placeholder:
en_US: Please input your D-ID API key
help:
en_US: Get your D-ID API key from your D-ID account settings.
url: https://studio.d-id.com/account-settings
base_url:
type: text-input
required: false
label:
en_US: D-ID server's Base URL
placeholder:
en_US: https://api.d-id.com

View File

@ -0,0 +1,87 @@
import logging
import time
from collections.abc import Mapping
from typing import Any
import requests
from requests.exceptions import HTTPError
logger = logging.getLogger(__name__)
class DIDApp:
def __init__(self, api_key: str | None = None, base_url: str | None = None):
self.api_key = api_key
self.base_url = base_url or 'https://api.d-id.com'
if not self.api_key:
raise ValueError('API key is required')
def _prepare_headers(self, idempotency_key: str | None = None):
headers = {'Content-Type': 'application/json', 'Authorization': f'Basic {self.api_key}'}
if idempotency_key:
headers['Idempotency-Key'] = idempotency_key
return headers
def _request(
self,
method: str,
url: str,
data: Mapping[str, Any] | None = None,
headers: Mapping[str, str] | None = None,
retries: int = 3,
backoff_factor: float = 0.3,
) -> Mapping[str, Any] | None:
for i in range(retries):
try:
response = requests.request(method, url, json=data, headers=headers)
response.raise_for_status()
return response.json()
except requests.exceptions.RequestException as e:
if i < retries - 1 and isinstance(e, HTTPError) and e.response.status_code >= 500:
time.sleep(backoff_factor * (2**i))
else:
raise
return None
def talks(self, wait: bool = True, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs):
endpoint = f'{self.base_url}/talks'
headers = self._prepare_headers(idempotency_key)
data = kwargs['params']
logger.debug(f'Send request to {endpoint=} body={data}')
response = self._request('POST', endpoint, data, headers)
if response is None:
raise HTTPError('Failed to initiate D-ID talks after multiple retries')
id: str = response['id']
if wait:
return self._monitor_job_status(id=id, target='talks', poll_interval=poll_interval)
return id
def animations(self, wait: bool = True, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs):
endpoint = f'{self.base_url}/animations'
headers = self._prepare_headers(idempotency_key)
data = kwargs['params']
logger.debug(f'Send request to {endpoint=} body={data}')
response = self._request('POST', endpoint, data, headers)
if response is None:
raise HTTPError('Failed to initiate D-ID talks after multiple retries')
id: str = response['id']
if wait:
return self._monitor_job_status(target='animations', id=id, poll_interval=poll_interval)
return id
def check_did_status(self, target: str, id: str):
endpoint = f'{self.base_url}/{target}/{id}'
headers = self._prepare_headers()
response = self._request('GET', endpoint, headers=headers)
if response is None:
raise HTTPError(f'Failed to check status for talks {id} after multiple retries')
return response
def _monitor_job_status(self, target: str, id: str, poll_interval: int):
while True:
status = self.check_did_status(target=target, id=id)
if status['status'] == 'done':
return status
elif status['status'] == 'error' or status['status'] == 'rejected':
raise HTTPError(f'Talks {id} failed: {status["status"]} {status.get("error",{}).get("description")}')
time.sleep(poll_interval)

View File

@ -0,0 +1,49 @@
import json
from typing import Any, Union
from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.provider.builtin.did.did_appx import DIDApp
from core.tools.tool.builtin_tool import BuiltinTool
class AnimationsTool(BuiltinTool):
    """Tool that creates a D-ID animation from a source image."""

    def _invoke(
        self, user_id: str, tool_parameters: dict[str, Any]
    ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
        """Create an animation and return its final status as a text message.

        Args:
            user_id: Identifier of the invoking user (not used here).
            tool_parameters: Values for the parameters declared in the tool
                manifest (``source_url``, ``driver_url``, ``mute``, ``stitch``,
                ``logo_url``, ``logo_x``, ``logo_y``, ``result_format``).

        Returns:
            A text message holding the job result (JSON-encoded unless it is
            already a string), or a failure notice when the result is empty.

        Raises:
            ValueError: If ``source_url`` is missing, or ``logo_url`` is given
                without both logo coordinates.
        """
        credentials = self.runtime.credentials
        # base_url is an optional credential, so it may be absent entirely.
        app = DIDApp(api_key=credentials['did_api_key'], base_url=credentials.get('base_url'))

        source_url = tool_parameters.get('source_url')
        if not source_url:
            raise ValueError('Source URL is required')

        config = {
            'stitch': tool_parameters.get('stitch', True),
            'mute': tool_parameters.get('mute'),
            'result_format': tool_parameters.get('result_format') or 'mp4',
        }
        config = {k: v for k, v in config.items() if v is not None and v != ''}

        logo_url = tool_parameters.get('logo_url')
        if logo_url:
            logo_x = tool_parameters.get('logo_x')
            logo_y = tool_parameters.get('logo_y')
            # Compare against None/'' rather than truthiness: 0 is a valid coordinate.
            if logo_x is None or logo_x == '':
                raise ValueError('Logo X position is required when logo URL is provided')
            if logo_y is None or logo_y == '':
                raise ValueError('Logo Y position is required when logo URL is provided')
            # NOTE(review): payload shape follows the D-ID API reference
            # (config.logo = {url, position: [x, y]} with integer pixels) - confirm.
            config['logo'] = {'url': logo_url, 'position': [int(logo_x), int(logo_y)]}

        options = {
            'source_url': source_url,
            'driver_url': tool_parameters.get('driver_url'),
            'config': config,
        }
        options = {k: v for k, v in options.items() if v is not None and v != ''}

        animations_result = app.animations(params=options, wait=True)
        # Test for an empty result before serialising; json.dumps() of an empty
        # value is a non-empty string and would hide the failure.
        if not animations_result:
            return self.create_text_message('D-ID animations request failed.')
        if not isinstance(animations_result, str):
            animations_result = json.dumps(animations_result, ensure_ascii=False, indent=4)
        return self.create_text_message(animations_result)

View File

@ -0,0 +1,86 @@
identity:
name: animations
author: Matri Qi
label:
en_US: Animations
description:
human:
en_US: Animations enables the creation of videos that match head movements, expressions, emotions, and voice from a driver video and image.
llm: Animations enables the creation of videos that match head movements, expressions, emotions, and voice from a driver video and image.
parameters:
- name: source_url
type: string
required: true
label:
en_US: source url
human_description:
en_US: The URL of the source image to be animated by the driver video, or a selection from the list of provided studio actors.
llm_description: The URL of the source image to be animated by the driver video, or a selection from the list of provided studio actors.
form: llm
- name: driver_url
type: string
required: false
label:
en_US: driver url
human_description:
en_US: The URL of the driver video to drive the animation, or a provided driver name from D-ID.
form: form
- name: mute
type: boolean
required: false
label:
en_US: mute
human_description:
en_US: Mutes the driver sound in the animated video result, defaults to true
form: form
- name: stitch
type: boolean
required: false
label:
en_US: stitch
human_description:
en_US: If enabled, the driver video will be stitched with the animated head video.
form: form
- name: logo_url
type: string
required: false
label:
en_US: logo url
human_description:
en_US: The URL of the logo image to be added to the animation video.
form: form
- name: logo_x
type: number
required: false
label:
en_US: logo position x
human_description:
en_US: The x position of the logo image in the animation video. It's required when logo url is provided.
form: form
- name: logo_y
type: number
required: false
label:
en_US: logo position y
human_description:
en_US: The y position of the logo image in the animation video. It's required when logo url is provided.
form: form
- name: result_format
type: string
default: mp4
required: false
label:
en_US: result format
human_description:
en_US: The format of the result video.
form: form
options:
- value: mp4
label:
en_US: mp4
- value: gif
label:
en_US: gif
- value: mov
label:
en_US: mov

View File

@ -0,0 +1,65 @@
import json
from typing import Any, Union
from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.provider.builtin.did.did_appx import DIDApp
from core.tools.tool.builtin_tool import BuiltinTool
class TalksTool(BuiltinTool):
    """Tool that creates a D-ID talking-head video from text or audio."""

    def _invoke(
        self, user_id: str, tool_parameters: dict[str, Any]
    ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
        """Create a talk and return its final status as a text message.

        Args:
            user_id: Identifier of the invoking user (not used here).
            tool_parameters: Values for the parameters declared in the tool
                manifest (``source_url``, ``driver_url``, ``script_type``,
                ``text_input``, ``audio_url``, ``audio_reduce_noise``,
                ``stitch``, ``sharpen``, ``fluent``, ``result_format``,
                ``pad_audio``, ``driver_expressions``).

        Returns:
            A text message holding the job result (JSON-encoded unless it is
            already a string), or a failure notice when the result is empty.

        Raises:
            ValueError: If ``source_url`` is missing, ``driver_expressions`` is
                not valid JSON, or the input required by the script type
                (``text_input`` for text, ``audio_url`` for audio) is missing.
        """
        credentials = self.runtime.credentials
        # base_url is an optional credential, so it may be absent entirely.
        app = DIDApp(api_key=credentials['did_api_key'], base_url=credentials.get('base_url'))

        source_url = tool_parameters.get('source_url')
        if not source_url:
            raise ValueError('Source URL is required')

        driver_expressions_str = tool_parameters.get('driver_expressions')
        try:
            driver_expressions = json.loads(driver_expressions_str) if driver_expressions_str else None
        except json.JSONDecodeError as e:
            # JSONDecodeError is already a ValueError; re-raise with a message
            # that names the offending parameter.
            raise ValueError('driver_expressions must be a valid JSON string') from e

        script_type = tool_parameters.get('script_type') or 'text'
        script = {
            'type': script_type,
            'input': tool_parameters.get('text_input'),
            'audio_url': tool_parameters.get('audio_url'),
            'reduce_noise': tool_parameters.get('audio_reduce_noise', False),
        }
        script = {k: v for k, v in script.items() if v is not None and v != ''}
        # Keep only the fields that belong to the chosen script type, then
        # check that its mandatory input is present.
        if script_type == 'audio':
            script.pop('input', None)
            if not script.get('audio_url'):
                raise ValueError('Audio URL is required for audio script type')
        elif script_type == 'text':
            script.pop('audio_url', None)
            script.pop('reduce_noise', None)
            if not script.get('input'):
                raise ValueError('Text input is required for text script type')

        config = {
            'stitch': tool_parameters.get('stitch', True),
            'sharpen': tool_parameters.get('sharpen'),
            'fluent': tool_parameters.get('fluent'),
            'result_format': tool_parameters.get('result_format') or 'mp4',
            'pad_audio': tool_parameters.get('pad_audio'),
            'driver_expressions': driver_expressions,
        }
        config = {k: v for k, v in config.items() if v is not None and v != ''}

        options = {
            'source_url': source_url,
            'driver_url': tool_parameters.get('driver_url'),
            'script': script,
            'config': config,
        }
        options = {k: v for k, v in options.items() if v is not None and v != ''}

        talks_result = app.talks(params=options, wait=True)
        # Test for an empty result before serialising; json.dumps() of an empty
        # value is a non-empty string and would hide the failure.
        if not talks_result:
            return self.create_text_message('D-ID talks request failed.')
        if not isinstance(talks_result, str):
            talks_result = json.dumps(talks_result, ensure_ascii=False, indent=4)
        return self.create_text_message(talks_result)

View File

@ -0,0 +1,126 @@
identity:
name: talks
author: Matri Qi
label:
en_US: Talks
description:
human:
en_US: Talks enables the creation of realistic talking head videos from text or audio inputs.
llm: Talks enables the creation of realistic talking head videos from text or audio inputs.
parameters:
- name: source_url
type: string
required: true
label:
en_US: source url
human_description:
en_US: The URL of the source image to be animated by the driver video, or a selection from the list of provided studio actors.
llm_description: The URL of the source image to be animated by the driver video, or a selection from the list of provided studio actors.
form: llm
- name: driver_url
type: string
required: false
label:
en_US: driver url
human_description:
en_US: The URL of the driver video to drive the talk, or a provided driver name from D-ID.
form: form
- name: script_type
type: string
required: false
label:
en_US: script type
human_description:
en_US: The type of the script.
form: form
options:
- value: text
label:
en_US: text
- value: audio
label:
en_US: audio
- name: text_input
type: string
required: false
label:
en_US: text input
human_description:
en_US: The text input to be spoken by the talking head. Required when script type is text.
form: form
- name: audio_url
type: string
required: false
label:
en_US: audio url
human_description:
en_US: The URL of the audio file to be spoken by the talking head. Required when script type is audio.
form: form
- name: audio_reduce_noise
type: boolean
required: false
label:
en_US: audio reduce noise
human_description:
en_US: If enabled, the audio will be processed to reduce noise before being spoken by the talking head. It only works when script type is audio.
form: form
- name: stitch
type: boolean
required: false
label:
en_US: stitch
human_description:
en_US: If enabled, the driver video will be stitched with the talking head video.
form: form
- name: sharpen
type: boolean
required: false
label:
en_US: sharpen
human_description:
en_US: If enabled, the talking head video will be sharpened.
form: form
- name: result_format
type: string
required: false
label:
en_US: result format
human_description:
en_US: The format of the result video.
form: form
options:
- value: mp4
label:
en_US: mp4
- value: gif
label:
en_US: gif
- value: mov
label:
en_US: mov
- name: fluent
type: boolean
required: false
label:
en_US: fluent
human_description:
en_US: Interpolates between the last and first frames of the driver video. When used together with pad_audio, it can create a seamless transition between videos of the same driver.
form: form
- name: pad_audio
type: number
required: false
label:
en_US: pad audio
human_description:
en_US: Pads the audio with silence at the end (given in seconds). This increases the video duration and the credits it consumes.
form: form
min: 1
max: 60
- name: driver_expressions
type: string
required: false
label:
en_US: driver expressions
human_description:
en_US: Timed expressions for the animation. It should be a JSON array string. See the D-ID documentation (https://docs.d-id.com/reference/createtalk) for more information.
form: form