Merge branch 'main' into fix/bedrock-claude

This commit is contained in:
chenhe 2024-03-29 13:46:54 +08:00
commit 2e48446194
23 changed files with 195 additions and 45 deletions

View File

@ -155,4 +155,4 @@ And that's it! Once your PR is merged, you will be featured as a contributor in
## Getting Help
If you ever get stuck or got a burning question while contributing, simply shoot your queries our way via the related GitHub issue, or hop onto our [Discord](https://discord.gg/AhzKf7dNgk) for a quick chat.
If you ever get stuck or got a burning question while contributing, simply shoot your queries our way via the related GitHub issue, or hop onto our [Discord](https://discord.gg/8Tpq4AcN9c) for a quick chat.

View File

@ -152,4 +152,4 @@ Dify的后端使用Python编写使用[Flask](https://flask.palletsprojects.co
## 获取帮助
如果你在贡献过程中遇到困难或者有任何问题,可以通过相关的 GitHub 问题提出你的疑问,或者加入我们的 [Discord](https://discord.gg/AhzKf7dNgk) 进行快速交流。
如果你在贡献过程中遇到困难或者有任何问题,可以通过相关的 GitHub 问题提出你的疑问,或者加入我们的 [Discord](https://discord.gg/8Tpq4AcN9c) 进行快速交流。

View File

@ -131,7 +131,7 @@ At the same time, please consider supporting Dify by sharing it on social media
### Translations
We are looking for contributors to help with translating Dify to languages other than Mandarin or English. If you are interested in helping, please see the [i18n README](https://github.com/langgenius/dify/blob/main/web/i18n/README.md) for more information, and leave us a comment in the `global-users` channel of our [Discord Community Server](https://discord.gg/AhzKf7dNgk).
We are looking for contributors to help with translating Dify to languages other than Mandarin or English. If you are interested in helping, please see the [i18n README](https://github.com/langgenius/dify/blob/main/web/i18n/README.md) for more information, and leave us a comment in the `global-users` channel of our [Discord Community Server](https://discord.gg/8Tpq4AcN9c).
## Community & Support

View File

@ -109,15 +109,16 @@ def reset_encrypt_key_pair():
click.echo(click.style('Sorry, only support SELF_HOSTED mode.', fg='red'))
return
tenant = db.session.query(Tenant).first()
tenants = db.session.query(Tenant).all()
for tenant in tenants:
if not tenant:
click.echo(click.style('Sorry, no workspace found. Please enter /install to initialize.', fg='red'))
return
tenant.encrypt_public_key = generate_key_pair(tenant.id)
db.session.query(Provider).filter(Provider.provider_type == 'custom').delete()
db.session.query(ProviderModel).delete()
db.session.query(Provider).filter(Provider.provider_type == 'custom', Provider.tenant_id == tenant.id).delete()
db.session.query(ProviderModel).filter(ProviderModel.tenant_id == tenant.id).delete()
db.session.commit()
click.echo(click.style('Congratulations! '

View File

@ -1,3 +1,4 @@
import concurrent.futures
import datetime
import json
import logging
@ -650,17 +651,44 @@ class IndexingRunner:
# chunk nodes by chunk size
indexing_start_at = time.perf_counter()
tokens = 0
chunk_size = 100
chunk_size = 10
embedding_model_type_instance = None
if embedding_model_instance:
embedding_model_type_instance = embedding_model_instance.model_type_instance
embedding_model_type_instance = cast(TextEmbeddingModel, embedding_model_type_instance)
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
futures = []
for i in range(0, len(documents), chunk_size):
chunk_documents = documents[i:i + chunk_size]
futures.append(executor.submit(self._process_chunk, current_app._get_current_object(), index_processor,
chunk_documents, dataset,
dataset_document, embedding_model_instance,
embedding_model_type_instance))
for future in futures:
tokens += future.result()
indexing_end_at = time.perf_counter()
# update document status to completed
self._update_document_index_status(
document_id=dataset_document.id,
after_indexing_status="completed",
extra_update_params={
DatasetDocument.tokens: tokens,
DatasetDocument.completed_at: datetime.datetime.utcnow(),
DatasetDocument.indexing_latency: indexing_end_at - indexing_start_at,
}
)
def _process_chunk(self, flask_app, index_processor, chunk_documents, dataset, dataset_document,
embedding_model_instance, embedding_model_type_instance):
with flask_app.app_context():
# check document is paused
self._check_document_paused_status(dataset_document.id)
chunk_documents = documents[i:i + chunk_size]
tokens = 0
if dataset.indexing_technique == 'high_quality' or embedding_model_type_instance:
tokens += sum(
embedding_model_type_instance.get_num_tokens(
@ -670,9 +698,9 @@ class IndexingRunner:
)
for document in chunk_documents
)
# load index
index_processor.load(dataset, chunk_documents)
db.session.add(dataset)
document_ids = [document.metadata['doc_id'] for document in chunk_documents]
db.session.query(DocumentSegment).filter(
@ -687,18 +715,7 @@ class IndexingRunner:
db.session.commit()
indexing_end_at = time.perf_counter()
# update document status to completed
self._update_document_index_status(
document_id=dataset_document.id,
after_indexing_status="completed",
extra_update_params={
DatasetDocument.tokens: tokens,
DatasetDocument.completed_at: datetime.datetime.utcnow(),
DatasetDocument.indexing_latency: indexing_end_at - indexing_start_at,
}
)
return tokens
def _check_document_paused_status(self, document_id: str):
indexing_cache_key = 'document_{}_is_paused'.format(document_id)

View File

@ -144,6 +144,16 @@ class MilvusVector(BaseVector):
utility.drop_collection(self._collection_name, None, using=alias)
def text_exists(self, id: str) -> bool:
alias = uuid4().hex
if self._client_config.secure:
uri = "https://" + str(self._client_config.host) + ":" + str(self._client_config.port)
else:
uri = "http://" + str(self._client_config.host) + ":" + str(self._client_config.port)
connections.connect(alias=alias, uri=uri, user=self._client_config.user, password=self._client_config.password)
from pymilvus import utility
if not utility.has_collection(self._collection_name, using=alias):
return False
result = self._client.query(collection_name=self._collection_name,
filter=f'metadata["doc_id"] == "{id}"',

View File

@ -275,6 +275,13 @@ class QdrantVector(BaseVector):
)
def text_exists(self, id: str) -> bool:
all_collection_name = []
collections_response = self._client.get_collections()
collection_list = collections_response.collections
for collection in collection_list:
all_collection_name.append(collection.name)
if self._collection_name not in all_collection_name:
return False
response = self._client.retrieve(
collection_name=self._collection_name,
ids=[id]

View File

@ -128,8 +128,8 @@ class Vector:
if kwargs.get('duplicate_check', False):
documents = self._filter_duplicate_texts(documents)
embeddings = self._embeddings.embed_documents([document.page_content for document in documents])
self._vector_processor.add_texts(
documents=documents,
self._vector_processor.create(
texts=documents,
embeddings=embeddings,
**kwargs
)

View File

@ -134,6 +134,11 @@ class WeaviateVector(BaseVector):
def text_exists(self, id: str) -> bool:
collection_name = self._collection_name
schema = self._default_schema(self._collection_name)
# check whether the index already exists
if not self._client.schema.contains(schema):
return False
result = self._client.query.get(collection_name).with_additional(["id"]).with_where({
"path": ["doc_id"],
"operator": "Equal",

View File

@ -53,7 +53,7 @@ class UnstructuredWordExtractor(BaseExtractor):
elements = partition_docx(filename=self._file_path)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:
text = chunk.text.strip()

View File

@ -43,7 +43,7 @@ class UnstructuredEmailExtractor(BaseExtractor):
pass
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:
text = chunk.text.strip()

View File

@ -38,7 +38,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
elements = partition_md(filename=self._file_path, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:
text = chunk.text.strip()

View File

@ -28,7 +28,7 @@ class UnstructuredMsgExtractor(BaseExtractor):
elements = partition_msg(filename=self._file_path, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:
text = chunk.text.strip()

View File

@ -28,7 +28,7 @@ class UnstructuredTextExtractor(BaseExtractor):
elements = partition_text(filename=self._file_path, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:
text = chunk.text.strip()

View File

@ -28,7 +28,7 @@ class UnstructuredXmlExtractor(BaseExtractor):
elements = partition_xml(filename=self._file_path, xml_keep_tags=True, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:
text = chunk.text.strip()

View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<svg width="800px" height="800px" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg">
<path fill-rule="evenodd" clip-rule="evenodd" d="M15.6111 1.5837C17.2678 1.34703 18.75 2.63255 18.75 4.30606V5.68256C19.9395 6.31131 20.75 7.56102 20.75 9.00004V19C20.75 21.0711 19.0711 22.75 17 22.75H7C4.92893 22.75 3.25 21.0711 3.25 19V5.00004C3.25 4.99074 3.25017 4.98148 3.2505 4.97227C3.25017 4.95788 3.25 4.94344 3.25 4.92897C3.25 4.02272 3.91638 3.25437 4.81353 3.12621L15.6111 1.5837ZM4.75 6.75004V19C4.75 20.2427 5.75736 21.25 7 21.25H17C18.2426 21.25 19.25 20.2427 19.25 19V9.00004C19.25 7.7574 18.2426 6.75004 17 6.75004H4.75ZM5.07107 5.25004H17.25V4.30606C17.25 3.54537 16.5763 2.96104 15.8232 3.06862L5.02566 4.61113C4.86749 4.63373 4.75 4.76919 4.75 4.92897C4.75 5.10629 4.89375 5.25004 5.07107 5.25004ZM7.25 12C7.25 11.5858 7.58579 11.25 8 11.25H16C16.4142 11.25 16.75 11.5858 16.75 12C16.75 12.4143 16.4142 12.75 16 12.75H8C7.58579 12.75 7.25 12.4143 7.25 12ZM7.25 15.5C7.25 15.0858 7.58579 14.75 8 14.75H13.5C13.9142 14.75 14.25 15.0858 14.25 15.5C14.25 15.9143 13.9142 16.25 13.5 16.25H8C7.58579 16.25 7.25 15.9143 7.25 15.5Z" fill="#1C274D"/>
</svg>

After

Width:  |  Height:  |  Size: 1.2 KiB

View File

@ -0,0 +1,21 @@
from core.tools.errors import ToolProviderCredentialValidationError
from core.tools.provider.builtin.devdocs.tools.searchDevDocs import SearchDevDocsTool
from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
class DevDocsProvider(BuiltinToolProviderController):
def _validate_credentials(self, credentials: dict) -> None:
try:
SearchDevDocsTool().fork_tool_runtime(
meta={
"credentials": credentials,
}
).invoke(
user_id='',
tool_parameters={
"doc": "python~3.12",
"topic": "library/code",
},
)
except Exception as e:
raise ToolProviderCredentialValidationError(str(e))

View File

@ -0,0 +1,10 @@
identity:
author: Richards Tu
name: devdocs
label:
en_US: DevDocs
zh_Hans: DevDocs
description:
en_US: Get official developer documentations on DevDocs.
zh_Hans: 从DevDocs获取官方开发者文档。
icon: icon.svg

View File

@ -0,0 +1,42 @@
from typing import Any, Union
import requests
from pydantic import BaseModel, Field
from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.tool.builtin_tool import BuiltinTool
class SearchDevDocsInput(BaseModel):
doc: str = Field(..., description="The name of the documentation.")
topic: str = Field(..., description="The path of the section/topic.")
class SearchDevDocsTool(BuiltinTool):
def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
"""
Invokes the DevDocs search tool with the given user ID and tool parameters.
Args:
user_id (str): The ID of the user invoking the tool.
tool_parameters (dict[str, Any]): The parameters for the tool, including 'doc' and 'topic'.
Returns:
ToolInvokeMessage | list[ToolInvokeMessage]: The result of the tool invocation, which can be a single message or a list of messages.
"""
doc = tool_parameters.get('doc', '')
topic = tool_parameters.get('topic', '')
if not doc:
return self.create_text_message('Please provide the documentation name.')
if not topic:
return self.create_text_message('Please provide the topic path.')
url = f"https://documents.devdocs.io/{doc}/{topic}.html"
response = requests.get(url)
if response.status_code == 200:
content = response.text
return self.create_text_message(self.summary(user_id=user_id, content=content))
else:
return self.create_text_message(f"Failed to retrieve the documentation. Status code: {response.status_code}")

View File

@ -0,0 +1,34 @@
identity:
name: searchDevDocs
author: Richards Tu
label:
en_US: Search Developer Docs
zh_Hans: 搜索开发者文档
description:
human:
en_US: A tools for searching for a specific topic and path in DevDocs based on the provided documentation name and topic. Don't for get to add some shots in the system prompt; for example, the documentation name should be like \"vuex~4\", \"css\", or \"python~3.12\", while the topic should be like \"guide/actions\" for Vuex 4, \"display-box\" for CSS, or \"library/code\" for Python 3.12.
zh_Hans: 一个用于根据提供的文档名称和主题在DevDocs中搜索特定主题和路径的工具。不要忘记在系统提示词中添加一些示例例如文档名称应该是\"vuex~4\"、\"css\"或\"python~3.12\",而主题应该是\"guide/actions\"用于Vuex 4\"display-box\"用于CSS或\"library/code\"用于Python 3.12。
llm: A tools for searching for specific developer documentation in DevDocs based on the provided documentation name and topic.
parameters:
- name: doc
type: string
required: true
label:
en_US: Documentation name
zh_Hans: 文档名称
human_description:
en_US: The name of the documentation.
zh_Hans: 文档名称。
llm_description: The name of the documentation, such as \"vuex~4\", \"css\", or \"python~3.12\". The exact value should be identified by the user.
form: llm
- name: topic
type: string
required: true
label:
en_US: Topic name
zh_Hans: 主题名称
human_description:
en_US: The path of the section/topic.
zh_Hans: 文档主题的路径。
llm_description: The path of the section/topic, such as \"guide/actions\" for Vuex 4, \"display-box\" for CSS, or \"library/code\" for Python 3.12.
form: llm

View File

@ -53,7 +53,7 @@ def login_required(func):
def decorated_view(*args, **kwargs):
auth_header = request.headers.get('Authorization')
admin_api_key_enable = os.getenv('ADMIN_API_KEY_ENABLE', default='False')
if admin_api_key_enable:
if admin_api_key_enable.lower() == 'true':
if auth_header:
if ' ' not in auth_header:
raise Unauthorized('Invalid Authorization header format. Expected \'Bearer <api-key>\' format.')

View File

@ -435,20 +435,20 @@ class RegisterService:
if open_id is not None or provider is not None:
AccountService.link_account_integrate(provider, open_id, account)
if current_app.config['EDITION'] != 'SELF_HOSTED':
tenant = TenantService.create_tenant(f"{account.name}'s Workspace")
TenantService.create_tenant_member(tenant, account, role='owner')
account.current_tenant = tenant
tenant_was_created.send(tenant)
db.session.commit()
except Exception as e:
db.session.rollback() # todo: do not work
logging.error(f'Register failed: {e}')
raise AccountRegisterError(f'Registration failed: {e}') from e
tenant_was_created.send(tenant)
return account
@classmethod
@ -461,7 +461,6 @@ class RegisterService:
name = email.split('@')[0]
account = cls.register(email=email, name=name, language=language, status=AccountStatus.PENDING)
# Create new tenant member for invited tenant
TenantService.create_tenant_member(tenant, account, role)
TenantService.switch_tenant(account, tenant.id)