Merge branch 'main' into fix/bedrock-claude

2024-11-16 11:42:29 +08:00 · 2024-03-29 13:46:54 +08:00 · 2024-03-29 13:46:54 +08:00 · 2e48446194
commit 2e48446194
parent ecd0ddab5e 0f94e4cd01
23 changed files with 195 additions and 45 deletions
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -155,4 +155,4 @@ And that's it! Once your PR is merged, you will be featured as a contributor in

 ## Getting Help

-If you ever get stuck or got a burning question while contributing, simply shoot your queries our way via the related GitHub issue, or hop onto our [Discord](https://discord.gg/AhzKf7dNgk) for a quick chat. 
+If you ever get stuck or got a burning question while contributing, simply shoot your queries our way via the related GitHub issue, or hop onto our [Discord](https://discord.gg/8Tpq4AcN9c) for a quick chat. 
--- a/CONTRIBUTING_CN.md
+++ b/CONTRIBUTING_CN.md
@ -152,4 +152,4 @@ Dify的后端使用Python编写，使用[Flask](https://flask.palletsprojects.co

 ## 获取帮助

-如果你在贡献过程中遇到困难或者有任何问题，可以通过相关的 GitHub 问题提出你的疑问，或者加入我们的 [Discord](https://discord.gg/AhzKf7dNgk) 进行快速交流。
+如果你在贡献过程中遇到困难或者有任何问题，可以通过相关的 GitHub 问题提出你的疑问，或者加入我们的 [Discord](https://discord.gg/8Tpq4AcN9c) 进行快速交流。
--- a/README.md
+++ b/README.md
@ -131,7 +131,7 @@ At the same time, please consider supporting Dify by sharing it on social media

 ### Translations

-We are looking for contributors to help with translating Dify to languages other than Mandarin or English. If you are interested in helping, please see the [i18n README](https://github.com/langgenius/dify/blob/main/web/i18n/README.md) for more information, and leave us a comment in the `global-users` channel of our [Discord Community Server](https://discord.gg/AhzKf7dNgk).
+We are looking for contributors to help with translating Dify to languages other than Mandarin or English. If you are interested in helping, please see the [i18n README](https://github.com/langgenius/dify/blob/main/web/i18n/README.md) for more information, and leave us a comment in the `global-users` channel of our [Discord Community Server](https://discord.gg/8Tpq4AcN9c).

 ## Community & Support

--- a/api/commands.py
+++ b/api/commands.py
@ -109,15 +109,16 @@ def reset_encrypt_key_pair():
        click.echo(click.style('Sorry, only support SELF_HOSTED mode.', fg='red'))
        return

-    tenant = db.session.query(Tenant).first()
+    tenants = db.session.query(Tenant).all()
+    for tenant in tenants:
        if not tenant:
            click.echo(click.style('Sorry, no workspace found. Please enter /install to initialize.', fg='red'))
            return

        tenant.encrypt_public_key = generate_key_pair(tenant.id)

-    db.session.query(Provider).filter(Provider.provider_type == 'custom').delete()
-    db.session.query(ProviderModel).delete()
+        db.session.query(Provider).filter(Provider.provider_type == 'custom', Provider.tenant_id == tenant.id).delete()
+        db.session.query(ProviderModel).filter(ProviderModel.tenant_id == tenant.id).delete()
        db.session.commit()

        click.echo(click.style('Congratulations! '
--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@ -1,3 +1,4 @@
+import concurrent.futures
 import datetime
 import json
 import logging
@ -650,17 +651,44 @@ class IndexingRunner:
        # chunk nodes by chunk size
        indexing_start_at = time.perf_counter()
        tokens = 0
-        chunk_size = 100
+        chunk_size = 10

        embedding_model_type_instance = None
        if embedding_model_instance:
            embedding_model_type_instance = embedding_model_instance.model_type_instance
            embedding_model_type_instance = cast(TextEmbeddingModel, embedding_model_type_instance)
-
+        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+            futures = []
            for i in range(0, len(documents), chunk_size):
+                chunk_documents = documents[i:i + chunk_size]
+                futures.append(executor.submit(self._process_chunk, current_app._get_current_object(), index_processor,
+                                               chunk_documents, dataset,
+                                               dataset_document, embedding_model_instance,
+                                               embedding_model_type_instance))
+
+            for future in futures:
+                tokens += future.result()
+
+        indexing_end_at = time.perf_counter()
+
+        # update document status to completed
+        self._update_document_index_status(
+            document_id=dataset_document.id,
+            after_indexing_status="completed",
+            extra_update_params={
+                DatasetDocument.tokens: tokens,
+                DatasetDocument.completed_at: datetime.datetime.utcnow(),
+                DatasetDocument.indexing_latency: indexing_end_at - indexing_start_at,
+            }
+        )
+
+    def _process_chunk(self, flask_app, index_processor, chunk_documents, dataset, dataset_document,
+                       embedding_model_instance, embedding_model_type_instance):
+        with flask_app.app_context():
            # check document is paused
            self._check_document_paused_status(dataset_document.id)
-            chunk_documents = documents[i:i + chunk_size]
+
+            tokens = 0
            if dataset.indexing_technique == 'high_quality' or embedding_model_type_instance:
                tokens += sum(
                    embedding_model_type_instance.get_num_tokens(
@ -670,9 +698,9 @@ class IndexingRunner:
                    )
                    for document in chunk_documents
                )
+
            # load index
            index_processor.load(dataset, chunk_documents)
-            db.session.add(dataset)

            document_ids = [document.metadata['doc_id'] for document in chunk_documents]
            db.session.query(DocumentSegment).filter(
@ -687,18 +715,7 @@ class IndexingRunner:

            db.session.commit()

-        indexing_end_at = time.perf_counter()
-
-        # update document status to completed
-        self._update_document_index_status(
-            document_id=dataset_document.id,
-            after_indexing_status="completed",
-            extra_update_params={
-                DatasetDocument.tokens: tokens,
-                DatasetDocument.completed_at: datetime.datetime.utcnow(),
-                DatasetDocument.indexing_latency: indexing_end_at - indexing_start_at,
-            }
-        )
+            return tokens

    def _check_document_paused_status(self, document_id: str):
        indexing_cache_key = 'document_{}_is_paused'.format(document_id)
--- a/api/core/rag/datasource/vdb/milvus/milvus_vector.py
+++ b/api/core/rag/datasource/vdb/milvus/milvus_vector.py
@ -144,6 +144,16 @@ class MilvusVector(BaseVector):
            utility.drop_collection(self._collection_name, None, using=alias)

    def text_exists(self, id: str) -> bool:
+        alias = uuid4().hex
+        if self._client_config.secure:
+            uri = "https://" + str(self._client_config.host) + ":" + str(self._client_config.port)
+        else:
+            uri = "http://" + str(self._client_config.host) + ":" + str(self._client_config.port)
+        connections.connect(alias=alias, uri=uri, user=self._client_config.user, password=self._client_config.password)
+
+        from pymilvus import utility
+        if not utility.has_collection(self._collection_name, using=alias):
+            return False

        result = self._client.query(collection_name=self._collection_name,
                                    filter=f'metadata["doc_id"] == "{id}"',
--- a/api/core/rag/datasource/vdb/qdrant/qdrant_vector.py
+++ b/api/core/rag/datasource/vdb/qdrant/qdrant_vector.py
@ -275,6 +275,13 @@ class QdrantVector(BaseVector):
            )

    def text_exists(self, id: str) -> bool:
+        all_collection_name = []
+        collections_response = self._client.get_collections()
+        collection_list = collections_response.collections
+        for collection in collection_list:
+            all_collection_name.append(collection.name)
+        if self._collection_name not in all_collection_name:
+            return False
        response = self._client.retrieve(
            collection_name=self._collection_name,
            ids=[id]
--- a/api/core/rag/datasource/vdb/vector_factory.py
+++ b/api/core/rag/datasource/vdb/vector_factory.py
@ -128,8 +128,8 @@ class Vector:
        if kwargs.get('duplicate_check', False):
            documents = self._filter_duplicate_texts(documents)
        embeddings = self._embeddings.embed_documents([document.page_content for document in documents])
-        self._vector_processor.add_texts(
-            documents=documents,
+        self._vector_processor.create(
+            texts=documents,
            embeddings=embeddings,
            **kwargs
        )
--- a/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py
+++ b/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py
@ -134,6 +134,11 @@ class WeaviateVector(BaseVector):

    def text_exists(self, id: str) -> bool:
        collection_name = self._collection_name
+        schema = self._default_schema(self._collection_name)
+
+        # check whether the index already exists
+        if not self._client.schema.contains(schema):
+            return False
        result = self._client.query.get(collection_name).with_additional(["id"]).with_where({
            "path": ["doc_id"],
            "operator": "Equal",
--- a/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py
@ -53,7 +53,7 @@ class UnstructuredWordExtractor(BaseExtractor):
            elements = partition_docx(filename=self._file_path)

        from unstructured.chunking.title import chunk_by_title
-        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
--- a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
@ -43,7 +43,7 @@ class UnstructuredEmailExtractor(BaseExtractor):
            pass

        from unstructured.chunking.title import chunk_by_title
-        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
--- a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py
@ -38,7 +38,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor):

        elements = partition_md(filename=self._file_path, api_url=self._api_url)
        from unstructured.chunking.title import chunk_by_title
-        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
--- a/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py
@ -28,7 +28,7 @@ class UnstructuredMsgExtractor(BaseExtractor):

        elements = partition_msg(filename=self._file_path, api_url=self._api_url)
        from unstructured.chunking.title import chunk_by_title
-        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
--- a/api/core/rag/extractor/unstructured/unstructured_text_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_text_extractor.py
@ -28,7 +28,7 @@ class UnstructuredTextExtractor(BaseExtractor):

        elements = partition_text(filename=self._file_path, api_url=self._api_url)
        from unstructured.chunking.title import chunk_by_title
-        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
--- a/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
@ -28,7 +28,7 @@ class UnstructuredXmlExtractor(BaseExtractor):

        elements = partition_xml(filename=self._file_path, xml_keep_tags=True, api_url=self._api_url)
        from unstructured.chunking.title import chunk_by_title
-        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
--- a/api/core/tools/provider/builtin/devdocs/_assets/icon.svg
+++ b/api/core/tools/provider/builtin/devdocs/_assets/icon.svg
@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="utf-8"?>
+<svg width="800px" height="800px" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg">
+<path fill-rule="evenodd" clip-rule="evenodd" d="M15.6111 1.5837C17.2678 1.34703 18.75 2.63255 18.75 4.30606V5.68256C19.9395 6.31131 20.75 7.56102 20.75 9.00004V19C20.75 21.0711 19.0711 22.75 17 22.75H7C4.92893 22.75 3.25 21.0711 3.25 19V5.00004C3.25 4.99074 3.25017 4.98148 3.2505 4.97227C3.25017 4.95788 3.25 4.94344 3.25 4.92897C3.25 4.02272 3.91638 3.25437 4.81353 3.12621L15.6111 1.5837ZM4.75 6.75004V19C4.75 20.2427 5.75736 21.25 7 21.25H17C18.2426 21.25 19.25 20.2427 19.25 19V9.00004C19.25 7.7574 18.2426 6.75004 17 6.75004H4.75ZM5.07107 5.25004H17.25V4.30606C17.25 3.54537 16.5763 2.96104 15.8232 3.06862L5.02566 4.61113C4.86749 4.63373 4.75 4.76919 4.75 4.92897C4.75 5.10629 4.89375 5.25004 5.07107 5.25004ZM7.25 12C7.25 11.5858 7.58579 11.25 8 11.25H16C16.4142 11.25 16.75 11.5858 16.75 12C16.75 12.4143 16.4142 12.75 16 12.75H8C7.58579 12.75 7.25 12.4143 7.25 12ZM7.25 15.5C7.25 15.0858 7.58579 14.75 8 14.75H13.5C13.9142 14.75 14.25 15.0858 14.25 15.5C14.25 15.9143 13.9142 16.25 13.5 16.25H8C7.58579 16.25 7.25 15.9143 7.25 15.5Z" fill="#1C274D"/>
+</svg>
--- a/api/core/tools/provider/builtin/devdocs/devdocs.py
+++ b/api/core/tools/provider/builtin/devdocs/devdocs.py
@ -0,0 +1,21 @@
+from core.tools.errors import ToolProviderCredentialValidationError
+from core.tools.provider.builtin.devdocs.tools.searchDevDocs import SearchDevDocsTool
+from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
+
+
+class DevDocsProvider(BuiltinToolProviderController):
+    """Built-in tool provider for DevDocs."""
+
+    def _validate_credentials(self, credentials: dict) -> None:
+        """Run one sample SearchDevDocsTool lookup using ``credentials``.
+
+        Raises ToolProviderCredentialValidationError if that invocation raises.
+        """
+        try:
+            tool = SearchDevDocsTool().fork_tool_runtime(meta={"credentials": credentials})
+            # Fixed sample query; any exception it raises counts as a validation failure.
+            tool.invoke(user_id='',
+                        tool_parameters={"doc": "python~3.12", "topic": "library/code"})
+        except Exception as e:
+            # Chain explicitly so the original traceback stays attached as __cause__.
+            raise ToolProviderCredentialValidationError(str(e)) from e
--- a/api/core/tools/provider/builtin/devdocs/devdocs.yaml
+++ b/api/core/tools/provider/builtin/devdocs/devdocs.yaml
@ -0,0 +1,10 @@
+identity:
+  author: Richards Tu
+  name: devdocs
+  label:
+    en_US: DevDocs
+    zh_Hans: DevDocs
+  description:
+    en_US: Get official developer documentation from DevDocs.
+    zh_Hans: 从DevDocs获取官方开发者文档。
+  icon: icon.svg
--- a/api/core/tools/provider/builtin/devdocs/tools/searchDevDocs.py
+++ b/api/core/tools/provider/builtin/devdocs/tools/searchDevDocs.py
@ -0,0 +1,42 @@
+from typing import Any, Union
+
+import requests
+from pydantic import BaseModel, Field
+
+from core.tools.entities.tool_entities import ToolInvokeMessage
+from core.tools.tool.builtin_tool import BuiltinTool
+
+
+class SearchDevDocsInput(BaseModel):  # Both fields are required strings: Field(...) supplies no default.
+    doc: str = Field(..., description="The name of the documentation.")
+    topic: str = Field(..., description="The path of the section/topic.")
+
+
+class SearchDevDocsTool(BuiltinTool):
+    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
+        """
+        Fetch a DevDocs page and return `self.summary(...)` of its HTML as a text message.
+
+        Args:
+            user_id (str): The ID of the user invoking the tool.
+            tool_parameters (dict[str, Any]): Must contain non-empty 'doc' and 'topic' entries.
+
+        Returns:
+            ToolInvokeMessage: The summarized page on HTTP 200, otherwise an explanatory text message.
+        """
+        doc = tool_parameters.get('doc', '')
+        topic = tool_parameters.get('topic', '')
+
+        if not doc:
+            return self.create_text_message('Please provide the documentation name.')
+        if not topic:
+            return self.create_text_message('Please provide the topic path.')
+
+        url = f"https://documents.devdocs.io/{doc}/{topic}.html"
+        # Bound the request (connect, read): without a timeout a stalled server blocks the caller forever.
+        response = requests.get(url, timeout=(5, 30))
+
+        if response.status_code != 200:
+            return self.create_text_message(f"Failed to retrieve the documentation. Status code: {response.status_code}")
+
+        return self.create_text_message(self.summary(user_id=user_id, content=response.text))
--- a/api/core/tools/provider/builtin/devdocs/tools/searchDevDocs.yaml
+++ b/api/core/tools/provider/builtin/devdocs/tools/searchDevDocs.yaml
@ -0,0 +1,34 @@
+identity:
+  name: searchDevDocs
+  author: Richards Tu
+  label:
+    en_US: Search Developer Docs
+    zh_Hans: 搜索开发者文档
+description:
+  human:
+    en_US: A tool for searching for a specific topic and path in DevDocs based on the provided documentation name and topic. Don't forget to add a few examples (shots) in the system prompt; for example, the documentation name should be like "vuex~4", "css", or "python~3.12", while the topic should be like "guide/actions" for Vuex 4, "display-box" for CSS, or "library/code" for Python 3.12.
+    zh_Hans: 一个用于根据提供的文档名称和主题，在DevDocs中搜索特定主题和路径的工具。不要忘记在系统提示词中添加一些示例；例如，文档名称应该是"vuex~4"、"css"或"python~3.12"，而主题应该是"guide/actions"用于Vuex 4，"display-box"用于CSS，或"library/code"用于Python 3.12。
+  llm: A tool for searching for specific developer documentation in DevDocs based on the provided documentation name and topic.
+parameters:
+  - name: doc
+    type: string
+    required: true
+    label:
+      en_US: Documentation name
+      zh_Hans: 文档名称
+    human_description:
+      en_US: The name of the documentation.
+      zh_Hans: 文档名称。
+    llm_description: The name of the documentation, such as "vuex~4", "css", or "python~3.12". The exact value should be identified by the user.
+    form: llm
+  - name: topic
+    type: string
+    required: true
+    label:
+      en_US: Topic name
+      zh_Hans: 主题名称
+    human_description:
+      en_US: The path of the section/topic.
+      zh_Hans: 文档主题的路径。
+    llm_description: The path of the section/topic, such as "guide/actions" for Vuex 4, "display-box" for CSS, or "library/code" for Python 3.12.
+    form: llm
--- a/api/libs/login.py
+++ b/api/libs/login.py
@ -53,7 +53,7 @@ def login_required(func):
    def decorated_view(*args, **kwargs):
        auth_header = request.headers.get('Authorization')
        admin_api_key_enable = os.getenv('ADMIN_API_KEY_ENABLE', default='False')
-        if admin_api_key_enable:
+        if admin_api_key_enable.lower() == 'true':
            if auth_header:
                if ' ' not in auth_header:
                    raise Unauthorized('Invalid Authorization header format. Expected \'Bearer <api-key>\' format.')
--- a/api/services/account_service.py
+++ b/api/services/account_service.py
@ -435,20 +435,20 @@ class RegisterService:

            if open_id is not None or provider is not None:
                AccountService.link_account_integrate(provider, open_id, account)
-
+            if current_app.config['EDITION'] != 'SELF_HOSTED':
                tenant = TenantService.create_tenant(f"{account.name}'s Workspace")

                TenantService.create_tenant_member(tenant, account, role='owner')
                account.current_tenant = tenant

+                tenant_was_created.send(tenant)
+
            db.session.commit()
        except Exception as e:
            db.session.rollback()  # todo: do not work
            logging.error(f'Register failed: {e}')
            raise AccountRegisterError(f'Registration failed: {e}') from e

-        tenant_was_created.send(tenant)
-
        return account

    @classmethod
@ -461,7 +461,6 @@ class RegisterService:
            name = email.split('@')[0]

            account = cls.register(email=email, name=name, language=language, status=AccountStatus.PENDING)
-
            # Create new tenant member for invited tenant
            TenantService.create_tenant_member(tenant, account, role)
            TenantService.switch_tenant(account, tenant.id)