From 215661ef9198e17738b00e11cc41f67a6024b4bf Mon Sep 17 00:00:00 2001
From: Su Yang
Date: Wed, 10 Jul 2024 18:26:10 +0800
Subject: [PATCH] feat: add PerfXCloud, Qwen series #6116 (#6117)

---
 .../model_providers/_position.yaml            |   1 +
 .../model_providers/perfxcloud/__init__.py    |   0
 .../perfxcloud/_assets/icon_l_en.svg          |   8 +
 .../perfxcloud/_assets/icon_s_en.svg          |   8 +
 .../perfxcloud/llm/Qwen-14B-Chat-Int4.yaml    |  61 +++++
 .../llm/Qwen1.5-110B-Chat-GPTQ-Int4.yaml      |  61 +++++
 .../llm/Qwen1.5-72B-Chat-GPTQ-Int4.yaml       |  61 +++++
 .../perfxcloud/llm/Qwen1.5-7B.yaml            |  61 +++++
 .../llm/Qwen2-72B-Instruct-GPTQ-Int4.yaml     |  63 +++++
 .../perfxcloud/llm/Qwen2-7B.yaml              |  63 +++++
 .../perfxcloud/llm/__init__.py                |   0
 .../perfxcloud/llm/_position.yaml             |   6 +
 .../model_providers/perfxcloud/llm/llm.py     | 110 ++++++++
 .../model_providers/perfxcloud/perfxcloud.py  |  32 +++
 .../perfxcloud/perfxcloud.yaml                |  42 +++
 .../text_embedding/BAAI-bge-m3.yaml           |   4 +
 .../perfxcloud/text_embedding/__init__.py     |   0
 .../text_embedding/text_embedding.py          | 250 ++++++++++++++++++
 18 files changed, 831 insertions(+)
 create mode 100644 api/core/model_runtime/model_providers/perfxcloud/__init__.py
 create mode 100644 api/core/model_runtime/model_providers/perfxcloud/_assets/icon_l_en.svg
 create mode 100644 api/core/model_runtime/model_providers/perfxcloud/_assets/icon_s_en.svg
 create mode 100644 api/core/model_runtime/model_providers/perfxcloud/llm/Qwen-14B-Chat-Int4.yaml
 create mode 100644 api/core/model_runtime/model_providers/perfxcloud/llm/Qwen1.5-110B-Chat-GPTQ-Int4.yaml
 create mode 100644 api/core/model_runtime/model_providers/perfxcloud/llm/Qwen1.5-72B-Chat-GPTQ-Int4.yaml
 create mode 100644 api/core/model_runtime/model_providers/perfxcloud/llm/Qwen1.5-7B.yaml
 create mode 100644 api/core/model_runtime/model_providers/perfxcloud/llm/Qwen2-72B-Instruct-GPTQ-Int4.yaml
 create mode 100644 api/core/model_runtime/model_providers/perfxcloud/llm/Qwen2-7B.yaml
 create mode 100644 api/core/model_runtime/model_providers/perfxcloud/llm/__init__.py
 create mode 100644 api/core/model_runtime/model_providers/perfxcloud/llm/_position.yaml
 create mode 100644 api/core/model_runtime/model_providers/perfxcloud/llm/llm.py
 create mode 100644 api/core/model_runtime/model_providers/perfxcloud/perfxcloud.py
 create mode 100644 api/core/model_runtime/model_providers/perfxcloud/perfxcloud.yaml
 create mode 100644 api/core/model_runtime/model_providers/perfxcloud/text_embedding/BAAI-bge-m3.yaml
 create mode 100644 api/core/model_runtime/model_providers/perfxcloud/text_embedding/__init__.py
 create mode 100644 api/core/model_runtime/model_providers/perfxcloud/text_embedding/text_embedding.py

diff --git a/api/core/model_runtime/model_providers/_position.yaml b/api/core/model_runtime/model_providers/_position.yaml
index da654d2174..cf4ac10828 100644
--- a/api/core/model_runtime/model_providers/_position.yaml
+++ b/api/core/model_runtime/model_providers/_position.yaml
@@ -33,3 +33,4 @@
 - deepseek
 - hunyuan
 - siliconflow
+- perfxcloud
diff --git a/api/core/model_runtime/model_providers/perfxcloud/__init__.py b/api/core/model_runtime/model_providers/perfxcloud/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/api/core/model_runtime/model_providers/perfxcloud/_assets/icon_l_en.svg b/api/core/model_runtime/model_providers/perfxcloud/_assets/icon_l_en.svg
new file mode 100644
index 0000000000..060d9de3a9
--- /dev/null
+++ b/api/core/model_runtime/model_providers/perfxcloud/_assets/icon_l_en.svg
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
diff --git a/api/core/model_runtime/model_providers/perfxcloud/_assets/icon_s_en.svg b/api/core/model_runtime/model_providers/perfxcloud/_assets/icon_s_en.svg
new file mode 100644
index 0000000000..be0c2eeb1c
--- /dev/null
+++ b/api/core/model_runtime/model_providers/perfxcloud/_assets/icon_s_en.svg
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
diff --git a/api/core/model_runtime/model_providers/perfxcloud/llm/Qwen-14B-Chat-Int4.yaml b/api/core/model_runtime/model_providers/perfxcloud/llm/Qwen-14B-Chat-Int4.yaml
new file mode 100644
index 0000000000..af6fb91cd9
--- /dev/null
+++ b/api/core/model_runtime/model_providers/perfxcloud/llm/Qwen-14B-Chat-Int4.yaml
@@ -0,0 +1,61 @@
+model: Qwen-14B-Chat-Int4
+label:
+  en_US: Qwen-14B-Chat-Int4
+model_type: llm
+features:
+  - agent-thought
+model_properties:
+  mode: chat
+  context_size: 4096
+parameter_rules:
+  - name: temperature
+    use_template: temperature
+    type: float
+    default: 0.3
+    min: 0.0
+    max: 2.0
+    help:
+      zh_Hans: 用于控制随机性和多样性的程度。具体来说,temperature值控制了生成文本时对每个候选词的概率分布进行平滑的程度。较高的temperature值会降低概率分布的峰值,使得更多的低概率词被选择,生成结果更加多样化;而较低的temperature值则会增强概率分布的峰值,使得高概率词更容易被选择,生成结果更加确定。
+      en_US: Controls the degree of randomness and diversity. Specifically, the temperature value controls how much the probability distribution over candidate tokens is smoothed during generation. A higher temperature flattens the distribution, allowing more low-probability tokens to be selected and making the output more diverse; a lower temperature sharpens the distribution, making high-probability tokens more likely to be selected and the output more deterministic.
+  - name: max_tokens
+    use_template: max_tokens
+    type: int
+    default: 600
+    min: 1
+    max: 1248
+    help:
+      zh_Hans: 用于指定模型在生成内容时token的最大数量,它定义了生成的上限,但不保证每次都会生成到这个数量。
+      en_US: Specifies the maximum number of tokens the model may generate. It defines an upper bound on generation but does not guarantee that this many tokens will be produced every time.
+  - name: top_p
+    use_template: top_p
+    type: float
+    default: 0.8
+    min: 0.1
+    max: 0.9
+    help:
+      zh_Hans: 生成过程中核采样方法概率阈值,例如,取值为0.8时,仅保留概率加起来大于等于0.8的最可能token的最小集合作为候选集。取值范围为(0,1.0),取值越大,生成的随机性越高;取值越低,生成的确定性越高。
+      en_US: The probability threshold for nucleus sampling during generation. For example, at 0.8 only the smallest set of the most likely tokens whose cumulative probability is at least 0.8 is kept as the candidate set. The value range is (0, 1.0); a larger value makes the output more random, while a smaller value makes it more deterministic.
+  - name: top_k
+    type: int
+    min: 0
+    max: 99
+    label:
+      zh_Hans: 取样数量
+      en_US: Top k
+    help:
+      zh_Hans: 生成时,采样候选集的大小。例如,取值为50时,仅将单次生成中得分最高的50个token组成随机采样的候选集。取值越大,生成的随机性越高;取值越小,生成的确定性越高。
+      en_US: The size of the candidate set sampled from during generation. For example, at 50 only the 50 highest-scoring tokens of a single generation step form the candidate set for random sampling. A larger value makes the output more random; a smaller value makes it more deterministic.
+  - name: repetition_penalty
+    required: false
+    type: float
+    default: 1.1
+    label:
+      en_US: Repetition penalty
+    help:
+      zh_Hans: 用于控制模型生成时的重复度。提高repetition_penalty时可以降低模型生成的重复度。1.0表示不做惩罚。
+      en_US: Controls the degree of repetition in generated text. Increasing repetition_penalty reduces repetition in the model's output; 1.0 means no penalty is applied.
+pricing:
+  input: '0.000'
+  output: '0.000'
+  unit: '0.000'
+  currency: RMB
diff --git a/api/core/model_runtime/model_providers/perfxcloud/llm/Qwen1.5-110B-Chat-GPTQ-Int4.yaml b/api/core/model_runtime/model_providers/perfxcloud/llm/Qwen1.5-110B-Chat-GPTQ-Int4.yaml
new file mode 100644
index 0000000000..4ab9a80055
--- /dev/null
+++ b/api/core/model_runtime/model_providers/perfxcloud/llm/Qwen1.5-110B-Chat-GPTQ-Int4.yaml
@@ -0,0 +1,61 @@
+model: Qwen1.5-110B-Chat-GPTQ-Int4
+label:
+  en_US: Qwen1.5-110B-Chat-GPTQ-Int4
+model_type: llm
+features:
+  - agent-thought
+model_properties:
+  mode: chat
+  context_size: 8192
+parameter_rules:
+  - name: temperature
+    use_template: temperature
+    type: float
+    default: 0.3
+    min: 0.0
+    max: 2.0
+    help:
+      zh_Hans: 用于控制随机性和多样性的程度。具体来说,temperature值控制了生成文本时对每个候选词的概率分布进行平滑的程度。较高的temperature值会降低概率分布的峰值,使得更多的低概率词被选择,生成结果更加多样化;而较低的temperature值则会增强概率分布的峰值,使得高概率词更容易被选择,生成结果更加确定。
+      en_US: Controls the degree of randomness and diversity. Specifically, the temperature value controls how much the probability distribution over candidate tokens is smoothed during generation. A higher temperature flattens the distribution, allowing more low-probability tokens to be selected and making the output more diverse; a lower temperature sharpens the distribution, making high-probability tokens more likely to be selected and the output more deterministic.
+  - name: max_tokens
+    use_template: max_tokens
+    type: int
+    default: 128
+    min: 1
+    max: 256
+    help:
+      zh_Hans: 用于指定模型在生成内容时token的最大数量,它定义了生成的上限,但不保证每次都会生成到这个数量。
+      en_US: Specifies the maximum number of tokens the model may generate. It defines an upper bound on generation but does not guarantee that this many tokens will be produced every time.
+  - name: top_p
+    use_template: top_p
+    type: float
+    default: 0.8
+    min: 0.1
+    max: 0.9
+    help:
+      zh_Hans: 生成过程中核采样方法概率阈值,例如,取值为0.8时,仅保留概率加起来大于等于0.8的最可能token的最小集合作为候选集。取值范围为(0,1.0),取值越大,生成的随机性越高;取值越低,生成的确定性越高。
+      en_US: The probability threshold for nucleus sampling during generation. For example, at 0.8 only the smallest set of the most likely tokens whose cumulative probability is at least 0.8 is kept as the candidate set. The value range is (0, 1.0); a larger value makes the output more random, while a smaller value makes it more deterministic.
+  - name: top_k
+    type: int
+    min: 0
+    max: 99
+    label:
+      zh_Hans: 取样数量
+      en_US: Top k
+    help:
+      zh_Hans: 生成时,采样候选集的大小。例如,取值为50时,仅将单次生成中得分最高的50个token组成随机采样的候选集。取值越大,生成的随机性越高;取值越小,生成的确定性越高。
+      en_US: The size of the candidate set sampled from during generation. For example, at 50 only the 50 highest-scoring tokens of a single generation step form the candidate set for random sampling. A larger value makes the output more random; a smaller value makes it more deterministic.
+  - name: repetition_penalty
+    required: false
+    type: float
+    default: 1.1
+    label:
+      en_US: Repetition penalty
+    help:
+      zh_Hans: 用于控制模型生成时的重复度。提高repetition_penalty时可以降低模型生成的重复度。1.0表示不做惩罚。
+      en_US: Controls the degree of repetition in generated text. Increasing repetition_penalty reduces repetition in the model's output; 1.0 means no penalty is applied.
+pricing:
+  input: '0.000'
+  output: '0.000'
+  unit: '0.000'
+  currency: RMB
diff --git a/api/core/model_runtime/model_providers/perfxcloud/llm/Qwen1.5-72B-Chat-GPTQ-Int4.yaml b/api/core/model_runtime/model_providers/perfxcloud/llm/Qwen1.5-72B-Chat-GPTQ-Int4.yaml
new file mode 100644
index 0000000000..4a8b1cf479
--- /dev/null
+++ b/api/core/model_runtime/model_providers/perfxcloud/llm/Qwen1.5-72B-Chat-GPTQ-Int4.yaml
@@ -0,0 +1,61 @@
+model: Qwen1.5-72B-Chat-GPTQ-Int4
+label:
+  en_US: Qwen1.5-72B-Chat-GPTQ-Int4
+model_type: llm
+features:
+  - agent-thought
+model_properties:
+  mode: chat
+  context_size: 8192
+parameter_rules:
+  - name: temperature
+    use_template: temperature
+    type: float
+    default: 0.3
+    min: 0.0
+    max: 2.0
+    help:
+      zh_Hans: 用于控制随机性和多样性的程度。具体来说,temperature值控制了生成文本时对每个候选词的概率分布进行平滑的程度。较高的temperature值会降低概率分布的峰值,使得更多的低概率词被选择,生成结果更加多样化;而较低的temperature值则会增强概率分布的峰值,使得高概率词更容易被选择,生成结果更加确定。
+      en_US: Controls the degree of randomness and diversity. Specifically, the temperature value controls how much the probability distribution over candidate tokens is smoothed during generation. A higher temperature flattens the distribution, allowing more low-probability tokens to be selected and making the output more diverse; a lower temperature sharpens the distribution, making high-probability tokens more likely to be selected and the output more deterministic.
+  - name: max_tokens
+    use_template: max_tokens
+    type: int
+    default: 600
+    min: 1
+    max: 2000
+    help:
+      zh_Hans: 用于指定模型在生成内容时token的最大数量,它定义了生成的上限,但不保证每次都会生成到这个数量。
+      en_US: Specifies the maximum number of tokens the model may generate. It defines an upper bound on generation but does not guarantee that this many tokens will be produced every time.
+  - name: top_p
+    use_template: top_p
+    type: float
+    default: 0.8
+    min: 0.1
+    max: 0.9
+    help:
+      zh_Hans: 生成过程中核采样方法概率阈值,例如,取值为0.8时,仅保留概率加起来大于等于0.8的最可能token的最小集合作为候选集。取值范围为(0,1.0),取值越大,生成的随机性越高;取值越低,生成的确定性越高。
+      en_US: The probability threshold for nucleus sampling during generation. For example, at 0.8 only the smallest set of the most likely tokens whose cumulative probability is at least 0.8 is kept as the candidate set. The value range is (0, 1.0); a larger value makes the output more random, while a smaller value makes it more deterministic.
+  - name: top_k
+    type: int
+    min: 0
+    max: 99
+    label:
+      zh_Hans: 取样数量
+      en_US: Top k
+    help:
+      zh_Hans: 生成时,采样候选集的大小。例如,取值为50时,仅将单次生成中得分最高的50个token组成随机采样的候选集。取值越大,生成的随机性越高;取值越小,生成的确定性越高。
+      en_US: The size of the candidate set sampled from during generation. For example, at 50 only the 50 highest-scoring tokens of a single generation step form the candidate set for random sampling. A larger value makes the output more random; a smaller value makes it more deterministic.
+  - name: repetition_penalty
+    required: false
+    type: float
+    default: 1.1
+    label:
+      en_US: Repetition penalty
+    help:
+      zh_Hans: 用于控制模型生成时的重复度。提高repetition_penalty时可以降低模型生成的重复度。1.0表示不做惩罚。
+      en_US: Controls the degree of repetition in generated text. Increasing repetition_penalty reduces repetition in the model's output; 1.0 means no penalty is applied.
+pricing:
+  input: '0.000'
+  output: '0.000'
+  unit: '0.000'
+  currency: RMB
diff --git a/api/core/model_runtime/model_providers/perfxcloud/llm/Qwen1.5-7B.yaml b/api/core/model_runtime/model_providers/perfxcloud/llm/Qwen1.5-7B.yaml
new file mode 100644
index 0000000000..b076504493
--- /dev/null
+++ b/api/core/model_runtime/model_providers/perfxcloud/llm/Qwen1.5-7B.yaml
@@ -0,0 +1,61 @@
+model: Qwen1.5-7B
+label:
+  en_US: Qwen1.5-7B
+model_type: llm
+features:
+  - agent-thought
+model_properties:
+  mode: completion
+  context_size: 8192
+parameter_rules:
+  - name: temperature
+    use_template: temperature
+    type: float
+    default: 0.3
+    min: 0.0
+    max: 2.0
+    help:
+      zh_Hans: 用于控制随机性和多样性的程度。具体来说,temperature值控制了生成文本时对每个候选词的概率分布进行平滑的程度。较高的temperature值会降低概率分布的峰值,使得更多的低概率词被选择,生成结果更加多样化;而较低的temperature值则会增强概率分布的峰值,使得高概率词更容易被选择,生成结果更加确定。
+      en_US: Controls the degree of randomness and diversity. Specifically, the temperature value controls how much the probability distribution over candidate tokens is smoothed during generation. A higher temperature flattens the distribution, allowing more low-probability tokens to be selected and making the output more diverse; a lower temperature sharpens the distribution, making high-probability tokens more likely to be selected and the output more deterministic.
+  - name: max_tokens
+    use_template: max_tokens
+    type: int
+    default: 600
+    min: 1
+    max: 2000
+    help:
+      zh_Hans: 用于指定模型在生成内容时token的最大数量,它定义了生成的上限,但不保证每次都会生成到这个数量。
+      en_US: Specifies the maximum number of tokens the model may generate. It defines an upper bound on generation but does not guarantee that this many tokens will be produced every time.
+  - name: top_p
+    use_template: top_p
+    type: float
+    default: 0.8
+    min: 0.1
+    max: 0.9
+    help:
+      zh_Hans: 生成过程中核采样方法概率阈值,例如,取值为0.8时,仅保留概率加起来大于等于0.8的最可能token的最小集合作为候选集。取值范围为(0,1.0),取值越大,生成的随机性越高;取值越低,生成的确定性越高。
+      en_US: The probability threshold for nucleus sampling during generation. For example, at 0.8 only the smallest set of the most likely tokens whose cumulative probability is at least 0.8 is kept as the candidate set. The value range is (0, 1.0); a larger value makes the output more random, while a smaller value makes it more deterministic.
+  - name: top_k
+    type: int
+    min: 0
+    max: 99
+    label:
+      zh_Hans: 取样数量
+      en_US: Top k
+    help:
+      zh_Hans: 生成时,采样候选集的大小。例如,取值为50时,仅将单次生成中得分最高的50个token组成随机采样的候选集。取值越大,生成的随机性越高;取值越小,生成的确定性越高。
+      en_US: The size of the candidate set sampled from during generation. For example, at 50 only the 50 highest-scoring tokens of a single generation step form the candidate set for random sampling. A larger value makes the output more random; a smaller value makes it more deterministic.
+  - name: repetition_penalty
+    required: false
+    type: float
+    default: 1.1
+    label:
+      en_US: Repetition penalty
+    help:
+      zh_Hans: 用于控制模型生成时的重复度。提高repetition_penalty时可以降低模型生成的重复度。1.0表示不做惩罚。
+      en_US: Controls the degree of repetition in generated text. Increasing repetition_penalty reduces repetition in the model's output; 1.0 means no penalty is applied.
+pricing:
+  input: '0.000'
+  output: '0.000'
+  unit: '0.000'
+  currency: RMB
diff --git a/api/core/model_runtime/model_providers/perfxcloud/llm/Qwen2-72B-Instruct-GPTQ-Int4.yaml b/api/core/model_runtime/model_providers/perfxcloud/llm/Qwen2-72B-Instruct-GPTQ-Int4.yaml
new file mode 100644
index 0000000000..e24a69fe63
--- /dev/null
+++ b/api/core/model_runtime/model_providers/perfxcloud/llm/Qwen2-72B-Instruct-GPTQ-Int4.yaml
@@ -0,0 +1,63 @@
+model: Qwen2-72B-Instruct-GPTQ-Int4
+label:
+  en_US: Qwen2-72B-Instruct-GPTQ-Int4
+model_type: llm
+features:
+  - multi-tool-call
+  - agent-thought
+  - stream-tool-call
+model_properties:
+  mode: chat
+  context_size: 8192
+parameter_rules:
+  - name: temperature
+    use_template: temperature
+    type: float
+    default: 0.3
+    min: 0.0
+    max: 2.0
+    help:
+      zh_Hans: 用于控制随机性和多样性的程度。具体来说,temperature值控制了生成文本时对每个候选词的概率分布进行平滑的程度。较高的temperature值会降低概率分布的峰值,使得更多的低概率词被选择,生成结果更加多样化;而较低的temperature值则会增强概率分布的峰值,使得高概率词更容易被选择,生成结果更加确定。
+      en_US: Controls the degree of randomness and diversity. Specifically, the temperature value controls how much the probability distribution over candidate tokens is smoothed during generation. A higher temperature flattens the distribution, allowing more low-probability tokens to be selected and making the output more diverse; a lower temperature sharpens the distribution, making high-probability tokens more likely to be selected and the output more deterministic.
+  - name: max_tokens
+    use_template: max_tokens
+    type: int
+    default: 600
+    min: 1
+    max: 2000
+    help:
+      zh_Hans: 用于指定模型在生成内容时token的最大数量,它定义了生成的上限,但不保证每次都会生成到这个数量。
+      en_US: Specifies the maximum number of tokens the model may generate. It defines an upper bound on generation but does not guarantee that this many tokens will be produced every time.
+  - name: top_p
+    use_template: top_p
+    type: float
+    default: 0.8
+    min: 0.1
+    max: 0.9
+    help:
+      zh_Hans: 生成过程中核采样方法概率阈值,例如,取值为0.8时,仅保留概率加起来大于等于0.8的最可能token的最小集合作为候选集。取值范围为(0,1.0),取值越大,生成的随机性越高;取值越低,生成的确定性越高。
+      en_US: The probability threshold for nucleus sampling during generation. For example, at 0.8 only the smallest set of the most likely tokens whose cumulative probability is at least 0.8 is kept as the candidate set. The value range is (0, 1.0); a larger value makes the output more random, while a smaller value makes it more deterministic.
+  - name: top_k
+    type: int
+    min: 0
+    max: 99
+    label:
+      zh_Hans: 取样数量
+      en_US: Top k
+    help:
+      zh_Hans: 生成时,采样候选集的大小。例如,取值为50时,仅将单次生成中得分最高的50个token组成随机采样的候选集。取值越大,生成的随机性越高;取值越小,生成的确定性越高。
+      en_US: The size of the candidate set sampled from during generation. For example, at 50 only the 50 highest-scoring tokens of a single generation step form the candidate set for random sampling. A larger value makes the output more random; a smaller value makes it more deterministic.
+  - name: repetition_penalty
+    required: false
+    type: float
+    default: 1.1
+    label:
+      en_US: Repetition penalty
+    help:
+      zh_Hans: 用于控制模型生成时的重复度。提高repetition_penalty时可以降低模型生成的重复度。1.0表示不做惩罚。
+      en_US: Controls the degree of repetition in generated text. Increasing repetition_penalty reduces repetition in the model's output; 1.0 means no penalty is applied.
+pricing:
+  input: '0.000'
+  output: '0.000'
+  unit: '0.000'
+  currency: RMB
diff --git a/api/core/model_runtime/model_providers/perfxcloud/llm/Qwen2-7B.yaml b/api/core/model_runtime/model_providers/perfxcloud/llm/Qwen2-7B.yaml
new file mode 100644
index 0000000000..e3d804729d
--- /dev/null
+++ b/api/core/model_runtime/model_providers/perfxcloud/llm/Qwen2-7B.yaml
@@ -0,0 +1,63 @@
+model: Qwen2-7B
+label:
+  en_US: Qwen2-7B
+model_type: llm
+features:
+  - multi-tool-call
+  - agent-thought
+  - stream-tool-call
+model_properties:
+  mode: completion
+  context_size: 8192
+parameter_rules:
+  - name: temperature
+    use_template: temperature
+    type: float
+    default: 0.3
+    min: 0.0
+    max: 2.0
+    help:
+      zh_Hans: 用于控制随机性和多样性的程度。具体来说,temperature值控制了生成文本时对每个候选词的概率分布进行平滑的程度。较高的temperature值会降低概率分布的峰值,使得更多的低概率词被选择,生成结果更加多样化;而较低的temperature值则会增强概率分布的峰值,使得高概率词更容易被选择,生成结果更加确定。
+      en_US: Controls the degree of randomness and diversity. Specifically, the temperature value controls how much the probability distribution over candidate tokens is smoothed during generation. A higher temperature flattens the distribution, allowing more low-probability tokens to be selected and making the output more diverse; a lower temperature sharpens the distribution, making high-probability tokens more likely to be selected and the output more deterministic.
+  - name: max_tokens
+    use_template: max_tokens
+    type: int
+    default: 600
+    min: 1
+    max: 2000
+    help:
+      zh_Hans: 用于指定模型在生成内容时token的最大数量,它定义了生成的上限,但不保证每次都会生成到这个数量。
+      en_US: Specifies the maximum number of tokens the model may generate. It defines an upper bound on generation but does not guarantee that this many tokens will be produced every time.
+  - name: top_p
+    use_template: top_p
+    type: float
+    default: 0.8
+    min: 0.1
+    max: 0.9
+    help:
+      zh_Hans: 生成过程中核采样方法概率阈值,例如,取值为0.8时,仅保留概率加起来大于等于0.8的最可能token的最小集合作为候选集。取值范围为(0,1.0),取值越大,生成的随机性越高;取值越低,生成的确定性越高。
+      en_US: The probability threshold for nucleus sampling during generation. For example, at 0.8 only the smallest set of the most likely tokens whose cumulative probability is at least 0.8 is kept as the candidate set. The value range is (0, 1.0); a larger value makes the output more random, while a smaller value makes it more deterministic.
+  - name: top_k
+    type: int
+    min: 0
+    max: 99
+    label:
+      zh_Hans: 取样数量
+      en_US: Top k
+    help:
+      zh_Hans: 生成时,采样候选集的大小。例如,取值为50时,仅将单次生成中得分最高的50个token组成随机采样的候选集。取值越大,生成的随机性越高;取值越小,生成的确定性越高。
+      en_US: The size of the candidate set sampled from during generation. For example, at 50 only the 50 highest-scoring tokens of a single generation step form the candidate set for random sampling. A larger value makes the output more random; a smaller value makes it more deterministic.
+  - name: repetition_penalty
+    required: false
+    type: float
+    default: 1.1
+    label:
+      en_US: Repetition penalty
+    help:
+      zh_Hans: 用于控制模型生成时的重复度。提高repetition_penalty时可以降低模型生成的重复度。1.0表示不做惩罚。
+      en_US: Controls the degree of repetition in generated text. Increasing repetition_penalty reduces repetition in the model's output; 1.0 means no penalty is applied.
+pricing:
+  input: '0.000'
+  output: '0.000'
+  unit: '0.000'
+  currency: RMB
diff --git a/api/core/model_runtime/model_providers/perfxcloud/llm/__init__.py b/api/core/model_runtime/model_providers/perfxcloud/llm/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/api/core/model_runtime/model_providers/perfxcloud/llm/_position.yaml b/api/core/model_runtime/model_providers/perfxcloud/llm/_position.yaml
new file mode 100644
index 0000000000..b95f6bdc1b
--- /dev/null
+++ b/api/core/model_runtime/model_providers/perfxcloud/llm/_position.yaml
@@ -0,0 +1,6 @@
+- Qwen2-72B-Instruct-GPTQ-Int4
+- Qwen2-7B
+- Qwen1.5-110B-Chat-GPTQ-Int4
+- Qwen1.5-72B-Chat-GPTQ-Int4
+- Qwen1.5-7B
+- Qwen-14B-Chat-Int4
diff --git a/api/core/model_runtime/model_providers/perfxcloud/llm/llm.py b/api/core/model_runtime/model_providers/perfxcloud/llm/llm.py
new file mode 100644
index 0000000000..c9116bf685
--- /dev/null
+++ b/api/core/model_runtime/model_providers/perfxcloud/llm/llm.py
@@ -0,0 +1,110 @@
+from collections.abc import Generator
+from typing import Optional, Union
+from urllib.parse import urlparse
+
+import tiktoken
+
+from core.model_runtime.entities.llm_entities import LLMResult
+from core.model_runtime.entities.message_entities import (
+    PromptMessage,
+    PromptMessageTool,
+)
+from core.model_runtime.model_providers.openai.llm.llm import OpenAILargeLanguageModel
+
+
+class PerfXCloudLargeLanguageModel(OpenAILargeLanguageModel):
+    def _invoke(self, model: str, credentials: dict,
+                prompt_messages: list[PromptMessage], model_parameters: dict,
+                tools: Optional[list[PromptMessageTool]] = None, stop: Optional[list[str]] = None,
+                stream: bool = True, user: Optional[str] = None) \
+            -> Union[LLMResult, Generator]:
+        self._add_custom_parameters(credentials)
+
+        return super()._invoke(model, credentials, prompt_messages, model_parameters, tools, stop, stream, user)
+
+    def validate_credentials(self, model: str, credentials: dict) -> None:
+        self._add_custom_parameters(credentials)
+        super().validate_credentials(model, credentials)
+
+    # Refactored from the OpenAI model runtime; uses cl100k_base to calculate token counts.
+    def _num_tokens_from_string(self, model: str, text: str,
+                                tools: Optional[list[PromptMessageTool]] = None) -> int:
+        """
+        Calculate num tokens for text completion model with tiktoken package.
+
+        :param model: model name
+        :param text: prompt text
+        :param tools: tools for tool calling
+        :return: number of tokens
+        """
+        encoding = tiktoken.get_encoding("cl100k_base")
+        num_tokens = len(encoding.encode(text))
+
+        if tools:
+            num_tokens += self._num_tokens_for_tools(encoding, tools)
+
+        return num_tokens
+
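+    # NOTE: The message token accounting below follows OpenAI's cookbook
+    # recipe (3 tokens per message, 1 extra per name); for Qwen models served
+    # behind an OpenAI-compatible endpoint this is an approximation rather
+    # than an exact count from the serving-side tokenizer.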
+    # Refactored from the OpenAI model runtime; uses cl100k_base to calculate token counts.
+    def _num_tokens_from_messages(self, model: str, messages: list[PromptMessage],
+                                  tools: Optional[list[PromptMessageTool]] = None) -> int:
+        """Calculate num tokens for gpt-3.5-turbo and gpt-4 with tiktoken package.
+
+        Official documentation: https://github.com/openai/openai-cookbook/blob/
+        main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb"""
+        encoding = tiktoken.get_encoding("cl100k_base")
+        tokens_per_message = 3
+        tokens_per_name = 1
+
+        num_tokens = 0
+        messages_dict = [self._convert_prompt_message_to_dict(m) for m in messages]
+        for message in messages_dict:
+            num_tokens += tokens_per_message
+            for key, value in message.items():
+                # Cast str(value) in case the message value is not a string
+                # This occurs with function messages
+                # TODO: The current token calculation method for the image type is not implemented,
+                #       which needs to download the image and then read its resolution for calculation,
+                #       and would increase the request delay
+                if isinstance(value, list):
+                    text = ''
+                    for item in value:
+                        if isinstance(item, dict) and item['type'] == 'text':
+                            text += item['text']
+
+                    value = text
+
+                if key == "tool_calls":
+                    for tool_call in value:
+                        for t_key, t_value in tool_call.items():
+                            num_tokens += len(encoding.encode(t_key))
+                            if t_key == "function":
+                                for f_key, f_value in t_value.items():
+                                    num_tokens += len(encoding.encode(f_key))
+                                    num_tokens += len(encoding.encode(f_value))
+                            else:
+                                num_tokens += len(encoding.encode(t_key))
+                                num_tokens += len(encoding.encode(t_value))
+                else:
+                    num_tokens += len(encoding.encode(str(value)))
+
+                if key == "name":
+                    num_tokens += tokens_per_name
+
+        # every reply is primed with assistant
+        num_tokens += 3
+
+        if tools:
+            num_tokens += self._num_tokens_for_tools(encoding, tools)
+
+        return num_tokens
+
+    @staticmethod
+    def _add_custom_parameters(credentials: dict) -> None:
+        credentials['mode'] = 'chat'
+        credentials['openai_api_key'] = credentials['api_key']
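+        # NOTE: Only the scheme and host of a custom endpoint_url are kept;
+        # any path component (such as a trailing /v1) is dropped here, as the
+        # OpenAI base implementation is expected to append its own request paths.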
+        if 'endpoint_url' not in credentials or credentials['endpoint_url'] == "":
+            credentials['openai_api_base'] = 'https://cloud.perfxlab.cn'
+        else:
+            parsed_url = urlparse(credentials['endpoint_url'])
+            credentials['openai_api_base'] = f"{parsed_url.scheme}://{parsed_url.netloc}"
diff --git a/api/core/model_runtime/model_providers/perfxcloud/perfxcloud.py b/api/core/model_runtime/model_providers/perfxcloud/perfxcloud.py
new file mode 100644
index 0000000000..0854ef5185
--- /dev/null
+++ b/api/core/model_runtime/model_providers/perfxcloud/perfxcloud.py
@@ -0,0 +1,32 @@
+import logging
+
+from core.model_runtime.entities.model_entities import ModelType
+from core.model_runtime.errors.validate import CredentialsValidateFailedError
+from core.model_runtime.model_providers.__base.model_provider import ModelProvider
+
+logger = logging.getLogger(__name__)
+
+
+class PerfXCloudProvider(ModelProvider):
+
+    def validate_provider_credentials(self, credentials: dict) -> None:
+        """
+        Validate provider credentials
+        if validate failed, raise exception
+
+        :param credentials: provider credentials, credentials form defined in `provider_credential_schema`.
+        """
+        try:
+            model_instance = self.get_model_instance(ModelType.LLM)
+
+            # Use the `Qwen2-72B-Instruct-GPTQ-Int4` model for validation,
+            # regardless of whether a text completion model or a chat model is configured
+            model_instance.validate_credentials(
+                model='Qwen2-72B-Instruct-GPTQ-Int4',
+                credentials=credentials
+            )
+        except CredentialsValidateFailedError as ex:
+            raise ex
+        except Exception as ex:
+            logger.exception(f'{self.get_provider_schema().provider} credentials validate failed')
+            raise ex
diff --git a/api/core/model_runtime/model_providers/perfxcloud/perfxcloud.yaml b/api/core/model_runtime/model_providers/perfxcloud/perfxcloud.yaml
new file mode 100644
index 0000000000..10ee691ebd
--- /dev/null
+++ b/api/core/model_runtime/model_providers/perfxcloud/perfxcloud.yaml
@@ -0,0 +1,42 @@
+provider: perfxcloud
+label:
+  en_US: PerfXCloud
+  zh_Hans: PerfXCloud
+description:
+  en_US: PerfXCloud (Pengfeng Technology) is an AI development and deployment platform tailored for developers and enterprises, providing inference capabilities for a variety of models.
+  zh_Hans: PerfXCloud(澎峰科技)为开发者和企业量身打造的AI开发和部署平台,提供多种模型的推理能力。
+icon_small:
+  en_US: icon_s_en.svg
+icon_large:
+  en_US: icon_l_en.svg
+background: "#e3f0ff"
+help:
+  title:
+    en_US: Get your API Key from PerfXCloud
+    zh_Hans: 从 PerfXCloud 获取 API Key
+  url:
+    en_US: https://cloud.perfxlab.cn/panel/token
+supported_model_types:
+  - llm
+  - text-embedding
+configurate_methods:
+  - predefined-model
+provider_credential_schema:
+  credential_form_schemas:
+    - variable: api_key
+      label:
+        en_US: API Key
+      type: secret-input
+      required: true
+      placeholder:
+        zh_Hans: 在此输入您的 API Key
+        en_US: Enter your API Key
+    - variable: endpoint_url
+      label:
+        zh_Hans: 自定义 API endpoint 地址
+        en_US: Custom API endpoint URL
+      type: text-input
+      required: false
+      placeholder:
+        zh_Hans: Base URL, e.g. https://cloud.perfxlab.cn/v1
+        en_US: Base URL, e.g. https://cloud.perfxlab.cn/v1
diff --git a/api/core/model_runtime/model_providers/perfxcloud/text_embedding/BAAI-bge-m3.yaml b/api/core/model_runtime/model_providers/perfxcloud/text_embedding/BAAI-bge-m3.yaml
new file mode 100644
index 0000000000..55488e5688
--- /dev/null
+++ b/api/core/model_runtime/model_providers/perfxcloud/text_embedding/BAAI-bge-m3.yaml
@@ -0,0 +1,4 @@
+model: BAAI/bge-m3
+model_type: text-embedding
+model_properties:
+  context_size: 32768
diff --git a/api/core/model_runtime/model_providers/perfxcloud/text_embedding/__init__.py b/api/core/model_runtime/model_providers/perfxcloud/text_embedding/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/api/core/model_runtime/model_providers/perfxcloud/text_embedding/text_embedding.py b/api/core/model_runtime/model_providers/perfxcloud/text_embedding/text_embedding.py
new file mode 100644
index 0000000000..5a99ad301f
--- /dev/null
+++ b/api/core/model_runtime/model_providers/perfxcloud/text_embedding/text_embedding.py
@@ -0,0 +1,250 @@
+import json
+import time
+from decimal import Decimal
+from typing import Optional
+from urllib.parse import urljoin
+
+import numpy as np
+import requests
+
+from core.model_runtime.entities.common_entities import I18nObject
+from core.model_runtime.entities.model_entities import (
+    AIModelEntity,
+    FetchFrom,
+    ModelPropertyKey,
+    ModelType,
+    PriceConfig,
+    PriceType,
+)
+from core.model_runtime.entities.text_embedding_entities import EmbeddingUsage, TextEmbeddingResult
+from core.model_runtime.errors.validate import CredentialsValidateFailedError
+from core.model_runtime.model_providers.__base.text_embedding_model import TextEmbeddingModel
+from core.model_runtime.model_providers.openai_api_compatible._common import _CommonOAI_API_Compat
+
+
+class OAICompatEmbeddingModel(_CommonOAI_API_Compat, TextEmbeddingModel):
+    """
+    Model class for an OpenAI API-compatible text embedding model.
+    """
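+
+    # NOTE: When no endpoint_url is configured, requests default to
+    # https://cloud.perfxlab.cn/v1/; a custom endpoint_url keeps its full
+    # path here, so it should normally end in /v1.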
+
+    def _invoke(self, model: str, credentials: dict,
+                texts: list[str], user: Optional[str] = None) \
+            -> TextEmbeddingResult:
+        """
+        Invoke text embedding model
+
+        :param model: model name
+        :param credentials: model credentials
+        :param texts: texts to embed
+        :param user: unique user id
+        :return: embeddings result
+        """
+
+        # Prepare headers and payload for the request
+        headers = {
+            'Content-Type': 'application/json'
+        }
+
+        api_key = credentials.get('api_key')
+        if api_key:
+            headers["Authorization"] = f"Bearer {api_key}"
+
+        if 'endpoint_url' not in credentials or credentials['endpoint_url'] == "":
+            endpoint_url = 'https://cloud.perfxlab.cn/v1/'
+        else:
+            endpoint_url = credentials.get('endpoint_url')
+            if not endpoint_url.endswith('/'):
+                endpoint_url += '/'
+
+        endpoint_url = urljoin(endpoint_url, 'embeddings')
+
+        extra_model_kwargs = {}
+        if user:
+            extra_model_kwargs['user'] = user
+
+        extra_model_kwargs['encoding_format'] = 'float'
+
+        # get model properties
+        context_size = self._get_context_size(model, credentials)
+        max_chunks = self._get_max_chunks(model, credentials)
+
+        inputs = []
+        indices = []
+        used_tokens = 0
+
+        for i, text in enumerate(texts):
+
+            # Here token count is only an approximation based on the GPT2 tokenizer
+            # TODO: Optimize for better token estimation and chunking
+            num_tokens = self._get_num_tokens_by_gpt2(text)
+
+            if num_tokens >= context_size:
+                cutoff = int(np.floor(len(text) * (context_size / num_tokens)))
+                # if num tokens is larger than context length, only use the start
+                inputs.append(text[0: cutoff])
+            else:
+                inputs.append(text)
+            indices += [i]
+
+        batched_embeddings = []
+        _iter = range(0, len(inputs), max_chunks)
+
+        for i in _iter:
+            # Prepare the payload for the request
+            payload = {
+                'input': inputs[i: i + max_chunks],
+                'model': model,
+                **extra_model_kwargs
+            }
+
+            # Make the request to the OpenAI API
+            response = requests.post(
+                endpoint_url,
+                headers=headers,
+                data=json.dumps(payload),
+                timeout=(10, 300)
+            )
+
+            response.raise_for_status()  # Raise an exception for HTTP errors
+            response_data = response.json()
+
+            # Extract embeddings and used tokens from the response
+            embeddings_batch = [data['embedding'] for data in response_data['data']]
+            embedding_used_tokens = response_data['usage']['total_tokens']
+
+            used_tokens += embedding_used_tokens
+            batched_embeddings += embeddings_batch
+
+        # calc usage
+        usage = self._calc_response_usage(
+            model=model,
+            credentials=credentials,
+            tokens=used_tokens
+        )
+
+        return TextEmbeddingResult(
+            embeddings=batched_embeddings,
+            usage=usage,
+            model=model
+        )
+
+    def get_num_tokens(self, model: str, credentials: dict, texts: list[str]) -> int:
+        """
+        Approximate number of tokens for given messages using GPT2 tokenizer
+
+        :param model: model name
+        :param credentials: model credentials
+        :param texts: texts to embed
+        :return:
+        """
+        return sum(self._get_num_tokens_by_gpt2(text) for text in texts)
+
+    def validate_credentials(self, model: str, credentials: dict) -> None:
+        """
+        Validate model credentials
+
+        :param model: model name
+        :param credentials: model credentials
+        :return:
+        """
+        try:
+            headers = {
+                'Content-Type': 'application/json'
+            }
+
+            api_key = credentials.get('api_key')
+
+            if api_key:
+                headers["Authorization"] = f"Bearer {api_key}"
+
+            if 'endpoint_url' not in credentials or credentials['endpoint_url'] == "":
+                endpoint_url = 'https://cloud.perfxlab.cn/v1/'
+            else:
+                endpoint_url = credentials.get('endpoint_url')
+                if not endpoint_url.endswith('/'):
+                    endpoint_url += '/'
+
+            endpoint_url = urljoin(endpoint_url, 'embeddings')
+
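+            # A minimal embedding request is enough to verify the API key and
+            # endpoint; the response is checked for a 'model' field below.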
+            payload = {
+                'input': 'ping',
+                'model': model
+            }
+
+            response = requests.post(
+                url=endpoint_url,
+                headers=headers,
+                data=json.dumps(payload),
+                timeout=(10, 300)
+            )
+
+            if response.status_code != 200:
+                raise CredentialsValidateFailedError(
+                    f'Credentials validation failed with status code {response.status_code}')
+
+            try:
+                json_result = response.json()
+            except json.JSONDecodeError:
+                raise CredentialsValidateFailedError('Credentials validation failed: JSON decode error')
+
+            if 'model' not in json_result:
+                raise CredentialsValidateFailedError(
+                    'Credentials validation failed: invalid response')
+        except CredentialsValidateFailedError:
+            raise
+        except Exception as ex:
+            raise CredentialsValidateFailedError(str(ex))
+
+    def get_customizable_model_schema(self, model: str, credentials: dict) -> AIModelEntity:
+        """
+        Generate custom model entities from credentials
+        """
+        entity = AIModelEntity(
+            model=model,
+            label=I18nObject(en_US=model),
+            model_type=ModelType.TEXT_EMBEDDING,
+            fetch_from=FetchFrom.CUSTOMIZABLE_MODEL,
+            model_properties={
+                ModelPropertyKey.CONTEXT_SIZE: int(credentials.get('context_size')),
+                ModelPropertyKey.MAX_CHUNKS: 1,
+            },
+            parameter_rules=[],
+            pricing=PriceConfig(
+                input=Decimal(credentials.get('input_price', 0)),
+                unit=Decimal(credentials.get('unit', 0)),
+                currency=credentials.get('currency', "USD")
+            )
+        )
+
+        return entity
+
+
+    def _calc_response_usage(self, model: str, credentials: dict, tokens: int) -> EmbeddingUsage:
+        """
+        Calculate response usage
+
+        :param model: model name
+        :param credentials: model credentials
+        :param tokens: input tokens
+        :return: usage
+        """
+        # get input price info
+        input_price_info = self.get_price(
+            model=model,
+            credentials=credentials,
+            price_type=PriceType.INPUT,
+            tokens=tokens
+        )
+
+        # transform usage
+        usage = EmbeddingUsage(
+            tokens=tokens,
+            total_tokens=tokens,
+            unit_price=input_price_info.unit_price,
+            price_unit=input_price_info.unit,
+            total_price=input_price_info.total_amount,
+            currency=input_price_info.currency,
+            latency=time.perf_counter() - self.started_at
+        )
+
+        return usage