dify/api/tests/integration_tests/model_runtime/xinference/test_llm.py

import os
from typing import Generator

import pytest
from core.model_runtime.entities.llm_entities import LLMResult, LLMResultChunk, LLMResultChunkDelta
from core.model_runtime.entities.message_entities import (AssistantPromptMessage, PromptMessageTool,
                                                          SystemPromptMessage, TextPromptMessageContent,
                                                          UserPromptMessage)
from core.model_runtime.entities.model_entities import AIModelEntity
from core.model_runtime.errors.validate import CredentialsValidateFailedError
from core.model_runtime.model_providers.xinference.llm.llm import XinferenceAILargeLanguageModel

"""FOR MOCK FIXTURES, DO NOT REMOVE"""
from tests.integration_tests.model_runtime.__mock.openai import setup_openai_mock
from tests.integration_tests.model_runtime.__mock.xinference import setup_xinference_mock


@pytest.mark.parametrize('setup_openai_mock, setup_xinference_mock', [['chat', 'none']], indirect=True)
def test_validate_credentials_for_chat_model(setup_openai_mock, setup_xinference_mock):
    """Exercise ``validate_credentials`` of the Xinference LLM with chat-model settings.

    Two credential sets are expected to raise ``CredentialsValidateFailedError``:
    the configured model uid prefixed with ``'www '``, and an empty server url
    paired with an empty model uid. The unmodified values read from the
    environment are then validated and must not raise.
    """
    llm = XinferenceAILargeLanguageModel()
    url_var = 'XINFERENCE_SERVER_URL'
    uid_var = 'XINFERENCE_CHAT_MODEL_UID'

    # Case 1: server url from the environment, uid altered by a 'www ' prefix.
    with pytest.raises(CredentialsValidateFailedError):
        altered_uid = {
            'server_url': os.environ.get(url_var),
            'model_uid': 'www ' + os.environ.get(uid_var),
        }
        llm.validate_credentials(model='ChatGLM3', credentials=altered_uid)

    # Case 2: both credential fields empty.
    with pytest.raises(CredentialsValidateFailedError):
        llm.validate_credentials(model='aaaaa', credentials={'server_url': '', 'model_uid': ''})

    # Case 3: credentials exactly as configured; any exception fails the test.
    configured = {
        'server_url': os.environ.get(url_var),
        'model_uid': os.environ.get(uid_var),
    }
    llm.validate_credentials(model='ChatGLM3', credentials=configured)

@pytest.mark.parametrize('setup_openai_mock, setup_xinference_mock', [['chat', 'none']], indirect=True)
def test_invoke_chat_model(setup_openai_mock, setup_xinference_mock):
    """Invoke the chat model in blocking (non-stream) mode and check the result.

    The call must return an ``LLMResult`` whose message content is non-empty
    and whose usage reports a positive total token count.
    """
    llm = XinferenceAILargeLanguageModel()

    credentials = {
        'server_url': os.environ.get('XINFERENCE_SERVER_URL'),
        'model_uid': os.environ.get('XINFERENCE_CHAT_MODEL_UID'),
    }
    conversation = [
        SystemPromptMessage(content='You are a helpful AI assistant.'),
        UserPromptMessage(content='Hello World!'),
    ]
    sampling = {'temperature': 0.7, 'top_p': 1.0}

    result = llm.invoke(
        model='ChatGLM3',
        credentials=credentials,
        prompt_messages=conversation,
        model_parameters=sampling,
        stop=['you'],
        user="abc-123",
        stream=False,
    )

    assert isinstance(result, LLMResult)
    assert len(result.message.content) > 0
    assert result.usage.total_tokens > 0

@pytest.mark.parametrize('setup_openai_mock, setup_xinference_mock', [['chat', 'none']], indirect=True)
def test_invoke_stream_chat_model(setup_openai_mock, setup_xinference_mock):
    """Invoke the chat model in stream mode and check every yielded chunk.

    The call must return a generator. Each item must be an ``LLMResultChunk``
    whose delta is an ``LLMResultChunkDelta`` carrying an
    ``AssistantPromptMessage``; chunks without a finish reason must have
    non-empty content.
    """
    llm = XinferenceAILargeLanguageModel()

    credentials = {
        'server_url': os.environ.get('XINFERENCE_SERVER_URL'),
        'model_uid': os.environ.get('XINFERENCE_CHAT_MODEL_UID'),
    }
    conversation = [
        SystemPromptMessage(content='You are a helpful AI assistant.'),
        UserPromptMessage(content='Hello World!'),
    ]
    sampling = {'temperature': 0.7, 'top_p': 1.0}

    stream = llm.invoke(
        model='ChatGLM3',
        credentials=credentials,
        prompt_messages=conversation,
        model_parameters=sampling,
        stop=['you'],
        stream=True,
        user="abc-123",
    )

    assert isinstance(stream, Generator)
    for piece in stream:
        assert isinstance(piece, LLMResultChunk)
        delta = piece.delta
        assert isinstance(delta, LLMResultChunkDelta)
        assert isinstance(delta.message, AssistantPromptMessage)
        # Content is only required while no finish reason has been set.
        if delta.finish_reason is None:
            assert len(delta.message.content) > 0
"""
    Function calling of xinference does not support stream mode currently
"""
# def test_invoke_stream_chat_model_with_functions():
#     model = XinferenceAILargeLanguageModel()

#     response = model.invoke(
#         model='ChatGLM3-6b',
#         credentials={
#             'server_url': os.environ.get('XINFERENCE_SERVER_URL'),
#             'model_type': 'text-generation',
#             'model_name': 'ChatGLM3',
#             'model_uid': os.environ.get('XINFERENCE_CHAT_MODEL_UID')
#         },
#         prompt_messages=[
#             SystemPromptMessage(
#                 content='你是一个天气机器人，可以通过调用函数来获取天气信息',
#             ),
#             UserPromptMessage(
#                 content='波士顿天气如何？'
#             )
#         ],
#         model_parameters={
#             'temperature': 0,
#             'top_p': 1.0,
#         },
#         stop=['you'],
#         user='abc-123',
#         stream=True,
#         tools=[
#             PromptMessageTool(
#                 name='get_current_weather',
#                 description='Get the current weather in a given location',
#                 parameters={
#                     "type": "object",
#                     "properties": {
#                         "location": {
#                         "type": "string",
#                             "description": "The city and state e.g. San Francisco, CA"
#                         },
#                         "unit": {
#                             "type": "string",
#                             "enum": ["celsius", "fahrenheit"]
#                         }
#                     },
#                     "required": [
#                         "location"
#                     ]
#                 }
#             )
#         ]
#     )

#     assert isinstance(response, Generator)
    
#     call: LLMResultChunk = None
#     chunks = []

#     for chunk in response:
#         chunks.append(chunk)
#         assert isinstance(chunk, LLMResultChunk)
#         assert isinstance(chunk.delta, LLMResultChunkDelta)
#         assert isinstance(chunk.delta.message, AssistantPromptMessage)
#         assert len(chunk.delta.message.content) > 0 if chunk.delta.finish_reason is None else True

#         if chunk.delta.message.tool_calls and len(chunk.delta.message.tool_calls) > 0:
#             call = chunk
#             break

#     assert call is not None
#     assert call.delta.message.tool_calls[0].function.name == 'get_current_weather'

# def test_invoke_chat_model_with_functions():
#     model = XinferenceAILargeLanguageModel()

#     response = model.invoke(
#         model='ChatGLM3-6b',
#         credentials={
#             'server_url': os.environ.get('XINFERENCE_SERVER_URL'),
#             'model_type': 'text-generation',
#             'model_name': 'ChatGLM3',
#             'model_uid': os.environ.get('XINFERENCE_CHAT_MODEL_UID')
#         },
#         prompt_messages=[
#             UserPromptMessage(
#                 content='What is the weather like in San Francisco?'
#             )
#         ],
#         model_parameters={
#             'temperature': 0.7,
#             'top_p': 1.0,
#         },
#         stop=['you'],
#         user='abc-123',
#         stream=False,
#         tools=[
#             PromptMessageTool(
#                 name='get_current_weather',
#                 description='Get the current weather in a given location',
#                 parameters={
#                     "type": "object",
#                     "properties": {
#                         "location": {
#                         "type": "string",
#                             "description": "The city and state e.g. San Francisco, CA"
#                         },
#                         "unit": {
#                             "type": "string",
#                             "enum": [
#                                 "c",
#                                 "f"
#                             ]
#                         }
#                     },
#                     "required": [
#                         "location"
#                     ]
#                 }
#             )
#         ]
#     )

#     assert isinstance(response, LLMResult)
#     assert len(response.message.content) > 0
#     assert response.usage.total_tokens > 0
#     assert response.message.tool_calls[0].function.name == 'get_current_weather'

@pytest.mark.parametrize('setup_openai_mock, setup_xinference_mock', [['completion', 'none']], indirect=True)
def test_validate_credentials_for_generation_model(setup_openai_mock, setup_xinference_mock):
    """Exercise ``validate_credentials`` of the Xinference LLM with generation-model settings.

    Two credential sets are expected to raise ``CredentialsValidateFailedError``:
    the configured model uid prefixed with ``'www '``, and an empty server url
    paired with an empty model uid. The unmodified values read from the
    environment are then validated and must not raise.
    """
    llm = XinferenceAILargeLanguageModel()
    url_var = 'XINFERENCE_SERVER_URL'
    uid_var = 'XINFERENCE_GENERATION_MODEL_UID'

    # Case 1: server url from the environment, uid altered by a 'www ' prefix.
    with pytest.raises(CredentialsValidateFailedError):
        altered_uid = {
            'server_url': os.environ.get(url_var),
            'model_uid': 'www ' + os.environ.get(uid_var),
        }
        llm.validate_credentials(model='alapaca', credentials=altered_uid)

    # Case 2: both credential fields empty.
    with pytest.raises(CredentialsValidateFailedError):
        llm.validate_credentials(model='alapaca', credentials={'server_url': '', 'model_uid': ''})

    # Case 3: credentials exactly as configured; any exception fails the test.
    configured = {
        'server_url': os.environ.get(url_var),
        'model_uid': os.environ.get(uid_var),
    }
    llm.validate_credentials(model='alapaca', credentials=configured)

@pytest.mark.parametrize('setup_openai_mock, setup_xinference_mock', [['completion', 'none']], indirect=True)
def test_invoke_generation_model(setup_openai_mock, setup_xinference_mock):
    """Invoke the generation model in blocking (non-stream) mode and check the result.

    The call must return an ``LLMResult`` whose message content is non-empty
    and whose usage reports a positive total token count.
    """
    llm = XinferenceAILargeLanguageModel()

    credentials = {
        'server_url': os.environ.get('XINFERENCE_SERVER_URL'),
        'model_uid': os.environ.get('XINFERENCE_GENERATION_MODEL_UID'),
    }
    prompt = [UserPromptMessage(content='the United States is')]
    sampling = {'temperature': 0.7, 'top_p': 1.0}

    result = llm.invoke(
        model='alapaca',
        credentials=credentials,
        prompt_messages=prompt,
        model_parameters=sampling,
        stop=['you'],
        user="abc-123",
        stream=False,
    )

    assert isinstance(result, LLMResult)
    assert len(result.message.content) > 0
    assert result.usage.total_tokens > 0

@pytest.mark.parametrize('setup_openai_mock, setup_xinference_mock', [['completion', 'none']], indirect=True)
def test_invoke_stream_generation_model(setup_openai_mock, setup_xinference_mock):
    """Invoke the generation model in stream mode and check every yielded chunk.

    The call must return a generator. Each item must be an ``LLMResultChunk``
    whose delta is an ``LLMResultChunkDelta`` carrying an
    ``AssistantPromptMessage``; chunks without a finish reason must have
    non-empty content.
    """
    llm = XinferenceAILargeLanguageModel()

    credentials = {
        'server_url': os.environ.get('XINFERENCE_SERVER_URL'),
        'model_uid': os.environ.get('XINFERENCE_GENERATION_MODEL_UID'),
    }
    prompt = [UserPromptMessage(content='the United States is')]
    sampling = {'temperature': 0.7, 'top_p': 1.0}

    stream = llm.invoke(
        model='alapaca',
        credentials=credentials,
        prompt_messages=prompt,
        model_parameters=sampling,
        stop=['you'],
        stream=True,
        user="abc-123",
    )

    assert isinstance(stream, Generator)
    for piece in stream:
        assert isinstance(piece, LLMResultChunk)
        delta = piece.delta
        assert isinstance(delta, LLMResultChunkDelta)
        assert isinstance(delta.message, AssistantPromptMessage)
        # Content is only required while no finish reason has been set.
        if delta.finish_reason is None:
            assert len(delta.message.content) > 0

def test_get_num_tokens():
    """Check the token counts reported by ``get_num_tokens``.

    The same system + user prompt is counted twice: once together with a
    ``get_current_weather`` tool definition (expected count 77) and once
    without any tools (expected count 21).
    """
    llm = XinferenceAILargeLanguageModel()

    def fresh_credentials():
        # Re-read the environment so each call receives its own dict.
        return {
            'server_url': os.environ.get('XINFERENCE_SERVER_URL'),
            'model_uid': os.environ.get('XINFERENCE_GENERATION_MODEL_UID'),
        }

    def fresh_prompt():
        # New message objects per call; nothing is shared between the two counts.
        return [
            SystemPromptMessage(content='You are a helpful AI assistant.'),
            UserPromptMessage(content='Hello World!'),
        ]

    # JSON-schema style description of the tool's arguments.
    weather_schema = {
        "type": "object",
        "properties": {
            "location": {
                "type": "string",
                "description": "The city and state e.g. San Francisco, CA",
            },
            "unit": {
                "type": "string",
                "enum": ["c", "f"],
            },
        },
        "required": ["location"],
    }

    # First count: prompt plus one tool definition.
    count_with_tool = llm.get_num_tokens(
        model='ChatGLM3',
        credentials=fresh_credentials(),
        prompt_messages=fresh_prompt(),
        tools=[
            PromptMessageTool(
                name='get_current_weather',
                description='Get the current weather in a given location',
                parameters=weather_schema,
            ),
        ],
    )

    assert isinstance(count_with_tool, int)
    assert count_with_tool == 77

    # Second count: the same prompt with no tools passed.
    count_without_tool = llm.get_num_tokens(
        model='ChatGLM3',
        credentials=fresh_credentials(),
        prompt_messages=fresh_prompt(),
    )

    assert isinstance(count_without_tool, int)
    assert count_without_tool == 21
Model Runtime (#1858) Co-authored-by: StyleZhang <jasonapring2015@outlook.com> Co-authored-by: Garfield Dai <dai.hai@foxmail.com> Co-authored-by: chenhe <guchenhe@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: Joel <iamjoel007@gmail.com> Co-authored-by: Yeuoly <admin@srmxy.cn> 2024-01-02 23:42:00 +08:00			`import os`
			`from typing import Generator`

improve: introduce isort for linting Python imports (#1983) 2024-01-12 12:34:01 +08:00			`import pytest`
			`from core.model_runtime.entities.llm_entities import LLMResult, LLMResultChunk, LLMResultChunkDelta`
			`from core.model_runtime.entities.message_entities import (AssistantPromptMessage, PromptMessageTool,`
			`SystemPromptMessage, TextPromptMessageContent,`
			`UserPromptMessage)`
Model Runtime (#1858) Co-authored-by: StyleZhang <jasonapring2015@outlook.com> Co-authored-by: Garfield Dai <dai.hai@foxmail.com> Co-authored-by: chenhe <guchenhe@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: Joel <iamjoel007@gmail.com> Co-authored-by: Yeuoly <admin@srmxy.cn> 2024-01-02 23:42:00 +08:00			`from core.model_runtime.entities.model_entities import AIModelEntity`
			`from core.model_runtime.errors.validate import CredentialsValidateFailedError`
			`from core.model_runtime.model_providers.xinference.llm.llm import XinferenceAILargeLanguageModel`

			`"""FOR MOCK FIXTURES, DO NOT REMOVE"""`
			`from tests.integration_tests.model_runtime.__mock.openai import setup_openai_mock`
			`from tests.integration_tests.model_runtime.__mock.xinference import setup_xinference_mock`

improve: introduce isort for linting Python imports (#1983) 2024-01-12 12:34:01 +08:00
Model Runtime (#1858) Co-authored-by: StyleZhang <jasonapring2015@outlook.com> Co-authored-by: Garfield Dai <dai.hai@foxmail.com> Co-authored-by: chenhe <guchenhe@gmail.com> Co-authored-by: jyong <jyong@dify.ai> Co-authored-by: Joel <iamjoel007@gmail.com> Co-authored-by: Yeuoly <admin@srmxy.cn> 2024-01-02 23:42:00 +08:00			`@pytest.mark.parametrize('setup_openai_mock, setup_xinference_mock', [['chat', 'none']], indirect=True)`
			`def test_validate_credentials_for_chat_model(setup_openai_mock, setup_xinference_mock):`
			`model = XinferenceAILargeLanguageModel()`

			`with pytest.raises(CredentialsValidateFailedError):`
			`model.validate_credentials(`
			`model='ChatGLM3',`
			`credentials={`
			`'server_url': os.environ.get('XINFERENCE_SERVER_URL'),`
			`'model_uid': 'www ' + os.environ.get('XINFERENCE_CHAT_MODEL_UID')`
			`}`
			`)`

			`with pytest.raises(CredentialsValidateFailedError):`
			`model.validate_credentials(`
			`model='aaaaa',`
			`credentials={`
			`'server_url': '',`
			`'model_uid': ''`
			`}`
			`)`

			`model.validate_credentials(`
			`model='ChatGLM3',`
			`credentials={`
			`'server_url': os.environ.get('XINFERENCE_SERVER_URL'),`
			`'model_uid': os.environ.get('XINFERENCE_CHAT_MODEL_UID')`
			`}`
			`)`

			`@pytest.mark.parametrize('setup_openai_mock, setup_xinference_mock', [['chat', 'none']], indirect=True)`
			`def test_invoke_chat_model(setup_openai_mock, setup_xinference_mock):`
			`model = XinferenceAILargeLanguageModel()`

			`response = model.invoke(`
			`model='ChatGLM3',`
			`credentials={`
			`'server_url': os.environ.get('XINFERENCE_SERVER_URL'),`
			`'model_uid': os.environ.get('XINFERENCE_CHAT_MODEL_UID')`
			`},`
			`prompt_messages=[`
			`SystemPromptMessage(`
			`content='You are a helpful AI assistant.',`
			`),`
			`UserPromptMessage(`
			`content='Hello World!'`
			`)`
			`],`
			`model_parameters={`
			`'temperature': 0.7,`
			`'top_p': 1.0,`
			`},`
			`stop=['you'],`
			`user="abc-123",`
			`stream=False`
			`)`

			`assert isinstance(response, LLMResult)`
			`assert len(response.message.content) > 0`
			`assert response.usage.total_tokens > 0`

			`@pytest.mark.parametrize('setup_openai_mock, setup_xinference_mock', [['chat', 'none']], indirect=True)`
			`def test_invoke_stream_chat_model(setup_openai_mock, setup_xinference_mock):`
			`model = XinferenceAILargeLanguageModel()`

			`response = model.invoke(`
			`model='ChatGLM3',`
			`credentials={`
			`'server_url': os.environ.get('XINFERENCE_SERVER_URL'),`
			`'model_uid': os.environ.get('XINFERENCE_CHAT_MODEL_UID')`
			`},`
			`prompt_messages=[`
			`SystemPromptMessage(`
			`content='You are a helpful AI assistant.',`
			`),`
			`UserPromptMessage(`
			`content='Hello World!'`
			`)`
			`],`
			`model_parameters={`
			`'temperature': 0.7,`
			`'top_p': 1.0,`
			`},`
			`stop=['you'],`
			`stream=True,`
			`user="abc-123"`
			`)`

			`assert isinstance(response, Generator)`
			`for chunk in response:`
			`assert isinstance(chunk, LLMResultChunk)`
			`assert isinstance(chunk.delta, LLMResultChunkDelta)`
			`assert isinstance(chunk.delta.message, AssistantPromptMessage)`
			`assert len(chunk.delta.message.content) > 0 if chunk.delta.finish_reason is None else True`
			`"""`
			`Funtion calling of xinference does not support stream mode currently`
			`"""`
			`# def test_invoke_stream_chat_model_with_functions():`
			`# model = XinferenceAILargeLanguageModel()`

			`# response = model.invoke(`
			`# model='ChatGLM3-6b',`
			`# credentials={`
			`# 'server_url': os.environ.get('XINFERENCE_SERVER_URL'),`
			`# 'model_type': 'text-generation',`
			`# 'model_name': 'ChatGLM3',`
			`# 'model_uid': os.environ.get('XINFERENCE_CHAT_MODEL_UID')`
			`# },`
			`# prompt_messages=[`
			`# SystemPromptMessage(`
			`# content='你是一个天气机器人，可以通过调用函数来获取天气信息',`
			`# ),`
			`# UserPromptMessage(`
			`# content='波士顿天气如何？'`
			`# )`
			`# ],`
			`# model_parameters={`
			`# 'temperature': 0,`
			`# 'top_p': 1.0,`
			`# },`
			`# stop=['you'],`
			`# user='abc-123',`
			`# stream=True,`
			`# tools=[`
			`# PromptMessageTool(`
			`# name='get_current_weather',`
			`# description='Get the current weather in a given location',`
			`# parameters={`
			`# "type": "object",`
			`# "properties": {`
			`# "location": {`
			`# "type": "string",`
			`# "description": "The city and state e.g. San Francisco, CA"`
			`# },`
			`# "unit": {`
			`# "type": "string",`
			`# "enum": ["celsius", "fahrenheit"]`
			`# }`
			`# },`
			`# "required": [`
			`# "location"`
			`# ]`
			`# }`
			`# )`
			`# ]`
			`# )`

			`# assert isinstance(response, Generator)`

			`# call: LLMResultChunk = None`
			`# chunks = []`

			`# for chunk in response:`
			`# chunks.append(chunk)`
			`# assert isinstance(chunk, LLMResultChunk)`
			`# assert isinstance(chunk.delta, LLMResultChunkDelta)`
			`# assert isinstance(chunk.delta.message, AssistantPromptMessage)`
			`# assert len(chunk.delta.message.content) > 0 if chunk.delta.finish_reason is None else True`

			`# if chunk.delta.message.tool_calls and len(chunk.delta.message.tool_calls) > 0:`
			`# call = chunk`
			`# break`

			`# assert call is not None`
			`# assert call.delta.message.tool_calls[0].function.name == 'get_current_weather'`

			`# def test_invoke_chat_model_with_functions():`
			`# model = XinferenceAILargeLanguageModel()`

			`# response = model.invoke(`
			`# model='ChatGLM3-6b',`
			`# credentials={`
			`# 'server_url': os.environ.get('XINFERENCE_SERVER_URL'),`
			`# 'model_type': 'text-generation',`
			`# 'model_name': 'ChatGLM3',`
			`# 'model_uid': os.environ.get('XINFERENCE_CHAT_MODEL_UID')`
			`# },`
			`# prompt_messages=[`
			`# UserPromptMessage(`
			`# content='What is the weather like in San Francisco?'`
			`# )`
			`# ],`
			`# model_parameters={`
			`# 'temperature': 0.7,`
			`# 'top_p': 1.0,`
			`# },`
			`# stop=['you'],`
			`# user='abc-123',`
			`# stream=False,`
			`# tools=[`
			`# PromptMessageTool(`
			`# name='get_current_weather',`
			`# description='Get the current weather in a given location',`
			`# parameters={`
			`# "type": "object",`
			`# "properties": {`
			`# "location": {`
			`# "type": "string",`
			`# "description": "The city and state e.g. San Francisco, CA"`
			`# },`
			`# "unit": {`
			`# "type": "string",`
			`# "enum": [`
			`# "c",`
			`# "f"`
			`# ]`
			`# }`
			`# },`
			`# "required": [`
			`# "location"`
			`# ]`
			`# }`
			`# )`
			`# ]`
			`# )`

			`# assert isinstance(response, LLMResult)`
			`# assert len(response.message.content) > 0`
			`# assert response.usage.total_tokens > 0`
			`# assert response.message.tool_calls[0].function.name == 'get_current_weather'`

			`@pytest.mark.parametrize('setup_openai_mock, setup_xinference_mock', [['completion', 'none']], indirect=True)`
			`def test_validate_credentials_for_generation_model(setup_openai_mock, setup_xinference_mock):`
			`model = XinferenceAILargeLanguageModel()`

			`with pytest.raises(CredentialsValidateFailedError):`
			`model.validate_credentials(`
			`model='alapaca',`
			`credentials={`
			`'server_url': os.environ.get('XINFERENCE_SERVER_URL'),`
			`'model_uid': 'www ' + os.environ.get('XINFERENCE_GENERATION_MODEL_UID')`
			`}`
			`)`

			`with pytest.raises(CredentialsValidateFailedError):`
			`model.validate_credentials(`
			`model='alapaca',`
			`credentials={`
			`'server_url': '',`
			`'model_uid': ''`
			`}`
			`)`

			`model.validate_credentials(`
			`model='alapaca',`
			`credentials={`
			`'server_url': os.environ.get('XINFERENCE_SERVER_URL'),`
			`'model_uid': os.environ.get('XINFERENCE_GENERATION_MODEL_UID')`
			`}`
			`)`

			`@pytest.mark.parametrize('setup_openai_mock, setup_xinference_mock', [['completion', 'none']], indirect=True)`
			`def test_invoke_generation_model(setup_openai_mock, setup_xinference_mock):`
			`model = XinferenceAILargeLanguageModel()`

			`response = model.invoke(`
			`model='alapaca',`
			`credentials={`
			`'server_url': os.environ.get('XINFERENCE_SERVER_URL'),`
			`'model_uid': os.environ.get('XINFERENCE_GENERATION_MODEL_UID')`
			`},`
			`prompt_messages=[`
			`UserPromptMessage(`
			`content='the United States is'`
			`)`
			`],`
			`model_parameters={`
			`'temperature': 0.7,`
			`'top_p': 1.0,`
			`},`
			`stop=['you'],`
			`user="abc-123",`
			`stream=False`
			`)`

			`assert isinstance(response, LLMResult)`
			`assert len(response.message.content) > 0`
			`assert response.usage.total_tokens > 0`

			`@pytest.mark.parametrize('setup_openai_mock, setup_xinference_mock', [['completion', 'none']], indirect=True)`
			`def test_invoke_stream_generation_model(setup_openai_mock, setup_xinference_mock):`
			`model = XinferenceAILargeLanguageModel()`

			`response = model.invoke(`
			`model='alapaca',`
			`credentials={`
			`'server_url': os.environ.get('XINFERENCE_SERVER_URL'),`
			`'model_uid': os.environ.get('XINFERENCE_GENERATION_MODEL_UID')`
			`},`
			`prompt_messages=[`
			`UserPromptMessage(`
			`content='the United States is'`
			`)`
			`],`
			`model_parameters={`
			`'temperature': 0.7,`
			`'top_p': 1.0,`
			`},`
			`stop=['you'],`
			`stream=True,`
			`user="abc-123"`
			`)`

			`assert isinstance(response, Generator)`
			`for chunk in response:`
			`assert isinstance(chunk, LLMResultChunk)`
			`assert isinstance(chunk.delta, LLMResultChunkDelta)`
			`assert isinstance(chunk.delta.message, AssistantPromptMessage)`
			`assert len(chunk.delta.message.content) > 0 if chunk.delta.finish_reason is None else True`

			`def test_get_num_tokens():`
			`model = XinferenceAILargeLanguageModel()`

			`num_tokens = model.get_num_tokens(`
			`model='ChatGLM3',`
			`credentials={`
			`'server_url': os.environ.get('XINFERENCE_SERVER_URL'),`
			`'model_uid': os.environ.get('XINFERENCE_GENERATION_MODEL_UID')`
			`},`
			`prompt_messages=[`
			`SystemPromptMessage(`
			`content='You are a helpful AI assistant.',`
			`),`
			`UserPromptMessage(`
			`content='Hello World!'`
			`)`
			`],`
			`tools=[`
			`PromptMessageTool(`
			`name='get_current_weather',`
			`description='Get the current weather in a given location',`
			`parameters={`
			`"type": "object",`
			`"properties": {`
			`"location": {`
			`"type": "string",`
			`"description": "The city and state e.g. San Francisco, CA"`
			`},`
			`"unit": {`
			`"type": "string",`
			`"enum": [`
			`"c",`
			`"f"`
			`]`
			`}`
			`},`
			`"required": [`
			`"location"`
			`]`
			`}`
			`)`
			`]`
			`)`

			`assert isinstance(num_tokens, int)`
			`assert num_tokens == 77`

			`num_tokens = model.get_num_tokens(`
			`model='ChatGLM3',`
			`credentials={`
			`'server_url': os.environ.get('XINFERENCE_SERVER_URL'),`
			`'model_uid': os.environ.get('XINFERENCE_GENERATION_MODEL_UID')`
			`},`
			`prompt_messages=[`
			`SystemPromptMessage(`
			`content='You are a helpful AI assistant.',`
			`),`
			`UserPromptMessage(`
			`content='Hello World!'`
			`)`
			`],`
			`)`

			`assert isinstance(num_tokens, int)`
			`assert num_tokens == 21`