feat: couchbase integration (#6165)
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: Elliot Scribner <elliot.scribner@couchbase.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
Co-authored-by: Bowen Liang <bowenliang@apache.org>
parent fc37e654fc
commit c8ef9223e5
.github/workflows/api-tests.yml (3 changes)

@@ -78,7 +78,7 @@ jobs:
       - name: Run Workflow
         run: poetry run -C api bash dev/pytest/pytest_workflow.sh

-      - name: Set up Vector Stores (Weaviate, Qdrant, PGVector, Milvus, PgVecto-RS, Chroma, MyScale, ElasticSearch)
+      - name: Set up Vector Stores (Weaviate, Qdrant, PGVector, Milvus, PgVecto-RS, Chroma, MyScale, ElasticSearch, Couchbase)
         uses: hoverkraft-tech/compose-action@v2.0.0
         with:
           compose-file: |
@@ -86,6 +86,7 @@ jobs:
           services: |
             weaviate
             qdrant
+            couchbase-server
             etcd
             minio
             milvus-standalone
.github/workflows/expose_service_ports.sh (4 changes)

@@ -7,5 +7,7 @@ yq eval '.services["milvus-standalone"].ports += ["19530:19530"]' -i docker/dock
 yq eval '.services.pgvector.ports += ["5433:5432"]' -i docker/docker-compose.yaml
 yq eval '.services["pgvecto-rs"].ports += ["5431:5432"]' -i docker/docker-compose.yaml
 yq eval '.services["elasticsearch"].ports += ["9200:9200"]' -i docker/docker-compose.yaml
+yq eval '.services.couchbase-server.ports += ["8091-8096:8091-8096"]' -i docker/docker-compose.yaml
+yq eval '.services.couchbase-server.ports += ["11210:11210"]' -i docker/docker-compose.yaml

-echo "Ports exposed for sandbox, weaviate, qdrant, chroma, milvus, pgvector, pgvecto-rs, elasticsearch"
+echo "Ports exposed for sandbox, weaviate, qdrant, chroma, milvus, pgvector, pgvecto-rs, elasticsearch, couchbase"
.gitignore (1 change)

@@ -173,6 +173,7 @@ docker/volumes/myscale/log/*
 docker/volumes/unstructured/*
 docker/volumes/pgvector/data/*
 docker/volumes/pgvecto_rs/data/*
+docker/volumes/couchbase/*

 docker/nginx/conf.d/default.conf
 docker/nginx/ssl/*
@@ -120,7 +120,7 @@ SUPABASE_URL=your-server-url
 WEB_API_CORS_ALLOW_ORIGINS=http://127.0.0.1:3000,*
 CONSOLE_CORS_ALLOW_ORIGINS=http://127.0.0.1:3000,*

-# Vector database configuration, support: weaviate, qdrant, milvus, myscale, relyt, pgvecto_rs, pgvector, pgvector, chroma, opensearch, tidb_vector, vikingdb, upstash
+# Vector database configuration, support: weaviate, qdrant, milvus, myscale, relyt, pgvecto_rs, pgvector, pgvector, chroma, opensearch, tidb_vector, couchbase, vikingdb, upstash
 VECTOR_STORE=weaviate

 # Weaviate configuration
@@ -136,6 +136,13 @@ QDRANT_CLIENT_TIMEOUT=20
 QDRANT_GRPC_ENABLED=false
 QDRANT_GRPC_PORT=6334

+# Couchbase configuration
+COUCHBASE_CONNECTION_STRING=127.0.0.1
+COUCHBASE_USER=Administrator
+COUCHBASE_PASSWORD=password
+COUCHBASE_BUCKET_NAME=Embeddings
+COUCHBASE_SCOPE_NAME=_default
+
 # Milvus configuration
 MILVUS_URI=http://127.0.0.1:19530
 MILVUS_TOKEN=
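For orientation (not part of the diff): a minimal sketch of how the COUCHBASE_* settings above map onto a client connection with the couchbase Python SDK this commit depends on. The variable names come from the hunk above; the fallback values and the standalone-script form are illustrative only.

import os
from datetime import timedelta

from couchbase.auth import PasswordAuthenticator
from couchbase.cluster import Cluster
from couchbase.options import ClusterOptions

# Settings introduced in the hunk above (the integration test below uses the couchbase:// form).
connection_string = os.environ.get("COUCHBASE_CONNECTION_STRING", "couchbase://127.0.0.1")
user = os.environ.get("COUCHBASE_USER", "Administrator")
password = os.environ.get("COUCHBASE_PASSWORD", "password")

# Authenticate, connect, and wait for the cluster, mirroring CouchbaseVector.__init__ further down.
cluster = Cluster(connection_string, ClusterOptions(PasswordAuthenticator(user, password)))
cluster.wait_until_ready(timedelta(seconds=5))

bucket = cluster.bucket(os.environ.get("COUCHBASE_BUCKET_NAME", "Embeddings"))
scope = bucket.scope(os.environ.get("COUCHBASE_SCOPE_NAME", "_default"))
print("connected to scope:", scope.name)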
@@ -278,6 +278,7 @@ def migrate_knowledge_vector_database():
         VectorType.BAIDU,
         VectorType.VIKINGDB,
         VectorType.UPSTASH,
+        VectorType.COUCHBASE,
     }
     page = 1
     while True:
@@ -17,6 +17,7 @@ from configs.middleware.storage.tencent_cos_storage_config import TencentCloudCO
 from configs.middleware.storage.volcengine_tos_storage_config import VolcengineTOSStorageConfig
 from configs.middleware.vdb.analyticdb_config import AnalyticdbConfig
 from configs.middleware.vdb.chroma_config import ChromaConfig
+from configs.middleware.vdb.couchbase_config import CouchbaseConfig
 from configs.middleware.vdb.elasticsearch_config import ElasticsearchConfig
 from configs.middleware.vdb.milvus_config import MilvusConfig
 from configs.middleware.vdb.myscale_config import MyScaleConfig
@@ -251,6 +252,7 @@ class MiddlewareConfig(
     TiDBVectorConfig,
     WeaviateConfig,
     ElasticsearchConfig,
+    CouchbaseConfig,
     InternalTestConfig,
     VikingDBConfig,
     UpstashConfig,
api/configs/middleware/vdb/couchbase_config.py (new file, 34 lines)

@@ -0,0 +1,34 @@
from typing import Optional

from pydantic import BaseModel, Field


class CouchbaseConfig(BaseModel):
    """
    Couchbase configs
    """

    COUCHBASE_CONNECTION_STRING: Optional[str] = Field(
        description="COUCHBASE connection string",
        default=None,
    )

    COUCHBASE_USER: Optional[str] = Field(
        description="COUCHBASE user",
        default=None,
    )

    COUCHBASE_PASSWORD: Optional[str] = Field(
        description="COUCHBASE password",
        default=None,
    )

    COUCHBASE_BUCKET_NAME: Optional[str] = Field(
        description="COUCHBASE bucket name",
        default=None,
    )

    COUCHBASE_SCOPE_NAME: Optional[str] = Field(
        description="COUCHBASE scope name",
        default=None,
    )
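Illustration only (not part of the commit): the settings model above is a plain pydantic BaseModel whose fields all default to None, so it constructs with or without values; in Dify the values come from the COUCHBASE_* environment variables through the MiddlewareConfig composition shown in the previous hunk. A hypothetical direct instantiation:

from configs.middleware.vdb.couchbase_config import CouchbaseConfig

# Unset fields stay None rather than failing validation.
empty = CouchbaseConfig()
assert empty.COUCHBASE_CONNECTION_STRING is None

# Explicit values, matching the defaults used elsewhere in this commit.
cfg = CouchbaseConfig(
    COUCHBASE_CONNECTION_STRING="couchbase://couchbase-server",
    COUCHBASE_USER="Administrator",
    COUCHBASE_PASSWORD="password",
    COUCHBASE_BUCKET_NAME="Embeddings",
    COUCHBASE_SCOPE_NAME="_default",
)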
@@ -640,6 +640,7 @@ class DatasetRetrievalSettingApi(Resource):
                 | VectorType.ELASTICSEARCH
                 | VectorType.PGVECTOR
                 | VectorType.TIDB_ON_QDRANT
+                | VectorType.COUCHBASE
             ):
                 return {
                     "retrieval_method": [
@@ -678,6 +679,7 @@ class DatasetRetrievalSettingMockApi(Resource):
                 | VectorType.MYSCALE
                 | VectorType.ORACLE
                 | VectorType.ELASTICSEARCH
+                | VectorType.COUCHBASE
                 | VectorType.PGVECTOR
             ):
                 return {
api/core/rag/datasource/vdb/couchbase/__init__.py (new file, empty)
api/core/rag/datasource/vdb/couchbase/couchbase_vector.py (new file, 378 lines)

@@ -0,0 +1,378 @@
import json
import logging
import time
import uuid
from datetime import timedelta
from typing import Any

from couchbase import search
from couchbase.auth import PasswordAuthenticator
from couchbase.cluster import Cluster
from couchbase.management.search import SearchIndex

# needed for options -- cluster, timeout, SQL++ (N1QL) query, etc.
from couchbase.options import ClusterOptions, SearchOptions
from couchbase.vector_search import VectorQuery, VectorSearch
from flask import current_app
from pydantic import BaseModel, model_validator

from core.rag.datasource.vdb.vector_base import BaseVector
from core.rag.datasource.vdb.vector_factory import AbstractVectorFactory
from core.rag.datasource.vdb.vector_type import VectorType
from core.rag.embedding.embedding_base import Embeddings
from core.rag.models.document import Document
from extensions.ext_redis import redis_client
from models.dataset import Dataset

logger = logging.getLogger(__name__)


class CouchbaseConfig(BaseModel):
    connection_string: str
    user: str
    password: str
    bucket_name: str
    scope_name: str

    @model_validator(mode="before")
    @classmethod
    def validate_config(cls, values: dict) -> dict:
        if not values.get("connection_string"):
            raise ValueError("config COUCHBASE_CONNECTION_STRING is required")
        if not values.get("user"):
            raise ValueError("config COUCHBASE_USER is required")
        if not values.get("password"):
            raise ValueError("config COUCHBASE_PASSWORD is required")
        if not values.get("bucket_name"):
            raise ValueError("config COUCHBASE_BUCKET_NAME is required")
        if not values.get("scope_name"):
            raise ValueError("config COUCHBASE_SCOPE_NAME is required")
        return values


class CouchbaseVector(BaseVector):
    def __init__(self, collection_name: str, config: CouchbaseConfig):
        super().__init__(collection_name)
        self._client_config = config

        """Connect to couchbase"""

        auth = PasswordAuthenticator(config.user, config.password)
        options = ClusterOptions(auth)
        self._cluster = Cluster(config.connection_string, options)
        self._bucket = self._cluster.bucket(config.bucket_name)
        self._scope = self._bucket.scope(config.scope_name)
        self._bucket_name = config.bucket_name
        self._scope_name = config.scope_name

        # Wait until the cluster is ready for use.
        self._cluster.wait_until_ready(timedelta(seconds=5))

    def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs):
        index_id = str(uuid.uuid4()).replace("-", "")
        self._create_collection(uuid=index_id, vector_length=len(embeddings[0]))
        self.add_texts(texts, embeddings)

    def _create_collection(self, vector_length: int, uuid: str):
        lock_name = "vector_indexing_lock_{}".format(self._collection_name)
        with redis_client.lock(lock_name, timeout=20):
            collection_exist_cache_key = "vector_indexing_{}".format(self._collection_name)
            if redis_client.get(collection_exist_cache_key):
                return
            if self._collection_exists(self._collection_name):
                return
            manager = self._bucket.collections()
            manager.create_collection(self._client_config.scope_name, self._collection_name)

            index_manager = self._scope.search_indexes()

            index_definition = json.loads("""
            {
                "type": "fulltext-index",
                "name": "Embeddings._default.Vector_Search",
                "uuid": "26d4db528e78b716",
                "sourceType": "gocbcore",
                "sourceName": "Embeddings",
                "sourceUUID": "2242e4a25b4decd6650c9c7b3afa1dbf",
                "planParams": {
                    "maxPartitionsPerPIndex": 1024,
                    "indexPartitions": 1
                },
                "params": {
                    "doc_config": {
                        "docid_prefix_delim": "",
                        "docid_regexp": "",
                        "mode": "scope.collection.type_field",
                        "type_field": "type"
                    },
                    "mapping": {
                        "analysis": { },
                        "default_analyzer": "standard",
                        "default_datetime_parser": "dateTimeOptional",
                        "default_field": "_all",
                        "default_mapping": {
                            "dynamic": true,
                            "enabled": true
                        },
                        "default_type": "_default",
                        "docvalues_dynamic": false,
                        "index_dynamic": true,
                        "store_dynamic": true,
                        "type_field": "_type",
                        "types": {
                            "collection_name": {
                                "dynamic": true,
                                "enabled": true,
                                "properties": {
                                    "embedding": {
                                        "dynamic": false,
                                        "enabled": true,
                                        "fields": [
                                            {
                                                "dims": 1536,
                                                "index": true,
                                                "name": "embedding",
                                                "similarity": "dot_product",
                                                "type": "vector",
                                                "vector_index_optimized_for": "recall"
                                            }
                                        ]
                                    },
                                    "metadata": {
                                        "dynamic": true,
                                        "enabled": true
                                    },
                                    "text": {
                                        "dynamic": false,
                                        "enabled": true,
                                        "fields": [
                                            {
                                                "index": true,
                                                "name": "text",
                                                "store": true,
                                                "type": "text"
                                            }
                                        ]
                                    }
                                }
                            }
                        }
                    },
                    "store": {
                        "indexType": "scorch",
                        "segmentVersion": 16
                    }
                },
                "sourceParams": { }
            }
            """)
            index_definition["name"] = self._collection_name + "_search"
            index_definition["uuid"] = uuid
            index_definition["params"]["mapping"]["types"]["collection_name"]["properties"]["embedding"]["fields"][0][
                "dims"
            ] = vector_length
            index_definition["params"]["mapping"]["types"][self._scope_name + "." + self._collection_name] = (
                index_definition["params"]["mapping"]["types"].pop("collection_name")
            )
            time.sleep(2)
            index_manager.upsert_index(
                SearchIndex(
                    index_definition["name"],
                    params=index_definition["params"],
                    source_name=self._bucket_name,
                ),
            )
            time.sleep(1)

            redis_client.set(collection_exist_cache_key, 1, ex=3600)

    def _collection_exists(self, name: str):
        scope_collection_map: dict[str, Any] = {}

        # Get a list of all scopes in the bucket
        for scope in self._bucket.collections().get_all_scopes():
            scope_collection_map[scope.name] = []

            # Get a list of all the collections in the scope
            for collection in scope.collections:
                scope_collection_map[scope.name].append(collection.name)

        # Check if the collection exists in the scope
        return self._collection_name in scope_collection_map[self._scope_name]

    def get_type(self) -> str:
        return VectorType.COUCHBASE

    def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs):
        uuids = self._get_uuids(documents)
        texts = [d.page_content for d in documents]
        metadatas = [d.metadata for d in documents]

        doc_ids = []

        documents_to_insert = [
            {"text": text, "embedding": vector, "metadata": metadata}
            for id, text, vector, metadata in zip(uuids, texts, embeddings, metadatas)
        ]
        for doc, id in zip(documents_to_insert, uuids):
            result = self._scope.collection(self._collection_name).upsert(id, doc)

        doc_ids.extend(uuids)

        return doc_ids

    def text_exists(self, id: str) -> bool:
        # Use a parameterized query for safety and correctness
        query = f"""
            SELECT COUNT(1) AS count FROM
            `{self._client_config.bucket_name}`.{self._client_config.scope_name}.{self._collection_name}
            WHERE META().id = $doc_id
            """
        # Pass the id as a parameter to the query
        result = self._cluster.query(query, named_parameters={"doc_id": id}).execute()
        for row in result:
            return row["count"] > 0
        return False  # Return False if no rows are returned

    def delete_by_ids(self, ids: list[str]) -> None:
        query = f"""
            DELETE FROM `{self._bucket_name}`.{self._client_config.scope_name}.{self._collection_name}
            WHERE META().id IN $doc_ids;
            """
        try:
            self._cluster.query(query, named_parameters={"doc_ids": ids}).execute()
        except Exception as e:
            logger.error(e)

    def delete_by_document_id(self, document_id: str):
        query = f"""
            DELETE FROM
            `{self._client_config.bucket_name}`.{self._client_config.scope_name}.{self._collection_name}
            WHERE META().id = $doc_id;
            """
        self._cluster.query(query, named_parameters={"doc_id": document_id}).execute()

    # def get_ids_by_metadata_field(self, key: str, value: str):
    #     query = f"""
    #         SELECT id FROM
    #         `{self._client_config.bucket_name}`.{self._client_config.scope_name}.{self._collection_name}
    #         WHERE `metadata.{key}` = $value;
    #         """
    #     result = self._cluster.query(query, named_parameters={'value':value})
    #     return [row['id'] for row in result.rows()]

    def delete_by_metadata_field(self, key: str, value: str) -> None:
        query = f"""
            DELETE FROM `{self._client_config.bucket_name}`.{self._client_config.scope_name}.{self._collection_name}
            WHERE metadata.{key} = $value;
            """
        self._cluster.query(query, named_parameters={"value": value}).execute()

    def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]:
        top_k = kwargs.get("top_k", 5)
        score_threshold = kwargs.get("score_threshold") or 0.0

        search_req = search.SearchRequest.create(
            VectorSearch.from_vector_query(
                VectorQuery(
                    "embedding",
                    query_vector,
                    top_k,
                )
            )
        )
        try:
            search_iter = self._scope.search(
                self._collection_name + "_search",
                search_req,
                SearchOptions(limit=top_k, collections=[self._collection_name], fields=["*"]),
            )

            docs = []
            # Parse the results
            for row in search_iter.rows():
                text = row.fields.pop("text")
                metadata = self._format_metadata(row.fields)
                score = row.score
                metadata["score"] = score
                doc = Document(page_content=text, metadata=metadata)
                if score >= score_threshold:
                    docs.append(doc)
        except Exception as e:
            raise ValueError(f"Search failed with error: {e}")

        return docs

    def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]:
        top_k = kwargs.get("top_k", 2)
        try:
            CBrequest = search.SearchRequest.create(search.QueryStringQuery("text:" + query))
            search_iter = self._scope.search(
                self._collection_name + "_search", CBrequest, SearchOptions(limit=top_k, fields=["*"])
            )

            docs = []
            for row in search_iter.rows():
                text = row.fields.pop("text")
                metadata = self._format_metadata(row.fields)
                score = row.score
                metadata["score"] = score
                doc = Document(page_content=text, metadata=metadata)
                docs.append(doc)

        except Exception as e:
            raise ValueError(f"Search failed with error: {e}")

        return docs

    def delete(self):
        manager = self._bucket.collections()
        scopes = manager.get_all_scopes()

        for scope in scopes:
            for collection in scope.collections:
                if collection.name == self._collection_name:
                    manager.drop_collection("_default", self._collection_name)

    def _format_metadata(self, row_fields: dict[str, Any]) -> dict[str, Any]:
        """Helper method to format the metadata from the Couchbase Search API.

        Args:
            row_fields (Dict[str, Any]): The fields to format.

        Returns:
            Dict[str, Any]: The formatted metadata.
        """
        metadata = {}
        for key, value in row_fields.items():
            # Couchbase Search returns the metadata key with a prefix
            # `metadata.` We remove it to get the original metadata key
            if key.startswith("metadata"):
                new_key = key.split("metadata" + ".")[-1]
                metadata[new_key] = value
            else:
                metadata[key] = value

        return metadata


class CouchbaseVectorFactory(AbstractVectorFactory):
    def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> CouchbaseVector:
        if dataset.index_struct_dict:
            class_prefix: str = dataset.index_struct_dict["vector_store"]["class_prefix"]
            collection_name = class_prefix
        else:
            dataset_id = dataset.id
            collection_name = Dataset.gen_collection_name_by_id(dataset_id)
            dataset.index_struct = json.dumps(self.gen_index_struct_dict(VectorType.COUCHBASE, collection_name))

        config = current_app.config
        return CouchbaseVector(
            collection_name=collection_name,
            config=CouchbaseConfig(
                connection_string=config.get("COUCHBASE_CONNECTION_STRING"),
                user=config.get("COUCHBASE_USER"),
                password=config.get("COUCHBASE_PASSWORD"),
                bucket_name=config.get("COUCHBASE_BUCKET_NAME"),
                scope_name=config.get("COUCHBASE_SCOPE_NAME"),
            ),
        )
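A minimal usage sketch of the class above (illustrative, not part of the commit): the connection values mirror the integration test further down, the collection name and embedding are placeholders, and a reachable Redis (or the test's mock) is assumed because _create_collection takes a Redis lock.

from core.rag.datasource.vdb.couchbase.couchbase_vector import CouchbaseConfig, CouchbaseVector
from core.rag.models.document import Document

vector = CouchbaseVector(
    collection_name="Vector_index_example_Node",  # placeholder collection name
    config=CouchbaseConfig(
        connection_string="couchbase://127.0.0.1",
        user="Administrator",
        password="password",
        bucket_name="Embeddings",
        scope_name="_default",
    ),
)

doc = Document(page_content="hello couchbase", metadata={"doc_id": "example-1"})
embedding = [0.1] * 1536  # any fixed-length vector; the FTS index dims are taken from len(embedding)

vector.create(texts=[doc], embeddings=[embedding])      # creates the collection and its search index
hits = vector.search_by_vector(embedding, top_k=1)      # vector similarity search
hits_ft = vector.search_by_full_text("hello", top_k=1)  # full-text search over the `text` field
vector.delete()                                         # drops the collection again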
@@ -114,6 +114,10 @@ class Vector:
                 from core.rag.datasource.vdb.analyticdb.analyticdb_vector import AnalyticdbVectorFactory

                 return AnalyticdbVectorFactory
+            case VectorType.COUCHBASE:
+                from core.rag.datasource.vdb.couchbase.couchbase_vector import CouchbaseVectorFactory
+
+                return CouchbaseVectorFactory
             case VectorType.BAIDU:
                 from core.rag.datasource.vdb.baidu.baidu_vector import BaiduVectorFactory

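For context (a simplified, hypothetical sketch rather than the actual Vector class): the hunk above extends a match/case dispatch that resolves VECTOR_STORE=couchbase to the new factory, roughly like this:

from core.rag.datasource.vdb.vector_type import VectorType

def get_vector_factory(vector_type: str):
    # Condensed illustration of the dispatch; only the Couchbase branch is shown.
    match vector_type:
        case VectorType.COUCHBASE:
            from core.rag.datasource.vdb.couchbase.couchbase_vector import CouchbaseVectorFactory

            return CouchbaseVectorFactory
        case _:
            raise ValueError(f"Vector store {vector_type} is not supported.")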
@@ -16,6 +16,7 @@ class VectorType(str, Enum):
     TENCENT = "tencent"
     ORACLE = "oracle"
     ELASTICSEARCH = "elasticsearch"
+    COUCHBASE = "couchbase"
     BAIDU = "baidu"
     VIKINGDB = "vikingdb"
     UPSTASH = "upstash"
api/poetry.lock (generated, 55 changes)

@@ -1801,6 +1801,46 @@ requests = ">=2.8"
 six = "*"
 xmltodict = "*"

+[[package]]
+name = "couchbase"
+version = "4.3.3"
+description = "Python Client for Couchbase"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "couchbase-4.3.3-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:d8069e4f01332859d56cca597874645c914699162b3979d1b432f0dfc186b124"},
+    {file = "couchbase-4.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1caa6cfef49c785b35b1702102f718227f351df87bba2694b9334520c41e9eb5"},
+    {file = "couchbase-4.3.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f4a9a65c44935249fa078fb90a3c28ea71da9d2d5889fcd514b12d0538010ae0"},
+    {file = "couchbase-4.3.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4f144b8c482c18283d8e419b844630d41f3249b07d43d40b5e3535444e57d0fb"},
+    {file = "couchbase-4.3.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1c534fba6fdc7cf47eed9dee8a57d1e9eb867bf008574e321fa380a77cebf32f"},
+    {file = "couchbase-4.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:b841be06e0e4370b69ebef6bca3409c378186f7d6e964cd645ba18e97216c022"},
+    {file = "couchbase-4.3.3-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:eee7a73b3acbdc78ae314fddf7f975b3c9e05df07df255f4dcc878939a2abae0"},
+    {file = "couchbase-4.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:53417cafcf90ff4e2fd81ebba2a08b7ad56f17160d1c5019ad3b09c758aeb363"},
+    {file = "couchbase-4.3.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0cefd13bea8b0f150f1b9d27fd7614f971f77419b31817781d26ba315ed658bb"},
+    {file = "couchbase-4.3.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:78fa1054d7740e2fe38fce0a2aab4e9a2d30263d894e0615ee5df297f02f59a3"},
+    {file = "couchbase-4.3.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:eb093899cfad5a7472258a9b6a57775dbf23a6e0180241507ba89ce3ab241e41"},
+    {file = "couchbase-4.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f7cfbdc699af5715f49365ffbb05a6a7366a534c0d7161edf270ad3e735a6c5d"},
+    {file = "couchbase-4.3.3-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:58352cae9b8affdaa2ac012e0a03c8c2632ee6297a878232888b4e0360d0d5df"},
+    {file = "couchbase-4.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:728e7e3b5e1682706cb9d63993d289226d02a25089527b8ecb4e3889dabc38cf"},
+    {file = "couchbase-4.3.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:73014bf098cf14187a39cc13453e0d859c1d54568df28f69cc308a9a5f24feb2"},
+    {file = "couchbase-4.3.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a743375804068ae01b73c916bfca738764c8c12f381bb399ef04e784935856a1"},
+    {file = "couchbase-4.3.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:394c122cfe02a76a99e7d5178e64129f6da49843225e78d8629abcab556c24af"},
+    {file = "couchbase-4.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:bf85d7a5cda548d9801614651206068b4445fa37972e62b14d7521a958198693"},
+    {file = "couchbase-4.3.3-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:92d23c9cedd571631070791f2afee0e3d7d8c9ce1bf2ea6e9a4f2fdbc37a0f1e"},
+    {file = "couchbase-4.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:38c42eb29a73cce2998ae5df45bd61b16dce9765d3bff968ec5cf6a622faa291"},
+    {file = "couchbase-4.3.3-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:afed137bf0edc642d7b201b6ab7b1e7117bb4c8eac6b2f253cc6e106f334a2a1"},
+    {file = "couchbase-4.3.3-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:954d991377d47883aaf903934c5d0f19577680a2abf80d3ce5bb9b3c80991fc7"},
+    {file = "couchbase-4.3.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d5552b9fa684630698dc98d6f3b1082540634c1b7ad5bf53b843b5da57b0169c"},
+    {file = "couchbase-4.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:f88f2b7e0c894f7237d9f3fb5c46abc44b8151a97b3ca8e75f57d23ebf59f9da"},
+    {file = "couchbase-4.3.3-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:769e1e2367ea1d4de181fcd4b4e353e9abef97d15b581a6c5aea49ece3dc7d59"},
+    {file = "couchbase-4.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:47f59a0b35ffce060583fd11f98f049f3b70701cf14aab9ac092594aca486aeb"},
+    {file = "couchbase-4.3.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:440bb93d611827ba0ea2403c6f204fe931467a6cb5811f0e03bf1779204ef843"},
+    {file = "couchbase-4.3.3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cdb4dde62e1d41c0b8707121ab68fa78b7a1508541bd48fc850be396f91bc8d9"},
+    {file = "couchbase-4.3.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7f8cf45f317b39cc19db5c67b565662f08d6c90305b3aa14e04bc22707258213"},
+    {file = "couchbase-4.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:c97d48ad486c8f201b4482d5594258f949369cb44792ed148d5159a3d12ae21b"},
+    {file = "couchbase-4.3.3.tar.gz", hash = "sha256:27808500551564b39b46943cf3daab572694889c1eb638425d363edb48b20da7"},
+]
+
 [[package]]
 name = "coverage"
 version = "7.2.7"

@@ -6850,6 +6890,19 @@ files = [
    {file = "pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7"},
    {file = "pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204"},
    {file = "pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8"},
    {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155"},
    {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145"},
    {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c"},
    {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c"},
    {file = "pyarrow-17.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca"},
    {file = "pyarrow-17.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb"},
    {file = "pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df"},
    {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687"},
    {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b"},
    {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5"},
    {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda"},
    {file = "pyarrow-17.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204"},
    {file = "pyarrow-17.0.0.tar.gz", hash = "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28"},
]

[package.dependencies]

@@ -10866,4 +10919,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<3.13"
-content-hash = "1b268122d3d4771ba219f0e983322e0454b7b8644dba35da38d7d950d489e1ba"
+content-hash = "52552faf5f4823056eb48afe05349ab2f0e9a5bc42105211ccbbb54b59e27b59"
@@ -239,6 +239,7 @@ alibabacloud_gpdb20160503 = "~3.8.0"
 alibabacloud_tea_openapi = "~0.3.9"
 chromadb = "0.5.1"
 clickhouse-connect = "~0.7.16"
+couchbase = "~4.3.0"
 elasticsearch = "8.14.0"
 opensearch-py = "2.4.0"
 oracledb = "~2.2.1"
api/tests/integration_tests/vdb/couchbase/test_couchbase.py (new file, 50 lines)

@@ -0,0 +1,50 @@
import subprocess
import time

from core.rag.datasource.vdb.couchbase.couchbase_vector import CouchbaseConfig, CouchbaseVector
from tests.integration_tests.vdb.test_vector_store import (
    AbstractVectorTest,
    get_example_text,
    setup_mock_redis,
)


def wait_for_healthy_container(service_name="couchbase-server", timeout=300):
    start_time = time.time()
    while time.time() - start_time < timeout:
        result = subprocess.run(
            ["docker", "inspect", "--format", "{{.State.Health.Status}}", service_name], capture_output=True, text=True
        )
        if result.stdout.strip() == "healthy":
            print(f"{service_name} is healthy!")
            return True
        else:
            print(f"Waiting for {service_name} to be healthy...")
            time.sleep(10)
    raise TimeoutError(f"{service_name} did not become healthy in time")


class CouchbaseTest(AbstractVectorTest):
    def __init__(self):
        super().__init__()
        self.vector = CouchbaseVector(
            collection_name=self.collection_name,
            config=CouchbaseConfig(
                connection_string="couchbase://127.0.0.1",
                user="Administrator",
                password="password",
                bucket_name="Embeddings",
                scope_name="_default",
            ),
        )

    def search_by_vector(self):
        # brief sleep to ensure document is indexed
        time.sleep(5)
        hits_by_vector = self.vector.search_by_vector(query_vector=self.example_embedding)
        assert len(hits_by_vector) == 1


def test_couchbase(setup_mock_redis):
    wait_for_healthy_container("couchbase-server", timeout=60)
    CouchbaseTest().run_all_tests()
@@ -11,4 +11,5 @@ pytest api/tests/integration_tests/vdb/chroma \
   api/tests/integration_tests/vdb/vikingdb \
   api/tests/integration_tests/vdb/baidu \
   api/tests/integration_tests/vdb/tcvectordb \
-  api/tests/integration_tests/vdb/upstash
+  api/tests/integration_tests/vdb/upstash \
+  api/tests/integration_tests/vdb/couchbase \
@@ -375,7 +375,7 @@ SUPABASE_URL=your-server-url
 # ------------------------------

 # The type of vector store to use.
-# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `tidb_vector`, `oracle`, `tencent`, `elasticsearch`, `analyticdb`, `vikingdb`.
+# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `tidb_vector`, `oracle`, `tencent`, `elasticsearch`, `analyticdb`, `couchbase`, `vikingdb`.
 VECTOR_STORE=weaviate

 # The Weaviate endpoint URL. Only available when VECTOR_STORE is `weaviate`.
@@ -414,6 +414,14 @@ MYSCALE_PASSWORD=
 MYSCALE_DATABASE=dify
 MYSCALE_FTS_PARAMS=

+# Couchbase configurations, only available when VECTOR_STORE is `couchbase`
+# The connection string must include hostname defined in the docker-compose file (couchbase-server in this case)
+COUCHBASE_CONNECTION_STRING=couchbase://couchbase-server
+COUCHBASE_USER=Administrator
+COUCHBASE_PASSWORD=password
+COUCHBASE_BUCKET_NAME=Embeddings
+COUCHBASE_SCOPE_NAME=_default
+
 # pgvector configurations, only available when VECTOR_STORE is `pgvector`
 PGVECTOR_HOST=pgvector
 PGVECTOR_PORT=5432
docker/couchbase-server/Dockerfile (new file, 4 lines)

@@ -0,0 +1,4 @@
FROM couchbase/server:latest AS stage_base
# FROM couchbase:latest AS stage_base
COPY init-cbserver.sh /opt/couchbase/init/
RUN chmod +x /opt/couchbase/init/init-cbserver.sh
docker/couchbase-server/init-cbserver.sh (new executable file, 44 lines)

@@ -0,0 +1,44 @@
#!/bin/bash
# used to start couchbase server - can't get around this as docker compose only allows you to start one command - so we have to start couchbase like the standard couchbase Dockerfile would
# https://github.com/couchbase/docker/blob/master/enterprise/couchbase-server/7.2.0/Dockerfile#L88

/entrypoint.sh couchbase-server &

# track if setup is complete so we don't try to setup again
FILE=/opt/couchbase/init/setupComplete.txt

if ! [ -f "$FILE" ]; then
  # used to automatically create the cluster based on environment variables
  # https://docs.couchbase.com/server/current/cli/cbcli/couchbase-cli-cluster-init.html

  echo $COUCHBASE_ADMINISTRATOR_USERNAME ":" $COUCHBASE_ADMINISTRATOR_PASSWORD

  sleep 20s
  /opt/couchbase/bin/couchbase-cli cluster-init -c 127.0.0.1 \
    --cluster-username $COUCHBASE_ADMINISTRATOR_USERNAME \
    --cluster-password $COUCHBASE_ADMINISTRATOR_PASSWORD \
    --services data,index,query,fts \
    --cluster-ramsize $COUCHBASE_RAM_SIZE \
    --cluster-index-ramsize $COUCHBASE_INDEX_RAM_SIZE \
    --cluster-eventing-ramsize $COUCHBASE_EVENTING_RAM_SIZE \
    --cluster-fts-ramsize $COUCHBASE_FTS_RAM_SIZE \
    --index-storage-setting default

  sleep 2s

  # used to auto create the bucket based on environment variables
  # https://docs.couchbase.com/server/current/cli/cbcli/couchbase-cli-bucket-create.html

  /opt/couchbase/bin/couchbase-cli bucket-create -c localhost:8091 \
    --username $COUCHBASE_ADMINISTRATOR_USERNAME \
    --password $COUCHBASE_ADMINISTRATOR_PASSWORD \
    --bucket $COUCHBASE_BUCKET \
    --bucket-ramsize $COUCHBASE_BUCKET_RAMSIZE \
    --bucket-type couchbase

  # create file so we know that the cluster is setup and don't run the setup again
  touch $FILE
fi
# docker compose will stop the container from running unless we do this
# known issue and workaround
tail -f /dev/null
@@ -110,6 +110,11 @@ x-shared-env: &shared-api-worker-env
  QDRANT_CLIENT_TIMEOUT: ${QDRANT_CLIENT_TIMEOUT:-20}
  QDRANT_GRPC_ENABLED: ${QDRANT_GRPC_ENABLED:-false}
  QDRANT_GRPC_PORT: ${QDRANT_GRPC_PORT:-6334}
+ COUCHBASE_CONNECTION_STRING: ${COUCHBASE_CONNECTION_STRING:-'couchbase-server'}
+ COUCHBASE_USER: ${COUCHBASE_USER:-Administrator}
+ COUCHBASE_PASSWORD: ${COUCHBASE_PASSWORD:-password}
+ COUCHBASE_BUCKET_NAME: ${COUCHBASE_BUCKET_NAME:-Embeddings}
+ COUCHBASE_SCOPE_NAME: ${COUCHBASE_SCOPE_NAME:-_default}
  MILVUS_URI: ${MILVUS_URI:-http://127.0.0.1:19530}
  MILVUS_TOKEN: ${MILVUS_TOKEN:-}
  MILVUS_USER: ${MILVUS_USER:-root}
@@ -475,6 +480,39 @@ services:
     environment:
       QDRANT_API_KEY: ${QDRANT_API_KEY:-difyai123456}

+  # The Couchbase vector store.
+  couchbase-server:
+    build: ./couchbase-server
+    profiles:
+      - couchbase
+    restart: always
+    environment:
+      - CLUSTER_NAME=dify_search
+      - COUCHBASE_ADMINISTRATOR_USERNAME=${COUCHBASE_USER:-Administrator}
+      - COUCHBASE_ADMINISTRATOR_PASSWORD=${COUCHBASE_PASSWORD:-password}
+      - COUCHBASE_BUCKET=${COUCHBASE_BUCKET_NAME:-Embeddings}
+      - COUCHBASE_BUCKET_RAMSIZE=512
+      - COUCHBASE_RAM_SIZE=2048
+      - COUCHBASE_EVENTING_RAM_SIZE=512
+      - COUCHBASE_INDEX_RAM_SIZE=512
+      - COUCHBASE_FTS_RAM_SIZE=1024
+    hostname: couchbase-server
+    container_name: couchbase-server
+    working_dir: /opt/couchbase
+    stdin_open: true
+    tty: true
+    entrypoint: [""]
+    command: sh -c "/opt/couchbase/init/init-cbserver.sh"
+    volumes:
+      - ./volumes/couchbase/data:/opt/couchbase/var/lib/couchbase/data
+    healthcheck:
+      # ensure bucket was created before proceeding
+      test: [ "CMD-SHELL", "curl -s -f -u Administrator:password http://localhost:8091/pools/default/buckets | grep -q '\\[{' || exit 1" ]
+      interval: 10s
+      retries: 10
+      start_period: 30s
+      timeout: 10s
+
   # The pgvector vector database.
   pgvector:
     image: pgvector/pgvector:pg16
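For illustration (not part of the diff): the compose healthcheck above simply asks the Couchbase REST API whether at least one bucket exists, i.e. whether init-cbserver.sh has finished. An equivalent readiness probe in Python, assuming the default Administrator/password credentials from the compose file:

import requests
from requests.auth import HTTPBasicAuth

# Same readiness condition as the compose healthcheck: the REST endpoint lists
# at least one bucket once the init script has created it.
resp = requests.get(
    "http://localhost:8091/pools/default/buckets",
    auth=HTTPBasicAuth("Administrator", "password"),
    timeout=10,
)
ready = resp.ok and len(resp.json()) > 0
print("couchbase ready:", ready)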