From 7ae728a9a3bd44baa365c89a0d86f5dee743a286 Mon Sep 17 00:00:00 2001
From: Jyong <76649700+JohnJyong@users.noreply.github.com>
Date: Mon, 26 Aug 2024 15:14:05 +0800
Subject: [PATCH] fix nltk averaged_perceptron_tagger download and fix score
 limit being None (#7582)

Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
---
 api/.idea/vcs.xml                           | 1 +
 api/Dockerfile                              | 2 +-
 api/core/rag/retrieval/dataset_retrieval.py | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/api/.idea/vcs.xml b/api/.idea/vcs.xml
index eaa7c25c60..b7af618884 100644
--- a/api/.idea/vcs.xml
+++ b/api/.idea/vcs.xml
@@ -12,5 +12,6 @@
   </component>
   <component name="VcsDirectoryMappings">
     <mapping directory="" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/.." vcs="Git" />
   </component>
 </project>
diff --git a/api/Dockerfile b/api/Dockerfile
index 10a3dc2eed..cca6488679 100644
--- a/api/Dockerfile
+++ b/api/Dockerfile
@@ -65,7 +65,7 @@ COPY --from=packages ${VIRTUAL_ENV} ${VIRTUAL_ENV}
 ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
 
 # Download nltk data
-RUN python -c "import nltk; nltk.download('punkt')"
+RUN python -c "import nltk; nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')"
 
 # Copy source code
 COPY . /app/api/
diff --git a/api/core/rag/retrieval/dataset_retrieval.py b/api/core/rag/retrieval/dataset_retrieval.py
index fc6d231f8e..c970e3dafa 100644
--- a/api/core/rag/retrieval/dataset_retrieval.py
+++ b/api/core/rag/retrieval/dataset_retrieval.py
@@ -616,6 +616,7 @@ class DatasetRetrieval:
         for document in all_documents:
             if score_threshold is None or document.metadata['score'] >= score_threshold:
                 filter_documents.append(document)
+
         if not filter_documents:
             return []
         filter_documents = sorted(filter_documents, key=lambda x: x.metadata['score'], reverse=True)