diff --git a/backend/app/api/docs/collections/info.md b/backend/app/api/docs/collections/info.md index 65c48c7a1..a751954e8 100644 --- a/backend/app/api/docs/collections/info.md +++ b/backend/app/api/docs/collections/info.md @@ -1,5 +1,13 @@ -Retrieve detailed information about a specific collection by its collection id. This endpoint returns the collection object including its project, organization, -timestamps, and associated LLM service details (`llm_service_id` and `llm_service_name`). +Retrieve detailed information about a specific collection by its collection id. This endpoint returns the collection object including its project, organization, timestamps, and service-specific details. + +**Response Fields:** + +**Note:** While the API schema shows both `llm_service_id`/`llm_service_name` AND `knowledge_base_id`/`knowledge_base_provider`, the actual response will only include the fields relevant to what was created: + +- **If an Assistant was created** (with model + instructions): The response will only include `llm_service_id` and `llm_service_name` +- **If only a Vector Store was created** (without model/instructions): The response will only include `knowledge_base_id` and `knowledge_base_provider` + +**Including Documents:** If the `include_docs` flag in the request body is true then you will get a list of document IDs associated with a given collection as well. Note that the documents returned are stored not only by Kaapi but also by the vector store provider. diff --git a/backend/app/api/docs/collections/job_info.md b/backend/app/api/docs/collections/job_info.md index 8ddbf0694..8ca288b7e 100644 --- a/backend/app/api/docs/collections/job_info.md +++ b/backend/app/api/docs/collections/job_info.md @@ -2,8 +2,13 @@ Retrieve information about a collection job by the collection job ID. This endpo * Fetching the collection job object, including the collection job ID, the current status, and the associated collection details.
-* If the job has finished, has been successful and it was a job of creation of collection then this endpoint will fetch the associated collection details from the collection table, including: - - `llm_service_id` and `llm_service_name`. - - Collection metadata such as ID, project, organization, and timestamps. +* If the job has finished successfully and it was a collection-creation job, then this endpoint will fetch the associated collection details. -* If the delete-collection job succeeds, the status is set to “successful” and the `collection` key contains the ID of the collection that has been deleted. +* If the delete-collection job succeeds, the status is set to "successful" and the `collection` key contains the ID of the collection that has been deleted. + +**Response Fields for Successful Creation Jobs:** + +**Note:** While the API schema shows both `llm_service_id`/`llm_service_name` AND `knowledge_base_id`/`knowledge_base_provider`, the actual collection object in the response will only include the fields relevant to what was created: + +- **If an Assistant was created** (with model + instructions): The response will only include `llm_service_id` and `llm_service_name` +- **If only a Vector Store was created** (without model/instructions): The response will only include `knowledge_base_id` and `knowledge_base_provider` diff --git a/backend/app/api/docs/collections/list.md b/backend/app/api/docs/collections/list.md index bb28e0b6a..ae5ad46e3 100644 --- a/backend/app/api/docs/collections/list.md +++ b/backend/app/api/docs/collections/list.md @@ -1,5 +1,8 @@ -List all _active_ collections that have been created and are not deleted +List all _active_ collections that have been created and are not deleted. -If a vector store was created - `llm_service_name` and `llm_service_id` in the response denotes the name of the vector store (eg. 'openai vector store') and its id respectively.
+**Response Fields:** -[Deprecated] If an assistant was created, `llm_service_name` and `llm_service_id` in the response denotes the name of the model used in the assistant (eg. 'gpt-4o') and assistant id. +**Note:** While the API schema shows both `llm_service_id`/`llm_service_name` AND `knowledge_base_id`/`knowledge_base_provider`, each collection in the response will only include the fields relevant to what was created: + +- **If an Assistant was created** (with model + instructions): The response will only include `llm_service_id` and `llm_service_name` (e.g., `llm_service_name: "gpt-4o"` and the assistant ID) +- **If only a Vector Store was created** (without model/instructions): The response will only include `knowledge_base_id` and `knowledge_base_provider` (e.g., `knowledge_base_provider: "openai vector store"` and the vector store ID) diff --git a/backend/app/api/routes/collection_job.py b/backend/app/api/routes/collection_job.py index 31686c83e..c586a3cc9 100644 --- a/backend/app/api/routes/collection_job.py +++ b/backend/app/api/routes/collection_job.py @@ -19,7 +19,7 @@ ) from app.models.collection import CollectionPublic from app.utils import APIResponse, load_description -from app.services.collections.helpers import extract_error_message +from app.services.collections.helpers import extract_error_message, to_collection_public logger = logging.getLogger(__name__) @@ -49,7 +49,7 @@ def collection_job_info( ): collection_crud = CollectionCrud(session, current_user.project_.id) collection = collection_crud.read_one(collection_job.collection_id) - job_out.collection = CollectionPublic.model_validate(collection) + job_out.collection = to_collection_public(collection) elif collection_job.action_type == CollectionActionType.DELETE: job_out.collection = CollectionIDPublic(id=collection_job.collection_id) diff --git a/backend/app/api/routes/collections.py b/backend/app/api/routes/collections.py index bf063a084..558e4d867 100644 --- 
a/backend/app/api/routes/collections.py +++ b/backend/app/api/routes/collections.py @@ -28,7 +28,7 @@ CollectionPublic, ) from app.utils import APIResponse, load_description, validate_callback_url -from app.services.collections.helpers import ensure_unique_name +from app.services.collections.helpers import ensure_unique_name, to_collection_public from app.services.collections import ( create_collection as create_service, delete_collection as delete_service, @@ -71,7 +71,10 @@ def list_collections( collection_crud = CollectionCrud(session, current_user.project_.id) rows = collection_crud.read_all() - return APIResponse.success_response(rows) + # Convert each collection to CollectionPublic with correct field mapping + public_collections = [to_collection_public(collection) for collection in rows] + + return APIResponse.success_response(public_collections) @router.post( @@ -190,7 +193,7 @@ def collection_info( description="If true, include documents linked to this collection", ), include_url: bool = Query( - True, description="Include a signed URL to access the document" + False, description="Include a signed URL to access the document" ), limit: int | None = Query( @@ -203,7 +206,9 @@ def collection_info( collection_crud = CollectionCrud(session, current_user.project_.id) collection = collection_crud.read_one(collection_id) - collection_with_docs = CollectionWithDocsPublic.model_validate(collection) + # Convert to CollectionPublic with correct field mapping, then to WithDocs + collection_public = to_collection_public(collection) + collection_with_docs = CollectionWithDocsPublic.model_validate(collection_public) if include_docs: document_collection_crud = DocumentCollectionCrud(session) diff --git a/backend/app/api/routes/documents.py b/backend/app/api/routes/documents.py index 58beb31b8..69c5b4895 100644 --- a/backend/app/api/routes/documents.py +++ b/backend/app/api/routes/documents.py @@ -111,7 +111,7 @@ async def upload_doc( target_format: str | None = Form( None, 
- description="Desired output format for the uploaded document (e.g., pdf, docx, txt).", + description="Desired output format for the uploaded document", ), transformer: str | None = Form( diff --git a/backend/app/models/collection.py b/backend/app/models/collection.py index 74c3f9ab7..47ea3041e 100644 --- a/backend/app/models/collection.py +++ b/backend/app/models/collection.py @@ -3,7 +3,7 @@ from typing import Any, Literal from uuid import UUID, uuid4 -from pydantic import HttpUrl, model_validator +from pydantic import HttpUrl, model_validator, model_serializer from sqlalchemy import UniqueConstraint, Index, text from sqlmodel import Field, Relationship, SQLModel @@ -211,14 +211,78 @@ class CollectionIDPublic(SQLModel): class CollectionPublic(SQLModel): id: UUID - llm_service_id: str - llm_service_name: str + llm_service_id: str | None = Field( + default=None, + description="LLM service ID (e.g., Assistant ID) when model and instructions were provided", + ) + llm_service_name: str | None = Field( + default=None, + description="LLM service name (e.g., model name) when model and instructions were provided", + ) + knowledge_base_id: str | None = Field( + default=None, + description="Knowledge base ID (e.g., Vector Store ID) when only vector store was created", + ) + knowledge_base_provider: str | None = Field( + default=None, + description="Knowledge base provider name when only vector store was created", + ) project_id: int inserted_at: datetime updated_at: datetime deleted_at: datetime | None = None + @model_validator(mode="after") + def validate_service_fields(self) -> "CollectionPublic": + """Ensure either LLM service fields or knowledge base fields are set, not both.""" + has_llm = self.llm_service_id is not None or self.llm_service_name is not None + has_kb = ( + self.knowledge_base_id is not None + or self.knowledge_base_provider is not None + ) + + if has_llm and has_kb: + raise ValueError( + "Cannot have both LLM service fields and knowledge base fields 
set" + ) + + if not has_llm and not has_kb: + raise ValueError( + "Either LLM service fields or knowledge base fields must be set" + ) + + # Ensure both fields in the pair are set or both are None + if has_llm and ( + (self.llm_service_id is None) != (self.llm_service_name is None) + ): + raise ValueError("Both llm_service_id and llm_service_name must be set") + + if has_kb and ( + (self.knowledge_base_id is None) != (self.knowledge_base_provider is None) + ): + raise ValueError( + "Both knowledge_base_id and knowledge_base_provider must be set" + ) + + return self + + @model_serializer(mode="wrap", when_used="json") + def _serialize_model(self, serializer: Any, info: Any) -> dict[str, Any]: + """Exclude unused service fields from JSON serialization.""" + data = serializer(self) + + # If this is a knowledge base, remove llm_service fields + if data.get("knowledge_base_id") is not None: + data.pop("llm_service_id", None) + data.pop("llm_service_name", None) + # If this is an assistant, remove knowledge_base fields + elif data.get("llm_service_id") is not None: + data.pop("knowledge_base_id", None) + data.pop("knowledge_base_provider", None) + + return data + class CollectionWithDocsPublic(CollectionPublic): documents: list[DocumentPublic] | None = None diff --git a/backend/app/services/collections/create_collection.py b/backend/app/services/collections/create_collection.py index 5e7389db1..dd4016616 100644 --- a/backend/app/services/collections/create_collection.py +++ b/backend/app/services/collections/create_collection.py @@ -22,7 +22,10 @@ CollectionJobPublic, CreationRequest, ) -from app.services.collections.helpers import extract_error_message +from app.services.collections.helpers import ( + extract_error_message, + to_collection_public, +) from app.services.collections.providers.registry import get_llm_provider from app.celery.utils import start_low_priority_job from app.utils import send_callback, APIResponse @@ -75,10 +78,12 @@ def build_success_payload( 
"metadata": null } """ - collection_public = CollectionPublic.model_validate(collection) + collection_public = to_collection_public(collection) + collection_dict = collection_public.model_dump(mode="json", exclude_none=True) + job_public = CollectionJobPublic.model_validate( collection_job, - update={"collection": collection_public}, + update={"collection": collection_dict}, ) return APIResponse.success_response(job_public).model_dump( mode="json", exclude={"data": {"error_message"}} diff --git a/backend/app/services/collections/helpers.py b/backend/app/services/collections/helpers.py index 7965e2e27..6275ee40d 100644 --- a/backend/app/services/collections/helpers.py +++ b/backend/app/services/collections/helpers.py @@ -7,11 +7,10 @@ from fastapi import HTTPException from sqlmodel import select -from openai import OpenAIError from app.crud import DocumentCrud, CollectionCrud from app.api.deps import SessionDep -from app.models import DocumentCollection, Collection +from app.models import DocumentCollection, Collection, CollectionPublic logger = logging.getLogger(__name__) @@ -119,3 +118,38 @@ def ensure_unique_name( ) return requested_name + + +def to_collection_public(collection: Collection) -> CollectionPublic: + """ + Convert a Collection DB model to CollectionPublic response model. 
+ + Maps fields based on service type: + - If llm_service_name is a vector store (matches get_service_name pattern), + use knowledge_base_id/knowledge_base_provider + - Otherwise (assistant), use llm_service_id/llm_service_name + """ + is_vector_store = collection.llm_service_name == get_service_name( + collection.provider + ) + + if is_vector_store: + return CollectionPublic( + id=collection.id, + knowledge_base_id=collection.llm_service_id, + knowledge_base_provider=collection.llm_service_name, + project_id=collection.project_id, + inserted_at=collection.inserted_at, + updated_at=collection.updated_at, + deleted_at=collection.deleted_at, + ) + else: + return CollectionPublic( + id=collection.id, + llm_service_id=collection.llm_service_id, + llm_service_name=collection.llm_service_name, + project_id=collection.project_id, + inserted_at=collection.inserted_at, + updated_at=collection.updated_at, + deleted_at=collection.deleted_at, + ) diff --git a/backend/app/tests/api/routes/collections/test_collection_info.py b/backend/app/tests/api/routes/collections/test_collection_info.py index 88cc7ed32..f41623a54 100644 --- a/backend/app/tests/api/routes/collections/test_collection_info.py +++ b/backend/app/tests/api/routes/collections/test_collection_info.py @@ -167,8 +167,12 @@ def test_collection_info_vector_store_collection( payload = data["data"] assert payload["id"] == str(collection.id) - assert payload["llm_service_name"] == get_service_name("openai") - assert payload["llm_service_id"] == collection.llm_service_id + # Vector store collection should have knowledge_base fields, not llm_service fields + assert payload["knowledge_base_provider"] == get_service_name("openai") + assert payload["knowledge_base_id"] == collection.llm_service_id + # LLM service fields should not be present in the response + assert "llm_service_name" not in payload + assert "llm_service_id" not in payload docs = payload.get("documents", []) assert len(docs) >= 1 diff --git 
a/backend/app/tests/api/routes/collections/test_collection_list.py b/backend/app/tests/api/routes/collections/test_collection_list.py index e9b5626d3..368603253 100644 --- a/backend/app/tests/api/routes/collections/test_collection_list.py +++ b/backend/app/tests/api/routes/collections/test_collection_list.py @@ -102,8 +102,12 @@ def test_list_collections_includes_vector_store_collection_with_fields( row = matching[0] assert row["project_id"] == project.id - assert row["llm_service_name"] == get_service_name("openai") - assert row["llm_service_id"] == collection.llm_service_id + # Vector store collection should have knowledge_base fields, not llm_service fields + assert row["knowledge_base_provider"] == get_service_name("openai") + assert row["knowledge_base_id"] == collection.llm_service_id + # LLM service fields should not be present in the response + assert "llm_service_name" not in row + assert "llm_service_id" not in row def test_list_collections_does_not_error_with_no_collections(