Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions backend/app/api/docs/collections/info.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
Retrieve detailed information about a specific collection by its collection id. This endpoint returns the collection object including its project, organization,
timestamps, and associated LLM service details (`llm_service_id` and `llm_service_name`).
Retrieve detailed information about a specific collection by its collection id. This endpoint returns the collection object including its project, organization, timestamps, and service-specific details.

**Response Fields:**

**Note:** While the API schema shows both `llm_service_id`/`llm_service_name` AND `knowledge_base_id`/`knowledge_base_provider`, the actual response will only include the fields relevant to what was created:

- **If an Assistant was created** (with model + instructions): The response will only include `llm_service_id` and `llm_service_name`
- **If only a Vector Store was created** (without model/instructions): The response will only include `knowledge_base_id` and `knowledge_base_provider`

**Including Documents:**

If the `include_docs` flag in the request body is true, you will also get a list of document IDs associated with the given collection. Note that the returned documents are stored not only by Kaapi but also by the vector store provider.

Expand Down
13 changes: 9 additions & 4 deletions backend/app/api/docs/collections/job_info.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,13 @@ Retrieve information about a collection job by the collection job ID. This endpo

* Fetching the collection job object, including the collection job ID, the current status, and the associated collection details.

* If the job has finished successfully and it was a collection-creation job, this endpoint also fetches the associated collection details from the collection table, including:
- `llm_service_id` and `llm_service_name`.
- Collection metadata such as ID, project, organization, and timestamps.
* If the job has finished successfully and it was a collection-creation job, this endpoint also fetches the associated collection details.

* If the delete-collection job succeeds, the status is set to “successful” and the `collection` key contains the ID of the collection that has been deleted.
* If the delete-collection job succeeds, the status is set to "successful" and the `collection` key contains the ID of the collection that has been deleted.

**Response Fields for Successful Creation Jobs:**

**Note:** While the API schema shows both `llm_service_id`/`llm_service_name` AND `knowledge_base_id`/`knowledge_base_provider`, the actual collection object in the response will only include the fields relevant to what was created:

- **If an Assistant was created** (with model + instructions): The response will only include `llm_service_id` and `llm_service_name`
- **If only a Vector Store was created** (without model/instructions): The response will only include `knowledge_base_id` and `knowledge_base_provider`
9 changes: 6 additions & 3 deletions backend/app/api/docs/collections/list.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
List all _active_ collections that have been created and are not deleted
List all _active_ collections that have been created and are not deleted.

If a vector store was created, `llm_service_name` and `llm_service_id` in the response denote the name of the vector store (e.g. 'openai vector store') and its ID, respectively.
**Response Fields:**

[Deprecated] If an assistant was created, `llm_service_name` and `llm_service_id` in the response denote the name of the model used in the assistant (e.g. 'gpt-4o') and the assistant ID, respectively.
**Note:** While the API schema shows both `llm_service_id`/`llm_service_name` AND `knowledge_base_id`/`knowledge_base_provider`, each collection in the response will only include the fields relevant to what was created:

- **If an Assistant was created** (with model + instructions): The response will only include `llm_service_id` and `llm_service_name` (e.g., `llm_service_name: "gpt-4o"` and the assistant ID)
- **If only a Vector Store was created** (without model/instructions): The response will only include `knowledge_base_id` and `knowledge_base_provider` (e.g., `knowledge_base_provider: "openai vector store"` and the vector store ID)
4 changes: 2 additions & 2 deletions backend/app/api/routes/collection_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
)
from app.models.collection import CollectionPublic
from app.utils import APIResponse, load_description
from app.services.collections.helpers import extract_error_message
from app.services.collections.helpers import extract_error_message, to_collection_public


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -49,7 +49,7 @@ def collection_job_info(
):
collection_crud = CollectionCrud(session, current_user.project_.id)
collection = collection_crud.read_one(collection_job.collection_id)
job_out.collection = CollectionPublic.model_validate(collection)
job_out.collection = to_collection_public(collection)

elif collection_job.action_type == CollectionActionType.DELETE:
job_out.collection = CollectionIDPublic(id=collection_job.collection_id)
Expand Down
13 changes: 9 additions & 4 deletions backend/app/api/routes/collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
CollectionPublic,
)
from app.utils import APIResponse, load_description, validate_callback_url
from app.services.collections.helpers import ensure_unique_name
from app.services.collections.helpers import ensure_unique_name, to_collection_public
from app.services.collections import (
create_collection as create_service,
delete_collection as delete_service,
Expand Down Expand Up @@ -71,7 +71,10 @@ def list_collections(
collection_crud = CollectionCrud(session, current_user.project_.id)
rows = collection_crud.read_all()

return APIResponse.success_response(rows)
# Convert each collection to CollectionPublic with correct field mapping
public_collections = [to_collection_public(collection) for collection in rows]

return APIResponse.success_response(public_collections)


@router.post(
Expand Down Expand Up @@ -190,7 +193,7 @@ def collection_info(
description="If true, include documents linked to this collection",
),
include_url: bool = Query(
True, description="Include a signed URL to access the document"
False, description="Include a signed URL to access the document"
),
limit: int
| None = Query(
Expand All @@ -203,7 +206,9 @@ def collection_info(
collection_crud = CollectionCrud(session, current_user.project_.id)
collection = collection_crud.read_one(collection_id)

collection_with_docs = CollectionWithDocsPublic.model_validate(collection)
# Convert to CollectionPublic with correct field mapping, then to WithDocs
collection_public = to_collection_public(collection)
collection_with_docs = CollectionWithDocsPublic.model_validate(collection_public)

if include_docs:
document_collection_crud = DocumentCollectionCrud(session)
Expand Down
2 changes: 1 addition & 1 deletion backend/app/api/routes/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ async def upload_doc(
target_format: str
| None = Form(
None,
description="Desired output format for the uploaded document (e.g., pdf, docx, txt).",
description="Desired output format for the uploaded document",
),
transformer: str
| None = Form(
Expand Down
70 changes: 67 additions & 3 deletions backend/app/models/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from typing import Any, Literal
from uuid import UUID, uuid4

from pydantic import HttpUrl, model_validator
from pydantic import HttpUrl, model_validator, model_serializer
from sqlalchemy import UniqueConstraint, Index, text
from sqlmodel import Field, Relationship, SQLModel

Expand Down Expand Up @@ -211,14 +211,78 @@ class CollectionIDPublic(SQLModel):

class CollectionPublic(SQLModel):
id: UUID
llm_service_id: str
llm_service_name: str
llm_service_id: str | None = Field(
default=None,
description="LLM service ID (e.g., Assistant ID) when model and instructions were provided",
)
llm_service_name: str | None = Field(
default=None,
description="LLM service name (e.g., model name) when model and instructions were provided",
)
knowledge_base_id: str | None = Field(
default=None,
description="Knowledge base ID (e.g., Vector Store ID) when only vector store was created",
)
knowledge_base_provider: str | None = Field(
default=None,
description="Knowledge base provider name when only vector store was created",
)
project_id: int

inserted_at: datetime
updated_at: datetime
deleted_at: datetime | None = None

@model_validator(mode="after")
def validate_service_fields(self) -> "CollectionPublic":
    """Check that exactly one complete service pair is populated.

    The model carries two mutually exclusive pairs of optional fields:
    ``llm_service_id``/``llm_service_name`` and
    ``knowledge_base_id``/``knowledge_base_provider``.

    Returns:
        The validated instance, unchanged.

    Raises:
        ValueError: If fields from both pairs are set, if no field from
            either pair is set, or if only one half of a pair is set.
    """
    llm_flags = [
        value is not None
        for value in (self.llm_service_id, self.llm_service_name)
    ]
    kb_flags = [
        value is not None
        for value in (self.knowledge_base_id, self.knowledge_base_provider)
    ]
    llm_touched = any(llm_flags)
    kb_touched = any(kb_flags)

    # The two pairs are mutually exclusive.
    if llm_touched and kb_touched:
        raise ValueError(
            "Cannot have both LLM service fields and knowledge base fields set"
        )

    # At least one pair has to be present.
    if not (llm_touched or kb_touched):
        raise ValueError(
            "Either LLM service fields or knowledge base fields must be set"
        )

    # A pair that is present must be complete (no half-filled pair).
    if llm_touched and not all(llm_flags):
        raise ValueError("Both llm_service_id and llm_service_name must be set")

    if kb_touched and not all(kb_flags):
        raise ValueError(
            "Both knowledge_base_id and knowledge_base_provider must be set"
        )

    return self

@model_serializer(mode="wrap", when_used="json")
def _serialize_model(self, serializer: Any, info: Any) -> dict[str, Any]:
    """Drop the service-field pair that does not apply to this collection.

    Registered as a wrap serializer with ``when_used="json"``. The wrapped
    ``serializer`` produces the default dict, which is then pruned.

    Args:
        serializer: Callable that yields the default serialized dict.
        info: Serialization info passed by the framework (unused here).

    Returns:
        The serialized dict without the ``llm_service_*`` keys when
        ``knowledge_base_id`` is present, or without the ``knowledge_base_*``
        keys when ``llm_service_id`` is present. If neither is present the
        dict is returned untouched.
    """
    payload = serializer(self)

    # knowledge_base_id takes precedence when deciding which pair to strip.
    if payload.get("knowledge_base_id") is not None:
        unused_keys = ("llm_service_id", "llm_service_name")
    elif payload.get("llm_service_id") is not None:
        unused_keys = ("knowledge_base_id", "knowledge_base_provider")
    else:
        unused_keys = ()

    for key in unused_keys:
        payload.pop(key, None)

    return payload


class CollectionWithDocsPublic(CollectionPublic):
documents: list[DocumentPublic] | None = None
11 changes: 8 additions & 3 deletions backend/app/services/collections/create_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@
CollectionJobPublic,
CreationRequest,
)
from app.services.collections.helpers import extract_error_message
from app.services.collections.helpers import (
extract_error_message,
to_collection_public,
)
from app.services.collections.providers.registry import get_llm_provider
from app.celery.utils import start_low_priority_job
from app.utils import send_callback, APIResponse
Expand Down Expand Up @@ -75,10 +78,12 @@ def build_success_payload(
"metadata": null
}
"""
collection_public = CollectionPublic.model_validate(collection)
collection_public = to_collection_public(collection)
collection_dict = collection_public.model_dump(mode="json", exclude_none=True)

job_public = CollectionJobPublic.model_validate(
collection_job,
update={"collection": collection_public},
update={"collection": collection_dict},
)
return APIResponse.success_response(job_public).model_dump(
mode="json", exclude={"data": {"error_message"}}
Expand Down
38 changes: 36 additions & 2 deletions backend/app/services/collections/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,10 @@

from fastapi import HTTPException
from sqlmodel import select
from openai import OpenAIError

from app.crud import DocumentCrud, CollectionCrud
from app.api.deps import SessionDep
from app.models import DocumentCollection, Collection
from app.models import DocumentCollection, Collection, CollectionPublic


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -119,3 +118,38 @@ def ensure_unique_name(
)

return requested_name


def to_collection_public(collection: Collection) -> CollectionPublic:
    """Build the public response model for a stored collection.

    The stored ``llm_service_id``/``llm_service_name`` columns are exposed
    under different names depending on what the collection represents:

    - When ``llm_service_name`` equals ``get_service_name(collection.provider)``
      the collection is treated as a vector store and the values are exposed
      as ``knowledge_base_id``/``knowledge_base_provider``.
    - Otherwise (assistant) they are exposed unchanged as
      ``llm_service_id``/``llm_service_name``.

    Args:
        collection: The collection database row to convert.

    Returns:
        A ``CollectionPublic`` with exactly one of the two field pairs set.
    """
    treat_as_vector_store = (
        get_service_name(collection.provider) == collection.llm_service_name
    )

    # Fields copied verbatim regardless of the service type.
    common = {
        "id": collection.id,
        "project_id": collection.project_id,
        "inserted_at": collection.inserted_at,
        "updated_at": collection.updated_at,
        "deleted_at": collection.deleted_at,
    }

    if treat_as_vector_store:
        return CollectionPublic(
            knowledge_base_id=collection.llm_service_id,
            knowledge_base_provider=collection.llm_service_name,
            **common,
        )

    return CollectionPublic(
        llm_service_id=collection.llm_service_id,
        llm_service_name=collection.llm_service_name,
        **common,
    )
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,12 @@ def test_collection_info_vector_store_collection(
payload = data["data"]

assert payload["id"] == str(collection.id)
assert payload["llm_service_name"] == get_service_name("openai")
assert payload["llm_service_id"] == collection.llm_service_id
# Vector store collection should have knowledge_base fields, not llm_service fields
assert payload["knowledge_base_provider"] == get_service_name("openai")
assert payload["knowledge_base_id"] == collection.llm_service_id
# LLM service fields should not be present in the response
assert "llm_service_name" not in payload
assert "llm_service_id" not in payload

docs = payload.get("documents", [])
assert len(docs) >= 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,12 @@ def test_list_collections_includes_vector_store_collection_with_fields(

row = matching[0]
assert row["project_id"] == project.id
assert row["llm_service_name"] == get_service_name("openai")
assert row["llm_service_id"] == collection.llm_service_id
# Vector store collection should have knowledge_base fields, not llm_service fields
assert row["knowledge_base_provider"] == get_service_name("openai")
assert row["knowledge_base_id"] == collection.llm_service_id
# LLM service fields should not be present in the response
assert "llm_service_name" not in row
assert "llm_service_id" not in row


def test_list_collections_does_not_error_with_no_collections(
Expand Down