Database Connectors
Connect Vecta to your vector database with flexible schema configuration
Database Connectors
Vecta connects to your vector database using configurable connectors and schemas. This flexible system adapts to different data structures while maintaining consistent evaluation capabilities.
How Connectors Work
Every connector implements three core operations:
- get_all_chunks() - Retrieve all data for benchmark generation
- semantic_search() - Find similar chunks during evaluation
- get_chunk_by_id() - Fetch specific chunks for validation
Schema Configuration
Schemas define how to extract standardized data from your database's specific format:
from vecta.core.schemas import VectorDBSchema
schema = VectorDBSchema(
    id_accessor="id",  # maps to ChunkData.id
    content_accessor="document",  # maps to ChunkData.content
    source_path_accessor="metadata.source_path",  # maps to ChunkData.source_path
    page_nums_accessor="metadata.page_nums",  # maps to ChunkData.page_nums
    source_path_default="unknown",  # fallback source path
)
Accessor Syntax
Pattern | Description | Example Use |
---|---|---|
"field" | Direct field access | "content" for top-level field |
".property" | Object property | ".id" for object attributes |
"field.nested" | Nested navigation | "metadata.source_path" |
"[0]" | Array indexing | "[0]" for first element |
"json(field).sub" | Parse JSON field | "json(metadata.provenance).filename" |
"json(json(a).b).c" | Nested JSON parsing | Complex nested structures |
Example: Complex Schema
# Schema for records whose metadata fields hold JSON-encoded strings
complex_schema = VectorDBSchema(
    id_accessor="chunk_id",
    content_accessor="text_content",
    # json(...) parses the field first; ".filename" then reads from the parsed value
    source_path_accessor="json(metadata.document_info).filename",
    # json(...) calls can be nested when a parsed value itself contains JSON
    page_nums_accessor="json(json(metadata.provenance).pages).numbers",
    source_path_default="untitled_document",
)
Pre-built Connectors
ChromaDB
from vecta.connectors.chroma_local_connector import ChromaLocalConnector
from vecta.core.schema_helpers import SchemaTemplates
# Connector for a local ChromaDB client
connector = ChromaLocalConnector(
    client=chroma_client,
    collection_name="documents",
    schema=SchemaTemplates.chroma_default(),
)
# ChromaDB Cloud
from vecta.connectors.chroma_cloud_connector import ChromaCloudConnector
connector = ChromaCloudConnector(
    tenant="your-tenant",
    database="your-db",
    api_key="your-key",
    collection_name="documents",
    schema=SchemaTemplates.chroma_default(),
)
Pinecone
from vecta.connectors.pinecone_connector import PineconeConnector
from vecta.core.schema_helpers import SchemaTemplates
connector = PineconeConnector(
    api_key="your-key",
    index_name="your-index",
    namespace="optional-namespace",
    schema=SchemaTemplates.pinecone_default(),
)
PostgreSQL + pgvector
from vecta.connectors.pgvector_connector import PgVectorConnector
from vecta.core.schema_helpers import SchemaTemplates
# Standard layout: metadata stored in a single JSON column
connector = PgVectorConnector(
    dsn="postgresql://user:pass@host:5432/db",
    table="chunks",
    schema=SchemaTemplates.pgvector_standard(),
)

# Flat layout: each field stored in its own column
flat_schema = SchemaTemplates.pgvector_flat(
    id_col="id",
    content_col="content",
    source_path_col="document_name",
    page_nums_col="pages",
)
LangChain Integration
from vecta.connectors.langchain_connector import LangChainVectorStoreConnector
from vecta.core.schema_helpers import SchemaTemplates
# Wraps any LangChain VectorStore instance
connector = LangChainVectorStoreConnector(
    vectorstore=your_langchain_store,
    # Pick the template that matches the store backing your VectorStore
    schema=SchemaTemplates.chroma_default(),
)
Schema Templates
Use pre-built schemas for common configurations:
from vecta.core.schema_helpers import SchemaTemplates
# Ready-made templates, one per supported store
chroma_schema = SchemaTemplates.chroma_default()
pinecone_schema = SchemaTemplates.pinecone_default()
pgvector_schema = SchemaTemplates.pgvector_standard()
databricks_schema = SchemaTemplates.databricks_indexed()
weaviate_schema = SchemaTemplates.weaviate_default()

# Flat pgvector template with your own column names
custom_pgvector = SchemaTemplates.pgvector_flat(
    id_col="chunk_uuid",
    content_col="text_data",
    source_path_col="file_name",
    page_nums_col="page_numbers",
)
Creating Custom Connectors
Extend BaseVectorDBConnector for unsupported databases:
from typing import List

from vecta.connectors.base import BaseVectorDBConnector
from vecta.core.schemas import ChunkData, VectorDBSchema


class MyCustomConnector(BaseVectorDBConnector):
    """Example connector for a vector database without a pre-built connector.

    ``client`` stands in for your database's own client object. The
    ``query`` and ``vector_search`` calls below are placeholders for
    whatever API that client actually exposes.
    """

    def __init__(self, client, schema: VectorDBSchema):
        """Store the database client and hand the schema to the base class."""
        super().__init__(schema)
        self.client = client

    def get_all_chunks(self) -> List[ChunkData]:
        """Return every record in the store, converted to ``ChunkData``."""
        # Fetch all records from your database
        raw_results = self.client.query("SELECT * FROM vectors")
        return [self._create_chunk_data_from_raw(row) for row in raw_results]

    def semantic_search(self, query_str: str, k: int = 10) -> List[ChunkData]:
        """Return up to ``k`` chunks similar to ``query_str``."""
        # Perform similarity search
        raw_results = self.client.vector_search(query_str, limit=k)
        return [self._create_chunk_data_from_raw(row) for row in raw_results]

    def get_chunk_by_id(self, chunk_id: str) -> ChunkData:
        """Return the chunk whose id equals ``chunk_id``."""
        # The id is passed as a bound parameter, not formatted into the SQL text
        raw_result = self.client.query("SELECT * FROM vectors WHERE id = ?", [chunk_id])
        # NOTE(review): indexing [0] raises IndexError if no row matches --
        # confirm what the base class expects for a missing id
        return self._create_chunk_data_from_raw(raw_result[0])
The _create_chunk_data_from_raw() method automatically applies your schema to extract the required fields.
Troubleshooting
Schema Debugging
Test your schema configuration:
# Run one hand-written record through the schema
sample_result = {
    "id": "test",
    "content": "Hello",
    "metadata": {"source_path": "doc.pdf"},
}
extracted = schema.extract_chunk_data(sample_result)
print(extracted)  # Check that each field was extracted as expected
Common Issues
Missing metadata: Ensure your vector database includes source_path and page_nums in chunk metadata.
Schema mismatch: Use schema templates as starting points and adjust accessors to match your data structure.
JSON parsing: For complex nested metadata, use json() syntax to parse string fields before accessing subfields.
Next Steps
Need help? Contact our support team or book a demo.