diff --git a/deps/Makefile b/deps/Makefile index 3139ab77f4..560db98f1d 100644 --- a/deps/Makefile +++ b/deps/Makefile @@ -4,6 +4,21 @@ PROXYSQL_PATH := $(shell while [ ! -f ./src/proxysql_global.cpp ]; do cd ..; don include $(PROXYSQL_PATH)/include/makefiles_vars.mk +# Rust toolchain detection (needed to build sqlite-rembed) +RUSTC := $(shell command -v rustc 2>/dev/null) +CARGO := $(shell command -v cargo 2>/dev/null) +# Enforce the toolchain only when something other than a clean* goal is requested +ifneq ($(if $(MAKECMDGOALS),$(filter-out clean%,$(MAKECMDGOALS)),all),) +ifeq ($(and $(RUSTC),$(CARGO)),) +$(error rustc and/or cargo not found. Please install the Rust toolchain) +endif +endif + +# SQLite environment variables for sqlite-rembed build +export SQLITE3_INCLUDE_DIR := $(CURDIR)/sqlite3/sqlite3 +export SQLITE3_LIB_DIR := $(CURDIR)/sqlite3/sqlite3 +export SQLITE3_STATIC := 1 + # to compile libmariadb_client with support for valgrind enabled, run: # export USEVALGRIND=1 @@ -250,7 +265,14 @@ sqlite3/sqlite3/vec.o: sqlite3/sqlite3/sqlite3.o cd sqlite3/sqlite3 && cp ../sqlite-vec-source/sqlite-vec.c . && cp ../sqlite-vec-source/sqlite-vec.h . 
cd sqlite3/sqlite3 && ${CC} ${MYCFLAGS} -fPIC -c -o vec.o sqlite-vec.c -DSQLITE_CORE -DSQLITE_VEC_STATIC -DSQLITE_ENABLE_MEMORY_MANAGEMENT -DSQLITE_ENABLE_JSON1 -DSQLITE_DLL=1 -sqlite3: sqlite3/sqlite3/sqlite3.o sqlite3/sqlite3/vec.o +sqlite3/libsqlite_rembed.a: sqlite3/sqlite-rembed-0.0.1-alpha.9.tar.gz + cd sqlite3 && rm -rf sqlite-rembed-*/ sqlite-rembed-source/ || true + cd sqlite3 && tar -zxf sqlite-rembed-0.0.1-alpha.9.tar.gz + mv sqlite3/sqlite-rembed-0.0.1-alpha.9 sqlite3/sqlite-rembed-source + cd sqlite3/sqlite-rembed-source && SQLITE3_INCLUDE_DIR=$(SQLITE3_INCLUDE_DIR) SQLITE3_LIB_DIR=$(SQLITE3_LIB_DIR) SQLITE3_STATIC=1 $(CARGO) build --release --features=sqlite-loadable/static --lib + cp sqlite3/sqlite-rembed-source/target/release/libsqlite_rembed.a sqlite3/libsqlite_rembed.a + +sqlite3: sqlite3/sqlite3/sqlite3.o sqlite3/sqlite3/vec.o sqlite3/libsqlite_rembed.a libconfig/libconfig/out/libconfig++.a: @@ -342,6 +364,7 @@ cleanpart: cd mariadb-client-library && rm -rf mariadb-connector-c-*/ || true cd jemalloc && rm -rf jemalloc-*/ || true cd sqlite3 && rm -rf sqlite-amalgamation-*/ || true + cd sqlite3 && rm -rf libsqlite_rembed.a sqlite-rembed-source/ sqlite-rembed-*/ || true cd postgresql && rm -rf postgresql-*/ || true cd postgresql && rm -rf postgres-*/ || true .PHONY: cleanpart diff --git a/deps/sqlite3/sqlite-rembed-0.0.1-alpha.9.tar.gz b/deps/sqlite3/sqlite-rembed-0.0.1-alpha.9.tar.gz new file mode 100644 index 0000000000..b3d9ebfe83 Binary files /dev/null and b/deps/sqlite3/sqlite-rembed-0.0.1-alpha.9.tar.gz differ diff --git a/doc/SQLITE-REMBED-TEST-README.md b/doc/SQLITE-REMBED-TEST-README.md new file mode 100644 index 0000000000..6f93df8ef9 --- /dev/null +++ b/doc/SQLITE-REMBED-TEST-README.md @@ -0,0 +1,245 @@ +# sqlite-rembed Integration Test Suite + +## Overview + +This test suite comprehensively validates the integration of `sqlite-rembed` (Rust SQLite extension for text embedding generation) into ProxySQL. 
The tests verify the complete AI pipeline from client registration to embedding generation and vector similarity search. + +## Prerequisites + +### System Requirements +- **ProxySQL** compiled with `sqlite-rembed` and `sqlite-vec` extensions +- **MySQL client** (`mysql` command line tool) +- **Bash** shell environment +- **Network access** to embedding API endpoint (or local Ollama/OpenAI API) + +### ProxySQL Configuration +Ensure ProxySQL is running with SQLite3 server enabled: +```bash +cd /path/to/proxysql/src +./proxysql --sqlite3-server +``` + +### Test Configuration +The test script uses default connection parameters: +- Host: `127.0.0.1` +- Port: `6030` (default SQLite3 server port) +- User: `root` +- Password: `root` + +Modify these in the script if your configuration differs. + +## Test Suite Structure + +The test suite is organized into 9 phases, each testing specific components: + +### Phase 1: Basic Connectivity and Function Verification +- ✅ ProxySQL connection +- ✅ Database listing +- ✅ `sqlite-vec` function availability +- ✅ `sqlite-rembed` function registration +- ✅ `temp.rembed_clients` virtual table existence + +### Phase 2: Client Configuration +- ✅ Create embedding API client with `rembed_client_options()` +- ✅ Verify client registration in `temp.rembed_clients` +- ✅ Test `rembed_client_options` function + +### Phase 3: Embedding Generation Tests +- ✅ Generate embeddings for short and long text +- ✅ Verify embedding data type (BLOB) and size (768 dimensions × 4 bytes) +- ✅ Error handling for non-existent clients + +### Phase 4: Table Creation and Data Storage +- ✅ Create regular table for document storage +- ✅ Create virtual vector table using `vec0` +- ✅ Insert test documents with diverse content + +### Phase 5: Embedding Generation and Storage +- ✅ Generate embeddings for all documents +- ✅ Store embeddings in vector table +- ✅ Verify embedding count matches document count +- ✅ Check embedding storage format + +### Phase 6: Similarity 
Search Tests +- ✅ Exact self-match (document with itself, distance = 0.0) +- ✅ Similarity search with query text +- ✅ Verify result ordering by ascending distance + +### Phase 7: Edge Cases and Error Handling +- ✅ Empty text input +- ✅ Very long text input +- ✅ SQL injection attempt safety + +### Phase 8: Performance and Concurrency +- ✅ Sequential embedding generation timing +- ✅ Basic performance validation (< 10 seconds for 3 embeddings) + +### Phase 9: Cleanup and Final Verification +- ✅ Clean up test tables +- ✅ Verify no test artifacts remain + +## Usage + +### Running the Full Test Suite +```bash +cd /path/to/proxysql/doc +./sqlite-rembed-test.sh +``` + +### Expected Output +The script provides color-coded output: +- 🟢 **Green**: Test passed +- 🔴 **Red**: Test failed +- 🔵 **Blue**: Information and headers +- 🟡 **Yellow**: Test being executed + +### Exit Codes +- `0`: All tests passed +- `1`: One or more tests failed +- `2`: Connection issues or missing dependencies + +## Configuration + +### Modifying Connection Parameters +Edit the following variables in `sqlite-rembed-test.sh`: +```bash +PROXYSQL_HOST="127.0.0.1" +PROXYSQL_PORT="6030" +MYSQL_USER="root" +MYSQL_PASS="root" +``` + +### API Configuration +The test uses a synthetic OpenAI endpoint by default. Set the `API_KEY` environment variable or modify the variable below to use your own API: +```bash +API_CLIENT_NAME="test-client-$(date +%s)" +API_FORMAT="openai" +API_URL="https://api.synthetic.new/openai/v1/embeddings" +API_KEY="${API_KEY:-YOUR_API_KEY}" # Uses environment variable or placeholder +API_MODEL="hf:nomic-ai/nomic-embed-text-v1.5" +VECTOR_DIMENSIONS=768 +``` + +For other providers (Ollama, Cohere, Nomic), adjust the format and URL accordingly. + +## Test Data + +### Sample Documents +The test creates 4 sample documents: +1. **Machine Learning** - "Machine learning algorithms improve with more training data..." +2. 
**Database Systems** - "Database management systems efficiently store, retrieve..." +3. **Artificial Intelligence** - "AI enables computers to perform tasks typically..." +4. **Vector Databases** - "Vector databases enable similarity search for embeddings..." + +### Query Texts +Test searches use: +- Self-match: Document 1 with itself +- Query: "data science and algorithms" + +## Troubleshooting + +### Common Issues + +#### 1. Connection Failed +``` +Error: Cannot connect to ProxySQL at 127.0.0.1:6030 +``` +**Solution**: Ensure ProxySQL is running with `--sqlite3-server` flag. + +#### 2. Missing Functions +``` +ERROR 1045 (28000): no such function: rembed +``` +**Solution**: Verify `sqlite-rembed` was compiled and linked into ProxySQL binary. + +#### 3. API Errors +``` +Error from embedding API +``` +**Solution**: Check network connectivity and API credentials. + +#### 4. Vector Table Errors +``` +ERROR 1045 (28000): A LIMIT or 'k = ?' constraint is required on vec0 knn queries. +``` +**Solution**: All `sqlite-vec` similarity queries require `LIMIT` clause. + +### Debug Mode +For detailed debugging, run with trace: +```bash +bash -x ./sqlite-rembed-test.sh +``` + +## Integration with CI/CD + +The test script can be integrated into CI/CD pipelines: + +```yaml +# Example GitHub Actions workflow +name: sqlite-rembed Tests +on: [push, pull_request] +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Build ProxySQL with sqlite-rembed + run: | + cd deps && make cleanpart && make sqlite3 + cd ../lib && make + cd ../src && make + - name: Start ProxySQL + run: | + cd src && ./proxysql --sqlite3-server & + sleep 5 + - name: Run Integration Tests + run: | + cd doc && ./sqlite-rembed-test.sh +``` + +## Extending the Test Suite + +### Adding New Tests +1. Add new test function following existing pattern +2. Update phase header and test count +3. 
Add to appropriate phase section + +### Testing Different Providers +Modify the API configuration block to test: +- **Ollama**: Use `format='ollama'` and local URL +- **Cohere**: Use `format='cohere'` and appropriate model +- **Nomic**: Use `format='nomic'` and Nomic API endpoint + +### Performance Testing +Extend Phase 8 for: +- Concurrent embedding generation +- Batch processing tests +- Memory usage monitoring + +## Results Interpretation + +### Success Criteria +- All connectivity tests pass +- Embeddings generated with correct dimensions +- Vector search returns ordered results +- No test artifacts remain after cleanup + +### Performance Benchmarks +- Embedding generation: < 3 seconds per request (network-dependent) +- Similarity search: < 100ms for small datasets +- Memory: Stable during sequential operations + +## References + +- [sqlite-rembed GitHub](https://github.com/asg017/sqlite-rembed) +- [sqlite-vec Documentation](./SQLite3-Server.md) +- [ProxySQL SQLite3 Server](./SQLite3-Server.md) +- [Integration Documentation](./sqlite-rembed-integration.md) + +## License + +This test suite is part of the ProxySQL project and follows the same licensing terms. 
+ +--- +*Last Updated: see the git history of this file* +*Test Suite Version: 1.0* \ No newline at end of file diff --git a/doc/SQLite3-Server.md b/doc/SQLite3-Server.md index f9e187c8b3..d346179fba 100644 --- a/doc/SQLite3-Server.md +++ b/doc/SQLite3-Server.md @@ -69,6 +69,39 @@ SELECT rowid, distance FROM vec_data WHERE vector MATCH json('[0.1, 0.2, 0.3,...,0.128]'); ``` +### Embedding Generation (with sqlite-rembed) + +```sql +-- Register an embedding API client +INSERT INTO temp.rembed_clients(name, options) +VALUES ('openai', rembed_client_options('format', 'openai', 'model', 'text-embedding-3-small', 'key', 'your-api-key')); + +-- Generate text embeddings +SELECT rembed('openai', 'Hello world') as embedding; + +-- Complete AI pipeline: generate embedding and search +CREATE VIRTUAL TABLE documents USING vec0(embedding float[1536]); + +INSERT INTO documents(rowid, embedding) +VALUES (1, rembed('openai', 'First document text')); + +INSERT INTO documents(rowid, embedding) +VALUES (2, rembed('openai', 'Second document text')); + +-- Search for similar documents (vec0 KNN queries require a LIMIT) +SELECT rowid, distance FROM documents +WHERE embedding MATCH rembed('openai', 'Search query') LIMIT 10; +``` + +#### Supported Embedding Providers +- **OpenAI**: `format='openai', model='text-embedding-3-small'` +- **Ollama** (local): `format='ollama', model='nomic-embed-text'` +- **Cohere**: `format='cohere', model='embed-english-v3.0'` +- **Nomic**: `format='nomic', model='nomic-embed-text-v1.5'` +- **Llamafile** (local): `format='llamafile'` + +See [sqlite-rembed integration documentation](./sqlite-rembed-integration.md) for full details. + ### Available Databases ```sql @@ -87,9 +120,11 @@ SHOW DATABASES; 1. **Data Analysis**: Store and analyze temporary data 2. **Vector Search**: Perform similarity searches with sqlite-vec -3. **Testing**: Test SQLite features with MySQL clients -4. **Prototyping**: Quick data storage and retrieval -5. **Custom Applications**: Build applications using SQLite with MySQL tools +3. 
**Embedding Generation**: Create text embeddings with sqlite-rembed (OpenAI, Ollama, Cohere, etc.) +4. **AI Pipelines**: Complete RAG workflows: embedding generation → vector storage → similarity search +5. **Testing**: Test SQLite features with MySQL clients +6. **Prototyping**: Quick data storage and retrieval +7. **Custom Applications**: Build applications using SQLite with MySQL tools ## Limitations diff --git a/doc/sqlite-rembed-demo.sh b/doc/sqlite-rembed-demo.sh new file mode 100755 index 0000000000..014ca1c756 --- /dev/null +++ b/doc/sqlite-rembed-demo.sh @@ -0,0 +1,368 @@ +#!/bin/bash + +############################################################################### +# sqlite-rembed Demonstration Script +# +# This script demonstrates the usage of sqlite-rembed integration in ProxySQL +# using a single MySQL session to maintain connection state. +# +# The script creates a SQL file with all demonstration queries and executes +# them in a single session, ensuring temp.rembed_clients virtual table +# maintains its state throughout the demonstration. 
+# +# Requirements: +# - ProxySQL running with --sqlite3-server flag on port 6030 +# - MySQL client installed +# - Network access to embedding API endpoint +# - Valid API credentials for embedding generation +# +# Usage: ./sqlite-rembed-demo.sh +# +# Author: Generated from integration testing session +# Date: $(date) +############################################################################### + +set -uo pipefail + +# Configuration - modify these values as needed +PROXYSQL_HOST="127.0.0.1" +PROXYSQL_PORT="6030" +MYSQL_USER="root" +MYSQL_PASS="root" + +# API Configuration - using synthetic OpenAI endpoint for demonstration +# IMPORTANT: Set API_KEY environment variable or replace YOUR_API_KEY below +API_CLIENT_NAME="demo-client-$(date +%s)" +API_FORMAT="openai" +API_URL="https://api.synthetic.new/openai/v1/embeddings" +API_KEY="${API_KEY:-YOUR_API_KEY}" # Uses environment variable or placeholder +API_MODEL="hf:nomic-ai/nomic-embed-text-v1.5" +VECTOR_DIMENSIONS=768 # Based on model output + +# Color codes for output readability +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +GREEN='\033[0;32m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +# Text formatting +BOLD='\033[1m' +UNDERLINE='\033[4m' + +############################################################################### +# Helper Functions +############################################################################### + +print_header() { + echo -e "\n${BLUE}${BOLD}${UNDERLINE}$1${NC}\n" +} + +print_step() { + echo -e "${YELLOW}➤ Step:$NC $1" +} + +print_query() { + echo -e "${YELLOW}SQL Query:$NC" + echo "$1" + echo "" +} + +print_success() { + echo -e "${GREEN}✓$NC $1" +} + +print_error() { + echo -e "${RED}✗$NC $1" +} + +# Create SQL file with demonstration queries +create_demo_sql() { + local sql_file="$1" + + cat > "$sql_file" << EOF +-------------------------------------------------------------------- +-- sqlite-rembed Demonstration Script +-- Generated: $(date) +-- ProxySQL: ${PROXYSQL_HOST}:${PROXYSQL_PORT} 
+-- API Endpoint: ${API_URL} +-------------------------------------------------------------------- +-- Cleanup: Remove any existing demonstration tables +DROP TABLE IF EXISTS demo_documents; +DROP TABLE IF EXISTS demo_embeddings; +DROP TABLE IF EXISTS demo_embeddings_info; +DROP TABLE IF EXISTS demo_embeddings_chunks; +DROP TABLE IF EXISTS demo_embeddings_rowids; +DROP TABLE IF EXISTS demo_embeddings_vector_chunks00; + +-------------------------------------------------------------------- +-- Phase 1: Basic Connectivity and Function Verification +-------------------------------------------------------------------- +-- This phase verifies basic connectivity and confirms that sqlite-rembed +-- and sqlite-vec functions are properly registered in ProxySQL. + +SELECT 'Phase 1: Basic Connectivity' as phase; + +-- Basic ProxySQL connectivity +SELECT 1 as connectivity_test; + +-- Available databases +SHOW DATABASES; + +-- Available sqlite-vec functions +SELECT name FROM pragma_function_list WHERE name LIKE 'vec%' LIMIT 5; + +-- Available sqlite-rembed functions +SELECT name FROM pragma_function_list WHERE name LIKE 'rembed%' ORDER BY name; + +-- Check temp.rembed_clients virtual table exists +SELECT name FROM sqlite_master WHERE name='rembed_clients' AND type='table'; + +-------------------------------------------------------------------- +-- Phase 2: Client Configuration +-------------------------------------------------------------------- +-- This phase demonstrates how to configure an embedding API client using +-- the temp.rembed_clients virtual table and rembed_client_options() function. 
+ +SELECT 'Phase 2: Client Configuration' as phase; + +-- Create embedding API client +INSERT INTO temp.rembed_clients(name, options) VALUES + ('$API_CLIENT_NAME', + rembed_client_options( + 'format', '$API_FORMAT', + 'url', '$API_URL', + 'key', '$API_KEY', + 'model', '$API_MODEL' + ) + ); + +-- Verify client registration +SELECT name FROM temp.rembed_clients; + +-- View client configuration details +SELECT name, + json_extract(options, '\$.format') as format, + json_extract(options, '\$.model') as model +FROM temp.rembed_clients; + +-------------------------------------------------------------------- +-- Phase 3: Embedding Generation +-------------------------------------------------------------------- +-- This phase demonstrates text embedding generation using the rembed() function. +-- Embeddings are generated via HTTP request to the configured API endpoint. + +SELECT 'Phase 3: Embedding Generation' as phase; + +-- Generate embedding for 'Hello world' and check size +SELECT length(rembed('$API_CLIENT_NAME', 'Hello world')) as embedding_size_bytes; + +-- Generate embedding for longer technical text +SELECT length(rembed('$API_CLIENT_NAME', 'Machine learning algorithms improve with more training data and computational power.')) as embedding_size_bytes; + +-- Generate embedding for empty text (edge case) +SELECT length(rembed('$API_CLIENT_NAME', '')) as empty_embedding_size; + +-------------------------------------------------------------------- +-- Phase 4: Table Creation and Data Storage +-------------------------------------------------------------------- +-- This phase demonstrates creating regular tables for document storage +-- and virtual vector tables for embedding storage using sqlite-vec. 
+ +SELECT 'Phase 4: Table Creation and Data Storage' as phase; + +-- Create regular table for document storage +CREATE TABLE IF NOT EXISTS demo_documents ( + id INTEGER PRIMARY KEY, + title TEXT NOT NULL, + content TEXT NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +-- Create virtual vector table for embeddings +CREATE VIRTUAL TABLE IF NOT EXISTS demo_embeddings USING vec0( + embedding float[$VECTOR_DIMENSIONS] +); + +-- Insert sample documents +INSERT OR IGNORE INTO demo_documents (id, title, content) VALUES + (1, 'Machine Learning', 'Machine learning algorithms improve with more training data and computational power.'), + (2, 'Database Systems', 'Database management systems efficiently store, retrieve, and manipulate structured data.'), + (3, 'Artificial Intelligence', 'AI enables computers to perform tasks typically requiring human intelligence.'), + (4, 'Vector Databases', 'Vector databases enable similarity search for embeddings generated by machine learning models.'); + +-- Verify document insertion +SELECT id, title, length(content) as content_length FROM demo_documents; + +-------------------------------------------------------------------- +-- Phase 5: Embedding Generation and Storage +-------------------------------------------------------------------- +-- This phase demonstrates generating embeddings for all documents and +-- storing them in the vector table for similarity search. 
+ +SELECT 'Phase 5: Embedding Generation and Storage' as phase; + +-- Generate and store embeddings for all documents +-- Using INSERT OR REPLACE to handle existing rows (cleanup should have removed them) +INSERT OR REPLACE INTO demo_embeddings(rowid, embedding) +SELECT id, rembed('$API_CLIENT_NAME', content) +FROM demo_documents; + +-- Verify embedding count +SELECT COUNT(*) as total_embeddings FROM demo_embeddings; + +-- Check embedding storage format +SELECT rowid, length(embedding) as embedding_size_bytes +FROM demo_embeddings LIMIT 2; + +-------------------------------------------------------------------- +-- Phase 6: Similarity Search +-------------------------------------------------------------------- +-- This phase demonstrates similarity search using the stored embeddings. +-- Queries show exact matches, similar documents, and distance metrics. + +SELECT 'Phase 6: Similarity Search' as phase; + +-- Exact self-match (should have distance 0.0) +SELECT d.title, d.content, e.distance +FROM ( + SELECT rowid, distance + FROM demo_embeddings + WHERE embedding MATCH rembed('$API_CLIENT_NAME', + 'Machine learning algorithms improve with more training data and computational power.') + LIMIT 3 +) e +JOIN demo_documents d ON e.rowid = d.id; + + +-- Similarity search with query text +SELECT d.title, d.content, e.distance +FROM ( + SELECT rowid, distance + FROM demo_embeddings + WHERE embedding MATCH rembed('$API_CLIENT_NAME', + 'data science and algorithms') + LIMIT 3 +) e +JOIN demo_documents d ON e.rowid = d.id; + +-- Ordered similarity search (closest matches first) +SELECT d.title, d.content, e.distance +FROM ( + SELECT rowid, distance + FROM demo_embeddings + WHERE embedding MATCH rembed('$API_CLIENT_NAME', + 'artificial intelligence and neural networks') + LIMIT 3 +) e +JOIN demo_documents d ON e.rowid = d.id; + +-------------------------------------------------------------------- +-- Phase 7: Edge Cases and Error Handling 
+-------------------------------------------------------------------- +-- This phase demonstrates error handling and edge cases. + +SELECT 'Phase 7: Edge Cases and Error Handling' as phase; + +-- Error: Non-existent client +SELECT rembed('non-existent-client', 'test text'); + +-- Very long text input +SELECT rembed('$API_CLIENT_NAME', + '$(printf '%0.sA' {1..5000})'); + +-------------------------------------------------------------------- +-- Phase 8: Cleanup and Summary +-------------------------------------------------------------------- +-- Cleaning up demonstration tables and providing summary. + +SELECT 'Phase 8: Cleanup' as phase; + +-- Clean up demonstration tables +DROP TABLE IF EXISTS demo_documents; +DROP TABLE IF EXISTS demo_embeddings; + +SELECT 'Demonstration Complete' as phase; +SELECT 'All sqlite-rembed integration examples have been executed successfully.' as summary; +SELECT 'The demonstration covered:' as coverage; +SELECT ' • Client configuration with temp.rembed_clients' as item; +SELECT ' • Embedding generation via HTTP API' as item; +SELECT ' • Vector table creation and data storage' as item; +SELECT ' • Similarity search with generated embeddings' as item; +SELECT ' • Error handling and edge cases' as item; + +EOF +} + +############################################################################### +# Main Demonstration Script +############################################################################### + +main() { + print_header "sqlite-rembed Demonstration Script" + echo -e "Starting at: $(date)" + echo -e "ProxySQL: ${PROXYSQL_HOST}:${PROXYSQL_PORT}" + echo -e "API Endpoint: ${API_URL}" + echo "" + + # Check if mysql client is available + if ! command -v mysql &> /dev/null; then + print_error "MySQL client not found. Please install mysql-client." + exit 1 + fi + + # Check connectivity to ProxySQL + if ! 
mysql -h "$PROXYSQL_HOST" -P "$PROXYSQL_PORT" -u "$MYSQL_USER" -p"$MYSQL_PASS" \ + -e "SELECT 1;" &>/dev/null; then + print_error "Cannot connect to ProxySQL at ${PROXYSQL_HOST}:${PROXYSQL_PORT}" + echo "Make sure ProxySQL is running with: ./proxysql --sqlite3-server" + exit 1 + fi + + # Create temporary SQL file + local sql_file + sql_file=$(mktemp /tmp/sqlite-rembed-demo.XXXXXX.sql) + + print_step "Creating demonstration SQL script..." + create_demo_sql "$sql_file" + print_success "SQL script created: $sql_file" + + print_step "Executing demonstration in single MySQL session..." + echo "" + echo -e "${BLUE}=== Demonstration Output ===${NC}" + + # Execute SQL file + mysql -h "$PROXYSQL_HOST" -P "$PROXYSQL_PORT" -u "$MYSQL_USER" -p"$MYSQL_PASS" \ + < "$sql_file" 2>&1 | \ + grep -v "Using a password on the command line interface" + + local exit_code=${PIPESTATUS[0]} + + echo "" + echo -e "${BLUE}=== End Demonstration Output ===${NC}" + + # Clean up temporary file + rm -f "$sql_file" + + if [ $exit_code -eq 0 ]; then + print_success "Demonstration completed successfully!" + echo "" + echo "The demonstration covered:" + echo " • Client configuration with temp.rembed_clients" + echo " • Embedding generation via HTTP API" + echo " • Vector table creation and data storage" + echo " • Similarity search with generated embeddings" + echo " • Error handling and edge cases" + echo "" + echo "These examples can be used as a baseline for building applications" + echo "that leverage sqlite-rembed and sqlite-vec in ProxySQL." + else + print_error "Demonstration encountered errors (exit code: $exit_code)" + echo "Check the output above for details." 
+ exit 1 + fi +} + +# Run main demonstration +main +exit 0 diff --git a/doc/sqlite-rembed-examples.sh b/doc/sqlite-rembed-examples.sh new file mode 100755 index 0000000000..500f9edfcd --- /dev/null +++ b/doc/sqlite-rembed-examples.sh @@ -0,0 +1,329 @@ +#!/bin/bash + +############################################################################### +# sqlite-rembed Examples and Demonstration Script +# +# This script demonstrates the usage of sqlite-rembed integration in ProxySQL, +# showing complete examples of embedding generation and vector search pipeline. +# +# The script is organized into logical phases, each demonstrating a specific +# aspect of the integration with detailed explanations. +# +# Requirements: +# - ProxySQL running with --sqlite3-server flag on port 6030 +# - MySQL client installed +# - Network access to embedding API endpoint +# - Valid API credentials for embedding generation +# +# Usage: ./sqlite-rembed-examples.sh +# +# Author: Generated from integration testing session +# Date: $(date) +############################################################################### + +set -uo pipefail + +# Configuration - modify these values as needed +PROXYSQL_HOST="127.0.0.1" +PROXYSQL_PORT="6030" +MYSQL_USER="root" +MYSQL_PASS="root" + +# API Configuration - using synthetic OpenAI endpoint for demonstration +# IMPORTANT: Set API_KEY environment variable or replace YOUR_API_KEY below +API_CLIENT_NAME="demo-client-$(date +%s)" +API_FORMAT="openai" +API_URL="https://api.synthetic.new/openai/v1/embeddings" +API_KEY="${API_KEY:-YOUR_API_KEY}" # Uses environment variable or placeholder +API_MODEL="hf:nomic-ai/nomic-embed-text-v1.5" +VECTOR_DIMENSIONS=768 # Based on model output + +# Color codes for output readability +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Text formatting +BOLD='\033[1m' +UNDERLINE='\033[4m' + +############################################################################### +# Helper Functions 
+############################################################################### + +print_header() { + echo -e "\n${BLUE}${BOLD}${UNDERLINE}$1${NC}\n" +} + +print_step() { + echo -e "${YELLOW}➤ Step:$NC $1" +} + +print_query() { + echo -e "${YELLOW}SQL Query:$NC" + echo "$1" + echo "" +} + +# Execute MySQL query and display results +execute_and_show() { + local sql_query="$1" + local description="${2:-}" + + if [ -n "$description" ]; then + print_step "$description" + fi + + print_query "$sql_query" + + echo -e "${BLUE}Result:$NC" + mysql -h "$PROXYSQL_HOST" -P "$PROXYSQL_PORT" -u "$MYSQL_USER" -p"$MYSQL_PASS" \ + -e "$sql_query" 2>&1 | grep -v "Using a password on the command line" + echo "--------------------------------------------------------------------" +} + +# Clean up any existing demonstration tables +cleanup_tables() { + echo "Cleaning up any existing demonstration tables..." + + local tables=( + "demo_documents" + "demo_embeddings" + ) + + for table in "${tables[@]}"; do + mysql -h "$PROXYSQL_HOST" -P "$PROXYSQL_PORT" -u "$MYSQL_USER" -p"$MYSQL_PASS" \ + -e "DROP TABLE IF EXISTS $table;" 2>/dev/null + done + + echo "Cleanup completed." 
+} + +############################################################################### +# Main Demonstration Script +############################################################################### + +main() { + print_header "sqlite-rembed Integration Examples" + echo -e "Starting at: $(date)" + echo -e "ProxySQL: ${PROXYSQL_HOST}:${PROXYSQL_PORT}" + echo -e "API Endpoint: ${API_URL}" + echo "" + + # Initial cleanup + cleanup_tables + + ########################################################################### + # Phase 1: Basic Connectivity and Function Verification + ########################################################################### + print_header "Phase 1: Basic Connectivity and Function Verification" + + echo "This phase verifies basic connectivity and confirms that sqlite-rembed" + echo "and sqlite-vec functions are properly registered in ProxySQL." + echo "" + + execute_and_show "SELECT 1 as connectivity_test;" "Basic ProxySQL connectivity" + + execute_and_show "SHOW DATABASES;" "Available databases" + + execute_and_show "SELECT name FROM pragma_function_list WHERE name LIKE 'vec%' LIMIT 5;" \ + "Available sqlite-vec functions" + + execute_and_show "SELECT name FROM pragma_function_list WHERE name LIKE 'rembed%' ORDER BY name;" \ + "Available sqlite-rembed functions" + + execute_and_show "SELECT name FROM sqlite_master WHERE name='rembed_clients' AND type='table';" \ + "Check temp.rembed_clients virtual table exists" + + ########################################################################### + # Phase 2: Client Configuration + ########################################################################### + print_header "Phase 2: Client Configuration" + + echo "This phase demonstrates how to configure an embedding API client using" + echo "the temp.rembed_clients virtual table and rembed_client_options() function." 
+ echo "" + + local create_client_sql="INSERT INTO temp.rembed_clients(name, options) VALUES + ('$API_CLIENT_NAME', + rembed_client_options( + 'format', '$API_FORMAT', + 'url', '$API_URL', + 'key', '$API_KEY', + 'model', '$API_MODEL' + ) + );" + + execute_and_show "$create_client_sql" "Create embedding API client" + + execute_and_show "SELECT name FROM temp.rembed_clients;" \ + "Verify client registration" + + execute_and_show "SELECT name, json_extract(options, '\$.format') as format, + json_extract(options, '\$.model') as model + FROM temp.rembed_clients;" \ + "View client configuration details" + + ########################################################################### + # Phase 3: Embedding Generation + ########################################################################### + print_header "Phase 3: Embedding Generation" + + echo "This phase demonstrates text embedding generation using the rembed() function." + echo "Embeddings are generated via HTTP request to the configured API endpoint." + echo "" + + execute_and_show "SELECT length(rembed('$API_CLIENT_NAME', 'Hello world')) as embedding_size_bytes;" \ + "Generate embedding for 'Hello world' and check size" + + execute_and_show "SELECT length(rembed('$API_CLIENT_NAME', 'Machine learning algorithms improve with more training data and computational power.')) as embedding_size_bytes;" \ + "Generate embedding for longer technical text" + + execute_and_show "SELECT length(rembed('$API_CLIENT_NAME', '')) as empty_embedding_size;" \ + "Generate embedding for empty text (edge case)" + + ########################################################################### + # Phase 4: Table Creation and Data Storage + ########################################################################### + print_header "Phase 4: Table Creation and Data Storage" + + echo "This phase demonstrates creating regular tables for document storage" + echo "and virtual vector tables for embedding storage using sqlite-vec." 
+ echo "" + + execute_and_show "CREATE TABLE demo_documents ( + id INTEGER PRIMARY KEY, + title TEXT NOT NULL, + content TEXT NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + );" "Create regular table for document storage" + + execute_and_show "CREATE VIRTUAL TABLE demo_embeddings USING vec0( + embedding float[$VECTOR_DIMENSIONS] + );" "Create virtual vector table for embeddings" + + execute_and_show "INSERT INTO demo_documents (id, title, content) VALUES + (1, 'Machine Learning', 'Machine learning algorithms improve with more training data and computational power.'), + (2, 'Database Systems', 'Database management systems efficiently store, retrieve, and manipulate structured data.'), + (3, 'Artificial Intelligence', 'AI enables computers to perform tasks typically requiring human intelligence.'), + (4, 'Vector Databases', 'Vector databases enable similarity search for embeddings generated by machine learning models.');" \ + "Insert sample documents" + + execute_and_show "SELECT id, title, length(content) as content_length FROM demo_documents;" \ + "Verify document insertion" + + ########################################################################### + # Phase 5: Embedding Generation and Storage + ########################################################################### + print_header "Phase 5: Embedding Generation and Storage" + + echo "This phase demonstrates generating embeddings for all documents and" + echo "storing them in the vector table for similarity search." 
+ echo "" + + execute_and_show "INSERT INTO demo_embeddings(rowid, embedding) + SELECT id, rembed('$API_CLIENT_NAME', content) + FROM demo_documents;" \ + "Generate and store embeddings for all documents" + + execute_and_show "SELECT COUNT(*) as total_embeddings FROM demo_embeddings;" \ + "Verify embedding count" + + execute_and_show "SELECT rowid, length(embedding) as embedding_size_bytes + FROM demo_embeddings LIMIT 2;" \ + "Check embedding storage format" + + ########################################################################### + # Phase 6: Similarity Search + ########################################################################### + print_header "Phase 6: Similarity Search" + # vec0 needs LIMIT (or 'k = ?') on the KNN scan itself; behind a JOIN the outer LIMIT is not recognized, so each search runs its KNN in a subquery and joins afterwards. + echo "This phase demonstrates similarity search using the stored embeddings." + echo "Queries show exact matches, similar documents, and distance metrics." + echo "" + + execute_and_show "SELECT d.title, d.content, e.distance + FROM (SELECT rowid, distance FROM demo_embeddings + WHERE embedding MATCH rembed('$API_CLIENT_NAME', + 'Machine learning algorithms improve with more training data and computational power.') + ORDER BY distance ASC LIMIT 3) e + JOIN demo_documents d ON e.rowid = d.id;" \ + "Exact self-match (should have distance 0.0)" + + execute_and_show "SELECT d.title, d.content, e.distance + FROM (SELECT rowid, distance FROM demo_embeddings + WHERE embedding MATCH rembed('$API_CLIENT_NAME', + 'data science and algorithms') + ORDER BY distance ASC LIMIT 3) e + JOIN demo_documents d ON e.rowid = d.id;" \ + "Similarity search with query text" + + execute_and_show "SELECT d.title, e.distance + FROM (SELECT rowid, distance FROM demo_embeddings + WHERE embedding MATCH rembed('$API_CLIENT_NAME', + 'artificial intelligence and neural networks') + ORDER BY distance ASC LIMIT 3) e + JOIN demo_documents d ON e.rowid = d.id + ORDER BY e.distance ASC;" \ + "Ordered similarity search (closest matches first)" + + ########################################################################### + # Phase 7: Edge Cases and Error Handling + ########################################################################### + print_header "Phase 
7: Edge Cases and Error Handling" + + echo "This phase demonstrates error handling and edge cases." + echo "" + + execute_and_show "SELECT rembed('non-existent-client', 'test text');" \ + "Error: Non-existent client" + + execute_and_show "SELECT rembed('$API_CLIENT_NAME', + '$(printf '%0.sA' {1..5000})');" \ + "Very long text input" + + ########################################################################### + # Phase 8: Cleanup and Summary + ########################################################################### + print_header "Phase 8: Cleanup and Summary" + + echo "Cleaning up demonstration tables and providing summary." + echo "" + + cleanup_tables + + echo "" + print_header "Demonstration Complete" + echo "All sqlite-rembed integration examples have been executed successfully." + echo "The demonstration covered:" + echo " • Client configuration with temp.rembed_clients" + echo " • Embedding generation via HTTP API" + echo " • Vector table creation and data storage" + echo " • Similarity search with generated embeddings" + echo " • Error handling and edge cases" + echo "" + echo "These examples can be used as a baseline for building applications" + echo "that leverage sqlite-rembed and sqlite-vec in ProxySQL." +} + +############################################################################### +# Script Entry Point +############################################################################### + +# Check if mysql client is available +if ! command -v mysql &> /dev/null; then + echo -e "${RED}Error: MySQL client not found. Please install mysql-client.${NC}" + exit 1 +fi + +# Check connectivity to ProxySQL +if ! 
mysql -h "$PROXYSQL_HOST" -P "$PROXYSQL_PORT" -u "$MYSQL_USER" -p"$MYSQL_PASS" \ + -e "SELECT 1;" &>/dev/null; then + echo -e "${RED}Error: Cannot connect to ProxySQL at ${PROXYSQL_HOST}:${PROXYSQL_PORT}${NC}" + echo "Make sure ProxySQL is running with: ./proxysql --sqlite3-server" + exit 1 +fi + +# Run main demonstration +main +exit 0 \ No newline at end of file diff --git a/doc/sqlite-rembed-examples.sql b/doc/sqlite-rembed-examples.sql new file mode 100644 index 0000000000..39973657e9 --- /dev/null +++ b/doc/sqlite-rembed-examples.sql @@ -0,0 +1,218 @@ +-- sqlite-rembed Examples and Demonstration +-- This SQL file demonstrates the usage of sqlite-rembed integration in ProxySQL +-- Connect to ProxySQL SQLite3 server on port 6030 and run these examples: +-- mysql -h 127.0.0.1 -P 6030 -u root -proot < sqlite-rembed-examples.sql +-- +-- IMPORTANT: Replace YOUR_API_KEY with your actual API key in Phase 2 +-- +-- Generated: 2025-12-23 + +-------------------------------------------------------------------- +-- Cleanup: Remove any existing demonstration tables +-------------------------------------------------------------------- +DROP TABLE IF EXISTS demo_documents; +DROP TABLE IF EXISTS demo_embeddings; + +-------------------------------------------------------------------- +-- Phase 1: Basic Connectivity and Function Verification +-------------------------------------------------------------------- +-- Verify basic connectivity and confirm sqlite-rembed functions are registered + +SELECT 'Phase 1: Basic Connectivity' as phase; + +-- Basic ProxySQL connectivity test +SELECT 1 as connectivity_test; + +-- Available databases +SHOW DATABASES; + +-- Available sqlite-vec functions +SELECT name FROM pragma_function_list WHERE name LIKE 'vec%' LIMIT 5; + +-- Available sqlite-rembed functions +SELECT name FROM pragma_function_list WHERE name LIKE 'rembed%' ORDER BY name; + +-- Check temp.rembed_clients virtual table exists +SELECT name FROM sqlite_master WHERE 
name='rembed_clients' AND type='table'; + +-------------------------------------------------------------------- +-- Phase 2: Client Configuration +-------------------------------------------------------------------- +-- Configure an embedding API client using temp.rembed_clients table +-- Note: temp.rembed_clients is per-connection, so client must be registered +-- in the same session where embeddings are generated + +SELECT 'Phase 2: Client Configuration' as phase; + +-- Create embedding API client using synthetic OpenAI endpoint +-- Replace with your own API credentials for production use +-- IMPORTANT: Replace YOUR_API_KEY with your actual API key +INSERT INTO temp.rembed_clients(name, options) VALUES + ('demo-client', + rembed_client_options( + 'format', 'openai', + 'url', 'https://api.synthetic.new/openai/v1/embeddings', + 'key', 'YOUR_API_KEY', -- Replace with your actual API key + 'model', 'hf:nomic-ai/nomic-embed-text-v1.5' + ) + ); + +-- Verify client registration +SELECT name FROM temp.rembed_clients; + +-- View client configuration details +SELECT name, + json_extract(options, '$.format') as format, + json_extract(options, '$.model') as model +FROM temp.rembed_clients; + +-------------------------------------------------------------------- +-- Phase 3: Embedding Generation +-------------------------------------------------------------------- +-- Generate text embeddings using the rembed() function +-- Embeddings are generated via HTTP request to the configured API endpoint + +SELECT 'Phase 3: Embedding Generation' as phase; + +-- Generate embedding for 'Hello world' and check size (768 dimensions × 4 bytes = 3072 bytes) +SELECT length(rembed('demo-client', 'Hello world')) as embedding_size_bytes; + +-- Generate embedding for longer technical text +SELECT length(rembed('demo-client', 'Machine learning algorithms improve with more training data and computational power.')) as embedding_size_bytes; + +-- Generate embedding for empty text (edge case) +SELECT 
length(rembed('demo-client', '')) as empty_embedding_size; + +-------------------------------------------------------------------- +-- Phase 4: Table Creation and Data Storage +-------------------------------------------------------------------- +-- Create regular tables for document storage and virtual vector tables +-- for embedding storage using sqlite-vec + +SELECT 'Phase 4: Table Creation and Data Storage' as phase; + +-- Create regular table for document storage +CREATE TABLE demo_documents ( + id INTEGER PRIMARY KEY, + title TEXT NOT NULL, + content TEXT NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +-- Create virtual vector table for embeddings with 768 dimensions +CREATE VIRTUAL TABLE demo_embeddings USING vec0( + embedding float[768] +); + +-- Insert sample documents with diverse content +INSERT INTO demo_documents (id, title, content) VALUES + (1, 'Machine Learning', 'Machine learning algorithms improve with more training data and computational power.'), + (2, 'Database Systems', 'Database management systems efficiently store, retrieve, and manipulate structured data.'), + (3, 'Artificial Intelligence', 'AI enables computers to perform tasks typically requiring human intelligence.'), + (4, 'Vector Databases', 'Vector databases enable similarity search for embeddings generated by machine learning models.'); + +-- Verify document insertion +SELECT id, title, length(content) as content_length FROM demo_documents; + +-------------------------------------------------------------------- +-- Phase 5: Embedding Generation and Storage +-------------------------------------------------------------------- +-- Generate embeddings for all documents and store them in the vector table +-- for similarity search + +SELECT 'Phase 5: Embedding Generation and Storage' as phase; + +-- Generate and store embeddings for all documents +INSERT INTO demo_embeddings(rowid, embedding) +SELECT id, rembed('demo-client', content) +FROM demo_documents; + +-- Verify 
embedding count (should be 4) +SELECT COUNT(*) as total_embeddings FROM demo_embeddings; + +-- Check embedding storage format (should be 3072 bytes each) +SELECT rowid, length(embedding) as embedding_size_bytes +FROM demo_embeddings LIMIT 2; + +-------------------------------------------------------------------- +-- Phase 6: Similarity Search +-------------------------------------------------------------------- +-- Perform similarity search using the stored embeddings +-- sqlite-vec requires either LIMIT or 'k = ?' constraint on KNN queries +-- Note: When using JOIN, the LIMIT must be in a subquery for vec0 to recognize it + +SELECT 'Phase 6: Similarity Search' as phase; + +-- Direct vector table query: Search for similar embeddings +-- Returns rowid and distance for the 3 closest matches +SELECT rowid, distance +FROM demo_embeddings +WHERE embedding MATCH rembed('demo-client', + 'data science and algorithms') +ORDER BY distance ASC +LIMIT 3; + +-- Similarity search with JOIN using subquery +-- First find similar embeddings in subquery with LIMIT, then JOIN with documents +SELECT d.title, d.content, e.distance +FROM ( + SELECT rowid, distance + FROM demo_embeddings + WHERE embedding MATCH rembed('demo-client', + 'artificial intelligence and neural networks') + ORDER BY distance ASC + LIMIT 3 +) e +JOIN demo_documents d ON e.rowid = d.id; + +-- Exact self-match: Search for a document using its own exact text +-- Should return distance close to 0.0 for the exact match (may not be exactly 0 due to floating point) +SELECT d.title, e.distance +FROM ( + SELECT rowid, distance + FROM demo_embeddings + WHERE embedding MATCH rembed('demo-client', + 'Machine learning algorithms improve with more training data and computational power.') + ORDER BY distance ASC + LIMIT 3 +) e +JOIN demo_documents d ON e.rowid = d.id; + +-------------------------------------------------------------------- +-- Phase 7: Edge Cases and Error Handling 
+-------------------------------------------------------------------- +-- Demonstrate error handling and edge cases + +SELECT 'Phase 7: Edge Cases and Error Handling' as phase; + +-- Error: Non-existent client +SELECT rembed('non-existent-client', 'test text'); + +-- Very long text input +SELECT rembed('demo-client', + 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'); + +-------------------------------------------------------------------- +-- Phase 8: Cleanup +-------------------------------------------------------------------- +-- Clean up demonstration tables + +SELECT 'Phase 8: Cleanup' as phase; + +DROP TABLE IF EXISTS demo_documents; +DROP TABLE IF EXISTS demo_embeddings; + +-------------------------------------------------------------------- +-- Summary +-------------------------------------------------------------------- +SELECT 'Demonstration Complete' as phase; +SELECT 'All sqlite-rembed integration examples have been executed successfully.' as summary; +SELECT 'The demonstration covered:' as coverage; +SELECT ' • Client configuration with temp.rembed_clients' as item; +SELECT ' • Embedding generation via HTTP API' as item; +SELECT ' • Vector table creation and data storage' as item; +SELECT ' • Similarity search with generated embeddings' as item; +SELECT ' • Error handling and edge cases' as item; +SELECT ' ' as blank; +SELECT 'These examples can be used as a baseline for building applications' as usage; +SELECT 'that leverage sqlite-rembed and sqlite-vec in ProxySQL.' 
as usage_cont; \ No newline at end of file diff --git a/doc/sqlite-rembed-integration.md b/doc/sqlite-rembed-integration.md new file mode 100644 index 0000000000..6164f932b3 --- /dev/null +++ b/doc/sqlite-rembed-integration.md @@ -0,0 +1,248 @@ +# sqlite-rembed Integration into ProxySQL + +## Overview + +This document describes the integration of the `sqlite-rembed` Rust SQLite extension into ProxySQL, enabling text embedding generation from remote AI APIs (OpenAI, Nomic, Ollama, Cohere, etc.) directly within ProxySQL's SQLite3 Server. + +## What is sqlite-rembed? + +`sqlite-rembed` is a Rust-based SQLite extension that provides: +- `rembed()` function for generating text embeddings via HTTP requests +- `temp.rembed_clients` virtual table for managing embedding API clients +- Support for multiple embedding providers: OpenAI, Nomic, Cohere, Ollama, Llamafile +- Automatic handling of API authentication, request formatting, and response parsing + +## Integration Architecture + +The integration follows the same pattern as `sqlite-vec` (vector search extension): + +### Static Linking Approach +1. **Source packaging**: `sqlite-rembed-0.0.1-alpha.9.tar.gz` included in git repository +2. **Rust static library**: `libsqlite_rembed.a` built from extracted source +3. **Build system integration**: Makefile targets for tar.gz extraction and Rust compilation +4. **Auto-registration**: `sqlite3_auto_extension()` in ProxySQL initialization +5. **Single binary deployment**: No external dependencies at runtime + +### Technical Implementation + +``` +ProxySQL Binary +├── C++ Core (libproxysql.a) +├── SQLite3 (sqlite3.o) +├── sqlite-vec (vec.o) +└── sqlite-rembed (libsqlite_rembed.a) ← Rust static library +``` + +## Build Requirements + +### Rust Toolchain +```bash +# Required for building sqlite-rembed +rustc --version +cargo --version + +# Development dependencies +clang +libclang-dev +``` + +### Build Process +1. Rust toolchain detection in `deps/Makefile` +2. 
Extract `sqlite-rembed-0.0.1-alpha.9.tar.gz` from GitHub release +3. Static library build with `cargo build --release --features=sqlite-loadable/static --lib` +4. Linking into `libproxysql.a` via `lib/Makefile` +5. Final binary linking via `src/Makefile` + +### Packaging +Following ProxySQL's dependency packaging pattern, sqlite-rembed is distributed as a compressed tar.gz file: +- `deps/sqlite3/sqlite-rembed-0.0.1-alpha.9.tar.gz` - Official GitHub release tarball +- Extracted during build via `tar -zxf sqlite-rembed-0.0.1-alpha.9.tar.gz` +- Clean targets remove extracted source directories + +## Code Changes Summary + +### 1. `deps/Makefile` +- Added Rust toolchain detection (`rustc`, `cargo`) +- SQLite environment variables for sqlite-rembed build +- New target: `sqlite3/libsqlite_rembed.a` that extracts from tar.gz and builds +- Added dependency to `sqlite3` target +- Clean targets remove `sqlite-rembed-*/` and `sqlite-rembed-source/` directories + +### 2. `lib/Makefile` +- Added `SQLITE_REMBED_LIB` variable pointing to static library +- Library included in `libproxysql.a` dependencies (via src/Makefile) + +### 3. `src/Makefile` +- Added `SQLITE_REMBED_LIB` variable +- Added `$(SQLITE_REMBED_LIB)` to `LIBPROXYSQLAR` dependencies + +### 4. 
`lib/Admin_Bootstrap.cpp` +- Added `extern "C" int sqlite3_rembed_init(...)` declaration +- Added `sqlite3_auto_extension((void(*)(void))sqlite3_rembed_init)` registration +- Registered after `sqlite-vec` initialization + +## Usage Examples + +### Basic Embedding Generation +```sql +-- Register an OpenAI client +INSERT INTO temp.rembed_clients(name, options) +VALUES ('openai_client', rembed_client_options('format', 'openai', 'model', 'text-embedding-3-small', 'key', 'your-api-key')); + +-- Generate embedding +SELECT rembed('openai_client', 'Hello world') as embedding; + +-- Use with vector search +CREATE VIRTUAL TABLE docs USING vec0(embedding float[1536]); +INSERT INTO docs(rowid, embedding) +VALUES (1, rembed('openai_client', 'Document text here')); + +-- Search similar documents (vec0 KNN queries require LIMIT or k = ?) +SELECT rowid, distance FROM docs +WHERE embedding MATCH rembed('openai_client', 'Query text') ORDER BY distance LIMIT 5; +``` + +### Multiple API Providers +```sql +-- OpenAI +INSERT INTO temp.rembed_clients(name, options) +VALUES ('gpt', rembed_client_options('format', 'openai', 'model', 'text-embedding-3-small', 'key', 'sk-...')); + +-- Ollama (local) +INSERT INTO temp.rembed_clients(name, options) +VALUES ('ollama', rembed_client_options('format', 'ollama', 'model', 'nomic-embed-text', 'url', 'http://localhost:11434/api/embeddings')); + +-- Cohere +INSERT INTO temp.rembed_clients(name, options) +VALUES ('cohere', rembed_client_options('format', 'cohere', 'model', 'embed-english-v3.0', 'key', 'co-...')); + +-- Nomic +INSERT INTO temp.rembed_clients(name, options) +VALUES ('nomic', rembed_client_options('format', 'nomic', 'model', 'nomic-embed-text-v1.5', 'key', 'nm-...')); +``` + +## Configuration + +### Environment Variables (for building) +```bash +export SQLITE3_INCLUDE_DIR=/path/to/sqlite-amalgamation +export SQLITE3_LIB_DIR=/path/to/sqlite-amalgamation +export SQLITE3_STATIC=1 +``` + +### Runtime Configuration +- API keys: Set via `temp.rembed_clients` table +- Timeouts: Handled by underlying HTTP client (ureq) +- Model selection: Per-client configuration + +## Error Handling + +The extension provides SQLite error messages for: +- Missing client registration +- API authentication failures +- Network connectivity 
issues +- Invalid input parameters +- Provider-specific errors + +## Performance Considerations + +### HTTP Latency +- Embedding generation involves HTTP requests to remote APIs +- Consider local embedding models (Ollama, Llamafile) for lower latency +- Batch processing not currently supported (single text inputs only) + +### Caching +- No built-in caching layer +- Applications should cache embeddings when appropriate +- Consider persisting generated embeddings in a table (e.g. a `vec0` table) and reusing them instead of calling `rembed()` again + +## Limitations + +### Current Implementation +1. **Blocking HTTP requests**: Synchronous HTTP calls may block SQLite threads +2. **Single text input**: `rembed()` accepts single text string, not batches +3. **No async support**: HTTP requests are synchronous +4. **Rust dependency**: Requires Rust toolchain for building ProxySQL + +### Security Considerations +- API keys stored in `temp.rembed_clients` table (in-memory, per-connection) +- Network access required for remote APIs +- API keys are sent with every embedding request; use HTTPS endpoints so they are encrypted in transit + +## Testing + +### Build Verification +```bash +# Clean and rebuild with tar.gz extraction +(cd deps && make cleanpart && make sqlite3) + +# Verify tar.gz extraction and Rust library build +ls deps/sqlite3/sqlite-rembed-source/ +ls deps/sqlite3/libsqlite_rembed.a + +# Verify symbol exists +nm deps/sqlite3/libsqlite_rembed.a | grep sqlite3_rembed_init +``` + +### Functional Testing +```sql +-- Test extension registration +SELECT rembed_version(); +SELECT rembed_debug(); + +-- Test client registration +INSERT INTO temp.rembed_clients(name, options) +VALUES ('test', rembed_client_options('format', 'ollama', 'model', 'nomic-embed-text')); + +-- Test embedding generation (requires running Ollama) +-- SELECT rembed('test', 'test text'); +``` + +## Future Enhancements + +### Planned Improvements +1. **Async HTTP**: Non-blocking requests using async Rust +2. **Batch processing**: Support for multiple texts in single call +3. **Embedding caching**: LRU cache for frequently generated embeddings +4. 
**More providers**: Additional embedding API support +5. **Configuration persistence**: Save clients across connections + +### Integration with sqlite-vec +- Complete AI pipeline: `rembed()` → vector storage (`vec0` table) → KNN search (`MATCH` query) +- Example: Document embedding and similarity search +- Potential for RAG (Retrieval-Augmented Generation) applications + +## Troubleshooting + +### Build Issues +1. **Missing clang**: Install `clang` and `libclang-dev` +2. **Rust not found**: Install Rust toolchain via `rustup` +3. **SQLite headers**: Ensure `sqlite-amalgamation` is extracted + +### Runtime Issues +1. **Client not found**: Verify `temp.rembed_clients` entry exists +2. **API errors**: Check API keys, network connectivity, model availability +3. **Memory issues**: Large embeddings may exceed SQLite blob limits + +## References + +- [sqlite-rembed GitHub](https://github.com/asg017/sqlite-rembed) +- [sqlite-vec Documentation](../doc/SQLite3-Server.md) +- [SQLite Loadable Extensions](https://www.sqlite.org/loadext.html) +- [Rust C FFI](https://doc.rust-lang.org/nomicon/ffi.html) + +### Source Distribution +- `deps/sqlite3/sqlite-rembed-0.0.1-alpha.9.tar.gz` - Official GitHub release tarball +- Extracted to `deps/sqlite3/sqlite-rembed-source/` during build + +## Maintainers + +- Integration: [Your Name/Team] +- Original sqlite-rembed: [Alex Garcia (@asg017)](https://github.com/asg017) +- ProxySQL Team: [ProxySQL Maintainers](https://github.com/sysown/proxysql) + +## License + +- sqlite-rembed: Apache 2.0 / MIT (see `deps/sqlite3/sqlite-rembed-source/LICENSE-*`) +- ProxySQL: GPL v3 +- Integration code: Same as ProxySQL diff --git a/doc/sqlite-rembed-test.sh b/doc/sqlite-rembed-test.sh new file mode 100755 index 0000000000..dac942dfcd --- /dev/null +++ b/doc/sqlite-rembed-test.sh @@ -0,0 +1,574 @@ +#!/bin/bash + +############################################################################### +# sqlite-rembed Integration Test Suite +# +# This script comprehensively tests the 
sqlite-rembed integration in ProxySQL, +# verifying all components of the embedding generation and vector search pipeline. +# +# Tests performed: +# 1. Basic connectivity to ProxySQL SQLite3 server +# 2. Function registration (rembed, rembed_client_options) +# 3. Client configuration in temp.rembed_clients virtual table +# 4. Embedding generation via remote HTTP API +# 5. Vector table creation and data storage +# 6. Similarity search with generated embeddings +# 7. Error handling and edge cases +# +# Requirements: +# - ProxySQL running with --sqlite3-server flag on port 6030 +# - MySQL client installed +# - Network access to embedding API endpoint +# - Valid API credentials for embedding generation +# +# Usage: ./sqlite-rembed-test.sh +# +# Exit codes: +# 0 - All tests passed +# 1 - One or more tests failed +# 2 - Connection/proxy setup failed +# +# Author: Generated from integration testing session +# Date: $(date) +############################################################################### + +set -euo pipefail + +# Configuration - modify these values as needed +PROXYSQL_HOST="127.0.0.1" +PROXYSQL_PORT="6030" +MYSQL_USER="root" +MYSQL_PASS="root" + +# API Configuration - using synthetic OpenAI endpoint for testing +# IMPORTANT: Set API_KEY environment variable or replace YOUR_API_KEY below +API_CLIENT_NAME="test-client-$(date +%s)" +API_FORMAT="openai" +API_URL="https://api.synthetic.new/openai/v1/embeddings" +API_KEY="${API_KEY:-YOUR_API_KEY}" # Uses environment variable or placeholder +API_MODEL="hf:nomic-ai/nomic-embed-text-v1.5" +VECTOR_DIMENSIONS=768 # Based on model output + +# Test results tracking +TOTAL_TESTS=0 +PASSED_TESTS=0 +FAILED_TESTS=0 +CURRENT_TEST="" + +# Color codes for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Text formatting +BOLD='\033[1m' +UNDERLINE='\033[4m' + + +############################################################################### +# Helper Functions 
+###############################################################################
+
+# Print a bold, underlined section banner surrounded by blank lines.
+#   $1 - banner text
+print_header() {
+    echo -e "\n${BLUE}${BOLD}${UNDERLINE}$1${NC}\n"
+}
+
+# Announce a test, remember its name in CURRENT_TEST and count it.
+#   $1 - test name
+# NOTE: counters use arithmetic assignment rather than ((VAR++)). The latter
+# returns exit status 1 when the counter is still 0, which terminates the
+# script if `set -e` or an ERR trap is active.
+print_test() {
+    echo -e "${YELLOW}[TEST]${NC} $1"
+    CURRENT_TEST="$1"
+    TOTAL_TESTS=$((TOTAL_TESTS + 1))
+}
+
+# Report a passed test and bump PASSED_TESTS.
+#   $1 - message
+print_success() {
+    echo -e "${GREEN}✅ SUCCESS:${NC} $1"
+    PASSED_TESTS=$((PASSED_TESTS + 1))
+}
+
+# Report a failed test and bump FAILED_TESTS.
+#   $1 - test name / short description
+#   $2 - error detail
+print_failure() {
+    echo -e "${RED}❌ FAILURE:${NC} $1"
+    echo " Error: $2"
+    FAILED_TESTS=$((FAILED_TESTS + 1))
+}
+
+# Print an informational message (does not touch any counter).
+#   $1 - message
+print_info() {
+    echo -e "${BLUE}ℹ INFO:${NC} $1"
+}
+
+# Execute a SQL statement through the mysql client against ProxySQL.
+#   $1 - SQL text
+#   $2 - "true" to add -s -N (silent, no column names) so the output is easy
+#        to match; defaults to "false"
+# stderr is merged into stdout so callers can inspect error text; the
+# function's exit status is the mysql client's exit status.
+execute_query() {
+    local sql_query="$1"
+    local capture_output="${2:-false}"
+
+    if [ "$capture_output" = "true" ]; then
+        mysql -h "$PROXYSQL_HOST" -P "$PROXYSQL_PORT" -u "$MYSQL_USER" -p"$MYSQL_PASS" \
+            -s -N -e "$sql_query" 2>&1
+    else
+        mysql -h "$PROXYSQL_HOST" -P "$PROXYSQL_PORT" -u "$MYSQL_USER" -p"$MYSQL_PASS" \
+            -e "$sql_query" 2>&1
+    fi
+}
+
+# Run one SQL-based test and record the outcome.
+#   $1 - test name
+#   $2 - SQL text
+#   $3 - optional grep (basic regex) pattern that must match some output line;
+#        when empty, a zero mysql exit status alone counts as success
+run_test() {
+    local test_name="$1"
+    local sql_query="$2"
+    local expected_pattern="${3:-}"
+
+    print_test "$test_name"
+
+    # Capture the status via `||` so a failing query reaches the failure
+    # branch below instead of aborting the script when errexit is enabled.
+    local result
+    local exit_code=0
+    result=$(execute_query "$sql_query" "true") || exit_code=$?
+
+    if [ "$exit_code" -eq 0 ]; then
+        if [ -n "$expected_pattern" ] && ! printf '%s\n' "$result" | grep -q -- "$expected_pattern"; then
+            print_failure "$test_name" "Pattern '$expected_pattern' not found in output"
+            echo " Output: $result"
+        else
+            print_success "$test_name"
+        fi
+    else
+        print_failure "$test_name" "$result"
+    fi
+}
+
+# Clean up any existing test tables
+# Drops each listed table plus the suffixed companion tables (_info, _chunks,
+# _rowids, _vector_chunks00). Best effort: output and failures are ignored.
+cleanup_tables() {
+    print_info "Cleaning up existing test tables..."
+
+    local tables=(
+        "test_documents"
+        "test_embeddings"
+        "test_docs"
+        "test_embeds"
+        "documents"
+        "document_embeddings"
+        "demo_texts"
+        "demo_embeddings"
+    )
+
+    for table in "${tables[@]}"; do
+        execute_query "DROP TABLE IF EXISTS $table;" >/dev/null 2>&1 || true
+        execute_query "DROP TABLE IF EXISTS ${table}_info;" >/dev/null 2>&1 || true
+        execute_query "DROP TABLE IF EXISTS ${table}_chunks;" >/dev/null 2>&1 || true
+        execute_query "DROP TABLE IF EXISTS ${table}_rowids;" >/dev/null 2>&1 || true
+        execute_query "DROP TABLE IF EXISTS ${table}_vector_chunks00;" >/dev/null 2>&1 || true
+    done
+
+    print_info "Cleanup completed"
+}
+
+# Print test summary
+# Returns 0 when FAILED_TESTS is 0, 1 otherwise.
+print_summary() {
+    echo -e "\n${BOLD}${UNDERLINE}Test Summary${NC}"
+    echo -e "${BOLD}Total Tests:${NC} $TOTAL_TESTS"
+    echo -e "${GREEN}${BOLD}Passed:${NC} $PASSED_TESTS"
+
+    if [ "$FAILED_TESTS" -gt 0 ]; then
+        echo -e "${RED}${BOLD}Failed:${NC} $FAILED_TESTS"
+    else
+        echo -e "${GREEN}${BOLD}Failed:${NC} $FAILED_TESTS"
+    fi
+
+    if [ "$FAILED_TESTS" -eq 0 ]; then
+        echo -e "\n${GREEN}🎉 All tests passed! sqlite-rembed integration is fully functional.${NC}"
+        return 0
+    else
+        echo -e "\n${RED}❌ Some tests failed. Please check the errors above.${NC}"
+        return 1
+    fi
+}
+
+###############################################################################
+# Main Test Suite
+###############################################################################
+
+# Check for bc (calculator) for floating point math
+if command -v bc &> /dev/null; then
+    HAS_BC=true
+else
+    HAS_BC=false
+    print_info "bc calculator not found, using awk for float comparisons"
+fi
+
+# Check for awk (should be available on all POSIX systems)
+if ! command -v awk &> /dev/null; then
+    echo -e "${RED}Error: awk not found. awk is required for this test suite.${NC}"
+    exit 2
+fi
+
+# Run every test phase in order and print the summary.
+# Returns 0 when all tests passed, 1 otherwise.
+main() {
+    print_header "sqlite-rembed Integration Test Suite"
+    echo -e "Starting at: $(date)"
+    echo -e "ProxySQL: ${PROXYSQL_HOST}:${PROXYSQL_PORT}"
+    echo -e "API Endpoint: ${API_URL}"
+    echo ""
+
+    # Initial cleanup
+    cleanup_tables
+
+    ###########################################################################
+    # Phase 1: Basic Connectivity and Function Verification
+    ###########################################################################
+    print_header "Phase 1: Basic Connectivity and Function Verification"
+
+    # Test 1.1: Basic connectivity
+    # Anchored so that only a line that is exactly "1" counts as a match.
+    run_test "Basic ProxySQL connectivity" \
+        "SELECT 1 as connectivity_test;" \
+        "^1$"
+
+    # Test 1.2: Check database
+    run_test "Database listing" \
+        "SHOW DATABASES;" \
+        "main"
+
+    # Test 1.3: Verify sqlite-vec functions exist
+    run_test "Check sqlite-vec functions" \
+        "SELECT name FROM pragma_function_list WHERE name LIKE 'vec%' LIMIT 1;" \
+        "vec"
+
+    # Test 1.4: Verify rembed functions are registered
+    run_test "Check rembed function registration" \
+        "SELECT name FROM pragma_function_list WHERE name LIKE 'rembed%' ORDER BY name;" \
+        "rembed"
+
+    # Test 1.5: Verify temp.rembed_clients virtual table schema
+    run_test "Check temp.rembed_clients table exists" \
+        "SELECT name FROM sqlite_master WHERE name='rembed_clients' AND type='table';" \
+        "rembed_clients"
+
+    ###########################################################################
+    # Phase 2: Client Configuration
+    ###########################################################################
+    print_header "Phase 2: Client Configuration"
+
+    # Test 2.1: Create embedding client
+    local create_client_sql="INSERT INTO temp.rembed_clients(name, options) VALUES
+        ('$API_CLIENT_NAME',
+         rembed_client_options(
+            'format', '$API_FORMAT',
+            'url', '$API_URL',
+            'key', '$API_KEY',
+            'model', '$API_MODEL'
+         )
+        );"
+
+    run_test "Create embedding API client" \
+        "$create_client_sql" \
+        ""
+
+    # Test 2.2: Verify client creation
+    run_test "Verify client in temp.rembed_clients" \
+        "SELECT name FROM temp.rembed_clients WHERE name='$API_CLIENT_NAME';" \
+        "$API_CLIENT_NAME"
+
+    # Test 2.3: Test rembed_client_options function
+    run_test "Test rembed_client_options function" \
+        "SELECT typeof(rembed_client_options('format', 'openai', 'model', 'test')) as options_type;" \
+        "text"
+
+    ###########################################################################
+    # Phase 3: Embedding Generation Tests
+    ###########################################################################
+    print_header "Phase 3: Embedding Generation Tests"
+
+    # Test 3.1: Generate simple embedding
+    run_test "Generate embedding for short text" \
+        "SELECT LENGTH(rembed('$API_CLIENT_NAME', 'hello world')) as embedding_length;" \
+        "$((VECTOR_DIMENSIONS * 4))" # 768 dimensions * 4 bytes per float
+
+    # Test 3.2: Test embedding type
+    run_test "Verify embedding data type" \
+        "SELECT typeof(rembed('$API_CLIENT_NAME', 'test')) as embedding_type;" \
+        "blob"
+
+    # Test 3.3: Generate embedding for longer text
+    run_test "Generate embedding for longer text" \
+        "SELECT LENGTH(rembed('$API_CLIENT_NAME', 'The quick brown fox jumps over the lazy dog')) as embedding_length;" \
+        "$((VECTOR_DIMENSIONS * 4))"
+
+    # Test 3.4: Error handling - non-existent client
+    # The query is expected to fail, so its exit status is deliberately
+    # ignored; only the error text is inspected.
+    print_test "Error handling: non-existent client"
+    local error_result
+    error_result=$(execute_query "SELECT rembed('non-existent-client', 'test');" "true") || true
+    if echo "$error_result" | grep -q "was not registered with rembed_clients"; then
+        print_success "Proper error for non-existent client"
+    else
+        print_failure "Error handling" "Expected error message not found: $error_result"
+    fi
+
+    ###########################################################################
+    # Phase 4: Table Creation and Data Storage
+    ###########################################################################
+    print_header "Phase 4: Table Creation and Data Storage"
+
+    # Test 4.1: Create regular table for documents
+    run_test "Create documents table" \
+        "CREATE TABLE test_documents (
+            id INTEGER PRIMARY KEY,
+            title TEXT NOT NULL,
+            content TEXT NOT NULL,
+            created_at DATETIME DEFAULT CURRENT_TIMESTAMP
+        );" \
+        ""
+
+    # Test 4.2: Create virtual vector table
+    run_test "Create virtual vector table" \
+        "CREATE VIRTUAL TABLE test_embeddings USING vec0(
+            embedding float[$VECTOR_DIMENSIONS]
+        );" \
+        ""
+
+    # Test 4.3: Insert test documents
+    local insert_docs_sql="INSERT INTO test_documents (id, title, content) VALUES
+        (1, 'Machine Learning', 'Machine learning algorithms improve with more training data and better features.'),
+        (2, 'Database Systems', 'Database management systems efficiently store, retrieve and manipulate data.'),
+        (3, 'Artificial Intelligence', 'AI enables computers to perform tasks typically requiring human intelligence.'),
+        (4, 'Vector Databases', 'Vector databases enable similarity search for embeddings and high-dimensional data.');"
+
+    run_test "Insert test documents" \
+        "$insert_docs_sql" \
+        ""
+
+    # Test 4.4: Verify document insertion
+    # Anchored: an unanchored "4" would also accept counts such as 14 or 40.
+    run_test "Verify document count" \
+        "SELECT COUNT(*) as doc_count FROM test_documents;" \
+        "^4$"
+
+    ###########################################################################
+    # Phase 5: Embedding Generation and Storage
+    ###########################################################################
+    print_header "Phase 5: Embedding Generation and Storage"
+
+    # Test 5.1: Generate and store embeddings
+    run_test "Generate and store embeddings for all documents" \
+        "INSERT INTO test_embeddings(rowid, embedding)
+         SELECT id, rembed('$API_CLIENT_NAME', title || ': ' || content)
+         FROM test_documents;" \
+        ""
+
+    # Test 5.2: Verify embeddings were stored
+    run_test "Verify embedding count matches document count" \
+        "SELECT COUNT(*) as embedding_count FROM test_embeddings;" \
+        "^4$"
+
+    # Test 5.3: Check embedding data structure
+    # Not anchored: the output line also carries the rowid column.
+    run_test "Check embedding storage format" \
+        "SELECT rowid, LENGTH(embedding) as bytes FROM test_embeddings LIMIT 1;" \
+        "$((VECTOR_DIMENSIONS * 4))"
+
+    ###########################################################################
+    # Phase 6: Similarity Search Tests
+    ###########################################################################
+    print_header "Phase 6: Similarity Search Tests"
+
+    # Test 6.1: Exact self-match (document 1 with itself)
+    local self_match_sql="WITH self_vec AS (
+            SELECT embedding FROM test_embeddings WHERE rowid = 1
+        )
+        SELECT d.id, d.title, e.distance
+        FROM test_documents d
+        JOIN test_embeddings e ON d.id = e.rowid
+        CROSS JOIN self_vec
+        WHERE e.embedding MATCH self_vec.embedding
+        ORDER BY e.distance ASC
+        LIMIT 3;"
+
+    print_test "Exact self-match similarity search"
+    local match_result
+    local match_status=0
+    match_result=$(execute_query "$self_match_sql" "true") || match_status=$?
+    if [ "$match_status" -eq 0 ] && echo "$match_result" | grep -q "1.*Machine Learning.*0.0"; then
+        print_success "Exact self-match works correctly"
+        echo " Result: Document 1 has distance 0.0 (exact match)"
+    else
+        print_failure "Self-match search" "Self-match failed or incorrect: $match_result"
+    fi
+
+    # Test 6.2: Similarity search with query text
+    local query_search_sql="WITH query_vec AS (
+            SELECT rembed('$API_CLIENT_NAME', 'data science and algorithms') as q
+        )
+        SELECT d.id, d.title, e.distance
+        FROM test_documents d
+        JOIN test_embeddings e ON d.id = e.rowid
+        CROSS JOIN query_vec
+        WHERE e.embedding MATCH query_vec.q
+        ORDER BY e.distance ASC
+        LIMIT 3;"
+
+    print_test "Similarity search with query text"
+    local search_result
+    local search_status=0
+    search_result=$(execute_query "$query_search_sql" "true") || search_status=$?
+    if [ "$search_status" -eq 0 ] && [ -n "$search_result" ]; then
+        print_success "Similarity search returns results"
+        echo " Results returned: $(echo "$search_result" | wc -l)"
+    else
+        print_failure "Similarity search" "Search failed: $search_result"
+    fi
+
+    # Test 6.3: Verify search ordering (distances should be ascending)
+    # Reuses the output of test 6.2 and extracts every decimal number from it.
+    print_test "Verify search result ordering"
+    local distances
+    distances=$(echo "$search_result" | grep -o '[0-9]\+\.[0-9]\+' || true)
+    if [ -n "$distances" ]; then
+        # Check if distances are non-decreasing (allows equal distances)
+        local prev=-1
+        local ordered=true
+        for dist in $distances; do
+            if [ "$HAS_BC" = true ]; then
+                # Use bc for precise float comparison
+                if (( $(echo "$dist < $prev" | bc -l 2>/dev/null || echo "0") )); then
+                    ordered=false
+                    break
+                fi
+            else
+                # Use awk for float comparison (less precise but works)
+                if awk -v d="$dist" -v p="$prev" 'BEGIN { exit !(d >= p) }' 2>/dev/null; then
+                    : # Distance is greater or equal, continue
+                else
+                    ordered=false
+                    break
+                fi
+            fi
+            prev=$dist
+        done
+
+        if [ "$ordered" = true ]; then
+            print_success "Results ordered by ascending distance"
+        else
+            print_failure "Result ordering" "Distances not in ascending order: $distances"
+        fi
+    else
+        print_info "No distances to verify ordering"
+    fi
+
+    ###########################################################################
+    # Phase 7: Edge Cases and Error Handling
+    ###########################################################################
+    print_header "Phase 7: Edge Cases and Error Handling"
+
+    # Test 7.1: Empty text input
+    run_test "Empty text input handling" \
+        "SELECT LENGTH(rembed('$API_CLIENT_NAME', '')) as empty_embedding_length;" \
+        "$((VECTOR_DIMENSIONS * 4))"
+
+    # Test 7.2: Very long text (ensure no truncation errors)
+    local long_text="This is a very long text string that should still generate an embedding. "
+    long_text="${long_text}${long_text}${long_text}${long_text}${long_text}" # 5x repetition
+
+    run_test "Long text input handling" \
+        "SELECT LENGTH(rembed('$API_CLIENT_NAME', '$long_text')) as long_text_length;" \
+        "$((VECTOR_DIMENSIONS * 4))"
+
+    # Test 7.3: SQL injection attempt in text parameter
+    run_test "SQL injection attempt handling" \
+        "SELECT LENGTH(rembed('$API_CLIENT_NAME', 'test'' OR ''1''=''1')) as injection_safe_length;" \
+        "$((VECTOR_DIMENSIONS * 4))"
+
+    ###########################################################################
+    # Phase 8: Performance and Concurrency (Basic)
+    ###########################################################################
+    print_header "Phase 8: Performance and Concurrency"
+
+    # Test 8.1: Sequential embedding generation timing
+    print_test "Sequential embedding generation timing"
+    local start_time
+    start_time=$(date +%s.%N)
+
+    # Keep the mysql exit status: a batch that fails immediately must not be
+    # reported as a fast, successful run.
+    local perf_status=0
+    execute_query "SELECT rembed('$API_CLIENT_NAME', 'performance test 1');
+        SELECT rembed('$API_CLIENT_NAME', 'performance test 2');
+        SELECT rembed('$API_CLIENT_NAME', 'performance test 3');" >/dev/null 2>&1 || perf_status=$?
+
+    local end_time
+    end_time=$(date +%s.%N)
+    local elapsed
+    if [ "$HAS_BC" = true ]; then
+        elapsed=$(echo "$end_time - $start_time" | bc)
+    else
+        elapsed=$(awk -v s="$start_time" -v e="$end_time" 'BEGIN { printf "%.2f", e - s }' 2>/dev/null || echo "0")
+    fi
+
+    if [ "$perf_status" -ne 0 ]; then
+        print_failure "Performance" "Embedding queries failed (mysql exit status $perf_status)"
+    elif [ "$HAS_BC" = true ]; then
+        if (( $(echo "$elapsed < 10" | bc -l) )); then
+            print_success "Sequential embeddings generated in ${elapsed}s"
+        else
+            print_failure "Performance" "Embedding generation took too long: ${elapsed}s"
+        fi
+    else
+        # Simple float comparison with awk
+        if awk -v e="$elapsed" 'BEGIN { exit !(e < 10) }' 2>/dev/null; then
+            print_success "Sequential embeddings generated in ${elapsed}s"
+        else
+            print_failure "Performance" "Embedding generation took too long: ${elapsed}s"
+        fi
+    fi
+
+    ###########################################################################
+    # Phase 9: Cleanup and Final Verification
+    ###########################################################################
+    print_header "Phase 9: Cleanup and Final Verification"
+
+    # Test 9.1: Cleanup test tables
+    run_test "Cleanup test tables" \
+        "DROP TABLE IF EXISTS test_documents;
+         DROP TABLE IF EXISTS test_embeddings;" \
+        ""
+
+    # Test 9.2: Verify cleanup
+    # Anchored: an unanchored "0" would also accept 10, 20, ... leftover tables.
+    run_test "Verify tables are removed" \
+        "SELECT COUNT(*) as remaining_tests FROM sqlite_master WHERE name LIKE 'test_%';" \
+        "^0$"
+
+    ###########################################################################
+    # Final Summary
+    ###########################################################################
+    print_header "Test Suite Complete"
+
+    echo -e "Embedding API Client: ${API_CLIENT_NAME}"
+    echo -e "Vector Dimensions: ${VECTOR_DIMENSIONS}"
+    echo -e "Total Operations Tested: ${TOTAL_TESTS}"
+
+    # print_summary returns 1 on failures; capture it without tripping errexit.
+    local summary_exit=0
+    print_summary || summary_exit=$?
+
+    # Final system status: the all-green component list is printed only when
+    # every test passed, so a failing run never claims a healthy system.
+    if [ "$summary_exit" -eq 0 ]; then
+        echo -e "\n${BOLD}System Status:${NC}"
+        echo -e "ProxySQL SQLite3 Server: ${GREEN}✅ Accessible${NC}"
+        echo -e "sqlite-rembed Extension: ${GREEN}✅ Loaded${NC}"
+        echo -e "Embedding API: ${GREEN}✅ Responsive${NC}"
+        echo -e "Vector Search: ${GREEN}✅ Functional${NC}"
+
+        echo -e "\n${GREEN}${BOLD}✓ sqlite-rembed integration test suite completed successfully${NC}"
+        echo -e "All components are functioning correctly."
+    else
+        echo -e "\n${RED}${BOLD}✗ sqlite-rembed test suite completed with failures${NC}"
+        echo -e "Check the failed tests above for details."
+    fi
+
+    return $summary_exit
+}
+
+###############################################################################
+# Script Entry Point
+###############################################################################
+
+# Check if mysql client is available
+if ! command -v mysql &> /dev/null; then
+    echo -e "${RED}Error: MySQL client not found. Please install mysql-client.${NC}"
+    exit 2
+fi
+
+# Check connectivity to ProxySQL
+if ! mysql -h "$PROXYSQL_HOST" -P "$PROXYSQL_PORT" -u "$MYSQL_USER" -p"$MYSQL_PASS" \
+    -e "SELECT 1;" &>/dev/null; then
+    echo -e "${RED}Error: Cannot connect to ProxySQL at ${PROXYSQL_HOST}:${PROXYSQL_PORT}${NC}"
+    echo "Make sure ProxySQL is running with: ./proxysql --sqlite3-server"
+    exit 2
+fi
+
+# Run main test suite
+main
+exit $?
\ No newline at end of file
diff --git a/lib/Admin_Bootstrap.cpp b/lib/Admin_Bootstrap.cpp
index 3acf7715f5..92271f3fdf 100644
--- a/lib/Admin_Bootstrap.cpp
+++ b/lib/Admin_Bootstrap.cpp
@@ -93,6 +93,7 @@ using json = nlohmann::json;
  * @see https://github.com/asg017/sqlite-vec for sqlite-vec documentation
  */
 extern "C" int sqlite3_vec_init(sqlite3 *db, char **pzErrMsg, const sqlite3_api_routines *pApi);
+extern "C" int sqlite3_rembed_init(sqlite3 *db, char **pzErrMsg, const sqlite3_api_routines *pApi);
 
 #include "microhttpd.h"
 #if (defined(__i386__) || defined(__x86_64__) || defined(__ARM_ARCH_3__) || defined(__mips__)) && defined(__linux)
@@ -609,6 +610,7 @@ bool ProxySQL_Admin::init(const bootstrap_info_t& bootstrap_info) {
 	 * for SQLite's auto-extension mechanism.
 	 */
 	sqlite3_auto_extension( (void(*)(void))sqlite3_vec_init);
+	sqlite3_auto_extension( (void(*)(void))sqlite3_rembed_init);
 
 	/**
 	 * @brief Open the stats database with shared cache mode
diff --git a/lib/Makefile b/lib/Makefile
index db03b04009..3229254228 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -6,6 +6,7 @@ PROXYSQL_PATH := $(shell while [ ! -f ./src/proxysql_global.cpp ]; do cd ..; don
 
 include $(PROXYSQL_PATH)/include/makefiles_vars.mk
 include $(PROXYSQL_PATH)/include/makefiles_paths.mk
+SQLITE_REMBED_LIB := $(SQLITE3_LDIR)/../libsqlite_rembed.a
 
 IDIRS := -I$(PROXYSQL_IDIR) \
 	-I$(JEMALLOC_IDIR) \
diff --git a/src/Makefile b/src/Makefile
index d4b3fe8373..71412f1e18 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -130,6 +130,7 @@ ifeq ($(CENTOSVER),6)
 	MYLIBS += -lgcrypt
 endif
 
+SQLITE_REMBED_LIB := $(DEPS_PATH)/sqlite3/libsqlite_rembed.a
 LIBPROXYSQLAR := $(PROXYSQL_LDIR)/libproxysql.a
 ifeq ($(UNAME_S),Darwin)
 	LIBPROXYSQLAR += $(JEMALLOC_LDIR)/libjemalloc.a
@@ -145,7 +146,7 @@ ifeq ($(UNAME_S),Darwin)
 	LIBPROXYSQLAR += $(LIBINJECTION_LDIR)/libinjection.a
 	LIBPROXYSQLAR += $(EV_LDIR)/libev.a
 endif
-LIBPROXYSQLAR += $(CITYHASH_LDIR)/libcityhash.a
+LIBPROXYSQLAR += $(CITYHASH_LDIR)/libcityhash.a $(SQLITE_REMBED_LIB)
 
 ODIR := obj
 