diff --git a/README.md b/README.md index a4d2ee3..48c6c9a 100644 --- a/README.md +++ b/README.md @@ -4,12 +4,15 @@ [![Release](https://img.shields.io/github/v/release/vmvarela/sql-pipe)](https://github.com/vmvarela/sql-pipe/releases/latest) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE) -`sql-pipe` reads CSV, JSON, or NDJSON from stdin, loads it into an in-memory SQLite database, runs a SQL query, and prints the results. No server, no schema files, no setup. +`sql-pipe` reads CSV, JSON, or NDJSON from stdin or file arguments, loads it into an in-memory SQLite database, runs a SQL query, and prints the results. No server, no schema files, no setup. It exists because `awk` is cryptic, spinning up a Python interpreter for a one-liner feels wrong, and `sqlite3 :memory:` takes four commands before you can query anything. If you know SQL and work with CSV in the terminal, this is the tool you've been reaching for. ```sh $ curl -s https://example.com/data.csv | sql-pipe 'SELECT region, SUM(revenue) FROM t GROUP BY region ORDER BY 2 DESC' + +# Or pass files directly — each file becomes a table named after its basename +$ sql-pipe orders.csv 'SELECT * FROM orders WHERE amount > 100' ``` ## Quick Start @@ -143,7 +146,11 @@ Binary lands at `./zig-out/bin/sql-pipe`. SQLite is compiled from the official a ## Usage -The input comes from stdin. For CSV and TSV, the first row must be a header — those column names become the schema for a table called `t`. Results go to stdout as comma-separated values by default. +Input comes from stdin or file arguments. For CSV and TSV, the first row must be a header — those column names become the schema. Results go to stdout as comma-separated values by default. + +### Stdin input (table `t`) + +When reading from stdin, data is loaded into a table called `t`: ```sh $ printf 'name,age\nAlice,30\nBob,25\nCarol,35' | sql-pipe 'SELECT * FROM t' @@ -220,6 +227,36 @@ $ cat events.xml | sql-pipe -I xml --xml-root events --xml-row event \ 'SELECT name, date FROM t WHERE type = "conference"' ``` +### File arguments + +Pass files as positional arguments instead of piping through stdin. Each file becomes a table named after its basename (without extension). The input format is auto-detected from the file extension (`.csv`, `.tsv`, `.json`, `.ndjson`, `.xml`): + +```sh +# Single file — no more cat +$ sql-pipe orders.csv 'SELECT * FROM orders WHERE amount > 100' + +# Multi-file join — the #1 reason people reach for DuckDB +$ sql-pipe orders.csv customers.csv \ + 'SELECT c.name, SUM(o.amount) FROM orders o + JOIN customers c ON o.cust_id = c.id GROUP BY c.name' +``` + +Stdin still works and is always available as table `t`. Mix stdin with file arguments: + +```sh +# Stdin as t, file as named table +$ cat events.csv | sql-pipe users.csv 'SELECT * FROM t JOIN users ON t.uid = users.id' + +# Stdin still works the old way +$ cat data.csv | sql-pipe 'SELECT * FROM t' +``` + +Use `--` to separate files from the query when needed (e.g. if a filename starts with `-`): + +```sh +$ sql-pipe -- data.csv 'SELECT * FROM data' +``` + Chain queries by piping back in — useful for two-pass aggregations. Pass `-H` to the first call so the second one sees column names: ```sh @@ -250,6 +287,7 @@ $ cat events.csv \ | `-s`, `--silent` | Suppress `Loaded rows in s` and the progress counter from stderr unconditionally. Cannot be combined with `-v`/`--verbose` | | `-h`, `--help` | Show usage help and exit | | `-V`, `--version` | Print version and exit | +| `--` | End of options — treat all remaining arguments as files or query | After loading, `sql-pipe` prints `Loaded rows in s` to stderr whenever stderr is a TTY (interactive terminal). The message is suppressed in scripts and pipes to keep them noise-free. Use `-v` / `--verbose` to force it regardless of TTY, or `-s` / `--silent` to suppress it unconditionally (e.g. when stderr is a TTY but you want clean output): @@ -297,6 +335,15 @@ error: no such column: amout ## Recipes +**Multi-file join:** + +```sh +$ sql-pipe orders.csv customers.csv \ + 'SELECT c.name, SUM(o.amount) as total + FROM orders o JOIN customers c ON o.cust_id = c.id + GROUP BY c.name ORDER BY total DESC' +``` + **Top N rows by a column:** ```sh @@ -499,7 +546,7 @@ The database never touches disk and vanishes when the process exits. No state, n ## Limitations -- **Single table per invocation.** For joins, use chained `sql-pipe` calls or a `WITH` CTE. +- **File format auto-detection** is based on file extension. Files without a recognized extension (`.csv`, `.tsv`, `.json`, `.ndjson`, `.xml`) fall back to the `-I` flag value (default: CSV). ## Related diff --git a/build.zig b/build.zig index 571469e..953f349 100644 --- a/build.zig +++ b/build.zig @@ -395,11 +395,11 @@ pub fn build(b: *std.Build) void { test_columns_tsv.step.dependOn(b.getInstallStep()); test_step.dependOn(&test_columns_tsv.step); - // Integration test 37: --columns combined with a query argument exits 1 with error + // Integration test 37: --columns with a non-file positional arg exits 2 with error const test_columns_with_query = b.addSystemCommand(&.{ "bash", "-c", \\msg=$(printf 'a,b\n1,2\n' | ./zig-out/bin/sql-pipe --columns 'SELECT * FROM t' 2>&1 >/dev/null; echo "EXIT:$?") - \\echo "$msg" | grep -q 'error: --columns cannot be combined with a query argument' && echo "$msg" | grep -q 'EXIT:1' + \\echo "$msg" | grep -q 'error: cannot open file' && echo "$msg" | grep -q 'EXIT:2' }); test_columns_with_query.step.dependOn(b.getInstallStep()); test_step.dependOn(&test_columns_with_query.step); @@ -581,11 +581,11 @@ pub fn build(b: *std.Build) void { test_json_bool_value.step.dependOn(b.getInstallStep()); test_step.dependOn(&test_json_bool_value.step); - // Integration test 56: empty JSON array → error exit 2 + // Integration test 56: empty JSON array → no such table error (table not created) const test_json_empty_array = b.addSystemCommand(&.{ "bash", "-c", \\msg=$(printf '[]' | ./zig-out/bin/sql-pipe -I json 'SELECT * FROM t' 2>&1 >/dev/null; echo "EXIT:$?") - \\echo "$msg" | grep -q 'empty JSON array' && echo "$msg" | grep -q 'EXIT:2' + \\echo "$msg" | grep -q 'no such table' && echo "$msg" | grep -q 'EXIT:3' }); test_json_empty_array.step.dependOn(b.getInstallStep()); test_step.dependOn(&test_json_empty_array.step); @@ -809,11 +809,11 @@ pub fn build(b: *std.Build) void { test_validate_delimiter.step.dependOn(b.getInstallStep()); test_step.dependOn(&test_validate_delimiter.step); - // Integration test 78: --validate with query argument exits 1 + // Integration test 78: --validate with a non-file positional arg exits 2 const test_validate_with_query = b.addSystemCommand(&.{ "bash", "-c", \\msg=$(printf 'a,b\n1,2\n' | ./zig-out/bin/sql-pipe --validate 'SELECT * FROM t' 2>&1 >/dev/null; echo "EXIT:$?") - \\echo "$msg" | grep -q 'error: --validate cannot be combined with a query argument' && echo "$msg" | grep -q 'EXIT:1' + \\echo "$msg" | grep -q 'error: cannot open file' && echo "$msg" | grep -q 'EXIT:2' }); test_validate_with_query.step.dependOn(b.getInstallStep()); test_step.dependOn(&test_validate_with_query.step); @@ -930,11 +930,11 @@ pub fn build(b: *std.Build) void { test_sample_zero.step.dependOn(b.getInstallStep()); test_step.dependOn(&test_sample_zero.step); - // Integration test 90: --sample combined with a query argument exits 1 with error + // Integration test 90: --sample with a non-file positional arg exits 2 const test_sample_with_query = b.addSystemCommand(&.{ "bash", "-c", \\msg=$(printf 'a,b\n1,2\n' | ./zig-out/bin/sql-pipe --sample 5 'SELECT * FROM t' 2>&1 >/dev/null; echo "EXIT:$?") - \\echo "$msg" | grep -q 'error: --sample cannot be combined with a query argument' && echo "$msg" | grep -q 'EXIT:1' + \\echo "$msg" | grep -q 'error: cannot open file' && echo "$msg" | grep -q 'EXIT:2' }); test_sample_with_query.step.dependOn(b.getInstallStep()); test_step.dependOn(&test_sample_with_query.step); @@ -1097,11 +1097,11 @@ pub fn build(b: *std.Build) void { test_xml_null_output.step.dependOn(b.getInstallStep()); test_step.dependOn(&test_xml_null_output.step); - // Integration test 107: Documento XML vacío → error con "empty input" + // Integration test 107: Empty XML document → no such table error (table not created) const test_xml_empty_input = b.addSystemCommand(&.{ "bash", "-c", - \\msg=$(printf '' | ./zig-out/bin/sql-pipe -I xml 'SELECT 1' 2>&1; echo "EXIT:$?") - \\echo "$msg" | grep -q 'empty input' && echo "$msg" | grep -qv 'EXIT:0' + \\msg=$(printf '' | ./zig-out/bin/sql-pipe -I xml 'SELECT 1 FROM t' 2>&1; echo "EXIT:$?") + \\echo "$msg" | grep -q 'no such table' && echo "$msg" | grep -q 'EXIT:3' }); test_xml_empty_input.step.dependOn(b.getInstallStep()); test_step.dependOn(&test_xml_empty_input.step); @@ -1593,6 +1593,350 @@ pub fn build(b: *std.Build) void { }); test_date_no_type_inference.step.dependOn(b.getInstallStep()); test_step.dependOn(&test_date_no_type_inference.step); + // ─── File argument integration tests (issue #155) ─────────────────────── + + // Integration test 155a: Single file argument — query by table name + const test_file_single = b.addSystemCommand(&.{ + "bash", "-c", + \\dir=$(mktemp -d) + \\printf 'name,amount\nAlice,150\nBob,80\nCarol,200\n' > "$dir/orders.csv" + \\result=$(./zig-out/bin/sql-pipe "$dir/orders.csv" 'SELECT name FROM orders WHERE amount > 100') + \\rm -rf "$dir" + \\[ "$result" = "$(printf 'Alice\nCarol')" ] + }); + test_file_single.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_file_single.step); + + // Integration test 155b: Multi-file join + const test_file_multi_join = b.addSystemCommand(&.{ + "bash", "-c", + \\dir=$(mktemp -d) + \\printf 'id,name\n1,Alice\n2,Bob\n3,Carol\n' > "$dir/users.csv" + \\printf 'user_id,amount\n1,150\n2,80\n3,200\n1,50\n' > "$dir/orders.csv" + \\result=$(./zig-out/bin/sql-pipe "$dir/users.csv" "$dir/orders.csv" \ + \\ 'SELECT u.name, SUM(o.amount) FROM users u JOIN orders o ON u.id = o.user_id GROUP BY u.name ORDER BY u.name') + \\rm -rf "$dir" + \\expected=$(printf 'Alice,200\nBob,80\nCarol,200') + \\[ "$result" = "$expected" ] + }); + test_file_multi_join.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_file_multi_join.step); + + // Integration test 155c: Stdin + file mix + const test_file_stdin_mix = b.addSystemCommand(&.{ + "bash", "-c", + \\dir=$(mktemp -d) + \\printf 'uid,name\n1,Alice\n2,Bob\n' > "$dir/users.csv" + \\result=$(printf 'user_id,amount\n1,150\n2,80\n' \ + \\ | ./zig-out/bin/sql-pipe "$dir/users.csv" 'SELECT t.amount, u.name FROM t JOIN users u ON t.user_id = u.uid ORDER BY u.name') + \\rm -rf "$dir" + \\[ "$result" = "$(printf '150,Alice\n80,Bob')" ] + }); + test_file_stdin_mix.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_file_stdin_mix.step); + + // Integration test 155d: File with .tsv extension auto-detected as TSV + const test_file_tsv_ext = b.addSystemCommand(&.{ + "bash", "-c", + \\dir=$(mktemp -d) + \\printf 'name\tage\nAlice\t30\nBob\t25\n' > "$dir/data.tsv" + \\result=$(./zig-out/bin/sql-pipe "$dir/data.tsv" 'SELECT name FROM data WHERE age > 27') + \\rm -rf "$dir" + \\[ "$result" = "Alice" ] + }); + test_file_tsv_ext.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_file_tsv_ext.step); + + // Integration test 155e: -- separator works + const test_file_dashdash = b.addSystemCommand(&.{ + "bash", "-c", + \\dir=$(mktemp -d) + \\printf 'x,y\n1,2\n' > "$dir/data.csv" + \\result=$(./zig-out/bin/sql-pipe -- "$dir/data.csv" 'SELECT y FROM data') + \\rm -rf "$dir" + \\[ "$result" = "2" ] + }); + test_file_dashdash.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_file_dashdash.step); + + // Integration test 155f: Non-existent file argument exits 2 + const test_file_not_found = b.addSystemCommand(&.{ + "bash", "-c", + \\msg=$(./zig-out/bin/sql-pipe /tmp/nonexistent_file_xyz.csv 'SELECT * FROM t' 2>&1 >/dev/null; echo "EXIT:$?") + \\echo "$msg" | grep -q 'error: cannot open file' && echo "$msg" | grep -q 'EXIT:2' + }); + test_file_not_found.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_file_not_found.step); + + // Integration test 155g: File with no extension uses default CSV format + const test_file_no_ext = b.addSystemCommand(&.{ + "bash", "-c", + \\dir=$(mktemp -d) + \\printf 'name,age\nAlice,30\n' > "$dir/mydata" + \\result=$(./zig-out/bin/sql-pipe "$dir/mydata" 'SELECT name FROM mydata') + \\rm -rf "$dir" + \\[ "$result" = "Alice" ] + }); + test_file_no_ext.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_file_no_ext.step); + + // Integration test 155h: Stdin still works as `t` (existing behavior preserved) + const test_file_stdin_only = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(printf 'name,age\nAlice,30\nBob,25\n' \ + \\ | ./zig-out/bin/sql-pipe 'SELECT name FROM t WHERE age > 27') + \\[ "$result" = "Alice" ] + }); + test_file_stdin_only.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_file_stdin_only.step); + + // Integration test 155i: No file args + no stdin + no query → error + const test_file_no_input = b.addSystemCommand(&.{ + "bash", "-c", + \\msg=$(./zig-out/bin/sql-pipe 2>&1 >/dev/null; echo "EXIT:$?") + \\echo "$msg" | grep -q 'Usage:' && echo "$msg" | grep -q 'EXIT:1' + }); + test_file_no_input.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_file_no_input.step); + + // Integration test 155j: File named t.csv conflicts with stdin table t + const test_file_t_conflict = b.addSystemCommand(&.{ + "bash", "-c", + \\dir=$(mktemp -d) + \\printf 'a,b\n1,2\n' > "$dir/t.csv" + \\msg=$(printf 'x,y\n3,4\n' | ./zig-out/bin/sql-pipe "$dir/t.csv" 'SELECT * FROM t' 2>&1 >/dev/null; echo "EXIT:$?") + \\rm -rf "$dir" + \\echo "$msg" | grep -q 'duplicate table name' && echo "$msg" | grep -q 'EXIT:1' + }); + test_file_t_conflict.step.dependOn(b.getInstallStep()); + test_step.dependOn(&test_file_t_conflict.step); + + // ─── Fixture-based integration tests ───────────────────────────────────── + // These tests use sample files committed in tests/fixtures/ to exercise + // the binary end-to-end with realistic data across all supported formats. + + const fixture_test_step = b.step("fixture-test", "Run fixture-based integration tests"); + fixture_test_step.dependOn(b.getInstallStep()); + test_step.dependOn(fixture_test_step); + + // Fixture test 1: CSV file argument — basic query + const fixture_csv_basic = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(./zig-out/bin/sql-pipe tests/fixtures/orders.csv 'SELECT product, SUM(amount) FROM orders GROUP BY product ORDER BY product') + \\expected=$(printf 'Doohickey,200.0\nGadget,125.5\nThingamajig,300.0\nWidget,345.25') + \\[ "$result" = "$expected" ] + }); + fixture_test_step.dependOn(&fixture_csv_basic.step); + + // Fixture test 2: CSV file argument — type inference (amount is REAL, date is DATE) + const fixture_csv_types = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(./zig-out/bin/sql-pipe tests/fixtures/orders.csv 'SELECT COUNT(*), SUM(amount) FROM orders WHERE amount > 100') + \\[ "$result" = "4,770.0" ] + }); + fixture_test_step.dependOn(&fixture_csv_types.step); + + // Fixture test 3: CSV file argument — date column works with SQLite date functions + const fixture_csv_dates = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(./zig-out/bin/sql-pipe tests/fixtures/orders.csv 'SELECT strftime("%Y", date) AS year, SUM(amount) FROM orders GROUP BY year ORDER BY year') + \\expected=$(printf '2024,970.75') + \\[ "$result" = "$expected" ] + }); + fixture_test_step.dependOn(&fixture_csv_dates.step); + + // Fixture test 4: Multi-file CSV join (orders + customers) + const fixture_csv_join = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(./zig-out/bin/sql-pipe tests/fixtures/orders.csv tests/fixtures/customers.csv \ + \\ 'SELECT c.name, c.region, SUM(o.amount) as total FROM orders o JOIN customers c ON o.customer_id = c.id GROUP BY c.name ORDER BY total DESC') + \\expected=$(printf 'Alice,East,395.0\nBob,West,380.5\nCarol,East,195.25') + \\[ "$result" = "$expected" ] + }); + fixture_test_step.dependOn(&fixture_csv_join.step); + + // Fixture test 5: CSV file via stdin (piped) + const fixture_csv_stdin = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(cat tests/fixtures/orders.csv | ./zig-out/bin/sql-pipe 'SELECT DISTINCT product FROM t ORDER BY product') + \\expected=$(printf 'Doohickey\nGadget\nThingamajig\nWidget') + \\[ "$result" = "$expected" ] + }); + fixture_test_step.dependOn(&fixture_csv_stdin.step); + + // Fixture test 6: JSON file argument — auto-detected from .json extension + const fixture_json_file = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(./zig-out/bin/sql-pipe tests/fixtures/products.json 'SELECT name, price FROM products ORDER BY CAST(price AS REAL) DESC') + \\expected=$(printf 'Thingamajig,60.0\nDoohickey,50.0\nGadget,40.25\nWidget,25.0') + \\[ "$result" = "$expected" ] + }); + fixture_test_step.dependOn(&fixture_json_file.step); + + // Fixture test 7: JSON file — filter and aggregate + const fixture_json_filter = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(./zig-out/bin/sql-pipe tests/fixtures/products.json 'SELECT category, COUNT(*) FROM products GROUP BY category ORDER BY category') + \\expected=$(printf 'electronics,2\nhardware,2') + \\[ "$result" = "$expected" ] + }); + fixture_test_step.dependOn(&fixture_json_filter.step); + + // Fixture test 8: JSON file via stdin + const fixture_json_stdin = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(cat tests/fixtures/products.json | ./zig-out/bin/sql-pipe -I json 'SELECT name FROM t WHERE CAST(stock AS INTEGER) > 0 ORDER BY name') + \\expected=$(printf 'Doohickey\nGadget\nWidget') + \\[ "$result" = "$expected" ] + }); + fixture_test_step.dependOn(&fixture_json_stdin.step); + + // Fixture test 9: NDJSON file argument — auto-detected from .ndjson extension + const fixture_ndjson_file = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(./zig-out/bin/sql-pipe tests/fixtures/events.ndjson 'SELECT event, COUNT(*) FROM events GROUP BY event ORDER BY event') + \\expected=$(printf 'login,2\nlogout,1\npurchase,2') + \\[ "$result" = "$expected" ] + }); + fixture_test_step.dependOn(&fixture_ndjson_file.step); + + // Fixture test 10: NDJSON file — user activity summary + const fixture_ndjson_user = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(./zig-out/bin/sql-pipe tests/fixtures/events.ndjson 'SELECT user, COUNT(*) as n FROM events GROUP BY user ORDER BY n DESC, user') + \\expected=$(printf 'alice,3\nbob,1\ncarol,1') + \\[ "$result" = "$expected" ] + }); + fixture_test_step.dependOn(&fixture_ndjson_user.step); + + // Fixture test 11: XML file argument — auto-detected from .xml extension, with --xml-root/--xml-row + const fixture_xml_file = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(./zig-out/bin/sql-pipe tests/fixtures/feed.xml -I xml --xml-root channel --xml-row item 'SELECT author, title FROM feed ORDER BY author') + \\expected=$(printf 'Alice,First Post\nBob,Second Post\nCarol,Third Post') + \\[ "$result" = "$expected" ] + }); + fixture_test_step.dependOn(&fixture_xml_file.step); + + // Fixture test 12: XML file — aggregate views + const fixture_xml_aggregate = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(./zig-out/bin/sql-pipe tests/fixtures/feed.xml -I xml --xml-root channel --xml-row item 'SELECT SUM(CAST(views AS INTEGER)) FROM feed') + \\[ "$result" = "430" ] + }); + fixture_test_step.dependOn(&fixture_xml_aggregate.step); + + // Fixture test 13: XML file via stdin + const fixture_xml_stdin = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(cat tests/fixtures/feed.xml | ./zig-out/bin/sql-pipe -I xml --xml-root channel --xml-row item 'SELECT title FROM t WHERE CAST(views AS INTEGER) > 100 ORDER BY title') + \\expected=$(printf 'First Post\nThird Post') + \\[ "$result" = "$expected" ] + }); + fixture_test_step.dependOn(&fixture_xml_stdin.step); + + // Fixture test 13: Mixed format — CSV file + JSON file join (via shared column) + // orders.csv has "product" column, products.json has "name" column + const fixture_mixed_join = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(./zig-out/bin/sql-pipe tests/fixtures/orders.csv tests/fixtures/products.json \ + \\ 'SELECT o.product, p.category, SUM(o.amount) FROM orders o JOIN products p ON o.product = p.name GROUP BY o.product ORDER BY o.product' 2>/dev/null) + \\expected=$(printf 'Doohickey,hardware,200.0\nGadget,electronics,125.5\nThingamajig,electronics,300.0\nWidget,hardware,345.25') + \\[ "$result" = "$expected" ] + }); + fixture_test_step.dependOn(&fixture_mixed_join.step); + + // Fixture test 14: CSV file + NDJSON stdin mix + const fixture_csv_ndjson_mix = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(cat tests/fixtures/events.ndjson | ./zig-out/bin/sql-pipe -I ndjson tests/fixtures/customers.csv \ + \\ 'SELECT c.name, e.event FROM t e JOIN customers c ON LOWER(e.user) = LOWER(c.name) ORDER BY c.name, e.event') + \\expected=$(printf 'Alice,login\nAlice,logout\nAlice,purchase\nBob,purchase\nCarol,login') + \\[ "$result" = "$expected" ] + }); + fixture_test_step.dependOn(&fixture_csv_ndjson_mix.step); + + // Fixture test 15: --columns with fixture file (via stdin) + const fixture_columns = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(cat tests/fixtures/orders.csv | ./zig-out/bin/sql-pipe --columns) + \\expected=$(printf 'id\ncustomer_id\nproduct\namount\ndate') + \\[ "$result" = "$expected" ] + }); + fixture_test_step.dependOn(&fixture_columns.step); + + // Fixture test 16: --validate with fixture file (via stdin) + const fixture_validate = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(cat tests/fixtures/orders.csv | ./zig-out/bin/sql-pipe --validate) + \\echo "$result" | grep -q 'OK: 7 rows, 5 columns' + }); + fixture_test_step.dependOn(&fixture_validate.step); + + // Fixture test 17: --sample with fixture file (via stdin) + const fixture_sample = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(cat tests/fixtures/orders.csv | ./zig-out/bin/sql-pipe --sample 2 2>/dev/null) + \\expected=$(printf 'id,customer_id,product,amount,date\n1,1,Widget,150.00,2024-01-15\n2,2,Gadget,80.50,2024-02-20') + \\[ "$result" = "$expected" ] + }); + fixture_test_step.dependOn(&fixture_sample.step); + + // Fixture test 18: --output with fixture file + const fixture_output = b.addSystemCommand(&.{ + "bash", "-c", + \\tmp=$(mktemp) + \\./zig-out/bin/sql-pipe tests/fixtures/orders.csv --output "$tmp" 'SELECT COUNT(*) FROM orders' + \\result=$(cat "$tmp") + \\rm -f "$tmp" + \\[ "$result" = "7" ] + }); + fixture_test_step.dependOn(&fixture_output.step); + + // Fixture test 19: JSON output from CSV fixture + const fixture_json_output = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(./zig-out/bin/sql-pipe tests/fixtures/customers.csv --json 'SELECT name, region FROM customers ORDER BY name') + \\expected='[{"name":"Alice","region":"East"},{"name":"Bob","region":"West"},{"name":"Carol","region":"East"}]' + \\[ "$result" = "$expected" ] + }); + fixture_test_step.dependOn(&fixture_json_output.step); + + // Fixture test 20: --header with fixture file + const fixture_header = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(./zig-out/bin/sql-pipe tests/fixtures/customers.csv --header 'SELECT name FROM customers ORDER BY name') + \\expected=$(printf 'name\nAlice\nBob\nCarol') + \\[ "$result" = "$expected" ] + }); + fixture_test_step.dependOn(&fixture_header.step); + + // Fixture test 21: --columns with file argument + const fixture_columns_file = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(./zig-out/bin/sql-pipe tests/fixtures/orders.csv --columns) + \\expected=$(printf 'id\ncustomer_id\nproduct\namount\ndate') + \\[ "$result" = "$expected" ] + }); + fixture_test_step.dependOn(&fixture_columns_file.step); + + // Fixture test 22: --validate with file argument + const fixture_validate_file = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(./zig-out/bin/sql-pipe tests/fixtures/orders.csv --validate) + \\echo "$result" | grep -q 'OK: 7 rows, 5 columns' + }); + fixture_test_step.dependOn(&fixture_validate_file.step); + + // Fixture test 23: --sample with file argument + const fixture_sample_file = b.addSystemCommand(&.{ + "bash", "-c", + \\result=$(./zig-out/bin/sql-pipe tests/fixtures/orders.csv --sample 2 2>/dev/null) + \\expected=$(printf 'id,customer_id,product,amount,date\n1,1,Widget,150.00,2024-01-15\n2,2,Gadget,80.50,2024-02-20') + \\[ "$result" = "$expected" ] + }); + fixture_test_step.dependOn(&fixture_sample_file.step); + const unit_tests = b.addTest(.{ .root_module = b.createModule(.{ .root_source_file = b.path("src/csv.zig"), diff --git a/docs/sql-pipe.1.scd b/docs/sql-pipe.1.scd index 075671d..82458e3 100644 --- a/docs/sql-pipe.1.scd +++ b/docs/sql-pipe.1.scd @@ -1,27 +1,39 @@ sql-pipe(1) NAME - sql-pipe - Execute SQL queries on CSV data from stdin + sql-pipe - Execute SQL queries on CSV/JSON/XML data from stdin or files SYNOPSIS + *sql-pipe* [OPTIONS] [...] *sql-pipe* [OPTIONS] - *sql-pipe* --columns [OPTIONS] - *sql-pipe* --sample [] [OPTIONS] + *sql-pipe* --columns [OPTIONS] [] + *sql-pipe* --sample [] [OPTIONS] [] DESCRIPTION - sql-pipe reads CSV data from standard input, loads it into an in-memory SQLite - database table named *t*, executes the provided SQL query, and prints the results - as CSV (or JSON when *--json* is used) to standard output. - - This tool is useful for quick data transformations, filtering, grouping, and - aggregations on CSV files without manual SQL database setup. - - All input columns are automatically loaded into the table with names derived from - the CSV header row. By default, column types (TEXT, INTEGER, REAL, DATE, DATETIME) - are inferred from the first 100 rows of data. DATE and DATETIME values are - normalized to ISO 8601 on insert, enabling SQLite date functions (*date()*, - *strftime()*, *julianday()*) to work directly on those columns. Use - *--no-type-inference* to disable this behavior and treat all columns as TEXT. + sql-pipe reads CSV, JSON, NDJSON, or XML data from standard input and/or + file arguments, loads it into an in-memory SQLite database, executes the + provided SQL query, and prints the results as CSV (or JSON/XML when + requested) to standard output. + + When reading from stdin, data is loaded into a table named *t*. When files + are passed as positional arguments, each file becomes a table named after + its basename without extension (e.g., *orders.csv* becomes table *orders*). + Stdin and file arguments can be combined — stdin is always table *t*. + + Input format for files is auto-detected from the file extension (*.csv*, + *.tsv*, *.json*, *.ndjson*, *.xml*). Files without a recognized extension + fall back to the *-I* flag value (default: CSV). + + This tool is useful for quick data transformations, filtering, grouping, + aggregations, and multi-file joins without manual SQL database setup. + + All input columns are automatically loaded into the table with names derived + from the CSV header row (or JSON keys). By default, column types (TEXT, + INTEGER, REAL, DATE, DATETIME) are inferred from the first 100 rows of data. + DATE and DATETIME values are normalized to ISO 8601 on insert, enabling + SQLite date functions (*date()*, *strftime()*, *julianday()*) to work + directly on those columns. Use *--no-type-inference* to disable this + behavior and treat all columns as TEXT. By default, input fields are parsed as comma-separated values. Use *--delimiter* (or *-d*) to parse other delimiters (1–8 characters), or *--tsv* @@ -126,8 +138,13 @@ OPTIONS *-V, --version* Print the version number and exit with code 0. + *--* + End of options. All remaining arguments are treated as positional + arguments (files or query), even if they start with a dash. Use this + when a filename begins with *-*. + EXAMPLES - Select all rows: + Select all rows from stdin: $ cat people.csv | sql-pipe 'SELECT name, age FROM t' @@ -135,6 +152,21 @@ EXAMPLES Alice,30++ Bob,25 + Query a file directly (table named after basename): + + $ sql-pipe orders.csv 'SELECT * FROM orders WHERE amount > 100' + + Join two files: + + $ sql-pipe orders.csv customers.csv ++ + 'SELECT c.name, SUM(o.amount) FROM orders o ++ + JOIN customers c ON o.cust_id = c.id GROUP BY c.name' + + Mix stdin (as table t) with a file argument: + + $ cat events.csv | sql-pipe users.csv ++ + 'SELECT * FROM t JOIN users ON t.uid = users.id' + Group data by region and sum revenue: $ cat sales.csv | sql-pipe 'SELECT region, SUM(revenue) FROM t GROUP BY region' diff --git a/src/args.zig b/src/args.zig index a81fec1..86b92d9 100644 --- a/src/args.zig +++ b/src/args.zig @@ -18,6 +18,12 @@ pub const ExitCode = enum(u8) { sql_error = 3, }; +pub const FileInput = struct { + path: []const u8, + table_name: []const u8, + format: InputFormat, +}; + pub const SqlPipeError = error{ MissingQuery, InvalidDelimiter, @@ -52,11 +58,16 @@ pub const SqlPipeError = error{ SampleWithValidate, SampleWithOutput, InvalidSampleCount, + DuplicateTableName, }; pub const ParsedArgs = struct { - /// SQL query to execute against table `t`. + /// SQL query to execute. query: []const u8, + /// Input files as positional arguments; empty when reading from stdin only. + files: []const FileInput = &.{}, + /// True when stdin has piped data (not a TTY). + has_stdin: bool = false, /// Infer column types from the first 100 buffered rows when true. type_inference: bool, /// CSV field delimiter — 1 to 8 bytes (default: ","). @@ -92,6 +103,8 @@ pub const ParsedArgs = struct { }; pub const ColumnsArgs = struct { + /// Input files as positional arguments; empty when reading from stdin only. + files: []const FileInput = &.{}, /// CSV field delimiter — 1 to 8 bytes (default: ","). delimiter: []const u8, /// Show inferred type alongside name when true. @@ -107,6 +120,8 @@ pub const ColumnsArgs = struct { }; pub const ValidateArgs = struct { + /// Input files as positional arguments; empty when reading from stdin only. + files: []const FileInput = &.{}, /// CSV field delimiter — 1 to 8 bytes (default: ","). delimiter: []const u8, /// Infer column types from the first 100 buffered rows when true. @@ -122,6 +137,8 @@ pub const ValidateArgs = struct { }; pub const SampleArgs = struct { + /// Input files as positional arguments; empty when reading from stdin only. + files: []const FileInput = &.{}, /// CSV field delimiter — 1 to 8 bytes (default: ","). delimiter: []const u8, /// Input format (default: csv). @@ -150,9 +167,14 @@ pub const ArgsResult = union(enum) { pub fn printUsage(writer: *std.Io.Writer) !void { try writer.writeAll( \\Usage: sql-pipe [OPTIONS] + \\ sql-pipe [OPTIONS] ... \\ - \\Reads input from stdin, loads it into an in-memory SQLite table `t`, - \\runs , and prints results to stdout. + \\Reads input from stdin and/or file arguments, loads each into an in-memory + \\SQLite table, runs , and prints results to stdout. + \\ + \\File arguments: each becomes a table named after its basename (sans extension). + \\Stdin (when piped) is always loaded into table `t`. Combine files with stdin + \\for joins: e.g. cat data.csv | sql-pipe lookup.csv 'SELECT ... FROM t JOIN lookup ...' \\ \\Options: \\ -d, --delimiter Input field delimiter for CSV: 1–8 chars (default: ,) @@ -199,6 +221,9 @@ pub fn printUsage(writer: *std.Io.Writer) !void { \\ cat data.tsv | sql-pipe --tsv 'SELECT * FROM t' \\ cat data.psv | sql-pipe -d '|' 'SELECT * FROM t' \\ cat data.csv | sql-pipe 'SELECT region, SUM(revenue) FROM t GROUP BY region' + \\ sql-pipe orders.csv 'SELECT * FROM orders WHERE amount > 100' + \\ sql-pipe orders.csv customers.csv 'SELECT c.name, SUM(o.amount) FROM orders o JOIN customers c ON o.cust_id = c.id GROUP BY c.name' + \\ cat events.csv | sql-pipe users.csv 'SELECT * FROM t JOIN users ON t.uid = users.id' \\ cat data.csv | sql-pipe --output-format json 'SELECT * FROM t' \\ cat data.json | sql-pipe --input-format json 'SELECT * FROM t' \\ cat data.ndjson | sql-pipe -I ndjson -O ndjson 'SELECT name FROM t WHERE age > 18' @@ -230,7 +255,7 @@ pub fn isValidXmlName(s: []const u8) bool { return true; } -pub fn parseArgs(args: []const [:0]const u8) SqlPipeError!ArgsResult { +pub fn parseArgs(allocator: std.mem.Allocator, args: []const [:0]const u8) (SqlPipeError || std.mem.Allocator.Error)!ArgsResult { var query: ?[]const u8 = null; var type_inference = true; var delimiter: []const u8 = ","; @@ -252,6 +277,9 @@ pub fn parseArgs(args: []const [:0]const u8) SqlPipeError!ArgsResult { var sample_mode = false; var sample_n: usize = 10; var disk = false; + var seen_dashdash = false; + var positional_args: std.ArrayList([]const u8) = .empty; + defer positional_args.deinit(allocator); // Loop invariant I: all args[1..i] have been processed; // query holds the first non-flag argument seen, or null; @@ -261,12 +289,19 @@ pub fn parseArgs(args: []const [:0]const u8) SqlPipeError!ArgsResult { // output_format reflects the last --output-format/--json flag seen; // input_format reflects the last --input-format flag seen; // max_rows reflects the presence of --max-rows; - // disk reflects the presence of --disk + // disk reflects the presence of --disk; + // positional_args accumulates non-flag arguments for later + // conversion into file inputs and the query string; + // files is built from positional_args after the loop // Bounding function: args.len - i var i: usize = 1; while (i < args.len) : (i += 1) { const arg = args[i]; - if (std.mem.eql(u8, arg, "--help") or std.mem.eql(u8, arg, "-h")) { + if (seen_dashdash) { + try positional_args.append(allocator, arg); + } else if (std.mem.eql(u8, arg, "--")) { + seen_dashdash = true; + } else if (std.mem.eql(u8, arg, "--help") or std.mem.eql(u8, arg, "-h")) { return .help; } else if (std.mem.eql(u8, arg, "--version") or std.mem.eql(u8, arg, "-V")) { return .version; @@ -374,7 +409,51 @@ pub fn parseArgs(args: []const [:0]const u8) SqlPipeError!ArgsResult { } else if (std.mem.eql(u8, arg, "--disk")) { disk = true; } else { - if (query == null) query = arg; + try positional_args.append(allocator, arg); + } + } + + // ─── Convert positional args to files + query ────────────────────────── + const pos = positional_args.items; + const is_special_mode = list_columns or validate or sample_mode; + + // Build file list from positional args + var files: std.ArrayList(FileInput) = .empty; + + if (pos.len > 0) { + if (is_special_mode) { + // Special modes: every positional arg is a file input + for (pos) |p| { + const name = try tableNameFromPath(allocator, p); + const fmt = InputFormat.fromExtension(p) orelse input_format; + try files.append(allocator, .{ + .path = p, + .table_name = name, + .format = fmt, + }); + } + } else { + // Normal mode: last positional is the query, rest are files + query = pos[pos.len - 1]; + for (pos[0 .. pos.len - 1]) |p| { + const name = try tableNameFromPath(allocator, p); + const fmt = InputFormat.fromExtension(p) orelse input_format; + try files.append(allocator, .{ + .path = p, + .table_name = name, + .format = fmt, + }); + } + } + } + + // Check for duplicate table names (would cause conflicting table definitions) + { + var seen = std.StringHashMap(void).init(allocator); + defer seen.deinit(); + for (files.items) |f| { + const gop = try seen.getOrPut(f.table_name); + if (gop.found_existing) return error.DuplicateTableName; } } @@ -439,6 +518,7 @@ pub fn parseArgs(args: []const [:0]const u8) SqlPipeError!ArgsResult { // --columns mode: list headers and exit if (list_columns) return .{ .columns = ColumnsArgs{ + .files = files.items, .delimiter = delimiter, .verbose = verbose, .input_format = input_format, @@ -450,6 +530,7 @@ pub fn parseArgs(args: []const [:0]const u8) SqlPipeError!ArgsResult { // --validate mode: parse CSV and print summary if (validate) return .{ .validate = ValidateArgs{ + .files = files.items, .delimiter = delimiter, .type_inference = type_inference, .input_format = input_format, @@ -461,6 +542,7 @@ pub fn parseArgs(args: []const [:0]const u8) SqlPipeError!ArgsResult { // --sample mode: print schema + first n rows and exit if (sample_mode) return .{ .sample = SampleArgs{ + .files = files.items, .delimiter = delimiter, .input_format = input_format, .n = sample_n, @@ -469,6 +551,7 @@ pub fn parseArgs(args: []const [:0]const u8) SqlPipeError!ArgsResult { return .{ .parsed = ParsedArgs{ .query = query orelse return error.MissingQuery, + .files = files.items, .type_inference = type_inference, .delimiter = delimiter, .header = header, @@ -486,3 +569,9 @@ pub fn parseArgs(args: []const [:0]const u8) SqlPipeError!ArgsResult { .disk = disk, } }; } + +/// Derive a table name from a file path (basename without extension). +fn tableNameFromPath(allocator: std.mem.Allocator, path: []const u8) (std.mem.Allocator.Error)![]const u8 { + const stem = std.fs.path.stem(path); + return allocator.dupe(u8, stem); +} diff --git a/src/format.zig b/src/format.zig index acc3412..5140a27 100644 --- a/src/format.zig +++ b/src/format.zig @@ -28,6 +28,15 @@ pub const InputFormat = enum { pub fn parse(s: []const u8) error{InvalidInputFormat}!InputFormat { return std.meta.stringToEnum(InputFormat, s) orelse error.InvalidInputFormat; } + + /// Detect input format from file extension. + /// Returns null for unrecognized extensions. + pub fn fromExtension(filename: []const u8) ?InputFormat { + const ext = std.fs.path.extension(filename); + if (ext.len == 0) return null; + const ext_no_dot = ext[1..]; // skip the leading '.' + return std.meta.stringToEnum(InputFormat, ext_no_dot); + } }; // ─── Output format ───────────────────────────────────── diff --git a/src/json.zig b/src/json.zig index 8cedb02..a22a2c4 100644 --- a/src/json.zig +++ b/src/json.zig @@ -194,11 +194,11 @@ pub fn navigateJsonPath( // ─── Input loading ──────────────────────────────────── -/// loadJsonArray(allocator, reader, db, max_rows, json_path, stderr_writer) → usize +/// loadJsonArray(allocator, reader, db, table_name, max_rows, json_path, stderr_writer) → usize /// /// Pre: reader is positioned at the start of a JSON document /// db is an open, empty SQLite database -/// Post: table `t` is created with TEXT columns derived from the first object's keys; +/// Post: table is created with TEXT columns derived from the first object's keys; /// all elements of the JSON array are inserted as rows /// result = number of rows inserted /// aborts the process on any parse, I/O, or SQL error @@ -206,6 +206,7 @@ pub fn loadJsonArray( allocator: std.mem.Allocator, reader: *std.Io.Reader, db: *c.sqlite3, + table_name: []const u8, max_rows: ?usize, json_path: ?[]const u8, stderr_writer: *std.Io.Writer, @@ -218,7 +219,7 @@ pub fn loadJsonArray( }; defer allocator.free(buf); - if (buf.len == 0) fatal("empty input", stderr_writer, .csv_error, .{}); + if (buf.len == 0) return 0; // Empty input - return 0 rows gracefully var parsed = std.json.parseFromSlice(std.json.Value, allocator, buf, .{}) catch fatal("failed to parse JSON input", stderr_writer, .csv_error, .{}); @@ -237,7 +238,7 @@ pub fn loadJsonArray( fatal("JSON input must be an array of objects", stderr_writer, .csv_error, .{}), }; - if (array.items.len == 0) fatal("empty JSON array: cannot determine column names", stderr_writer, .csv_error, .{}); + if (array.items.len == 0) return 0; // Empty array - return 0 rows gracefully // Extract column names from the first object's keys (insertion order) const first_obj = switch (array.items[0]) { @@ -255,10 +256,10 @@ pub fn loadJsonArray( if (cols.items.len == 0) fatal("first JSON object has no keys", stderr_writer, .csv_error, .{}); // Create all-TEXT table (column names are owned by parsed arena — valid until parsed.deinit()) - createAllTextTable(allocator, db, cols.items, stderr_writer); + createAllTextTable(allocator, db, table_name, cols.items, stderr_writer); beginTransaction(db, stderr_writer); - const stmt = prepareInsertStmt(allocator, db, cols.items.len, stderr_writer); + const stmt = prepareInsertStmt(allocator, db, table_name, cols.items.len, stderr_writer); defer _ = c.sqlite3_finalize(stmt); var rows_inserted: usize = 0; @@ -282,11 +283,11 @@ pub fn loadJsonArray( return rows_inserted; } -/// loadNdjsonInput(allocator, reader, db, max_rows, stderr_writer) → usize +/// loadNdjsonInput(allocator, reader, db, table_name, max_rows, stderr_writer) → usize /// /// Pre: reader is positioned at the start of a newline-delimited JSON stream /// db is an open, empty SQLite database -/// Post: table `t` is created with TEXT columns derived from the first non-blank line; +/// Post: table is created with TEXT columns from the first non-blank line; /// every non-blank line is parsed as a JSON object and inserted as a row /// result = number of rows inserted /// aborts the process on any parse, I/O, or SQL error @@ -294,6 +295,7 @@ pub fn loadNdjsonInput( allocator: std.mem.Allocator, reader: *std.Io.Reader, db: *c.sqlite3, + table_name: []const u8, max_rows: ?usize, stderr_writer: *std.Io.Writer, ) usize { @@ -356,11 +358,11 @@ pub fn loadNdjsonInput( fatal("out of memory", stderr_writer, .csv_error, .{}); const cols_const: []const []const u8 = @ptrCast(cols_owned.?); - createAllTextTable(allocator, db, cols_const, stderr_writer); + createAllTextTable(allocator, db, table_name, cols_const, stderr_writer); beginTransaction(db, stderr_writer); in_transaction = true; - insert_stmt = prepareInsertStmt(allocator, db, cols_owned.?.len, stderr_writer); + insert_stmt = prepareInsertStmt(allocator, db, table_name, cols_owned.?.len, stderr_writer); } rows_inserted += 1; @@ -375,7 +377,7 @@ pub fn loadNdjsonInput( } if (cols_owned == null) - fatal("empty NDJSON input", stderr_writer, .csv_error, .{}); + return 0; // Empty NDJSON input - return 0 rows gracefully if (in_transaction) commitTransaction(db, stderr_writer); return rows_inserted; diff --git a/src/loader.zig b/src/loader.zig index a3085a1..a121448 100644 --- a/src/loader.zig +++ b/src/loader.zig @@ -591,10 +591,11 @@ pub fn printProgress(writer: *std.Io.Writer, n: usize, max_rows: ?usize) void { writer.flush() catch |err| std.log.err("failed to flush progress: {}", .{err}); } -/// loadCsvInput loads all CSV rows from stdin into db table `t`. -/// Pre: db is an open in-memory SQLite handle with no tables yet +/// loadCsvInput loads CSV rows from a reader into db table `table_name`. +/// Pre: db is an open SQLite handle with no tables yet /// parsed.delimiter is valid; allocator and writers are valid -/// Post: table `t` exists in db with columns inferred from the CSV header; +/// reader is positioned at the start of CSV data +/// Post: table exists in db with columns inferred from the CSV header; /// all CSV rows have been inserted; transaction has been committed /// returns rows_inserted (data rows only, header not counted) /// on error: writes message to stderr_writer and exits with appropriate code @@ -602,17 +603,21 @@ pub fn loadCsvInput( allocator: std.mem.Allocator, io: std.Io, db: *c.sqlite3, + table_name: []const u8, + reader: *std.Io.Reader, parsed: args_mod.ParsedArgs, stderr_writer: *std.Io.Writer, ) usize { - var stdin_buf: [4096]u8 = undefined; - var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); - var csv_reader = csv_mod.csvReaderWithDelimiter(allocator, &stdin_file_reader.interface, parsed.delimiter); + var csv_reader = csv_mod.csvReaderWithDelimiter(allocator, reader, parsed.delimiter); const header_record = csv_reader.nextRecord() catch |err| switch (err) { error.UnterminatedQuotedField => fatal("row 1: unterminated quoted field", stderr_writer, .csv_error, .{}), else => fatal("row 1: failed to parse CSV header", stderr_writer, .csv_error, .{}), - } orelse fatal("empty input (no header row)", stderr_writer, .csv_error, .{}); + } orelse { + // Empty input - return 0 rows instead of failing + // This allows graceful handling when stdin is /dev/null and we have file arguments + return 0; + }; defer csv_reader.freeRecord(header_record); const cols = parseHeader(allocator, header_record, stderr_writer) catch |err| switch (err) { @@ -671,11 +676,11 @@ pub fn loadCsvInput( // ─── Phase 2: create table and insert rows ──────────────────────────────── - sqlite_mod.createTable(allocator, db, cols, types, stderr_writer); + sqlite_mod.createTable(allocator, db, table_name, cols, types, stderr_writer); sqlite_mod.beginTransaction(db, stderr_writer); - const stmt = sqlite_mod.prepareInsertStmt(allocator, db, num_cols, stderr_writer); + const stmt = sqlite_mod.prepareInsertStmt(allocator, db, table_name, num_cols, stderr_writer); defer _ = c.sqlite3_finalize(stmt); const is_tty = std.Io.File.isTty(std.Io.File.stderr(), io) catch false; @@ -689,7 +694,7 @@ pub fn loadCsvInput( fatal("input exceeds --max-rows limit ({d} rows)", stderr_writer, .usage, .{limit}); } insertRowTyped(stmt, row, types, @intCast(num_cols)) catch - fatalSqlWithContext(allocator, db, std.mem.span(c.sqlite3_errmsg(db)), stderr_writer); + fatalSqlWithContext(allocator, db, table_name, std.mem.span(c.sqlite3_errmsg(db)), stderr_writer); if (is_tty and rows_inserted % progress_interval == 0) printProgress(stderr_writer, rows_inserted, parsed.max_rows); } @@ -721,7 +726,7 @@ pub fn loadCsvInput( fatal("input exceeds --max-rows limit ({d} rows)", stderr_writer, .usage, .{limit}); } insertRowTyped(stmt, record, types, @intCast(num_cols)) catch - fatalSqlWithContext(allocator, db, std.mem.span(c.sqlite3_errmsg(db)), stderr_writer); + fatalSqlWithContext(allocator, db, table_name, std.mem.span(c.sqlite3_errmsg(db)), stderr_writer); if (is_tty and rows_inserted % progress_interval == 0) printProgress(stderr_writer, rows_inserted, parsed.max_rows); } diff --git a/src/main.zig b/src/main.zig index f4a35d0..94fa235 100644 --- a/src/main.zig +++ b/src/main.zig @@ -32,7 +32,7 @@ const InputFormat = format.InputFormat; const OutputFormat = format.OutputFormat; /// execQuery(db, query, allocator, writer, header, output_format) → !void -/// Pre: db is open with table `t` populated +/// Pre: db is open with tables populated /// query is a valid SQL string (not null-terminated) /// allocator is valid /// when output_format = .json or .ndjson, header must not be set (caller's responsibility) @@ -94,32 +94,93 @@ fn run( defer _ = c.sqlite3_close(db); const start_ts = std.Io.Timestamp.now(io, .awake); + var total_rows: usize = 0; - // Load input into `t` — dispatch on input format - const rows_inserted: usize = switch (parsed.input_format) { - .csv => loadCsvInput(allocator, io, db, parsed, stderr_writer), - .tsv => blk: { - // TSV is CSV with tab delimiter; override delimiter and reuse the CSV loader - var tsv_parsed = parsed; - tsv_parsed.delimiter = "\t"; - break :blk loadCsvInput(allocator, io, db, tsv_parsed, stderr_writer); - }, - .json => blk: { - var stdin_buf: [4096]u8 = undefined; - var stdin_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); - break :blk json.loadJsonArray(allocator, &stdin_reader.interface, db, parsed.max_rows, parsed.json_path, stderr_writer); - }, - .ndjson => blk: { - var stdin_buf: [4096]u8 = undefined; - var stdin_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); - break :blk json.loadNdjsonInput(allocator, &stdin_reader.interface, db, parsed.max_rows, stderr_writer); - }, - .xml => blk: { - var stdin_buf: [4096]u8 = undefined; - var stdin_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); - break :blk xml.loadXmlInput(allocator, &stdin_reader.interface, db, parsed.xml_root_input, parsed.xml_row_input, parsed.max_rows, stderr_writer); - }, - }; + // Load each file argument into its named table + for (parsed.files) |file_input| { + const rows = switch (file_input.format) { + .csv => blk: { + var file_buf: [4096]u8 = undefined; + const file = std.Io.Dir.openFile(std.Io.Dir.cwd(), io, file_input.path, .{}) catch |err| + fatal("cannot open file '{s}': {s}", stderr_writer, .csv_error, .{ file_input.path, @errorName(err) }); + defer std.Io.File.close(file, io); + var file_reader = std.Io.File.reader(file, io, &file_buf); + break :blk loadCsvInput(allocator, io, db, file_input.table_name, &file_reader.interface, parsed, stderr_writer); + }, + .tsv => blk: { + var file_buf: [4096]u8 = undefined; + const file = std.Io.Dir.openFile(std.Io.Dir.cwd(), io, file_input.path, .{}) catch |err| + fatal("cannot open file '{s}': {s}", stderr_writer, .csv_error, .{ file_input.path, @errorName(err) }); + defer std.Io.File.close(file, io); + var file_reader = std.Io.File.reader(file, io, &file_buf); + var tsv_parsed = parsed; + tsv_parsed.delimiter = "\t"; + break :blk loadCsvInput(allocator, io, db, file_input.table_name, &file_reader.interface, tsv_parsed, stderr_writer); + }, + .json => blk: { + var file_buf: [4096]u8 = undefined; + const file = std.Io.Dir.openFile(std.Io.Dir.cwd(), io, file_input.path, .{}) catch |err| + fatal("cannot open file '{s}': {s}", stderr_writer, .csv_error, .{ file_input.path, @errorName(err) }); + defer std.Io.File.close(file, io); + var file_reader = std.Io.File.reader(file, io, &file_buf); + break :blk json.loadJsonArray(allocator, &file_reader.interface, db, file_input.table_name, parsed.max_rows, parsed.json_path, stderr_writer); + }, + .ndjson => blk: { + var file_buf: [4096]u8 = undefined; + const file = std.Io.Dir.openFile(std.Io.Dir.cwd(), io, file_input.path, .{}) catch |err| + fatal("cannot open file '{s}': {s}", stderr_writer, .csv_error, .{ file_input.path, @errorName(err) }); + defer std.Io.File.close(file, io); + var file_reader = std.Io.File.reader(file, io, &file_buf); + break :blk json.loadNdjsonInput(allocator, &file_reader.interface, db, file_input.table_name, parsed.max_rows, stderr_writer); + }, + .xml => blk: { + var file_buf: [4096]u8 = undefined; + const file = std.Io.Dir.openFile(std.Io.Dir.cwd(), io, file_input.path, .{}) catch |err| + fatal("cannot open file '{s}': {s}", stderr_writer, .csv_error, .{ file_input.path, @errorName(err) }); + defer std.Io.File.close(file, io); + var file_reader = std.Io.File.reader(file, io, &file_buf); + break :blk xml.loadXmlInput(allocator, &file_reader.interface, db, file_input.table_name, parsed.xml_root_input, parsed.xml_row_input, parsed.max_rows, stderr_writer); + }, + }; + if (rows == 0) { + fatal("empty input file: '{s}'", stderr_writer, .csv_error, .{file_input.path}); + } + total_rows += rows; + } + + // Load stdin as `t` if piped + if (parsed.has_stdin) { + const rows = switch (parsed.input_format) { + .csv => blk: { + var stdin_buf: [4096]u8 = undefined; + var stdin_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); + break :blk loadCsvInput(allocator, io, db, "t", &stdin_reader.interface, parsed, stderr_writer); + }, + .tsv => blk: { + var stdin_buf: [4096]u8 = undefined; + var stdin_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); + var tsv_parsed = parsed; + tsv_parsed.delimiter = "\t"; + break :blk loadCsvInput(allocator, io, db, "t", &stdin_reader.interface, tsv_parsed, stderr_writer); + }, + .json => blk: { + var stdin_buf: [4096]u8 = undefined; + var stdin_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); + break :blk json.loadJsonArray(allocator, &stdin_reader.interface, db, "t", parsed.max_rows, parsed.json_path, stderr_writer); + }, + .ndjson => blk: { + var stdin_buf: [4096]u8 = undefined; + var stdin_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); + break :blk json.loadNdjsonInput(allocator, &stdin_reader.interface, db, "t", parsed.max_rows, stderr_writer); + }, + .xml => blk: { + var stdin_buf: [4096]u8 = undefined; + var stdin_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); + break :blk xml.loadXmlInput(allocator, &stdin_reader.interface, db, "t", parsed.xml_root_input, parsed.xml_row_input, parsed.max_rows, stderr_writer); + }, + }; + total_rows += rows; + } // Print row count and elapsed time to stderr when stderr is a TTY or --verbose is set. const is_tty = std.Io.File.isTty(std.Io.File.stderr(), io) catch false; @@ -128,10 +189,10 @@ fn run( const elapsed_ns: i96 = end_ts.nanoseconds - start_ts.nanoseconds; const elapsed_ms: u64 = @intCast(@max(@as(i96, 0), @divTrunc(elapsed_ns, std.time.ns_per_ms))); var count_buf: [32]u8 = undefined; - const count_str = fmtThousands(&count_buf, rows_inserted); + const count_str = fmtThousands(&count_buf, total_rows); const secs = elapsed_ms / 1000; const frac = (elapsed_ms % 1000) / 100; - if (is_tty and rows_inserted >= progress_interval) { + if (is_tty and total_rows >= progress_interval) { stderr_writer.writeAll("\r\x1b[K") catch |err| std.log.err("failed to clear progress line: {}", .{err}); } stderr_writer.print("Loaded {s} rows in {d}.{d}s\n", .{ count_str, secs, frac }) catch |err| { @@ -140,9 +201,12 @@ fn run( stderr_writer.flush() catch |err| std.log.err("failed to flush stderr: {}", .{err}); } + // Determine which table to show column context for on error + const main_table: []const u8 = if (parsed.files.len > 0) parsed.files[0].table_name else "t"; + execQuery(allocator, db, query, stdout_writer, parsed.header, parsed.output_format, parsed.xml_root, parsed.xml_row) catch { stdout_writer.flush() catch |err| std.log.err("failed to flush output before fatal: {}", .{err}); - sqlite_mod.fatalSqlWithContext(allocator, db, std.mem.span(c.sqlite3_errmsg(db)), stderr_writer); + sqlite_mod.fatalSqlWithContext(allocator, db, main_table, std.mem.span(c.sqlite3_errmsg(db)), stderr_writer); }; } @@ -166,7 +230,13 @@ pub fn main(init: std.process.Init.Minimal) void { const args = init.args.toSlice(args_arena.allocator()) catch fatal("failed to read process arguments", stderr_writer, .usage, .{}); - const args_result = parseArgs(args) catch |err| { + // Determine whether stdin has piped data available. + // Strategy: check if stdin is a TTY. If not, assume data is available. + // However, if we have file arguments, only load stdin if it's explicitly piped + // (not a TTY). This handles CI environments where stdin is /dev/null. + const has_stdin = !(std.Io.File.isTty(std.Io.File.stdin(), io.io()) catch false); + + const args_result = parseArgs(args_arena.allocator(), args) catch |err| { switch (err) { error.IncompatibleFlags => fatal("--header cannot be combined with non-CSV/TSV output format", stderr_writer, .usage, .{}), error.SilentVerboseConflict => fatal("--silent cannot be combined with --verbose", stderr_writer, .usage, .{}), @@ -189,6 +259,7 @@ pub fn main(init: std.process.Init.Minimal) void { error.MissingJsonFlagValue => fatal("--json-path requires a value", stderr_writer, .usage, .{}), error.JsonPathRequiresJson => fatal("--json-path requires -I json", stderr_writer, .usage, .{}), error.InvalidXmlName => fatal("--xml-root and --xml-row must be valid XML element names (letter/underscore first, then letters/digits/-/._/:)", stderr_writer, .usage, .{}), + error.DuplicateTableName => fatal("duplicate table name — file arguments must have unique basenames", stderr_writer, .usage, .{}), else => {}, } printUsage(stderr_writer) catch |werr| std.log.err("failed to write usage: {}", .{werr}); @@ -238,7 +309,17 @@ pub fn main(init: std.process.Init.Minimal) void { std.log.err("failed to flush stderr: {}", .{err}); }; }, - .parsed => |parsed| { + .parsed => |mut_parsed| { + var parsed = mut_parsed; + parsed.has_stdin = has_stdin; + // Check for file-stdin table name collision (t is reserved for stdin) + if (parsed.has_stdin) { + for (parsed.files) |f| { + if (std.mem.eql(u8, f.table_name, "t")) { + fatal("duplicate table name — file arguments must have unique basenames", stderr_writer, .usage, .{}); + } + } + } if (parsed.output) |output_path| { const output_file = std.Io.Dir.createFile(std.Io.Dir.cwd(), io.io(), output_path, .{}) catch |err| { stderr_writer.print("error: cannot create output file '{s}': {s}\n", .{ output_path, @errorName(err) }) catch |werr| { diff --git a/src/modes/columns.zig b/src/modes/columns.zig index e959f60..273de35 100644 --- a/src/modes/columns.zig +++ b/src/modes/columns.zig @@ -19,12 +19,24 @@ pub fn runColumns( stderr_writer: *std.Io.Writer, stdout_writer: *std.Io.Writer, ) void { + // Determine input source: file argument or stdin + const input_source: union(enum) { file: []const u8, stdin } = if (args.files.len > 0) + .{ .file = args.files[0].path } + else + .stdin; + switch (args.input_format) { .csv, .tsv => { const col_delim: []const u8 = if (args.input_format == .tsv) "\t" else args.delimiter; - var stdin_buf: [4096]u8 = undefined; - var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); - var csv_reader = csv_mod.csvReaderWithDelimiter(allocator, &stdin_file_reader.interface, col_delim); + var read_buf: [4096]u8 = undefined; + const source_file = switch (input_source) { + .file => |path| std.Io.Dir.openFile(std.Io.Dir.cwd(), io, path, .{}) catch |err| + fatal("cannot open file '{s}': {s}", stderr_writer, .csv_error, .{ path, @errorName(err) }), + .stdin => std.Io.File.stdin(), + }; + defer if (input_source == .file) std.Io.File.close(source_file, io); + var source_reader = std.Io.File.reader(source_file, io, &read_buf); + var csv_reader = csv_mod.csvReaderWithDelimiter(allocator, &source_reader.interface, col_delim); const header_record = csv_reader.nextRecord() catch |err| switch (err) { error.UnterminatedQuotedField => fatal("row 1: unterminated quoted field", stderr_writer, .csv_error, .{}), @@ -84,10 +96,16 @@ pub fn runColumns( } }, .json => { - var stdin_buf: [4096]u8 = undefined; - var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); + var read_buf: [4096]u8 = undefined; + const source_file = switch (input_source) { + .file => |path| std.Io.Dir.openFile(std.Io.Dir.cwd(), io, path, .{}) catch |err| + fatal("cannot open file '{s}': {s}", stderr_writer, .csv_error, .{ path, @errorName(err) }), + .stdin => std.Io.File.stdin(), + }; + defer if (input_source == .file) std.Io.File.close(source_file, io); + var source_reader = std.Io.File.reader(source_file, io, &read_buf); - const input = stdin_file_reader.interface.allocRemaining(allocator, .unlimited) catch |err| switch (err) { + const input = source_reader.interface.allocRemaining(allocator, .unlimited) catch |err| switch (err) { error.OutOfMemory => fatal("out of memory reading JSON input", stderr_writer, .csv_error, .{}), error.ReadFailed, error.StreamTooLong => fatal("failed to read JSON input", stderr_writer, .csv_error, .{}), }; @@ -131,21 +149,30 @@ pub fn runColumns( } }, .ndjson => { - var stdin_buf: [4096]u8 = undefined; - var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); + var read_buf: [4096]u8 = undefined; + const source_file = switch (input_source) { + .file => |path| std.Io.Dir.openFile(std.Io.Dir.cwd(), io, path, .{}) catch |err| + fatal("cannot open file '{s}': {s}", stderr_writer, .csv_error, .{ path, @errorName(err) }), + .stdin => std.Io.File.stdin(), + }; + defer if (input_source == .file) std.Io.File.close(source_file, io); + var source_reader = std.Io.File.reader(source_file, io, &read_buf); // Read until we find a non-empty line var line_num: usize = 0; while (true) { line_num += 1; - const line = json_mod.readLine(allocator, &stdin_file_reader.interface) catch |err| switch (err) { + const line = json_mod.readLine(allocator, &source_reader.interface) catch |err| switch (err) { error.OutOfMemory => fatal("out of memory reading NDJSON", stderr_writer, .csv_error, .{}), error.ReadFailed => fatal("line {d}: failed to read NDJSON", stderr_writer, .csv_error, .{line_num}), } orelse fatal("empty NDJSON input", stderr_writer, .csv_error, .{}); defer allocator.free(line); const trimmed = std.mem.trim(u8, line, " \t\r"); - if (trimmed.len == 0) { line_num -= 1; continue; } + if (trimmed.len == 0) { + line_num -= 1; + continue; + } var parsed = std.json.parseFromSlice(std.json.Value, allocator, trimmed, .{}) catch fatal("line 1: failed to parse NDJSON", stderr_writer, .csv_error, .{}); @@ -172,10 +199,16 @@ pub fn runColumns( } }, .xml => { - var stdin_buf: [4096]u8 = undefined; - var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); + var read_buf: [4096]u8 = undefined; + const source_file = switch (input_source) { + .file => |path| std.Io.Dir.openFile(std.Io.Dir.cwd(), io, path, .{}) catch |err| + fatal("cannot open file '{s}': {s}", stderr_writer, .csv_error, .{ path, @errorName(err) }), + .stdin => std.Io.File.stdin(), + }; + defer if (input_source == .file) std.Io.File.close(source_file, io); + var source_reader = std.Io.File.reader(source_file, io, &read_buf); - const names = xml_mod.getXmlColumnNames(allocator, &stdin_file_reader.interface, args.xml_root_input, args.xml_row_input, stderr_writer); + const names = xml_mod.getXmlColumnNames(allocator, &source_reader.interface, args.xml_root_input, args.xml_row_input, stderr_writer); defer { for (names) |name| allocator.free(name); allocator.free(names); diff --git a/src/modes/sample.zig b/src/modes/sample.zig index e6b6ceb..14aa180 100644 --- a/src/modes/sample.zig +++ b/src/modes/sample.zig @@ -20,6 +20,12 @@ pub fn runSample( stderr_writer: *std.Io.Writer, stdout_writer: *std.Io.Writer, ) void { + // Determine input source: file argument or stdin + const input_source: union(enum) { file: []const u8, stdin } = if (args.files.len > 0) + .{ .file = args.files[0].path } + else + .stdin; + switch (args.input_format) { .json, .ndjson, .xml => fatal( "--sample is only supported with CSV and TSV input", @@ -29,9 +35,15 @@ pub fn runSample( ), .csv, .tsv => { const col_delim: []const u8 = if (args.input_format == .tsv) "\t" else args.delimiter; - var stdin_buf: [4096]u8 = undefined; - var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); - var csv_reader = csv_mod.csvReaderWithDelimiter(allocator, &stdin_file_reader.interface, col_delim); + var read_buf: [4096]u8 = undefined; + const source_file = switch (input_source) { + .file => |path| std.Io.Dir.openFile(std.Io.Dir.cwd(), io, path, .{}) catch |err| + fatal("cannot open file '{s}': {s}", stderr_writer, .csv_error, .{ path, @errorName(err) }), + .stdin => std.Io.File.stdin(), + }; + defer if (input_source == .file) std.Io.File.close(source_file, io); + var source_reader = std.Io.File.reader(source_file, io, &read_buf); + var csv_reader = csv_mod.csvReaderWithDelimiter(allocator, &source_reader.interface, col_delim); const header_record = csv_reader.nextRecord() catch |err| switch (err) { error.UnterminatedQuotedField => fatal("row 1: unterminated quoted field", stderr_writer, .csv_error, .{}), diff --git a/src/modes/validate.zig b/src/modes/validate.zig index 8b4cd62..eefa18a 100644 --- a/src/modes/validate.zig +++ b/src/modes/validate.zig @@ -22,12 +22,24 @@ pub fn runValidate( stderr_writer: *std.Io.Writer, stdout_writer: *std.Io.Writer, ) void { + // Determine input source: file argument or stdin + const input_source: union(enum) { file: []const u8, stdin } = if (args.files.len > 0) + .{ .file = args.files[0].path } + else + .stdin; + switch (args.input_format) { .csv, .tsv => { const col_delim: []const u8 = if (args.input_format == .tsv) "\t" else args.delimiter; - var stdin_buf: [4096]u8 = undefined; - var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); - var csv_reader = csv_mod.csvReaderWithDelimiter(allocator, &stdin_file_reader.interface, col_delim); + var read_buf: [4096]u8 = undefined; + const source_file = switch (input_source) { + .file => |path| std.Io.Dir.openFile(std.Io.Dir.cwd(), io, path, .{}) catch |err| + fatal("cannot open file '{s}': {s}", stderr_writer, .csv_error, .{ path, @errorName(err) }), + .stdin => std.Io.File.stdin(), + }; + defer if (input_source == .file) std.Io.File.close(source_file, io); + var source_reader = std.Io.File.reader(source_file, io, &read_buf); + var csv_reader = csv_mod.csvReaderWithDelimiter(allocator, &source_reader.interface, col_delim); const header_record = csv_reader.nextRecord() catch |err| switch (err) { error.UnterminatedQuotedField => fatal("row 1: unterminated quoted field", stderr_writer, .csv_error, .{}), @@ -140,10 +152,16 @@ pub fn runValidate( }; }, .json => { - var stdin_buf: [4096]u8 = undefined; - var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); + var read_buf: [4096]u8 = undefined; + const source_file = switch (input_source) { + .file => |path| std.Io.Dir.openFile(std.Io.Dir.cwd(), io, path, .{}) catch |err| + fatal("cannot open file '{s}': {s}", stderr_writer, .csv_error, .{ path, @errorName(err) }), + .stdin => std.Io.File.stdin(), + }; + defer if (input_source == .file) std.Io.File.close(source_file, io); + var source_reader = std.Io.File.reader(source_file, io, &read_buf); - const input = stdin_file_reader.interface.allocRemaining(allocator, .unlimited) catch |err| switch (err) { + const input = source_reader.interface.allocRemaining(allocator, .unlimited) catch |err| switch (err) { error.OutOfMemory => fatal("out of memory reading JSON input", stderr_writer, .csv_error, .{}), error.ReadFailed, error.StreamTooLong => fatal("failed to read JSON input", stderr_writer, .csv_error, .{}), }; @@ -201,8 +219,14 @@ pub fn runValidate( }; }, .ndjson => { - var stdin_buf: [4096]u8 = undefined; - var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); + var read_buf: [4096]u8 = undefined; + const source_file = switch (input_source) { + .file => |path| std.Io.Dir.openFile(std.Io.Dir.cwd(), io, path, .{}) catch |err| + fatal("cannot open file '{s}': {s}", stderr_writer, .csv_error, .{ path, @errorName(err) }), + .stdin => std.Io.File.stdin(), + }; + defer if (input_source == .file) std.Io.File.close(source_file, io); + var source_reader = std.Io.File.reader(source_file, io, &read_buf); var line_num: usize = 0; var row_count: usize = 0; @@ -214,7 +238,7 @@ pub fn runValidate( while (true) { line_num += 1; - const line = json_mod.readLine(allocator, &stdin_file_reader.interface) catch |err| switch (err) { + const line = json_mod.readLine(allocator, &source_reader.interface) catch |err| switch (err) { error.OutOfMemory => fatal("out of memory reading NDJSON", stderr_writer, .csv_error, .{}), error.ReadFailed => fatal("line {d}: failed to read NDJSON", stderr_writer, .csv_error, .{line_num}), } orelse break; @@ -281,10 +305,16 @@ pub fn runValidate( }; }, .xml => { - var stdin_buf: [4096]u8 = undefined; - var stdin_file_reader = std.Io.File.reader(std.Io.File.stdin(), io, &stdin_buf); + var read_buf: [4096]u8 = undefined; + const source_file = switch (input_source) { + .file => |path| std.Io.Dir.openFile(std.Io.Dir.cwd(), io, path, .{}) catch |err| + fatal("cannot open file '{s}': {s}", stderr_writer, .csv_error, .{ path, @errorName(err) }), + .stdin => std.Io.File.stdin(), + }; + defer if (input_source == .file) std.Io.File.close(source_file, io); + var source_reader = std.Io.File.reader(source_file, io, &read_buf); - const summary = xml_mod.summarizeXml(allocator, &stdin_file_reader.interface, args.xml_root_input, args.xml_row_input, stderr_writer); + const summary = xml_mod.summarizeXml(allocator, &source_reader.interface, args.xml_root_input, args.xml_row_input, stderr_writer); defer { for (summary.col_names) |name| allocator.free(name); allocator.free(summary.col_names); diff --git a/src/sqlite.zig b/src/sqlite.zig index 7485e10..c7c55a0 100644 --- a/src/sqlite.zig +++ b/src/sqlite.zig @@ -61,6 +61,17 @@ pub const ColumnType = enum { } }; +/// Append a double-quoted SQL identifier to an ArrayList, escaping embedded double quotes. +/// Exits with csv_error on OOM. +fn appendQuotedId(allocator: std.mem.Allocator, writer: *std.Io.Writer, sql: *std.ArrayList(u8), name: []const u8) void { + sql.append(allocator, '"') catch fatal("out of memory", writer, .csv_error, .{}); + for (name) |ch| { + if (ch == '"') sql.append(allocator, '"') catch fatal("out of memory", writer, .csv_error, .{}); + sql.append(allocator, ch) catch fatal("out of memory", writer, .csv_error, .{}); + } + sql.append(allocator, '"') catch fatal("out of memory", writer, .csv_error, .{}); +} + /// fatal(fmt, writer, code, args) → noreturn /// /// Writes an error message to writer and exits with the given code. @@ -70,18 +81,21 @@ pub fn fatal(comptime fmt: []const u8, writer: *std.Io.Writer, code: ExitCode, a std.process.exit(@intFromEnum(code)); } -/// Create table `t` with all-TEXT columns. Column names are double-quote–escaped +/// Create a table with all-TEXT columns. Column names are double-quote–escaped /// per SQL identifier rules. pub fn createAllTextTable( allocator: std.mem.Allocator, db: *c.sqlite3, + table_name: []const u8, cols: []const []const u8, writer: *std.Io.Writer, ) void { var sql: std.ArrayList(u8) = .empty; defer sql.deinit(allocator); - sql.appendSlice(allocator, "CREATE TABLE t (") catch fatal("out of memory", writer, .csv_error, .{}); + sql.appendSlice(allocator, "CREATE TABLE ") catch fatal("out of memory", writer, .csv_error, .{}); + appendQuotedId(allocator, writer, &sql, table_name); + sql.appendSlice(allocator, " (") catch fatal("out of memory", writer, .csv_error, .{}); for (cols, 0..) |col, i| { if (i > 0) sql.appendSlice(allocator, ", ") catch fatal("out of memory", writer, .csv_error, .{}); sql.append(allocator, '"') catch fatal("out of memory", writer, .csv_error, .{}); @@ -102,17 +116,20 @@ pub fn createAllTextTable( } } -/// Prepare `INSERT INTO t VALUES (?, …, ?)` with n parameters. +/// Prepare `INSERT INTO VALUES (?, …, ?)` with n parameters. pub fn prepareInsertStmt( allocator: std.mem.Allocator, db: *c.sqlite3, + table_name: []const u8, n: usize, writer: *std.Io.Writer, ) *c.sqlite3_stmt { var sql: std.ArrayList(u8) = .empty; defer sql.deinit(allocator); - sql.appendSlice(allocator, "INSERT INTO t VALUES (") catch fatal("out of memory", writer, .csv_error, .{}); + sql.appendSlice(allocator, "INSERT INTO ") catch fatal("out of memory", writer, .csv_error, .{}); + appendQuotedId(allocator, writer, &sql, table_name); + sql.appendSlice(allocator, " VALUES (") catch fatal("out of memory", writer, .csv_error, .{}); for (0..n) |i| { if (i > 0) sql.append(allocator, ',') catch fatal("out of memory", writer, .csv_error, .{}); sql.append(allocator, '?') catch fatal("out of memory", writer, .csv_error, .{}); @@ -178,18 +195,19 @@ pub fn openDb(disk: bool, writer: *std.Io.Writer) *c.sqlite3 { return db.?; } -/// createTable(allocator, db, cols, types, writer) → void +/// createTable(allocator, db, table_name, cols, types, writer) → void /// Pre: db is an open SQLite handle /// cols.len > 0 /// types.len = cols.len /// allocator is valid -/// Post: table `t` exists in db with cols.len columns named by cols; +/// Post: table exists in db with cols.len columns named by cols; /// each column's SQL type reflects its ColumnType value /// (INTEGER / REAL / TEXT with correct SQLite affinity) /// column identifiers are double-quote escaped per SQL syntax pub fn createTable( allocator: std.mem.Allocator, db: *c.sqlite3, + table_name: []const u8, cols: []const []const u8, types: []const ColumnType, writer: *std.Io.Writer, @@ -197,7 +215,9 @@ pub fn createTable( var sql: std.ArrayList(u8) = .empty; defer sql.deinit(allocator); - sql.appendSlice(allocator, "CREATE TABLE t (") catch fatal("out of memory", writer, .csv_error, .{}); + sql.appendSlice(allocator, "CREATE TABLE ") catch fatal("out of memory", writer, .csv_error, .{}); + appendQuotedId(allocator, writer, &sql, table_name); + sql.appendSlice(allocator, " (") catch fatal("out of memory", writer, .csv_error, .{}); for (cols, 0..) |col, i| { if (i > 0) sql.appendSlice(allocator, ", ") catch fatal("out of memory", writer, .csv_error, .{}); sql.append(allocator, '"') catch fatal("out of memory", writer, .csv_error, .{}); @@ -251,52 +271,57 @@ pub fn levenshteinDistance(a: []const u8, b: []const u8) usize { return prev[b_len]; } -/// Return column names of table `t` via PRAGMA table_info. +/// Return column names of `table_name` via PRAGMA table_info. /// Caller owns the returned slice; free each element and the slice with allocator. /// Returns empty slice on PRAGMA failure. -pub fn getTableColumns(allocator: std.mem.Allocator, db: *c.sqlite3) ![][]const u8 { +pub fn getTableColumns(allocator: std.mem.Allocator, db: *c.sqlite3, table_name: []const u8, writer: *std.Io.Writer) [][]const u8 { + var sql: std.ArrayList(u8) = .empty; + defer sql.deinit(allocator); + sql.appendSlice(allocator, "PRAGMA table_info(") catch fatal("out of memory", writer, .csv_error, .{}); + appendQuotedId(allocator, writer, &sql, table_name); + sql.append(allocator, ')') catch fatal("out of memory", writer, .csv_error, .{}); + sql.append(allocator, 0) catch fatal("out of memory", writer, .csv_error, .{}); + var stmt: ?*c.sqlite3_stmt = null; - if (c.sqlite3_prepare_v2(db, "PRAGMA table_info(t)", -1, &stmt, null) != c.SQLITE_OK) + if (c.sqlite3_prepare_v2(db, sql.items.ptr, -1, &stmt, null) != c.SQLITE_OK) return &.{}; defer _ = c.sqlite3_finalize(stmt); var cols = std.ArrayList([]const u8).empty; - errdefer { - for (cols.items) |col| allocator.free(col); - cols.deinit(allocator); - } while (c.sqlite3_step(stmt) == c.SQLITE_ROW) { // PRAGMA table_info columns: cid(0), name(1), type(2), notnull(3), dflt_value(4), pk(5) const ptr = c.sqlite3_column_text(stmt, 1); if (ptr == null) continue; const name = std.mem.span(@as([*:0]const u8, @ptrCast(ptr))); - const owned = try allocator.dupe(u8, name); - errdefer allocator.free(owned); - try cols.append(allocator, owned); + const owned = allocator.dupe(u8, name) catch fatal("out of memory", writer, .csv_error, .{}); + cols.append(allocator, owned) catch fatal("out of memory", writer, .csv_error, .{}); } - return cols.toOwnedSlice(allocator); + return cols.toOwnedSlice(allocator) catch fatal("out of memory", writer, .csv_error, .{}); } /// Print column context to writer after a SQL error. -/// Prints " table \"t\" has columns: ..." and optionally " hint: did you mean \"\"?" +/// Prints " table \"\" has columns: ..." and optionally " hint: did you mean \"\"?" /// when the error message matches "no such column: " and a column exists within edit distance 2. /// Silently returns on any failure (PRAGMA unavailable, OOM, writer error). pub fn printSqlErrorContext( allocator: std.mem.Allocator, db: *c.sqlite3, + table_name: []const u8, errmsg: []const u8, writer: *std.Io.Writer, ) void { - const columns = getTableColumns(allocator, db) catch return; + const columns = getTableColumns(allocator, db, table_name, writer); defer { for (columns) |col| allocator.free(col); allocator.free(columns); } if (columns.len == 0) return; - writer.writeAll(" table \"t\" has columns: ") catch return; + writer.writeAll(" table \"") catch return; + writer.writeAll(table_name) catch return; + writer.writeAll("\" has columns: ") catch return; for (columns, 0..) |col, i| { if (i > 0) writer.writeAll(", ") catch return; writer.writeAll(col) catch return; @@ -325,18 +350,19 @@ pub fn printSqlErrorContext( } /// Print SQL error message with column context then exit with sql_error code. -/// Pre: errmsg is the SQLite error string; db has table `t` (or PRAGMA silently fails) +/// Pre: errmsg is the SQLite error string; db has table (or PRAGMA silently fails) /// Post: stderr has "error: \n" + optional column list + optional hint; process exits 3 pub fn fatalSqlWithContext( allocator: std.mem.Allocator, db: *c.sqlite3, + table_name: []const u8, errmsg: []const u8, writer: *std.Io.Writer, ) noreturn { writer.print("error: {s}\n", .{errmsg}) catch |err| { std.log.err("failed to write error message: {}", .{err}); }; - printSqlErrorContext(allocator, db, errmsg, writer); + printSqlErrorContext(allocator, db, table_name, errmsg, writer); writer.flush() catch |err| std.log.err("failed to flush: {}", .{err}); std.process.exit(@intFromEnum(ExitCode.sql_error)); } diff --git a/src/xml.zig b/src/xml.zig index ce6cc80..cbd9802 100644 --- a/src/xml.zig +++ b/src/xml.zig @@ -815,21 +815,23 @@ pub fn summarizeXml( return .{ .row_count = row_count, .col_names = col_names.? }; } -/// loadXmlInput(allocator, reader, db, xml_root, xml_row, max_rows, stderr_writer) → usize +/// loadXmlInput(allocator, reader, db, table_name, xml_root, xml_row, max_rows, stderr_writer) → usize /// /// Pre: reader is positioned at the start of a row-based XML document /// db is an open, empty SQLite database -/// Post: table `t` is created with TEXT columns from the first row's element names; +/// Post: table is created with TEXT columns from the first row's element names; /// all row elements are inserted; transaction is committed /// result = number of rows inserted /// aborts the process on any parse, I/O, or SQL error /// +/// table_name: name of the SQLite table to create /// xml_root: when non-null, navigate to this element as the row container /// xml_row: when non-null, only elements with this tag are treated as rows pub fn loadXmlInput( allocator: std.mem.Allocator, reader: *std.Io.Reader, db: *c.sqlite3, + table_name: []const u8, xml_root: ?[]const u8, xml_row: ?[]const u8, max_rows: ?usize, @@ -841,7 +843,7 @@ pub fn loadXmlInput( error.StreamTooLong => unreachable, // .unlimited never triggers this }; defer allocator.free(buf); - if (buf.len == 0) fatal("empty input", stderr_writer, .csv_error, .{}); + if (buf.len == 0) return 0; // Empty input - return 0 rows gracefully var p = XmlParser.init(buf); p.skipPrologue(stderr_writer); @@ -897,10 +899,10 @@ pub fn loadXmlInput( fatal("first XML row element has no column children", stderr_writer, .csv_error, .{}); col_names = names.toOwnedSlice(allocator) catch fatal("out of memory", stderr_writer, .csv_error, .{}); - createAllTextTable(allocator, db, col_names.?, stderr_writer); + createAllTextTable(allocator, db, table_name, col_names.?, stderr_writer); beginTransaction(db, stderr_writer); in_transaction = true; - insert_stmt = prepareInsertStmt(allocator, db, col_names.?.len, stderr_writer); + insert_stmt = prepareInsertStmt(allocator, db, table_name, col_names.?.len, stderr_writer); } // Bind column values by name (order in row may differ from schema order) diff --git a/tests/fixtures/customers.csv b/tests/fixtures/customers.csv new file mode 100644 index 0000000..047590b --- /dev/null +++ b/tests/fixtures/customers.csv @@ -0,0 +1,4 @@ +id,name,region,joined +1,Alice,East,2023-06-15 +2,Bob,West,2023-08-22 +3,Carol,East,2024-01-10 diff --git a/tests/fixtures/events.ndjson b/tests/fixtures/events.ndjson new file mode 100644 index 0000000..09389ee --- /dev/null +++ b/tests/fixtures/events.ndjson @@ -0,0 +1,5 @@ +{"event": "login", "user": "alice", "ts": "2024-01-15T10:30:00", "duration_ms": 1250} +{"event": "purchase", "user": "bob", "ts": "2024-02-20T14:15:00", "duration_ms": 3400} +{"event": "login", "user": "carol", "ts": "2024-03-10T09:00:00", "duration_ms": 890} +{"event": "logout", "user": "alice", "ts": "2024-01-15T11:45:00", "duration_ms": 200} +{"event": "purchase", "user": "alice", "ts": "2024-03-10T16:20:00", "duration_ms": 5600} diff --git a/tests/fixtures/feed.xml b/tests/fixtures/feed.xml new file mode 100644 index 0000000..2f2652d --- /dev/null +++ b/tests/fixtures/feed.xml @@ -0,0 +1,24 @@ + + + + Tech Blog + + First Post + Alice + 150 + 2024-01-15 + + + Second Post + Bob + 80 + 2024-02-20 + + + Third Post + Carol + 200 + 2024-03-10 + + + diff --git a/tests/fixtures/orders.csv b/tests/fixtures/orders.csv new file mode 100644 index 0000000..484fe05 --- /dev/null +++ b/tests/fixtures/orders.csv @@ -0,0 +1,8 @@ +id,customer_id,product,amount,date +1,1,Widget,150.00,2024-01-15 +2,2,Gadget,80.50,2024-02-20 +3,1,Doohickey,200.00,2024-03-10 +4,3,Widget,75.25,2024-01-25 +5,2,Thingamajig,300.00,2024-04-05 +6,1,Gadget,45.00,2024-05-12 +7,3,Widget,120.00,2024-06-18 diff --git a/tests/fixtures/products.json b/tests/fixtures/products.json new file mode 100644 index 0000000..526779e --- /dev/null +++ b/tests/fixtures/products.json @@ -0,0 +1,6 @@ +[ + {"name": "Widget", "price": 25.00, "category": "hardware", "stock": 150}, + {"name": "Gadget", "price": 40.25, "category": "electronics", "stock": 75}, + {"name": "Doohickey", "price": 50.00, "category": "hardware", "stock": 30}, + {"name": "Thingamajig", "price": 60.00, "category": "electronics", "stock": 0} +]