diff --git a/.eslintrc.json b/.eslintrc.json index 8e4e9a2..d31fd86 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -12,7 +12,7 @@ "node_modules" ], "rules": { - "no-unused-vars": "off", + "no-unused-vars": "warn", "no-irregular-whitespace": "off" } } diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 342e01b..faf245d 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -21,7 +21,7 @@ jobs: - name: setup Node uses: actions/setup-node@v4 with: - node-version: 20.x + node-version: 22.x - run: npm install - name: build binaries diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml index 03c5ae4..e870bbe 100644 --- a/.github/workflows/smoke-test.yml +++ b/.github/workflows/smoke-test.yml @@ -30,7 +30,7 @@ jobs: - name: setup Node uses: actions/setup-node@v1 with: - node-version: '20.x' + node-version: '22.x' - name: install sponge (moreutils) run: sudo apt install -y moreutils diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 52ad492..b805645 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -17,14 +17,8 @@ jobs: strategy: matrix: node-version: - - '20.x' - '22.x' - '24.x' - postgis-docker-tag: - - '14-3.5-alpine' - - '15-3.5-alpine' - - '16-3.5-alpine' - - '17-3.5-alpine' steps: - name: checkout @@ -33,55 +27,14 @@ jobs: uses: actions/setup-node@v4 with: node-version: ${{ matrix.node-version }} - - name: install sponge (moreutils) - run: sudo apt install -y moreutils - - name: install & start PostgreSQL with PostGIS - # todo: currently, it uses mdillon, which doesn't have PostgreSQL 14 - # uses: huaxk/postgis-action@v1 - # with: - # postgresql version: '${{ matrix.postgis-docker-tag }}' - # postgresql password: password - # postgresql user: postgres - # postgresql db: postgres + - name: install DuckDB run: | - docker run -d \ - -e POSTGRES_USER=$PGUSER -e POSTGRES_PASSWORD=$PGPASSWORD -e POSTGRES_DB=$PGDATABASE \ - -p 5432:5432 postgis/postgis:${{ matrix.postgis-docker-tag }} \ - -c timezone=Europe/Berlin - env: - PGUSER: postgres - PGPASSWORD: password - PGDATABASE: postgres - - - name: install PostgREST - run: | - set -euo pipefail - set -x - dl_url="$( - curl -fsSL \ - -H "User-Agent: $user_agent" \ - -H 'Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}' \ - 'https://api.github.com/repos/PostgREST/postgrest/releases/latest' \ - | jq -rc '.assets[] | select(.name | test("linux-static-x86-64")) | .browser_download_url' - )" - wget -nv -U "$user_agent" \ - --header='Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}' \ - -O /tmp/postgrest.tar.xz \ - "$dl_url" - tar -C /usr/local/bin -J -x postgrest = '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02' -AND date >= dates_filter_min('2022-08-09T07:10+02') -AND date <= dates_filter_max('2022-08-09T07:30+02') +AND t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= dates_filter_min('2025-05-27T07:10:00+02'::timestamp with time zone) +AND date <= dates_filter_max('2025-05-27T07:30:00+02'::timestamp with time zone) diff --git a/benchmark/arrs_deps_by_station_and_time.sql b/benchmark/arrs_deps_by_station_and_time.sql index b297b68..f163fd6 100644 --- a/benchmark/arrs_deps_by_station_and_time.sql +++ b/benchmark/arrs_deps_by_station_and_time.sql @@ -1,6 +1,6 @@ SELECT * FROM arrivals_departures WHERE station_id = 'de:11000:900100001' -- S+U Friedrichstr. 
(Berlin) -AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02' -AND date >= dates_filter_min('2022-08-09T07:10+02') -AND date <= dates_filter_max('2022-08-09T07:30+02') +AND t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= dates_filter_min('2025-05-27T07:10:00+02') +AND date <= dates_filter_max('2025-05-27T07:30:00+02') diff --git a/benchmark/arrs_deps_by_station_and_time_manual.sql b/benchmark/arrs_deps_by_station_and_time_manual.sql new file mode 100644 index 0000000..3bca576 --- /dev/null +++ b/benchmark/arrs_deps_by_station_and_time_manual.sql @@ -0,0 +1,6 @@ +SELECT * +FROM arrivals_departures +WHERE station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin) +AND t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= '2025-05-25' +AND date <= '2025-05-27' diff --git a/benchmark/arrs_deps_by_station_and_time_seq_0.sql b/benchmark/arrs_deps_by_station_and_time_seq_0.sql index 2a2a20d..9bace6c 100644 --- a/benchmark/arrs_deps_by_station_and_time_seq_0.sql +++ b/benchmark/arrs_deps_by_station_and_time_seq_0.sql @@ -1,7 +1,7 @@ SELECT * FROM arrivals_departures WHERE station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin) -AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02' -AND date >= dates_filter_min('2022-08-09T07:10+02') -AND date <= dates_filter_max('2022-08-09T07:30+02') +AND t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= dates_filter_min('2025-05-27T07:10:00+02') +AND date <= dates_filter_max('2025-05-27T07:30:00+02') AND stop_sequence = 0 diff --git a/benchmark/arrs_deps_by_station_and_time_seq_0_manual.sql b/benchmark/arrs_deps_by_station_and_time_seq_0_manual.sql new file mode 100644 index 0000000..5201d1f --- /dev/null +++ b/benchmark/arrs_deps_by_station_and_time_seq_0_manual.sql @@ -0,0 +1,7 @@ +SELECT * +FROM arrivals_departures +WHERE station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin) +AND t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= '2025-05-25' +AND date <= '2025-05-27' +AND stop_sequence = 0 diff --git a/benchmark/arrs_deps_by_stop_and_time.sql b/benchmark/arrs_deps_by_stop_and_time.sql index 5b26ff6..195a3aa 100644 --- a/benchmark/arrs_deps_by_stop_and_time.sql +++ b/benchmark/arrs_deps_by_stop_and_time.sql @@ -1,6 +1,6 @@ SELECT * FROM arrivals_departures WHERE stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin) -AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02' -AND date >= dates_filter_min('2022-08-09T07:10+02') -AND date <= dates_filter_max('2022-08-09T07:30+02') +AND t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= dates_filter_min('2025-05-27T07:10:00+02') +AND date <= dates_filter_max('2025-05-27T07:30:00+02') diff --git a/benchmark/arrs_deps_by_stop_and_time_manual.sql b/benchmark/arrs_deps_by_stop_and_time_manual.sql new file mode 100644 index 0000000..5a71d6d --- /dev/null +++ b/benchmark/arrs_deps_by_stop_and_time_manual.sql @@ -0,0 +1,6 @@ +SELECT * +FROM arrivals_departures +WHERE stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. 
(Berlin) +AND t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= '2025-05-25' +AND date <= '2025-05-27' diff --git a/benchmark/arrs_deps_by_time.sql b/benchmark/arrs_deps_by_time.sql index 1d01275..f7158ed 100644 --- a/benchmark/arrs_deps_by_time.sql +++ b/benchmark/arrs_deps_by_time.sql @@ -1,5 +1,5 @@ SELECT * FROM arrivals_departures -WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02' -AND date >= dates_filter_min('2022-08-09T07:10+02'::timestamp with time zone) -AND date <= dates_filter_max('2022-08-09T07:30+02'::timestamp with time zone) +WHERE t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND "date" >= dates_filter_min('2025-05-27T07:10:00+02'::timestamp with time zone) +AND "date" <= dates_filter_max('2025-05-27T07:30:00+02'::timestamp with time zone) diff --git a/benchmark/arrs_deps_by_time_manual.sql b/benchmark/arrs_deps_by_time_manual.sql index 5c4dada..74e8a01 100644 --- a/benchmark/arrs_deps_by_time_manual.sql +++ b/benchmark/arrs_deps_by_time_manual.sql @@ -1,5 +1,5 @@ SELECT * FROM arrivals_departures -WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02' -AND date >= '2022-08-08' -AND date <= '2022-08-09' +WHERE t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= '2025-05-25' +AND date <= '2025-05-27' diff --git a/benchmark/arrs_deps_by_trip_and_date.sql b/benchmark/arrs_deps_by_trip_and_date.sql index 89d4609..2a90f80 100644 --- a/benchmark/arrs_deps_by_trip_and_date.sql +++ b/benchmark/arrs_deps_by_trip_and_date.sql @@ -1,4 +1,4 @@ SELECT * FROM arrivals_departures -WHERE trip_id = '168977951' -AND date > '2022-08-08' AND date <= '2022-08-09' +WHERE trip_id = '262623609' -- route_id=10144_109, route_short_name=S2 +AND date = '2025-05-27' diff --git a/benchmark/as-md.js b/benchmark/as-md.js index 2765c3c..017d1d3 100755 --- a/benchmark/as-md.js +++ b/benchmark/as-md.js @@ -1,39 +1,31 @@ #!/usr/bin/env node -const {pipeline, Transform} = require('stream') -const csvParser = require('csv-parser') -const {ok} = require('assert') +const {createInterface} = require('node:readline') -let firstRow = true +const linewise = createInterface({ + input: process.stdin, + // Note: We use the crlfDelay option to recognize all instances of CR LF as a single line break. + crlfDelay: Infinity, +}) -pipeline( - process.stdin, - csvParser(), - new Transform({ - objectMode: true, - transform: function (row, _, cb) { - if (firstRow) { - firstRow = false +;(async () => { + let firstRow = true + for await (const line of linewise) { + const row = JSON.parse(line) - const keys = Object.keys(row).filter(key => key !== 'filename') - process.stdout.write(`| ${keys.join(' | ')} |\n`) - process.stdout.write(`| ${keys.map(_ => '-').join(' | ')} |\n`) - } + if (firstRow) { + firstRow = false - const formattedVals = Object.entries(row) - .map(([key, val]) => { - if (key === 'query') return '
<pre>' + val.replace(/\n/g, '<br>') + '</pre>
' - return val - }) - process.stdout.write(`| ${formattedVals.join(' | ')} |\n`) + const keys = Object.keys(row).filter(key => key !== 'filename') + process.stdout.write(`| ${keys.join(' | ')} |\n`) + process.stdout.write(`| ${keys.map(_ => '-').join(' | ')} |\n`) + } - cb() - }, - }), - process.stdout, - (err) => { - if (!err) return; - console.error(err) - process.exit(1) - }, -) + const formattedVals = Object.entries(row) + .map(([key, val]) => { + if (key === 'query') return '
<pre>' + val.replace(/\n/g, '<br>') + '</pre>
' + return typeof val === 'number' && !Number.isInteger(val) ? Math.round(val * 100) / 100 : val + }) + process.stdout.write(`| ${formattedVals.join(' | ')} |\n`) + } +})() diff --git a/benchmark/connections_by_route_name_and_time.sql b/benchmark/connections_by_route_name_and_time.sql index ca5bcc0..feac3ae 100644 --- a/benchmark/connections_by_route_name_and_time.sql +++ b/benchmark/connections_by_route_name_and_time.sql @@ -1,6 +1,6 @@ SELECT * FROM connections WHERE route_short_name = 'S1' -AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02' -AND date >= dates_filter_min('2022-08-09T07:10+02') -AND date <= dates_filter_max('2022-08-09T07:30+02') +AND t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= dates_filter_min('2025-05-27T07:10:00+02') +AND date <= dates_filter_max('2025-05-27T07:30:00+02') diff --git a/benchmark/connections_by_station_and_time.sql b/benchmark/connections_by_station_and_time.sql index 861108e..6e68e61 100644 --- a/benchmark/connections_by_station_and_time.sql +++ b/benchmark/connections_by_station_and_time.sql @@ -1,6 +1,6 @@ SELECT * FROM connections -WHERE from_station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin) -AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02' -AND date >= dates_filter_min('2022-08-09T07:10+02') -AND date <= dates_filter_max('2022-08-09T07:30+02') +WHERE from_station_id = 'de:11000:900194006' -- S Schöneweide/Sterndamm (Berlin) +AND t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= dates_filter_min('2025-05-27T07:10:00+02') +AND date <= dates_filter_max('2025-05-27T07:30:00+02') diff --git a/benchmark/connections_by_station_and_time_seq_0.sql b/benchmark/connections_by_station_and_time_seq_0.sql index 7eaa73d..40c19b2 100644 --- a/benchmark/connections_by_station_and_time_seq_0.sql +++ b/benchmark/connections_by_station_and_time_seq_0.sql @@ -1,7 +1,7 @@ SELECT * FROM connections -WHERE from_station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin) -AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02' -AND date >= dates_filter_min('2022-08-09T07:10+02') -AND date <= dates_filter_max('2022-08-09T07:30+02') -AND from_stop_sequence = 0 +WHERE from_station_id = 'de:11000:900194006' -- S Schöneweide/Sterndamm (Berlin) +AND t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= dates_filter_min('2025-05-27T07:10:00+02') +AND date <= dates_filter_max('2025-05-27T07:30:00+02') +AND from_stop_sequence_consec = 0 diff --git a/benchmark/connections_by_stop_and_time.sql b/benchmark/connections_by_stop_and_time.sql index 7baf415..e161f36 100644 --- a/benchmark/connections_by_stop_and_time.sql +++ b/benchmark/connections_by_stop_and_time.sql @@ -1,6 +1,6 @@ SELECT * FROM connections WHERE from_stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. 
(Berlin) -AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02' -AND date >= dates_filter_min('2022-08-09T07:10+02') -AND date <= dates_filter_max('2022-08-09T07:30+02') +AND t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= dates_filter_min('2025-05-27T07:10:00+02') +AND date <= dates_filter_max('2025-05-27T07:30:00+02') diff --git a/benchmark/connections_by_time.sql b/benchmark/connections_by_time.sql index de4dff1..8b7205c 100644 --- a/benchmark/connections_by_time.sql +++ b/benchmark/connections_by_time.sql @@ -1,7 +1,7 @@ SELECT * FROM connections -WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02' -AND date >= dates_filter_min('2022-08-09T07:10+02'::timestamp with time zone) -AND date <= dates_filter_max('2022-08-09T07:30+02'::timestamp with time zone) +WHERE t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= dates_filter_min('2025-05-27T07:10:00+02'::timestamp with time zone) +AND date <= dates_filter_max('2025-05-27T07:30:00+02'::timestamp with time zone) ORDER BY t_departure LIMIT 100 diff --git a/benchmark/connections_by_time_manual.sql b/benchmark/connections_by_time_manual.sql index c483d02..4a2dc73 100644 --- a/benchmark/connections_by_time_manual.sql +++ b/benchmark/connections_by_time_manual.sql @@ -1,7 +1,6 @@ SELECT * FROM connections -WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02' -AND date >= '2022-08-08' -AND date <= '2022-08-09' +WHERE t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= '2025-05-25' AND date <= '2025-05-27' ORDER BY t_departure LIMIT 100 diff --git a/benchmark/connections_by_trip_and_date.sql b/benchmark/connections_by_trip_and_date.sql index 93ef135..c5ece9b 100644 --- a/benchmark/connections_by_trip_and_date.sql +++ b/benchmark/connections_by_trip_and_date.sql @@ -1,4 +1,4 @@ SELECT * FROM connections -WHERE trip_id = '168977951' -AND date > '2022-08-08' AND date <= '2022-08-09' +WHERE trip_id = '262535123' -- route_id=17452_900 (M4) +AND date >= '2025-05-26' AND date <= '2025-06-01' diff --git a/benchmark/index.cjs b/benchmark/index.cjs new file mode 100755 index 0000000..0d42a33 --- /dev/null +++ b/benchmark/index.cjs @@ -0,0 +1,125 @@ +#!/usr/bin/env node + +const {parseArgs} = require('node:util') +const {readFile} = require('node:fs/promises') +const {DuckDBInstance} = require('@duckdb/node-api') +const {Bench: Benchmark} = require('tinybench') +const {basename} = require('node:path') + +// adapted from https://stackoverflow.com/a/55297611/1072129 +const quantile = (sorted, q) => { + const pos = (sorted.length - 1) * q + const base = Math.floor(pos) + const rest = pos - base + if (base + 1 < sorted.length) { + return sorted[base] + rest * (sorted[base + 1] - sorted[base]) + } else { + return sorted[base] + } +} + +const { + values: flags, + positionals: args, +} = parseArgs({ + options: { + 'help': { + type: 'boolean', + short: 'h', + }, + }, + allowPositionals: true, +}) + +if (flags.help) { + process.stdout.write(` +Usage: + benchmark [options] [--] ... 
+\n`) + process.exit(0) +} + +;(async () => { + +const [pathToDb, ...queryFiles] = args +if (!pathToDb) { + console.error('you must pass the path to a DuckDB db file') + process.exit(1) +} +if (queryFiles.length === 0) { + console.error('you must pass >0 SQL files') + process.exit(1) +} +const instance = await DuckDBInstance.create(pathToDb, { + access_mode: 'READ_ONLY', +}) +const db = await instance.connect() + +await db.run(`\ +INSTALL spatial; +LOAD spatial; +`) + +const queriesByName = new Map() +const benchmark = new Benchmark({ + // - The default minimum number of iterations is too high. + // - The default minimum time is too low. + warmup: true, + warmupIterations: 1, + warmupTime: 5000, // 5s + iterations: 3, + time: 10000, // 10s +}) +await Promise.all( + queryFiles + .filter(queryFile => queryFile.slice(-9) !== '.skip.sql') + .map(async (queryFile) => { + const name = basename(queryFile) + const query = await readFile(queryFile, {encoding: 'utf8'}) + queriesByName.set(name, query) + benchmark.add(name, async () => { + await db.run(query) + }) + }), +) + +// do all queries once, to make sure they work +for (const [name, query] of queriesByName.entries()) { + try { + await db.run(query) + } catch (err) { + err.benchmark = name + err.query = query + throw err + } +} + +benchmark.addEventListener('cycle', (ev) => { + const {task} = ev + const query = queriesByName.get(task.name) + if ('error' in task.result) { + console.error(task.result) + process.exit(1) + } + const samples = Array.from(task.result.samples).sort() + console.log(JSON.stringify({ + query, + avg: task.result.latency.mean, + min: task.result.latency.min, + p25: quantile(samples, .25), + p50: task.result.latency.p50, + p75: task.result.latency.p75, + p95: quantile(samples, .95), + p99: task.result.latency.p99, + max: task.result.latency.max, + iterations: task.result.samples.length, + })) +}) + +await benchmark.run() + +})() +.catch((err) => { + console.error(err) + process.exit(1) +}) diff --git a/benchmark/index.sql b/benchmark/index.sql deleted file mode 100644 index 9d0371f..0000000 --- a/benchmark/index.sql +++ /dev/null @@ -1,110 +0,0 @@ -BEGIN; -CREATE TEMP TABLE _benchmark ( - filename TEXT, - query TEXT, - avg FLOAT, - min FLOAT, - p25 FLOAT, - p50 FLOAT, - p75 FLOAT, - p95 FLOAT, - p99 FLOAT, - max FLOAT, - iterations INTEGER -); - --- slightly modified from "How to benchmark PostgreSQL queries well" --- https://www.tangramvision.com/blog/how-to-benchmark-postgresql-queries-well#sql-function-with-clock_timestamp -CREATE OR REPLACE FUNCTION bench(_filename TEXT, _query TEXT, _iterations INTEGER) -RETURNS void -AS $$ -DECLARE - _warmup_iterations INTEGER; - _start TIMESTAMPTZ; - _end TIMESTAMPTZ; - _delta DOUBLE PRECISION; -BEGIN - CREATE TEMP TABLE IF NOT EXISTS _bench_results ( - elapsed DOUBLE PRECISION - ); - - -- Warm the cache - _warmup_iterations = GREATEST(3, _iterations / 10); - FOR i IN 1.._warmup_iterations LOOP - EXECUTE _query; - END LOOP; - - FOR i IN 1.._iterations LOOP - _start = clock_timestamp(); - EXECUTE _query; - _end = clock_timestamp(); - _delta = 1000 * (extract(epoch from _end) - extract(epoch from _start)); - INSERT INTO _bench_results VALUES (_delta); - END LOOP; - - INSERT INTO _benchmark - SELECT - _filename, - _query, - round(avg(elapsed)::numeric, 0), - min(elapsed), - round((percentile_cont(0.25) WITHIN GROUP (ORDER BY elapsed))::numeric, 0), - round((percentile_cont(0.50) WITHIN GROUP (ORDER BY elapsed))::numeric, 0), - round((percentile_cont(0.75) WITHIN GROUP (ORDER BY 
elapsed))::numeric, 0), - round((percentile_cont(0.95) WITHIN GROUP (ORDER BY elapsed))::numeric, 0), - round((percentile_cont(0.99) WITHIN GROUP (ORDER BY elapsed))::numeric, 0), - max(elapsed), - _iterations - FROM _bench_results; - - DROP TABLE _bench_results; -END -$$ -LANGUAGE plpgsql; - --- We aim for ~4s per benchmark, but take more time for slow benchmarks. --- Apple Silicon M2, most queries seem to be single-threaded. -\set query `cat arrs_deps_by_non_existent_stop.sql` -SELECT bench('arrs_deps_by_non_existent_stop.sql', :'query', 500); -\set query `cat arrs_deps_by_route_name_and_time.sql` -SELECT bench('arrs_deps_by_route_name_and_time.sql', :'query', 90); -\set query `cat arrs_deps_by_station_and_time.sql` -SELECT bench('arrs_deps_by_station_and_time.sql', :'query', 170); -\set query `cat arrs_deps_by_station_and_time_seq_0.sql` -SELECT bench('arrs_deps_by_station_and_time_seq_0.sql', :'query', 500); -\set query `cat arrs_deps_by_stop.sql` -SELECT bench('arrs_deps_by_stop.sql', :'query', 50); -\set query `cat arrs_deps_by_stop_and_time.sql` -SELECT bench('arrs_deps_by_stop_and_time.sql', :'query', 400); -\set query `cat arrs_deps_by_time.sql` -SELECT bench('arrs_deps_by_time.sql', :'query', 5); -\set query `cat arrs_deps_by_time_manual.sql` -SELECT bench('arrs_deps_by_time_manual.sql', :'query', 5); -\set query `cat arrs_deps_by_trip_and_date.sql` -SELECT bench('arrs_deps_by_trip_and_date.sql', :'query', 500); -\set query `cat connections_by_non_existent_stop.sql` -SELECT bench('connections_by_non_existent_stop.sql', :'query', 500); -\set query `cat connections_by_route_name_and_time.sql` -SELECT bench('connections_by_route_name_and_time.sql', :'query', 20); -\set query `cat connections_by_station_and_time.sql` -SELECT bench('connections_by_station_and_time.sql', :'query', 50); -\set query `cat connections_by_station_and_time_seq_0.sql` -SELECT bench('connections_by_station_and_time_seq_0.sql', :'query', 300); -\set query `cat connections_by_stop.sql` -SELECT bench('connections_by_stop.sql', :'query', 40); -\set query `cat connections_by_stop_and_time.sql` -SELECT bench('connections_by_stop_and_time.sql', :'query', 200); -\set query `cat connections_by_time.sql` -SELECT bench('connections_by_time.sql', :'query', 3); -\set query `cat connections_by_time_manual.sql` -SELECT bench('connections_by_time_manual.sql', :'query', 3); -\set query `cat connections_by_trip_and_date.sql` -SELECT bench('connections_by_trip_and_date.sql', :'query', 500); -\set query `cat stats_by_route_date.sql` -SELECT bench('stats_by_route_date.sql', :'query', 5); -\set query `cat stops_by_distance.sql` -SELECT bench('stops_by_distance.sql', :'query', 170); - -SELECT * FROM _benchmark; - -ROLLBACK; diff --git a/benchmark/init.sh b/benchmark/init.sh index 1465b89..9c1e638 100755 --- a/benchmark/init.sh +++ b/benchmark/init.sh @@ -5,11 +5,12 @@ set -o pipefail cd "$(dirname "$0")" set -x -wget --compression auto -r --no-parent --no-directories -R .csv.gz -P ../vbb-2022-07-01.gtfs -N 'https://vbb-gtfs.jannisr.de/2022-07-01/' -ls -lh ../vbb-2022-07-01.gtfs +wget --compression auto -r --no-parent --no-directories -R .csv.gz,.csv.br -P ../vbb-2025-05-21.gtfs -N 'https://vbb-gtfs.jannisr.de/2025-05-21/' +ls -lh ../vbb-2025-05-21.gtfs env | grep '^PG' || true ../cli.js -d \ --stops-location-index --stats-by-route-date=view \ - ../vbb-2022-07-01.gtfs/*.csv | sponge | psql -b + vbb-2025-05-21.gtfs.duckdb \ + ../vbb-2025-05-21.gtfs/*.csv diff --git a/benchmark/stats_by_route_date.sql 
b/benchmark/stats_by_route_date.sql index a894e09..4f8b5dc 100644 --- a/benchmark/stats_by_route_date.sql +++ b/benchmark/stats_by_route_date.sql @@ -1,5 +1,5 @@ SELECT * FROM stats_by_route_date WHERE route_id = '17452_900' -- M4 -AND date >= '2022-08-08' AND date <= '2022-08-14' +AND date >= '2025-05-26' AND date <= '2025-06-01' AND is_effective = true diff --git a/benchmark/stops_by_distance.sql b/benchmark/stops_by_distance.sql index fc112f9..ff351c6 100644 --- a/benchmark/stops_by_distance.sql +++ b/benchmark/stops_by_distance.sql @@ -1,4 +1,4 @@ SELECT * FROM stops -ORDER BY ST_Distance(stop_loc::geometry, ST_SetSRID(ST_MakePoint(9.7, 50.547), 4326)) ASC +ORDER BY ST_Distance(stop_loc::geometry, ST_Point(9.7, 50.547)) ASC LIMIT 100 diff --git a/cli.js b/cli.js index a0a2b4b..b5292bb 100755 --- a/cli.js +++ b/cli.js @@ -41,10 +41,10 @@ const { 'stops-without-level-id': { type: 'boolean', }, - 'lower-case-lang-codes': { + 'stops-location-index': { type: 'boolean', }, - 'stops-location-index': { + 'lower-case-lang-codes': { type: 'boolean', }, 'stats-by-route-date': { @@ -56,24 +56,6 @@ const { 'stats-active-trips-by-hour': { type: 'string', }, - 'schema': { - type: 'string', - }, - 'postgraphile': { - type: 'boolean', - }, - 'postgraphile-password': { - type: 'string', - }, - 'postgrest': { - type: 'boolean', - }, - 'postgrest-password': { - type: 'string', - }, - 'postgrest-query-cost-limit': { - type: 'string', - }, 'import-metadata': { type: 'boolean', } @@ -84,7 +66,7 @@ const { if (flags.help) { process.stdout.write(` Usage: - gtfs-to-sql [options] [--] ... + import-gtfs-into-duckdb [options] [--] ... Options: --silent -s Don't show files being converted. --require-dependencies -d Require files that the specified GTFS files depend @@ -124,34 +106,16 @@ Options: currently running trips over time, by hour. Like --stats-by-route-date, this flag accepts none, view & materialized-view. - --schema The schema to use for the database. Default: public - Even when importing into a schema other than \`public\`, - a function \`public.gtfs_via_postgres_import_version()\` - gets created, to ensure that multiple imports into the - same database are all made using the same version. See - also multiple-datasets.md in the docs. - --postgraphile Tweak generated SQL for PostGraphile usage. - https://www.graphile.org/postgraphile/ - --postgraphile-password Password for the PostGraphile PostgreSQL user. - Default: $POSTGRAPHILE_PGPASSWORD, fallback random. - --postgrest Tweak generated SQL for PostgREST usage. - Please combine it with --schema. - https://postgrest.org/ - --postgrest-password Password for the PostgREST PostgreSQL user \`web_anon\`. - Default: $POSTGREST_PGPASSWORD, fallback random. - --postgrest-query-cost-limit Define a cost limit [1] for queries executed by PostgREST - on behalf of a user. It is only enforced if - pg_plan_filter [2] is installed in the database! - Must be a positive float. Default: none - [1] https://www.postgresql.org/docs/14/using-explain.html - [2] https://github.com/pgexperts/pg_plan_filter --import-metadata Create functions returning import metadata: - gtfs_data_imported_at (timestamp with time zone) - - gtfs_via_postgres_version (text) - - gtfs_via_postgres_options (jsonb) + - gtfs_via_duckdb_version (text) + - gtfs_via_duckdb_options (jsonb) +Notes: + If you just want to check if the GTFS data can be imported but don't care about the + resulting DuckDB database file, you can import into an in-memory database by specifying + \`:memory:\` as the . 
Examples: - gtfs-to-sql some-gtfs/*.txt | sponge | psql -b # import into PostgreSQL - gtfs-to-sql -u -- some-gtfs/*.txt | gzip >gtfs.sql.gz # generate a gzipped SQL dump + import-gtfs-into-duckdb some-gtfs.duckdb some-gtfs/*.txt [1] https://developers.google.com/transit/gtfs/reference/extended-route-types [2] https://groups.google.com/g/gtfs-changes/c/keT5rTPS7Y0/m/71uMz2l6ke0J @@ -165,11 +129,11 @@ if (flags.version) { } const {basename, extname} = require('path') -const {pipeline} = require('stream') const convertGtfsToSql = require('./index') -const DataError = require('./lib/data-error') -const files = args.map((file) => { +const [pathToDb] = args + +const files = args.slice(1).map((file) => { const name = basename(file, extname(file)) return {name, file} }) @@ -185,9 +149,6 @@ const opt = { statsByRouteIdAndDate: flags['stats-by-route-date'] || 'none', statsByAgencyIdAndRouteIdAndStopAndHour: flags['stats-by-agency-route-stop-hour'] || 'none', statsActiveTripsByHour: flags['stats-active-trips-by-hour'] || 'none', - schema: flags['schema'] || 'public', - postgraphile: !!flags.postgraphile, - postgrest: !!flags.postgrest, importMetadata: !!flags['import-metadata'], } if ('stops-without-level-id' in flags) { @@ -196,31 +157,9 @@ if ('stops-without-level-id' in flags) { if ('lower-case-lang-codes' in flags) { opt.lowerCaseLanguageCodes = flags['lower-case-lang-codes'] } -if ('postgraphile-password' in flags) { - opt.postgraphilePassword = flags['postgraphile-password'] -} -if ('postgrest-password' in flags) { - opt.postgrestPassword = flags['postgrest-password'] -} -if ('postgrest-query-cost-limit' in flags) { - const limit = parseFloat(flags['postgrest-query-cost-limit']) - if (!Number.isFinite(limit) || limit < 0) { - console.error('Invalid --postgrest-query-cost-limit value.') - process.exit(1) - } - opt.lowerCaseLanguageCodes = limit -} -pipeline( - convertGtfsToSql(files, opt), - process.stdout, - (err) => { - if (!err) return; - if (err instanceof DataError) { - console.error(String(err)) - } else if (err.code !== 'EPIPE') { - console.error(err) - } - process.exit(1) - } -) +convertGtfsToSql(pathToDb, files, opt) +.catch((err) => { + console.error(err) + process.exit(1) +}) diff --git a/docs/analysis/active-trips-by-hour.md b/docs/analysis/active-trips-by-hour.md index 9983331..da64e3b 100644 --- a/docs/analysis/active-trips-by-hour.md +++ b/docs/analysis/active-trips-by-hour.md @@ -2,9 +2,9 @@ Do you want to know how many trips are running at a specific point in time? -`gtfs-via-postgres` optionally provides a **(materialized) view `stats_active_trips_by_hour` to answer this. Use the `--stats-active-trips-by-hour` flag to enable it**: +`gtfs-via-duckdb` optionally provides a **(materialized) view `stats_active_trips_by_hour` to answer this. Use the `--stats-active-trips-by-hour` flag to enable it**: -- If you run `gtfs-to-sql` with `--stats-active-trips-by-hour=view`, `stats_active_trips_by_hour` will be a "regular" non-materialized view. Use this option if you want to import the GTFS data quickly, and if you only query `stats_active_trips_by_hour` rarely or in time-uncritical scenarios. +- If you run `gtfs-to-duckdb` with `--stats-active-trips-by-hour=view`, `stats_active_trips_by_hour` will be a "regular" non-materialized view. Use this option if you want to import the GTFS data quickly, and if you only query `stats_active_trips_by_hour` rarely or in time-uncritical scenarios. 
- If you pass `--stats-active-trips-by-hour=materialized-view`, the `stats_active_trips_by_hour` view will [be materialized](https://www.postgresql.org/docs/14/rules-materializedviews.html). Use this option if you need fast queries, and if you can tolerate significantly longer import times (a minute for small feeds, many hours for large feeds). ## example: number of active trips over the course of a day diff --git a/docs/analysis/distance-between-stops.md b/docs/analysis/distance-between-stops.md new file mode 100644 index 0000000..10bb1ff --- /dev/null +++ b/docs/analysis/distance-between-stops.md @@ -0,0 +1,27 @@ +# calculating the geographic distance of a trip's shape between two stops + +1. For each stop, respectively, find the point that's closest to the stop (using `ST_LineLocatePoint()`), and then +2. measure the length between those points (using `ST_LineSubstring()` & `ST_Length()`). + +```sql +WITH + stop_a AS ( + SELECT * + FROM stops + WHERE stop_id = 'stop A ID' + ), + stop_b AS ( + SELECT * + FROM stops + WHERE stop_id = 'stop B ID' + ) +SELECT + ST_Length(ST_LineSubstring( + shape::geography, + ST_LineLocatePoint(shape::geography, stop_a.stop_loc), + ST_LineLocatePoint(shape::geography, stop_b.stop_loc) + )) AS segment_length +FROM stop_a, stop_b, trips +JOIN shapes_aggregated ON shapes_aggregated.shape_id = trips.shape_id +WHERE trip_id = 'some trip ID' +``` diff --git a/docs/analysis/feed-by-agency-route-stop-and-hour.md b/docs/analysis/feed-by-agency-route-stop-and-hour.md index 521482d..0c1273d 100644 --- a/docs/analysis/feed-by-agency-route-stop-and-hour.md +++ b/docs/analysis/feed-by-agency-route-stop-and-hour.md @@ -1,5 +1,5 @@ # analysing a GTFS dataset by route ID, stop ID and/or hour -With the `--stats-by-route-and-stop-and-hour` option, `gtfs-via-postgres` provides a view `stats_by_agency_route_stop_hour`. Just like [`stats_by_route_id_and_date`](feed-by-route-and-date.md), it aggregates all arrivals by `agency_id`, `route_id`, `stop_id` and `effective_hour`. +With the `--stats-by-route-and-stop-and-hour` option, `gtfs-via-duckdb` provides a view `stats_by_agency_route_stop_hour`. Just like [`stats_by_route_id_and_date`](feed-by-route-and-date.md), it aggregates all arrivals by `agency_id`, `route_id`, `stop_id` and `effective_hour`. Note: As a materialized view, `stats_by_agency_route_stop_hour` takes up a significant amount of space, e.g. 13GB with the 2023-05-02 VBB GTFS feed. diff --git a/docs/analysis/feed-by-route-date.md b/docs/analysis/feed-by-route-date.md index afb08c7..e7af15a 100644 --- a/docs/analysis/feed-by-route-date.md +++ b/docs/analysis/feed-by-route-date.md @@ -6,9 +6,9 @@ Are you trying to answer a question like those below? - Has the number of stop time events decreased, compared to the last dataset version? - Do specific routes stop running during certain time periods? -`gtfs-via-postgres` optionally provides a **(materialized) view `stats_by_route_date` to help with such SQL queries. Use the `--stats-by-route-date` flag to enable it** in the generated SQL: +`gtfs-via-duckdb` optionally provides a **(materialized) view `stats_by_route_date` to help with such SQL queries. Use the `--stats-by-route-date` flag to enable it** in the generated SQL: -- If you run `gtfs-to-sql` with `--stats-by-route-date=view`, `stats_by_route_date` will be a "regular" non-materialized view. Use this option if you want to import the GTFS data quickly, and if you only query `stats_by_route_date` rarely or in time-uncritical scenarios. 
+- If you run `gtfs-to-duckdb` with `--stats-by-route-date=view`, `stats_by_route_date` will be a "regular" non-materialized view. Use this option if you want to import the GTFS data quickly, and if you only query `stats_by_route_date` rarely or in time-uncritical scenarios. - If you pass `--stats-by-route-date=materialized-view`, the `stats_by_route_date` view will [be materialized](https://www.postgresql.org/docs/14/rules-materializedviews.html). Use this option if you need fast queries, and if you can tolerate significantly longer import times (3m for the 64mb 2023-03-05 SNCB/NMBS GTFS feed, 1h15m for the 540mb 2023-02-27 VBB GTFS feed). `stats_by_route_date` has the following columns: diff --git a/docs/import-metadata.md b/docs/import-metadata.md index 3c96fce..740a028 100644 --- a/docs/import-metadata.md +++ b/docs/import-metadata.md @@ -1,6 +1,6 @@ # import metadata -If you run `gtfs-to-sql` with the `--import-metadata` option, it will create functions providing information about the imported feed as well as the import process. +If you run `gtfs-to-duckdb` with the `--import-metadata` option, it will create functions providing information about the imported feed as well as the import process. An example with the [2023-04-05 VBB GTFS feed](https://vbb-gtfs.jannisr.de/2023-04-05): @@ -12,7 +12,7 @@ SELECT gtfs_via_postgres_version() -- 4.5.3 SELECT gtfs_via_postgres_options() --- {"schema": "public", "silent": false, "importStart": 1681417454781, "postgraphile": false, "importMetadata": true, … } +-- {"silent": false, "importStart": 1681417454781, "importMetadata": true, … } SELECT (gtfs_via_postgres_options())['tripsWithoutShapeId'] -- true -``` \ No newline at end of file +``` diff --git a/docs/multiple-datasets.md b/docs/multiple-datasets.md index f1249be..725e068 100644 --- a/docs/multiple-datasets.md +++ b/docs/multiple-datasets.md @@ -1,26 +1,31 @@ -# importing multiple datasets into one DB +# working with multiple datasets -Using `gtfs-via-postgres`, you can import more than one dataset into a single PostgreSQL database by importing them into separate [schemas](https://www.postgresql.org/docs/14/ddl-schemas.html). You can then run queries combine or compare data from them. +Using [DuckDB's ability to attach databases to one session](https://duckdb.org/docs/stable/sql/statements/attach), you can run queries combining or comparing data from multiple GTFS datasets. -As an example, let's import two datasets ([Paris](https://en.wikipedia.org/wiki/Île-de-France_Mobilités)' and [Berlin](https://en.wikipedia.org/wiki/Verkehrsverbund_Berlin-Brandenburg)'s) into separate schemas: +As an example, let's compare two datasets from [Paris](https://en.wikipedia.org/wiki/Île-de-France_Mobilités) and [Berlin](https://en.wikipedia.org/wiki/Verkehrsverbund_Berlin-Brandenburg). 
+ +First, we import each into its own database: ```shell -wget -U 'gtfs-via-postgres demo' -O paris.gtfs.zip 'https://eu.ftp.opendatasoft.com/stif/GTFS/IDFM-gtfs.zip' +wget -U 'gtfs-via-duckdb demo' -O paris.gtfs.zip 'https://eu.ftp.opendatasoft.com/stif/GTFS/IDFM-gtfs.zip' unzip -d paris.gtfs paris.gtfs.zip -gtfs-to-sql --require-dependencies \ - --schema paris -- paris.gtfs/*.txt \ - | sponge | psql -b +gtfs-to-duckdb --require-dependencies \ + paris.gtfs.duckdb \ + paris.gtfs/*.txt -wget -U 'gtfs-via-postgres demo' -O berlin.gtfs.zip 'https://www.vbb.de/vbbgtfs' +wget -U 'gtfs-via-duckdb demo' -O berlin.gtfs.zip 'https://www.vbb.de/vbbgtfs' unzip -d berlin.gtfs berlin.gtfs.zip -gtfs-to-sql --require-dependencies \ - --schema berlin -- berlin.gtfs/*.txt \ - | sponge | psql -b +gtfs-to-duckdb --require-dependencies \ + berlin.gtfs.duckdb \ + berlin.gtfs/*.txt ``` -We can now do queries across both datasets, for example finding the geographically furthest 2 stops: +In a new DuckDB shell/session, we can now do queries across both datasets, for example finding the geographically furthest 2 stops: ```sql +ATTACH 'paris.gtfs.duckdb' AS paris; +ATTACH 'berlin.gtfs.duckdb' AS berlin; + -- warning: takes a long time to compute! SELECT paris.stop_id AS paris_stop_id, @@ -28,8 +33,7 @@ SELECT FROM paris.stops paris, berlin.stops berlin +-- todo: does this operator work in DuckDB? ORDER BY paris.stop_loc <-> berlin.stop_loc DESC LIMIT 100 ``` - -*Note:* During an import, a function `public.gtfs_via_postgres_import_version()` gets created that returns `gtfs-via-postgres`'s version. If that function already exists (because it has been created by a previous import), its return value is compared to `gtfs-via-postgres`'s version, and if these two versions are not equal, the second import will fail. This ensures that multiple imports into the same database can only be made using the exact same `gtfs-via-postgres` version. diff --git a/docs/postgrest.md b/docs/postgrest.md deleted file mode 100644 index 09d4181..0000000 --- a/docs/postgrest.md +++ /dev/null @@ -1,9 +0,0 @@ -# PostgREST integration - -[PostgREST](https://postgrest.org/) is a tool that, given a PostgreSQL database (a schema within it, to be exact), creates a [RESTful](https://en.wikipedia.org/wiki/Representational_state_transfer) HTTP API from it. It will also automatically generate an [OpenAPI](https://spec.openapis.org/oas/latest.html) spec for it. - -`gtfs-via-postgres`'s PostgREST integration is read-only: It will create a [role](https://www.postgresql.org/docs/current/database-roles.html) `web_anon` with read-only access to the GTFS data. Due to [a bug](https://github.com/PostgREST/postgrest/issues/1870), it will also expose `POST`/`PUT`/`PATCH` operations in the OpenAPI spec, but they won't work; Only `GET`/`HEAD` (& `OPTIONS` for [CORS](https://developer.mozilla.org/en-US/docs/Web/HTTP/CORS#access-control-max-age)) will work. - -*Note:* Since PostgreSQL roles exist across databases, it might be that you already have a role called `web_anon`. In this case, to make sure PostgREST only has access to the newly imported data, it will 1) **re-assign all database objects (tables, etc.) currently owned by `web_anon` to the role you're importing the SQL as**, and b) revoke all existing permissions from the `web_anon` role! - -The `--postgrest` option will modify the generated SQL slightly, so that PostgREST can be run as-is on the database. 
It pairs well with the `--schema ` option, so that PostgREST only exposes what's in the schema, preventing accidental leaks. diff --git a/example.sh b/example.sh index dabc52d..86ded12 100755 --- a/example.sh +++ b/example.sh @@ -1,36 +1,39 @@ #!/bin/sh set -e +set -u set -o pipefail -2>&1 echo "importing into PostgreSQL:" +rm -f example.duckdb + +2>&1 echo "importing into example.duckdb:" ./cli.js --ignore-unsupported --require-dependencies --trips-without-shape-id --silent \ - node_modules/sample-gtfs-feed/gtfs/*.txt \ - | sponge | psql -b + example.duckdb \ + node_modules/sample-gtfs-feed/gtfs/*.txt 2>&1 echo "\nfetching a connection during DST switch:" -psql -c "$(cat <<- EOM +duckdb -csv -c "$(cat <<- EOM SELECT trip_id, route_id, from_stop_id, t_departure, - stop_sequence, + from_stop_sequence, to_stop_id, t_arrival FROM connections WHERE trip_id = 'during-dst-1' - AND t_departure > '2019-03-31T01:55+01' AND t_departure < '2019-03-31T03:00+02' + AND t_departure > '2019-03-31T01:55:00+01:00' AND t_departure < '2019-03-31T03:00:00+02:00' -- AND route_id = 'D' -- AND from_stop_id = 'airport' -EOM)" +EOM)" example.duckdb 2>&1 echo "\nfetching the departure at the same time:" -psql -c "$(cat <<- EOM +duckdb -csv -c "$(cat <<- EOM SELECT trip_id, route_id, stop_id, t_departure, stop_sequence FROM arrivals_departures WHERE trip_id = 'during-dst-1' - AND t_departure > '2019-03-31T01:55+01' AND t_departure < '2019-03-31T03:00+02' + AND t_departure > '2019-03-31T01:55:00+01:00' AND t_departure < '2019-03-31T03:00:00+02:00' -- AND route_id = 'D' -- AND stop_id = 'airport' -EOM)" +EOM)" example.duckdb diff --git a/index.js b/index.js index 4343424..317f3d9 100644 --- a/index.js +++ b/index.js @@ -1,16 +1,19 @@ 'use strict' -const debug = require('debug')('gtfs-via-postgres') -const {randomBytes} = require('crypto') +const createDebug = require('debug') const sequencify = require('sequencify') -const {inspect} = require('util') -const readCsv = require('gtfs-utils/read-csv') -const {Stringifier} = require('csv-stringify') +const {DuckDBInstance} = require('@duckdb/node-api') const formatters = require('./lib') const getDependencies = require('./lib/deps') -const pkg = require('./package.json') +const RUN = require('./lib/run.js') +const GET = require('./lib/get.js') + +const debug = createDebug('gtfs-via-duckdb') +const debugSql = createDebug('gtfs-via-duckdb:sql') + +const convertGtfsToSql = async (pathToDb, files, opt = {}) => { + debug('pathToDb', pathToDb) -const convertGtfsToSql = async function* (files, opt = {}) { opt = { silent: false, // todo [breaking]: make the default! 
@@ -25,14 +28,6 @@ const convertGtfsToSql = async function* (files, opt = {}) { statsByRouteIdAndDate: 'none', statsByAgencyIdAndRouteIdAndStopAndHour: 'none', statsActiveTripsByHour: 'none', - schema: 'public', - postgraphile: false, - postgraphilePassword: process.env.POSTGRAPHILE_PGPASSWORD || null, - postgrest: false, - postgrestPassword: process.env.POSTGREST_PASSWORD || null, - // see https://github.com/pgexperts/pg_plan_filter - // see also https://www.postgresql.org/docs/14/using-explain.html - postgrestQueryCostLimit: null, // or float importMetadata: false, ...opt, } @@ -47,16 +42,6 @@ const convertGtfsToSql = async function* (files, opt = {}) { statsByAgencyIdAndRouteIdAndStopAndHour, statsActiveTripsByHour, } = opt - let postgraphilePassword = opt.postgraphilePassword - if (opt.postgraphile && postgraphilePassword === null) { - postgraphilePassword = randomBytes(10).toString('hex') - console.error(`PostGraphile PostgreSQL user's password:`, postgraphilePassword) - } - let postgrestPassword = opt.postgrestPassword - if (opt.postgrest && postgrestPassword === null) { - postgrestPassword = randomBytes(10).toString('hex') - console.error(`PostrREST PostgreSQL user's password:`, postgrestPassword) - } if (ignoreUnsupportedFiles) { files = files.filter(f => !!formatters[f.name]) @@ -64,21 +49,17 @@ const convertGtfsToSql = async function* (files, opt = {}) { debug('files', files) const fileNames = files.map(f => f.name) + opt.files = fileNames const deps = getDependencies(opt, fileNames) debug('deps', deps) const tasks = { // file name -> [dep name] - 'is_valid_lang_code': { + 'valid_lang_codes': { dep: [], }, - 'is_timezone': { + 'valid_timezones': { dep: [], }, - ...(tripsWithoutShapeId ? {} : { - 'shape_exists': { - dep: [...deps.shape_exists], - }, - }), // special handling of calendar/calendar_dates: // service_days relies on *both* calendar's & calendar_dates' tables to @@ -151,62 +132,35 @@ const convertGtfsToSql = async function* (files, opt = {}) { opt.importStart = Date.now() - yield `\ --- GTFS SQL dump generated by ${pkg.name} v${pkg.version} --- ${pkg.homepage} --- options: -${inspect(opt, {compact: false}).split('\n').map(line => '-- ' + line).join('\n')} - -\\set ON_ERROR_STOP on -CREATE EXTENSION IF NOT EXISTS postgis; -${opt.schema !== 'public' ? `CREATE SCHEMA IF NOT EXISTS "${opt.schema}";` : ''} -BEGIN; - --- gtfs-via-postgres supports importing >1 GTFS datasets into 1 DB, each dataset within its own schema. See https://github.com/public-transport/gtfs-via-postgres/issues/51 for more information. --- Because almost all helper utilities (enums, functions, etc.) are schema-specific, they get imported more than once. In order to prevent subtle bugs due to incompatibilities among two schemas imported by different gtfs-via-postgres versions, we mock a "mutex" here by checking for public.gtfs_via_postgres_import_version()'s return value. 
- --- todo: this can be done more elegantly: just a "DO" block, "ASSERT" that the version matches, create gtfs_via_postgres_import_version() in the "EXCEPTION" block -CREATE FUNCTION pg_temp.get_gtfs_via_postgres_import_version() -RETURNS TEXT -AS $$ - DECLARE - res TEXT; - BEGIN - SELECT public.gtfs_via_postgres_import_version() INTO res; - RETURN res; - EXCEPTION - WHEN undefined_function THEN - -- do nothing, silence error - RETURN NULL; - END; -$$ -LANGUAGE plpgsql; - -DO $$ -BEGIN - IF EXISTS ( - SELECT version - FROM ( - SELECT pg_temp.get_gtfs_via_postgres_import_version() AS version - ) t - WHERE version != '${pkg.version}' - ) THEN - RAISE EXCEPTION 'existing GTFS data imported with an incompatible version of gtfs-via-postgres'; - END IF; -END -$$ -LANGUAGE plpgsql; - -CREATE OR REPLACE FUNCTION public.gtfs_via_postgres_import_version() -RETURNS TEXT -AS $$ - SELECT '${pkg.version}' -$$ -LANGUAGE sql; + const instance = await DuckDBInstance.create(pathToDb) + const db = await instance.connect() + db[RUN] = async (query, ...args) => { + debugSql('db[RUN]', query, ...args) + try { + return await db.run(query, ...args) + } catch (err) { + err.query = query + err.args = args + throw err + } + } + db[GET] = async (query, ...args) => { + debugSql('db[GET]', query, ...args) + try { + const result = await db.runAndReadAll(query, ...args) + return result.getRowObjects() + } catch (err) { + err.query = query + err.args = args + throw err + } + } -\n` + await db[RUN](` +-- todo +-- BEGIN TRANSACTION; +`) - const csv = new Stringifier({quoted: true}) const nrOfRowsByName = new Map() const workingState = { nrOfRowsByName, @@ -215,145 +169,29 @@ LANGUAGE sql; for (const name of order) { if (!silent) console.error(name) const task = tasks[name] - yield `-- ${name}\n-----------------\n\n` - - const { - beforeAll, - afterAll, - } = formatters[name] - - if ('string' === typeof beforeAll && beforeAll) { - yield beforeAll - } else if ('function' === typeof beforeAll) { - yield beforeAll(opt, workingState) - } - if (task.file) { - const {formatRow} = formatters[name] - let nrOfRows = 0 - for await (const rawRow of await readCsv(task.file)) { - const row = formatRow(rawRow, opt, workingState) - let formattedRow = null - csv.api.__transform(row, (_formattedRow) => { - formattedRow = _formattedRow - }) - yield formattedRow - nrOfRows++ - } + const importData = formatters[name] - nrOfRowsByName.set(name, nrOfRows) - // todo [breaking]: indent with \t - // todo [breaking]: print a summary of all files instead - if (!silent) console.error(` processed ${nrOfRows} rows`) + // calendar's & calendar_dates's importData() should run even if their respective files are not present. + // Also, the frequencies table is needed for stop_times's arrivals_departures & connections views. + if (!task.file && importData.runDespiteMissingSrcFile !== true) { + console.error('skipping!') // todo: remove + continue } - if ('string' === typeof afterAll && afterAll) { - yield afterAll + ';\n' - } else if ('function' === typeof afterAll) { - yield afterAll(opt, workingState) + ';\n' + try { + await importData(db, task.file || null, opt, workingState) + } catch (err) { + err.gtfsFile = name + throw err } } - yield `\ - -${opt.postgraphile ? `\ --- seal imported data --- todo: --- > Be careful with public schema.It already has a lot of default privileges that you maybe don't want... See documentation[1]. 
--- > [1]: postgresql.org/docs/11/ddl-schemas.html#DDL-SCHEMAS-PRIV -DO $$ -BEGIN - -- https://stackoverflow.com/questions/8092086/create-postgresql-role-user-if-it-doesnt-exist#8099557 - IF EXISTS ( - SELECT FROM pg_catalog.pg_roles - WHERE rolname = 'postgraphile' - ) THEN - RAISE NOTICE 'Role "postgraphile" already exists, skipping creation.'; - ELSE - CREATE ROLE postgraphile LOGIN PASSWORD '${opt.postgraphilePassword}'; -- todo: escape properly - END IF; -END -$$; -DO $$ - DECLARE - db TEXT := current_database(); - BEGIN - -- todo: grant just on $opt.schema instead? - EXECUTE format('GRANT ALL PRIVILEGES ON DATABASE %I TO %I', db, 'postgraphile'); - END -$$; -GRANT USAGE ON SCHEMA "${opt.schema}" TO postgraphile; --- https://stackoverflow.com/questions/760210/how-do-you-create-a-read-only-user-in-postgresql#comment50679407_762649 -REVOKE CREATE ON SCHEMA "${opt.schema}" FROM PUBLIC; -GRANT SELECT ON ALL TABLES IN SCHEMA "${opt.schema}" TO postgraphile; --- ALTER DEFAULT PRIVILEGES IN SCHEMA "${opt.schema}" GRANT SELECT ON TABLES TO postgraphile; --- todo: set search_path? https://stackoverflow.com/questions/760210/how-do-you-create-a-read-only-user-in-postgresql#comment33535263_762649 -` : ''} - -${opt.postgrest ? `\ -${opt.schema !== 'public' ? `\ --- pattern from https://stackoverflow.com/a/8099557 -DO -$$ -BEGIN - -- Roles are shared across databases, so we have remove previously configured privileges. - -- This might of course interfere with other programs running on the DBMS! - -- todo: find a cleaner solution - IF EXISTS ( - SELECT FROM pg_catalog.pg_roles - WHERE rolname = 'web_anon' - ) THEN - RAISE WARNING 'Role web_anon already exists. Reassigning owned DB objects to current_user().'; - REASSIGN OWNED BY web_anon TO SESSION_USER; - ELSE - BEGIN - CREATE ROLE web_anon NOLOGIN NOINHERIT; - EXCEPTION - WHEN duplicate_object THEN - RAISE NOTICE 'Role web_anon was just created by a concurrent transaction.'; - END; - END IF; - IF EXISTS ( - SELECT FROM pg_catalog.pg_roles - WHERE rolname = 'postgrest' - ) THEN - RAISE WARNING 'Role postgrest already exists. Reassigning owned DB objects to current_user().'; - REASSIGN OWNED BY postgrest TO SESSION_USER; - ELSE - BEGIN - CREATE ROLE postgrest LOGIN NOINHERIT NOCREATEDB NOCREATEROLE NOSUPERUSER PASSWORD '${postgrestPassword}'; - EXCEPTION - WHEN duplicate_object THEN - RAISE NOTICE 'Role postgrest was just created by a concurrent transaction.'; - END; - END IF; -END -$$; - - --- https://postgrest.org/en/stable/tutorials/tut0.html#step-4-create-database-for-api --- https://postgrest.org/en/stable/explanations/db_authz.html --- todo: is this secure? -GRANT USAGE ON SCHEMA "${opt.schema}" TO web_anon; -GRANT SELECT ON ALL TABLES IN SCHEMA "${opt.schema}" TO web_anon; -GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA "${opt.schema}" TO web_anon; -GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA "${opt.schema}" TO web_anon; - -GRANT web_anon TO postgrest; - -${opt.postgrestQueryCostLimit !== null ? ` --- If pg_plan_filter is installed, limit the cost of queries made by PostgREST users. -ALTER USER web_anon SET plan_filter.statement_cost_limit = ${opt.postgrestQueryCostLimit}; -` : ''} - -COMMENT ON SCHEMA "${opt.schema}" IS -$$GTFS REST API -This REST API is created by running [PostgREST](https://postgrest.org/) on top of a [PostgreSQL](https://www.postgresql.org) DB generated using [${pkg.name} v${pkg.version}](${pkg.homepage || pkg.repository}). 
-$$; -` : ''} -` : ''} + debug('workingState', workingState) -COMMIT;` + // todo + // await db[RUN]('COMMIT') + debug('done!') } module.exports = convertGtfsToSql diff --git a/lib/agency.js b/lib/agency.js index 450230c..3e80856 100644 --- a/lib/agency.js +++ b/lib/agency.js @@ -1,50 +1,42 @@ 'use strict' +const RUN = require('./run.js') +const {queryNumberOfRows} = require('./rows-count.js') + // https://gtfs.org/documentation/schedule/reference/#agencytxt -const beforeAll = (opt) => `\ -CREATE TABLE "${opt.schema}".agency ( +const importData = async (db, pathToAgency, opt, workingState) => { + await db[RUN](`\ +CREATE TABLE agency ( agency_id TEXT PRIMARY KEY, agency_name TEXT NOT NULL, agency_url TEXT NOT NULL, - agency_timezone TEXT NOT NULL - CONSTRAINT valid_timezone CHECK ("${opt.schema}".is_timezone(agency_timezone)), + agency_timezone TEXT NOT NULL REFERENCES valid_timezones (tz), agency_lang TEXT, -- todo: validate? agency_phone TEXT, agency_fare_url TEXT, agency_email TEXT ); -COPY "${opt.schema}".agency ( - agency_id, - agency_name, - agency_url, - agency_timezone, - agency_lang, - agency_phone, - agency_fare_url, - agency_email -) FROM STDIN csv; -` - -const formatAgencyRow = (a) => { - return [ - a.agency_id || null, - a.agency_name || null, - a.agency_url || null, - a.agency_timezone || null, - a.agency_lang || null, - a.agency_phone || null, - a.agency_fare_url || null, - a.agency_email || null, - ] -} +INSERT INTO agency +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT * +FROM read_csv( + '${pathToAgency}', + header = true, + -- > This option allows you to specify the types that the sniffer will use when detecting CSV column types. + -- > default: SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR + -- We omit BOOLEAN because GTFS just uses integers for boolean-like fields (e.g. timepoint in trips.txt). + -- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently. + auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'] +); -const afterAll = `\ -\\. -` +-- For a primary key, DuckDB doesn't create an index automatically. +CREATE UNIQUE INDEX agency_agency_id ON agency(agency_id); +`) -module.exports = { - beforeAll, - formatRow: formatAgencyRow, - afterAll, + workingState.nrOfRowsByName.set('agency', await queryNumberOfRows(db, 'agency', opt)) } + +module.exports = importData diff --git a/lib/calendar.js b/lib/calendar.js index 3727eca..c8ecbe2 100644 --- a/lib/calendar.js +++ b/lib/calendar.js @@ -1,67 +1,77 @@ 'use strict' +const RUN = require('./run.js') +const {queryNumberOfRows} = require('./rows-count.js') + // https://gtfs.org/documentation/schedule/reference/#calendartxt -const beforeAll = (opt) => `\ -CREATE TYPE "${opt.schema}".availability AS ENUM ( +const importData = async (db, pathToCalendar, opt, workingState) => { + await db[RUN](`\ +CREATE TYPE availability AS ENUM ( 'not_available' -- 0 – Service is not available for Mondays in the date range. , 'available' -- 1 – Service is available for all Mondays in the date range. 
); -CREATE CAST ("${opt.schema}".availability AS text) WITH INOUT AS IMPLICIT; +-- CREATE CAST (availability AS text) WITH INOUT AS IMPLICIT; -CREATE TABLE "${opt.schema}".calendar ( +CREATE TABLE calendar ( service_id TEXT PRIMARY KEY, - monday "${opt.schema}".availability NOT NULL, - tuesday "${opt.schema}".availability NOT NULL, - wednesday "${opt.schema}".availability NOT NULL, - thursday "${opt.schema}".availability NOT NULL, - friday "${opt.schema}".availability NOT NULL, - saturday "${opt.schema}".availability NOT NULL, - sunday "${opt.schema}".availability NOT NULL, + monday availability NOT NULL, + tuesday availability NOT NULL, + wednesday availability NOT NULL, + thursday availability NOT NULL, + friday availability NOT NULL, + saturday availability NOT NULL, + sunday availability NOT NULL, start_date DATE NOT NULL, end_date DATE NOT NULL ); +`) -COPY "${opt.schema}".calendar ( - service_id, - monday, - tuesday, - wednesday, - thursday, - friday, - saturday, - sunday, - start_date, - end_date -) FROM STDIN csv; -` - -const availability = (val) => { - if (val === '0') return 'not_available' - if (val === '1') return 'available' - throw new Error('invalid availability: ' + val) -} + if (pathToCalendar !== null) { + await db[RUN](`\ +INSERT INTO calendar +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT * REPLACE ( + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. + enum_range(NULL::availability)[monday + 1] AS monday, + enum_range(NULL::availability)[tuesday + 1] AS tuesday, + enum_range(NULL::availability)[wednesday + 1] AS wednesday, + enum_range(NULL::availability)[thursday + 1] AS thursday, + enum_range(NULL::availability)[friday + 1] AS friday, + enum_range(NULL::availability)[saturday + 1] AS saturday, + enum_range(NULL::availability)[sunday + 1] AS sunday, + array_slice(start_date, 0, 4) || '-' || array_slice(start_date, 5, 6) || '-' || array_slice(start_date, 7, 8) AS start_date, + array_slice(end_date, 0, 4) || '-' || array_slice(end_date, 5, 6) || '-' || array_slice(end_date, 7, 8) AS end_date +) +FROM read_csv( + '${pathToCalendar}', + header = true, + -- > This option allows you to specify the types that the sniffer will use when detecting CSV column types. + -- > default: SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR + -- We omit BOOLEAN because GTFS just uses integers for boolean-like fields (e.g. timepoint in trips.txt). + -- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently. + auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'], + types = { + service_id: 'TEXT', + monday: 'UINTEGER', + tuesday: 'UINTEGER', + wednesday: 'UINTEGER', + thursday: 'UINTEGER', + friday: 'UINTEGER', + saturday: 'UINTEGER', + sunday: 'UINTEGER', + start_date: 'TEXT', + end_date: 'TEXT' + } +); +`) + } -const formatCalendarRow = (c) => { - return [ - c.service_id || null, - c.monday ? availability(c.monday) : null, - c.tuesday ? availability(c.tuesday) : null, - c.wednesday ? availability(c.wednesday) : null, - c.thursday ? availability(c.thursday) : null, - c.friday ? availability(c.friday) : null, - c.saturday ? availability(c.saturday) : null, - c.sunday ? 
availability(c.sunday) : null, - c.start_date, - c.end_date, - ] + workingState.nrOfRowsByName.set('calendar', await queryNumberOfRows(db, 'calendar', opt)) } -const afterAll = `\ -\\. -` +importData.runDespiteMissingSrcFile = true -module.exports = { - beforeAll, - formatRow: formatCalendarRow, - afterAll, -} +module.exports = importData diff --git a/lib/calendar_dates.js b/lib/calendar_dates.js index 1002afd..beb8a67 100644 --- a/lib/calendar_dates.js +++ b/lib/calendar_dates.js @@ -1,54 +1,61 @@ 'use strict' +const RUN = require('./run.js') +const {queryNumberOfRows} = require('./rows-count.js') + // https://gtfs.org/documentation/schedule/reference/#calendar_datestxt -const beforeAll = (opt) => `\ -CREATE TYPE "${opt.schema}".exception_type_v AS ENUM ( +const importData = async (db, pathToCalendarDates, opt, workingState) => { + await db[RUN](`\ +CREATE TYPE exception_type_v AS ENUM ( 'added' -- 1 – Service has been added for the specified date. , 'removed' -- 2 – Service has been removed for the specified date. ); -CREATE CAST ("${opt.schema}".exception_type_v AS text) WITH INOUT AS IMPLICIT; +-- CREATE CAST (exception_type_v AS text) WITH INOUT AS IMPLICIT; -CREATE TABLE "${opt.schema}".calendar_dates ( +CREATE TABLE calendar_dates ( service_id TEXT NOT NULL, "date" DATE NOT NULL, - PRIMARY KEY (service_id, "date"), - exception_type "${opt.schema}".exception_type_v NOT NULL + CONSTRAINT primary_key PRIMARY KEY (service_id, "date"), + exception_type exception_type_v NOT NULL ); +`) + if (pathToCalendarDates !== null) { + await db[RUN](`\ +INSERT INTO calendar_dates +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT * REPLACE ( + array_slice(date, 0, 4) || '-' || array_slice(date, 5, 6) || '-' || array_slice(date, 7, 8) AS date, + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. + enum_range(NULL::exception_type_v)[exception_type] AS exception_type, +) +FROM read_csv( + '${pathToCalendarDates}', + header = true, + -- > This option allows you to specify the types that the sniffer will use when detecting CSV column types. + -- > default: SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR + -- We omit BOOLEAN because GTFS just uses integers for boolean-like fields (e.g. timepoint in trips.txt). + -- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently. + auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'], + types = { + service_id: 'TEXT', + date: 'TEXT', + exception_type: 'UINTEGER' + } +); +`) + } -COPY "${opt.schema}".calendar_dates ( - service_id, - date, - exception_type -) FROM STDIN csv; -` - -const exceptionType = (val) => { - if (val === '1') return 'added' - if (val === '2') return 'removed' - throw new Error('invalid exception_type: ' + val) -} + await db[RUN](`\ +CREATE INDEX calendar_dates_service_id ON calendar_dates (service_id); +CREATE INDEX calendar_dates_exception_type ON calendar_dates (exception_type); +`) -const formatCalendarDatesRow = (e) => { - return [ - e.service_id || null, - e.date, - e.exception_type ? exceptionType(e.exception_type) : null, - ] + workingState.nrOfRowsByName.set('calendar_dates', await queryNumberOfRows(db, 'calendar_dates', opt)) } -const afterAll = (opt) => `\ -\\. 
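An aside on the enum_range() indexing used in the calendar and calendar_dates INSERTs above: DuckDB list indexing is 1-based, so adding 1 to the raw GTFS integer selects the matching enum label, and SELECT * REPLACE (...) swaps only the listed columns while keeping the rest as read from the CSV. A minimal sketch, assuming the availability enum defined above:

SELECT enum_range(NULL::availability) AS labels; -- [not_available, available]
SELECT
	enum_range(NULL::availability)[0 + 1] AS monday_if_0, -- not_available
	enum_range(NULL::availability)[1 + 1] AS monday_if_1; -- available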
+importData.runDespiteMissingSrcFile = true -CREATE INDEX ON "${opt.schema}".calendar_dates (service_id); -CREATE INDEX ON "${opt.schema}".calendar_dates (exception_type); - -${opt.postgraphile ? `\ -COMMENT ON TABLE "${opt.schema}".calendar_dates IS E'@foreignKey (service_id) references calendar|@fieldName calendar'; -` : ''} -` - -module.exports = { - beforeAll, - formatRow: formatCalendarDatesRow, - afterAll, -} +module.exports = importData diff --git a/lib/columns.js b/lib/columns.js new file mode 100644 index 0000000..7629125 --- /dev/null +++ b/lib/columns.js @@ -0,0 +1,41 @@ +'use strict' + +const GET = require('./get.js') +const {queryNumberOfRows} = require('./rows-count.js') + +// https://gtfs.org/documentation/schedule/reference/#stop_timestxt +const queryFileColumns = async (db, pathToFile) => { + const columns = await db[GET]( + `\ + DESCRIBE ( + SELECT * + FROM read_csv( + -- Using a parameter would be the proper & safer approach here, but it crashes DuckDB as of v1.3.2. + -- $1, + '${pathToFile}', + header = true + ) + LIMIT 1 + ) +`, + // [pathToFile], + ) + return columns +} + +const queryIfColumnsExist = async (db, pathToFile, columns) => { + const res = Object.create(null) + const existing = new Set( + (await queryFileColumns(db, pathToFile)) + .map(col => col.column_name), + ) + for (const column of columns) { + res[column] = existing.has(column) + } + return res +} + +module.exports = { + queryFileColumns, + queryIfColumnsExist, +} diff --git a/lib/deps.js b/lib/deps.js index dceb7c6..b744c1c 100644 --- a/lib/deps.js +++ b/lib/deps.js @@ -7,14 +7,11 @@ const getDependencies = (opt, files) => { stopsWithoutLevelId, } = opt return { - shape_exists: [ - 'shapes', - ], agency: [ - 'is_timezone', + 'valid_timezones', ], stops: [ - 'is_timezone', + 'valid_timezones', ...(stopsWithoutLevelId ? [] : ['levels']), ], transfers: [ @@ -27,12 +24,12 @@ const getDependencies = (opt, files) => { 'frequencies', ], routes: [ - ...(routesWithoutAgencyId ? [] : ['agency']), + ...(routesWithoutAgencyId && !files.includes('agency') ? [] : ['agency']), ], trips: [ 'routes', 'service_days', - ...(tripsWithoutShapeId ? [] : ['shapes', 'shape_exists']), + ...(tripsWithoutShapeId ? [] : ['shapes']), ], frequencies: [ 'trips', @@ -41,10 +38,10 @@ const getDependencies = (opt, files) => { 'stops', ], feed_info: [ - 'is_valid_lang_code', + 'valid_lang_codes', ], translations: [ - 'is_valid_lang_code', + 'valid_lang_codes', // > table_name // > Defines the dataset table that contains the field to be translated. The following values are allowed: // > agency @@ -60,14 +57,14 @@ const getDependencies = (opt, files) => { // todo: respect opt.*! // these are soft dependencies, they are not depended upon, they must only be imported first // todo: only specify dependencies here if the files are not in use + + // these are required files anyways 'agency', 'stops', 'routes', 'trips', - ...(files.includes('stop_times') - ? ['stop_times'] - : [] - ), + 'stop_times', + // these are optional, so we only depend on them if they are present ...(files.includes('feed_info') ? ['feed_info'] : [] @@ -80,7 +77,12 @@ const getDependencies = (opt, files) => { ? ['levels'] : [] ), + ...(files.includes('calendar') ? ['calendar'] : []), + ...(files.includes('calendar_dates') ? 
['calendar_dates'] : []), // not supported yet: attributions + // not supported yet: fare_attributes/fare_rules + // not supported yet: frequencies + // not supported yet: transfers ], } } diff --git a/lib/feed_info.js b/lib/feed_info.js index 6cc1168..d865e3a 100644 --- a/lib/feed_info.js +++ b/lib/feed_info.js @@ -1,22 +1,21 @@ 'use strict' +const RUN = require('./run.js') + // https://gtfs.org/documentation/schedule/reference/#feed_infotxt -const beforeAll = (opt) => `\ +const importData = async (db, pathToFeedInfo, opt, workingState) => { + await db[RUN](`\ -- The MobilityData GTFS Validator just uses Java's Locale#toLanguageTag() to validate *_lang. -- https://github.com/MobilityData/gtfs-validator/blob/31ff374800f7d7883fd9de91b71049c2a4de4e45/main/src/main/java/org/mobilitydata/gtfsvalidator/validator/MatchingFeedAndAgencyLangValidator.java#L82 -- https://docs.oracle.com/javase/7/docs/api/java/util/Locale.html -- related: https://github.com/google/transit/pull/98 -CREATE TABLE "${opt.schema}".feed_info ( +CREATE TABLE feed_info ( feed_publisher_name TEXT PRIMARY KEY, feed_publisher_url TEXT NOT NULL, - feed_lang TEXT NOT NULL - CONSTRAINT valid_feed_lang CHECK ( - "${opt.schema}".is_valid_lang_code(feed_lang) - ), - default_lang TEXT - CONSTRAINT valid_default_lang CHECK ( - default_lang IS NULL OR "${opt.schema}".is_valid_lang_code(default_lang) - ), + feed_lang TEXT NOT NULL, + FOREIGN KEY (feed_lang) REFERENCES valid_lang_codes, + default_lang TEXT, + FOREIGN KEY (default_lang) REFERENCES valid_lang_codes, feed_start_date DATE, feed_end_date DATE, feed_version TEXT, @@ -24,39 +23,29 @@ CREATE TABLE "${opt.schema}".feed_info ( feed_contact_url TEXT ); -COPY "${opt.schema}".feed_info ( - feed_publisher_name, - feed_publisher_url, - feed_lang, - default_lang, - feed_start_date, - feed_end_date, - feed_version, - feed_contact_email, - feed_contact_url -) FROM STDIN csv; -` - -const formatFeedInfoRow = (i) => { - return [ - i.feed_publisher_name || null, - i.feed_publisher_url || null, - i.feed_lang || null, - i.default_lang || null, - i.feed_start_date || null, - i.feed_end_date || null, - i.feed_version || null, - i.feed_contact_email || null, - i.feed_contact_url || null, - ] +INSERT INTO feed_info +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT * REPLACE ( + ( + array_slice(feed_start_date, 0, 4) + || '-' || array_slice(feed_start_date, 5, 6) + || '-' || array_slice(feed_start_date, 7, 8) + ) AS feed_start_date, + ( + array_slice(feed_end_date, 0, 4) + || '-' || array_slice(feed_end_date, 5, 6) + || '-' || array_slice(feed_end_date, 7, 8) + ) AS feed_end_date +) +FROM read_csv( + '${pathToFeedInfo}', + header = true, + -- > Option to skip type detection for CSV parsing and assume all columns to be of type VARCHAR [a.k.a. TEXT]. + all_varchar = true +); +`) } -const afterAll = `\ -\\. 
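A quick illustration of the date rewriting above: GTFS encodes dates as YYYYMMDD strings, and the array_slice() concatenation turns them into ISO-formatted strings that cast cleanly to DATE. Mirroring the import query with a hypothetical literal, this should yield:

SELECT
	array_slice('20250527', 0, 4)
	|| '-' || array_slice('20250527', 5, 6)
	|| '-' || array_slice('20250527', 7, 8) AS feed_start_date; -- '2025-05-27', castable to DATE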
-` - -module.exports = { - beforeAll, - formatRow: formatFeedInfoRow, - afterAll, -} +module.exports = importData diff --git a/lib/frequencies.js b/lib/frequencies.js index 39f7d33..03b7971 100644 --- a/lib/frequencies.js +++ b/lib/frequencies.js @@ -1,98 +1,106 @@ 'use strict' -const {formatTime} = require('./util') +const RUN = require('./run.js') +const {queryIfColumnsExist} = require('./columns.js') +const {queryNumberOfRows} = require('./rows-count.js') // https://gtfs.org/documentation/schedule/reference/#frequenciestxt -const beforeAll = (opt) => `\ -CREATE TYPE "${opt.schema}".exact_times_v AS ENUM ( +const importData = async (db, pathToFrequencies, opt, workingState) => { + await db[RUN](`\ +CREATE TYPE exact_times_v AS ENUM ( 'frequency_based' -- 0 or empty - Frequency-based trips. , 'schedule_based' -- 1 – Schedule-based trips with the exact same headway throughout the day. In this case the end_time value must be greater than the last desired trip start_time but less than the last desired trip start_time + headway_secs. ); -CREATE CAST ("${opt.schema}".exact_times_v AS text) WITH INOUT AS IMPLICIT; +-- CREATE CAST (exact_times_v AS text) WITH INOUT AS IMPLICIT; -CREATE TABLE "${opt.schema}".frequencies ( - -- Used to implement arrivals_departures & connections. Filled after COPY-ing, see below. +CREATE TABLE frequencies ( + -- Used to implement arrivals_departures & connections. Filled by the INSERT below. frequencies_row INTEGER, trip_id TEXT NOT NULL, - FOREIGN KEY (trip_id) REFERENCES "${opt.schema}".trips, + FOREIGN KEY (trip_id) REFERENCES trips, start_time INTERVAL NOT NULL, + -- todo, once supported by DuckDB: PRIMARY KEY (trip_id, start_time) end_time INTERVAL NOT NULL, headway_secs INT NOT NULL, - exact_times "${opt.schema}".exact_times_v, - -- frequencies' primary key currently is just (trip_id, start_time) - -- see also https://github.com/google/transit/issues/514 - -- todo: add primary key? - UNIQUE ( - trip_id, - start_time, - end_time, - headway_secs, - exact_times - ) + exact_times exact_times_v -- todo: NOT NULL & ifnull() + -- frequencies' primary is just (trip_id, start_time). however, the definition for the headway_secs field says: + -- > Multiple headways may be defined for the same trip, but must not overlap. New headways may start at the exact time the previous headway ends. + -- https://gtfs.org/documentation/schedule/reference/#frequenciestxt + -- todo: add a unique constraint once there is consensus in https://github.com/google/transit/issues/514 ); +`) -COPY "${opt.schema}".frequencies ( - trip_id, - start_time, - end_time, - headway_secs, - exact_times -) FROM STDIN csv; -` - -const exactTimes = (val) => { - if (val === '0') return 'frequency_based' - if (val === '1') return 'schedule_based' - throw new Error('invalid exact_times: ' + val) -} + if (pathToFrequencies === null) { + // todo: keep? + // workingState.nrOfRowsByName.set('frequencies', 0) + return; + } -const formatFrequenciesRow = (f) => { - const startTime = f.start_time - ? formatTime(f.start_time) - : null - const endTime = f.end_time - ? formatTime(f.end_time) - : null - - return [ - f.trip_id || null, - startTime, - endTime, - f.headway_secs ? parseInt(f.headway_secs) : null, - f.exact_times ? exactTimes(f.exact_times) : null, - ] -} + // exact_times is optional, so the entire columns can be missing. + // It seems like, as of DuckDB v1.0.0, there is no way to assign default values to missing columns, neither with read_csv() nor with a nested subquery. + // todo: github ticket? 
+ // This is why we check the file first and then programmatically determine the set of SELECT-ed columns below. + const { + exact_times: has_exact_times, + } = await queryIfColumnsExist(db, pathToFrequencies, [ + 'exact_times', + ]) -const afterAll = (opt) => `\ -\\. + await db[RUN](`\ +INSERT INTO frequencies +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT + ${has_exact_times ? `` : `NULL AS exact_times,`} + * + REPLACE ( + -- dummy entry in case no optional column is present + trip_id AS trip_id, + ${has_exact_times ? `\ + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indexes are 1-based. + -- Also, we explicitly cast until https://github.com/duckdb/duckdb/issues/17431 is resolved. + enum_range(NULL::exact_times_v)[exact_times::integer + 1] AS exact_times + ` : ``} + ), + row_number() OVER (PARTITION BY trip_id, exact_times) AS frequencies_row +FROM read_csv( + '${pathToFrequencies}', + header = true, + -- > This option allows you to specify the types that the sniffer will use when detecting CSV column types. + -- > default: SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR + -- We omit BOOLEAN because GTFS just uses integers for boolean-like fields (e.g. timepoint in trips.txt). + -- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently. + auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'], + types = { + start_time: 'INTERVAL', + end_time: 'INTERVAL', + ${has_exact_times ? `exact_times: 'INTEGER',` : ``} + } +); +`) --- frequencies_row is used to implement arrivals_departures & connections. -UPDATE "${opt.schema}".frequencies --- This is ugly, but AFAICT there is no cleaner way. --- see also https://stackoverflow.com/a/4359354/1072129 -SET frequencies_row = t.frequencies_row -FROM ( - SELECT - -- order by all columns so that we don't implicitly depend on the file's order - (row_number() OVER (PARTITION BY trip_id, start_time ORDER BY end_time, headway_secs, exact_times))::integer AS frequencies_row, - trip_id, start_time - FROM "${opt.schema}".frequencies -) AS t --- self-join + await db[RUN](`\ +-- We create UNIQUE index *afterwards* to make the data import faster. -- frequencies' primary key is just (trip_id, start_time) -- however, the definition for the headway_secs field says: -- > Multiple headways may be defined for the same trip, but must not overlap. New headways may start at the exact time the previous headway ends. -- https://gtfs.org/documentation/schedule/reference/#frequenciestxt --- todo: add \`frequencies.exact_times::text = t.exact_times::text\`? once there is consensus in https://github.com/google/transit/issues/514 -WHERE frequencies.trip_id = t.trip_id -AND frequencies.start_time = t.start_time; +-- todo: add more columns once there is consensus in https://github.com/google/transit/issues/514 +CREATE UNIQUE INDEX frequencies_unique ON frequencies ( + trip_id, + -- As of v1.0.0, DuckDB does not support UNIQUE indexes on INTERVAL columns yet, so we cast it to a string.
+ (start_time::string) +); -CREATE INDEX ON "${opt.schema}".frequencies (trip_id); -CREATE INDEX ON "${opt.schema}".frequencies (exact_times); -` +CREATE INDEX frequencies_trip_id ON frequencies (trip_id); +CREATE INDEX frequencies_exact_times ON frequencies (exact_times); +`) -module.exports = { - beforeAll, - formatRow: formatFrequenciesRow, - afterAll, + workingState.nrOfRowsByName.set('frequencies', await queryNumberOfRows(db, 'frequencies', opt)) } + +importData.runDespiteMissingSrcFile = true + +module.exports = importData diff --git a/lib/get.js b/lib/get.js new file mode 100644 index 0000000..9affca3 --- /dev/null +++ b/lib/get.js @@ -0,0 +1,5 @@ +'use strict' + +const GET = Symbol('get') + +module.exports = GET \ No newline at end of file diff --git a/lib/import_metadata.js b/lib/import_metadata.js index c93e769..8343191 100644 --- a/lib/import_metadata.js +++ b/lib/import_metadata.js @@ -1,33 +1,30 @@ 'use strict' const {strictEqual} = require('assert') +const RUN = require('./run.js') const pkg = require('../package.json') -const afterAll = (opt) => { +const populateImportMetadata = async (db, _, opt) => { strictEqual(typeof opt.importStart, 'number', 'opt.importStart must be a number') // todo: escape properly - return `\ -CREATE OR REPLACE FUNCTION "${opt.schema}".gtfs_data_imported_at () -RETURNS TIMESTAMP WITH TIME ZONE -AS $$ - SELECT '${new Date(opt.importStart).toISOString()}'::timestamp with time zone; -$$ LANGUAGE SQL IMMUTABLE; + await db[RUN](`\ +CREATE OR REPLACE FUNCTION gtfs_data_imported_at (a) +AS ( + '${new Date(opt.importStart).toISOString()}'::timestamp with time zone +); -CREATE OR REPLACE FUNCTION "${opt.schema}".gtfs_via_postgres_version () -RETURNS TEXT -AS $$ - SELECT '${pkg.version}'; -$$ LANGUAGE SQL IMMUTABLE; +CREATE OR REPLACE FUNCTION gtfs_via_duckdb_version () +AS ( + '${pkg.version}'::text +); -CREATE OR REPLACE FUNCTION "${opt.schema}".gtfs_via_postgres_options () -RETURNS jsonb -AS $$ - SELECT '${JSON.stringify(opt).replace(/'/g, `''`)}'::jsonb; -$$ LANGUAGE SQL IMMUTABLE; -` +CREATE OR REPLACE FUNCTION gtfs_via_duckdb_options () +AS ( + '${JSON.stringify(opt).replace(/'/g, `''`)}'::json +); +`) } +populateImportMetadata.runDespiteMissingSrcFile = true -module.exports = { - afterAll, -} +module.exports = populateImportMetadata diff --git a/lib/index.js b/lib/index.js index 9490a3c..c8e7431 100644 --- a/lib/index.js +++ b/lib/index.js @@ -1,8 +1,9 @@ 'use strict' module.exports = { - is_valid_lang_code: require('./prerequisites').is_valid_lang_code, - is_timezone: require('./prerequisites').is_timezone, + icu: require('./prerequisites').icu, + valid_lang_codes: require('./prerequisites').valid_lang_codes, + valid_timezones: require('./prerequisites').valid_timezones, shape_exists: require('./prerequisites').shape_exists, agency: require('./agency'), calendar: require('./calendar'), diff --git a/lib/levels.js b/lib/levels.js index ae72df2..591f79d 100644 --- a/lib/levels.js +++ b/lib/levels.js @@ -1,36 +1,40 @@ 'use strict' -const {formatTime} = require('./util') +const RUN = require('./run.js') +const {queryNumberOfRows} = require('./rows-count.js') // https://gtfs.org/documentation/schedule/reference/#levelstxt -const beforeAll = (opt) => `\ -CREATE TABLE "${opt.schema}".levels ( +const importData = async (db, pathToLevels, opt, workingState) => { + await db[RUN](`\ +CREATE TABLE levels ( level_id TEXT PRIMARY KEY, - level_index DOUBLE PRECISION NOT NULL, + level_index REAL NOT NULL, level_name TEXT ); -COPY "${opt.schema}".levels ( - level_id, 
- level_index, - level_name -) FROM STDIN csv; -` - -const formatLevelsRow = (l) => { - return [ - l.level_id, - parseFloat(l.level_index), - l.level_name || null, - ] -} +INSERT INTO levels +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT * +FROM read_csv( + '${pathToLevels}', + header = true, + -- > This option allows you to specify the types that the sniffer will use when detecting CSV column types. + -- > default: SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR + -- We omit BOOLEAN because GTFS just uses integers for boolean-like fields (e.g. timepoint in trips.txt). + -- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently. + auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'], + types = { + level_index: 'REAL', + } +); -const afterAll = `\ -\\. -` +-- For a primary key, DuckDB doesn't create an index automatically. +CREATE UNIQUE INDEX levels_level_id ON levels(level_id); +`) -module.exports = { - beforeAll, - formatRow: formatLevelsRow, - afterAll, + workingState.nrOfRowsByName.set('levels', await queryNumberOfRows(db, 'levels', opt)) } + +module.exports = importData diff --git a/lib/pathways.js b/lib/pathways.js index c69a905..9f0837f 100644 --- a/lib/pathways.js +++ b/lib/pathways.js @@ -1,10 +1,34 @@ 'use strict' -const {formatTime} = require('./util') +const RUN = require('./run.js') +const {queryIfColumnsExist} = require('./columns.js') +const {queryNumberOfRows} = require('./rows-count.js') // https://gtfs.org/documentation/schedule/reference/#pathwaystxt -const beforeAll = (opt) => `\ -CREATE TYPE "${opt.schema}".pathway_mode_v AS ENUM ( +const importData = async (db, pathToPathways, opt, workingState) => { + // Several columns are optional, so their columns may be missing in a `read_csv()` result. + // It seems like, as of DuckDB v1.0.0, there is no way to assign default values to missing columns, neither with read_csv() nor with a nested subquery. + // This is why we check the file first and then programmatically determine the set of SELECT-ed columns below. + const { + length: has_length, + traversal_time: has_traversal_time, + stair_count: has_stair_count, + max_slope: has_max_slope, + min_width: has_min_width, + signposted_as: has_signposted_as, + reversed_signposted_as: has_reversed_signposted_as, + } = await queryIfColumnsExist(db, pathToPathways, [ + 'length', + 'traversal_time', + 'stair_count', + 'max_slope', + 'min_width', + 'signposted_as', + 'reversed_signposted_as', + ]) + + await db[RUN](`\ +CREATE TYPE pathway_mode_v AS ENUM ( 'walkway' -- 1 , 'stairs' -- 2 , 'moving_sidewalk_travelator' -- 3 – moving sidewalk/travelator @@ -14,85 +38,57 @@ CREATE TYPE "${opt.schema}".pathway_mode_v AS ENUM ( -- Fare gates may either separate paid areas of the station from unpaid ones, or separate different payment areas within the same station from each other. This information can be used to avoid routing passengers through stations using shortcuts that would require passengers to make unnecessary payments, like directing a passenger to walk through a subway platform to reach a busway. , 'exit_gate' -- 7 – Indicates a pathway exiting an area where proof-of-payment is required into an area where proof-of-payment is no longer required. 
); -CREATE CAST ("${opt.schema}".pathway_mode_v AS text) WITH INOUT AS IMPLICIT; +-- CREATE CAST (pathway_mode_v AS text) WITH INOUT AS IMPLICIT; -CREATE TABLE "${opt.schema}".pathways ( +CREATE TABLE pathways ( pathway_id TEXT PRIMARY KEY, from_stop_id TEXT NOT NULL, - FOREIGN KEY (from_stop_id) REFERENCES "${opt.schema}".stops (stop_id), + FOREIGN KEY (from_stop_id) REFERENCES stops (stop_id), to_stop_id TEXT NOT NULL, - FOREIGN KEY (to_stop_id) REFERENCES "${opt.schema}".stops (stop_id), - pathway_mode "${opt.schema}".pathway_mode_v NOT NULL, + FOREIGN KEY (to_stop_id) REFERENCES stops (stop_id), + pathway_mode pathway_mode_v NOT NULL, is_bidirectional BOOLEAN NOT NULL, - length DOUBLE PRECISION, -- todo: add non-negative constraint + length REAL, -- todo: add non-negative constraint traversal_time INTEGER, -- todo: add positive constraint stair_count INTEGER, -- todo: add non-0 constraint - max_slope DOUBLE PRECISION, - min_width DOUBLE PRECISION, -- todo: add positive constraint + max_slope REAL, + min_width REAL, -- todo: add positive constraint signposted_as TEXT, reversed_signposted_as TEXT ); -COPY "${opt.schema}".pathways ( - pathway_id, - from_stop_id, - to_stop_id, - pathway_mode, - is_bidirectional, - length, - traversal_time, - stair_count, - max_slope, - min_width, - signposted_as, - reversed_signposted_as -) FROM STDIN csv; -` - -const pathwayMode = (val) => { - if (val === '1') return 'walkway' - if (val === '2') return 'stairs' - if (val === '3') return 'moving_sidewalk_travelator' - if (val === '4') return 'escalator' - if (val === '5') return 'elevator' - if (val === '6') return 'fare_gate' - if (val === '7') return 'exit_gate' - throw new Error('invalid pathway_mode: ' + val) -} +INSERT INTO pathways +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT * REPLACE ( + -- todo: check that is_bidirectional is actually 0 or 1 + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. + enum_range(NULL::pathway_mode_v)[pathway_mode] AS pathway_mode +) +FROM read_csv( + '${pathToPathways}', + header = true, + all_varchar = true, + types = { + pathway_mode: 'INTEGER', + is_bidirectional: 'INTEGER' + ${has_length ? `, length: 'REAL'` : ``} + ${has_traversal_time ? `, traversal_time: 'INTEGER'` : ``} + ${has_stair_count ? `, stair_count: 'INTEGER'` : ``} + ${has_max_slope ? `, max_slope: 'REAL'` : ``} + ${has_min_width ? `, min_width: 'REAL'` : ``} + ${has_signposted_as ? `, signposted_as: 'TEXT'` : ``} + ${has_reversed_signposted_as ? `, reversed_signposted_as: 'TEXT'` : ``} + } +); -const formatPathwaysRow = (p) => { - let is_bidirectional - if (p.is_bidirectional === '0') is_bidirectional = 'false' - else if (p.is_bidirectional === '1') is_bidirectional = 'true' - else throw new Error('invalid is_bidirectional: ' + p.is_bidirectional) +-- For a primary key, DuckDB doesn't create an index automatically. 
+CREATE UNIQUE INDEX pathways_pathway_id ON pathways(pathway_id); +`) - return [ - p.pathway_id, - p.from_stop_id, - p.to_stop_id, - pathwayMode(p.pathway_mode), - is_bidirectional, - p.length, - p.traversal_time, - p.stair_count, - p.max_slope, - p.min_width, - p.signposted_as || null, - p.reversed_signposted_as || null, - ] + workingState.nrOfRowsByName.set('pathways', await queryNumberOfRows(db, 'pathways', opt)) } -const afterAll = (opt) => `\ -\\. - -${opt.postgraphile ? `\ -CREATE INDEX ON "${opt.schema}".pathways (from_stop_id); -CREATE INDEX ON "${opt.schema}".pathways (to_stop_id); -` : ''} -` - -module.exports = { - beforeAll, - formatRow: formatPathwaysRow, - afterAll, -} +module.exports = importData diff --git a/lib/prerequisites.js b/lib/prerequisites.js index d923789..54ec918 100644 --- a/lib/prerequisites.js +++ b/lib/prerequisites.js @@ -1,92 +1,69 @@ 'use strict' -const is_valid_lang_code = { - beforeAll: (opt) => `\ --- Unfortunately information_schema.collations.collation_name only has --- identifiers with "_", not with "-", so we use pg_collation instead. --- https://www.postgresql.org/docs/current/infoschema-collations.html --- https://www.postgresql.org/docs/current/catalog-pg-collation.html --- todo [breaking]: rename to e.g. is_similar_to_bcp_47_tag? -CREATE OR REPLACE FUNCTION "${opt.schema}".is_bcp_47_tag( - input TEXT -) -RETURNS BOOLEAN -AS $$ - SELECT EXISTS ( - SELECT collctype - FROM pg_collation - WHERE ${opt.lowerCaseLanguageCodes ? `lower(collctype)` : `collctype`} = ${opt.lowerCaseLanguageCodes ? `lower(input)` : `input`} - OR ${opt.lowerCaseLanguageCodes ? `lower(collname)` : `collname`} = ${opt.lowerCaseLanguageCodes ? `lower(input)` : `input`} - OR ${opt.lowerCaseLanguageCodes ? `lower(collname)` : `collname`} = ${opt.lowerCaseLanguageCodes ? `lower(input)` : `input`} || '-x-icu' - LIMIT 1 - ); -$$ language sql STABLE; - -${opt.postgraphile ? `\ -COMMENT ON FUNCTION "${opt.schema}".is_bcp_47_tag IS E'@omit'; -` : ''} +const RUN = require('./run.js') --- todo [breaking]: remove -CREATE OR REPLACE FUNCTION "${opt.schema}".is_valid_lang_code( - input TEXT -) -RETURNS BOOLEAN -AS $$ - -- todo: see also https://github.com/MobilityData/gtfs-validator/issues/1987 - SELECT "${opt.schema}".is_bcp_47_tag(input); -$$ language sql STABLE; +const valid_lang_codes = async (db, _, opt) => { + await db[RUN](`\ +INSTALL icu; -- todo: make install optional? +LOAD icu; -${opt.postgraphile ? `\ -COMMENT ON FUNCTION "${opt.schema}".is_valid_lang_code IS E'@omit'; -` : ''} -`, +-- todo: once https://github.com/MobilityData/gtfs-validator/issues/1987 is solved, adapt this code +-- Unfortunately pragma_collations().collname only has +-- identifiers with "_", not with "-", so we use pg_collation instead. +-- see also https://duckdb.org/docs/sql/expressions/collations#icu-collations +-- todo: Also, entries like "de_DE" are missing. +CREATE TABLE valid_lang_codes ( + -- As of DuckDB v1.2.0, referring to this table via either a subquery or a plain foreign key doesn't work because + -- - subqueries are prohibited in CHECK constraints, and + -- - the foreign key doesn't seem to work with a NOCASE primary key. + -- This is why we use a case-sensitive primary key and unnest() to enumerate all (relevant) casings ourselves. 
+ lang_code TEXT PRIMARY KEY, +); +INSERT INTO valid_lang_codes +SELECT * +FROM ( + SELECT + unnest([ + collname, + CASE WHEN contains(collname, '-') THEN + concat_ws('-', split_part(collname, '-', 1), upper(split_part(collname, '-', 2))) + ELSE + NULL + END + ]) AS lang_code + FROM ( + SELECT + replace(collname, '_', '-') AS collname + FROM pragma_collations() + ) t +) t +WHERE lang_code IS NOT NULL; +`) } -const is_timezone = { - beforeAll: (opt) => `\ --- https://justatheory.com/2007/11/postgres-timezone-validation/ -CREATE OR REPLACE FUNCTION "${opt.schema}".is_timezone( - tz TEXT -) -RETURNS BOOLEAN -AS $$ - DECLARE - date TIMESTAMPTZ; - BEGIN - date := now() AT TIME ZONE tz; - RETURN TRUE; - EXCEPTION WHEN invalid_parameter_value THEN - RETURN FALSE; - END; -$$ language plpgsql STABLE; +valid_lang_codes.runDespiteMissingSrcFile = true -${opt.postgraphile ? `\ -COMMENT ON FUNCTION "${opt.schema}".is_timezone IS E'@omit'; -` : ''} -`, -} -const shape_exists = { - beforeAll: (opt) => `\ -CREATE OR REPLACE FUNCTION "${opt.schema}".shape_exists( - some_shape_id TEXT -) -RETURNS BOOLEAN -AS $$ - SELECT EXISTS ( - SELECT shape_id - FROM "${opt.schema}".shapes - WHERE shape_id = some_shape_id - LIMIT 1 - ); -$$ language sql STABLE; +const valid_timezones = async (db, _, opt) => { + // DuckDB v0.10: "subqueries prohibited in CHECK constraints" + // > CONSTRAINT valid_timezone CHECK (is_timezone(agency_timezone)) + // or inlined: + // > CONSTRAINT valid_timezone CHECK (EXISTS(SELECT name FROM pg_timezone_names() WHERE name = agency_timezone)) + // so we create a helper table instead + await db[RUN](`\ +INSTALL icu; -- todo: make install optional? +LOAD icu; -${opt.postgraphile ? `\ -COMMENT ON FUNCTION "${opt.schema}".shape_exists IS E'@omit'; -` : ''} -`, +CREATE TABLE valid_timezones( + tz TEXT PRIMARY KEY +); +INSERT INTO valid_timezones ( + SELECT name AS tz + FROM pg_timezone_names() +); +`) } +valid_timezones.runDespiteMissingSrcFile = true module.exports = { - is_valid_lang_code, - is_timezone, - shape_exists, + valid_lang_codes, + valid_timezones, } diff --git a/lib/routes.js b/lib/routes.js index 999ca80..7485f53 100644 --- a/lib/routes.js +++ b/lib/routes.js @@ -1,6 +1,8 @@ 'use strict' -const DataError = require('./data-error') +// const DataError = require('./data-error') +const RUN = require('./run.js') +const {queryNumberOfRows} = require('./rows-count.js') // Google's "Extended GTFS Route Types" // https://developers.google.com/transit/gtfs/reference/extended-route-types @@ -233,14 +235,29 @@ const routeTypesSchemes = Object.assign(Object.create(null), { }) // https://gtfs.org/documentation/schedule/reference/#routestxt -const beforeAll = (opt) => { +const importData = async (db, pathToRoutes, opt, workingState) => { if (!(opt.routeTypesScheme in routeTypesSchemes)) { throw new Error(`invalid opt.routeTypesScheme, must be one of these: ${Object.keys(routeTypesSchemes).join(', ')}.`) } const extRouteTypes = routeTypesSchemes[opt.routeTypesScheme] - return `\ -CREATE TYPE "${opt.schema}".route_type_val AS ENUM ( + // The GTFS spec allows routes.agency_id to be empty/null if there is exactly one agency in the feed. + // It seems that GTFS has allowed this at least since 2016: + // https://github.com/google/transit/blame/217e9bf/gtfs/spec/en/reference.md#L544-L554 + const exactly1Agency = workingState.nrOfRowsByName.get('agency') === 1 + // todo: throw special error indicating an error in the input data? does the foreign key constraint achieve this implicitly? 
old code: + // throw new DataError( + // 'routes', + // 'agency_id must not be empty/null', + // [ + // 'The GTFS spec allows routes.agency_id to be empty/null only if there is exactly one agency in the feed.' + // ], + // ) + + const withAgencyFKey = !opt.routesWithoutAgencyId && !exactly1Agency + + await db[RUN](`\ +CREATE TYPE route_type_val AS ENUM ( -- basic types '0' -- 0 – Tram, Streetcar, Light rail. Any light rail or street level system within a metropolitan area. , '1' -- 1 – Subway, Metro. Any underground rail system within a metropolitan area. @@ -256,85 +273,51 @@ CREATE TYPE "${opt.schema}".route_type_val AS ENUM ( -- extended types ${extRouteTypes.map(([route_type, desc]) => `, '${route_type}' -- ${desc}`).join('\n')} ); -CREATE CAST ("${opt.schema}".route_type_val AS text) WITH INOUT AS IMPLICIT; --- todo [breaking]: use small table as enum? https://www.graphile.org/postgraphile/enums/#with-enum-tables -${opt.postgraphile ? `\ -COMMENT ON TYPE "${opt.schema}".route_type_val IS E'@enum\\n@enumName RouteType\\n'; -` : ''} -CREATE TABLE "${opt.schema}".routes ( +CREATE TABLE routes ( route_id TEXT PRIMARY KEY, - agency_id TEXT, - ${opt.routesWithoutAgencyId ? '' : `FOREIGN KEY (agency_id) REFERENCES "${opt.schema}".agency,`} + -- As of DuckDB v1.3.0, a foreign key constraint does not enforce non-NULL values. + agency_id TEXT ${withAgencyFKey ? `NOT NULL` : ''}, + ${withAgencyFKey + ? '' + : `FOREIGN KEY (agency_id) REFERENCES agency,` + } -- todo: Either route_short_name or route_long_name must be specified, or potentially both if appropriate. route_short_name TEXT, route_long_name TEXT, route_desc TEXT, - route_type "${opt.schema}".route_type_val NOT NULL, + route_type route_type_val NOT NULL, route_url TEXT, route_color TEXT, route_text_color TEXT, route_sort_order INT ); -COPY "${opt.schema}".routes ( - route_id, - agency_id, - route_short_name, - route_long_name, - route_desc, - route_type, - route_url, - route_color, - route_text_color, - route_sort_order -) FROM STDIN csv; -` -} - -const formatRoutesRow = (r, opt, workingState) => { - const agency_id = r.agency_id || null - if (agency_id === null && !opt.routesWithoutAgencyId) { - // The GTFS spec allows routes.agency_id to be empty/null if there is exactly one agency in the feed. - // It seems that GTFS has allowed this at least since 2016: - // https://github.com/google/transit/blame/217e9bf/gtfs/spec/en/reference.md#L544-L554 - if (workingState.nrOfRowsByName.get('agency') !== 1) { - // todo: throw special error indicating an error in the input data - throw new DataError( - 'routes', - 'agency_id must not be empty/null', - [ - 'The GTFS spec allows routes.agency_id to be empty/null only if there is exactly one agency in the feed.' - ], - ) - } +INSERT INTO routes +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT * +FROM read_csv( + '${pathToRoutes}', + header = true, + -- > This option allows you to specify the types that the sniffer will use when detecting CSV column types. + -- > default: SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR + -- We omit BOOLEAN because GTFS just uses integers for boolean-like fields (e.g. timepoint in trips.txt). + -- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently. 
+ auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'], + types = { + route_type: 'TEXT', } +); - return [ - r.route_id || null, - agency_id, - r.route_short_name || null, - r.route_long_name || null, - r.route_desc || null, - r.route_type || null, - r.route_url || null, - r.route_color || null, - r.route_text_color || null, - r.route_sort_order ? parseInt(r.route_sort_order) : null, - ] -} - -const afterAll = (opt) => `\ -\\. +-- For a primary key, DuckDB doesn't create an index automatically. +CREATE UNIQUE INDEX routes_route_id ON routes(route_id); -CREATE INDEX ON "${opt.schema}".routes (route_short_name); -${opt.postgraphile ? `\ -CREATE INDEX ON "${opt.schema}".routes (agency_id); -` : ''} -` +CREATE INDEX routes_route_short_name ON routes (route_short_name); +`) -module.exports = { - beforeAll, - formatRow: formatRoutesRow, - afterAll, + workingState.nrOfRowsByName.set('routes', await queryNumberOfRows(db, 'routes', opt)) } + +module.exports = importData diff --git a/lib/rows-count.js b/lib/rows-count.js new file mode 100644 index 0000000..f88ebd2 --- /dev/null +++ b/lib/rows-count.js @@ -0,0 +1,15 @@ +'use strict' + +const GET = require('./get.js') + +const queryNumberOfRows = async (db, dbName, opt) => { + const [{count: nrOfRows}] = await db[GET](` + SELECT count(*) AS count + FROM "${dbName}" + `) + return nrOfRows +} + +module.exports = { + queryNumberOfRows, +} diff --git a/lib/run.js b/lib/run.js new file mode 100644 index 0000000..a155611 --- /dev/null +++ b/lib/run.js @@ -0,0 +1,5 @@ +'use strict' + +const RUN = Symbol('run') + +module.exports = RUN diff --git a/lib/service_days.js b/lib/service_days.js index 5df4b1f..d83267e 100644 --- a/lib/service_days.js +++ b/lib/service_days.js @@ -1,8 +1,21 @@ 'use strict' -const afterAll = (opt) => `\ +const RUN = require('./run.js') + +// https://gtfs.org/documentation/schedule/reference/#calendar_datestxt +const importData = async (db, _, opt, workingState) => { + await db[RUN](`\ +-- DuckDB currently has no materialized views, only tables. +-- see https://github.com/duckdb/duckdb/discussions/3638#discussioncomment-2801284 +-- todo: what if i modify calendar/calendar_dates? define triggers? -- todo [breaking]: rename to service_dates? 
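For context on the service_days INSERT below: DuckDB's generate_series(), used as a scalar function, returns a list, so it needs unnest() to produce one row per day. A minimal sketch with hypothetical dates:

SELECT unnest(generate_series(
	'2025-05-25'::TIMESTAMP,
	'2025-05-27'::TIMESTAMP,
	'1 day'::INTERVAL
)) AS "date";
-- 2025-05-25 00:00:00
-- 2025-05-26 00:00:00
-- 2025-05-27 00:00:00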
-CREATE MATERIALIZED VIEW "${opt.schema}".service_days AS +CREATE TABLE service_days ( + service_id TEXT NOT NULL, + date TIMESTAMP NOT NULL, + PRIMARY KEY (service_id, date) +); + +INSERT INTO service_days SELECT base_days.service_id, base_days.date @@ -16,7 +29,7 @@ FROM ( SELECT service_id, "date", - extract(dow FROM "date") dow, + date_part('dow', "date") dow, sunday, monday, tuesday, @@ -27,12 +40,12 @@ FROM ( FROM ( SELECT *, - generate_series( + unnest(generate_series( start_date::TIMESTAMP, end_date::TIMESTAMP, '1 day'::INTERVAL - ) "date" - FROM "${opt.schema}".calendar + )) "date" + FROM calendar ) all_days_raw ) all_days WHERE (sunday = 'available' AND dow = 0) @@ -47,7 +60,7 @@ FROM ( -- "removed" exceptions LEFT JOIN ( SELECT * - FROM "${opt.schema}".calendar_dates + FROM calendar_dates WHERE exception_type = 'removed' ) removed ON base_days.service_id = removed.service_id @@ -56,23 +69,20 @@ WHERE removed.date IS NULL -- "added" exceptions UNION SELECT service_id, "date" -FROM "${opt.schema}".calendar_dates +FROM calendar_dates WHERE exception_type = 'added' ORDER BY service_id, "date"; -CREATE UNIQUE INDEX ON "${opt.schema}".service_days (service_id, date); +CREATE UNIQUE INDEX service_days_unique_service_id_date ON service_days (service_id, date); -CREATE INDEX ON "${opt.schema}".service_days (service_id); -CREATE INDEX ON "${opt.schema}".service_days (date); +CREATE INDEX service_days_service_id ON service_days (service_id); +CREATE INDEX service_days_date ON service_days (date); -- apparently the unique index (service_id, date) doesn't speed up queries -CREATE INDEX ON "${opt.schema}".service_days (service_id, date); +CREATE INDEX service_days_service_id_date ON service_days (service_id, date); +`) +} -${opt.postgraphile ? `\ -COMMENT ON MATERIALIZED VIEW "${opt.schema}".service_days IS E'@name serviceDates\\n@primaryKey service_id,date'; -` : ''} -` +importData.runDespiteMissingSrcFile = true -module.exports = { - afterAll, -} +module.exports = importData diff --git a/lib/shapes.js b/lib/shapes.js index 7418dcb..7f19d9f 100644 --- a/lib/shapes.js +++ b/lib/shapes.js @@ -1,63 +1,81 @@ 'use strict' -// https://gtfs.org/documentation/schedule/reference/#shapestxt -const beforeAll = (opt) => `\ -CREATE TABLE "${opt.schema}".shapes ( - id SERIAL PRIMARY KEY, - shape_id TEXT, - shape_pt_sequence INT, - shape_pt_loc geography(POINT), - shape_dist_traveled REAL -); +const GET = require('./get.js') +const RUN = require('./run.js') +const {queryIfColumnsExist} = require('./columns.js') -COPY "${opt.schema}".shapes ( - shape_id, - shape_pt_loc, - shape_pt_sequence, - shape_dist_traveled -) FROM STDIN csv; -` - -const formatShapesRow = (s) => { - return [ - s.shape_id || null, - `POINT(${parseFloat(s.shape_pt_lon)} ${parseFloat(s.shape_pt_lat)})`, - s.shape_pt_sequence ? parseInt(s.shape_pt_sequence) : null, - s.shape_dist_traveled ? parseInt(s.shape_dist_traveled) : null, - ] -} +// https://gtfs.org/documentation/schedule/reference/#shapestxt +const importData = async (db, pathToShapes, opt, workingState) => { + // shape_dist_traveled is optional, so the entire column can be missing. + // It seems like, as of DuckDB v1.0.0, there is no way to assign default values to missing columns, neither with read_csv() nor with a nested subquery. + // This is why we check the file first and then programmatically determine the set of SELECT-ed columns below. 
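The column check mentioned above boils down to the DESCRIBE query in lib/columns.js: if shapes.txt lacks shape_dist_traveled, DESCRIBE simply returns no row for that column. A sketch, with a hypothetical file path:

DESCRIBE (
	SELECT *
	FROM read_csv('gtfs/shapes.txt', header = true)
	LIMIT 1
);
-- one row per CSV column (column_name, column_type, ...);
-- queryIfColumnsExist() checks the requested names against this list.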
+ const { + shape_dist_traveled: has_shape_dist_traveled, + } = await queryIfColumnsExist(db, pathToShapes, [ + 'shape_dist_traveled', + ]) -const afterAll = (opt) => `\ -\\. + // todo: why does extracting `Count` directly work here and not with other files? + const [ + {Count: nrOfShapes}, + ] = await db[GET](`\ +INSTALL spatial; -- todo: make install optional? +LOAD spatial; -CREATE INDEX shapes_by_shape_id ON "${opt.schema}".shapes (shape_id); -CREATE INDEX ON "${opt.schema}".shapes (shape_id, shape_pt_sequence); +CREATE TABLE shapes ( + shape_id TEXT PRIMARY KEY, + shape GEOMETRY, + distances_travelled REAL[] +); -CREATE OR REPLACE VIEW "${opt.schema}".shapes_aggregated AS +INSERT INTO shapes +-- WITH +-- csv_columns AS ( +-- SELECT list(column_name) AS cols +-- FROM ( +-- DESCRIBE ( +-- SELECT * +-- FROM read_csv( +-- 'node_modules/sample-gtfs-feed/gtfs/shapes.txt', +-- header = true +-- ) +-- ) +-- ) columns +-- ), +-- table_columns AS ( +-- SELECT list(column_name) +-- FROM ( +-- DESCRIBE shapes +-- ) columns +-- ) +-- SELECT COLUMNS(x -> x IN (SELECT cols FROM csv_columns)) SELECT - shape_id, - array_agg(shape_dist_traveled) AS distances_travelled, - -- todo [breaking]: make this a geography! - ST_MakeLine(array_agg(shape_pt_loc)) AS shape + any_value(shape_id) AS shape_id, + ST_MakeLine(array_agg(ST_Point(shape_pt_lon, shape_pt_lat))) AS shape, + ${has_shape_dist_traveled ? `array_agg(shape_dist_traveled)` : `NULL`} AS distances_travelled FROM ( - SELECT - shape_id, - shape_dist_traveled, - ST_AsText(shape_pt_loc)::geometry AS shape_pt_loc - FROM "${opt.schema}".shapes - ORDER by shape_id, shape_pt_sequence -) shapes + SELECT * + FROM read_csv( + '${pathToShapes}', + header = true, + -- > This option allows you to specify the types that the sniffer will use when detecting CSV column types. + -- > default: SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR + -- We omit BOOLEAN because GTFS just uses integers for boolean-like fields (e.g. timepoint in trips.txt). + -- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently. + auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'] + ) + ORDER BY shape_id, shape_pt_sequence +) t GROUP BY shape_id; +`) + + await db[RUN](`\ +-- For a primary key, DuckDB doesn't create an index automatically. +CREATE UNIQUE INDEX shapes_shape_id ON shapes(shape_id); +`) -${opt.postgraphile ? `\ -COMMENT ON TABLE "${opt.schema}".shapes IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".shapes.id IS E'@omit'; -COMMENT ON VIEW "${opt.schema}".shapes_aggregated IS E'@name shapes\\n@primaryKey shape_id'; -` : ''} -` - -module.exports = { - beforeAll, - formatRow: formatShapesRow, - afterAll, + // Note: This is not the number of shapes.txt rows! 
+ workingState.nrOfRowsByName.set('shapes', nrOfShapes) } + +module.exports = importData diff --git a/lib/stats_active_trips_by_hour.js b/lib/stats_active_trips_by_hour.js index e369261..9329217 100644 --- a/lib/stats_active_trips_by_hour.js +++ b/lib/stats_active_trips_by_hour.js @@ -1,9 +1,15 @@ 'use strict' -const afterAll = (opt) => { +const {fail} = require('assert') +const RUN = require('./run.js') + +const createStatsActiveTripsByHourView = async (db, _, opt) => { let materialized = false if (opt.statsActiveTripsByHour === 'materialized-view') { - materialized = true + // todo: support it once DuckDB supports materialized views + // see also https://github.com/duckdb/duckdb/discussions/3638 + fail('opt.statsActiveTripsByHour: materialized-view is currently not supported') + // materialized = true } else if (opt.statsActiveTripsByHour !== 'view') { throw new Error('invalid opt.statsActiveTripsByHour, must be one of these: none, view, materialized-view.') } @@ -11,45 +17,47 @@ const afterAll = (opt) => { ? `CREATE MATERIALIZED VIEW` : `CREATE OR REPLACE VIEW` - return `\ -CREATE MATERIALIZED VIEW "${opt.schema}".feed_time_frame AS + await db[RUN](`\ +-- todo: use materialized view once DuckDB supports that +-- see also https://github.com/duckdb/duckdb/discussions/3638 +CREATE TABLE feed_time_frame AS WITH dates AS ( SELECT min("date") AS min, max("date") AS max - FROM "${opt.schema}".service_days + FROM service_days ), date_offset AS ( - SELECT greatest( - "${opt.schema}".largest_arrival_time(), - "${opt.schema}".largest_departure_time() - ) AS o + SELECT + largest AS o + FROM largest_arr_dep_time ), date_min_max AS ( SELECT + -- todo date_trunc('day', min + o) AS min, date_trunc('day', max - o) AS max FROM dates, date_offset ), min_dep AS ( SELECT min("t_departure") AS t - FROM "${opt.schema}".arrivals_departures, date_min_max + FROM arrivals_departures, date_min_max WHERE date <= (SELECT min FROM date_min_max) ), min_arr AS ( SELECT min("t_arrival") AS t - FROM "${opt.schema}".arrivals_departures, date_min_max + FROM arrivals_departures, date_min_max WHERE date <= (SELECT min FROM date_min_max) ), max_dep AS ( SELECT min("t_departure") AS t - FROM "${opt.schema}".arrivals_departures, date_min_max + FROM arrivals_departures, date_min_max WHERE date >= (SELECT max FROM date_min_max) ), max_arr AS ( SELECT min("t_arrival") AS t - FROM "${opt.schema}".arrivals_departures, date_min_max + FROM arrivals_departures, date_min_max WHERE date >= (SELECT max FROM date_min_max) ) SELECT @@ -57,24 +65,23 @@ SELECT least(max_dep.t, max_arr.t) as max FROM min_dep, min_arr, max_dep, max_arr; -CREATE OR REPLACE FUNCTION "${opt.schema}".feed_time_series( - time_unit TEXT +CREATE OR REPLACE FUNCTION feed_time_series( + time_unit ) -RETURNS SETOF timestamptz -AS $$ +AS ( SELECT generate_series( date_trunc(time_unit, min), date_trunc(time_unit, max), ('1 ' || time_unit)::interval ) as t - FROM "${opt.schema}".feed_time_frame -$$ LANGUAGE sql STABLE; + FROM feed_time_frame +); -${createViewCmd} "${opt.schema}".stats_active_trips_by_hour AS +${createViewCmd} stats_active_trips_by_hour AS WITH all_hours AS NOT MATERIALIZED ( - SELECT "${opt.schema}".feed_time_series('hour') AS "hour" + SELECT unnest(feed_time_series('hour')) AS "hour" ) SELECT DISTINCT ON ("hour") "hour", @@ -86,23 +93,20 @@ FROM ( FROM ( SELECT * FROM all_hours - LEFT JOIN "${opt.schema}".connections ON ( + LEFT JOIN connections ON ( date_trunc('hour', t_departure) <= "hour" AND date_trunc('hour', t_arrival) >= "hour" ) ) t ) cons; +`) 
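The feed_time_series() macro defined above is not tied to the hourly view; as a usage sketch, it can be evaluated at any granularity that date_trunc() accepts, e.g. per day:

SELECT unnest(feed_time_series('day')) AS "day";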
-${materialized ? `\ -CREATE INDEX ON "${opt.schema}".stats_active_trips_by_hour ("hour"); -` : ''} - -${opt.postgraphile ? `\ -COMMENT ON${materialized ? ' MATERIALIZED' : ''} VIEW "${opt.schema}".stats_active_trips_by_hour IS E'@name hourlyActiveTripsStats\\n@primaryKey hour'; -` : ''} -` + if (materialized) { + await db[RUN](`\ +CREATE INDEX ON stats_active_trips_by_hour ("hour"); +`) + } } +createStatsActiveTripsByHourView.runDespiteMissingSrcFile = true -module.exports = { - afterAll, -} +module.exports = createStatsActiveTripsByHourView diff --git a/lib/stats_by_agency_route_stop_hour.js b/lib/stats_by_agency_route_stop_hour.js index c753d31..e937019 100644 --- a/lib/stats_by_agency_route_stop_hour.js +++ b/lib/stats_by_agency_route_stop_hour.js @@ -1,9 +1,15 @@ 'use strict' -const afterAll = (opt) => { +const {fail} = require('assert') +const RUN = require('./run.js') + +const createStatsByAgencyIdAndRouteIdAndStopAndHourView = async (db, _, opt) => { let materialized = false if (opt.statsByAgencyIdAndRouteIdAndStopAndHour === 'materialized-view') { - materialized = true + // todo: support it once DuckDB supports materialized views + // see also https://github.com/duckdb/duckdb/discussions/3638 + fail('opt.statsByAgencyIdAndRouteIdAndStopAndHour: materialized-view is currently not supported') + // materialized = true } else if (opt.statsByAgencyIdAndRouteIdAndStopAndHour !== 'view') { throw new Error('invalid opt.statsByAgencyIdAndRouteIdAndStopAndHour, must be one of these: none, view, materialized-view.') } @@ -11,28 +17,25 @@ const afterAll = (opt) => { ? `CREATE MATERIALIZED VIEW` : `CREATE OR REPLACE VIEW` - return `\ -${createViewCmd} "${opt.schema}".stats_by_agency_route_stop_hour AS + await db[RUN](`\ +${createViewCmd} stats_by_agency_route_stop_hour AS SELECT DISTINCT ON (agency_id, route_id, stop_id, effective_hour) agency_id, route_id, stop_id, station_id, "date" as service_date, date_trunc('hour', t_arrival) AS effective_hour, count(*) OVER (PARTITION BY route_id, stop_id, date_trunc('hour', t_arrival)) AS nr_of_arrs -FROM "${opt.schema}".arrivals_departures; - -${materialized ? `\ -CREATE INDEX ON "${opt.schema}".stats_by_agency_route_stop_hour (route_id); -CREATE INDEX ON "${opt.schema}".stats_by_agency_route_stop_hour (stop_id); -CREATE INDEX ON "${opt.schema}".stats_by_agency_route_stop_hour (station_id); -CREATE INDEX ON "${opt.schema}".stats_by_agency_route_stop_hour (effective_hour); -` : ''} +FROM arrivals_departures; +`) -${opt.postgraphile ? `\ -COMMENT ON${materialized ? 
' MATERIALIZED' : ''} VIEW "${opt.schema}".stats_by_agency_route_stop_hour IS E'@name hourlyStats\\n@primaryKey route_id,stop_id,effective_hour\\n@foreignKey (route_id) references routes|@fieldName route|@foreignFieldName statsByStopIdAndHour\\n@foreignKey (stop_id) references stops|@fieldName stop|@foreignFieldName statsByRouteIdAndHour'; -` : ''} -` + if (materialized) { + await db[RUN](`\ +CREATE INDEX ON stats_by_agency_route_stop_hour (route_id); +CREATE INDEX ON stats_by_agency_route_stop_hour (stop_id); +CREATE INDEX ON stats_by_agency_route_stop_hour (station_id); +CREATE INDEX ON stats_by_agency_route_stop_hour (effective_hour); +`) + } } +createStatsByAgencyIdAndRouteIdAndStopAndHourView.runDespiteMissingSrcFile = true -module.exports = { - afterAll, -} +module.exports = createStatsByAgencyIdAndRouteIdAndStopAndHourView diff --git a/lib/stats_by_route_date.js b/lib/stats_by_route_date.js index ff816f9..eaec758 100644 --- a/lib/stats_by_route_date.js +++ b/lib/stats_by_route_date.js @@ -1,9 +1,15 @@ 'use strict' -const afterAll = (opt) => { +const {fail} = require('assert') +const RUN = require('./run.js') + +const createStatsByRouteAndDateView = async (db, _, opt) => { let materialized = false if (opt.statsByRouteIdAndDate === 'materialized-view') { - materialized = true + // todo: support it once DuckDB supports materialized views + // see also https://github.com/duckdb/duckdb/discussions/3638 + fail('opt.statsByRouteIdAndDate: materialized-view is currently not supported') + // materialized = true } else if (opt.statsByRouteIdAndDate !== 'view') { throw new Error('invalid opt.statsByRouteIdAndDate, must be one of these: none, view, materialized-view.') } @@ -11,15 +17,15 @@ const afterAll = (opt) => { ? `CREATE MATERIALIZED VIEW` : `CREATE OR REPLACE VIEW` - return `\ -${createViewCmd} "${opt.schema}".stats_by_route_date AS + await db[RUN](`\ +${createViewCmd} stats_by_route_date AS WITH arrs_deps_with_svc_date AS NOT MATERIALIZED ( SELECT route_id, stop_sequence_consec, "date"::date AS svc_date, EXTRACT(DOW FROM "date") AS svc_dow - FROM "${opt.schema}".arrivals_departures + FROM arrivals_departures ), by_svc_date AS NOT MATERIALIZED ( SELECT DISTINCT ON (route_id, svc_date) @@ -35,7 +41,7 @@ WITH route_id, stop_sequence_consec, coalesce(t_departure, t_arrival)::date AS effective_date, EXTRACT(DOW FROM coalesce(t_departure, t_arrival)) AS effective_dow - FROM "${opt.schema}".arrivals_departures + FROM arrivals_departures ), by_effective_date AS NOT MATERIALIZED ( SELECT DISTINCT ON (route_id, effective_date) @@ -55,20 +61,16 @@ SELECT *, False AS is_effective FROM by_svc_date; - -${materialized ? `\ -CREATE INDEX ON "${opt.schema}".stats_by_route_date (route_id); -CREATE INDEX ON "${opt.schema}".stats_by_route_date ("date"); -CREATE INDEX ON "${opt.schema}".stats_by_route_date (route_id, "date", is_effective); -CREATE INDEX ON "${opt.schema}".stats_by_route_date (route_id, dow, is_effective); -` : ''} - -${opt.postgraphile ? `\ -COMMENT ON${materialized ? 
' MATERIALIZED' : ''} VIEW "${opt.schema}".stats_by_route_date IS E'@name routeStats\\n@primaryKey route_id,date,is_effective\\n@foreignKey (route_id) references routes|@fieldName route|@foreignFieldName statsByDate'; -` : ''} -` +`) + if (materialized) { + await db[RUN](`\ +CREATE INDEX ON stats_by_route_date (route_id); +CREATE INDEX ON stats_by_route_date ("date"); +CREATE INDEX ON stats_by_route_date (route_id, "date", is_effective); +CREATE INDEX ON stats_by_route_date (route_id, dow, is_effective); +`) + } } +createStatsByRouteAndDateView.runDespiteMissingSrcFile = true -module.exports = { - afterAll, -} +module.exports = createStatsByRouteAndDateView diff --git a/lib/stop_times.js b/lib/stop_times.js index c58acab..db9d75e 100644 --- a/lib/stop_times.js +++ b/lib/stop_times.js @@ -1,190 +1,169 @@ 'use strict' -const {formatTime} = require('./util') +const RUN = require('./run.js') +const {queryIfColumnsExist} = require('./columns.js') +const {queryNumberOfRows} = require('./rows-count.js') // https://gtfs.org/documentation/schedule/reference/#stop_timestxt -const beforeAll = (opt) => `\ -CREATE TYPE "${opt.schema}".pickup_drop_off_type AS ENUM ( +const importData = async (db, pathToStopTimes, opt, workingState) => { + // timepoint & shape_dist_traveled are optional, so the entire columns can be missing. + // It seems like, as of DuckDB v1.0.0, there is no way to assign default values to missing columns, neither with read_csv() nor with a nested subquery. + // todo: github ticket? + // This is why we check the file first and then programmatically determine the set of SELECT-ed columns below. + const { + shape_dist_traveled: has_shape_dist_traveled, + timepoint: has_timepoint, + } = await queryIfColumnsExist(db, pathToStopTimes, [ + 'shape_dist_traveled', + 'timepoint', + ]) + + await db[RUN](`\ +CREATE TYPE pickup_drop_off_type AS ENUM ( 'regular' -- 0 or empty - Regularly scheduled pickup/dropoff. , 'not_available' -- 1 – No pickup/dropoff available. , 'call' -- 2 – Must phone agency to arrange pickup/dropoff. , 'driver' -- 3 – Must coordinate with driver to arrange pickup/dropoff. ); -CREATE CAST ("${opt.schema}".pickup_drop_off_type AS text) WITH INOUT AS IMPLICIT; +-- CREATE CAST (pickup_drop_off_type AS text) WITH INOUT AS IMPLICIT; -CREATE TYPE "${opt.schema}".timepoint_v AS ENUM ( +CREATE TYPE timepoint_v AS ENUM ( 'approximate' -- 0 – Times are considered approximate. , 'exact' -- 1 or empty - Times are considered exact. ); -CREATE CAST ("${opt.schema}".timepoint_v AS text) WITH INOUT AS IMPLICIT; +-- CREATE CAST (timepoint_v AS text) WITH INOUT AS IMPLICIT; -CREATE TABLE "${opt.schema}".stop_times ( +CREATE TABLE stop_times ( trip_id TEXT NOT NULL, - FOREIGN KEY (trip_id) REFERENCES "${opt.schema}".trips, + FOREIGN KEY (trip_id) REFERENCES trips, -- https://gist.github.com/derhuerst/574edc94981a21ef0ce90713f1cff7f6 arrival_time INTERVAL, departure_time INTERVAL, stop_id TEXT NOT NULL, - FOREIGN KEY (stop_id) REFERENCES "${opt.schema}".stops, + FOREIGN KEY (stop_id) REFERENCES stops, stop_sequence INT NOT NULL, stop_sequence_consec INT, stop_headsign TEXT, - pickup_type "${opt.schema}".pickup_drop_off_type, - drop_off_type "${opt.schema}".pickup_drop_off_type, + pickup_type pickup_drop_off_type, -- todo: NOT NULL & ifnull() + drop_off_type pickup_drop_off_type, -- todo: NOT NULL & ifnull() shape_dist_traveled REAL, - timepoint "${opt.schema}".timepoint_v, - -- Used to implement frequencies.txt. Filled after COPY-ing, see below. 
- trip_start_time INTERVAL + timepoint timepoint_v, + -- Used to implement frequencies.txt. Filled below. + trip_start_time INTERVAL, + PRIMARY KEY (trip_id, stop_sequence) ); -COPY "${opt.schema}".stop_times ( - trip_id, - arrival_time, - departure_time, - stop_id, - stop_sequence, - stop_headsign, - pickup_type, - drop_off_type, - shape_dist_traveled, - timepoint -) FROM STDIN csv; -` - -const pickupDropOffType = (val) => { - if (val === '0') return 'regular' - if (val === '1') return 'not_available' - if (val === '2') return 'call' - if (val === '3') return 'driver' - throw new Error('invalid/unsupported pickup_type/drop_off_type: ' + val) -} - -const timepoint = (val) => { - if (val === '0') return 'approximate' - if (val === '1') return 'exact' - throw new Error('invalid/unsupported timepoint_v: ' + val) -} - -const formatStopTimesRow = (s) => { - const arrTime = s.arrival_time - ? formatTime(s.arrival_time) - : null - const depTime = s.departure_time - ? formatTime(s.departure_time) - : null - - return [ - s.trip_id || null, - arrTime, - depTime, - s.stop_id || null, - s.stop_sequence ? parseInt(s.stop_sequence) : null, - s.stop_headsign || null, - s.pickup_type ? pickupDropOffType(s.pickup_type) : null, - s.drop_off_type ? pickupDropOffType(s.drop_off_type) : null, - s.shape_dist_traveled || null, - s.timepoint ? timepoint(s.timepoint) : null, - ] -} - -const afterAll = (opt) => `\ -\\. - --- trip_start_time is used to implement frequencies.txt. -UPDATE "${opt.schema}".stop_times --- This is ugly, but AFAICT there is no cleaner way. --- see also https://stackoverflow.com/a/4359354/1072129 -SET trip_start_time = t.trip_start_time -FROM ( - SELECT - -- todo: is frequencies.txt relative to 1st arrival_time or departure_time? - coalesce( - first_value(departure_time) OVER (PARTITION BY trip_id ORDER BY stop_sequence), - first_value(arrival_time) OVER (PARTITION BY trip_id ORDER BY stop_sequence) - ) AS trip_start_time, - trip_id, stop_sequence - FROM "${opt.schema}".stop_times -) AS t --- self-join -WHERE stop_times.trip_id = t.trip_id -AND stop_times.stop_sequence = t.stop_sequence; - -CREATE INDEX ON "${opt.schema}".stop_times (trip_id); -CREATE INDEX ON "${opt.schema}".stop_times (stop_id); - -${opt.postgraphile ? `\ -COMMENT ON COLUMN "${opt.schema}".stop_times.stop_sequence_consec IS E'@name stopSequenceConsecutive'; -` : ''} - -UPDATE "${opt.schema}".stop_times -SET stop_sequence_consec = t.seq -FROM ( - SELECT - row_number() OVER (PARTITION BY trip_id ORDER BY stop_sequence ASC)::integer - 1 AS seq, - trip_id, stop_sequence - FROM "${opt.schema}".stop_times -) AS t -WHERE "${opt.schema}".stop_times.trip_id = t.trip_id -AND "${opt.schema}".stop_times.stop_sequence = t.stop_sequence; - -CREATE INDEX ON "${opt.schema}".stop_times (stop_sequence_consec); -CREATE INDEX ON "${opt.schema}".stop_times (trip_id, stop_sequence_consec); -CREATE INDEX ON "${opt.schema}".stop_times (arrival_time DESC NULLS LAST); -CREATE INDEX ON "${opt.schema}".stop_times (departure_time DESC NULLS LAST); --- todo: are these two necessary? -CREATE INDEX ON "${opt.schema}".stop_times (arrival_time); -CREATE INDEX ON "${opt.schema}".stop_times (departure_time); +INSERT INTO stop_times +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT + -- We stay compatible with PostgreSQL's row_number() here, which starts with 0. 
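
	-- (To be precise: row_number() itself is 1-based in both PostgreSQL and DuckDB; the "- 1" below is
	-- what keeps stop_sequence_consec 0-based, matching the previous PostgreSQL-based schema.
	-- Standalone check: SELECT row_number() OVER () - 1 AS i FROM range(3); -- yields 0, 1, 2)
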
+ row_number() OVER (PARTITION BY trip_id ORDER BY stop_sequence ASC) - 1 AS stop_sequence_consec, + ${has_shape_dist_traveled ? `` : `NULL AS shape_dist_traveled,`} + ${has_timepoint ? `` : `NULL AS timepoint,`} + * + REPLACE ( + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. + enum_range(NULL::pickup_drop_off_type)[drop_off_type + 1] AS drop_off_type, + enum_range(NULL::pickup_drop_off_type)[pickup_type + 1] AS pickup_type + ${has_timepoint ? `,enum_range(NULL::timepoint_v)[timepoint + 1] AS timepoint` : ''} + ), + -- todo: is frequencies.txt relative to 1st arrival_time or departure_time? + coalesce( + first_value(departure_time) OVER (PARTITION BY trip_id ORDER BY stop_sequence), + first_value(arrival_time) OVER (PARTITION BY trip_id ORDER BY stop_sequence) + ) AS trip_start_time +FROM read_csv( + '${pathToStopTimes}', + header = true, + all_varchar = true, + types = { + arrival_time: 'INTERVAL', + departure_time: 'INTERVAL', + stop_sequence: 'INTEGER', + pickup_type: 'INTEGER', + drop_off_type: 'INTEGER', + ${has_shape_dist_traveled ? `shape_dist_traveled: 'REAL',` : ``} + ${has_timepoint ? `timepoint: 'INTEGER',` : ``} + } +); -CREATE OR REPLACE FUNCTION "${opt.schema}".largest_departure_time () -RETURNS interval AS $$ - SELECT departure_time - FROM "${opt.schema}".stop_times - WHERE EXISTS ( - SELECT * - FROM "${opt.schema}".trips - JOIN "${opt.schema}".service_days ON service_days.service_id = trips.service_id - WHERE trips.trip_id = stop_times.trip_id - ) - ORDER BY departure_time DESC NULLS LAST - LIMIT 1; -$$ LANGUAGE SQL IMMUTABLE; -CREATE OR REPLACE FUNCTION "${opt.schema}".largest_arrival_time () -RETURNS interval AS $$ - SELECT arrival_time - FROM "${opt.schema}".stop_times - WHERE EXISTS ( - SELECT * - FROM "${opt.schema}".trips - JOIN "${opt.schema}".service_days ON service_days.service_id = trips.service_id - WHERE trips.trip_id = stop_times.trip_id +-- For a primary key, DuckDB doesn't create an index automatically. +CREATE UNIQUE INDEX stop_times_trip_id_stop_sequence ON stop_times(trip_id, stop_sequence); + +-- todo: are all of them beneficial/necessary? +CREATE INDEX stop_times_trip_id ON stop_times (trip_id); +CREATE INDEX stop_times_stop_id ON stop_times (stop_id); +CREATE INDEX stop_times_stop_sequence_consec ON stop_times (stop_sequence_consec); +CREATE INDEX stop_times_trip_id_stop_sequence_consec ON stop_times (trip_id, stop_sequence_consec); +-- As of DuckDB v1.0.0, indexes on INTERVAL columns are not supported yet. +-- todo: alternatively just change these columns to INTEGER? 
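
-- (Sketch of that INTEGER alternative, not implemented here: the columns could hold plain seconds
-- (e.g. 94500 for 26:15:00), which DuckDB can index, with epoch()/to_seconds() converting at the
-- boundaries, as already done for largest_arr_dep_time below:
--   SELECT epoch(INTERVAL '1 second' * 94500) AS departure_secs, -- 94500.0
--          to_seconds(94500) AS departure_time;                  -- 26:15:00 as an INTERVAL
-- )
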
+-- CREATE INDEX stop_times_arrival_time ON stop_times (arrival_time); +-- CREATE INDEX stop_times_departure_time ON stop_times (departure_time); + +-- todo: use materialized view once DuckDB supports that +-- see also https://github.com/duckdb/duckdb/discussions/3638 +CREATE TABLE largest_arr_dep_time AS +WITH + largest_departure_time AS ( + SELECT departure_time + FROM stop_times stop_times + WHERE EXISTS ( + SELECT * + FROM trips trips + JOIN service_days service_days ON service_days.service_id = trips.service_id + WHERE trips.trip_id = stop_times.trip_id + ) + ORDER BY departure_time DESC + LIMIT 1 + ), + largest_arrival_time AS ( + SELECT arrival_time + FROM stop_times stop_times + WHERE EXISTS ( + SELECT * + FROM trips trips + JOIN service_days service_days ON service_days.service_id = trips.service_id + WHERE trips.trip_id = stop_times.trip_id + ) + ORDER BY arrival_time DESC + LIMIT 1 ) - ORDER BY arrival_time DESC NULLS LAST - LIMIT 1; -$$ LANGUAGE SQL IMMUTABLE; -CREATE OR REPLACE FUNCTION "${opt.schema}".dates_filter_min ( - _timestamp TIMESTAMP WITH TIME ZONE +SELECT + to_seconds(greatest( + epoch(arrival_time), + epoch(departure_time) + )) AS largest +FROM largest_departure_time, largest_arrival_time; + +CREATE OR REPLACE FUNCTION dates_filter_min ( + _timestamp ) -RETURNS date AS $$ +AS ( SELECT date_trunc( 'day', - _timestamp - - GREATEST( - "${opt.schema}".largest_arrival_time(), - "${opt.schema}".largest_departure_time() - ) + _timestamp::TIMESTAMP WITH TIME ZONE + - largest -- we assume the DST <-> standard time shift is always <= 1h - '1 hour 1 second'::interval - ); -$$ LANGUAGE SQL IMMUTABLE; + )::DATE AS date_min + FROM largest_arr_dep_time +); -- This function doesn't do much, we just provide it to match date_filter_min(). -CREATE OR REPLACE FUNCTION "${opt.schema}".dates_filter_max ( - _timestamp TIMESTAMP WITH TIME ZONE +CREATE OR REPLACE FUNCTION dates_filter_max ( + _timestamp ) -RETURNS date AS $$ - SELECT date_trunc('day', _timestamp); -$$ LANGUAGE SQL IMMUTABLE; +AS ( + SELECT date_trunc( + 'day', + _timestamp::TIMESTAMP WITH TIME ZONE + )::DATE AS date_max +); -CREATE OR REPLACE VIEW "${opt.schema}".arrivals_departures AS +-- todo: add "ORDER BY stop_sequence_consec ASC" without affecting performance? +CREATE OR REPLACE VIEW arrivals_departures AS WITH stop_times_based AS NOT MATERIALIZED ( SELECT agency.agency_id, @@ -242,12 +221,12 @@ WITH stop_times_based AS NOT MATERIALIZED ( 'no_info_or_inherit' ) AS wheelchair_boarding FROM ( - "${opt.schema}".stop_times s - JOIN "${opt.schema}".stops ON s.stop_id = stops.stop_id - LEFT JOIN "${opt.schema}".stops stations ON stops.parent_station = stations.stop_id - JOIN "${opt.schema}".trips ON s.trip_id = trips.trip_id - JOIN "${opt.schema}".routes ON trips.route_id = routes.route_id - LEFT JOIN "${opt.schema}".agency ON ( + stop_times s + JOIN stops stops ON s.stop_id = stops.stop_id + LEFT JOIN stops stations ON stops.parent_station = stations.stop_id + JOIN trips trips ON s.trip_id = trips.trip_id + JOIN routes routes ON trips.route_id = routes.route_id + LEFT JOIN agency agency ON ( -- The GTFS spec allows routes.agency_id to be NULL if there is exactly one agency in the feed. -- Note: We implicitly rely on other parts of the code base to validate that agency has just one row! 
-- It seems that GTFS has allowed this at least since 2016: @@ -255,7 +234,7 @@ WITH stop_times_based AS NOT MATERIALIZED ( routes.agency_id IS NULL -- match first (and only) agency OR routes.agency_id = agency.agency_id -- match by ID ) - JOIN "${opt.schema}".service_days ON trips.service_id = service_days.service_id + JOIN service_days service_days ON trips.service_id = service_days.service_id ) -- todo: this slows down slightly -- ORDER BY route_id, s.trip_id, "date", stop_sequence @@ -263,74 +242,72 @@ WITH stop_times_based AS NOT MATERIALIZED ( -- stop_times-based arrivals/departures SELECT ( - encode(trip_id::bytea, 'base64') - || ':' || encode(( + to_base64(encode(trip_id)) + || ':' || to_base64(encode( extract(ISOYEAR FROM "date") || '-' || lpad(extract(MONTH FROM "date")::text, 2, '0') || '-' || lpad(extract(DAY FROM "date")::text, 2, '0') - )::bytea, 'base64') - || ':' || encode((stop_sequence::text)::bytea, 'base64') + )) + || ':' || to_base64(encode(stop_sequence::text)) -- frequencies_row - || ':' || encode('-1'::bytea, 'base64') + || ':' || to_base64(encode('-1')) -- frequencies_it - || ':' || encode('-1'::bytea, 'base64') + || ':' || to_base64(encode('-1')) ) as arrival_departure_id, - stop_times_based.*, -- todo: expose local arrival/departure "wall clock time"? -1 AS frequencies_row, - -1 AS frequencies_it + -1 AS frequencies_it, + + stop_times_based.* + EXCLUDE ( + arrival_time, + departure_time + ) FROM stop_times_based -UNION ALL +UNION ALL BY NAME -- frequencies-based arrivals/departures SELECT ( - encode(trip_id::bytea, 'base64') - || ':' || encode(( + to_base64(encode(trip_id)) + || ':' || to_base64(encode( extract(ISOYEAR FROM "date") || '-' || lpad(extract(MONTH FROM "date")::text, 2, '0') || '-' || lpad(extract(DAY FROM "date")::text, 2, '0') - )::bytea, 'base64') - || ':' || encode((stop_sequence::text)::bytea, 'base64') - || ':' || encode((frequencies_row::text)::bytea, 'base64') - || ':' || encode((frequencies_it::text)::bytea, 'base64') + )) + || ':' || to_base64(encode(stop_sequence::text)) + || ':' || to_base64(encode(frequencies_row::text)) + || ':' || to_base64(encode(frequencies_it::text)) ) as arrival_departure_id, * FROM ( SELECT - *, - row_number() OVER (PARTITION BY trip_id, "date", frequencies_row, stop_sequence_consec ORDER BY t_departure ASC)::integer AS frequencies_it + row_number() OVER (PARTITION BY trip_id, "date", frequencies_row, stop_sequence_consec ORDER BY t_departure ASC) AS frequencies_it, + * FROM ( SELECT - -- stop_times_based.* except t_arrival & t_departure, duh - -- todo: find a way to use all columns without explicitly enumerating them here - agency_id, - route_id, route_short_name, route_long_name, route_type, - trip_id, direction_id, trip_headsign, wheelchair_accessible, bikes_allowed, - service_id, - shape_id, - "date", - stop_sequence, stop_sequence_consec, - stop_headsign, pickup_type, drop_off_type, shape_dist_traveled, timepoint, - tz, - arrival_time, -- todo [breaking]: this is misleading, remove it - generate_series( - t_arrival - trip_start_time + start_time, - t_arrival - trip_start_time + end_time, - INTERVAL '1 second' * headway_secs - ) as t_arrival, - departure_time, -- todo [breaking]: this is misleading, remove it - generate_series( - t_departure - trip_start_time + start_time, - t_departure - trip_start_time + end_time, - INTERVAL '1 second' * headway_secs - ) as t_departure, - trip_start_time, - stop_id, stop_name, - station_id, station_name, - wheelchair_boarding, - frequencies_row + frequencies_based.* + 
EXCLUDE ( + arrival_time, + departure_time, + start_time, + end_time, + trip_start_time, + headway_secs + ) + REPLACE ( + unnest(generate_series( + t_arrival - trip_start_time + start_time, + t_arrival - trip_start_time + end_time, + INTERVAL '1 second' * headway_secs + )) as t_arrival, + unnest(generate_series( + t_departure - trip_start_time + start_time, + t_departure - trip_start_time + end_time, + INTERVAL '1 second' * headway_secs + )) as t_departure, + ) FROM ( SELECT stop_times_based.*, @@ -339,42 +316,30 @@ FROM ( frequencies.headway_secs, frequencies_row FROM stop_times_based - JOIN "${opt.schema}".frequencies ON frequencies.trip_id = stop_times_based.trip_id + JOIN frequencies frequencies ON frequencies.trip_id = stop_times_based.trip_id WHERE frequencies.exact_times = 'schedule_based' -- todo: is this correct? +) frequencies_based ) t -) t -) frequencies_based; - -CREATE OR REPLACE FUNCTION "${opt.schema}".arrival_departure_by_arrival_departure_id(id TEXT) -RETURNS "${opt.schema}".arrivals_departures -AS $$ - SELECT * - FROM "${opt.schema}".arrivals_departures - WHERE trip_id = convert_from(decode(split_part(id, ':', 1), 'base64'), 'UTF-8')::text - AND "date" = (convert_from(decode(split_part(id, ':', 2), 'base64'), 'UTF-8')::text)::timestamp - AND stop_sequence = (convert_from(decode(split_part(id, ':', 3), 'base64'), 'UTF-8')::text)::integer - AND (convert_from(decode(split_part(id, ':', 4), 'base64'), 'UTF-8')::text)::integer = frequencies_row - AND (convert_from(decode(split_part(id, ':', 5), 'base64'), 'UTF-8')::text)::integer = frequencies_it - -- todo: what if there are >1 rows? - LIMIT 1; -$$ LANGUAGE SQL STABLE STRICT; - -${opt.postgraphile ? `\ --- todo: currently named arrivalsDeparture, should be arrivalDeparture (but allArrivalsDeparturesList!) -COMMENT ON COLUMN "${opt.schema}".arrivals_departures.route_short_name IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".arrivals_departures.route_long_name IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".arrivals_departures.route_type IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".arrivals_departures.direction_id IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".arrivals_departures.trip_headsign IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".arrivals_departures.stop_name IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".arrivals_departures.station_name IS E'@omit'; --- > If you want to rename just one field or type, your best bet is to use a [@name] smart comment […]. --- > NOTE: this still uses the inflectors, but it pretends that the tables name is different, so the input to the inflectors differs. 
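
-- Worked example (values made up) of the generate_series()/unnest() expansion used above for
-- frequencies-based arrivals/departures: with trip_start_time = 08:00:00, a stop's absolute
-- t_departure = 2025-05-27 08:05:00 on the base run, and a frequencies row with
-- start_time = 06:00:00, end_time = 07:00:00, headway_secs = 1200,
-- "t_departure - trip_start_time + start_time" shifts the stop's offset into the headway window:
SELECT unnest(generate_series(
	TIMESTAMP '2025-05-27 06:05:00', -- t_departure - trip_start_time + start_time
	TIMESTAMP '2025-05-27 07:05:00', -- t_departure - trip_start_time + end_time
	INTERVAL '1 second' * 1200
)) AS t_departure;
-- yields 06:05, 06:25, 06:45 & 07:05
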
--- https://www.graphile.org/postgraphile/inflection/#overriding-naming---one-off -COMMENT ON VIEW "${opt.schema}".arrivals_departures IS E'@name arrival_departures\\n@primaryKey trip_id,date,stop_sequence,frequencies_row,frequencies_it\\n@foreignKey (route_id) references routes|@fieldName route\\n@foreignKey (trip_id) references trips|@fieldName trip\\n@foreignKey (stop_id) references stops|@fieldName stop\\n@foreignKey (station_id) references stops|@fieldName station'; -` : ''} - -CREATE OR REPLACE VIEW "${opt.schema}".connections AS +) t; + +-- CREATE OR REPLACE FUNCTION arrival_departure_by_arrival_departure_id(id TEXT) +-- RETURNS arrivals_departures +-- AS $$ +-- SELECT * +-- FROM arrivals_departures arrivals_departures +-- WHERE trip_id = decode(from_base64(split_part(id, ':', 1))) +-- AND "date" = decode(from_base64(split_part(id, ':', 2)))::timestamp +-- AND stop_sequence = decode(from_base64(split_part(id, ':', 3)))::integer +-- AND decode(from_base64(split_part(id, ':', 4)))::integer = frequencies_row +-- AND decode(from_base64(split_part(id, ':', 5)))::integer = frequencies_it +-- -- todo: what if there are >1 rows? +-- LIMIT 1; +-- $$ LANGUAGE SQL STABLE STRICT; +`) + + await db[RUN](`\ +-- todo: add "ORDER BY stop_sequence_consec ASC" without affecting performance? +CREATE OR REPLACE VIEW connections AS WITH stop_times_based AS NOT MATERIALIZED ( SELECT route_id, @@ -478,9 +443,9 @@ WITH stop_times_based AS NOT MATERIALIZED ( nullif(to_stations.wheelchair_boarding, 'no_info_or_inherit'), 'no_info_or_inherit' ) AS to_wheelchair_boarding - FROM "${opt.schema}".trips - LEFT JOIN "${opt.schema}".routes ON trips.route_id = routes.route_id - LEFT JOIN "${opt.schema}".agency ON ( + FROM trips trips + LEFT JOIN routes routes ON trips.route_id = routes.route_id + LEFT JOIN agency agency ON ( -- The GTFS spec allows routes.agency_id to be NULL if there is exactly one agency in the feed. -- Note: We implicitly rely on other parts of the code base to validate that agency has just one row! 
-- It seems that GTFS has allowed this at least since 2016: @@ -488,112 +453,86 @@ WITH stop_times_based AS NOT MATERIALIZED ( routes.agency_id IS NULL -- match first (and only) agency OR routes.agency_id = agency.agency_id -- match by ID ) - LEFT JOIN "${opt.schema}".stop_times ON trips.trip_id = stop_times.trip_id - LEFT JOIN "${opt.schema}".stops from_stops ON stop_times.stop_id = from_stops.stop_id - LEFT JOIN "${opt.schema}".stops from_stations ON from_stops.parent_station = from_stations.stop_id - INNER JOIN "${opt.schema}".stop_times to_stop_times ON stop_times.trip_id = to_stop_times.trip_id AND stop_times.stop_sequence_consec + 1 = to_stop_times.stop_sequence_consec - INNER JOIN "${opt.schema}".stops to_stops ON to_stop_times.stop_id = to_stops.stop_id - LEFT JOIN "${opt.schema}".stops to_stations ON to_stops.parent_station = to_stations.stop_id + LEFT JOIN stop_times stop_times ON trips.trip_id = stop_times.trip_id + LEFT JOIN stops from_stops ON stop_times.stop_id = from_stops.stop_id + LEFT JOIN stops from_stations ON from_stops.parent_station = from_stations.stop_id + INNER JOIN stop_times to_stop_times ON stop_times.trip_id = to_stop_times.trip_id AND stop_times.stop_sequence_consec + 1 = to_stop_times.stop_sequence_consec + INNER JOIN stops to_stops ON to_stop_times.stop_id = to_stops.stop_id + LEFT JOIN stops to_stations ON to_stops.parent_station = to_stations.stop_id ) trips JOIN ( SELECT * - FROM "${opt.schema}".service_days + FROM service_days service_days ORDER BY service_id, "date" ) service_days ON trips.service_id = service_days.service_id ) -- stop_times-based connections SELECT ( - encode(trip_id::bytea, 'base64') - || ':' || encode(( + to_base64(encode(trip_id)) + || ':' || to_base64(encode( extract(ISOYEAR FROM "date") || '-' || lpad(extract(MONTH FROM "date")::text, 2, '0') || '-' || lpad(extract(DAY FROM "date")::text, 2, '0') - )::bytea, 'base64') - || ':' || encode((from_stop_sequence::text)::bytea, 'base64') + )) + || ':' || to_base64(encode(from_stop_sequence::text)) -- frequencies_row - || ':' || encode('-1'::bytea, 'base64') + || ':' || to_base64(encode('-1')) -- frequencies_it - || ':' || encode('-1'::bytea, 'base64') + || ':' || to_base64(encode('-1')) ) as connection_id, - stop_times_based.*, - -1 AS frequencies_row, - -1 AS frequencies_it + -1 AS frequencies_it, + + stop_times_based.* + EXCLUDE ( + arrival_time, + departure_time + ) FROM stop_times_based -UNION ALL +UNION ALL BY NAME -- frequencies-based connections SELECT ( - encode(trip_id::bytea, 'base64') - || ':' || encode(( + to_base64(encode(trip_id)) + || ':' || to_base64(encode( extract(ISOYEAR FROM "date") || '-' || lpad(extract(MONTH FROM "date")::text, 2, '0') || '-' || lpad(extract(DAY FROM "date")::text, 2, '0') - )::bytea, 'base64') - || ':' || encode((from_stop_sequence::text)::bytea, 'base64') - || ':' || encode((frequencies_row::text)::bytea, 'base64') - || ':' || encode((frequencies_it::text)::bytea, 'base64') + )) + || ':' || to_base64(encode(from_stop_sequence::text)) + || ':' || to_base64(encode(frequencies_row::text)) + || ':' || to_base64(encode(frequencies_it::text)) ) as connection_id, - - frequencies_based.* + * FROM ( SELECT - *, - row_number() OVER (PARTITION BY trip_id, "date", frequencies_row, from_stop_sequence_consec ORDER BY t_departure ASC)::integer AS frequencies_it + row_number() OVER (PARTITION BY trip_id, "date", frequencies_row, from_stop_sequence_consec ORDER BY t_departure ASC) AS frequencies_it, + * FROM ( SELECT - -- stop_times_based.* except t_arrival & 
t_departure, duh - -- todo: find a way to use all columns without explicitly enumerating them here - route_id, route_short_name, route_long_name, route_type, - trip_id, - service_id, - direction_id, - trip_headsign, - wheelchair_accessible, - bikes_allowed, - trip_start_time, - - from_stop_id, - from_stop_name, - from_station_id, - from_station_name, - from_wheelchair_boarding, - - from_stop_headsign, - from_pickup_type, - generate_series( - t_departure - trip_start_time + start_time, - t_departure - trip_start_time + end_time, - INTERVAL '1 second' * headway_secs - ) as t_departure, - departure_time, -- todo [breaking]: this is misleading, remove it - from_stop_sequence, - from_stop_sequence_consec, - from_timepoint, - - "date", - - to_timepoint, - to_stop_sequence, - to_stop_sequence_consec, - generate_series( - t_arrival - trip_start_time + start_time, - t_arrival - trip_start_time + end_time, - INTERVAL '1 second' * headway_secs - ) as t_arrival, - arrival_time, -- todo [breaking]: this is misleading, remove it - to_drop_off_type, - to_stop_headsign, - - to_stop_id, - to_stop_name, - to_station_id, - to_station_name, - to_wheelchair_boarding, - - frequencies_row + frequencies_based.* + EXCLUDE ( + arrival_time, + departure_time, + start_time, + end_time, + trip_start_time, + headway_secs + ) + REPLACE ( + unnest(generate_series( + t_departure - trip_start_time + start_time, + t_departure - trip_start_time + end_time, + INTERVAL '1 second' * headway_secs + )) as t_departure, + unnest(generate_series( + t_arrival - trip_start_time + start_time, + t_arrival - trip_start_time + end_time, + INTERVAL '1 second' * headway_secs + )) as t_arrival + ) FROM ( SELECT stop_times_based.*, @@ -602,48 +541,28 @@ FROM ( frequencies.headway_secs, frequencies_row FROM stop_times_based - JOIN "${opt.schema}".frequencies ON frequencies.trip_id = stop_times_based.trip_id + JOIN frequencies frequencies ON frequencies.trip_id = stop_times_based.trip_id WHERE frequencies.exact_times = 'schedule_based' -- todo: is this correct? +) frequencies_based ) t -) t -) frequencies_based; - -CREATE OR REPLACE FUNCTION "${opt.schema}".connection_by_connection_id(id TEXT) -RETURNS "${opt.schema}".connections -AS $$ - SELECT * - FROM "${opt.schema}".connections - WHERE trip_id = convert_from(decode(split_part(id, ':', 1), 'base64'), 'UTF-8')::text - AND "date" = (convert_from(decode(split_part(id, ':', 2), 'base64'), 'UTF-8')::text)::timestamp - AND from_stop_sequence = (convert_from(decode(split_part(id, ':', 3), 'base64'), 'UTF-8')::text)::integer - AND (convert_from(decode(split_part(id, ':', 4), 'base64'), 'UTF-8')::text)::integer = frequencies_row - AND (convert_from(decode(split_part(id, ':', 5), 'base64'), 'UTF-8')::text)::integer = frequencies_it - -- todo: what if there are >1 rows? - LIMIT 1; -$$ LANGUAGE SQL STABLE STRICT; - -${opt.postgraphile ? `\ --- todo: currently named arrivalsDeparture, should be arrivalDeparture (but allArrivalsDeparturesList!) 
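
-- The arrival_departure_by_arrival_departure_id()/connection_by_connection_id() helpers are only
-- carried over as comments above. If they are needed, a DuckDB table macro might be able to express
-- the same lookup. A rough sketch using the base64 id layout from above (the
-- frequencies_row/frequencies_it checks are omitted for brevity):
CREATE OR REPLACE MACRO connection_by_connection_id(id) AS TABLE
	SELECT *
	FROM connections
	WHERE trip_id = decode(from_base64(split_part(id, ':', 1)))
	AND "date" = decode(from_base64(split_part(id, ':', 2)))::date
	AND from_stop_sequence = decode(from_base64(split_part(id, ':', 3)))::integer
	LIMIT 1;
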
--- todo: allow filtering based on stop and/or route and/or trip and/or time frame --- https://www.graphile.org/postgraphile/functions/#setof-functions---connections -COMMENT ON COLUMN "${opt.schema}".connections.route_short_name IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".connections.route_long_name IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".connections.route_type IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".connections.direction_id IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".connections.trip_headsign IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".connections.from_stop_name IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".connections.from_station_name IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".connections.to_stop_name IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".connections.to_station_name IS E'@omit'; -COMMENT ON VIEW "${opt.schema}".connections IS E'@primaryKey trip_id,date,from_stop_sequence,frequencies_row,frequencies_it\\n@foreignKey (route_id) references routes|@fieldName route\\n@foreignKey (trip_id) references trips|@fieldName trip\\n@foreignKey (from_stop_id) references stops|@fieldName fromStop\\n@foreignKey (from_station_id) references stops|@fieldName fromStation\\n@foreignKey (to_stop_id) references stops|@fieldName toStop\\n@foreignKey (to_station_id) references stops|@fieldName toStation'; -` : ''} -` - - - - -module.exports = { - beforeAll, - formatRow: formatStopTimesRow, - afterAll, +) t; + +-- CREATE OR REPLACE FUNCTION connection_by_connection_id(id TEXT) +-- RETURNS connections +-- AS $$ +-- SELECT * +-- FROM connections connections +-- WHERE trip_id = decode(from_base64(split_part(id, ':', 1))) +-- AND "date" = decode(from_base64(split_part(id, ':', 2)))::timestamp +-- AND from_stop_sequence = decode(from_base64(split_part(id, ':', 3)))::integer +-- AND decode(from_base64(split_part(id, ':', 4)))::integer = frequencies_row +-- AND decode(from_base64(split_part(id, ':', 5)))::integer = frequencies_it +-- -- todo: what if there are >1 rows? +-- LIMIT 1; +-- $$ LANGUAGE SQL STABLE STRICT; +`) + + workingState.nrOfRowsByName.set('stop_times', await queryNumberOfRows(db, 'stop_times', opt)) } + +module.exports = importData diff --git a/lib/stops.js b/lib/stops.js index 7e8d65f..c43c33e 100644 --- a/lib/stops.js +++ b/lib/stops.js @@ -1,15 +1,48 @@ 'use strict' +const RUN = require('./run.js') +const {queryIfColumnsExist} = require('./columns.js') +const {queryNumberOfRows} = require('./rows-count.js') + // https://gtfs.org/documentation/schedule/reference/#stopstxt -const beforeAll = (opt) => `\ -CREATE TYPE "${opt.schema}".location_type_val AS ENUM ( +const importData = async (db, pathToStops, opt, workingState) => { + // Several columns are optional, so they may be missing in a `read_csv()` result. + // It seems like, as of DuckDB v1.0.0, there is no way to assign default values to missing columns, neither with read_csv() nor with a nested subquery. + // todo: github ticket? + // This is why we check the file first and then programmatically determine the set of SELECT-ed columns below. 
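
	// (queryIfColumnsExist(), from lib/columns.js, reports which of the listed columns the CSV file
	// actually provides. For ad-hoc inspection, plain DuckDB SQL gives the same information, e.g.
	//   DESCRIBE SELECT * FROM read_csv('gtfs/stops.txt', header = true);
	// (the path is just an example), which returns one row per column found in the file.)
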
+ const { + stop_code: has_stop_code, + stop_desc: has_stop_desc, + zone_id: has_zone_id, + stop_url: has_stop_url, + location_type: has_location_type, + parent_station: has_parent_station, + stop_timezone: has_stop_timezone, + wheelchair_boarding: has_wheelchair_boarding, + level_id: has_level_id, + platform_code: has_platform_code, + } = await queryIfColumnsExist(db, pathToStops, [ + 'stop_code', + 'stop_desc', + 'zone_id', + 'stop_url', + 'location_type', + 'parent_station', + 'stop_timezone', + 'wheelchair_boarding', + 'level_id', + 'platform_code', + ]) + + await db[RUN](`\ +CREATE TYPE location_type_val AS ENUM ( 'stop' -- 0 (or blank): Stop (or Platform). A location where passengers board or disembark from a transit vehicle. Is called a platform when defined within a parent_station. , 'station' -- 1 – Station. A physical structure or area that contains one or more platform. , 'entrance_exit' -- 2 – Entrance/Exit. A location where passengers can enter or exit a station from the street. If an entrance/exit belongs to multiple stations, it can be linked by pathways to both, but the data provider must pick one of them as parent. , 'node' -- 3 – Generic Node. A location within a station, not matching any other location_type, which can be used to link together pathways define in pathways.txt. , 'boarding_area' -- 4 – Boarding Area. A specific location on a platform, where passengers can board and/or alight vehicles. ); -CREATE CAST ("${opt.schema}".location_type_val AS text) WITH INOUT AS IMPLICIT; +-- CREATE CAST (location_type_val AS text) WITH INOUT AS IMPLICIT; -- For parentless stops: -- 0 or empty - No accessibility information for the stop. @@ -25,102 +58,135 @@ CREATE CAST ("${opt.schema}".location_type_val AS text) WITH INOUT AS IMPLICIT; -- 0 or empty - Station entrance will inherit its wheelchair_boarding behavior from the parent station, if specified for the parent. -- 1 - Station entrance is wheelchair accessible. -- 2 - No accessible path from station entrance to stops/platforms. -CREATE TYPE "${opt.schema}".wheelchair_boarding_val AS ENUM ( +CREATE TYPE wheelchair_boarding_val AS ENUM ( 'no_info_or_inherit' , 'accessible' , 'not_accessible' ); -CREATE CAST ("${opt.schema}".wheelchair_boarding_val AS text) WITH INOUT AS IMPLICIT; +-- CREATE CAST (wheelchair_boarding_val AS text) WITH INOUT AS IMPLICIT; + +INSTALL spatial; -- todo: make install optional? +LOAD spatial; -CREATE TABLE "${opt.schema}".stops ( +CREATE TABLE stops ( stop_id TEXT PRIMARY KEY, stop_code TEXT, -- todo: Required for locations which are stops (location_type=0), stations (location_type=1) or entrances/exits (location_type=2). Optional for locations which are generic nodes (location_type=3) or boarding areas (location_type=4). stop_name TEXT, stop_desc TEXT, - stop_loc geography(POINT), -- stop_lat/stop_lon + stop_loc GEOMETRY, -- stop_lat/stop_lon zone_id TEXT, stop_url TEXT, - location_type "${opt.schema}".location_type_val, + location_type location_type_val, parent_station TEXT, - stop_timezone TEXT CHECK ("${opt.schema}".is_timezone(stop_timezone)), - wheelchair_boarding "${opt.schema}".wheelchair_boarding_val, + -- In stops.txt, *any* row's parent_station might reference *any* other row. Essentially, stops.txt describes a tree. + -- As of DuckDB v1.0.0, it *seems* like adding a foreign key constraint here doesn't work, even if we order the stops to put parents before their children (see below). + -- todo: Report this with DuckDB? Alternatively, add the constraint after the import (see below). 
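
	-- (With the spatial extension loaded above, stop_loc can later be queried using ST_* functions.
	-- A rough illustration, e.g. the 5 stops closest to a point; note that ST_Distance on plain
	-- GEOMETRY points is planar, i.e. measured in degrees here:
	--   SELECT stop_id, stop_name
	--   FROM stops
	--   ORDER BY ST_Distance(stop_loc, ST_Point(13.3872, 52.5200)) -- lon/lat, like ST_Point(stop_lon, stop_lat) below
	--   LIMIT 5;
	-- )
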
+ -- FOREIGN KEY (parent_station) REFERENCES stops, + stop_timezone TEXT, + FOREIGN KEY (stop_timezone) REFERENCES valid_timezones, + wheelchair_boarding wheelchair_boarding_val, level_id TEXT, - ${opt.stopsWithoutLevelId ? '' : `FOREIGN KEY (level_id) REFERENCES "${opt.schema}".levels,`} + ${opt.stopsWithoutLevelId ? '' : `FOREIGN KEY (level_id) REFERENCES levels,`} platform_code TEXT ); -COPY "${opt.schema}".stops ( - stop_id, - stop_code, - stop_name, - stop_desc, - stop_loc, - zone_id, - stop_url, - location_type, - parent_station, - stop_timezone, - wheelchair_boarding, - level_id, - platform_code -) FROM STDIN csv; -` - -const locationType = (val) => { - if (val === '0') return 'stop' - if (val === '1') return 'station' - if (val === '2') return 'entrance_exit' - if (val === '3') return 'node' - if (val === '4') return 'boarding_area' - throw new Error('invalid/unsupported location_type: ' + val) -} - -const wheelchairBoarding = (val) => { - if (val === '0') return 'no_info_or_inherit' - if (val === '1') return 'accessible' - if (val === '2') return 'not_accessible' - throw new Error('invalid/unsupported wheelchair_boarding: ' + val) -} +INSERT INTO stops +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +WITH RECURSIVE + stops AS ( + SELECT + ${has_stop_code ? `` : `NULL AS stop_code,`} + ${has_stop_desc ? `` : `NULL AS stop_desc,`} + ${has_zone_id ? `` : `NULL AS zone_id,`} + ${has_stop_url ? `` : `NULL AS stop_url,`} + ${has_location_type ? `` : `NULL AS location_type,`} + ${has_parent_station ? `` : `NULL AS parent_station,`} + ${has_stop_timezone ? `` : `NULL AS stop_timezone,`} + ${has_wheelchair_boarding ? `` : `NULL AS wheelchair_boarding,`} + ${has_level_id ? `` : `NULL AS level_id,`} + ${has_platform_code ? `` : `NULL AS platform_code,`} + ST_Point(stop_lon, stop_lat) AS stop_loc, + * + EXCLUDE ( + stop_lat, stop_lon + ) + REPLACE ( + -- dummy entry in case no optional column is present + stop_id AS stop_id, + ${has_location_type ? ` + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. + enum_range(NULL::location_type_val)[location_type + 1] AS location_type, + ` : ``} + ${has_wheelchair_boarding ? ` + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. + enum_range(NULL::wheelchair_boarding_val)[ifnull(wheelchair_boarding, 0) + 1] AS wheelchair_boarding + ` : ``} + ) + FROM read_csv( + '${pathToStops}', + header = true, + -- > This option allows you to specify the types that the sniffer will use when detecting CSV column types. + -- > default: SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR + -- We omit BOOLEAN because GTFS just uses integers for boolean-like fields (e.g. timepoint in trips.txt). + -- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently. + auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'], + types = { + -- dummy entry in case no optional column is present + stop_id: 'TEXT', + ${has_stop_code ? `stop_code: 'TEXT',` : ``} + ${has_location_type ? `location_type: 'INTEGER',` : ``} + ${has_wheelchair_boarding ? `wheelchair_boarding: 'INTEGER',` : ``} + ${has_platform_code ? 
`platform_code: 'TEXT',` : ``} + } + ) + ), + -- order the stops to put parents before their children + stops_sorted_by_parents AS ( + ( + SELECT + *, + stop_id AS root_id, + 1 AS recursion_level + FROM stops + WHERE parent_station IS NULL + ) + UNION ALL + ( + SELECT + children.*, + parent.root_id, + parent.recursion_level + 1 + FROM stops children + JOIN stops_sorted_by_parents parent ON parent.stop_id = children.parent_station + ) + ) +SELECT * EXCLUDE ( + -- omit sorting helper columns + root_id, + recursion_level +) +FROM stops_sorted_by_parents +ORDER BY root_id, recursion_level, stop_id; -const formatStopsRow = (s) => { - return [ - s.stop_id || null, - s.stop_code || null, - s.stop_name || null, - s.stop_desc || null, - `POINT(${parseFloat(s.stop_lon)} ${parseFloat(s.stop_lat)})`, - s.zone_id || null, - s.stop_url || null, - s.location_type - ? locationType(s.location_type) - : null, - s.parent_station || null, - s.stop_timezone || null, - s.wheelchair_boarding - ? wheelchairBoarding(s.wheelchair_boarding) - : null, - s.level_id || null, - s.platform_code || null, - ] -} +-- todo: DuckDB v1.0.0 doesn't support them yet: +-- > The ADD CONSTRAINT and DROP CONSTRAINT clauses are not yet supported in DuckDB. +-- ALTER TABLE stops +-- ADD CONSTRAINT stops_parent_station_fkey +-- FOREIGN KEY (parent_station) REFERENCES stops; -const afterAll = (opt) => `\ -\\. +-- For a primary key, DuckDB doesn't create an index automatically. +CREATE UNIQUE INDEX stops_stop_id ON stops(stop_id); -ALTER TABLE "${opt.schema}".stops -ADD CONSTRAINT stops_parent_station_fkey -FOREIGN KEY (parent_station) REFERENCES "${opt.schema}".stops; +CREATE INDEX stops_parent_station ON stops (parent_station); +${opt.stopsLocationIndex ? `CREATE INDEX stops_stop_loc ON stops (stop_loc);` : ''} +`) -CREATE INDEX ON "${opt.schema}".stops (parent_station); -${opt.stopsLocationIndex ? `CREATE INDEX ON "${opt.schema}".stops (stop_loc);` : ''} -${opt.postgraphile ? `\ -CREATE INDEX ON "${opt.schema}".stops (level_id); -` : ''} -` - -module.exports = { - beforeAll, - formatRow: formatStopsRow, - afterAll, + workingState.nrOfRowsByName.set('stops', await queryNumberOfRows(db, 'stops', opt)) } + +module.exports = importData diff --git a/lib/transfers.js b/lib/transfers.js index 9ac0e2b..b2844d1 100644 --- a/lib/transfers.js +++ b/lib/transfers.js @@ -1,91 +1,76 @@ 'use strict' +const RUN = require('./run.js') +const {queryIfColumnsExist} = require('./columns.js') +const {queryNumberOfRows} = require('./rows-count.js') + // https://gtfs.org/documentation/schedule/reference/#transferstxt -const beforeAll = (opt) => `\ -CREATE TYPE "${opt.schema}".transfer_type_v AS ENUM ( +const importData = async (db, pathToTransfers, opt, workingState) => { + // min_transfer_time is optional, so the entire column can be missing. + // It seems like, as of DuckDB v1.0.0, there is no way to assign default values to missing columns, neither with read_csv() nor with a nested subquery. + // This is why we check the file first and then programmatically determine the set of SELECT-ed columns below. + const { + min_transfer_time: has_min_transfer_time, + } = await queryIfColumnsExist(db, pathToTransfers, [ + 'min_transfer_time', + ]) + + await db[RUN](`\ +CREATE TYPE transfer_type_v AS ENUM ( 'recommended' -- 0 or empty - Recommended transfer point between routes. , 'timed' -- 1 - Timed transfer point between two routes. The departing vehicle is expected to wait for the arriving one and leave sufficient time for a rider to transfer between routes. 
, 'minimum_time' -- 2 – Transfer requires a minimum amount of time between arrival and departure to ensure a connection. The time required to transfer is specified by min_transfer_time. , 'impossible' -- 3 - Transfers are not possible between routes at the location. ); -CREATE CAST ("${opt.schema}".transfer_type_v AS text) WITH INOUT AS IMPLICIT; +-- CREATE CAST (transfer_type_v AS text) WITH INOUT AS IMPLICIT; -CREATE TABLE "${opt.schema}".transfers ( - id SERIAL PRIMARY KEY, +CREATE TABLE transfers ( from_stop_id TEXT, - FOREIGN KEY (from_stop_id) REFERENCES "${opt.schema}".stops, + FOREIGN KEY (from_stop_id) REFERENCES stops, to_stop_id TEXT, - FOREIGN KEY (to_stop_id) REFERENCES "${opt.schema}".stops, - transfer_type "${opt.schema}".transfer_type_v, + FOREIGN KEY (to_stop_id) REFERENCES stops, + transfer_type transfer_type_v, min_transfer_time INT, from_route_id TEXT, - FOREIGN KEY (from_route_id) REFERENCES "${opt.schema}".routes, + FOREIGN KEY (from_route_id) REFERENCES routes, to_route_id TEXT, - FOREIGN KEY (from_route_id) REFERENCES "${opt.schema}".routes, + FOREIGN KEY (from_route_id) REFERENCES routes, from_trip_id TEXT, - FOREIGN KEY (from_trip_id) REFERENCES "${opt.schema}".trips, + FOREIGN KEY (from_trip_id) REFERENCES trips, to_trip_id TEXT, - FOREIGN KEY (from_trip_id) REFERENCES "${opt.schema}".trips + FOREIGN KEY (from_trip_id) REFERENCES trips, + -- We're not using a primary key index here because several columns can be NULL. + UNIQUE ( + from_stop_id, + from_trip_id, + from_route_id, + to_stop_id, + to_trip_id, + to_route_id + ) ); -ALTER TABLE "${opt.schema}".transfers -ADD CONSTRAINT transfers_sig -UNIQUE ( - from_stop_id, - to_stop_id, - from_route_id, - to_route_id, - from_trip_id, - to_trip_id +INSERT INTO transfers +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT * REPLACE ( + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. + enum_range(NULL::transfer_type_v)[transfer_type + 1] AS transfer_type +) +FROM read_csv( + '${pathToTransfers}', + header = true, + all_varchar = true, + types = { + transfer_type: 'INTEGER' + ${has_min_transfer_time ? `, min_transfer_time: 'INTEGER'` : ``} + } ); +`) -COPY "${opt.schema}".transfers ( - from_stop_id, - to_stop_id, - transfer_type, - min_transfer_time, - from_route_id, - to_route_id, - from_trip_id, - to_trip_id -) FROM STDIN csv; -` - -const transferType = (val) => { - if (val === '0') return 'recommended' - if (val === '1') return 'timed' - if (val === '2') return 'minimum_time' - if (val === '3') return 'impossible' - throw new Error('invalid/unsupported transfer_type: ' + val) -} - -const formatTransfersRow = (t) => { - return [ - t.from_stop_id || null, - t.to_stop_id || null, - t.transfer_type ? transferType(t.transfer_type) : null, - t.min_transfer_time ? parseInt(t.min_transfer_time) : null, - t.from_route_id, - t.to_route_id, - t.from_trip_id, - t.to_trip_id, - ] + workingState.nrOfRowsByName.set('frequencies', await queryNumberOfRows(db, 'frequencies', opt)) } -const afterAll = (opt) => `\ -\\. - -${opt.postgraphile ? 
`\ -CREATE INDEX ON "${opt.schema}".transfers (from_route_id); -CREATE INDEX ON "${opt.schema}".transfers (from_trip_id); -CREATE INDEX ON "${opt.schema}".transfers (to_stop_id); -CREATE INDEX ON "${opt.schema}".transfers (to_route_id); -CREATE INDEX ON "${opt.schema}".transfers (to_trip_id); -` : ''} -` - -module.exports = { - beforeAll, - formatRow: formatTransfersRow, - afterAll, -} +module.exports = importData diff --git a/lib/translations.js b/lib/translations.js index 61baa8e..c9d69f7 100644 --- a/lib/translations.js +++ b/lib/translations.js @@ -1,789 +1,743 @@ 'use strict' +const {strictEqual} = require('assert') +const RUN = require('./run.js') +const {queryNumberOfRows} = require('./rows-count.js') + +// > ## record_id +// > Defines the record that corresponds to the field to be translated. The value in record_id must be the first or only field of a table's primary key, as defined in the primary key attribute for each table and below: +// > - agency_id for agency +// > - stop_id for stops +// > - route_id for routes +// > - trip_id for trips +// > - trip_id for stop_times +// > - pathway_id for pathways +// > - level_id for levels +// > - attribution_id for attribution +// > Fields in tables not defined above should not be translated. However producers sometimes add extra fields that are outside the official specification and these unofficial fields may be translated. Below is the recommended way to use record_id for those tables: +// > - service_id for calendar +// > - service_id for calendar_dates +// > - fare_id for fare_attributes +// > - fare_id for fare_rules +// > - shape_id for shapes +// > - trip_id for frequencies +// > - from_stop_id for transfers +// > ## record_sub_id +// > Helps the record that contains the field to be translated when the table doesn’t have a unique ID. Therefore, the value in record_sub_id is the secondary ID of the table, as defined by the table below: +// > - None for agency.txt +// > - None for stops.txt +// > - None for routes.txt +// > - None for trips.txt +// > - stop_sequence for stop_times.txt +// > - None for pathways.txt +// > - None for levels.txt +// > - None for attributions.txt +// > Fields in tables not defined above should not be translated. However producers sometimes add extra fields that are outside the official specification and these unofficial fields may be translated. Below is the recommended way to use record_sub_id for those tables: +// > - None for calendar.txt +// > - date for calendar_dates.txt +// > - None for fare_attributes.txt +// > - route_id for fare_rules.txt +// > - None for shapes.txt +// > - start_time for frequencies.txt +// > - to_stop_id for transfers.txt // https://gtfs.org/documentation/schedule/reference/#translationstxt -const beforeAll = (opt) => `\ -CREATE OR REPLACE FUNCTION "${opt.schema}".table_exists( - t_name TEXT -) -RETURNS BOOLEAN -AS $$ - SELECT EXISTS ( - SELECT FROM pg_tables - WHERE schemaname = '${opt.schema}' - AND tablename = t_name - LIMIT 1 - ); -$$ LANGUAGE sql STABLE; -${opt.postgraphile ? `\ -COMMENT ON FUNCTION "${opt.schema}".table_exists IS E'@omit'; -` : ''} - -CREATE OR REPLACE FUNCTION "${opt.schema}".column_exists( - t_name TEXT, - c_name TEXT -) -RETURNS BOOLEAN -AS $$ - SELECT EXISTS ( - SELECT FROM information_schema.columns - WHERE table_schema = '${opt.schema}' - AND table_name = t_name - AND column_name = c_name - LIMIT 1 - ); -$$ LANGUAGE sql STABLE; -${opt.postgraphile ? 
`\ -COMMENT ON FUNCTION "${opt.schema}".column_exists IS E'@omit'; -` : ''} - -CREATE TABLE "${opt.schema}"._translations_ref_cols ( - table_name TEXT PRIMARY KEY, - -- todo: only check if columns exist when table exists? - record_id_col TEXT NOT NULL - CONSTRAINT valid_record_id_col CHECK ( - NOT "${opt.schema}".table_exists(table_name) - OR - "${opt.schema}".column_exists(table_name, record_id_col) - ), - record_sub_id_col TEXT - CONSTRAINT valid_record_sub_id_col CHECK ( - NOT "${opt.schema}".table_exists(table_name) - OR - record_sub_id_col IS NULL - OR - "${opt.schema}".column_exists(table_name, record_sub_id_col) - ) -); -${opt.postgraphile ? `\ -COMMENT ON TABLE "${opt.schema}"._translations_ref_cols IS E'@omit'; -` : ''} - --- > ## record_id --- > Defines the record that corresponds to the field to be translated. The value in record_id must be the first or only field of a table's primary key, as defined in the primary key attribute for each table and below: --- > - agency_id for agency --- > - stop_id for stops --- > - route_id for routes --- > - trip_id for trips --- > - trip_id for stop_times --- > - pathway_id for pathways --- > - level_id for levels --- > - attribution_id for attribution --- > Fields in tables not defined above should not be translated. However producers sometimes add extra fields that are outside the official specification and these unofficial fields may be translated. Below is the recommended way to use record_id for those tables: --- > - service_id for calendar --- > - service_id for calendar_dates --- > - fare_id for fare_attributes --- > - fare_id for fare_rules --- > - shape_id for shapes --- > - trip_id for frequencies --- > - from_stop_id for transfers --- > ## record_sub_id --- > Helps the record that contains the field to be translated when the table doesn’t have a unique ID. Therefore, the value in record_sub_id is the secondary ID of the table, as defined by the table below: --- > - None for agency.txt --- > - None for stops.txt --- > - None for routes.txt --- > - None for trips.txt --- > - stop_sequence for stop_times.txt --- > - None for pathways.txt --- > - None for levels.txt --- > - None for attributions.txt --- > Fields in tables not defined above should not be translated. However producers sometimes add extra fields that are outside the official specification and these unofficial fields may be translated. 
Below is the recommended way to use record_sub_id for those tables: --- > - None for calendar.txt --- > - date for calendar_dates.txt --- > - None for fare_attributes.txt --- > - route_id for fare_rules.txt --- > - None for shapes.txt --- > - start_time for frequencies.txt --- > - to_stop_id for transfers.txt --- https://gtfs.org/documentation/schedule/reference/#translationstxt -INSERT INTO "${opt.schema}"._translations_ref_cols ( - table_name, - record_id_col, - record_sub_id_col -) VALUES - -- todo: feed_info - ('agency', 'agency_id', NULL), - ('stops', 'stop_id', NULL), - ('routes', 'route_id', NULL), - ('trips', 'trip_id', NULL), - ('stop_times', 'trip_id', 'stop_sequence'), - ('pathways', 'pathway_id', NULL), - ('levels', 'level_id', NULL), - ('attribution', 'attribution_id', NULL), - ('calendar', 'service_id', NULL), - ('calendar_dates', 'service_id', 'date'), - ('fare_attributes', 'fare_id', NULL), - ('fare_rules', 'fare_id', 'route_id'), - ('shapes', 'shape_id', NULL), - ('frequencies', 'trip_id', 'start_time'), - ('transfers', 'from_stop_id', 'to_stop_id') -; - -CREATE OR REPLACE FUNCTION "${opt.schema}".row_exists( - table_name TEXT, - col_a_name TEXT, - col_a_value TEXT, - col_b_name TEXT, - col_b_value TEXT -) -RETURNS BOOLEAN -AS $$ - DECLARE - result BOOLEAN; - BEGIN - IF col_b_name IS NULL THEN - EXECUTE format(' - SELECT EXISTS( - SELECT * - FROM %I.%I -- schema, table_name - WHERE %I = %L -- col_a_name, col_a_value - LIMIT 1 - ) - ', '${opt.schema}', table_name, col_a_name, col_a_value) - INTO STRICT result; - RETURN result; - ELSE - EXECUTE format(' - SELECT EXISTS( - SELECT * - FROM %I.%I -- schema, table_name - WHERE %I = %L -- col_a_name, col_a_value - AND %I = %L -- col_b_name, col_b_value - LIMIT 1 - ) - ', '${opt.schema}', table_name, col_a_name, col_a_value, col_b_name, col_b_value) - INTO STRICT result; - RETURN result; - END IF; - END; -$$ LANGUAGE plpgsql STABLE; -${opt.postgraphile ? `\ -COMMENT ON FUNCTION "${opt.schema}".row_exists IS E'@omit'; -` : ''} - --- todo: assert that row_exists works as intended --- SELECT row_exists('stops', 'stop_id', 'de:11000:900120017::2', NULL, NULL); -- Virchowstr. (Berlin) --- SELECT row_exists('stops', 'stop_name', 'Virchowstr. (Berlin)', NULL, NULL); -- Virchowstr. (Berlin) --- SELECT row_exists('stops', 'stop_id', 'non-existent', NULL, NULL); --- SELECT row_exists('stops', 'stop_name', 'non-existent', NULL, NULL); --- SELECT row_exists('stops', 'stop_id', 'de:11000:900120017::2', 'parent_station', 'de:11000:900120017'); -- Virchowstr. (Berlin) with valid parent_station --- SELECT row_exists('stops', 'stop_name', 'Virchowstr. (Berlin)', 'parent_station', 'de:11000:900120017'); -- Virchowstr. (Berlin) with valid parent_station --- SELECT row_exists('stops', 'stop_id', 'de:11000:900120017::2', 'parent_station', 'non-existent'); -- Virchowstr. (Berlin) with invalid parent_station --- SELECT row_exists('stops', 'stop_name', 'Virchowstr. (Berlin)', 'parent_station', 'non-existent'); -- Virchowstr. (Berlin) with invalid parent_station --- SELECT row_exists('stops', 'stop_id', 'de:11000:900120017::2', 'non-existent', 'de:11000:900120017'); -- Virchowstr. (Berlin) with invalid column B, should fail --- SELECT row_exists('stops', 'stop_name', 'Virchowstr. (Berlin)', 'non-existent', 'de:11000:900120017'); -- Virchowstr. 
(Berlin) with invalid column B, should fail --- todo: assert that it fails with 2 rows - -CREATE OR REPLACE FUNCTION "${opt.schema}".is_valid_translation_ref( - _table_name TEXT, - _field_name TEXT, - _record_id TEXT, - _record_sub_id TEXT, - _field_value TEXT -) -RETURNS BOOLEAN -AS $$ - DECLARE - _record_id_col TEXT; - _record_sub_id_col TEXT; - result BOOLEAN; - BEGIN - IF _record_id IS NOT NULL THEN - SELECT record_id_col - FROM "${opt.schema}"._translations_ref_cols - WHERE table_name = _table_name - LIMIT 1 - INTO _record_id_col; - SELECT record_sub_id_col - FROM "${opt.schema}"._translations_ref_cols - WHERE table_name = _table_name - LIMIT 1 - INTO _record_sub_id_col; - - IF _record_sub_id_col IS NULL AND _record_sub_id IS NOT NULL THEN - RAISE EXCEPTION - USING - MESSAGE = format('record_sub_id must be NULL for %I but is %L', _table_name, _record_sub_id), - ERRCODE = 'data_exception'; - END IF; - SELECT "${opt.schema}".row_exists( - _table_name, - _record_id_col, _record_id, - _record_sub_id_col, _record_sub_id - ) - INTO STRICT result; - RETURN result; - ELSEIF _field_value IS NOT NULL THEN - SELECT "${opt.schema}".row_exists( - _table_name, - _field_name, _field_value, - NULL, NULL - ) - INTO STRICT result; - RETURN result; - ELSE - RAISE EXCEPTION - USING - MESSAGE = 'Either record_id or field_value must be NOT NULL', - HINT = 'Refer to translations.txt the GTFS Static/Schedule reference.', - ERRCODE = 'data_exception'; - END IF; - END; -$$ LANGUAGE plpgsql STABLE; -${opt.postgraphile ? `\ -COMMENT ON FUNCTION "${opt.schema}".is_valid_translation_ref IS E'@omit'; -` : ''} - --- The MobilityData GTFS Validator just uses Java's Locale#toLanguageTag() to validate "language". --- https://github.com/MobilityData/gtfs-validator/blob/a11b7489902dd54dc194af1f1515583406ba3716/main/src/main/java/org/mobilitydata/gtfsvalidator/table/GtfsTranslationSchema.java#L36 --- https://docs.oracle.com/javase/7/docs/api/java/util/Locale.html --- related: https://github.com/google/transit/pull/98 - --- https://gtfs.org/documentation/schedule/reference/#translationstxt -CREATE TABLE "${opt.schema}".translations ( - -- > Defines the table that contains the field to be translated. Allowed values are: - -- > agency, stops, routes, trips, stop_times, pathways, levels, feed_info, attributions - -- > Any file added to GTFS will have a table_name value equivalent to the file name, as listed above (i.e., not including the .txt file extension). - table_name TEXT NOT NULL, - - -- > Name of the field to be translated. […] Fields with other types should not be translated. - field_name TEXT NOT NULL - CONSTRAINT valid_field_name CHECK ( - NOT "${opt.schema}".table_exists(table_name) - OR - "${opt.schema}".column_exists(table_name, field_name) - ), - - language TEXT NOT NULL - CONSTRAINT valid_language CHECK ( - NOT "${opt.schema}".table_exists(table_name) - OR - "${opt.schema}".is_valid_lang_code(language) - ), - - translation TEXT NOT NULL, - - -- > Defines the record that corresponds to the field to be translated. The value in record_id must be the first or only field of a table's primary key, as defined in the primary key attribute for each table and below […]. - -- > Fields in tables not defined above should not be translated. However producers sometimes add extra fields that are outside the official specification and these unofficial fields may be translated. […] - -- > Conditionally Required: - -- > - Forbidden if table_name is feed_info. - -- > - Forbidden if field_value is defined. 
- -- > - Required if field_value is empty. - record_id TEXT, - - -- > Helps the record that contains the field to be translated when the table doesn’t have a unique ID. Therefore, the value in record_sub_id is the secondary ID of the table, as defined by the table below: - -- > - None for agency.txt - -- > - None for stops.txt - -- > - None for routes.txt - -- > - None for trips.txt - -- > - stop_sequence for stop_times.txt - -- > - None for pathways.txt - -- > - None for levels.txt - -- > - None for attributions.txt - -- > Fields in tables not defined above should not be translated. However producers sometimes add extra fields that are outside the official specification and these unofficial fields may be translated. Below is the recommended way to use record_sub_id for those tables: - -- > - None for calendar.txt - -- > - date for calendar_dates.txt - -- > - None for fare_attributes.txt - -- > - route_id for fare_rules.txt - -- > - None for shapes.txt - -- > - start_time for frequencies.txt - -- > - to_stop_id for transfers.txt - -- > Conditionally Required: - -- > - Forbidden if table_name is feed_info. - -- > - Forbidden if field_value is defined. - -- > - Required if table_name=stop_times and record_id is defined. - record_sub_id TEXT, - - -- > Instead of […] using record_id and record_sub_id, this field can be used […]. When used, the translation will be applied when the fields identified by table_name and field_name contains the exact same value defined in field_value. - -- > The field must have exactly the value defined in field_value. If only a subset of the value matches field_value, the translation won’t be applied. - -- > Conditionally Required: - -- > - Forbidden if table_name is feed_info. - -- > - Forbidden if record_id is defined. - -- > - Required if record_id is empty. - -- todo: - -- > If two translation rules match the same record (one with field_value, and the other one with record_id), the rule with record_id takes precedence. 
- field_value TEXT, - - CONSTRAINT field_value_or_record_id CHECK ( - field_value IS NULL OR record_id IS NULL - ), - CONSTRAINT not_with_feed_info CHECK ( - field_value IS NULL OR table_name != 'feed_info' - ), +const supportedTranslationRefs = new Map([ + ['agency', { + src_table_name: 'agency', + record_id_column: 'agency_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }], + ['stops', { + src_table_name: 'stops', + record_id_column: 'stop_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }], + ['routes', { + src_table_name: 'routes', + record_id_column: 'route_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }], + ['trips', { + src_table_name: 'trips', + record_id_column: 'trip_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }], + ['stop_times', { + src_table_name: 'stop_times', + record_id_column: 'trip_id', + record_sub_id_column: 'stop_sequence', record_sub_id_column_type: 'INTEGER', + }], + ['pathways', { + src_table_name: 'pathways', + record_id_column: 'pathway_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }], + ['levels', { + src_table_name: 'levels', + record_id_column: 'level_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }], + // todo: attribution.txt is not supported yet + // ['attribution', { + // src_table_name: 'attribution', + // record_id_column: 'attribution_id', + // record_sub_id_column: null, record_sub_id_column_type: null, + // }], + ['calendar', { + src_table_name: 'calendar', + record_id_column: 'service_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }], + ['calendar_dates', { + src_table_name: 'calendar_dates', + record_id_column: 'service_id', + record_sub_id_column: 'date', record_sub_id_column_type: 'DATE', + }], + // todo: fare_attributes.txt & fare_rules.txt are not supported yet + // ['fare_attributes', { + // src_table_name: 'fare_attributes', + // record_id_column: 'fare_id', + // record_sub_id_column: null, record_sub_id_column_type: null, + // }], + // ['fare_rules', { + // src_table_name: 'fare_rules', + // record_id_column: 'fare_id', + // record_sub_id_column: 'route_id', record_sub_id_column_type: 'TEXT', + // }], + ['shapes', { + src_table_name: 'shapes', + record_id_column: 'shape_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }], + // frequencies.txt has no primary key and/or unique index yet because DuckDB doesn't support indexes on INTERVAL. See frequencies.js for more details. + // ['frequencies', { + // src_table_name: 'frequencies', + // record_id_column: 'trip_id', + // record_sub_id_column: 'start_time', record_sub_id_column_type: 'INTERVAL', + // }], + // transfers' rows are *not* unique on (from_stop_id, to_stop_id), so we cannot create a foreign key reference on the table. 
+ // todo: find a workaround + // ['transfers', { + // src_table_name: 'transfers', + // record_id_column: 'from_stop_id', + // record_sub_id_column: 'to_stop_id', record_sub_id_column_type: 'TEXT', + // }], + ['feed_info', { + src_table_name: 'feed_info', + record_id_column: null, + record_sub_id_column: null, record_sub_id_column_type: null, + }], +]) + +const _srcTableRefSql = (table_name) => { + return `_translations_${table_name}` +} - CONSTRAINT valid_reference CHECK ( - NOT "${opt.schema}".table_exists(table_name) - OR - table_name = 'feed_info' - OR - "${opt.schema}".is_valid_translation_ref( - table_name, - field_name, - record_id, - record_sub_id, - field_value +const _srcTablesSql = (pathToTranslations, table_name, translationRef) => { + const { + record_id_column, + record_sub_id_column, record_sub_id_column_type, + } = translationRef + + const hasCol = record_id_column !== null + const colRef = hasCol ? `"${record_id_column}"` : null + const hasSubCol = record_sub_id_column !== null + const subColRef = hasSubCol ? `"${record_sub_id_column}"` : null + const srcTableRef = _srcTableRefSql(table_name) + + return `\ +CREATE TABLE ${srcTableRef} ( + ${hasCol ? `record_id TEXT NOT NULL,` : ``} + ${hasSubCol ? `record_sub_id ${record_sub_id_column_type} NOT NULL,` : ``} +${hasCol ? `\ + FOREIGN KEY ( + record_id + ${hasSubCol ? `, record_sub_id` : ``} ) - ), - - -- > Primary key (table_name, field_name, language, record_id, record_sub_id, field_value) - -- https://gtfs.org/documentation/schedule/reference/#translationstxt - -- PostgreSQL doesn't allow NULL values for primary key columns, so we use UNIQUE. - UNIQUE ( - table_name, - field_name, - language, - record_id, - record_sub_id, - field_value - ) + REFERENCES ${table_name} ( + ${colRef} + ${hasSubCol ? `, ${subColRef}` : ``} + ),\ +` : ``} + field_name TEXT NOT NULL, -- todo: validate via all_columns helper view + language TEXT NOT NULL, -- todo: validate just like agency.agency_lang + translation TEXT NOT NULL ); -COPY "${opt.schema}".translations ( - table_name, +INSERT INTO ${srcTableRef} +SELECT + ${hasCol ? `record_id,` : ``} + ${hasSubCol ? `record_sub_id,` : ``} field_name, language, - translation, - record_id, - record_sub_id, - field_value -) FROM STDIN csv; + translation +FROM read_csv( + '${pathToTranslations}', + header = true, + all_varchar = true +) +WHERE table_name = '${table_name}' +-- todo: support field_value-based translations +AND field_value IS NULL; ` - -const formatTranslationsRow = (t) => { - return [ - t.table_name || null, - t.field_name || null, - t.language || null, - t.translation || null, - t.record_id || null, - t.record_sub_id || null, - t.field_value || null, - ] } - -const afterAll = (opt) => `\ -\\. 
- --- todo -CREATE INDEX ON "${opt.schema}".translations ( - table_name, - field_name, - language, - record_id, - record_sub_id, - field_value +strictEqual( + _srcTablesSql('foo/trans.txt', 'feed_info', { + record_id_column: null, + record_sub_id_column: null, record_sub_id_column_type: null, + }), + `\ +CREATE TABLE _translations_feed_info ( + + + + field_name TEXT NOT NULL, -- todo: validate via all_columns helper view + language TEXT NOT NULL, -- todo: validate just like agency.agency_lang + translation TEXT NOT NULL ); -CREATE OR REPLACE VIEW "${opt.schema}".stops_translated AS +INSERT INTO _translations_feed_info SELECT - -- almost all columns, duh - -- todo: find a way to use all columns without explicitly enumerating them here - stop_id, - stop_code, - coalesce(stop_n_t.translation, stop_name) as stop_name, - stop_n_t.language as stop_name_lang, -- todo: fall back to feed_info.feed_lang? - coalesce(stop_d_t.translation, stop_desc) as stop_desc, - stop_d_t.language as stop_desc_lang, -- todo: fall back to feed_info.feed_lang? - stop_loc, - zone_id, - coalesce(stop_u_t.translation, stop_url) as stop_url, - stop_u_t.language as stop_url_lang, -- todo: fall back to feed_info.feed_lang? - location_type, - parent_station, - stop_timezone, - wheelchair_boarding, - level_id, - platform_code -FROM "${opt.schema}".stops s -LEFT JOIN "${opt.schema}".translations stop_n_t ON ( - stop_n_t.table_name = 'stops' AND stop_n_t.field_name = 'stop_name' - AND (s.stop_id = stop_n_t.record_id OR s.stop_name = stop_n_t.field_value) + + + field_name, + language, + translation +FROM read_csv( + 'foo/trans.txt', + header = true, + all_varchar = true ) -LEFT JOIN "${opt.schema}".translations stop_d_t ON ( - stop_d_t.table_name = 'stops' AND stop_d_t.field_name = 'stop_desc' - AND (s.stop_id = stop_d_t.record_id OR s.stop_name = stop_d_t.field_value) +WHERE table_name = 'feed_info' +-- todo: support field_value-based translations +AND field_value IS NULL; +`, + '_srcTablesSql with feed_info.txt', ) -LEFT JOIN "${opt.schema}".translations stop_u_t ON ( - stop_u_t.table_name = 'stops' AND stop_u_t.field_name = 'stop_url' - AND (s.stop_id = stop_u_t.record_id OR s.stop_name = stop_u_t.field_value) +strictEqual( + _srcTablesSql('foo/trans.txt', 'calendar_dates', { + record_id_column: 'service_id', + record_sub_id_column: 'date', record_sub_id_column_type: 'DATE', + }), + `\ +CREATE TABLE _translations_calendar_dates ( + record_id TEXT NOT NULL, + record_sub_id DATE NOT NULL, + FOREIGN KEY ( + record_id + , record_sub_id + ) + REFERENCES calendar_dates ( + "service_id" + , "date" + ), + field_name TEXT NOT NULL, -- todo: validate via all_columns helper view + language TEXT NOT NULL, -- todo: validate just like agency.agency_lang + translation TEXT NOT NULL ); -${opt.postgraphile ? 
`\ -COMMENT ON VIEW "${opt.schema}".stops_translated IS E'@omit'; -CREATE OR REPLACE FUNCTION "${opt.schema}".stops_translated_stop_name ( - stop stops, - language TEXT -) RETURNS TEXT AS $$ - SELECT coalesce(t.translation, stops.stop_name) - FROM "${opt.schema}".stops - JOIN ( - SELECT - table_name, - field_name, record_id, field_value, - language as lang, translation - FROM "${opt.schema}".translations - ) t ON ( - t.table_name = 'stops' AND t.field_name = 'stop_name' - AND (stops.stop_id = t.record_id OR stops.stop_name = t.field_value) - ) - WHERE stops.stop_id = stop.stop_id - AND t.lang = language - LIMIT 1; -$$ LANGUAGE sql STABLE STRICT; -` : ''} - -CREATE OR REPLACE VIEW "${opt.schema}".routes_translated AS +INSERT INTO _translations_calendar_dates SELECT - -- almost all columns, duh - -- todo: find a way to use all columns without explicitly enumerating them here - route_id, - agency_id, - coalesce(route_s_t.translation, route_short_name) as route_short_name, - route_s_t.language as route_short_name_lang, -- todo: fall back to feed_info.feed_lang? - coalesce(route_l_t.translation, route_long_name) as route_long_name, - route_l_t.language as route_long_name_lang, -- todo: fall back to feed_info.feed_lang? - coalesce(route_d_t.translation, route_desc) as route_desc, - route_d_t.language as route_desc_lang, -- todo: fall back to feed_info.feed_lang? - route_type, - coalesce(route_u_t.translation, route_url) as route_url, - route_u_t.language as route_url_lang, -- todo: fall back to feed_info.feed_lang? - route_color, - route_text_color, - route_sort_order -FROM "${opt.schema}".routes r -LEFT JOIN "${opt.schema}".translations route_s_t ON ( - route_s_t.table_name = 'routes' AND route_s_t.field_name = 'route_short_name' - AND (r.route_id = route_s_t.record_id OR r.route_short_name = route_s_t.field_value) + record_id, + record_sub_id, + field_name, + language, + translation +FROM read_csv( + 'foo/trans.txt', + header = true, + all_varchar = true ) -LEFT JOIN "${opt.schema}".translations route_l_t ON ( - route_l_t.table_name = 'routes' AND route_l_t.field_name = 'route_long_name' - AND (r.route_id = route_l_t.record_id OR r.route_long_name = route_l_t.field_value) +WHERE table_name = 'calendar_dates' +-- todo: support field_value-based translations +AND field_value IS NULL; +`, + '_srcTablesSql with calendar_dates.txt', ) -LEFT JOIN "${opt.schema}".translations route_d_t ON ( - route_d_t.table_name = 'routes' AND route_d_t.field_name = 'route_desc' - AND (r.route_id = route_d_t.record_id OR r.route_long_name = route_d_t.field_value) + +const _selectToBeMergedSql = (table_name, translationRef) => { + const { + record_id_column, + record_sub_id_column, + } = translationRef + + const hasCol = record_id_column !== null + const hasSubCol = record_sub_id_column !== null + const srcTableRef = _srcTableRefSql(table_name) + + return `\ + SELECT + '${table_name}' AS table_name, + -- Some UNION-ed tables have non-TEXT record_id/record_sub_id columns (e.g. INTEGER). + -- Given that UNION ALL does implicit casts to match the *first* table, we force TEXT here so that we do not depend on their order. + ${hasCol ? `record_id::TEXT as record_id,` : ``} + ${hasSubCol ? `record_sub_id::TEXT as record_sub_id,` : ``} + * + ${hasCol ? `EXCLUDE ( + record_id + ${hasSubCol ? 
`, record_sub_id` : ``} + )` : ``} + FROM ${srcTableRef} +` +} +strictEqual( + _selectToBeMergedSql('agency', { + record_id_column: 'agency_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }), + `\ + SELECT + 'agency' AS table_name, + -- Some UNION-ed tables have non-TEXT record_id/record_sub_id columns (e.g. INTEGER). + -- Given that UNION ALL does implicit casts to match the *first* table, we force TEXT here so that we do not depend on their order. + record_id::TEXT as record_id, + + * + EXCLUDE ( + record_id + + ) + FROM _translations_agency +`, + '_selectToBeMergedSql with agency.txt', +) +strictEqual( + _selectToBeMergedSql('calendar_dates', { + record_id_column: 'service_id', + record_sub_id_column: 'date', record_sub_id_column_type: 'DATE', + }), + `\ + SELECT + 'calendar_dates' AS table_name, + -- Some UNION-ed tables have non-TEXT record_id/record_sub_id columns (e.g. INTEGER). + -- Given that UNION ALL does implicit casts to match the *first* table, we force TEXT here so that we do not depend on their order. + record_id::TEXT as record_id, + record_sub_id::TEXT as record_sub_id, + * + EXCLUDE ( + record_id + , record_sub_id + ) + FROM _translations_calendar_dates +`, + '_selectToBeMergedSql with calendar_dates.txt', ) -LEFT JOIN "${opt.schema}".translations route_u_t ON ( - route_u_t.table_name = 'routes' AND route_u_t.field_name = 'route_url' - AND (r.route_id = route_u_t.record_id OR r.route_long_name = route_u_t.field_value) -); -${opt.postgraphile ? `\ -COMMENT ON VIEW "${opt.schema}".routes_translated IS E'@omit'; -CREATE OR REPLACE FUNCTION "${opt.schema}".routes_translated_route_short_name ( - route routes, - language TEXT -) RETURNS TEXT AS $$ - SELECT coalesce(t.translation, routes.route_short_name) - FROM "${opt.schema}".routes - JOIN ( - SELECT - table_name, - field_name, record_id, field_value, - language as lang, translation - FROM "${opt.schema}".translations - ) t ON ( - t.table_name = 'routes' AND t.field_name = 'route_short_name' - AND (routes.route_id = t.record_id OR routes.route_short_name = t.field_value) - ) - WHERE routes.route_id = route.route_id - AND t.lang = language - LIMIT 1; -$$ LANGUAGE sql STABLE STRICT; -CREATE OR REPLACE FUNCTION "${opt.schema}".routes_translated_route_long_name ( - route routes, - language TEXT -) RETURNS TEXT AS $$ - SELECT coalesce(t.translation, routes.route_long_name) - FROM "${opt.schema}".routes - JOIN ( - SELECT - table_name, - field_name, record_id, field_value, - language as lang, translation - FROM "${opt.schema}".translations - ) t ON ( - t.table_name = 'routes' AND t.field_name = 'route_long_name' - AND (routes.route_id = t.record_id OR routes.route_long_name = t.field_value) +const _translatedSql = (table_name, translatedCols) => { + const _transRefSql = (col) => `"trans_${col}"` + + const _sqls = Array.from(translatedCols.entries()) + .map(([col, translationRef]) => { + const { + src_table_name, + record_id_column, + record_sub_id_column, + } = translationRef + + const hasCol = record_id_column !== null + const colRef = hasCol ? `"${record_id_column}"` : null + const hasSubCol = record_sub_id_column !== null + const subColRef = hasSubCol ? 
`"${record_sub_id_column}"` : null + const srcTableRef = _srcTableRefSql(src_table_name) + const transRef = _transRefSql(col) + + return { + colLangSelect: `\ + ${transRef}.language AS "${col}_lang",`, + colReplace: `\ + coalesce(${transRef}.translation, "${col}") AS "${col}"`, + transJoin: `\ +LEFT JOIN ${srcTableRef} ${transRef} ON ( + ${transRef}.field_name = '${col}' + ${hasCol ? `AND data.${colRef} = ${transRef}.record_id` : ``} + ${hasSubCol ? `AND data.${subColRef} = ${transRef}.record_sub_id` : ``} +)`, + } + }) + + return `\ +CREATE VIEW ${table_name}_translated AS +SELECT + -- todo: fall back to feed_info.feed_lang? +${_sqls.map(sql => sql.colLangSelect).join('\n')} + data.* + REPLACE ( +${_sqls.map(sql => sql.colReplace).join(',\n')} ) - WHERE routes.route_id = route.route_id - AND t.lang = language - LIMIT 1; -$$ LANGUAGE sql STABLE STRICT; -CREATE OR REPLACE FUNCTION "${opt.schema}".routes_translated_route_desc ( - route routes, - language TEXT -) RETURNS TEXT AS $$ - SELECT coalesce(t.translation, routes.route_desc) - FROM "${opt.schema}".routes - JOIN ( - SELECT - table_name, - field_name, record_id, field_value, - language as lang, translation - FROM "${opt.schema}".translations - ) t ON ( - t.table_name = 'routes' AND t.field_name = 'route_desc' - AND (routes.route_id = t.record_id OR routes.route_desc = t.field_value) +FROM ${table_name} data +${_sqls.map(sql => sql.transJoin).join('\n')}; +` +} +{ + const agencyRef = supportedTranslationRefs.get('agency') + strictEqual( + _translatedSql('agency', new Map([ + ['agency_name', agencyRef], + ['agency_url', agencyRef], + ])), + `\ +CREATE VIEW agency_translated AS +SELECT + -- todo: fall back to feed_info.feed_lang? + "trans_agency_name".language AS "agency_name_lang", + "trans_agency_url".language AS "agency_url_lang", + data.* + REPLACE ( + coalesce("trans_agency_name".translation, "agency_name") AS "agency_name", + coalesce("trans_agency_url".translation, "agency_url") AS "agency_url" ) - WHERE routes.route_id = route.route_id - AND t.lang = language - LIMIT 1; -$$ LANGUAGE sql STABLE STRICT; -CREATE OR REPLACE FUNCTION "${opt.schema}".routes_translated_route_url ( - route routes, - language TEXT -) RETURNS TEXT AS $$ - SELECT coalesce(t.translation, routes.route_url) - FROM "${opt.schema}".routes - JOIN ( - SELECT - table_name, - field_name, record_id, field_value, - language as lang, translation - FROM "${opt.schema}".translations - ) t ON ( - t.table_name = 'routes' AND t.field_name = 'route_url' - AND (routes.route_id = t.record_id OR routes.route_url = t.field_value) +FROM agency data +LEFT JOIN _translations_agency "trans_agency_name" ON ( + "trans_agency_name".field_name = 'agency_name' + AND data."agency_id" = "trans_agency_name".record_id + +) +LEFT JOIN _translations_agency "trans_agency_url" ON ( + "trans_agency_url".field_name = 'agency_url' + AND data."agency_id" = "trans_agency_url".record_id + +); +`, + '_translatedSql with agency.txt', ) - WHERE routes.route_id = route.route_id - AND t.lang = language - LIMIT 1; -$$ LANGUAGE sql STABLE STRICT; -` : ''} - --- todo [breaking]: remove in favor of trip_headsign_translations & trip_short_name_translations -CREATE OR REPLACE VIEW "${opt.schema}".trips_translated AS +} +{ + const calendarDatesRef = supportedTranslationRefs.get('calendar_dates') + strictEqual( + _translatedSql('calendar_dates', new Map([ + ['foo', calendarDatesRef], + ['b-a-r', calendarDatesRef], + ])), + `\ +CREATE VIEW calendar_dates_translated AS SELECT - -- almost all columns, duh - -- todo: 
find a way to use all columns without explicitly enumerating them here - trip_id, - route_id, - service_id, - coalesce(trip_h_t.translation, trip_headsign) as trip_headsign, - trip_h_t.language as trip_headsign_lang, -- todo: fall back to feed_info.feed_lang? - coalesce(trip_s_t.translation, trip_short_name) as trip_short_name, - trip_s_t.language as trip_short_name_lang, -- todo: fall back to feed_info.feed_lang? - direction_id, - block_id, - shape_id, - wheelchair_accessible, - bikes_allowed -FROM "${opt.schema}".trips t -LEFT JOIN "${opt.schema}".translations trip_s_t ON ( - trip_s_t.table_name = 'trips' AND trip_s_t.field_name = 'trip_short_name' - AND (t.trip_id = trip_s_t.record_id OR t.trip_headsign = trip_s_t.field_value) + -- todo: fall back to feed_info.feed_lang? + "trans_foo".language AS "foo_lang", + "trans_b-a-r".language AS "b-a-r_lang", + data.* + REPLACE ( + coalesce("trans_foo".translation, "foo") AS "foo", + coalesce("trans_b-a-r".translation, "b-a-r") AS "b-a-r" + ) +FROM calendar_dates data +LEFT JOIN _translations_calendar_dates "trans_foo" ON ( + "trans_foo".field_name = 'foo' + AND data."service_id" = "trans_foo".record_id + AND data."date" = "trans_foo".record_sub_id ) -LEFT JOIN "${opt.schema}".translations trip_h_t ON ( - trip_h_t.table_name = 'trips' AND trip_h_t.field_name = 'trip_headsign' - AND (t.trip_id = trip_h_t.record_id OR t.trip_headsign = trip_h_t.field_value) +LEFT JOIN _translations_calendar_dates "trans_b-a-r" ON ( + "trans_b-a-r".field_name = 'b-a-r' + AND data."service_id" = "trans_b-a-r".record_id + AND data."date" = "trans_b-a-r".record_sub_id ); -${opt.postgraphile ? `\ -COMMENT ON VIEW "${opt.schema}".trips_translated IS E'@omit'; - -CREATE OR REPLACE FUNCTION "${opt.schema}".trips_translated_trip_short_name ( - trip trips, - language TEXT -) RETURNS TEXT AS $$ - SELECT coalesce(t.translation, trips.trip_short_name) - FROM "${opt.schema}".trips - JOIN ( - SELECT - table_name, - field_name, record_id, field_value, - language as lang, translation - FROM "${opt.schema}".translations - ) t ON ( - t.table_name = 'trips' AND t.field_name = 'trip_short_name' - AND (trips.trip_id = t.record_id OR trips.trip_short_name = t.field_value) - ) - WHERE trips.trip_id = trip.trip_id - AND t.lang = language - LIMIT 1; -$$ LANGUAGE sql STABLE STRICT; -CREATE OR REPLACE FUNCTION "${opt.schema}".trips_translated_trip_headsign ( - trip trips, - language TEXT -) RETURNS TEXT AS $$ - SELECT coalesce(t.translation, trips.trip_headsign) - FROM "${opt.schema}".trips - JOIN ( - SELECT - table_name, - field_name, record_id, field_value, - language as lang, translation - FROM "${opt.schema}".translations - ) t ON ( - t.table_name = 'trips' AND t.field_name = 'trip_headsign' - AND (trips.trip_id = t.record_id OR trips.trip_headsign = t.field_value) +`, + '_translatedSql with calendar_dates.txt', ) - WHERE trips.trip_id = trip.trip_id - AND t.lang = language - LIMIT 1; -$$ LANGUAGE sql STABLE STRICT; -` : ''} - -CREATE OR REPLACE VIEW "${opt.schema}".arrivals_departures_translated AS +} +{ + const feedInfoRef = supportedTranslationRefs.get('feed_info') + strictEqual( + _translatedSql('feed_info', new Map([ + ['foo', { + ...feedInfoRef, + src_table_name: 'some-other-table', + }], + ['b-a-r', feedInfoRef], + ])), + `\ +CREATE VIEW feed_info_translated AS SELECT - -- almost all columns, duh - -- todo: find a way to use all columns without explicitly enumerating them here - route_id, - coalesce(route_s_t.translation, route_short_name) as route_short_name, - route_s_t.language 
as route_short_name_lang, -- todo: fall back to feed_info.feed_lang? - coalesce(route_l_t.translation, route_long_name) as route_long_name, - route_l_t.language as route_long_name_lang, -- todo: fall back to feed_info.feed_lang? - route_type, - trip_id, direction_id, - coalesce(trip_t.translation, trip_headsign) as trip_headsign, - trip_t.language as trip_headsign_lang, -- todo: fall back to feed_info.feed_lang? - service_id, - shape_id, - "date", - stop_sequence, - coalesce(stop_times_t.translation, stop_headsign) as stop_headsign, - stop_times_t.language as stop_headsign_lang, -- todo: fall back to feed_info.feed_lang? - pickup_type, drop_off_type, shape_dist_traveled, timepoint, - tz, - arrival_time, t_arrival, - departure_time, t_departure, - stop_id, - coalesce(stop_t.translation, stop_name) as stop_name, - stop_t.language as stop_name_lang, -- todo: fall back to feed_info.feed_lang? - station_id, - coalesce(station_t.translation, station_name) as station_name, - station_t.language as station_name_lang -- todo: fall back to feed_info.feed_lang? -FROM "${opt.schema}".arrivals_departures ad -LEFT JOIN "${opt.schema}".translations route_s_t ON ( - route_s_t.table_name = 'routes' AND route_s_t.field_name = 'route_short_name' - AND (ad.route_id = route_s_t.record_id OR ad.route_short_name = route_s_t.field_value) -) -LEFT JOIN "${opt.schema}".translations route_l_t ON ( - route_l_t.table_name = 'routes' AND route_l_t.field_name = 'route_long_name' - AND (ad.route_id = route_l_t.record_id OR ad.route_long_name = route_l_t.field_value) -) -LEFT JOIN "${opt.schema}".translations trip_t ON ( - trip_t.table_name = 'trips' AND trip_t.field_name = 'trip_headsign' - AND (ad.trip_id = trip_t.record_id OR ad.trip_headsign = trip_t.field_value) -) -LEFT JOIN "${opt.schema}".translations stop_t ON ( - stop_t.table_name = 'stops' AND stop_t.field_name = 'stop_name' - AND (ad.stop_id = stop_t.record_id OR ad.stop_name = stop_t.field_value) -) -LEFT JOIN "${opt.schema}".translations station_t ON ( - station_t.table_name = 'stops' AND station_t.field_name = 'stop_name' - AND station_t.language = stop_t.language - AND (ad.station_id = station_t.record_id OR ad.station_name = station_t.field_value) -) -LEFT JOIN "${opt.schema}".translations stop_times_t ON ( - stop_times_t.table_name = 'stop_times' AND stop_times_t.field_name = 'stop_headsign' - AND ( - (ad.trip_id = stop_times_t.record_id AND ad.stop_sequence = stop_times_t.record_sub_id::integer) - OR ad.stop_headsign = stop_times_t.field_value + -- todo: fall back to feed_info.feed_lang? + "trans_foo".language AS "foo_lang", + "trans_b-a-r".language AS "b-a-r_lang", + data.* + REPLACE ( + coalesce("trans_foo".translation, "foo") AS "foo", + coalesce("trans_b-a-r".translation, "b-a-r") AS "b-a-r" ) +FROM feed_info data +LEFT JOIN _translations_some-other-table "trans_foo" ON ( + "trans_foo".field_name = 'foo' + + +) +LEFT JOIN _translations_feed_info "trans_b-a-r" ON ( + "trans_b-a-r".field_name = 'b-a-r' + + ); -${opt.postgraphile ? `\ -COMMENT ON VIEW "${opt.schema}".arrivals_departures_translated IS E'@omit'; -` : ''} - -CREATE OR REPLACE VIEW "${opt.schema}".connections_translated AS -SELECT - -- almost all columns, duh - -- todo: find a way to use all columns without explicitly enumerating them here - route_id, - coalesce(route_s_t.translation, route_short_name) as route_short_name, - route_s_t.language as route_short_name_lang, -- todo: fall back to feed_info.feed_lang? 
- coalesce(route_l_t.translation, route_long_name) as route_long_name, - route_l_t.language as route_long_name_lang, -- todo: fall back to feed_info.feed_lang? - route_type, - trip_id, - service_id, - direction_id, - coalesce(trip_t.translation, trip_headsign) as trip_headsign, - trip_t.language as trip_headsign_lang, -- todo: fall back to feed_info.feed_lang? - - from_stop_id, - coalesce(from_stop.translation, from_stop_name) as from_stop_name, - from_stop.language as from_stop_name_lang, -- todo: fall back to feed_info.feed_lang? - from_station_id, - coalesce(from_station.translation, from_station_name) as from_station_name, - from_station.language as from_station_name_lang, -- todo: fall back to feed_info.feed_lang? - - coalesce(from_stop_times_t.translation, from_stop_headsign) as from_stop_headsign, - from_stop_times_t.language as from_stop_headsign_lang, -- todo: fall back to feed_info.feed_lang? - from_pickup_type, - t_departure, - departure_time, -- todo [breaking]: this is misleading, remove it - from_stop_sequence, - from_timepoint, +`, + '_translatedSql with feed_info.txt', + ) +} - "date", +// https://gtfs.org/documentation/schedule/reference/#translationstxt +const importData = async (db, pathToTranslations, opt, workingState) => { + const translationRefs = new Map( + supportedTranslationRefs.entries() + // If there is no such file/table, don't allow translations for it. + .filter(([table_name]) => opt.files.includes(table_name)) + ) - to_timepoint, - to_stop_sequence, - t_arrival, - arrival_time, -- todo [breaking]: this is misleading, remove it - to_drop_off_type, - coalesce(to_stop_times_t.translation, to_stop_headsign) as to_stop_headsign, - to_stop_times_t.language as to_stop_headsign_lang, -- todo: fall back to feed_info.feed_lang? + const selectsToBeMerged = [] + for (const [table_name, translationRef] of translationRefs.entries()) { + await db[RUN](_srcTablesSql(pathToTranslations, table_name, translationRef)) + selectsToBeMerged.push(_selectToBeMergedSql(table_name, translationRef)) + } - to_stop_id, - coalesce(to_stop.translation, to_stop_name) as to_stop_name, - to_stop.language as to_stop_name_lang, -- todo: fall back to feed_info.feed_lang? - to_station_id, - coalesce(to_station.translation, to_station_name) as to_station_name, - to_station.language as to_station_name_lang -- todo: fall back to feed_info.feed_lang? 
-FROM "${opt.schema}".connections c -LEFT JOIN "${opt.schema}".translations route_s_t ON ( - route_s_t.table_name = 'routes' AND route_s_t.field_name = 'route_short_name' - AND (c.route_id = route_s_t.record_id OR c.route_short_name = route_s_t.field_value) -) -LEFT JOIN "${opt.schema}".translations route_l_t ON ( - route_l_t.table_name = 'routes' AND route_l_t.field_name = 'route_long_name' - AND (c.route_id = route_l_t.record_id OR c.route_long_name = route_l_t.field_value) -) -LEFT JOIN "${opt.schema}".translations trip_t ON ( - trip_t.table_name = 'trips' AND trip_t.field_name = 'trip_headsign' - AND (c.trip_id = trip_t.record_id OR c.trip_headsign = trip_t.field_value) -) -LEFT JOIN "${opt.schema}".translations from_stop ON ( - from_stop.table_name = 'stops' AND from_stop.field_name = 'stop_name' - AND (c.from_stop_id = from_stop.record_id OR c.from_stop_name = from_stop.field_value) -) -LEFT JOIN "${opt.schema}".translations from_station ON ( - from_station.table_name = 'stops' AND from_station.field_name = 'stop_name' - AND from_station.language = from_stop.language - AND (c.from_station_id = from_station.record_id OR c.from_station_name = from_station.field_value) -) -LEFT JOIN "${opt.schema}".translations to_stop ON ( - to_stop.table_name = 'stops' AND to_stop.field_name = 'stop_name' - AND to_stop.language = from_stop.language - AND (c.to_stop_id = to_stop.record_id OR c.to_stop_name = to_stop.field_value) -) -LEFT JOIN "${opt.schema}".translations to_station ON ( - to_station.table_name = 'stops' AND to_station.field_name = 'stop_name' - AND to_station.language = from_stop.language - AND (c.to_station_id = to_station.record_id OR c.to_station_name = to_station.field_value) -) -LEFT JOIN "${opt.schema}".translations from_stop_times_t ON ( - from_stop_times_t.table_name = 'stop_times' AND from_stop_times_t.field_name = 'stop_headsign' - AND ( - (c.trip_id = from_stop_times_t.record_id AND c.from_stop_sequence = from_stop_times_t.record_sub_id::integer) - OR c.from_stop_headsign = from_stop_times_t.field_value - ) -) -LEFT JOIN "${opt.schema}".translations to_stop_times_t ON ( - to_stop_times_t.table_name = 'stop_times' AND to_stop_times_t.field_name = 'stop_headsign' - AND ( - (c.trip_id = to_stop_times_t.record_id AND c.to_stop_sequence = to_stop_times_t.record_sub_id::integer) - OR c.to_stop_headsign = to_stop_times_t.field_value - ) -); -${opt.postgraphile ? `\ -COMMENT ON VIEW "${opt.schema}".connections_translated IS E'@omit'; -` : ''} -` + await db[RUN](`\ +-- The MobilityData GTFS Validator just uses Java's Locale#toLanguageTag() to validate "language". +-- https://github.com/MobilityData/gtfs-validator/blob/a11b7489902dd54dc194af1f1515583406ba3716/main/src/main/java/org/mobilitydata/gtfsvalidator/table/GtfsTranslationSchema.java#L36 +-- https://docs.oracle.com/javase/7/docs/api/java/util/Locale.html +-- related: https://github.com/google/transit/pull/98 -module.exports = { - beforeAll, - formatRow: formatTranslationsRow, - afterAll, +-- We mimick a true table with a view that UNIONs all individual _translations_* tables. 
+CREATE VIEW translations AS +${selectsToBeMerged.map(sql => `(${sql})`).join(`UNION ALL BY NAME`)}; +`) + + const agencyRef = supportedTranslationRefs.get('agency') + const stopsRef = supportedTranslationRefs.get('stops') + const routesRef = supportedTranslationRefs.get('routes') + const tripsRef = supportedTranslationRefs.get('trips') + const stopTimesRef = supportedTranslationRefs.get('stop_times') + const pathwaysRef = supportedTranslationRefs.get('pathways') + const levelsRef = supportedTranslationRefs.get('levels') + const feedInfoRef = supportedTranslationRefs.get('feed_info') + const preTranslatedColumns = new Map([ + ['agency', new Map([ + ['agency_name', agencyRef], + ['agency_url', agencyRef], + ['agency_phone', agencyRef], + ['agency_fare_url', agencyRef], + ['agency_email', agencyRef], + ])], + ['stops', new Map([ + ['stop_code', stopsRef], + ['stop_name', stopsRef], + // todo: not supported yet by stops.js + // ['tts_stop_name', stopsRef], + ['stop_desc', stopsRef], + ['stop_url', stopsRef], + ['platform_code', stopsRef], + ])], + ['routes', new Map([ + ['route_short_name', routesRef], + ['route_long_name', routesRef], + ['route_desc', routesRef], + ['route_url', routesRef], + ])], + ['trips', new Map([ + ['trip_headsign', tripsRef], + ['trip_short_name', tripsRef], + // todo: not supported yet by trips.js + // ['trip_desc', tripsRef], + // ['trip_url', tripsRef], + ])], + ['stop_times', new Map([ + ['stop_headsign', stopTimesRef], + ])], + // todo: fare_attributes.txt & fare_rules.txt are not supported yet + // todo: frequencies.txt (see above) + // todo: areas.txt is not supported yet + // todo: networks.txt is not supported yet + ['pathways', new Map([ + ['signposted_as', pathwaysRef], + ['reversed_signposted_as', pathwaysRef], + ])], + ['levels', new Map([ + ['level_name', levelsRef], + ])], + // todo: location_groups.txt is not supported yet + // todo: booking_rules.txt is not supported yet + ['feed_info', new Map([ + ['feed_publisher_name', feedInfoRef], + ['feed_publisher_url', feedInfoRef], + ['feed_version', feedInfoRef], + ['feed_contact_email', feedInfoRef], + ['feed_contact_url', feedInfoRef], + ])], + // todo: attribution.txt is not supported yet + + ]) + for (const [table_name, translatedCols] of preTranslatedColumns) { + if (!opt.files.includes(table_name)) { + // If there is no such file/table, don't allow translations for it. + continue + } + + await db[RUN](_translatedSql(table_name, translatedCols)) + } + + // *_translated for tables/views made up by gtfs-via-duckdb + { + await db[RUN](_translatedSql('arrivals_departures', new Map([ + ['route_short_name', routesRef], + ['route_long_name', routesRef], + ['trip_headsign', tripsRef], + ['stop_headsign', stopsRef], + ['stop_name', stopsRef], + // todo: ['station_name', stopsRef], + ]))) + } + // todo: connections + +// `\ +// -- CREATE OR REPLACE VIEW arrivals_departures_translated AS +// -- SELECT +// -- -- almost all columns, duh +// -- -- todo: find a way to use all columns without explicitly enumerating them here +// -- route_id, +// -- coalesce(route_s_t.translation, route_short_name) as route_short_name, +// -- route_s_t.language as route_short_name_lang, -- todo: fall back to feed_info.feed_lang? +// -- coalesce(route_l_t.translation, route_long_name) as route_long_name, +// -- route_l_t.language as route_long_name_lang, -- todo: fall back to feed_info.feed_lang? 
+// -- route_type, +// -- trip_id, direction_id, +// -- coalesce(trip_t.translation, trip_headsign) as trip_headsign, +// -- trip_t.language as trip_headsign_lang, -- todo: fall back to feed_info.feed_lang? +// -- service_id, +// -- shape_id, +// -- "date", +// -- stop_sequence, +// -- coalesce(stop_times_t.translation, stop_headsign) as stop_headsign, +// -- stop_times_t.language as stop_headsign_lang, -- todo: fall back to feed_info.feed_lang? +// -- pickup_type, drop_off_type, shape_dist_traveled, timepoint, +// -- tz, +// -- arrival_time, t_arrival, +// -- departure_time, t_departure, +// -- stop_id, +// -- coalesce(stop_t.translation, stop_name) as stop_name, +// -- stop_t.language as stop_name_lang, -- todo: fall back to feed_info.feed_lang? +// -- station_id, +// -- coalesce(station_t.translation, station_name) as station_name, +// -- station_t.language as station_name_lang -- todo: fall back to feed_info.feed_lang? +// -- FROM arrivals_departures ad +// -- LEFT JOIN translations route_s_t ON ( +// -- route_s_t.table_name = 'routes' AND route_s_t.field_name = 'route_short_name' +// -- AND (ad.route_id = route_s_t.record_id OR ad.route_short_name = route_s_t.field_value) +// -- ) +// -- LEFT JOIN translations route_l_t ON ( +// -- route_l_t.table_name = 'routes' AND route_l_t.field_name = 'route_long_name' +// -- AND (ad.route_id = route_l_t.record_id OR ad.route_long_name = route_l_t.field_value) +// -- ) +// -- LEFT JOIN translations trip_t ON ( +// -- trip_t.table_name = 'trips' AND trip_t.field_name = 'trip_headsign' +// -- AND (ad.trip_id = trip_t.record_id OR ad.trip_headsign = trip_t.field_value) +// -- ) +// -- LEFT JOIN translations stop_t ON ( +// -- stop_t.table_name = 'stops' AND stop_t.field_name = 'stop_name' +// -- AND (ad.stop_id = stop_t.record_id OR ad.stop_name = stop_t.field_value) +// -- ) +// -- LEFT JOIN translations station_t ON ( +// -- station_t.table_name = 'stops' AND station_t.field_name = 'stop_name' +// -- AND station_t.language = stop_t.language +// -- AND (ad.station_id = station_t.record_id OR ad.station_name = station_t.field_value) +// -- ) +// -- LEFT JOIN translations stop_times_t ON ( +// -- stop_times_t.table_name = 'stop_times' AND stop_times_t.field_name = 'stop_headsign' +// -- AND ( +// -- (ad.trip_id = stop_times_t.record_id AND ad.stop_sequence = stop_times_t.record_sub_id::integer) +// -- OR ad.stop_headsign = stop_times_t.field_value +// -- ) +// -- ); +// -- +// -- CREATE OR REPLACE VIEW connections_translated AS +// -- SELECT +// -- -- almost all columns, duh +// -- -- todo: find a way to use all columns without explicitly enumerating them here +// -- route_id, +// -- coalesce(route_s_t.translation, route_short_name) as route_short_name, +// -- route_s_t.language as route_short_name_lang, -- todo: fall back to feed_info.feed_lang? +// -- coalesce(route_l_t.translation, route_long_name) as route_long_name, +// -- route_l_t.language as route_long_name_lang, -- todo: fall back to feed_info.feed_lang? +// -- route_type, +// -- trip_id, +// -- service_id, +// -- direction_id, +// -- coalesce(trip_t.translation, trip_headsign) as trip_headsign, +// -- trip_t.language as trip_headsign_lang, -- todo: fall back to feed_info.feed_lang? +// -- +// -- from_stop_id, +// -- coalesce(from_stop.translation, from_stop_name) as from_stop_name, +// -- from_stop.language as from_stop_name_lang, -- todo: fall back to feed_info.feed_lang? 
+// -- from_station_id, +// -- coalesce(from_station.translation, from_station_name) as from_station_name, +// -- from_station.language as from_station_name_lang, -- todo: fall back to feed_info.feed_lang? +// -- +// -- coalesce(from_stop_times_t.translation, from_stop_headsign) as from_stop_headsign, +// -- from_stop_times_t.language as from_stop_headsign_lang, -- todo: fall back to feed_info.feed_lang? +// -- from_pickup_type, +// -- t_departure, +// -- departure_time, -- todo [breaking]: this is misleading, remove it +// -- from_stop_sequence, +// -- from_timepoint, +// -- +// -- "date", +// -- +// -- to_timepoint, +// -- to_stop_sequence, +// -- t_arrival, +// -- arrival_time, -- todo [breaking]: this is misleading, remove it +// -- to_drop_off_type, +// -- coalesce(to_stop_times_t.translation, to_stop_headsign) as to_stop_headsign, +// -- to_stop_times_t.language as to_stop_headsign_lang, -- todo: fall back to feed_info.feed_lang? +// -- +// -- to_stop_id, +// -- coalesce(to_stop.translation, to_stop_name) as to_stop_name, +// -- to_stop.language as to_stop_name_lang, -- todo: fall back to feed_info.feed_lang? +// -- to_station_id, +// -- coalesce(to_station.translation, to_station_name) as to_station_name, +// -- to_station.language as to_station_name_lang -- todo: fall back to feed_info.feed_lang? +// -- FROM connections c +// -- LEFT JOIN translations route_s_t ON ( +// -- route_s_t.table_name = 'routes' AND route_s_t.field_name = 'route_short_name' +// -- AND (c.route_id = route_s_t.record_id OR c.route_short_name = route_s_t.field_value) +// -- ) +// -- LEFT JOIN translations route_l_t ON ( +// -- route_l_t.table_name = 'routes' AND route_l_t.field_name = 'route_long_name' +// -- AND (c.route_id = route_l_t.record_id OR c.route_long_name = route_l_t.field_value) +// -- ) +// -- LEFT JOIN translations trip_t ON ( +// -- trip_t.table_name = 'trips' AND trip_t.field_name = 'trip_headsign' +// -- AND (c.trip_id = trip_t.record_id OR c.trip_headsign = trip_t.field_value) +// -- ) +// -- LEFT JOIN translations from_stop ON ( +// -- from_stop.table_name = 'stops' AND from_stop.field_name = 'stop_name' +// -- AND (c.from_stop_id = from_stop.record_id OR c.from_stop_name = from_stop.field_value) +// -- ) +// -- LEFT JOIN translations from_station ON ( +// -- from_station.table_name = 'stops' AND from_station.field_name = 'stop_name' +// -- AND from_station.language = from_stop.language +// -- AND (c.from_station_id = from_station.record_id OR c.from_station_name = from_station.field_value) +// -- ) +// -- LEFT JOIN translations to_stop ON ( +// -- to_stop.table_name = 'stops' AND to_stop.field_name = 'stop_name' +// -- AND to_stop.language = from_stop.language +// -- AND (c.to_stop_id = to_stop.record_id OR c.to_stop_name = to_stop.field_value) +// -- ) +// -- LEFT JOIN translations to_station ON ( +// -- to_station.table_name = 'stops' AND to_station.field_name = 'stop_name' +// -- AND to_station.language = from_stop.language +// -- AND (c.to_station_id = to_station.record_id OR c.to_station_name = to_station.field_value) +// -- ) +// -- LEFT JOIN translations from_stop_times_t ON ( +// -- from_stop_times_t.table_name = 'stop_times' AND from_stop_times_t.field_name = 'stop_headsign' +// -- AND ( +// -- (c.trip_id = from_stop_times_t.record_id AND c.from_stop_sequence = from_stop_times_t.record_sub_id::integer) +// -- OR c.from_stop_headsign = from_stop_times_t.field_value +// -- ) +// -- ) +// -- LEFT JOIN translations to_stop_times_t ON ( +// -- to_stop_times_t.table_name = 'stop_times' 
AND to_stop_times_t.field_name = 'stop_headsign' +// -- AND ( +// -- (c.trip_id = to_stop_times_t.record_id AND c.to_stop_sequence = to_stop_times_t.record_sub_id::integer) +// -- OR c.to_stop_headsign = to_stop_times_t.field_value +// -- ) +// -- ); +// `; + + workingState.nrOfRowsByName.set('translations', await queryNumberOfRows(db, 'translations', opt)) } + +module.exports = importData diff --git a/lib/trips.js b/lib/trips.js index d6ec684..d66f7a4 100644 --- a/lib/trips.js +++ b/lib/trips.js @@ -1,95 +1,93 @@ 'use strict' +const RUN = require('./run.js') +const {queryIfColumnsExist} = require('./columns.js') +const {queryNumberOfRows} = require('./rows-count.js') + // https://gtfs.org/documentation/schedule/reference/#tripstxt -const beforeAll = (opt) => `\ -CREATE TYPE "${opt.schema}".wheelchair_accessibility AS ENUM ( +const importData = async (db, pathToTrips, opt, workingState) => { + // Several columns are optional, so they may be missing in a `read_csv()` result. + // It seems like, as of DuckDB v1.0.0, there is no way to assign default values to missing columns, neither with read_csv() nor with a nested subquery. + // todo: github ticket? + // This is why we check the file first and then programmatically determine the set of SELECT-ed columns below. + const { + wheelchair_accessible: has_wheelchair_accessible, + bikes_allowed: has_bikes_allowed, + } = await queryIfColumnsExist(db, pathToTrips, [ + 'wheelchair_accessible', + 'bikes_allowed', + ]) + + await db[RUN](`\ +CREATE TYPE wheelchair_accessibility AS ENUM ( 'unknown' -- 0 or empty - No accessibility information for the trip. , 'accessible' -- 1 – Vehicle being used on this particular trip can accommodate at least one rider in a wheelchair. , 'not_accessible' -- 2 – No riders in wheelchairs can be accommodated on this trip. ); -CREATE CAST ("${opt.schema}".wheelchair_accessibility AS text) WITH INOUT AS IMPLICIT; +-- CREATE CAST (wheelchair_accessibility AS text) WITH INOUT AS IMPLICIT; -CREATE TYPE "${opt.schema}".bikes_allowance AS ENUM ( +CREATE TYPE bikes_allowance AS ENUM ( 'unknown' -- 0 or empty - No bike information for the trip. , 'allowed' -- 1 – Vehicle being used on this particular trip can accommodate at least one bicycle. , 'not_allowed' -- 2 – No bicycles are allowed on this trip. ); -CREATE CAST ("${opt.schema}".bikes_allowance AS text) WITH INOUT AS IMPLICIT; +-- CREATE CAST (bikes_allowance AS text) WITH INOUT AS IMPLICIT; -CREATE TABLE "${opt.schema}".trips ( +CREATE TABLE trips ( trip_id TEXT PRIMARY KEY, route_id TEXT NOT NULL, - FOREIGN KEY (route_id) REFERENCES "${opt.schema}".routes, - service_id TEXT NOT NULL, -- references "${opt.schema}".service_days.service_id + FOREIGN KEY (route_id) REFERENCES routes, + -- todo: add foreign key constraint? + service_id TEXT NOT NULL, -- references service_days.service_id trip_headsign TEXT, trip_short_name TEXT, direction_id INT, block_id TEXT, - shape_id TEXT, -- todo: add NOT NULL? - ${opt.tripsWithoutShapeId ? '' : `CONSTRAINT valid_shape_id CHECK ("${opt.schema}".shape_exists(shape_id)),`} - -- todo [breaking]: use 0/unknown for empty values - wheelchair_accessible "${opt.schema}".wheelchair_accessibility, + shape_id TEXT, + ${opt.tripsWithoutShapeId ? 
'' : `FOREIGN KEY (shape_id) REFERENCES shapes,`} + wheelchair_accessible wheelchair_accessibility, -- todo [breaking]: use 0/unknown for empty values - bikes_allowed "${opt.schema}".bikes_allowance + bikes_allowed bikes_allowance ); -COPY "${opt.schema}".trips ( - trip_id, - route_id, - service_id, - trip_headsign, - trip_short_name, - direction_id, - block_id, - shape_id, - wheelchair_accessible, - bikes_allowed -) FROM STDIN csv; -` +INSERT INTO trips +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT + ${has_wheelchair_accessible ? `` : `NULL AS wheelchair_accessible,`} + ${has_bikes_allowed ? `` : `NULL AS bikes_allowed,`} + * + REPLACE ( + -- dummy entry in case no optional column is present + trip_id AS trip_id, + ${has_wheelchair_accessible ? ` + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. + enum_range(NULL::wheelchair_accessibility)[wheelchair_accessible + 1] AS wheelchair_accessible, + ` : ``} + ${has_bikes_allowed ? ` + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. + enum_range(NULL::bikes_allowance)[bikes_allowed + 1] AS bikes_allowed + ` : ``} + ) +FROM read_csv( + '${pathToTrips}', + header = true, + all_varchar = true, + types = { + direction_id: 'INTEGER', + ${has_wheelchair_accessible ? `wheelchair_accessible: 'INTEGER',` : ``} + ${has_bikes_allowed ? `bikes_allowed: 'INTEGER',` : ``} + } +); -const wheelchairAccessibility = (val) => { - if (val === '0') return 'unknown' - if (val === '1') return 'accessible' - if (val === '2') return 'not_accessible' - throw new Error('invalid wheelchair_accessibility: ' + val) -} +-- For a primary key, DuckDB doesn't create an index automatically. +CREATE UNIQUE INDEX trips_trip_id ON trips(trip_id); +`) -const bikesAllowance = (val) => { - if (val === '0') return 'unknown' - if (val === '1') return 'allowed' - if (val === '2') return 'not_allowed' - throw new Error('invalid bikes_allowance: ' + val) + workingState.nrOfRowsByName.set('trips', await queryNumberOfRows(db, 'trips', opt)) } -const formatTripsRow = (t) => { - return [ - t.trip_id || null, - t.route_id || null, - t.service_id || null, - t.trip_headsign || null, - t.trip_short_name || null, - t.direction_id ? parseInt(t.direction_id) : null, - t.block_id || null, - t.shape_id || null, - t.wheelchair_accessible - ? wheelchairAccessibility(t.wheelchair_accessible) - : null, - t.bikes_allowed ? bikesAllowance(t.bikes_allowed) : null, - ] -} - -const afterAll = (opt) => `\ -\\. - -CREATE INDEX ON "${opt.schema}".trips (route_id); - -${opt.postgraphile ? `\ -COMMENT ON TABLE "${opt.schema}".trips IS E'@foreignKey (shape_id) references shapes_aggregated|@fieldName shape'; -` : ''} -` - -module.exports = { - beforeAll, - formatRow: formatTripsRow, - afterAll, -} +module.exports = importData diff --git a/lib/util.js b/lib/util.js deleted file mode 100644 index affe20b..0000000 --- a/lib/util.js +++ /dev/null @@ -1,12 +0,0 @@ -'use strict' - -const parseTime = require('gtfs-utils/parse-time') - -const formatTime = (gtfsTime) => { - const {hours: h, minutes: m, seconds: s} = parseTime(gtfsTime) - return `${h} hours ${m} minutes ${s === null ? 
0 : s} seconds` -} - -module.exports = { - formatTime, -} diff --git a/package.json b/package.json index 103e824..0d30a73 100644 --- a/package.json +++ b/package.json @@ -1,11 +1,11 @@ { - "name": "gtfs-via-postgres", - "description": "Process GTFS using PostgreSQL.", - "version": "4.10.4", + "private": true, + "name": "gtfs-via-duckdb", + "description": "Analyze GTFS datasets using DuckDB.", + "version": "5.0.0", "main": "lib/index.js", "bin": { - "gtfs-to-sql": "cli.js", - "serve-gtfs-via-graphql": "scripts/run-postgraphile.js" + "gtfs-to-duckdb": "cli.js" }, "files": [ "cli.js", @@ -23,10 +23,9 @@ "public transport", "transit", "convert", - "postgres", - "postgresql", - "sql", - "graphql" + "duckdb", + "data analysis", + "sql" ], "author": "Jannis R ", "contributors": [ @@ -34,12 +33,12 @@ "Magnus Burton ", "smohiudd " ], - "homepage": "https://github.com/public-transport/gtfs-via-postgres/tree/4.10.4", + "homepage": "https://github.com/public-transport/gtfs-via-duckdb/tree/5.0.0", "repository": { "type": "git", - "url": "git+https://github.com/public-transport/gtfs-via-postgres.git" + "url": "git+https://github.com/public-transport/gtfs-via-duckdb.git" }, - "bugs": "https://github.com/public-transport/gtfs-via-postgres/issues", + "bugs": "https://github.com/public-transport/gtfs-via-duckdb/issues", "license": "(Apache-2.0 AND Prosperity-3.0.0)", "funding": [ { @@ -56,33 +55,26 @@ } ], "engines": { - "node": ">=20.17" + "node": ">=22" }, "dependencies": { - "csv-stringify": "^6.2.0", + "@duckdb/node-api": "^1.3.2-alpha.25", "debug": "^4.3.3", "gtfs-utils": "^5.1.0", "sequencify": "0.0.7" }, "devDependencies": { - "@graphile-contrib/pg-simplify-inflector": "^6.1.0", - "@graphile/postgis": "^0.2.0-0", "@yao-pkg/pkg": "^6.6.0", "csv-parser": "^3.0.0", "eslint": "^8.33.0", - "postgraphile": "^4.12.11", - "sample-gtfs-feed": "^0.13.0" - }, - "peerDependencies": { - "@graphile-contrib/pg-simplify-inflector": "^6.1.0", - "@graphile/postgis": "^0.2.0-0", - "postgraphile": "^4.12.11" + "sample-gtfs-feed": "^0.13.0", + "tinybench": "^4.0.1" }, "scripts": { "test": "./test/index.sh", "lint": "eslint .", "benchmark": "./benchmark/run.sh", - "build-binaries": "pkg --public -t node22-macos-x64,node22-macos-arm64,node22-linux-x64,node22-linux-arm64 -o dist/gtfs-via-postgres cli.js && gzip -k --best dist/gtfs-via-postgres-*", + "build-binaries": "pkg --public -t node22-macos-x64,node22-macos-arm64,node22-linux-x64,node22-linux-arm64 -o dist/gtfs-via-duckdb cli.js && gzip -k --best dist/gtfs-via-duckdb-*", "prepublishOnly": "npm run lint && npm test" } } diff --git a/readme.md b/readme.md index 463dbe4..f8ce109 100644 --- a/readme.md +++ b/readme.md @@ -1,22 +1,21 @@ -# gtfs-via-postgres +# gtfs-via-duckdb -**Import [GTFS Static/Schedule](https://gtfs.org/documentation/schedule/reference/) datasets into a [PostgreSQL database](https://www.postgresql.org)**, to allow for efficient querying and analysis. +**Import [GTFS Static/Schedule](https://gtfs.org/documentation/schedule/reference/) datasets into a [DuckDB database](https://duckdb.org)**, to allow for efficient querying and analysis. 
-[![npm version](https://img.shields.io/npm/v/gtfs-via-postgres.svg)](https://www.npmjs.com/package/gtfs-via-postgres)
-[![binary build status](https://img.shields.io/github/actions/workflow/status/public-transport/gtfs-via-postgres/publish.yml?label=binary%20build)](https://github.com/public-transport/gtfs-via-postgres/actions)
+[![npm version](https://img.shields.io/npm/v/gtfs-via-duckdb.svg)](https://www.npmjs.com/package/gtfs-via-duckdb)
+[![binary build status](https://img.shields.io/github/actions/workflow/status/public-transport/gtfs-via-duckdb/publish.yml?label=binary%20build)](https://github.com/public-transport/gtfs-via-duckdb/actions)
[![Prosperity/Apache license](https://img.shields.io/static/v1?label=license&message=Prosperity%2FApache&color=0997E8)](#license)
-![minimum Node.js version](https://img.shields.io/node/v/gtfs-via-postgres.svg)
+![minimum Node.js version](https://img.shields.io/node/v/gtfs-via-duckdb.svg)
[![support me via GitHub Sponsors](https://img.shields.io/badge/support%20me-donate-fa7664.svg)](https://github.com/sponsors/derhuerst)
[![chat with me on Twitter](https://img.shields.io/badge/chat%20with%20me-on%20Twitter-1da1f2.svg)](https://twitter.com/derhuerst)

- ✅ handles [daylight saving time correctly](#correctness-vs-speed-regarding-gtfs-time-values) but retains reasonable lookup performance
- ✅ supports `frequencies.txt`
-- ✨ joins `stop_times.txt`/`frequencies.txt`, `calendar.txt`/`calendar_dates.txt`, `trips.txt`, `route.txt` & `stops.txt` into [views](https://www.postgresql.org/docs/14/sql-createview.html) for straightforward data analysis (see below)
-- 🚀 is carefully optimised to let PostgreSQL's query planner do its magic, yielding quick lookups even with large datasets (see [performance section](#performance))
+- ✨ joins `stop_times.txt`/`frequencies.txt`, `calendar.txt`/`calendar_dates.txt`, `trips.txt`, `routes.txt` & `stops.txt` into [views](https://duckdb.org/docs/stable/sql/statements/create_view) for straightforward data analysis (see below)
+- 🚀 is carefully optimised to let DuckDB's query planner do its magic, yielding quick lookups even with large datasets (see [performance section](#performance))
- ✅ validates and imports `translations.txt`
-- ✨ exposes (almost) all data via GraphQL using [PostGraphile](https://www.graphile.org/postgraphile/introduction/), and as a RESTful API using [PostgREST](https://postgrest.org/)

-To work with the time-related data (`stop_times` etc.), `gtfs-via-postgres` supports two "mental models":
+To work with the time-related data (`stop_times` etc.), `gtfs-via-duckdb` supports two "mental models":

- the time-*unexpanded* data that is almost directly taken from the GTFS Schedule data – This is useful if you want to do network analysis.
- the time-*expanded* view that "applies" every trip's `stop_times` rows to all of its service days – This is useful for routing & queries from the traveller's perspective.
@@ -25,61 +24,59 @@ To work with the time-related data (`stop_times` etc.), `gtfs-via-postgres` supp

## Installation

```shell
-npm install -g gtfs-via-postgres
+There are also [prebuilt binaries](https://github.com/public-transport/gtfs-via-duckdb/releases/latest) and [Docker images](https://github.com/public-transport/gtfs-via-duckdb/pkgs/container/gtfs-via-duckdb) available. -*Note:* `gtfs-via-postgres` **needs PostgreSQL >=14** to work, as it uses the [`WITH … AS NOT MATERIALIZED`](https://www.postgresql.org/docs/14/queries-with.html#id-1.5.6.12.7) syntax. You can check your PostgreSQL server's version with `psql -t -c 'SELECT version()'`. +> [!NOTE] +> `gtfs-via-duckdb` **needs DuckDB >=1.2** and its [`icu`](https://duckdb.org/docs/stable/extensions/icu) and [`spatial`](https://duckdb.org/docs/stable/extensions/spatial/overview) extensions to work. ## Getting Started +Install the DuckDB [`icu`](https://duckdb.org/docs/stable/extensions/icu) and [`spatial`](https://duckdb.org/docs/stable/extensions/spatial/overview) extensions. + +```shell +duckdb_cli -c 'INSTALL icu' +duckdb_cli -c 'INSTALL spatial' +``` + If you have a `.zip` GTFS feed, unzip it into individual files. -We're going to use the [2022-07-01 *VBB* feed](https://vbb-gtfs.jannisr.de/2022-07-01/) as an example, which consists of individual files already. +We're going to use the [2025-05-21 *VBB* feed](https://vbb-gtfs.jannisr.de/2025-05-21/) as an example, which consists of individual files already. ```sh wget --compression auto \ - -r --no-parent --no-directories -R .csv.gz \ - -P gtfs -N 'https://vbb-gtfs.jannisr.de/2022-07-01/' + -r --no-parent --no-directories -R .csv.gz -R .csv.br \ + -P gtfs -N 'https://vbb-gtfs.jannisr.de/2025-05-21/' # … # Downloaded 14 files in 20s. ls -lh gtfs -# 3.3K agency.csv -# 97K calendar.csv -# 1.1M calendar_dates.csv +# 3.2K agency.csv +# 107K calendar.csv +# 1.2M calendar_dates.csv # 2.5K datapackage.json # 64B frequencies.csv -# 5.9K levels.csv +# 6.1K levels.csv # 246B license -# 8.3M pathways.csv -# 49K routes.csv -# 146M shapes.csv -# 368M stop_times.csv -# 5.0M stops.csv -# 4.7M transfers.csv -# 16M trips.csv -``` - -Depending on your specific setup, configure access to the PostgreSQL database via [`PG*` environment variables](https://www.postgresql.org/docs/14/libpq-envars.html): - -```sh -export PGUSER=postgres -export PGPASSWORD=password -env PGDATABASE=postgres psql -c 'create database vbb_2022_02_25' -export PGDATABASE=vbb_2022_02_25 +# 8.9M pathways.csv +# 50K routes.csv +# 152M shapes.csv +# 383M stop_times.csv +# 7.0M stops.csv +# 3.0M transfers.csv +# 17M trips.csv ``` -*Note*: `gtfs-via-postgres` generates SQL that contains the `CREATE EXTENSION postgis` instruction. For this to work, the PostgreSQL user you're connecting as needs the `CREATE` [permission](https://www.postgresql.org/docs/14/ddl-priv.html) on the database. Also, the `postgis` extension must either be marked as trusted (by putting `trusted = true` into `$(pg_config --sharedir)/extension/postgis.control`), or your user must be a superuser. - -Install `gtfs-via-postgres` and use it to import the GTFS data: +Install `gtfs-via-duckdb` and use it to import the GTFS data: ```sh -npm install -D gtfs-via-postgres -npm exec -- gtfs-to-sql --require-dependencies -- gtfs/*.csv | sponge | psql -b +npm install -D gtfs-via-duckdb +npm exec -- gtfs-to-duckdb --require-dependencies -- gtfs.duckdb gtfs/*.csv +# todo # agency # calendar # CREATE EXTENSION @@ -92,37 +89,37 @@ npm exec -- gtfs-to-sql --require-dependencies -- gtfs/*.csv | sponge | psql -b # COMMIT ``` -Importing will take 10s to 10m, depending on the size of the feed. 
On an [M1 MacBook Air](https://en.wikipedia.org/wiki/MacBook_Air_(Apple_silicon)#Third_generation_(Retina_with_Apple_silicon)), importing the above feed takes about 4m; Importing the [260kb 2021-10-06 Amtrak feed](https://transitfeeds.com/p/amtrak/1136/20211006) takes 6s. +Importing will take a few seconds to a few minutes, depending on the size of the feed. On an [M2](https://en.wikipedia.org/wiki/Apple_M2) laptop, importing the above feed takes about 30s. -In addition to a table for each GTFS file, `gtfs-via-postgres` adds these views to help with real-world analysis: +In addition to a table for each GTFS file, `gtfs-via-duckdb` adds these views to help with real-world analysis: -- `service_days` ([materialized](https://www.postgresql.org/docs/14/sql-creatematerializedview.html)) "applies" [`calendar_dates`](https://gtfs.org/documentation/schedule/reference/#calendar_datestxt) to [`calendar`](https://gtfs.org/documentation/schedule/reference/#calendartxt) to give you all days of operation for each "service" defined in [`calendar`](https://gtfs.org/documentation/schedule/reference/#calendartxt). +- `service_days` (table) "applies" [`calendar_dates`](https://gtfs.org/documentation/schedule/reference/#calendar_datestxt) to [`calendar`](https://gtfs.org/documentation/schedule/reference/#calendartxt) to give you all days of operation for each "service" defined in [`calendar`](https://gtfs.org/documentation/schedule/reference/#calendartxt). - `arrivals_departures` "applies" [`stop_times`](https://gtfs.org/documentation/schedule/reference/#stop_timestxt)/[`frequencies`](https://gtfs.org/documentation/schedule/reference/#frequenciestxt) to [`trips`](https://gtfs.org/documentation/schedule/reference/#tripstxt) and `service_days` to give you all arrivals/departures at each stop with their *absolute* dates & times. It also resolves each stop's parent station ID & name. - `connections` "applies" [`stop_times`](https://gtfs.org/documentation/schedule/reference/#stop_timestxt)/[`frequencies`](https://gtfs.org/documentation/schedule/reference/#frequenciestxt) to [`trips`](https://gtfs.org/documentation/schedule/reference/#tripstxt) and `service_days`, just like `arrivals_departures`, but gives you departure (at stop A) & arrival (at stop B) *pairs*. -- `shapes_aggregated` aggregates individual shape points in [`shapes`](https://gtfs.org/documentation/schedule/reference/#shapestxt) into a [PostGIS `LineString`](http://postgis.net/workshops/postgis-intro/geometries.html#linestrings). +- `shapes_aggregated` aggregates individual shape points in [`shapes`](https://gtfs.org/documentation/schedule/reference/#shapestxt) into a [`LineString`](https://duckdb.org/docs/stable/extensions/spatial/overview#the-geometry-type). - `stats_by_route_date` provides the number of arrivals/departures by route ID and date. – [read more](docs/analysis/feed-by-route-date.md) - `stats_by_agency_route_stop_hour` provides the number of arrivals/departures by agency ID, route ID, stop ID & hour. – [read more](docs/analysis/feed-by-agency-route-stop-and-hour.md) - In contrast to `stats_by_route_date` & `stats_by_agency_route_stop_hour`, `stats_active_trips_by_hour` provides the number of *currently running* trips for each hour in the feeds period of time. 
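The `connections` view can be queried in the same way as `arrivals_departures` in the example below. As a rough sketch – the column names are those exposed by the `connections` view, and the station ID & time window are simply borrowed from the `arrivals_departures` example – a query for all connections departing from *S Ostkreuz Bhf (Berlin)* within a 5-minute window might look like this:

```sql
SELECT
	route_short_name, trip_id,
	from_stop_id, from_stop_name, t_departure,
	to_stop_id, to_stop_name, t_arrival
FROM connections
WHERE from_station_id = 'de:11000:900120003' -- S Ostkreuz Bhf (Berlin)
AND t_departure >= '2022-03-23T12:30:00+01' AND t_departure <= '2022-03-23T12:35:00+01'
```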
-As an example, we're going to use the `arrivals_departures` view to query all *absolute* departures at `de:11000:900120003` (*S Ostkreuz Bhf (Berlin)*) between `2022-03-23T12:30+01` and `2022-03-23T12:35+01`: +As an example, we're going to use the `arrivals_departures` view to query all *absolute* departures at `de:11000:900120003` (*S Ostkreuz Bhf (Berlin)*) between `2022-03-23T12:30:00+01` and `2022-03-23T12:35:00+01`: ```sql SELECT * FROM arrivals_departures WHERE station_id = 'de:11000:900120003' -AND t_departure >= '2022-03-23T12:30+01' AND t_departure <= '2022-03-23T12:35+01' +AND t_departure >= '2022-03-23T12:30:00+01' AND t_departure <= '2022-03-23T12:35:00+01' ``` `route_id` | `route_short_name` | `route_type` | `trip_id` | `date` | `stop_sequence` | `t_arrival` | `t_departure` | `stop_id` | `stop_name` | `station_id` | `station_name` -|-|-|-|-|-|-|-|-|-|-|- -`10148_109` | `S3` | `109` | `169035756` | `2022-03-23 00:00:00` | `19` | `2022-03-23 12:31:24+01` | `2022-03-23 12:32:12+01` | `de:11000:900120003:2:53` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` -`10148_109` | `S3` | `109` | `169035899` | `2022-03-23 00:00:00` | `10` | `2022-03-23 12:33:06+01` | `2022-03-23 12:33:54+01` | `de:11000:900120003:3:55` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` -`10162_109` | `S7` | `109` | `169128381` | `2022-03-23 00:00:00` | `19` | `2022-03-23 12:33:54+01` | `2022-03-23 12:34:42+01` | `de:11000:900120003:2:53` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` -`10162_109` | `S7` | `109` | `169128495` | `2022-03-23 00:00:00` | `9` | `2022-03-23 12:30:36+01` | `2022-03-23 12:31:24+01` | `de:11000:900120003:3:55` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` -`10223_109` | `S41` | `109` | `169054370` | `2022-03-23 00:00:00` | `21` | `2022-03-23 12:30:24+01` | `2022-03-23 12:31:12+01` | `de:11000:900120003:5:58` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` -`10227_109` | `S42` | `109` | `169071882` | `2022-03-23 00:00:00` | `6` | `2022-03-23 12:30:30+01` | `2022-03-23 12:31:12+01` | `de:11000:900120003:5:59` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` -`19040_100` | `RB14` | `100` | `178748721` | `2022-03-23 00:00:00` | `13` | `2022-03-23 12:30:00+01` | `2022-03-23 12:30:00+01` | `de:11000:900120003:1:50` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` -`22664_2` | `FEX` | `2` | `178748125` | `2022-03-23 00:00:00` | `1` | `2022-03-23 12:32:00+01` | `2022-03-23 12:34:00+01` | `de:11000:900120003:4:57` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` +`10148_109` | `S3` | `109` | `169035756` | `2022-03-23 00:00:00` | `19` | `2022-03-23 12:31:24+01` | `2022-03-23 12:32:12+01` | `de:11000:900120003:2` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` +`10148_109` | `S3` | `109` | `169035899` | `2022-03-23 00:00:00` | `10` | `2022-03-23 12:33:06+01` | `2022-03-23 12:33:54+01` | `de:11000:900120003:3` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` +`10162_109` | `S7` | `109` | `169128381` | `2022-03-23 00:00:00` | `19` | `2022-03-23 12:33:54+01` | `2022-03-23 12:34:42+01` | `de:11000:900120003:2` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` +`10162_109` | `S7` | `109` | `169128495` | `2022-03-23 00:00:00` | `9` | `2022-03-23 12:30:36+01` | 
`2022-03-23 12:31:24+01` | `de:11000:900120003:3` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` +`10223_109` | `S41` | `109` | `169054370` | `2022-03-23 00:00:00` | `21` | `2022-03-23 12:30:24+01` | `2022-03-23 12:31:12+01` | `de:11000:900120003:5` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` +`10227_109` | `S42` | `109` | `169071882` | `2022-03-23 00:00:00` | `6` | `2022-03-23 12:30:30+01` | `2022-03-23 12:31:12+01` | `de:11000:900120003:5` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` +`19040_100` | `RB14` | `100` | `178748721` | `2022-03-23 00:00:00` | `13` | `2022-03-23 12:30:00+01` | `2022-03-23 12:30:00+01` | `de:11000:900120003:1` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` +`22664_2` | `FEX` | `2` | `178748125` | `2022-03-23 00:00:00` | `1` | `2022-03-23 12:32:00+01` | `2022-03-23 12:34:00+01` | `de:11000:900120003:4` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` ### translations @@ -147,7 +144,7 @@ AND (stop_url_lang = 'de-CH' OR stop_url_lang IS NULL) ``` Usage: - gtfs-to-sql [options] [--] ... + import-gtfs-into-duckdb [options] [--] ... Options: --silent -s Don't show files being converted. --require-dependencies -d Require files that the specified GTFS files depend @@ -187,147 +184,54 @@ Options: currently running trips over time, by hour. Like --stats-by-route-date, this flag accepts none, view & materialized-view. - --schema The schema to use for the database. Default: public - Even when importing into a schema other than `public`, - a function `public.gtfs_via_postgres_import_version()` - gets created, to ensure that multiple imports into the - same database are all made using the same version. See - also multiple-datasets.md in the docs. - --postgraphile Tweak generated SQL for PostGraphile usage. - https://www.graphile.org/postgraphile/ - --postgraphile-password Password for the PostGraphile PostgreSQL user. - Default: $POSTGRAPHILE_PGPASSWORD, fallback random. - --postgrest Tweak generated SQL for PostgREST usage. - Please combine it with --schema. - https://postgrest.org/ - --postgrest-password Password for the PostgREST PostgreSQL user `web_anon`. - Default: $POSTGREST_PGPASSWORD, fallback random. - --postgrest-query-cost-limit Define a cost limit [1] for queries executed by PostgREST - on behalf of a user. It is only enforced if - pg_plan_filter [2] is installed in the database! - Must be a positive float. Default: none - [1] https://www.postgresql.org/docs/14/using-explain.html - [2] https://github.com/pgexperts/pg_plan_filter --import-metadata Create functions returning import metadata: - gtfs_data_imported_at (timestamp with time zone) - - gtfs_via_postgres_version (text) - - gtfs_via_postgres_options (jsonb) + - gtfs_via_duckdb_version (text) + - gtfs_via_duckdb_options (jsonb) +Notes: + If you just want to check if the GTFS data can be imported but don't care about the + resulting DuckDB database file, you can import into an in-memory database by specifying + `:memory:` as the . 
Examples: - gtfs-to-sql some-gtfs/*.txt | sponge | psql -b # import into PostgreSQL - gtfs-to-sql -u -- some-gtfs/*.txt | gzip >gtfs.sql.gz # generate a gzipped SQL dump + import-gtfs-into-duckdb some-gtfs.duckdb some-gtfs/*.txt [1] https://developers.google.com/transit/gtfs/reference/extended-route-types [2] https://groups.google.com/g/gtfs-changes/c/keT5rTPS7Y0/m/71uMz2l6ke0J ``` -Some notable limitations mentioned in the [PostgreSQL 14 documentation on date/time types](https://www.postgresql.org/docs/14/datatype-datetime.html): - -> For `timestamp with time zone`, the internally stored value is always in UTC (Universal Coordinated Time, traditionally known as Greenwich Mean Time, GMT). An input value that has an explicit time zone specified is converted to UTC using the appropriate offset for that time zone. - -> When a `timestamp with time zone` value is output, it is always converted from UTC to the current `timezone` zone, and displayed as local time in that zone. To see the time in another time zone, either change `timezone` or use the `AT TIME ZONE` construct […]. - -You can run queries with date+time values in any timezone (offset) and they will be processed correctly, but the output will always be in the database timezone (offset), unless you have explicitly used `AT TIME ZONE`. +> [!TIP] +> DuckDB will always store `timestamp with time zone` values as microseconds since the [Unix epoch](https://en.wikipedia.org/wiki/Unix_time) (similar to UTC). An input value with an explicit offset specified (e.g. `2022-03-04T05:06:07+08:00`) is converted to the internal representation using the offset. +> When the stored value is queried, it is always converted back into the current offset of the timezone specified by the `TimeZone` config. To see the time in another time zone, [change the `TimeZone` config](https://duckdb.org/docs/1.2/sql/data_types/timestamp#settings). +> TLDR: You can run queries with date+time values in any timezone (offset) and they will be processed correctly. ### With Docker *Note:* Just like the `npm`-installed variant, the Docker integration too assumes that your GTFS dataset consists of individual files (i.e. unzipped). -Instead of installing via `npm`, you can use [the `ghcr.io/public-transport/gtfs-via-postgres` Docker image](https://github.com/public-transport/gtfs-via-postgres/pkgs/container/gtfs-via-postgres): - -```shell -# variant A: use Docker image just to convert GTFS to SQL -docker run --rm --volume /path/to/gtfs:/gtfs \ - ghcr.io/public-transport/gtfs-via-postgres --require-dependencies -- '/gtfs/*.csv' \ - | sponge | psql -b -``` +Instead of installing via `npm`, you can use [the `ghcr.io/public-transport/gtfs-via-duckdb` Docker image](https://github.com/public-transport/gtfs-via-duckdb/pkgs/container/gtfs-via-duckdb): *Note:* Remember to pass the `/gtfs/*.csv` glob as a string (with `'`), so that it gets evaluated *inside* the Docker container. -With the code above, the `psql -b` process will run *outside* of the Docker container, so your host machine needs access to PostgreSQL. - -If you want to directly import the GTFS data *from within the Docker container*, you need add `psql` to the image and run it from inside. 
To do that, write a new Dockerfile that extends the `ghcr.io/public-transport/gtfs-via-postgres` image: - -```Dockerfile -FROM ghcr.io/public-transport/gtfs-via-postgres -ENV PGPORT=5432 PGUSER=postgres -WORKDIR /gtfs -# pass all arguments into gtfs-via-postgres, pipe output into psql: -ENTRYPOINT ["/bin/sh", "-c", "gtfs-via-postgres $0 $@ | sponge | psql -b"] -``` - ```shell -# start PostgreSQL DB in another container "db" -docker run --name db -p 5432:5432 -e POSTGRES_PASSWORD=password postgis/postgis - -# variant B: use Docker image to convert GTFS to SQL and import it directly -docker build -t import-gtfs . # build helper Docker image from Dockerfile docker run --rm --volume /path/to/gtfs:/gtfs \ - --link db -e PGHOST=db -e PGPASSWORD=password \ - import-gtfs --require-dependencies -- '/gtfs/*.csv' + ghcr.io/public-transport/gtfs-via-duckdb --require-dependencies -- '/gtfs/*.csv' ``` -### Importing a GTFS Schedule feed continuously - -[postgis-gtfs-importer](https://github.com/mobidata-bw/postgis-gtfs-importer) imports [GTFS Schedule](https://gtfs.org/schedule/) data into a [PostGIS](https://postgis.net) database using `gtfs-via-postgres`. It allows running a production service (e.g. an API) on top of programmatically re-imported data from a periodically changing GTFS feed without downtime. - -Because it works as [atomically](https://en.wikipedia.org/wiki/Atomicity_(database_systems)) as possible with PostgreSQL, it makes the import pipeline *robust*, even if an import fails or if simultaneous imports get started. - ### Exporting data efficiently -If you want to export data from the database, use the [`COPY` command](https://www.postgresql.org/docs/14/sql-copy.html); On an [M1 MacBook Air](https://en.wikipedia.org/wiki/MacBook_Air_(Apple_silicon)#Third_generation_(Retina_with_Apple_silicon)), PostgreSQL 14 can export about 500k `connections` rows per second. +If you want to export data from the database, use the [`COPY` command](https://duckdb.org/docs/stable/sql/statements/copy). ```shell -psql -c 'COPY (SELECT * FROM connections) TO STDOUT csv HEADER' >connections.csv +duckdb -c 'COPY (SELECT * FROM connections) TO STDOUT csv HEADER' my-gtfs.duckdb >my-gtfs-connections.csv ``` -In the nested `SELECT` query, you can use features like `WHERE`, `ORDER BY` and `LIMIT`. Because `psql` passes on the exported data right away, you could stream it into another process. - ### Querying stops by location efficiently -If you want to find stops by (geo)location, run `gtfs-via-postgres` with `--stops-location-index`. This will create a [spatial index](https://postgis.net/workshops/postgis-intro/indexing.html) on `stops.stop_loc`, so that most [PostGIS functions & operators](https://postgis.net/docs/manual-3.2/reference.html#Measurement_Functions) make use of it. - -### GraphQL support - -The `--postgraphile` flag changes the SQL generated by `gtfs-via-postgres` slightly, so that you get a reasonably idiomatic GraphQL API out-of-the-box when running [PostGraphile](https://www.graphile.org/postgraphile/) v4 on it: - -```shell -# import data into PostgreSQL with PostGraphile tweaks -npm exec -- gtfs-to-sql -d --postgraphile -- gtfs/*.csv | sponge | psql -b -``` - -In line with the intended PostGraphile usage, `gtfs-via-postgres` will create a PostgreSQL role/user `postgraphile` with read-only access to the DB. 
You can set the `postgraphile`'s password with the `--postgraphile-password` option, or using the `$POSTGRAPHILE_PGPASSWORD` environment variable; By default, it will use (and log) a random password. - -`gtfs-via-postgres` *doesn't* specify PostGraphile as a regular dependency, but as `peerDependencies`, in order to stay lightweight for users who don't need the GraphQL interface. Some versions of some package managers install unmet peer dependencies, some don't. Let's make sure that PostGraphile (and its plugins) are installed: - -```shell -npm install \ - postgraphile@^4.12 \ - @graphile-contrib/pg-simplify-inflector@^6.1 \ - @graphile/postgis@^0.2.0-0 -``` - -The `serve-gtfs-via-graphql` helper script configures and runs PostGraphile. With `NODE_ENV=development`, it will - -- serve a fully configured [GraphiQL UI](https://graphql-dotnet.github.io/docs/getting-started/graphiql/) at `/graphiql` -- provide more errors on database & query errors -- allow [using PostgreSQL's `EXPLAIN` via GraphQL](https://www.graphile.org/postgraphile/debugging/#via-postgraphiql-explain) - -``` -# listens on port 3000, this can be changed using $PORT -env NODE_ENV=development npm exec -- serve-gtfs-via-graphql -``` - -**As an example for the GraphQL API, check out the [test query](test/sample-gtfs-feed-postgraphile-test.graphql)** or open the [GraphiQL UI](https://github.com/graphql/graphiql) served at [`localhost:3000/graphiql`](http://localhost:3000/graphiql). - -### REST API support - -With the `--postgrest` flag, `gtfs-via-postgres` will augment the schema with a `web_anon` role and some comments, so that when running [PostgREST](https://postgrest.org/) on the database, you will get a powerful REST API. - -[read more](docs/postgrest.md) +If you want to find stops by (geo)location, run `gtfs-via-duckdb` with `--stops-location-index`. This will create a [spatial index](https://duckdb.org/docs/stable/extensions/spatial/r-tree_indexes) on `stops.stop_loc`, so that most spatial queries can be done efficiently. ### more guides -The [`docs` directory](docs) contains more instructions on how to use `gtfs-via-postgres`. +The [`docs` directory](docs) contains more instructions on how to use `gtfs-via-duckdb`. ## Correctness vs. Speed regarding GTFS Time Values @@ -338,24 +242,24 @@ This means that, in order to determine all *absolute* points in time where a par Let's consider two examples: -- A `departure_time` of `26:59:00` with a trip running on `2021-03-01`: The time, applied to this specific date, "extends" into the following day, so it actually departs at `2021-03-02T02:59+01`. -- A departure time of `03:01:00` with a trip running on `2021-03-28`: This is when the standard -> DST switch happens in the `Europe/Berlin` timezone. Because the dep. time refers to noon - 12h (*not* to midnight), it actually happens at `2021-03-28T03:01+02` which is *not* `3h1m` after `2021-03-28T00:00+01`. +- A `departure_time` of `26:59:00` with a trip running on `2021-03-01`: The time, applied to this specific date, "extends" into the following day, so it actually departs at `2021-03-02T02:59:00+01`. +- A departure time of `03:01:00` with a trip running on `2021-03-28`: This is when the standard -> DST switch happens in the `Europe/Berlin` timezone. Because the dep. time refers to noon - 12h (*not* to midnight), it actually happens at `2021-03-28T03:01:00+02` which is *not* `3h1m` after `2021-03-28T00:00:00+01`. -`gtfs-via-postgres` always prioritizes correctness over speed. 
Because it follows the GTFS semantics, when filtering `arrivals_departures` by *absolute* departure date+time, it cannot automatically filter `service_days` (which is `calendar` and `calendar_dates` combined), because **even a date *before* the date of the desired departure time frame might still end up *within*, when combined with a `departure_time` of e.g. `27:30:00`**; Instead, it has to consider all `service_days` and apply the `departure_time` to all of them to check if they're within the range. +`gtfs-via-duckdb` always prioritizes correctness over speed. Because it follows the GTFS semantics, when filtering `arrivals_departures` by *absolute* departure date+time, it cannot automatically filter `service_days` (which is `calendar` and `calendar_dates` combined), because **even a date *before* the date of the desired departure time frame might still end up *within*, when combined with a `departure_time` of e.g. `27:30:00`**; Instead, it has to consider all `service_days` and apply the `departure_time` to all of them to check if they're within the range. -However, if you determine your feed's largest `arrival_time`/`departure_time`, you can filter on `date` when querying `arrivals_departures`; This allows PostgreSQL to reduce the number of joins and calendar calculations by orders of magnitude, speeding up your queries significantly. `gtfs-via-postgres` provides two low-level helper functions `largest_arrival_time()` & `largest_departure_time()` for this, as well as two high-level helper functions `dates_filter_min(t_min)` & `dates_filter_max(t_max)` (see below). +However, if you determine your feed's largest `arrival_time`/`departure_time`, you can filter on `date` when querying `arrivals_departures`; This allows DuckDB to reduce the number of joins and calendar calculations by orders of magnitude, speeding up your queries significantly. `gtfs-via-duckdb` provides a low-level helper table `largest_arr_dep_time` for this, as well as two high-level helper functions `dates_filter_min(t_min)` & `dates_filter_max(t_max)` (see below). 
-For example, when querying all *absolute* departures at `de:11000:900120003` (*S Ostkreuz Bhf (Berlin)*) between `2022-03-23T12:30+01` and `2022-03-23T12:35+01` within the [2022-02-25 *VBB* feed](https://vbb-gtfs.jannisr.de/2022-02-25/), filtering by `date` speeds it up nicely (Apple M1, PostgreSQL 14.2): +For example, when querying all *absolute* departures at `de:11000:900120003` (*S Ostkreuz Bhf (Berlin)*) between `2022-03-23T12:30:00+01` and `2022-03-23T12:35:00+01` within the [2025-05-21 *VBB* feed](https://vbb-gtfs.jannisr.de/2025-05-21/), filtering by `date` speeds it up nicely (Apple M2, DuckDB v1.3.0): `station_id` filter | `date` filter | query time | nr of results -|-|-|- -`de:11000:900120003` | *none* | 230ms | ~574k -`de:11000:900120003` | `2022-03-13` >= `date` < `2022-04-08` | 105ms | ~51k -`de:11000:900120003` | `2022-03-23` >= `date` < `2022-03-24` | 55ms | ~2k -`de:11000:900120003` | `2022-03-22` > `date` < `2022-03-24` | 55ms | ~2k -*none* | *none* | 192s | 370m -*none* | `2022-03-13` >= `date` < `2022-04-08` | 34s | ~35m -*none* | `2022-03-22` > `date` < `2022-03-24` | 2.4s | ~1523k +`de:11000:900120003` | *none* | todo | ~todok +`de:11000:900120003` | `2022-03-13` >= `date` < `2022-04-08` | todo | ~todok +`de:11000:900120003` | `2022-03-23` >= `date` < `2022-03-24` | todo | ~todok +`de:11000:900120003` | `2022-03-22` > `date` < `2022-03-24` | todo | ~todok +*none* | *none* | todo | todom +*none* | `2022-03-13` >= `date` < `2022-04-08` | todo | ~todom +*none* | `2022-03-22` > `date` < `2022-03-24` | todo | ~todok Using `dates_filter_min(t_min)` & `dates_filter_max(t_max)`, we can easily filter by `date`. When filtering by `t_departure` (absolute departure date+time), `t_min` is the lower `t_departure` bound, whereas `t_max` is the upper bound. The VBB example above can be queried like this: @@ -363,46 +267,45 @@ Using `dates_filter_min(t_min)` & `dates_filter_max(t_max)`, we can easily filte SELECT * FROM arrivals_departures -- filter by absolute departure date+time -WHERE t_departure >= '2022-03-23T12:30+01' AND t_departure <= '2022-03-23T12:35+01' +WHERE t_departure >= '2022-03-23T12:30:00+01' AND t_departure <= '2022-03-23T12:35:00+01' -- allow "cutoffs" by filtering by date -AND "date" >= dates_filter_min('2022-03-23T12:30+01') -- evaluates to 2023-03-22 -AND "date" <= dates_filter_max('2022-03-23T12:35+01') -- evaluates to 2023-03-23 +AND "date" >= dates_filter_min('2022-03-23T12:30:00+01') -- evaluates to 2023-03-22 +AND "date" <= dates_filter_max('2022-03-23T12:35:00+01') -- evaluates to 2023-03-23 ``` ## Performance -With all use cases I could think of, `gtfs-via-postgres` is reasonably fast. If there's a particular kind of query that you think should be faster, please [open an Issue](https://github.com/public-transport/gtfs-via-postgres/issues/new)! +`gtfs-via-duckdb` is fast enough for most use cases I can think of. If there's a particular kind of query that you think should be faster, please [open an Issue](https://github.com/public-transport/gtfs-via-duckdb/issues/new)! -The following benchmarks were run with the [2022-07-01 VBB GTFS dataset](https://vbb-gtfs.jannisr.de/2022-07-01/) (41k `stops`, 6m `stop_times`, 207m arrivals/departures) using `gtfs-via-postgres@4.7.4` and PostgreSQL 14.7 on an [M2](https://en.wikipedia.org/wiki/Apple_M2) laptop running macOS 12.6.8; All measurements are in milliseconds. 
+The following benchmarks were run with the [2025-05-21 VBB GTFS dataset](https://vbb-gtfs.jannisr.de/2025-05-21/) (41k `stops`, 6m `stop_times`, 207m arrivals/departures) using `gtfs-via-duckdb@5.0.0` and DuckDB v1.3 on an [M2](https://en.wikipedia.org/wiki/Apple_M2) laptop running macOS 14.7.7; All measurements are in milliseconds. | query | avg | min | p25 | p50 | p75 | p95 | p99 | max | iterations | | - | - | - | - | - | - | - | - | - | - | -|
SELECT *
FROM stops
ORDER BY ST_Distance(stop_loc::geometry, ST_SetSRID(ST_MakePoint(9.7, 50.547), 4326)) ASC
LIMIT 100
| 15 | 14.982 | 15 | 15 | 15 | 15 | 15 | 15.488 | 100 | -|
SELECT *
FROM arrivals_departures
WHERE route_short_name = 'S1'
AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date >= dates_filter_min('2022-08-09T07:10+02')
AND date <= dates_filter_max('2022-08-09T07:30+02')
| 61 | 60.901 | 61 | 61 | 61 | 61 | 62 | 61.778 | 100 | -|
SELECT *
FROM arrivals_departures
WHERE station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin)
AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date >= dates_filter_min('2022-08-09T07:10+02')
AND date <= dates_filter_max('2022-08-09T07:30+02')
| 33 | 33.129 | 33 | 33 | 33 | 33 | 33 | 33.342 | 40 | -|
SELECT *
FROM arrivals_departures
WHERE station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin)
AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date >= dates_filter_min('2022-08-09T07:10+02')
AND date <= dates_filter_max('2022-08-09T07:30+02')
AND stop_sequence = 0
| 5 | 4.548 | 5 | 5 | 5 | 5 | 5 | 4.598 | 50 | -|
SELECT *
FROM arrivals_departures
WHERE stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)
AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date >= dates_filter_min('2022-08-09T07:10+02')
AND date <= dates_filter_max('2022-08-09T07:30+02')
| 8 | 8.038 | 8 | 8 | 8 | 8 | 8 | 8.164 | 100 | -|
SELECT *
FROM arrivals_departures
WHERE trip_id = '168977951'
AND date > '2022-08-08' AND date <= '2022-08-09'
| 2 | 1.878 | 2 | 2 | 2 | 2 | 2 | 1.911 | 100 | -|
SELECT count(*)
FROM arrivals_departures
WHERE stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)
| 58 | 57.485 | 58 | 58 | 58 | 58 | 58 | 57.789 | 100 | -|
SELECT count(*)
FROM arrivals_departures
WHERE stop_id = 'definitely-non-existent'
| 2 | 1.832 | 2 | 2 | 2 | 2 | 2 | 1.876 | 100 | -|
SELECT *
FROM arrivals_departures
WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date >= dates_filter_min('2022-08-09T07:10+02'::timestamp with time zone)
AND date <= dates_filter_max('2022-08-09T07:30+02'::timestamp with time zone)
| 6310 | 6238.819 | 6241 | 6262 | 6311 | 6503 | 6560 | 6573.768 | 10 | -|
SELECT *
FROM arrivals_departures
WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date >= '2022-08-08'
AND date <= '2022-08-09'
| 4931 | 4914.388 | 4925 | 4928 | 4937 | 4946 | 4948 | 4948.689 | 10 | -|
SELECT *
FROM connections
WHERE route_short_name = 'S1'
AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date >= dates_filter_min('2022-08-09T07:10+02')
AND date <= dates_filter_max('2022-08-09T07:30+02')
| 164 | 163.018 | 163 | 164 | 164 | 164 | 165 | 166.568 | 100 | -|
SELECT *
FROM connections
WHERE from_station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin)
AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date >= dates_filter_min('2022-08-09T07:10+02')
AND date <= dates_filter_max('2022-08-09T07:30+02')
| 59 | 58.137 | 58 | 58 | 59 | 60 | 61 | 61.461 | 40 | -|
SELECT *
FROM connections
WHERE from_station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin)
AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date >= dates_filter_min('2022-08-09T07:10+02')
AND date <= dates_filter_max('2022-08-09T07:30+02')
AND from_stop_sequence = 0
| 7 | 7.439 | 7 | 7 | 7 | 7 | 7 | 7.49 | 50 | -|
SELECT *
FROM connections
WHERE from_stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)
AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date >= dates_filter_min('2022-08-09T07:10+02')
AND date <= dates_filter_max('2022-08-09T07:30+02')
| 15 | 14.529 | 15 | 15 | 15 | 15 | 15 | 14.698 | 100 | -|
SELECT *
FROM connections
WHERE trip_id = '168977951'
AND date > '2022-08-08' AND date <= '2022-08-09'
| 3 | 2.86 | 3 | 3 | 3 | 3 | 3 | 2.931 | 100 | -|
SELECT count(*)
FROM connections
WHERE from_stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)
| 73 | 72.687 | 73 | 73 | 73 | 73 | 73 | 73.35 | 100 | -|
SELECT count(*)
FROM connections
WHERE from_stop_id = 'definitely-non-existent'
| 3 | 3.428 | 3 | 3 | 3 | 3 | 4 | 3.525 | 100 | -|
SELECT *
FROM connections
WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date >= dates_filter_min('2022-08-09T07:10+02'::timestamp with time zone)
AND date <= dates_filter_max('2022-08-09T07:30+02'::timestamp with time zone)
ORDER BY t_departure
LIMIT 100
| 13127 | 13056.841 | 13086 | 13125 | 13170 | 13194 | 13199 | 13200.027 | 7 | -|
SELECT *
FROM connections
WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date >= '2022-08-08'
AND date <= '2022-08-09'
ORDER BY t_departure
LIMIT 100
| 6417 | 6237.932 | 6346 | 6394 | 6512 | 6562 | 6570 | 6571.455 | 7 | -|
SELECT *
FROM stats_by_route_date
WHERE route_id = '17452_900' -- M4
AND date >= '2022-08-08' AND date <= '2022-08-14'
AND is_effective = true
| 2862 | 2853.972 | 2860 | 2863 | 2863 | 2867 | 2867 | 2866.798 | 10 | - +|
SELECT *
FROM stops
ORDER BY ST_Distance(stop_loc::geometry, ST_Point(9.7, 50.547)) ASC
LIMIT 100
| 6.35 | 5.91 | 5.98 | 6.25 | 6.6 | 6.86 | 8.41 | 10.05 | 1576 | +|
SELECT *
FROM arrivals_departures
WHERE route_short_name = 'S1'
AND t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02'
AND date >= dates_filter_min('2025-05-27T07:10:00+02'::timestamp with time zone)
AND date <= dates_filter_max('2025-05-27T07:30:00+02'::timestamp with time zone)
| 305.15 | 260.52 | 303.8 | 307.73 | 312.2 | 320.64 | 326.84 | 328.44 | 33 | +|
SELECT *
FROM arrivals_departures
WHERE station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin)
AND t_departure >= '2025-05-27T07:10+02' AND t_departure <= '2025-05-27T07:30+02'
AND date >= dates_filter_min('2025-05-27T07:10+02')
AND date <= dates_filter_max('2025-05-27T07:30+02')
| 129.43 | 119.85 | 126.19 | 128.62 | 131.84 | 138.44 | 140.46 | 142 | 78 | +|
SELECT *
FROM arrivals_departures
WHERE station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin)
AND t_departure >= '2025-05-27T07:10+02' AND t_departure <= '2025-05-27T07:30+02'
AND date >= dates_filter_min('2025-05-27T07:10+02')
AND date <= dates_filter_max('2025-05-27T07:30+02')
AND stop_sequence = 0
| 81.42 | 65.73 | 79.48 | 82.11 | 84.33 | 87.26 | 89.64 | 102.97 | 123 | +|
SELECT *
FROM arrivals_departures
WHERE stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)
AND t_departure >= '2025-05-27T07:10+02' AND t_departure <= '2025-05-27T07:30+02'
AND date >= dates_filter_min('2025-05-27T07:10+02')
AND date <= dates_filter_max('2025-05-27T07:30+02')
| 83.79 | 64.57 | 82.15 | 84.64 | 85.83 | 91.36 | 95.79 | 97.08 | 120 | +|
SELECT *
FROM arrivals_departures
WHERE trip_id = '262623609' -- route_id=10144_109, route_short_name=S2
AND date = '2025-05-27'
| 14.25 | 12.38 | 13.42 | 13.98 | 14.84 | 16.12 | 18.98 | 21.77 | 702 | +|
SELECT count(*)
FROM arrivals_departures
WHERE stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)
| 70.9 | 67.54 | 69.09 | 70.1 | 72.47 | 75.73 | 77.24 | 78.83 | 142 | +|
SELECT count(*)
FROM arrivals_departures
WHERE stop_id = 'definitely-non-existent'
| 23.61 | 20.31 | 21.97 | 22.67 | 24.84 | 27.51 | 30.78 | 40.43 | 424 | +|
SELECT *
FROM arrivals_departures
WHERE t_departure >= '2025-05-27T07:10+02' AND t_departure <= '2025-05-27T07:30+02'
AND date >= '2025-05-25'
AND date <= '2025-05-27'
| 1269.86 | 1139.03 | 1254.52 | 1272.09 | 1318.94 | 1329.66 | 1331.44 | 1331.89 | 8 | +|
SELECT *
FROM arrivals_departures
WHERE t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02'
AND "date" >= dates_filter_min('2025-05-27T07:10:00+02'::timestamp with time zone)
AND "date" <= dates_filter_max('2025-05-27T07:30:00+02'::timestamp with time zone)
| 34148.21 | 32101.25 | 33459.12 | 34816.99 | 35171.69 | 35455.44 | 35512.2 | 35526.38 | 3 | +|
SELECT *
FROM connections
WHERE route_short_name = 'S1'
AND t_departure >= '2025-05-27T07:10+02' AND t_departure <= '2025-05-27T07:30+02'
AND date >= dates_filter_min('2025-05-27T07:10+02')
AND date <= dates_filter_max('2025-05-27T07:30+02')
| 8697.84 | 8629.78 | 8673.26 | 8716.73 | 8731.86 | 8743.96 | 8746.39 | 8746.99 | 3 | +|
SELECT *
FROM connections
WHERE from_station_id = 'de:11000:900194006' -- S Schöneweide/Sterndamm (Berlin)
AND t_departure >= '2025-05-27T07:10+02' AND t_departure <= '2025-05-27T07:30+02'
AND date >= dates_filter_min('2025-05-27T07:10+02')
AND date <= dates_filter_max('2025-05-27T07:30+02')
| 1154.01 | 1070.8 | 1115.77 | 1156.47 | 1168.38 | 1243.5 | 1281.37 | 1290.84 | 9 | +|
SELECT *
FROM connections
WHERE from_station_id = 'de:11000:900194006' -- S Schöneweide/Sterndamm (Berlin)
AND t_departure >= '2025-05-27T07:10+02' AND t_departure <= '2025-05-27T07:30+02'
AND date >= dates_filter_min('2025-05-27T07:10+02')
AND date <= dates_filter_max('2025-05-27T07:30+02')
AND from_stop_sequence_consec = 0
| 482.23 | 454.29 | 466.55 | 467.45 | 475.64 | 555.32 | 571.05 | 574.98 | 21 | +|
SELECT *
FROM connections
WHERE from_stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)
AND t_departure >= '2025-05-27T07:10+02' AND t_departure <= '2025-05-27T07:30+02'
AND date >= dates_filter_min('2025-05-27T07:10+02')
AND date <= dates_filter_max('2025-05-27T07:30+02')
| 885.14 | 835.29 | 869.24 | 875.76 | 909.79 | 922.32 | 923.64 | 923.97 | 12 | +|
SELECT *
FROM connections
WHERE trip_id = '262535123' -- route_id=17452_900 (M4)
AND date >= '2025-05-26' AND date <= '2025-06-01'
| 19.31 | 15.83 | 18.02 | 18.99 | 20.27 | 22.76 | 24.78 | 27.96 | 519 | +|
SELECT count(*)
FROM connections
WHERE from_stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)
| 341.42 | 263.96 | 340.65 | 346.83 | 350.72 | 355.91 | 358.76 | 359.65 | 30 | +|
SELECT count(*)
FROM connections
WHERE from_stop_id = 'definitely-non-existent'
| 343.5 | 314.1 | 319.13 | 345.04 | 354.63 | 362.52 | 463.4 | 503.94 | 30 | +|
SELECT *
FROM connections
WHERE t_departure >= '2025-05-27T07:10+02' AND t_departure <= '2025-05-27T07:30+02'
AND date >= dates_filter_min('2025-05-27T07:10+02'::timestamp with time zone)
AND date <= dates_filter_max('2025-05-27T07:30+02'::timestamp with time zone)
ORDER BY t_departure
LIMIT 100
| 1013055.35 | 986377.24 | 1026394.41 | 1009900.4 | 1026394.41 | 992028.36 | 1042228.66 | 1042888.42 | 3 | +|
SELECT *
FROM connections
WHERE t_departure >= '2025-05-27T07:10+02' AND t_departure <= '2025-05-27T07:30+02'
AND date >= '2025-05-25' AND date <= '2025-05-27'
ORDER BY t_departure
LIMIT 100
| 16347.21 | 16250.36 | 16285.17 | 16319.98 | 16395.63 | 16456.16 | 16468.27 | 16471.29 | 3 | +|
SELECT *
FROM stats_by_route_date
WHERE route_id = '17452_900' -- M4
AND date >= '2025-05-26' AND date <= '2025-06-01'
AND is_effective = true
| 4765.59 | 4704.49 | 4706.87 | 4709.25 | 4796.14 | 4865.64 | 4879.54 | 4883.02 | 3 | ## Related Projects -There are some projects that are very similar to `gtfs-via-postgres`: +There are some projects that are very similar to `gtfs-via-duckdb`: ### Node-GTFS @@ -428,9 +331,9 @@ I don't use it because There are several forks of the [original outdated project](https://github.com/cbick/gtfs_SQL_importer); [fitnr's fork](https://github.com/fitnr/gtfs-sql-importer) seems to be the most recent one. -The project has a slightly different goal than `gtfs-via-postgres`: While `gtfs-sql-importer` is designed to import multiple versions of a GTFS dataset in an idempotent fashion, `gtfs-via-postgres` assumes that *one* (version of a) GTFS dataset is imported into *one* DB exactly once. +The project has a slightly different goal than `gtfs-via-duckdb`: While `gtfs-sql-importer` is designed to import multiple versions of a GTFS dataset in an idempotent fashion, `gtfs-via-duckdb` assumes that *one* (version of a) GTFS dataset is imported into *one* DB exactly once. -`gtfs-via-postgres` aims to provide more tools – e.g. the `arrivals_departures` & `connections` views – to help with the analysis of a GTFS dataset, whereas `gtfs-sql-importer` just imports the data. +`gtfs-via-duckdb` aims to provide more tools – e.g. the `arrivals_departures` & `connections` views – to help with the analysis of a GTFS dataset, whereas `gtfs-sql-importer` just imports the data. ### other related projects @@ -443,11 +346,12 @@ The project has a slightly different goal than `gtfs-via-postgres`: While `gtfs- - [gtfs-lib](https://github.com/conveyal/gtfs-lib) – Java library & CLI for importing GTFS files into a PostgreSQL database. - [gtfs-schema](https://github.com/tyleragreen/gtfs-schema) – PostgreSQL schemas for GTFS feeds. (plain SQL) - [markusvalo/HSLtraffic](https://github.com/markusvalo/HSLtraffic) – Scripts to create a PostgreSQL database for HSL GTFS-data. (plain SQL) +- [smohiudd/gtfs-parquet-duckdb-wasm](https://github.com/smohiudd/gtfs-parquet-duckdb-wasm) – Test visualization of GTFS data using DuckDB-Wasm ([blog post](http://saadiqm.com/gtfs-parquet-duckdb-wasm/)) ## License -This project is dual-licensed: **My ([@derhuerst](https://github.com/derhuerst)) contributions are licensed under the [*Prosperity Public License*](https://prosperitylicense.com), [contributions of other people](https://github.com/public-transport/gtfs-via-postgres/graphs/contributors) are licensed as [Apache 2.0](https://apache.org/licenses/LICENSE-2.0)**. +This project is dual-licensed: **My ([@derhuerst](https://github.com/derhuerst)) contributions are licensed under the [*Prosperity Public License*](https://prosperitylicense.com), [contributions of other people](https://github.com/public-transport/gtfs-via-duckdb/graphs/contributors) are licensed as [Apache 2.0](https://apache.org/licenses/LICENSE-2.0)**. > This license allows you to use and share this software for noncommercial purposes for free and to try this software for commercial purposes for thirty days. @@ -458,6 +362,6 @@ This project is dual-licensed: **My ([@derhuerst](https://github.com/derhuerst)) ## Contributing -If you have a question or need support using `gtfs-via-postgres`, please double-check your code and setup first. If you think you have found a bug or want to propose a feature, use [the issues page](https://github.com/public-transport/gtfs-via-postgres/issues). 
+If you have a question or need support using `gtfs-via-duckdb`, please double-check your code and setup first. If you think you have found a bug or want to propose a feature, use [the issues page](https://github.com/public-transport/gtfs-via-duckdb/issues). By contributing, you agree to release your modifications under the [Apache 2.0 license](LICENSE-APACHE). diff --git a/scripts/run-postgraphile.js b/scripts/run-postgraphile.js deleted file mode 100755 index bf74ee8..0000000 --- a/scripts/run-postgraphile.js +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env node - -const {createServer} = require('http') -const {postgraphile} = require('postgraphile') -const postgisPlugin = require('@graphile/postgis').default -const simplifyInflectorPlugin = require('@graphile-contrib/pg-simplify-inflector') - -const DEV = process.env.NODE_ENV === 'development' -const PROD = !DEV -const PORT = process.env.PORT ? parseInt(process.env.PORT) : 3000 -const SCHEMA = process.env.PGSCHEMA || 'public' - -const pg = postgraphile({}, SCHEMA, { - appendPlugins: [ - // PostGIS support for PostGraphile - postgisPlugin, - - // Simplifies the graphile-build-pg inflector to trim the `ByFooIdAndBarId` from relations - simplifyInflectorPlugin, - ], - graphileBuildOptions: { - pgSimplifyAllRows: false, - pgShortPk: false, - }, - - pgSettings: async () => ({ - // With `timestamptz` (a.k.a. `timestamp with time zone`), PostgreSQL *doesn't* store the timezone (offset) specified on input; Instead, it always converts to UTC. - // When querying a `timestamptz` value, it converts to the local timezone (offset) of the client's session or database server. - // Because we loose the timezone offset information *anyways*, we configure PostGraphile to give predictable results by letting PostgreSQL always convert to UTC. - timezone: 'UTC', - }), - - // [Experimental] Determines if the 'Explain' feature in GraphiQL can be used to show the user the SQL statements that were executed. Set to a boolean to enable all users to use this, or to a function that filters each request to determine if the request may use Explain. DO NOT USE IN PRODUCTION unless you're comfortable with the security repurcussions of doing so. - allowExplain: DEV, - - // Enables classic ids for Relay support. Instead of using the field name nodeId for globally unique ids, PostGraphile will instead use the field name id for its globally unique ids. This means that table id columns will also get renamed to rowId. - classicIds: true, - - // Turns off GraphQL query logging. By default PostGraphile will log every GraphQL query it processes along with some other information. Set this to true (recommended in production) to disable that feature. - disableQueryLog: PROD, - - // By default, JSON and JSONB fields are presented as strings (JSON encoded) from the GraphQL schema. Setting this to true (recommended) enables raw JSON input and output, saving the need to parse / stringify JSON manually. - dynamicJson: true, - - // Set this to true to add some enhancements to GraphiQL; intended for development usage only (automatically enables with subscriptions and live). - enhanceGraphiql: DEV, - - // Set this to true to enable the GraphiQL interface. - graphiql: true, - - // Extends the error response with additional details from the Postgres error. Can be any combination of ['hint', 'detail', 'errcode']. Default is []. - extendedErrors: DEV ? 
['hint', 'detail', 'errcode'] : [], - - // Set false to exclude filters, orderBy, and relations that would be expensive to access due to missing indexes. Changing this from true to false is a breaking change, but false to true is not. The default is true. - ignoreIndexes: false, - - // Set false (recommended) to exclude fields, queries and mutations that are not available to any possible user (determined from the user in connection string and any role they can become); set this option true to skip these checks and create GraphQL fields and types for everything. The default is true, in v5 the default will change to false. - ignoreRBAC: false, - - // Some one-to-one relations were previously detected as one-to-many - should we export 'only' the old relation shapes, both new and old but mark the old ones as 'deprecated' (default), or 'omit' (recommended) the old relation shapes entirely. - legacyRelations: 'omit', - - // If none of your RETURNS SETOF compound_type functions mix NULLs with the results then you may set this false to reduce the nullables in the GraphQL schema. - setofFunctionsContainNulls: false, - - // Enables adding a stack field to the error response. Can be either the boolean true (which results in a single stack string) or the string json (which causes the stack to become an array with elements for each line of the stack). Recommended in development, not recommended in production. - showErrorStack: DEV, - - // Should we use relay pagination, or simple collections? - // "omit" (default) - // relay connections only, "only" (not recommended) - // simple collections only (no Relay connections), "both" - both. - simpleCollections: 'omit', -}) - -const server = createServer(pg) -server.listen(PORT, (err) => { - if (err) { - console.error(err) - process.exit(1) - } - const {port} = server.address() - console.info(`PostGraphile listening on port ${port}`) -}) diff --git a/test/amtrak-gtfs-2021-10-06.sh b/test/amtrak-gtfs-2021-10-06.sh index 0adcfc7..83a3f7f 100755 --- a/test/amtrak-gtfs-2021-10-06.sh +++ b/test/amtrak-gtfs-2021-10-06.sh @@ -11,21 +11,19 @@ env | grep '^PG' || true unzip -q -j -n amtrak-gtfs-2021-10-06.zip -d amtrak-gtfs-2021-10-06 ls -lh amtrak-gtfs-2021-10-06 -psql -c 'create database amtrak_2021_10_06' -export PGDATABASE='amtrak_2021_10_06' +path_to_db="$(mktemp -d -t gtfs)/amtrak-gtfs-2021-10-06.duckdb" -../cli.js -d --trips-without-shape-id --schema amtrak \ +../cli.js -d --trips-without-shape-id \ --import-metadata \ --stats-by-route-date=view \ --stats-by-agency-route-stop-hour=view \ --stats-active-trips-by-hour=view \ - --postgrest \ - -- amtrak-gtfs-2021-10-06/*.txt \ - | sponge | psql -b + "$path_to_db" \ + -- amtrak-gtfs-2021-10-06/*.txt query=$(cat << EOF select extract(epoch from t_arrival)::integer as t_arrival -from amtrak.arrivals_departures +from arrivals_departures where stop_id = 'BHM' -- Birmingham and date = '2021-11-26' order by t_arrival @@ -33,26 +31,20 @@ EOF ) # 2021-11-26T15:15:00-05:00 -arr1=$(psql --csv -t -c "$query" | head -n 1) +arr1=$(duckdb -csv -noheader -c "$query" "$path_to_db" | head -n 1) if [[ "$arr1" != "1637957700" ]]; then echo "invalid 1st t_arrival: $arr1" 1>&2 exit 1 fi # 2021-11-27T13:45:00-05:00 -arrN=$(psql --csv -t -c "$query" | tail -n 1) +arrN=$(duckdb -csv -noheader -c "$query" "$path_to_db" | tail -n 1) if [[ "$arrN" != "1638038700" ]]; then echo "invalid 2nd t_arrival: $arrN" 1>&2 exit 1 fi -version=$(psql --csv -t -c "SELECT split_part(amtrak.gtfs_via_postgres_version(), '.', 1)" | tail -n 1) -if [[ "$version" 
!= "4" ]]; then - echo "invalid gtfs_via_postgres_version(): $version" 1>&2 - exit 1 -fi - -fMin=$(psql --csv -t -c "SELECT amtrak.dates_filter_min('2021-11-27T13:45:00-06')" | tail -n 1) +fMin=$(duckdb -csv -noheader -c "SELECT dates_filter_min('2021-11-27T13:45:00-06')" "$path_to_db" | tail -n 1) if [[ "$fMin" != "2021-11-24" ]]; then echo "invalid dates_filter_min(…): $fMin" 1>&2 exit 1 @@ -60,13 +52,13 @@ fi acelaStatQuery=$(cat << EOF SELECT nr_of_trips, nr_of_arrs_deps -FROM amtrak.stats_by_route_date +FROM stats_by_route_date WHERE route_id = '40751' -- Acela AND date = '2021-11-26' AND is_effective = True EOF ) -acelaStat=$(psql --csv -t -c "$acelaStatQuery" | tail -n 1) +acelaStat=$(duckdb -csv -noheader -c "$acelaStatQuery" "$path_to_db" | tail -n 1) if [[ "$acelaStat" != "16,190" ]]; then echo "invalid stats for route 40751 (Acela) on 2021-11-26: $acelaStat" 1>&2 exit 1 @@ -74,13 +66,13 @@ fi acelaPhillyStatQuery=$(cat << EOF SELECT nr_of_arrs -FROM amtrak.stats_by_agency_route_stop_hour +FROM stats_by_agency_route_stop_hour WHERE route_id = '40751' -- Acela AND stop_id = 'PHL' -- Philadelphia -AND effective_hour = '2022-07-24T09:00-05' +AND effective_hour = '2022-07-24 09:00:00-05:00' EOF ) -acelaPhillyStat=$(psql --csv -t -c "$acelaPhillyStatQuery" | tail -n 1) +acelaPhillyStat=$(duckdb -csv -noheader -c "$acelaPhillyStatQuery" "$path_to_db" | tail -n 1) if [[ "$acelaPhillyStat" != "2" ]]; then echo "invalid stats for route 40751 (Acela) at PHL (Philadelphia) on 2021-11-26: $acelaPhillyStat" 1>&2 exit 1 @@ -88,54 +80,20 @@ fi nrOfActiveTripsQuery=$(cat << EOF SELECT nr_of_active_trips -FROM amtrak.stats_active_trips_by_hour -WHERE "hour" = '2021-11-26T04:00-05' +FROM stats_active_trips_by_hour +WHERE "hour" = '2021-11-26 04:00:00-05:00' EOF ) # Note: I'm not sure if 127 is correct, but it is in the right ballpark. 🙈 # The following query yields 150 connections, and it doesn't contain those who depart earlier and arrive later. # SELECT DISTINCT ON (trip_id) * # FROM amtrak.connections -# WHERE t_departure >= '2021-11-26T02:00-05' -# AND t_arrival <= '2021-11-26T06:00-05' -nrOfActiveTrips=$(psql --csv -t -c "$nrOfActiveTripsQuery" | tail -n 1) +# WHERE t_departure >= '2021-11-26 02:00:00-05:00' +# AND t_arrival <= '2021-11-26 06:00:00-05:00' +nrOfActiveTrips=$(duckdb -csv -noheader -c "$nrOfActiveTripsQuery" "$path_to_db" | tail -n 1) if [[ "$nrOfActiveTrips" != "127" ]]; then echo "unexpected no. 
of active trips at 2021-11-26T04:00-05: $nrOfActiveTrips" 1>&2 exit 1 fi -# kill child processes on exit -# https://stackoverflow.com/questions/360201/how-do-i-kill-background-processes-jobs-when-my-shell-script-exits/2173421#2173421 -trap 'exit_code=$?; kill -- $(jobs -p); exit $exit_code' SIGINT SIGTERM EXIT - -env \ - PGRST_DB_SCHEMAS=amtrak \ - PGRST_DB_ANON_ROLE=web_anon \ - PGRST_ADMIN_SERVER_PORT=3001 \ - PGRST_LOG_LEVEL=info \ - postgrest & - # docker run --rm -i \ - # -p 3000:3000 -p 3001:3001 \ - # -e PGHOST=host.docker.internal -e PGUSER -e PGPASSWORD -e PGDATABASE \ - # postgrest/postgrest & -sleep 3 - -health_status="$(curl 'http://localhost:3001/live' -I -fsS | grep -o -m1 -E '[0-9]{3}')" -if [ "$health_status" != '200' ]; then - 1>&2 echo "/live: expected 200, got $health_status" - exit 1 -fi - -stops_url='http://localhost:3000/stops?stop_name=ilike.%25palm%25&limit=1&order=stop_id.asc' -stops_status="$(curl "$stops_url" -H 'Accept: application/json' -I -fsS | grep -o -m1 -E '[0-9]{3}')" -if [ "$stops_status" != '200' ]; then - 1>&2 echo "$stops_url: expected 200, got $stops_status" - exit 1 -fi -stop_id="$(curl "$stops_url" -H 'Accept: application/json' -fsS | jq -rc '.[0].stop_id')" -if [ "$stop_id" != 'PDC' ]; then - 1>&2 echo "$stops_url: expected PDC, got $stop_id" - exit 1 -fi - echo 'works ✔' diff --git a/test/calendar-dates-only.sh b/test/calendar-dates-only.sh index 1014ad1..e11dc4f 100755 --- a/test/calendar-dates-only.sh +++ b/test/calendar-dates-only.sh @@ -8,12 +8,11 @@ set -x env | grep '^PG' || true -psql -c 'create database calendar_dates_only' -export PGDATABASE='calendar_dates_only' +path_to_db="$(mktemp -d -t gtfs)/calendar-dates-only.duckdb" ../cli.js -d --trips-without-shape-id -- \ - calendar-dates-only/*.txt \ - | sponge | psql -b + "$path_to_db" \ + calendar-dates-only/*.txt query=$(cat << EOF select extract(epoch from t_arrival)::integer as t_arrival @@ -24,14 +23,14 @@ EOF ) # 2019-07-15T15:30:00+02:00 -arr1=$(psql --csv -t -c "$query" | head -n 1) +arr1=$(duckdb -csv -noheader -c "$query" "$path_to_db" | head -n 1) if [[ "$arr1" != "1563197400" ]]; then echo "invalid 1st t_arrival: $arr1" 1>&2 exit 1 fi # 2019-07-20T15:30:00+02:00 -arrN=$(psql --csv -t -c "$query" | tail -n 1) +arrN=$(duckdb -csv -noheader -c "$query" "$path_to_db" | tail -n 1) if [[ "$arrN" != "1563629400" ]]; then echo "invalid 2nd t_arrival: $arrN" 1>&2 exit 1 @@ -43,7 +42,7 @@ from arrivals_departures where agency_id IS NULL EOF ) -agency_id_null_count="$(psql --csv -t -c "$agency_id_null")" +agency_id_null_count="$(duckdb -csv -noheader -c "$agency_id_null" "$path_to_db")" if [[ "$agency_id_null_count" != "0" ]]; then echo ">0 rows with agency_id = null" 1>&2 exit 1 @@ -57,7 +56,7 @@ FROM arrivals_departures ORDER BY stop_id, trip_id EOF ) -wheelchair_boarding_rows="$(psql --csv -t -c "$wheelchair_boarding_query")" +wheelchair_boarding_rows="$(duckdb -csv -noheader -c "$wheelchair_boarding_query" "$path_to_db")" wheelchair_boarding_expected="$(echo -e "airport,accessible\nairport-1,not_accessible\nlake,no_info_or_inherit\nmuseum,no_info_or_inherit")" if [[ "$wheelchair_boarding_rows" != "$wheelchair_boarding_expected" ]]; then echo "invalid wheelchair_boarding values" 1>&2 diff --git a/test/index.sh b/test/index.sh index a40fa08..d8512f0 100755 --- a/test/index.sh +++ b/test/index.sh @@ -6,15 +6,14 @@ set -o pipefail cd "$(dirname $0)" set -x -psql -t -c 'SELECT version()' +duckdb --version ./calendar-dates-only.sh ./sample-gtfs-feed.sh ./amtrak-gtfs-2021-10-06.sh 
-./postgraphile.sh ./routes-without-agency-id.sh ./stops-without-level-id.sh ./invalid-empty-agency-id.sh -./multiple-schemas.sh +./multiple-datasets.sh echo -e "\n\n✔︎ tests passing" diff --git a/test/invalid-empty-agency-id.sh b/test/invalid-empty-agency-id.sh index c4dbba8..9245389 100755 --- a/test/invalid-empty-agency-id.sh +++ b/test/invalid-empty-agency-id.sh @@ -9,8 +9,9 @@ set -x # Refer to https://github.com/public-transport/gtfs-via-postgres/issues/45 for context. # The "core" bug: A feed without routes.agency_id should be importable. -if ../cli.js -d --trips-without-shape-id -s -- \ - invalid-empty-agency-id/*.txt >/dev/null; then +# However, this only applies if there is exactly one route. If there are >1 routes, every route must have an agency_id. +if ../cli.js -d --trips-without-shape-id -s -- ':memory:' \ + invalid-empty-agency-id/*.txt; then echo "import didn't fail" 1>&2 exit 1 else @@ -20,6 +21,6 @@ fi # A related bug: With --routes-without-agency-id, lib/deps.js *does not* specify routes to depend on agency. # *In some cases*, this causes agency to be processed *after* routes, causing the routes processing to fail. # see also https://github.com/public-transport/gtfs-via-postgres/issues/45#issuecomment-1632649826 -../cli.js -d --routes-without-agency-id --trips-without-shape-id -s -- \ - invalid-empty-agency-id/*.txt >/dev/null +../cli.js -d --routes-without-agency-id --trips-without-shape-id -s -- ':memory:' \ + invalid-empty-agency-id/*.txt echo 'did not fail even with --routes-without-agency-id ✔' diff --git a/test/multiple-datasets.sh b/test/multiple-datasets.sh new file mode 100755 index 0000000..fd17846 --- /dev/null +++ b/test/multiple-datasets.sh @@ -0,0 +1,138 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail +cd "$(dirname $0)" +set -x + +env | grep '^PG' || true + +unzip -q -j -n amtrak-gtfs-2021-10-06.zip -d amtrak-gtfs-2021-10-06 +ls -lh amtrak-gtfs-2021-10-06 + +db_dir="$(mktemp -d -t gtfs)" +path_to_db1="$db_dir/multiple-schemas-1.duckdb" +path_to_db2="$db_dir/multiple-schemas-2.duckdb" + +shopt -s extglob + +../cli.js -d --trips-without-shape-id \ + "$path_to_db1" \ + -- amtrak-gtfs-2021-10-06/!(transfers).txt + +../cli.js -d --trips-without-shape-id \ + "$path_to_db2" \ + -- amtrak-gtfs-2021-10-06/*.txt + +shopt -u extglob + +query_prefix=$(cat << EOF +ATTACH DATABASE '$path_to_db1' AS one (READ_ONLY); +ATTACH DATABASE '$path_to_db2' AS two (READ_ONLY); +SET search_path = 'one,two'; +EOF +) + +tables_query=$(cat << EOF +$query_prefix +SELECT + (table_catalog || '.' 
|| table_name) AS table_name +FROM information_schema.tables +WHERE table_schema = 'main' +AND table_catalog = ANY(['one', 'two']) +ORDER BY table_catalog, table_name; +EOF +) +tables_rows=$(duckdb -csv -noheader -c "$tables_query") +# note that one.transfers is missing +tables_expected=$(cat << EOF +one.agency +one.arrivals_departures +one.calendar +one.calendar_dates +one.connections +one.feed_info +one.frequencies +one.largest_arr_dep_time +one.routes +one.service_days +one.stop_times +one.stops +one.trips +one.valid_lang_codes +one.valid_timezones +two.agency +two.arrivals_departures +two.calendar +two.calendar_dates +two.connections +two.feed_info +two.frequencies +two.largest_arr_dep_time +two.routes +two.service_days +two.stop_times +two.stops +two.transfers +two.trips +two.valid_lang_codes +two.valid_timezones +EOF +) +if [[ "$tables_rows" != "$tables_expected" ]]; then + echo "unexpected list of tables" 1>&2 + exit 1 +fi + +# https://dba.stackexchange.com/a/72656 +nr_of_unequal_stops=$(cat << EOF +$query_prefix +SELECT count(*) +FROM one.stops a +FULL OUTER JOIN two.stops b ON ( + a.stop_id = b.stop_id +) +WHERE ( + a.stop_code IS DISTINCT FROM b.stop_code + OR a.stop_name IS DISTINCT FROM b.stop_name + OR a.stop_desc IS DISTINCT FROM b.stop_desc + OR a.stop_loc IS DISTINCT FROM b.stop_loc + OR a.zone_id IS DISTINCT FROM b.zone_id + OR a.stop_url IS DISTINCT FROM b.stop_url + OR a.location_type::TEXT IS DISTINCT FROM b.location_type::TEXT + OR a.parent_station IS DISTINCT FROM b.parent_station + OR a.stop_timezone IS DISTINCT FROM b.stop_timezone + OR a.wheelchair_boarding::TEXT IS DISTINCT FROM b.wheelchair_boarding::TEXT + OR a.level_id IS DISTINCT FROM b.level_id + OR a.platform_code IS DISTINCT FROM b.platform_code +) +EOF +) + +unequal_stops_1=$(duckdb -csv -noheader -c "$nr_of_unequal_stops" | head -n 1) +if [[ "$unequal_stops_1" -ne 0 ]]; then + 1>&2 echo "$unequal_stops_1 unequal stops between one.stops & two.stops" + exit 1 +fi + +# # put an incompatible version +# duckdb -c "$(cat << EOF +# CREATE OR REPLACE FUNCTION public.gtfs_via_duckdb_import_version() +# RETURNS TEXT +# AS \$\$ +# SELECT '0.1.2' +# \$\$ +# LANGUAGE SQL +# EOF +# )" + +# # expect another import to fail +# if ../cli.js -d --trips-without-shape-id \ +# "$path_to_db" \ +# -- amtrak-gtfs-2021-10-06/*.txt; then +# 1>&2 echo "re-import with incompatible version didn't fail" +# exit 1 +# fi + +echo 'works ✔' diff --git a/test/multiple-schemas.sh b/test/multiple-schemas.sh deleted file mode 100755 index 7ddabbd..0000000 --- a/test/multiple-schemas.sh +++ /dev/null @@ -1,79 +0,0 @@ -#!/bin/bash - -set -e -set -u -set -o pipefail -cd "$(dirname $0)" -set -x - -env | grep '^PG' || true - -unzip -q -j -n amtrak-gtfs-2021-10-06.zip -d amtrak-gtfs-2021-10-06 -ls -lh amtrak-gtfs-2021-10-06 - -psql -c 'create database multiple_schemas' -export PGDATABASE='multiple_schemas' - -../cli.js -d --trips-without-shape-id \ - --schema one \ - -- amtrak-gtfs-2021-10-06/*.txt \ - | sponge | psql -b - -../cli.js -d --trips-without-shape-id \ - --schema two \ - -- amtrak-gtfs-2021-10-06/*.txt \ - | sponge | psql -b - -# https://dba.stackexchange.com/a/72656 -nr_of_unequal_stops=$(cat << EOF -SELECT count(*) -FROM one.stops a -FULL OUTER JOIN two.stops b ON ( - a.stop_id = b.stop_id -) -WHERE ( - a.stop_code IS DISTINCT FROM b.stop_code - OR a.stop_name IS DISTINCT FROM b.stop_name - OR a.stop_desc IS DISTINCT FROM b.stop_desc - OR a.stop_loc IS DISTINCT FROM b.stop_loc - OR a.zone_id IS DISTINCT FROM b.zone_id - OR a.stop_url IS 
DISTINCT FROM b.stop_url - OR a.location_type::TEXT IS DISTINCT FROM b.location_type::TEXT - OR a.parent_station IS DISTINCT FROM b.parent_station - OR a.stop_timezone IS DISTINCT FROM b.stop_timezone - OR a.wheelchair_boarding::TEXT IS DISTINCT FROM b.wheelchair_boarding::TEXT - OR a.level_id IS DISTINCT FROM b.level_id - OR a.platform_code IS DISTINCT FROM b.platform_code -) -EOF -) - -unequal_stops_1=$(psql --csv -t -c "$nr_of_unequal_stops" | head -n 1) -if [[ "$unequal_stops_1" -ne 0 ]]; then - 1>&2 echo "$unequal_stops_1 unequal stops between one.stops & two.stops" - exit 1 -fi - -# todo: assert that more tables are equal? - -# put an incompatible version -psql -c "$(cat << EOF -CREATE OR REPLACE FUNCTION public.gtfs_via_postgres_import_version() -RETURNS TEXT -AS \$\$ - SELECT '0.1.2' -\$\$ -LANGUAGE SQL -EOF -)" - -# expect another import to fail -if ../cli.js -d --trips-without-shape-id \ - --schema three \ - -- amtrak-gtfs-2021-10-06/*.txt \ - | sponge | psql -b; then - 1>&2 echo "re-import with incompatible version didn't fail" - exit 1 -fi - -echo 'works ✔' diff --git a/test/postgraphile.sh b/test/postgraphile.sh deleted file mode 100755 index f06bce1..0000000 --- a/test/postgraphile.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -set -e -set -u -set -o pipefail -cd "$(dirname $0)" -set -x - -env | grep '^PG' || true - -psql -c 'create database postgraphile' -export PGDATABASE='postgraphile' - -../cli.js -d --trips-without-shape-id --postgraphile -- \ - ../node_modules/sample-gtfs-feed/gtfs/*.txt \ - | sponge | psql -b - -# kill child processes on exit -# https://stackoverflow.com/questions/360201/how-do-i-kill-background-processes-jobs-when-my-shell-script-exits/2173421#2173421 -trap 'exit_code=$?; kill -- $(jobs -p); exit $exit_code' SIGINT SIGTERM EXIT - -../scripts/run-postgraphile.js & -sleep 2 - -body=$(node -e 'process.stdout.write(JSON.stringify({query: fs.readFileSync("sample-gtfs-feed-postgraphile-test.graphql", {encoding: "utf8"})}))') -actual_path="$(mktemp -t sample-gtfs-feed-postgraphile-test-XXX)" -curl -X POST 'http://localhost:3000/graphql' -H 'Content-Type: application/json' -H 'Accept: application/json' --data "$body" -fsS | jq -r --tab . 
>"$actual_path" - -git diff --exit-code sample-gtfs-feed-postgraphile-test.res.json "$actual_path" - -echo 'works ✔' diff --git a/test/routes-without-agency-id.sh b/test/routes-without-agency-id.sh index 51e1530..c63e296 100755 --- a/test/routes-without-agency-id.sh +++ b/test/routes-without-agency-id.sh @@ -7,7 +7,7 @@ cd "$(dirname $0)" set -x ../cli.js -d --routes-without-agency-id -- \ - ../node_modules/sample-gtfs-feed/gtfs/*.txt \ - >/dev/null + ':memory:' \ + ../node_modules/sample-gtfs-feed/gtfs/*.txt echo 'works ✔' diff --git a/test/sample-gtfs-feed-postgraphile-test.graphql b/test/sample-gtfs-feed-postgraphile-test.graphql deleted file mode 100644 index bccbf0d..0000000 --- a/test/sample-gtfs-feed-postgraphile-test.graphql +++ /dev/null @@ -1,115 +0,0 @@ -query SampleGtfsFeedPostgraphileTest { - stopByStopId(stopId: "airport-1") { - translatedStopName(language: "de-DE") - stopId - stopLoc { - geojson - latitude - longitude - } - } - - routeByRouteId(routeId: "B") { - agency { - agencyName - agencyEmail - } - routeShortName - routeLongName - at: translatedRouteLongName(language: "de-AT") - de: translatedRouteLongName(language: "de-DE") - trips(first: 3, orderBy: TRIP_ID_ASC) { - nodes { - tripId - } - } - connections(orderBy: PRIMARY_KEY_ASC, offset: 3, first: 3) { - nodes { - tripId - fromStopSequence - fromStopId - tDeparture - tArrival - toStopId - toStopSequence - } - } - } - bOutboundOnWeekends: tripByTripId(tripId: "b-outbound-on-weekends") { - translatedTripHeadsign(language: "de-DE") - } - aDowntownAllDay: tripByTripId(tripId: "a-downtown-all-day") { - shape { - shape { - __typename - geojson - } - } - } - - # stop_times-based -> no frequencies_{row,it} - aOutboundAllDay20190301ArrDep: arrivalDepartureByArrivalDepartureId( - id: "YS1vdXRib3VuZC1hbGwtZGF5:MjAxOS0wMy0wMQ==:Mw==:LTE=:LTE=" - ) { - # tripId: "a-outbound-all-day" - # date: "2019-03-01" - # stopSequence: 3 - # frequenciesRow: -1 - # frequenciesIt: -1 - arrivalDepartureId - tripId - date - stopSequence - frequenciesRow - frequenciesIt - } - # frequencies-based -> has frequencies_{row,it} - bDowntownOnWorkingDays20190608ArrDep: arrivalDepartureByArrivalDepartureId( - id: "Yi1kb3dudG93bi1vbi13b3JraW5nLWRheXM=:MjAxOS0wMy0wOA==:Mw==:MQ==:Mg==" - ) { - # tripId: "b-downtown-on-working-days" - # date: "2019-06-08" - # stopSequence: 3 - # frequenciesRow: 1 - # frequenciesIt: 2 - arrivalDepartureId - tripId - date - stopSequence - frequenciesRow - frequenciesIt - } - - # stop_times-based -> no frequencies_{row,it} - aOutboundAllDay20190301Con: connectionByConnectionId( - id: "YS1vdXRib3VuZC1hbGwtZGF5:MjAxOS0wMy0wMQ==:Mw==:LTE=:LTE=" - ) { - # tripId: "a-outbound-all-day" - # date: "2019-03-01" - # fromStopSequence: 3 - # frequenciesRow: -1 - # frequenciesIt: -1 - connectionId - tripId - date - fromStopSequence - frequenciesRow - frequenciesIt - } - # frequencies-based -> has frequencies_{row,it} - bDowntownOnWorkingDays20190608Con: connectionByConnectionId( - id: "Yi1kb3dudG93bi1vbi13b3JraW5nLWRheXM=:MjAxOS0wMy0wOA==:Mw==:MQ==:Mg==" - ) { - # tripId: "b-downtown-on-working-days" - # date: "2019-06-08" - # fromStopSequence: 3 - # frequenciesRow: 1 - # frequenciesIt: 2 - connectionId - tripId - date - fromStopSequence - frequenciesRow - frequenciesIt - } -} diff --git a/test/sample-gtfs-feed-postgraphile-test.res.json b/test/sample-gtfs-feed-postgraphile-test.res.json deleted file mode 100644 index 76359a1..0000000 --- a/test/sample-gtfs-feed-postgraphile-test.res.json +++ /dev/null @@ -1,2316 +0,0 @@ -{ - "data": { - 
"stopByStopId": { - "translatedStopName": "Gleis 1", - "stopId": "airport-1", - "stopLoc": { - "geojson": { - "type": "Point", - "coordinates": [ - 13.5087, - 52.36396 - ] - }, - "latitude": 52.36396, - "longitude": 13.5087 - } - }, - "routeByRouteId": { - "agency": { - "agencyName": "Full Transit Agency", - "agencyEmail": "contact@fta.example.org" - }, - "routeShortName": "Babbage", - "routeLongName": "Charles Babbage Tram Line", - "at": "Tram-Linie Charles Babbage", - "de": null, - "trips": { - "nodes": [ - { - "tripId": "b-downtown-on-weekends" - }, - { - "tripId": "b-downtown-on-working-days" - }, - { - "tripId": "b-outbound-on-weekends" - } - ] - }, - "connections": { - "nodes": [ - { - "tripId": "b-downtown-on-weekends", - "fromStopSequence": 3, - "fromStopId": "lake", - "tDeparture": "2019-03-09T12:24:00+00:00", - "tArrival": "2019-03-09T12:30:00+00:00", - "toStopId": "center", - "toStopSequence": 5 - }, - { - "tripId": "b-downtown-on-weekends", - "fromStopSequence": 1, - "fromStopId": "airport", - "tDeparture": "2019-03-10T12:14:00+00:00", - "tArrival": "2019-03-10T12:22:00+00:00", - "toStopId": "lake", - "toStopSequence": 3 - }, - { - "tripId": "b-downtown-on-weekends", - "fromStopSequence": 3, - "fromStopId": "lake", - "tDeparture": "2019-03-10T12:24:00+00:00", - "tArrival": "2019-03-10T12:30:00+00:00", - "toStopId": "center", - "toStopSequence": 5 - } - ] - } - }, - "bOutboundOnWeekends": { - "translatedTripHeadsign": "Babbage (auswärts)" - }, - "aDowntownAllDay": { - "shape": { - "shape": { - "__typename": "GeometryLineString", - "geojson": { - "type": "LineString", - "coordinates": [ - [ - 13.510294914, - 52.364833832 - ], - [ - 13.510567665, - 52.364398956 - ], - [ - 13.510860443, - 52.363952637 - ], - [ - 13.511548042, - 52.362854004 - ], - [ - 13.511612892, - 52.362743378 - ], - [ - 13.511850357, - 52.362812042 - ], - [ - 13.513009071, - 52.363082886 - ], - [ - 13.513717651, - 52.363246918 - ], - [ - 13.514398575, - 52.363361359 - ], - [ - 13.516216278, - 52.363788605 - ], - [ - 13.516494751, - 52.363868713 - ], - [ - 13.516823769, - 52.364009857 - ], - [ - 13.516993523, - 52.364112854 - ], - [ - 13.517116547, - 52.364208221 - ], - [ - 13.517197609, - 52.364322662 - ], - [ - 13.517261505, - 52.364448547 - ], - [ - 13.517277718, - 52.364532471 - ], - [ - 13.517285347, - 52.364704132 - ], - [ - 13.517237663, - 52.365009308 - ], - [ - 13.517251968, - 52.365158081 - ], - [ - 13.517328262, - 52.365364075 - ], - [ - 13.517384529, - 52.365451813 - ], - [ - 13.517477036, - 52.365539551 - ], - [ - 13.517616272, - 52.365650177 - ], - [ - 13.517773628, - 52.365726471 - ], - [ - 13.518079758, - 52.365856171 - ], - [ - 13.518387794, - 52.365940094 - ], - [ - 13.528774261, - 52.368408203 - ], - [ - 13.529670715, - 52.368545532 - ], - [ - 13.530094147, - 52.368579865 - ], - [ - 13.5308218, - 52.368587494 - ], - [ - 13.531106949, - 52.368598938 - ], - [ - 13.531417847, - 52.368621826 - ], - [ - 13.531955719, - 52.36869812 - ], - [ - 13.532168388, - 52.368759155 - ], - [ - 13.532450676, - 52.368862152 - ], - [ - 13.53266716, - 52.368961334 - ], - [ - 13.532931328, - 52.369121552 - ], - [ - 13.533116341, - 52.369255066 - ], - [ - 13.533249855, - 52.36938858 - ], - [ - 13.533371925, - 52.369533539 - ], - [ - 13.533464432, - 52.369682312 - ], - [ - 13.533542633, - 52.369838715 - ], - [ - 13.533593178, - 52.370014191 - ], - [ - 13.533617973, - 52.370185852 - ], - [ - 13.533589363, - 52.370334625 - ], - [ - 13.533475876, - 52.370624542 - ], - [ - 13.533353806, - 52.370826721 - ], - [ - 
13.533203125, - 52.371002197 - ], - [ - 13.532802582, - 52.371387482 - ], - [ - 13.532670021, - 52.37153244 - ], - [ - 13.532507896, - 52.371768951 - ], - [ - 13.532444, - 52.371963501 - ], - [ - 13.5324049, - 52.372131348 - ], - [ - 13.53239727, - 52.37229538 - ], - [ - 13.532422066, - 52.372528076 - ], - [ - 13.532460213, - 52.372646332 - ], - [ - 13.532538414, - 52.372817993 - ], - [ - 13.532709122, - 52.373149872 - ], - [ - 13.534140587, - 52.375667572 - ], - [ - 13.534313202, - 52.375961304 - ], - [ - 13.534439087, - 52.376140594 - ], - [ - 13.534526825, - 52.376251221 - ], - [ - 13.534785271, - 52.376514435 - ], - [ - 13.535042763, - 52.376712799 - ], - [ - 13.535244942, - 52.376853943 - ], - [ - 13.535474777, - 52.376983643 - ], - [ - 13.535713196, - 52.377109528 - ], - [ - 13.536309242, - 52.377346039 - ], - [ - 13.53663826, - 52.377441406 - ], - [ - 13.537053108, - 52.377536774 - ], - [ - 13.537810326, - 52.377681732 - ], - [ - 13.53807354, - 52.377750397 - ], - [ - 13.538312912, - 52.377830505 - ], - [ - 13.538555145, - 52.377925873 - ], - [ - 13.538812637, - 52.378055573 - ], - [ - 13.538974762, - 52.37815094 - ], - [ - 13.5391922, - 52.378314972 - ], - [ - 13.539357185, - 52.378479004 - ], - [ - 13.539421082, - 52.378543854 - ], - [ - 13.539493561, - 52.378623962 - ], - [ - 13.539569855, - 52.378723145 - ], - [ - 13.539703369, - 52.379005432 - ], - [ - 13.539748192, - 52.379161835 - ], - [ - 13.539772034, - 52.37940979 - ], - [ - 13.539751053, - 52.379619598 - ], - [ - 13.539697647, - 52.379798889 - ], - [ - 13.539621353, - 52.379974365 - ], - [ - 13.539505959, - 52.380153656 - ], - [ - 13.539352417, - 52.380329132 - ], - [ - 13.539167404, - 52.380493164 - ], - [ - 13.538882256, - 52.380710602 - ], - [ - 13.536517143, - 52.382324219 - ], - [ - 13.536241531, - 52.382499695 - ], - [ - 13.535950661, - 52.382644653 - ], - [ - 13.535591125, - 52.3828125 - ], - [ - 13.535319328, - 52.382923126 - ], - [ - 13.535028458, - 52.383018494 - ], - [ - 13.534606934, - 52.383136749 - ], - [ - 13.53421402, - 52.383220673 - ], - [ - 13.533993721, - 52.38325882 - ], - [ - 13.533719063, - 52.383296967 - ], - [ - 13.533379555, - 52.383335114 - ], - [ - 13.53301239, - 52.383358002 - ], - [ - 13.532653809, - 52.383365631 - ], - [ - 13.53222084, - 52.383361816 - ], - [ - 13.531785011, - 52.383354187 - ], - [ - 13.531435013, - 52.383361816 - ], - [ - 13.531114578, - 52.383388519 - ], - [ - 13.530774117, - 52.383441925 - ], - [ - 13.530474663, - 52.383522034 - ], - [ - 13.530198097, - 52.383605957 - ], - [ - 13.529940605, - 52.383716583 - ], - [ - 13.529669762, - 52.383857727 - ], - [ - 13.529401779, - 52.384044647 - ], - [ - 13.529109955, - 52.38432312 - ], - [ - 13.52870369, - 52.384784698 - ], - [ - 13.528428078, - 52.38508606 - ], - [ - 13.528366089, - 52.385158539 - ], - [ - 13.524540901, - 52.389453888 - ], - [ - 13.524550438, - 52.389503479 - ], - [ - 13.524573326, - 52.389541626 - ], - [ - 13.524604797, - 52.389583588 - ], - [ - 13.524658203, - 52.389625549 - ], - [ - 13.525242805, - 52.389953613 - ], - [ - 13.525495529, - 52.390113831 - ], - [ - 13.525518417, - 52.390159607 - ], - [ - 13.525501251, - 52.390201569 - ], - [ - 13.525468826, - 52.390254974 - ], - [ - 13.525419235, - 52.390304565 - ], - [ - 13.524431229, - 52.391159058 - ], - [ - 13.523122787, - 52.392383575 - ], - [ - 13.522995949, - 52.392505646 - ], - [ - 13.522948265, - 52.392536163 - ], - [ - 13.52287674, - 52.392559052 - ], - [ - 13.522799492, - 52.392566681 - ], - [ - 13.522711754, - 52.392566681 - ], - [ - 13.521859169, - 
52.392444611 - ], - [ - 13.521745682, - 52.392436981 - ], - [ - 13.521669388, - 52.392440796 - ], - [ - 13.521622658, - 52.392456055 - ], - [ - 13.521595955, - 52.392478943 - ], - [ - 13.521548271, - 52.392578125 - ], - [ - 13.522637367, - 52.392738342 - ], - [ - 13.522878647, - 52.392772675 - ], - [ - 13.523015022, - 52.392837524 - ], - [ - 13.523111343, - 52.392879486 - ], - [ - 13.523198128, - 52.392917633 - ], - [ - 13.523303032, - 52.392993927 - ], - [ - 13.523317337, - 52.393058777 - ], - [ - 13.523306847, - 52.393119812 - ], - [ - 13.523284912, - 52.393169403 - ], - [ - 13.522663116, - 52.393768311 - ], - [ - 13.521858215, - 52.394523621 - ], - [ - 13.521655083, - 52.394649506 - ], - [ - 13.521375656, - 52.39491272 - ], - [ - 13.520638466, - 52.395599365 - ], - [ - 13.520013809, - 52.396232605 - ], - [ - 13.519786835, - 52.396499634 - ], - [ - 13.51952076, - 52.396839142 - ], - [ - 13.519312859, - 52.397209167 - ], - [ - 13.519210815, - 52.397247314 - ], - [ - 13.519133568, - 52.397315979 - ], - [ - 13.519043922, - 52.397338867 - ], - [ - 13.518992424, - 52.397354126 - ], - [ - 13.518731117, - 52.397266388 - ], - [ - 13.518521309, - 52.397186279 - ], - [ - 13.518030167, - 52.396968842 - ], - [ - 13.517698288, - 52.397338867 - ], - [ - 13.51756382, - 52.397563934 - ], - [ - 13.517389297, - 52.397800446 - ], - [ - 13.516566277, - 52.398674011 - ], - [ - 13.515673637, - 52.399570465 - ], - [ - 13.514561653, - 52.400661469 - ], - [ - 13.514300346, - 52.400932312 - ], - [ - 13.513332367, - 52.401851654 - ], - [ - 13.51246357, - 52.40272522 - ], - [ - 13.510783195, - 52.40435791 - ], - [ - 13.510543823, - 52.404605865 - ], - [ - 13.510230064, - 52.404914856 - ], - [ - 13.50899601, - 52.406147003 - ], - [ - 13.508612633, - 52.406547546 - ], - [ - 13.50774765, - 52.407478333 - ], - [ - 13.506917953, - 52.408348083 - ], - [ - 13.505527496, - 52.409721375 - ], - [ - 13.505458832, - 52.40978241 - ], - [ - 13.505138397, - 52.41009903 - ], - [ - 13.503731728, - 52.411491394 - ], - [ - 13.503533363, - 52.411678314 - ], - [ - 13.502279282, - 52.412883759 - ], - [ - 13.501524925, - 52.413482666 - ], - [ - 13.501321793, - 52.413619995 - ], - [ - 13.500832558, - 52.413936615 - ], - [ - 13.50038147, - 52.4141922 - ], - [ - 13.49997139, - 52.414409637 - ], - [ - 13.499858856, - 52.414455414 - ], - [ - 13.499188423, - 52.414749146 - ], - [ - 13.498696327, - 52.41493988 - ], - [ - 13.497921944, - 52.415218353 - ], - [ - 13.497368813, - 52.415431976 - ], - [ - 13.496650696, - 52.415706635 - ], - [ - 13.496446609, - 52.415782928 - ], - [ - 13.496009827, - 52.415969849 - ], - [ - 13.495700836, - 52.416107178 - ], - [ - 13.495515823, - 52.416194916 - ], - [ - 13.495312691, - 52.416297913 - ], - [ - 13.494745255, - 52.41658783 - ], - [ - 13.49464035, - 52.416652679 - ], - [ - 13.494258881, - 52.416881561 - ], - [ - 13.493819237, - 52.417167664 - ], - [ - 13.493548393, - 52.417369843 - ], - [ - 13.493290901, - 52.417572021 - ], - [ - 13.493026733, - 52.417778015 - ], - [ - 13.492693901, - 52.418022156 - ], - [ - 13.492493629, - 52.418174744 - ], - [ - 13.492147446, - 52.418441772 - ], - [ - 13.490100861, - 52.420017242 - ], - [ - 13.489993095, - 52.420093536 - ], - [ - 13.489733696, - 52.420288086 - ], - [ - 13.489574432, - 52.420440674 - ], - [ - 13.48927021, - 52.420764923 - ], - [ - 13.489129066, - 52.420928955 - ], - [ - 13.488491058, - 52.421710968 - ], - [ - 13.488237381, - 52.421993256 - ], - [ - 13.487900734, - 52.422344208 - ], - [ - 13.487172127, - 52.422939301 - ], - [ - 13.486559868, - 52.423408508 
- ], - [ - 13.486092567, - 52.423770905 - ], - [ - 13.48562336, - 52.424152374 - ], - [ - 13.485471725, - 52.424255371 - ], - [ - 13.485077858, - 52.424537659 - ], - [ - 13.484401703, - 52.425022125 - ], - [ - 13.483383179, - 52.425769806 - ], - [ - 13.483257294, - 52.425865173 - ], - [ - 13.482924461, - 52.426101685 - ], - [ - 13.482698441, - 52.426265717 - ], - [ - 13.480806351, - 52.427612305 - ], - [ - 13.479895592, - 52.428302765 - ], - [ - 13.47981739, - 52.428356171 - ], - [ - 13.47854805, - 52.42930603 - ], - [ - 13.478359222, - 52.429431915 - ], - [ - 13.478157997, - 52.429595947 - ], - [ - 13.478037834, - 52.429683685 - ], - [ - 13.47772789, - 52.429954529 - ], - [ - 13.477515221, - 52.430137634 - ], - [ - 13.476874352, - 52.430652618 - ], - [ - 13.47661972, - 52.430850983 - ], - [ - 13.476373672, - 52.431026459 - ], - [ - 13.475787163, - 52.431369781 - ], - [ - 13.475365639, - 52.431587219 - ], - [ - 13.474074364, - 52.4322052 - ], - [ - 13.473599434, - 52.432434082 - ], - [ - 13.473415375, - 52.43252182 - ], - [ - 13.472993851, - 52.432712555 - ], - [ - 13.4716959, - 52.433292389 - ], - [ - 13.471329689, - 52.433441162 - ], - [ - 13.469817162, - 52.434043884 - ], - [ - 13.469201088, - 52.434314728 - ], - [ - 13.469059944, - 52.434391022 - ], - [ - 13.468596458, - 52.434631348 - ], - [ - 13.466616631, - 52.435817719 - ], - [ - 13.466080666, - 52.436122894 - ], - [ - 13.465838432, - 52.436260223 - ], - [ - 13.465315819, - 52.436565399 - ], - [ - 13.464496613, - 52.437023163 - ], - [ - 13.463171005, - 52.43762207 - ], - [ - 13.462702751, - 52.437843323 - ], - [ - 13.462409019, - 52.437988281 - ], - [ - 13.46231842, - 52.438037872 - ], - [ - 13.462058067, - 52.438179016 - ], - [ - 13.460422516, - 52.439193726 - ], - [ - 13.460037231, - 52.439479828 - ], - [ - 13.459775925, - 52.43970108 - ], - [ - 13.459723473, - 52.439754486 - ], - [ - 13.459409714, - 52.440021515 - ], - [ - 13.459005356, - 52.440368652 - ], - [ - 13.458240509, - 52.441017151 - ], - [ - 13.457676888, - 52.441509247 - ], - [ - 13.456965446, - 52.442108154 - ], - [ - 13.456719398, - 52.442359924 - ], - [ - 13.456583023, - 52.442497253 - ], - [ - 13.456512451, - 52.442592621 - ], - [ - 13.456332207, - 52.442821503 - ], - [ - 13.456071854, - 52.443252563 - ], - [ - 13.455370903, - 52.444972992 - ], - [ - 13.455272675, - 52.44519043 - ], - [ - 13.455172539, - 52.4454422 - ], - [ - 13.454929352, - 52.445533752 - ], - [ - 13.454372406, - 52.445556641 - ], - [ - 13.452836037, - 52.44562149 - ], - [ - 13.451435089, - 52.445671082 - ], - [ - 13.449950218, - 52.445732117 - ], - [ - 13.449712753, - 52.445739746 - ], - [ - 13.449320793, - 52.44575882 - ], - [ - 13.448624611, - 52.445781708 - ], - [ - 13.448477745, - 52.445789337 - ], - [ - 13.447191238, - 52.445838928 - ], - [ - 13.445914268, - 52.445884705 - ], - [ - 13.445550919, - 52.445896149 - ], - [ - 13.444639206, - 52.445934296 - ], - [ - 13.444497108, - 52.445941925 - ], - [ - 13.444350243, - 52.445953369 - ], - [ - 13.4439888, - 52.44600296 - ], - [ - 13.442544937, - 52.446834564 - ], - [ - 13.441972733, - 52.447113037 - ], - [ - 13.440879822, - 52.447547913 - ], - [ - 13.440397263, - 52.447731018 - ], - [ - 13.440110207, - 52.447826385 - ], - [ - 13.439696312, - 52.447929382 - ], - [ - 13.439285278, - 52.448001862 - ], - [ - 13.439059258, - 52.448059082 - ], - [ - 13.438909531, - 52.448108673 - ], - [ - 13.438746452, - 52.448192596 - ], - [ - 13.438674927, - 52.448131561 - ], - [ - 13.437895775, - 52.447433472 - ], - [ - 13.437681198, - 52.447235107 - ], - [ - 
13.437045097, - 52.446689606 - ], - [ - 13.436873436, - 52.44651413 - ], - [ - 13.436362267, - 52.446037292 - ], - [ - 13.436190605, - 52.445854187 - ], - [ - 13.436362267, - 52.446037292 - ], - [ - 13.436873436, - 52.44651413 - ], - [ - 13.437045097, - 52.446689606 - ], - [ - 13.437681198, - 52.447235107 - ], - [ - 13.437660217, - 52.447479248 - ], - [ - 13.437644005, - 52.447570801 - ], - [ - 13.437449455, - 52.448303223 - ], - [ - 13.437498093, - 52.448410034 - ], - [ - 13.437361717, - 52.448432922 - ], - [ - 13.437185287, - 52.448471069 - ], - [ - 13.437046051, - 52.448516846 - ], - [ - 13.436974525, - 52.448551178 - ], - [ - 13.436769485, - 52.448654175 - ], - [ - 13.436707497, - 52.448688507 - ], - [ - 13.436658859, - 52.448745728 - ], - [ - 13.436600685, - 52.44877243 - ], - [ - 13.436264038, - 52.448947906 - ], - [ - 13.436333656, - 52.449050903 - ], - [ - 13.436501503, - 52.449287415 - ], - [ - 13.436527252, - 52.449321747 - ], - [ - 13.436580658, - 52.449398041 - ], - [ - 13.438248634, - 52.451702118 - ], - [ - 13.438462257, - 52.452030182 - ], - [ - 13.438505173, - 52.452144623 - ], - [ - 13.43860054, - 52.452201843 - ], - [ - 13.438648224, - 52.452415466 - ], - [ - 13.438677788, - 52.452579498 - ], - [ - 13.438673973, - 52.452697754 - ], - [ - 13.438651085, - 52.452781677 - ], - [ - 13.438504219, - 52.453048706 - ], - [ - 13.438376427, - 52.453136444 - ], - [ - 13.438240051, - 52.453483582 - ], - [ - 13.437959671, - 52.454143524 - ], - [ - 13.437572479, - 52.455108643 - ], - [ - 13.43737793, - 52.455593109 - ], - [ - 13.437252045, - 52.455905914 - ], - [ - 13.437194824, - 52.456039429 - ], - [ - 13.437150002, - 52.456150055 - ], - [ - 13.436961174, - 52.456607819 - ], - [ - 13.436709404, - 52.457176208 - ], - [ - 13.436512947, - 52.45759201 - ], - [ - 13.436303139, - 52.457977295 - ], - [ - 13.436096191, - 52.458377838 - ], - [ - 13.436008453, - 52.458568573 - ], - [ - 13.435801506, - 52.459049225 - ], - [ - 13.435427666, - 52.459831238 - ], - [ - 13.435299873, - 52.46018219 - ], - [ - 13.435123444, - 52.460720062 - ], - [ - 13.435070038, - 52.460891724 - ], - [ - 13.435009956, - 52.461063385 - ], - [ - 13.434931755, - 52.461303711 - ], - [ - 13.434843063, - 52.46149826 - ], - [ - 13.434798241, - 52.461856842 - ], - [ - 13.434743881, - 52.462051392 - ], - [ - 13.434661865, - 52.462287903 - ], - [ - 13.434613228, - 52.462429047 - ], - [ - 13.434565544, - 52.462532043 - ], - [ - 13.434479713, - 52.462696075 - ], - [ - 13.434398651, - 52.462814331 - ], - [ - 13.434216499, - 52.462982178 - ], - [ - 13.434041023, - 52.463176727 - ], - [ - 13.433871269, - 52.463363647 - ], - [ - 13.433573723, - 52.4637146 - ], - [ - 13.433339119, - 52.464027405 - ], - [ - 13.432988167, - 52.46452713 - ], - [ - 13.432909966, - 52.46465683 - ], - [ - 13.433052063, - 52.464736938 - ], - [ - 13.433185577, - 52.464847565 - ], - [ - 13.434347153, - 52.46578598 - ], - [ - 13.43439579, - 52.465824127 - ], - [ - 13.434561729, - 52.465961456 - ], - [ - 13.4347229, - 52.466087341 - ], - [ - 13.435704231, - 52.466880798 - ], - [ - 13.436393738, - 52.466960907 - ], - [ - 13.436722755, - 52.466999054 - ], - [ - 13.436852455, - 52.467014313 - ], - [ - 13.438138962, - 52.467159271 - ], - [ - 13.440406799, - 52.467430115 - ], - [ - 13.441808701, - 52.467594147 - ], - [ - 13.441857338, - 52.467605591 - ], - [ - 13.442008972, - 52.467670441 - ], - [ - 13.44198513, - 52.467803955 - ], - [ - 13.441916466, - 52.468208313 - ], - [ - 13.441903114, - 52.468284607 - ], - [ - 13.441795349, - 52.46887207 - ], - [ - 
13.441762924, - 52.468978882 - ], - [ - 13.44181633, - 52.469089508 - ], - [ - 13.441812515, - 52.469108582 - ], - [ - 13.441781998, - 52.469345093 - ], - [ - 13.441745758, - 52.469604492 - ], - [ - 13.441641808, - 52.469726562 - ], - [ - 13.441613197, - 52.469841003 - ], - [ - 13.441549301, - 52.470172882 - ], - [ - 13.441507339, - 52.470413208 - ], - [ - 13.441498756, - 52.470462799 - ], - [ - 13.441394806, - 52.471038818 - ], - [ - 13.441366196, - 52.471206665 - ], - [ - 13.441405296, - 52.471279144 - ], - [ - 13.441369057, - 52.471508026 - ], - [ - 13.441458702, - 52.471515656 - ], - [ - 13.441511154, - 52.4715271 - ], - [ - 13.441576958, - 52.471569061 - ], - [ - 13.442343712, - 52.472263336 - ], - [ - 13.442426682, - 52.472301483 - ], - [ - 13.442523003, - 52.472324371 - ], - [ - 13.442516327, - 52.472366333 - ], - [ - 13.4425354, - 52.47240448 - ], - [ - 13.44308567, - 52.472888947 - ], - [ - 13.443483353, - 52.473255157 - ], - [ - 13.443502426, - 52.47328186 - ], - [ - 13.443569183, - 52.473423004 - ], - [ - 13.443605423, - 52.473564148 - ], - [ - 13.443622589, - 52.473712921 - ], - [ - 13.443624496, - 52.4737854 - ], - [ - 13.444513321, - 52.473873138 - ], - [ - 13.445295334, - 52.473960876 - ], - [ - 13.445151329, - 52.474128723 - ], - [ - 13.445291519, - 52.474163055 - ], - [ - 13.446117401, - 52.474380493 - ], - [ - 13.446929932, - 52.474597931 - ], - [ - 13.44698143, - 52.47460556 - ], - [ - 13.447024345, - 52.47460556 - ], - [ - 13.447067261, - 52.474601746 - ], - [ - 13.447439194, - 52.475803375 - ], - [ - 13.447550774, - 52.476112366 - ], - [ - 13.447616577, - 52.476333618 - ], - [ - 13.447663307, - 52.476425171 - ], - [ - 13.44804287, - 52.477684021 - ], - [ - 13.448085785, - 52.47782135 - ], - [ - 13.448119164, - 52.477935791 - ], - [ - 13.448202133, - 52.478160858 - ], - [ - 13.448659897, - 52.478504181 - ], - [ - 13.449481964, - 52.479129791 - ], - [ - 13.450310707, - 52.479755402 - ], - [ - 13.450602531, - 52.479976654 - ], - [ - 13.450725555, - 52.480072021 - ], - [ - 13.451802254, - 52.480880737 - ], - [ - 13.452273369, - 52.481243134 - ], - [ - 13.45246315, - 52.481388092 - ], - [ - 13.452612877, - 52.481498718 - ], - [ - 13.45324707, - 52.48197937 - ], - [ - 13.453961372, - 52.482517242 - ], - [ - 13.456096649, - 52.484138489 - ], - [ - 13.456254959, - 52.484260559 - ], - [ - 13.457948685, - 52.485546112 - ], - [ - 13.458347321, - 52.485839844 - ], - [ - 13.456992149, - 52.486808777 - ], - [ - 13.456432343, - 52.487201691 - ], - [ - 13.455653191, - 52.487758636 - ], - [ - 13.453977585, - 52.488941193 - ], - [ - 13.454572678, - 52.489376068 - ], - [ - 13.455964088, - 52.490432739 - ], - [ - 13.456983566, - 52.491188049 - ], - [ - 13.457713127, - 52.491714478 - ], - [ - 13.457920074, - 52.491786957 - ], - [ - 13.458016396, - 52.491840363 - ], - [ - 13.45939827, - 52.492835999 - ], - [ - 13.459500313, - 52.492912292 - ], - [ - 13.460221291, - 52.493465424 - ], - [ - 13.460617065, - 52.493797302 - ], - [ - 13.460804939, - 52.493942261 - ], - [ - 13.461070061, - 52.494159698 - ], - [ - 13.461850166, - 52.494800568 - ], - [ - 13.462110519, - 52.49521637 - ], - [ - 13.462246895, - 52.495323181 - ], - [ - 13.46389389, - 52.496665955 - ], - [ - 13.464204788, - 52.496753693 - ], - [ - 13.464544296, - 52.497119904 - ], - [ - 13.464606285, - 52.497215271 - ], - [ - 13.464643478, - 52.497310638 - ], - [ - 13.464666367, - 52.497421265 - ], - [ - 13.464753151, - 52.497566223 - ], - [ - 13.464924812, - 52.497829437 - ], - [ - 13.465081215, - 52.49805069 - ], - [ - 13.465513229, - 
52.498714447 - ], - [ - 13.465647697, - 52.49892807 - ], - [ - 13.465722084, - 52.499099731 - ], - [ - 13.465786934, - 52.499248505 - ], - [ - 13.465964317, - 52.499668121 - ], - [ - 13.466071129, - 52.499950409 - ], - [ - 13.466582298, - 52.501167297 - ], - [ - 13.466583252, - 52.501277924 - ], - [ - 13.466501236, - 52.501941681 - ], - [ - 13.466501236, - 52.502063751 - ], - [ - 13.466513634, - 52.502140045 - ], - [ - 13.466565132, - 52.502277374 - ], - [ - 13.466641426, - 52.502422333 - ], - [ - 13.466691971, - 52.502490997 - ], - [ - 13.466762543, - 52.502563477 - ], - [ - 13.466827393, - 52.502609253 - ], - [ - 13.466879845, - 52.502632141 - ], - [ - 13.466954231, - 52.502655029 - ], - [ - 13.467047691, - 52.502666473 - ], - [ - 13.467115402, - 52.502670288 - ], - [ - 13.467172623, - 52.502670288 - ], - [ - 13.467508316, - 52.502624512 - ], - [ - 13.468510628, - 52.502464294 - ], - [ - 13.469779968, - 52.502254486 - ], - [ - 13.469991684, - 52.502220154 - ], - [ - 13.471398354, - 52.502017975 - ], - [ - 13.474914551, - 52.501502991 - ], - [ - 13.475172043, - 52.5014534 - ], - [ - 13.475209236, - 52.501560211 - ], - [ - 13.475388527, - 52.501979828 - ], - [ - 13.477026939, - 52.50170517 - ], - [ - 13.477138519, - 52.50169754 - ], - [ - 13.477298737, - 52.50170517 - ], - [ - 13.477692604, - 52.501735687 - ], - [ - 13.478367805, - 52.501785278 - ], - [ - 13.48141098, - 52.502010345 - ], - [ - 13.483391762, - 52.502151489 - ], - [ - 13.484308243, - 52.502223969 - ], - [ - 13.484597206, - 52.502246857 - ], - [ - 13.486724854, - 52.502410889 - ], - [ - 13.487017632, - 52.502426147 - ], - [ - 13.487155914, - 52.502399445 - ], - [ - 13.488366127, - 52.502124786 - ], - [ - 13.48913765, - 52.501945496 - ], - [ - 13.490619659, - 52.501483917 - ], - [ - 13.490981102, - 52.501377106 - ], - [ - 13.492963791, - 52.500965118 - ], - [ - 13.493370056, - 52.500881195 - ], - [ - 13.494781494, - 52.500595093 - ], - [ - 13.495025635, - 52.500537872 - ], - [ - 13.495450974, - 52.500431061 - ], - [ - 13.495686531, - 52.500354767 - ], - [ - 13.495876312, - 52.500293732 - ], - [ - 13.496304512, - 52.500156403 - ], - [ - 13.497889519, - 52.499641418 - ] - ] - } - } - } - }, - "aOutboundAllDay20190301ArrDep": { - "arrivalDepartureId": "YS1vdXRib3VuZC1hbGwtZGF5:MjAxOS0wMy0wMQ==:Mw==:LTE=:LTE=", - "tripId": "a-outbound-all-day", - "date": "2019-03-01T00:00:00", - "stopSequence": 3, - "frequenciesRow": -1, - "frequenciesIt": -1 - }, - "bDowntownOnWorkingDays20190608ArrDep": { - "arrivalDepartureId": "Yi1kb3dudG93bi1vbi13b3JraW5nLWRheXM=:MjAxOS0wMy0wOA==:Mw==:MQ==:Mg==", - "tripId": "b-downtown-on-working-days", - "date": "2019-03-08T00:00:00", - "stopSequence": 3, - "frequenciesRow": 1, - "frequenciesIt": 2 - }, - "aOutboundAllDay20190301Con": { - "connectionId": "YS1vdXRib3VuZC1hbGwtZGF5:MjAxOS0wMy0wMQ==:Mw==:LTE=:LTE=", - "tripId": "a-outbound-all-day", - "date": "2019-03-01T00:00:00", - "fromStopSequence": 3, - "frequenciesRow": -1, - "frequenciesIt": -1 - }, - "bDowntownOnWorkingDays20190608Con": { - "connectionId": "Yi1kb3dudG93bi1vbi13b3JraW5nLWRheXM=:MjAxOS0wMy0wOA==:Mw==:MQ==:Mg==", - "tripId": "b-downtown-on-working-days", - "date": "2019-03-08T00:00:00", - "fromStopSequence": 3, - "frequenciesRow": 1, - "frequenciesIt": 2 - } - } -} diff --git a/test/sample-gtfs-feed.sh b/test/sample-gtfs-feed.sh index a368ee6..fe136a4 100755 --- a/test/sample-gtfs-feed.sh +++ b/test/sample-gtfs-feed.sh @@ -8,11 +8,15 @@ set -x env | grep '^PG' || true -psql -c 'create database sample_gtfs_feed' -export 
PGDATABASE='sample_gtfs_feed' +# path_to_db="sample-gtfs-feed.duckdb" +path_to_db="$(mktemp -d)/sample-gtfs-feed.duckdb" +# path_to_db=':memory:' +# todo: what about sample-gtfs-feed@0.13? # --lower-case-lang-codes: Even though sample-gtfs-feed@0.11.2 *does not* contain invalid-case language codes (e.g. de_aT or de-at), we check that with --lower-case-lang-codes valid ones are still accepted. ../cli.js -d --trips-without-shape-id --lower-case-lang-codes -- \ + "$path_to_db" \ + ../node_modules/sample-gtfs-feed/gtfs/feed_info.txt \ ../node_modules/sample-gtfs-feed/gtfs/agency.txt \ ../node_modules/sample-gtfs-feed/gtfs/calendar.txt \ ../node_modules/sample-gtfs-feed/gtfs/calendar_dates.txt \ @@ -23,8 +27,7 @@ export PGDATABASE='sample_gtfs_feed' ../node_modules/sample-gtfs-feed/gtfs/stop_times.txt \ ../node_modules/sample-gtfs-feed/gtfs/levels.txt \ ../node_modules/sample-gtfs-feed/gtfs/pathways.txt \ - ../node_modules/sample-gtfs-feed/gtfs/translations.txt \ - | sponge | psql -b + ../node_modules/sample-gtfs-feed/gtfs/translations.txt query=$(cat << EOF select extract(epoch from t_arrival)::integer as t_arrival @@ -34,18 +37,19 @@ order by t_arrival EOF ) -arr1=$(psql --csv -t -c "$query" | head -n 1) +arr1=$(duckdb -csv -noheader -c "$query" "$path_to_db" | head -n 1) if [[ "$arr1" != "1553993700" ]]; then echo "invalid 1st t_arrival: $arr1" 1>&2 exit 1 fi -arr2=$(psql --csv -t -c "$query" | head -n 2 | tail -n 1) +arr2=$(duckdb -csv -noheader -c "$query" "$path_to_db" | head -n 2 | tail -n 1) if [[ "$arr2" != "1553994180" ]]; then echo "invalid 2nd t_arrival: $arr2" 1>&2 exit 1 fi +# In sample-gtfs-feed@0.13, the frequencies-based arrivals/departures are earlier (from 8:00 until 8:59) than the stop_times-based ones (13:13), so across all service days, the earliest departure has to be a frequencies-based one. 
arrs_deps_b_downtown_on_working_days=$(cat << EOF SELECT stop_sequence, @@ -58,12 +62,12 @@ arrs_deps_b_downtown_on_working_days=$(cat << EOF LIMIT 2 EOF ) -freq_arr_dep1=$(psql --csv -t -c "$arrs_deps_b_downtown_on_working_days" | head -n 1) +freq_arr_dep1=$(duckdb -csv -noheader -c "$arrs_deps_b_downtown_on_working_days" "$path_to_db" | head -n 1) if [[ "$freq_arr_dep1" != "1,1552028340,1552028400,1,1" ]]; then echo "invalid/missing frequencies-based arrival/departure: $freq_arr_dep1" 1>&2 exit 1 fi -freq_arr_dep2=$(psql --csv -t -c "$arrs_deps_b_downtown_on_working_days" | head -n 2 | tail -n 1) +freq_arr_dep2=$(duckdb -csv -noheader -c "$arrs_deps_b_downtown_on_working_days" "$path_to_db" | head -n 2 | tail -n 1) if [[ "$freq_arr_dep2" != "1,1552028640,1552028700,1,2" ]]; then echo "invalid/missing frequencies-based arrival/departure: $freq_arr_dep2" 1>&2 exit 1 @@ -81,7 +85,7 @@ cons_b_downtown_on_working_days=$(cat << EOF LIMIT 1 EOF ) -freq_con1=$(psql --csv -t -c "$cons_b_downtown_on_working_days") +freq_con1=$(duckdb -csv -noheader -c "$cons_b_downtown_on_working_days" "$path_to_db") if [[ "$freq_con1" != "1,1552028400,3,1552028760" ]]; then echo "invalid/missing frequencies-based connection: $freq_con1" 1>&2 exit 1 @@ -93,10 +97,10 @@ connection_during_dst=$(cat << EOF extract(epoch from t_departure)::integer as dep FROM connections WHERE trip_id = 'during-dst-1' - AND t_departure = '2019-03-31T01:58+01' + AND t_departure = '2019-03-31T01:58:00+01:00' EOF ) -dst1=$(psql --csv -t -c "$connection_during_dst" | head -n 1) +dst1=$(duckdb -csv -noheader -c "$connection_during_dst" "$path_to_db" | head -n 1) if [[ "$dst1" != "0,1553993880" ]]; then echo "invalid/missing DST t_departure: $dst1" 1>&2 exit 1 @@ -113,8 +117,8 @@ airport_levels=$(cat << EOF LIMIT 1 EOF ) -lvl1=$(psql --csv -t -c "$airport_levels" | head -n 1) -if [[ "$lvl1" != "airport-level-0,0,ground level" ]]; then +lvl1=$(duckdb -csv -noheader -c "$airport_levels" "$path_to_db" | head -n 1) +if [[ "$lvl1" != 'airport-level-0,0.0,ground level' ]]; then echo "invalid/missing lowest airport-% level: $lvl1" 1>&2 exit 1 fi @@ -129,8 +133,8 @@ airportPathway=$(cat << EOF LIMIT 1 EOF ) -pw1=$(psql --csv -t -c "$airportPathway" | head -n 1) -if [[ "$pw1" != "escalator,f" ]]; then +pw1=$(duckdb -csv -noheader -c "$airportPathway" "$path_to_db" | head -n 1) +if [[ "$pw1" != 'escalator,false' ]]; then echo "invalid/missing DST t_departure: $pw1" 1>&2 exit 1 fi @@ -143,7 +147,7 @@ timepoint_exact=$(cat << EOF LIMIT 1 EOF ) -exact1=$(psql --csv -t -c "$timepoint_exact" | head -n 1) +exact1=$(duckdb -csv -noheader -c "$timepoint_exact" "$path_to_db" | head -n 1) if [[ "$exact1" != "exact" ]]; then echo "invalid/missing DST t_departure: $exact1" 1>&2 exit 1 @@ -157,7 +161,7 @@ stops_translations=$(cat << EOF AND record_id = 'airport-entrance' EOF ) -airport_entrance_translation=$(psql --csv -t -c "$stops_translations") +airport_entrance_translation=$(duckdb -csv -noheader -c "$stops_translations" "$path_to_db") if [[ "$airport_entrance_translation" != "Eingang,de-DE" ]]; then echo "invalid/missing stop translation: $airport_entrance_translation" 1>&2 exit 1 @@ -173,7 +177,7 @@ stops_translated=$(cat << EOF AND stop_id = 'airport-entrance' EOF ) -translated_airport_entrance=$(psql --csv -t -c "$stops_translated") +translated_airport_entrance=$(duckdb -csv -noheader -c "$stops_translated" "$path_to_db") if [[ "$translated_airport_entrance" != "airport-entrance,Eingang,de-DE" ]]; then echo "invalid/missing translated stop: 
$translated_airport_entrance" 1>&2 exit 1 @@ -187,10 +191,10 @@ WHERE route_id = ANY(ARRAY['A', 'B']) ORDER BY trip_id EOF ) -wheelchair_accessible_arrs_deps_rows="$(psql --csv -t -c "$wheelchair_accessible_arrs_deps_query")" +wheelchair_accessible_arrs_deps_rows="$(duckdb -csv -noheader -c "$wheelchair_accessible_arrs_deps_query" "$path_to_db")" wheelchair_accessible_arrs_deps_expected=$(cat << EOF -a-downtown-all-day, -a-outbound-all-day, +a-downtown-all-day,NULL +a-outbound-all-day,NULL b-downtown-on-weekends,accessible b-downtown-on-working-days,accessible b-outbound-on-weekends,unknown @@ -210,10 +214,10 @@ WHERE route_id = ANY(ARRAY['A', 'B']) ORDER BY trip_id EOF ) -bikes_allowed_arrs_deps_rows="$(psql --csv -t -c "$bikes_allowed_arrs_deps_query")" +bikes_allowed_arrs_deps_rows="$(duckdb -csv -noheader -c "$bikes_allowed_arrs_deps_query" "$path_to_db")" bikes_allowed_arrs_deps_expected=$(cat << EOF -a-downtown-all-day, -a-outbound-all-day, +a-downtown-all-day,NULL +a-outbound-all-day,NULL b-downtown-on-weekends,unknown b-downtown-on-working-days,unknown b-outbound-on-weekends,allowed @@ -229,9 +233,10 @@ frequencies_it_query=$(cat << EOF SELECT t_departure, stop_sequence, stop_id, frequencies_it FROM arrivals_departures WHERE trip_id = 'b-downtown-on-working-days' AND "date" = '2019-05-29' AND frequencies_it = 3 +ORDER BY t_departure EOF ) -frequencies_it_rows="$(psql --csv -t -c "$frequencies_it_query")" +frequencies_it_rows="$(duckdb -csv -noheader -c "$frequencies_it_query" "$path_to_db")" frequencies_it_expected=$(cat << EOF 2019-05-29 08:10:00+02,1,airport,3 2019-05-29 08:18:00+02,3,lake,3 @@ -253,7 +258,7 @@ ORDER BY t_departure ASC LIMIT 3 EOF ) -frequencies_it_connections_rows="$(psql --csv -t -c "$frequencies_it_connections_query")" +frequencies_it_connections_rows="$(duckdb -csv -noheader -c "$frequencies_it_connections_query" "$path_to_db")" frequencies_it_connections_expected=$(cat << EOF 1,2019-03-08 08:00:00+01,2019-03-08 08:06:00+01,1 1,2019-03-08 08:05:00+01,2019-03-08 08:11:00+01,2 @@ -273,9 +278,10 @@ SELECT stop_url, stop_url_lang FROM stops_translated WHERE stop_id LIKE 'airport%' +ORDER BY stop_id, stop_name_lang, stop_desc_lang EOF ) -stops_translated_rows="$(psql --csv -t -c "$stops_translated_query")" +stops_translated_rows="$(duckdb -csv -noheader -nullvalue '' -c "$stops_translated_query" "$path_to_db")" stops_translated_expected=$(cat << EOF airport,International Airport (ABC),,train station at the Internationl Airport (ABC),,https://fta.example.org/stations/airport.html, airport-1,Gleis 1,de-DE,Platform 1,,, diff --git a/test/stops-without-level-id.sh b/test/stops-without-level-id.sh index 7431429..473a2b0 100755 --- a/test/stops-without-level-id.sh +++ b/test/stops-without-level-id.sh @@ -8,16 +8,16 @@ set -x shopt -s extglob -# When omitting levels.txt, --stops-without-level-id/opt.stopsWithoutLevelId should be true by default. +# Importing should work *without* levels.txt. # see also https://github.com/public-transport/gtfs-via-postgres/issues/43 ../cli.js -d -s -- \ - ../node_modules/sample-gtfs-feed/gtfs/!(levels).txt \ - | grep -c 'stopsWithoutLevelId: true' + ':memory:' \ + ../node_modules/sample-gtfs-feed/gtfs/!(levels).txt # Importing should work *with* --stops-without-level-id (and without levels.txt). 
# see also https://github.com/public-transport/gtfs-via-postgres/issues/43#issuecomment-1632657546 ../cli.js -d -s --stops-without-level-id -- \ - ../node_modules/sample-gtfs-feed/gtfs/!(levels).txt \ - >/dev/null + ':memory:' \ + ../node_modules/sample-gtfs-feed/gtfs/!(levels).txt echo 'works ✔'