From 1b4f8a240845c3ead8bb18f142ae862770a4eafc Mon Sep 17 00:00:00 2001
From: Jannis R
Date: Thu, 12 Jun 2025 19:46:39 +0200
Subject: [PATCH 01/16] =?UTF-8?q?add=20guide:=20distance=20between=20stops?=
 =?UTF-8?q?=20along=20trip=20shape=20=F0=9F=93=9D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docs/analysis/distance-between-stops.md | 27 +++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 docs/analysis/distance-between-stops.md

diff --git a/docs/analysis/distance-between-stops.md b/docs/analysis/distance-between-stops.md
new file mode 100644
index 0000000..10bb1ff
--- /dev/null
+++ b/docs/analysis/distance-between-stops.md
@@ -0,0 +1,27 @@
+# calculating the geographic distance of a trip's shape between two stops
+
+1. For each of the two stops, find the position along the trip's shape that is closest to the stop (using `ST_LineLocatePoint()`), and then
+2. measure the length of the shape between those two positions (using `ST_LineSubstring()` & `ST_Length()`).
+
+```sql
+WITH
+	stop_a AS (
+		SELECT *
+		FROM stops
+		WHERE stop_id = 'stop A ID'
+	),
+	stop_b AS (
+		SELECT *
+		FROM stops
+		WHERE stop_id = 'stop B ID'
+	)
+SELECT
+	ST_Length(ST_LineSubstring(
+		shape::geography,
+		ST_LineLocatePoint(shape::geography, stop_a.stop_loc),
+		ST_LineLocatePoint(shape::geography, stop_b.stop_loc)
+	)) AS segment_length
+FROM stop_a, stop_b, trips
+JOIN shapes_aggregated ON shapes_aggregated.shape_id = trips.shape_id
+WHERE trip_id = 'some trip ID'
+```
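The same technique can be applied to *every* pair of consecutive stops of a trip: locate each stop along the shape once, then measure between neighbouring positions using a window function. The following is a rough, untested sketch; it assumes the default `gtfs-via-postgres` tables/views (`stop_times`, `stops` with `stop_loc`, `trips` with `shape_id`, `shapes_aggregated` with `shape`) and a placeholder trip ID, and – unlike the query above – it casts to `geometry` for `ST_LineLocatePoint()`/`ST_LineSubstring()` and only back to `geography` for the final length:

```sql
WITH stops_on_trip AS (
	SELECT
		stop_times.stop_sequence,
		stop_times.stop_id,
		stops.stop_loc,
		shapes_aggregated.shape
	FROM stop_times
	JOIN stops ON stops.stop_id = stop_times.stop_id
	JOIN trips ON trips.trip_id = stop_times.trip_id
	JOIN shapes_aggregated ON shapes_aggregated.shape_id = trips.shape_id
	WHERE stop_times.trip_id = 'some trip ID' -- placeholder
),
located AS (
	SELECT
		stop_sequence,
		stop_id,
		shape,
		-- fraction (0…1) along the shape that is closest to the stop
		ST_LineLocatePoint(shape::geometry, stop_loc::geometry) AS frac
	FROM stops_on_trip
)
SELECT
	stop_sequence,
	stop_id,
	-- length of the shape between the previous stop's position and this one;
	-- NULL for the first stop. Assumes frac increases with stop_sequence
	-- (i.e. the shape does not double back on itself).
	ST_Length(ST_LineSubstring(
		shape::geometry,
		lag(frac) OVER (ORDER BY stop_sequence),
		frac
	)::geography) AS metres_from_previous_stop
FROM located
ORDER BY stop_sequence;
```

Casting the sub-line back to `geography` before `ST_Length()` yields metres. `ST_LineSubstring()` requires the start fraction to be ≤ the end fraction, so with shapes that double back the two bounds would need to be swapped using `least()`/`greatest()`.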
From 42d31512ae99f98530949b810f850c227ff0366e Mon Sep 17 00:00:00 2001
From: Jannis R
Date: Thu, 26 Jun 2025 23:23:08 +0200
Subject: [PATCH 02/16] =?UTF-8?q?fix=20files=20import=20order=20in=20some?=
 =?UTF-8?q?=20cases=20with=20agency=20=F0=9F=90=9B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 lib/deps.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/deps.js b/lib/deps.js
index dceb7c6..b2b9d66 100644
--- a/lib/deps.js
+++ b/lib/deps.js
@@ -27,7 +27,7 @@ const getDependencies = (opt, files) => {
 			'frequencies',
 		],
 		routes: [
-			...(routesWithoutAgencyId ? [] : ['agency']),
+			...(routesWithoutAgencyId && !files.includes('agency') ? [] : ['agency']),
 		],
 		trips: [
 			'routes',

From 8e828f7a089330d69fe3dcaf9a48664d4ce45a3a Mon Sep 17 00:00:00 2001
From: Jannis R
Date: Thu, 26 Jun 2025 23:37:04 +0200
Subject: [PATCH 03/16] =?UTF-8?q?DuckDB=20rewrite:=20remove=20PostgreSQL-s?=
 =?UTF-8?q?pecific=20features=20=F0=9F=92=A5=E2=9C=85=F0=9F=92=9A?=
 =?UTF-8?q?=F0=9F=93=9D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/test.yml | 20 -
 cli.js | 46 -
 docs/import-metadata.md | 2 +-
 docs/postgrest.md | 9 -
 index.js | 115 -
 lib/calendar_dates.js | 4 -
 lib/pathways.js | 5 -
 lib/prerequisites.js | 13 -
 lib/routes.js | 7 -
 lib/service_days.js | 3 -
 lib/shapes.js | 5 -
 lib/stats_active_trips_by_hour.js | 4 -
 lib/stats_by_agency_route_stop_hour.js | 3 -
 lib/stats_by_route_date.js | 4 -
 lib/stop_times.js | 35 -
 lib/stops.js | 3 -
 lib/transfers.js | 8 -
 lib/translations.js | 173 --
 lib/trips.js | 4 -
 package.json | 14 +-
 readme.md | 61 -
 scripts/run-postgraphile.js | 84 -
 test/amtrak-gtfs-2021-10-06.sh | 35 -
 test/index.sh | 1 -
 test/postgraphile.sh | 31 -
 ...sample-gtfs-feed-postgraphile-test.graphql | 115 -
 ...ample-gtfs-feed-postgraphile-test.res.json | 2316 -----------------
 27 files changed, 3 insertions(+), 3117 deletions(-)
 delete mode 100644 docs/postgrest.md
 delete mode 100755 scripts/run-postgraphile.js
 delete mode 100755 test/postgraphile.sh
 delete mode 100644 test/sample-gtfs-feed-postgraphile-test.graphql
 delete mode 100644 test/sample-gtfs-feed-postgraphile-test.res.json

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 52ad492..5b5fc7a 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -54,26 +54,6 @@ jobs:
           PGPASSWORD: password
           PGDATABASE: postgres
-      - name: install PostgREST
-        run: |
-          set -euo pipefail
-          set -x
-          dl_url="$(
-            curl -fsSL \
-              -H "User-Agent: $user_agent" \
-              -H 'Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}' \
-              'https://api.github.com/repos/PostgREST/postgrest/releases/latest' \
-              | jq -rc '.assets[] | select(.name | test("linux-static-x86-64")) | .browser_download_url'
-          )"
-          wget -nv -U "$user_agent" \
-            --header='Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}' \
-            -O /tmp/postgrest.tar.xz \
-            "$dl_url"
-          tar -C /usr/local/bin -J -x postgrest ` option, so that PostgREST only exposes what's in the schema, preventing accidental leaks.
diff --git a/index.js b/index.js index 4343424..400fa71 100644 --- a/index.js +++ b/index.js @@ -26,13 +26,6 @@ const convertGtfsToSql = async function* (files, opt = {}) { statsByAgencyIdAndRouteIdAndStopAndHour: 'none', statsActiveTripsByHour: 'none', schema: 'public', - postgraphile: false, - postgraphilePassword: process.env.POSTGRAPHILE_PGPASSWORD || null, - postgrest: false, - postgrestPassword: process.env.POSTGREST_PASSWORD || null, - // see https://github.com/pgexperts/pg_plan_filter - // see also https://www.postgresql.org/docs/14/using-explain.html - postgrestQueryCostLimit: null, // or float importMetadata: false, ...opt, } @@ -47,16 +40,6 @@ const convertGtfsToSql = async function* (files, opt = {}) { statsByAgencyIdAndRouteIdAndStopAndHour, statsActiveTripsByHour, } = opt - let postgraphilePassword = opt.postgraphilePassword - if (opt.postgraphile && postgraphilePassword === null) { - postgraphilePassword = randomBytes(10).toString('hex') - console.error(`PostGraphile PostgreSQL user's password:`, postgraphilePassword) - } - let postgrestPassword = opt.postgrestPassword - if (opt.postgrest && postgrestPassword === null) { - postgrestPassword = randomBytes(10).toString('hex') - console.error(`PostrREST PostgreSQL user's password:`, postgrestPassword) - } if (ignoreUnsupportedFiles) { files = files.filter(f => !!formatters[f.name]) @@ -255,104 +238,6 @@ LANGUAGE sql; } yield `\ - -${opt.postgraphile ? `\ --- seal imported data --- todo: --- > Be careful with public schema.It already has a lot of default privileges that you maybe don't want... See documentation[1]. --- > [1]: postgresql.org/docs/11/ddl-schemas.html#DDL-SCHEMAS-PRIV -DO $$ -BEGIN - -- https://stackoverflow.com/questions/8092086/create-postgresql-role-user-if-it-doesnt-exist#8099557 - IF EXISTS ( - SELECT FROM pg_catalog.pg_roles - WHERE rolname = 'postgraphile' - ) THEN - RAISE NOTICE 'Role "postgraphile" already exists, skipping creation.'; - ELSE - CREATE ROLE postgraphile LOGIN PASSWORD '${opt.postgraphilePassword}'; -- todo: escape properly - END IF; -END -$$; -DO $$ - DECLARE - db TEXT := current_database(); - BEGIN - -- todo: grant just on $opt.schema instead? - EXECUTE format('GRANT ALL PRIVILEGES ON DATABASE %I TO %I', db, 'postgraphile'); - END -$$; -GRANT USAGE ON SCHEMA "${opt.schema}" TO postgraphile; --- https://stackoverflow.com/questions/760210/how-do-you-create-a-read-only-user-in-postgresql#comment50679407_762649 -REVOKE CREATE ON SCHEMA "${opt.schema}" FROM PUBLIC; -GRANT SELECT ON ALL TABLES IN SCHEMA "${opt.schema}" TO postgraphile; --- ALTER DEFAULT PRIVILEGES IN SCHEMA "${opt.schema}" GRANT SELECT ON TABLES TO postgraphile; --- todo: set search_path? https://stackoverflow.com/questions/760210/how-do-you-create-a-read-only-user-in-postgresql#comment33535263_762649 -` : ''} - -${opt.postgrest ? `\ -${opt.schema !== 'public' ? `\ --- pattern from https://stackoverflow.com/a/8099557 -DO -$$ -BEGIN - -- Roles are shared across databases, so we have remove previously configured privileges. - -- This might of course interfere with other programs running on the DBMS! - -- todo: find a cleaner solution - IF EXISTS ( - SELECT FROM pg_catalog.pg_roles - WHERE rolname = 'web_anon' - ) THEN - RAISE WARNING 'Role web_anon already exists. 
Reassigning owned DB objects to current_user().'; - REASSIGN OWNED BY web_anon TO SESSION_USER; - ELSE - BEGIN - CREATE ROLE web_anon NOLOGIN NOINHERIT; - EXCEPTION - WHEN duplicate_object THEN - RAISE NOTICE 'Role web_anon was just created by a concurrent transaction.'; - END; - END IF; - IF EXISTS ( - SELECT FROM pg_catalog.pg_roles - WHERE rolname = 'postgrest' - ) THEN - RAISE WARNING 'Role postgrest already exists. Reassigning owned DB objects to current_user().'; - REASSIGN OWNED BY postgrest TO SESSION_USER; - ELSE - BEGIN - CREATE ROLE postgrest LOGIN NOINHERIT NOCREATEDB NOCREATEROLE NOSUPERUSER PASSWORD '${postgrestPassword}'; - EXCEPTION - WHEN duplicate_object THEN - RAISE NOTICE 'Role postgrest was just created by a concurrent transaction.'; - END; - END IF; -END -$$; - - --- https://postgrest.org/en/stable/tutorials/tut0.html#step-4-create-database-for-api --- https://postgrest.org/en/stable/explanations/db_authz.html --- todo: is this secure? -GRANT USAGE ON SCHEMA "${opt.schema}" TO web_anon; -GRANT SELECT ON ALL TABLES IN SCHEMA "${opt.schema}" TO web_anon; -GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA "${opt.schema}" TO web_anon; -GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA "${opt.schema}" TO web_anon; - -GRANT web_anon TO postgrest; - -${opt.postgrestQueryCostLimit !== null ? ` --- If pg_plan_filter is installed, limit the cost of queries made by PostgREST users. -ALTER USER web_anon SET plan_filter.statement_cost_limit = ${opt.postgrestQueryCostLimit}; -` : ''} - -COMMENT ON SCHEMA "${opt.schema}" IS -$$GTFS REST API -This REST API is created by running [PostgREST](https://postgrest.org/) on top of a [PostgreSQL](https://www.postgresql.org) DB generated using [${pkg.name} v${pkg.version}](${pkg.homepage || pkg.repository}). -$$; -` : ''} -` : ''} - COMMIT;` } diff --git a/lib/calendar_dates.js b/lib/calendar_dates.js index 1002afd..924174f 100644 --- a/lib/calendar_dates.js +++ b/lib/calendar_dates.js @@ -41,10 +41,6 @@ const afterAll = (opt) => `\ CREATE INDEX ON "${opt.schema}".calendar_dates (service_id); CREATE INDEX ON "${opt.schema}".calendar_dates (exception_type); - -${opt.postgraphile ? `\ -COMMENT ON TABLE "${opt.schema}".calendar_dates IS E'@foreignKey (service_id) references calendar|@fieldName calendar'; -` : ''} ` module.exports = { diff --git a/lib/pathways.js b/lib/pathways.js index c69a905..bb00834 100644 --- a/lib/pathways.js +++ b/lib/pathways.js @@ -84,11 +84,6 @@ const formatPathwaysRow = (p) => { const afterAll = (opt) => `\ \\. - -${opt.postgraphile ? `\ -CREATE INDEX ON "${opt.schema}".pathways (from_stop_id); -CREATE INDEX ON "${opt.schema}".pathways (to_stop_id); -` : ''} ` module.exports = { diff --git a/lib/prerequisites.js b/lib/prerequisites.js index d923789..2688e44 100644 --- a/lib/prerequisites.js +++ b/lib/prerequisites.js @@ -22,10 +22,6 @@ AS $$ ); $$ language sql STABLE; -${opt.postgraphile ? `\ -COMMENT ON FUNCTION "${opt.schema}".is_bcp_47_tag IS E'@omit'; -` : ''} - -- todo [breaking]: remove CREATE OR REPLACE FUNCTION "${opt.schema}".is_valid_lang_code( input TEXT @@ -36,9 +32,6 @@ AS $$ SELECT "${opt.schema}".is_bcp_47_tag(input); $$ language sql STABLE; -${opt.postgraphile ? `\ -COMMENT ON FUNCTION "${opt.schema}".is_valid_lang_code IS E'@omit'; -` : ''} `, } const is_timezone = { @@ -59,9 +52,6 @@ AS $$ END; $$ language plpgsql STABLE; -${opt.postgraphile ? 
`\ -COMMENT ON FUNCTION "${opt.schema}".is_timezone IS E'@omit'; -` : ''} `, } const shape_exists = { @@ -79,9 +69,6 @@ AS $$ ); $$ language sql STABLE; -${opt.postgraphile ? `\ -COMMENT ON FUNCTION "${opt.schema}".shape_exists IS E'@omit'; -` : ''} `, } diff --git a/lib/routes.js b/lib/routes.js index 999ca80..b2249ac 100644 --- a/lib/routes.js +++ b/lib/routes.js @@ -257,10 +257,6 @@ CREATE TYPE "${opt.schema}".route_type_val AS ENUM ( ${extRouteTypes.map(([route_type, desc]) => `, '${route_type}' -- ${desc}`).join('\n')} ); CREATE CAST ("${opt.schema}".route_type_val AS text) WITH INOUT AS IMPLICIT; --- todo [breaking]: use small table as enum? https://www.graphile.org/postgraphile/enums/#with-enum-tables -${opt.postgraphile ? `\ -COMMENT ON TYPE "${opt.schema}".route_type_val IS E'@enum\\n@enumName RouteType\\n'; -` : ''} CREATE TABLE "${opt.schema}".routes ( route_id TEXT PRIMARY KEY, @@ -328,9 +324,6 @@ const afterAll = (opt) => `\ \\. CREATE INDEX ON "${opt.schema}".routes (route_short_name); -${opt.postgraphile ? `\ -CREATE INDEX ON "${opt.schema}".routes (agency_id); -` : ''} ` module.exports = { diff --git a/lib/service_days.js b/lib/service_days.js index 5df4b1f..f32bee7 100644 --- a/lib/service_days.js +++ b/lib/service_days.js @@ -68,9 +68,6 @@ CREATE INDEX ON "${opt.schema}".service_days (date); -- apparently the unique index (service_id, date) doesn't speed up queries CREATE INDEX ON "${opt.schema}".service_days (service_id, date); -${opt.postgraphile ? `\ -COMMENT ON MATERIALIZED VIEW "${opt.schema}".service_days IS E'@name serviceDates\\n@primaryKey service_id,date'; -` : ''} ` module.exports = { diff --git a/lib/shapes.js b/lib/shapes.js index 7418dcb..dc4bd99 100644 --- a/lib/shapes.js +++ b/lib/shapes.js @@ -49,11 +49,6 @@ FROM ( ) shapes GROUP BY shape_id; -${opt.postgraphile ? `\ -COMMENT ON TABLE "${opt.schema}".shapes IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".shapes.id IS E'@omit'; -COMMENT ON VIEW "${opt.schema}".shapes_aggregated IS E'@name shapes\\n@primaryKey shape_id'; -` : ''} ` module.exports = { diff --git a/lib/stats_active_trips_by_hour.js b/lib/stats_active_trips_by_hour.js index e369261..17287e0 100644 --- a/lib/stats_active_trips_by_hour.js +++ b/lib/stats_active_trips_by_hour.js @@ -96,10 +96,6 @@ FROM ( ${materialized ? `\ CREATE INDEX ON "${opt.schema}".stats_active_trips_by_hour ("hour"); ` : ''} - -${opt.postgraphile ? `\ -COMMENT ON${materialized ? ' MATERIALIZED' : ''} VIEW "${opt.schema}".stats_active_trips_by_hour IS E'@name hourlyActiveTripsStats\\n@primaryKey hour'; -` : ''} ` } diff --git a/lib/stats_by_agency_route_stop_hour.js b/lib/stats_by_agency_route_stop_hour.js index c753d31..6993c7a 100644 --- a/lib/stats_by_agency_route_stop_hour.js +++ b/lib/stats_by_agency_route_stop_hour.js @@ -27,9 +27,6 @@ CREATE INDEX ON "${opt.schema}".stats_by_agency_route_stop_hour (station_id); CREATE INDEX ON "${opt.schema}".stats_by_agency_route_stop_hour (effective_hour); ` : ''} -${opt.postgraphile ? `\ -COMMENT ON${materialized ? 
' MATERIALIZED' : ''} VIEW "${opt.schema}".stats_by_agency_route_stop_hour IS E'@name hourlyStats\\n@primaryKey route_id,stop_id,effective_hour\\n@foreignKey (route_id) references routes|@fieldName route|@foreignFieldName statsByStopIdAndHour\\n@foreignKey (stop_id) references stops|@fieldName stop|@foreignFieldName statsByRouteIdAndHour'; -` : ''} ` } diff --git a/lib/stats_by_route_date.js b/lib/stats_by_route_date.js index ff816f9..12ca5b0 100644 --- a/lib/stats_by_route_date.js +++ b/lib/stats_by_route_date.js @@ -62,10 +62,6 @@ CREATE INDEX ON "${opt.schema}".stats_by_route_date ("date"); CREATE INDEX ON "${opt.schema}".stats_by_route_date (route_id, "date", is_effective); CREATE INDEX ON "${opt.schema}".stats_by_route_date (route_id, dow, is_effective); ` : ''} - -${opt.postgraphile ? `\ -COMMENT ON${materialized ? ' MATERIALIZED' : ''} VIEW "${opt.schema}".stats_by_route_date IS E'@name routeStats\\n@primaryKey route_id,date,is_effective\\n@foreignKey (route_id) references routes|@fieldName route|@foreignFieldName statsByDate'; -` : ''} ` } diff --git a/lib/stop_times.js b/lib/stop_times.js index c58acab..cf40515 100644 --- a/lib/stop_times.js +++ b/lib/stop_times.js @@ -112,10 +112,6 @@ AND stop_times.stop_sequence = t.stop_sequence; CREATE INDEX ON "${opt.schema}".stop_times (trip_id); CREATE INDEX ON "${opt.schema}".stop_times (stop_id); -${opt.postgraphile ? `\ -COMMENT ON COLUMN "${opt.schema}".stop_times.stop_sequence_consec IS E'@name stopSequenceConsecutive'; -` : ''} - UPDATE "${opt.schema}".stop_times SET stop_sequence_consec = t.seq FROM ( @@ -359,21 +355,6 @@ AS $$ LIMIT 1; $$ LANGUAGE SQL STABLE STRICT; -${opt.postgraphile ? `\ --- todo: currently named arrivalsDeparture, should be arrivalDeparture (but allArrivalsDeparturesList!) -COMMENT ON COLUMN "${opt.schema}".arrivals_departures.route_short_name IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".arrivals_departures.route_long_name IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".arrivals_departures.route_type IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".arrivals_departures.direction_id IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".arrivals_departures.trip_headsign IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".arrivals_departures.stop_name IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".arrivals_departures.station_name IS E'@omit'; --- > If you want to rename just one field or type, your best bet is to use a [@name] smart comment […]. --- > NOTE: this still uses the inflectors, but it pretends that the tables name is different, so the input to the inflectors differs. --- https://www.graphile.org/postgraphile/inflection/#overriding-naming---one-off -COMMENT ON VIEW "${opt.schema}".arrivals_departures IS E'@name arrival_departures\\n@primaryKey trip_id,date,stop_sequence,frequencies_row,frequencies_it\\n@foreignKey (route_id) references routes|@fieldName route\\n@foreignKey (trip_id) references trips|@fieldName trip\\n@foreignKey (stop_id) references stops|@fieldName stop\\n@foreignKey (station_id) references stops|@fieldName station'; -` : ''} - CREATE OR REPLACE VIEW "${opt.schema}".connections AS WITH stop_times_based AS NOT MATERIALIZED ( SELECT @@ -621,22 +602,6 @@ AS $$ -- todo: what if there are >1 rows? LIMIT 1; $$ LANGUAGE SQL STABLE STRICT; - -${opt.postgraphile ? `\ --- todo: currently named arrivalsDeparture, should be arrivalDeparture (but allArrivalsDeparturesList!) 
--- todo: allow filtering based on stop and/or route and/or trip and/or time frame --- https://www.graphile.org/postgraphile/functions/#setof-functions---connections -COMMENT ON COLUMN "${opt.schema}".connections.route_short_name IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".connections.route_long_name IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".connections.route_type IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".connections.direction_id IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".connections.trip_headsign IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".connections.from_stop_name IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".connections.from_station_name IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".connections.to_stop_name IS E'@omit'; -COMMENT ON COLUMN "${opt.schema}".connections.to_station_name IS E'@omit'; -COMMENT ON VIEW "${opt.schema}".connections IS E'@primaryKey trip_id,date,from_stop_sequence,frequencies_row,frequencies_it\\n@foreignKey (route_id) references routes|@fieldName route\\n@foreignKey (trip_id) references trips|@fieldName trip\\n@foreignKey (from_stop_id) references stops|@fieldName fromStop\\n@foreignKey (from_station_id) references stops|@fieldName fromStation\\n@foreignKey (to_stop_id) references stops|@fieldName toStop\\n@foreignKey (to_station_id) references stops|@fieldName toStation'; -` : ''} ` diff --git a/lib/stops.js b/lib/stops.js index 7e8d65f..53549b4 100644 --- a/lib/stops.js +++ b/lib/stops.js @@ -114,9 +114,6 @@ FOREIGN KEY (parent_station) REFERENCES "${opt.schema}".stops; CREATE INDEX ON "${opt.schema}".stops (parent_station); ${opt.stopsLocationIndex ? `CREATE INDEX ON "${opt.schema}".stops (stop_loc);` : ''} -${opt.postgraphile ? `\ -CREATE INDEX ON "${opt.schema}".stops (level_id); -` : ''} ` module.exports = { diff --git a/lib/transfers.js b/lib/transfers.js index 9ac0e2b..c0c5be9 100644 --- a/lib/transfers.js +++ b/lib/transfers.js @@ -74,14 +74,6 @@ const formatTransfersRow = (t) => { const afterAll = (opt) => `\ \\. - -${opt.postgraphile ? `\ -CREATE INDEX ON "${opt.schema}".transfers (from_route_id); -CREATE INDEX ON "${opt.schema}".transfers (from_trip_id); -CREATE INDEX ON "${opt.schema}".transfers (to_stop_id); -CREATE INDEX ON "${opt.schema}".transfers (to_route_id); -CREATE INDEX ON "${opt.schema}".transfers (to_trip_id); -` : ''} ` module.exports = { diff --git a/lib/translations.js b/lib/translations.js index 61baa8e..ca08054 100644 --- a/lib/translations.js +++ b/lib/translations.js @@ -14,9 +14,6 @@ AS $$ LIMIT 1 ); $$ LANGUAGE sql STABLE; -${opt.postgraphile ? `\ -COMMENT ON FUNCTION "${opt.schema}".table_exists IS E'@omit'; -` : ''} CREATE OR REPLACE FUNCTION "${opt.schema}".column_exists( t_name TEXT, @@ -32,9 +29,6 @@ AS $$ LIMIT 1 ); $$ LANGUAGE sql STABLE; -${opt.postgraphile ? `\ -COMMENT ON FUNCTION "${opt.schema}".column_exists IS E'@omit'; -` : ''} CREATE TABLE "${opt.schema}"._translations_ref_cols ( table_name TEXT PRIMARY KEY, @@ -54,9 +48,6 @@ CREATE TABLE "${opt.schema}"._translations_ref_cols ( "${opt.schema}".column_exists(table_name, record_sub_id_col) ) ); -${opt.postgraphile ? `\ -COMMENT ON TABLE "${opt.schema}"._translations_ref_cols IS E'@omit'; -` : ''} -- > ## record_id -- > Defines the record that corresponds to the field to be translated. The value in record_id must be the first or only field of a table's primary key, as defined in the primary key attribute for each table and below: @@ -156,9 +147,6 @@ AS $$ END IF; END; $$ LANGUAGE plpgsql STABLE; -${opt.postgraphile ? 
`\ -COMMENT ON FUNCTION "${opt.schema}".row_exists IS E'@omit'; -` : ''} -- todo: assert that row_exists works as intended -- SELECT row_exists('stops', 'stop_id', 'de:11000:900120017::2', NULL, NULL); -- Virchowstr. (Berlin) @@ -229,9 +217,6 @@ AS $$ END IF; END; $$ LANGUAGE plpgsql STABLE; -${opt.postgraphile ? `\ -COMMENT ON FUNCTION "${opt.schema}".is_valid_translation_ref IS E'@omit'; -` : ''} -- The MobilityData GTFS Validator just uses Java's Locale#toLanguageTag() to validate "language". -- https://github.com/MobilityData/gtfs-validator/blob/a11b7489902dd54dc194af1f1515583406ba3716/main/src/main/java/org/mobilitydata/gtfsvalidator/table/GtfsTranslationSchema.java#L36 @@ -406,30 +391,6 @@ LEFT JOIN "${opt.schema}".translations stop_u_t ON ( stop_u_t.table_name = 'stops' AND stop_u_t.field_name = 'stop_url' AND (s.stop_id = stop_u_t.record_id OR s.stop_name = stop_u_t.field_value) ); -${opt.postgraphile ? `\ -COMMENT ON VIEW "${opt.schema}".stops_translated IS E'@omit'; - -CREATE OR REPLACE FUNCTION "${opt.schema}".stops_translated_stop_name ( - stop stops, - language TEXT -) RETURNS TEXT AS $$ - SELECT coalesce(t.translation, stops.stop_name) - FROM "${opt.schema}".stops - JOIN ( - SELECT - table_name, - field_name, record_id, field_value, - language as lang, translation - FROM "${opt.schema}".translations - ) t ON ( - t.table_name = 'stops' AND t.field_name = 'stop_name' - AND (stops.stop_id = t.record_id OR stops.stop_name = t.field_value) - ) - WHERE stops.stop_id = stop.stop_id - AND t.lang = language - LIMIT 1; -$$ LANGUAGE sql STABLE STRICT; -` : ''} CREATE OR REPLACE VIEW "${opt.schema}".routes_translated AS SELECT @@ -466,90 +427,6 @@ LEFT JOIN "${opt.schema}".translations route_u_t ON ( route_u_t.table_name = 'routes' AND route_u_t.field_name = 'route_url' AND (r.route_id = route_u_t.record_id OR r.route_long_name = route_u_t.field_value) ); -${opt.postgraphile ? 
`\ -COMMENT ON VIEW "${opt.schema}".routes_translated IS E'@omit'; - -CREATE OR REPLACE FUNCTION "${opt.schema}".routes_translated_route_short_name ( - route routes, - language TEXT -) RETURNS TEXT AS $$ - SELECT coalesce(t.translation, routes.route_short_name) - FROM "${opt.schema}".routes - JOIN ( - SELECT - table_name, - field_name, record_id, field_value, - language as lang, translation - FROM "${opt.schema}".translations - ) t ON ( - t.table_name = 'routes' AND t.field_name = 'route_short_name' - AND (routes.route_id = t.record_id OR routes.route_short_name = t.field_value) - ) - WHERE routes.route_id = route.route_id - AND t.lang = language - LIMIT 1; -$$ LANGUAGE sql STABLE STRICT; -CREATE OR REPLACE FUNCTION "${opt.schema}".routes_translated_route_long_name ( - route routes, - language TEXT -) RETURNS TEXT AS $$ - SELECT coalesce(t.translation, routes.route_long_name) - FROM "${opt.schema}".routes - JOIN ( - SELECT - table_name, - field_name, record_id, field_value, - language as lang, translation - FROM "${opt.schema}".translations - ) t ON ( - t.table_name = 'routes' AND t.field_name = 'route_long_name' - AND (routes.route_id = t.record_id OR routes.route_long_name = t.field_value) - ) - WHERE routes.route_id = route.route_id - AND t.lang = language - LIMIT 1; -$$ LANGUAGE sql STABLE STRICT; -CREATE OR REPLACE FUNCTION "${opt.schema}".routes_translated_route_desc ( - route routes, - language TEXT -) RETURNS TEXT AS $$ - SELECT coalesce(t.translation, routes.route_desc) - FROM "${opt.schema}".routes - JOIN ( - SELECT - table_name, - field_name, record_id, field_value, - language as lang, translation - FROM "${opt.schema}".translations - ) t ON ( - t.table_name = 'routes' AND t.field_name = 'route_desc' - AND (routes.route_id = t.record_id OR routes.route_desc = t.field_value) - ) - WHERE routes.route_id = route.route_id - AND t.lang = language - LIMIT 1; -$$ LANGUAGE sql STABLE STRICT; -CREATE OR REPLACE FUNCTION "${opt.schema}".routes_translated_route_url ( - route routes, - language TEXT -) RETURNS TEXT AS $$ - SELECT coalesce(t.translation, routes.route_url) - FROM "${opt.schema}".routes - JOIN ( - SELECT - table_name, - field_name, record_id, field_value, - language as lang, translation - FROM "${opt.schema}".translations - ) t ON ( - t.table_name = 'routes' AND t.field_name = 'route_url' - AND (routes.route_id = t.record_id OR routes.route_url = t.field_value) - ) - WHERE routes.route_id = route.route_id - AND t.lang = language - LIMIT 1; -$$ LANGUAGE sql STABLE STRICT; -` : ''} -- todo [breaking]: remove in favor of trip_headsign_translations & trip_short_name_translations CREATE OR REPLACE VIEW "${opt.schema}".trips_translated AS @@ -577,50 +454,6 @@ LEFT JOIN "${opt.schema}".translations trip_h_t ON ( trip_h_t.table_name = 'trips' AND trip_h_t.field_name = 'trip_headsign' AND (t.trip_id = trip_h_t.record_id OR t.trip_headsign = trip_h_t.field_value) ); -${opt.postgraphile ? 
`\ -COMMENT ON VIEW "${opt.schema}".trips_translated IS E'@omit'; - -CREATE OR REPLACE FUNCTION "${opt.schema}".trips_translated_trip_short_name ( - trip trips, - language TEXT -) RETURNS TEXT AS $$ - SELECT coalesce(t.translation, trips.trip_short_name) - FROM "${opt.schema}".trips - JOIN ( - SELECT - table_name, - field_name, record_id, field_value, - language as lang, translation - FROM "${opt.schema}".translations - ) t ON ( - t.table_name = 'trips' AND t.field_name = 'trip_short_name' - AND (trips.trip_id = t.record_id OR trips.trip_short_name = t.field_value) - ) - WHERE trips.trip_id = trip.trip_id - AND t.lang = language - LIMIT 1; -$$ LANGUAGE sql STABLE STRICT; -CREATE OR REPLACE FUNCTION "${opt.schema}".trips_translated_trip_headsign ( - trip trips, - language TEXT -) RETURNS TEXT AS $$ - SELECT coalesce(t.translation, trips.trip_headsign) - FROM "${opt.schema}".trips - JOIN ( - SELECT - table_name, - field_name, record_id, field_value, - language as lang, translation - FROM "${opt.schema}".translations - ) t ON ( - t.table_name = 'trips' AND t.field_name = 'trip_headsign' - AND (trips.trip_id = t.record_id OR trips.trip_headsign = t.field_value) - ) - WHERE trips.trip_id = trip.trip_id - AND t.lang = language - LIMIT 1; -$$ LANGUAGE sql STABLE STRICT; -` : ''} CREATE OR REPLACE VIEW "${opt.schema}".arrivals_departures_translated AS SELECT @@ -680,9 +513,6 @@ LEFT JOIN "${opt.schema}".translations stop_times_t ON ( OR ad.stop_headsign = stop_times_t.field_value ) ); -${opt.postgraphile ? `\ -COMMENT ON VIEW "${opt.schema}".arrivals_departures_translated IS E'@omit'; -` : ''} CREATE OR REPLACE VIEW "${opt.schema}".connections_translated AS SELECT @@ -777,9 +607,6 @@ LEFT JOIN "${opt.schema}".translations to_stop_times_t ON ( OR c.to_stop_headsign = to_stop_times_t.field_value ) ); -${opt.postgraphile ? `\ -COMMENT ON VIEW "${opt.schema}".connections_translated IS E'@omit'; -` : ''} ` module.exports = { diff --git a/lib/trips.js b/lib/trips.js index d6ec684..599ad9e 100644 --- a/lib/trips.js +++ b/lib/trips.js @@ -82,10 +82,6 @@ const afterAll = (opt) => `\ \\. CREATE INDEX ON "${opt.schema}".trips (route_id); - -${opt.postgraphile ? 
`\ -COMMENT ON TABLE "${opt.schema}".trips IS E'@foreignKey (shape_id) references shapes_aggregated|@fieldName shape'; -` : ''} ` module.exports = { diff --git a/package.json b/package.json index 103e824..e4e818f 100644 --- a/package.json +++ b/package.json @@ -4,8 +4,7 @@ "version": "4.10.4", "main": "lib/index.js", "bin": { - "gtfs-to-sql": "cli.js", - "serve-gtfs-via-graphql": "scripts/run-postgraphile.js" + "gtfs-to-sql": "cli.js" }, "files": [ "cli.js", @@ -25,8 +24,7 @@ "convert", "postgres", "postgresql", - "sql", - "graphql" + "sql" ], "author": "Jannis R ", "contributors": [ @@ -65,19 +63,11 @@ "sequencify": "0.0.7" }, "devDependencies": { - "@graphile-contrib/pg-simplify-inflector": "^6.1.0", - "@graphile/postgis": "^0.2.0-0", "@yao-pkg/pkg": "^6.6.0", "csv-parser": "^3.0.0", "eslint": "^8.33.0", - "postgraphile": "^4.12.11", "sample-gtfs-feed": "^0.13.0" }, - "peerDependencies": { - "@graphile-contrib/pg-simplify-inflector": "^6.1.0", - "@graphile/postgis": "^0.2.0-0", - "postgraphile": "^4.12.11" - }, "scripts": { "test": "./test/index.sh", "lint": "eslint .", diff --git a/readme.md b/readme.md index 463dbe4..aab4c31 100644 --- a/readme.md +++ b/readme.md @@ -14,7 +14,6 @@ - ✨ joins `stop_times.txt`/`frequencies.txt`, `calendar.txt`/`calendar_dates.txt`, `trips.txt`, `route.txt` & `stops.txt` into [views](https://www.postgresql.org/docs/14/sql-createview.html) for straightforward data analysis (see below) - 🚀 is carefully optimised to let PostgreSQL's query planner do its magic, yielding quick lookups even with large datasets (see [performance section](#performance)) - ✅ validates and imports `translations.txt` -- ✨ exposes (almost) all data via GraphQL using [PostGraphile](https://www.graphile.org/postgraphile/introduction/), and as a RESTful API using [PostgREST](https://postgrest.org/) To work with the time-related data (`stop_times` etc.), `gtfs-via-postgres` supports two "mental models": @@ -193,21 +192,6 @@ Options: gets created, to ensure that multiple imports into the same database are all made using the same version. See also multiple-datasets.md in the docs. - --postgraphile Tweak generated SQL for PostGraphile usage. - https://www.graphile.org/postgraphile/ - --postgraphile-password Password for the PostGraphile PostgreSQL user. - Default: $POSTGRAPHILE_PGPASSWORD, fallback random. - --postgrest Tweak generated SQL for PostgREST usage. - Please combine it with --schema. - https://postgrest.org/ - --postgrest-password Password for the PostgREST PostgreSQL user `web_anon`. - Default: $POSTGREST_PGPASSWORD, fallback random. - --postgrest-query-cost-limit Define a cost limit [1] for queries executed by PostgREST - on behalf of a user. It is only enforced if - pg_plan_filter [2] is installed in the database! - Must be a positive float. Default: none - [1] https://www.postgresql.org/docs/14/using-explain.html - [2] https://github.com/pgexperts/pg_plan_filter --import-metadata Create functions returning import metadata: - gtfs_data_imported_at (timestamp with time zone) - gtfs_via_postgres_version (text) @@ -266,12 +250,6 @@ docker run --rm --volume /path/to/gtfs:/gtfs \ import-gtfs --require-dependencies -- '/gtfs/*.csv' ``` -### Importing a GTFS Schedule feed continuously - -[postgis-gtfs-importer](https://github.com/mobidata-bw/postgis-gtfs-importer) imports [GTFS Schedule](https://gtfs.org/schedule/) data into a [PostGIS](https://postgis.net) database using `gtfs-via-postgres`. It allows running a production service (e.g. 
an API) on top of programmatically re-imported data from a periodically changing GTFS feed without downtime. - -Because it works as [atomically](https://en.wikipedia.org/wiki/Atomicity_(database_systems)) as possible with PostgreSQL, it makes the import pipeline *robust*, even if an import fails or if simultaneous imports get started. - ### Exporting data efficiently If you want to export data from the database, use the [`COPY` command](https://www.postgresql.org/docs/14/sql-copy.html); On an [M1 MacBook Air](https://en.wikipedia.org/wiki/MacBook_Air_(Apple_silicon)#Third_generation_(Retina_with_Apple_silicon)), PostgreSQL 14 can export about 500k `connections` rows per second. @@ -286,45 +264,6 @@ In the nested `SELECT` query, you can use features like `WHERE`, `ORDER BY` and If you want to find stops by (geo)location, run `gtfs-via-postgres` with `--stops-location-index`. This will create a [spatial index](https://postgis.net/workshops/postgis-intro/indexing.html) on `stops.stop_loc`, so that most [PostGIS functions & operators](https://postgis.net/docs/manual-3.2/reference.html#Measurement_Functions) make use of it. -### GraphQL support - -The `--postgraphile` flag changes the SQL generated by `gtfs-via-postgres` slightly, so that you get a reasonably idiomatic GraphQL API out-of-the-box when running [PostGraphile](https://www.graphile.org/postgraphile/) v4 on it: - -```shell -# import data into PostgreSQL with PostGraphile tweaks -npm exec -- gtfs-to-sql -d --postgraphile -- gtfs/*.csv | sponge | psql -b -``` - -In line with the intended PostGraphile usage, `gtfs-via-postgres` will create a PostgreSQL role/user `postgraphile` with read-only access to the DB. You can set the `postgraphile`'s password with the `--postgraphile-password` option, or using the `$POSTGRAPHILE_PGPASSWORD` environment variable; By default, it will use (and log) a random password. - -`gtfs-via-postgres` *doesn't* specify PostGraphile as a regular dependency, but as `peerDependencies`, in order to stay lightweight for users who don't need the GraphQL interface. Some versions of some package managers install unmet peer dependencies, some don't. Let's make sure that PostGraphile (and its plugins) are installed: - -```shell -npm install \ - postgraphile@^4.12 \ - @graphile-contrib/pg-simplify-inflector@^6.1 \ - @graphile/postgis@^0.2.0-0 -``` - -The `serve-gtfs-via-graphql` helper script configures and runs PostGraphile. With `NODE_ENV=development`, it will - -- serve a fully configured [GraphiQL UI](https://graphql-dotnet.github.io/docs/getting-started/graphiql/) at `/graphiql` -- provide more errors on database & query errors -- allow [using PostgreSQL's `EXPLAIN` via GraphQL](https://www.graphile.org/postgraphile/debugging/#via-postgraphiql-explain) - -``` -# listens on port 3000, this can be changed using $PORT -env NODE_ENV=development npm exec -- serve-gtfs-via-graphql -``` - -**As an example for the GraphQL API, check out the [test query](test/sample-gtfs-feed-postgraphile-test.graphql)** or open the [GraphiQL UI](https://github.com/graphql/graphiql) served at [`localhost:3000/graphiql`](http://localhost:3000/graphiql). - -### REST API support - -With the `--postgrest` flag, `gtfs-via-postgres` will augment the schema with a `web_anon` role and some comments, so that when running [PostgREST](https://postgrest.org/) on the database, you will get a powerful REST API. - -[read more](docs/postgrest.md) - ### more guides The [`docs` directory](docs) contains more instructions on how to use `gtfs-via-postgres`. 
diff --git a/scripts/run-postgraphile.js b/scripts/run-postgraphile.js deleted file mode 100755 index bf74ee8..0000000 --- a/scripts/run-postgraphile.js +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env node - -const {createServer} = require('http') -const {postgraphile} = require('postgraphile') -const postgisPlugin = require('@graphile/postgis').default -const simplifyInflectorPlugin = require('@graphile-contrib/pg-simplify-inflector') - -const DEV = process.env.NODE_ENV === 'development' -const PROD = !DEV -const PORT = process.env.PORT ? parseInt(process.env.PORT) : 3000 -const SCHEMA = process.env.PGSCHEMA || 'public' - -const pg = postgraphile({}, SCHEMA, { - appendPlugins: [ - // PostGIS support for PostGraphile - postgisPlugin, - - // Simplifies the graphile-build-pg inflector to trim the `ByFooIdAndBarId` from relations - simplifyInflectorPlugin, - ], - graphileBuildOptions: { - pgSimplifyAllRows: false, - pgShortPk: false, - }, - - pgSettings: async () => ({ - // With `timestamptz` (a.k.a. `timestamp with time zone`), PostgreSQL *doesn't* store the timezone (offset) specified on input; Instead, it always converts to UTC. - // When querying a `timestamptz` value, it converts to the local timezone (offset) of the client's session or database server. - // Because we loose the timezone offset information *anyways*, we configure PostGraphile to give predictable results by letting PostgreSQL always convert to UTC. - timezone: 'UTC', - }), - - // [Experimental] Determines if the 'Explain' feature in GraphiQL can be used to show the user the SQL statements that were executed. Set to a boolean to enable all users to use this, or to a function that filters each request to determine if the request may use Explain. DO NOT USE IN PRODUCTION unless you're comfortable with the security repurcussions of doing so. - allowExplain: DEV, - - // Enables classic ids for Relay support. Instead of using the field name nodeId for globally unique ids, PostGraphile will instead use the field name id for its globally unique ids. This means that table id columns will also get renamed to rowId. - classicIds: true, - - // Turns off GraphQL query logging. By default PostGraphile will log every GraphQL query it processes along with some other information. Set this to true (recommended in production) to disable that feature. - disableQueryLog: PROD, - - // By default, JSON and JSONB fields are presented as strings (JSON encoded) from the GraphQL schema. Setting this to true (recommended) enables raw JSON input and output, saving the need to parse / stringify JSON manually. - dynamicJson: true, - - // Set this to true to add some enhancements to GraphiQL; intended for development usage only (automatically enables with subscriptions and live). - enhanceGraphiql: DEV, - - // Set this to true to enable the GraphiQL interface. - graphiql: true, - - // Extends the error response with additional details from the Postgres error. Can be any combination of ['hint', 'detail', 'errcode']. Default is []. - extendedErrors: DEV ? ['hint', 'detail', 'errcode'] : [], - - // Set false to exclude filters, orderBy, and relations that would be expensive to access due to missing indexes. Changing this from true to false is a breaking change, but false to true is not. The default is true. 
- ignoreIndexes: false, - - // Set false (recommended) to exclude fields, queries and mutations that are not available to any possible user (determined from the user in connection string and any role they can become); set this option true to skip these checks and create GraphQL fields and types for everything. The default is true, in v5 the default will change to false. - ignoreRBAC: false, - - // Some one-to-one relations were previously detected as one-to-many - should we export 'only' the old relation shapes, both new and old but mark the old ones as 'deprecated' (default), or 'omit' (recommended) the old relation shapes entirely. - legacyRelations: 'omit', - - // If none of your RETURNS SETOF compound_type functions mix NULLs with the results then you may set this false to reduce the nullables in the GraphQL schema. - setofFunctionsContainNulls: false, - - // Enables adding a stack field to the error response. Can be either the boolean true (which results in a single stack string) or the string json (which causes the stack to become an array with elements for each line of the stack). Recommended in development, not recommended in production. - showErrorStack: DEV, - - // Should we use relay pagination, or simple collections? - // "omit" (default) - // relay connections only, "only" (not recommended) - // simple collections only (no Relay connections), "both" - both. - simpleCollections: 'omit', -}) - -const server = createServer(pg) -server.listen(PORT, (err) => { - if (err) { - console.error(err) - process.exit(1) - } - const {port} = server.address() - console.info(`PostGraphile listening on port ${port}`) -}) diff --git a/test/amtrak-gtfs-2021-10-06.sh b/test/amtrak-gtfs-2021-10-06.sh index 0adcfc7..67df57d 100755 --- a/test/amtrak-gtfs-2021-10-06.sh +++ b/test/amtrak-gtfs-2021-10-06.sh @@ -19,7 +19,6 @@ export PGDATABASE='amtrak_2021_10_06' --stats-by-route-date=view \ --stats-by-agency-route-stop-hour=view \ --stats-active-trips-by-hour=view \ - --postgrest \ -- amtrak-gtfs-2021-10-06/*.txt \ | sponge | psql -b @@ -104,38 +103,4 @@ if [[ "$nrOfActiveTrips" != "127" ]]; then exit 1 fi -# kill child processes on exit -# https://stackoverflow.com/questions/360201/how-do-i-kill-background-processes-jobs-when-my-shell-script-exits/2173421#2173421 -trap 'exit_code=$?; kill -- $(jobs -p); exit $exit_code' SIGINT SIGTERM EXIT - -env \ - PGRST_DB_SCHEMAS=amtrak \ - PGRST_DB_ANON_ROLE=web_anon \ - PGRST_ADMIN_SERVER_PORT=3001 \ - PGRST_LOG_LEVEL=info \ - postgrest & - # docker run --rm -i \ - # -p 3000:3000 -p 3001:3001 \ - # -e PGHOST=host.docker.internal -e PGUSER -e PGPASSWORD -e PGDATABASE \ - # postgrest/postgrest & -sleep 3 - -health_status="$(curl 'http://localhost:3001/live' -I -fsS | grep -o -m1 -E '[0-9]{3}')" -if [ "$health_status" != '200' ]; then - 1>&2 echo "/live: expected 200, got $health_status" - exit 1 -fi - -stops_url='http://localhost:3000/stops?stop_name=ilike.%25palm%25&limit=1&order=stop_id.asc' -stops_status="$(curl "$stops_url" -H 'Accept: application/json' -I -fsS | grep -o -m1 -E '[0-9]{3}')" -if [ "$stops_status" != '200' ]; then - 1>&2 echo "$stops_url: expected 200, got $stops_status" - exit 1 -fi -stop_id="$(curl "$stops_url" -H 'Accept: application/json' -fsS | jq -rc '.[0].stop_id')" -if [ "$stop_id" != 'PDC' ]; then - 1>&2 echo "$stops_url: expected PDC, got $stop_id" - exit 1 -fi - echo 'works ✔' diff --git a/test/index.sh b/test/index.sh index a40fa08..eccf7f6 100755 --- a/test/index.sh +++ b/test/index.sh @@ -11,7 +11,6 @@ psql -t -c 'SELECT version()' 
./calendar-dates-only.sh ./sample-gtfs-feed.sh ./amtrak-gtfs-2021-10-06.sh -./postgraphile.sh ./routes-without-agency-id.sh ./stops-without-level-id.sh ./invalid-empty-agency-id.sh diff --git a/test/postgraphile.sh b/test/postgraphile.sh deleted file mode 100755 index f06bce1..0000000 --- a/test/postgraphile.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -set -e -set -u -set -o pipefail -cd "$(dirname $0)" -set -x - -env | grep '^PG' || true - -psql -c 'create database postgraphile' -export PGDATABASE='postgraphile' - -../cli.js -d --trips-without-shape-id --postgraphile -- \ - ../node_modules/sample-gtfs-feed/gtfs/*.txt \ - | sponge | psql -b - -# kill child processes on exit -# https://stackoverflow.com/questions/360201/how-do-i-kill-background-processes-jobs-when-my-shell-script-exits/2173421#2173421 -trap 'exit_code=$?; kill -- $(jobs -p); exit $exit_code' SIGINT SIGTERM EXIT - -../scripts/run-postgraphile.js & -sleep 2 - -body=$(node -e 'process.stdout.write(JSON.stringify({query: fs.readFileSync("sample-gtfs-feed-postgraphile-test.graphql", {encoding: "utf8"})}))') -actual_path="$(mktemp -t sample-gtfs-feed-postgraphile-test-XXX)" -curl -X POST 'http://localhost:3000/graphql' -H 'Content-Type: application/json' -H 'Accept: application/json' --data "$body" -fsS | jq -r --tab . >"$actual_path" - -git diff --exit-code sample-gtfs-feed-postgraphile-test.res.json "$actual_path" - -echo 'works ✔' diff --git a/test/sample-gtfs-feed-postgraphile-test.graphql b/test/sample-gtfs-feed-postgraphile-test.graphql deleted file mode 100644 index bccbf0d..0000000 --- a/test/sample-gtfs-feed-postgraphile-test.graphql +++ /dev/null @@ -1,115 +0,0 @@ -query SampleGtfsFeedPostgraphileTest { - stopByStopId(stopId: "airport-1") { - translatedStopName(language: "de-DE") - stopId - stopLoc { - geojson - latitude - longitude - } - } - - routeByRouteId(routeId: "B") { - agency { - agencyName - agencyEmail - } - routeShortName - routeLongName - at: translatedRouteLongName(language: "de-AT") - de: translatedRouteLongName(language: "de-DE") - trips(first: 3, orderBy: TRIP_ID_ASC) { - nodes { - tripId - } - } - connections(orderBy: PRIMARY_KEY_ASC, offset: 3, first: 3) { - nodes { - tripId - fromStopSequence - fromStopId - tDeparture - tArrival - toStopId - toStopSequence - } - } - } - bOutboundOnWeekends: tripByTripId(tripId: "b-outbound-on-weekends") { - translatedTripHeadsign(language: "de-DE") - } - aDowntownAllDay: tripByTripId(tripId: "a-downtown-all-day") { - shape { - shape { - __typename - geojson - } - } - } - - # stop_times-based -> no frequencies_{row,it} - aOutboundAllDay20190301ArrDep: arrivalDepartureByArrivalDepartureId( - id: "YS1vdXRib3VuZC1hbGwtZGF5:MjAxOS0wMy0wMQ==:Mw==:LTE=:LTE=" - ) { - # tripId: "a-outbound-all-day" - # date: "2019-03-01" - # stopSequence: 3 - # frequenciesRow: -1 - # frequenciesIt: -1 - arrivalDepartureId - tripId - date - stopSequence - frequenciesRow - frequenciesIt - } - # frequencies-based -> has frequencies_{row,it} - bDowntownOnWorkingDays20190608ArrDep: arrivalDepartureByArrivalDepartureId( - id: "Yi1kb3dudG93bi1vbi13b3JraW5nLWRheXM=:MjAxOS0wMy0wOA==:Mw==:MQ==:Mg==" - ) { - # tripId: "b-downtown-on-working-days" - # date: "2019-06-08" - # stopSequence: 3 - # frequenciesRow: 1 - # frequenciesIt: 2 - arrivalDepartureId - tripId - date - stopSequence - frequenciesRow - frequenciesIt - } - - # stop_times-based -> no frequencies_{row,it} - aOutboundAllDay20190301Con: connectionByConnectionId( - id: "YS1vdXRib3VuZC1hbGwtZGF5:MjAxOS0wMy0wMQ==:Mw==:LTE=:LTE=" - ) { - # 
tripId: "a-outbound-all-day" - # date: "2019-03-01" - # fromStopSequence: 3 - # frequenciesRow: -1 - # frequenciesIt: -1 - connectionId - tripId - date - fromStopSequence - frequenciesRow - frequenciesIt - } - # frequencies-based -> has frequencies_{row,it} - bDowntownOnWorkingDays20190608Con: connectionByConnectionId( - id: "Yi1kb3dudG93bi1vbi13b3JraW5nLWRheXM=:MjAxOS0wMy0wOA==:Mw==:MQ==:Mg==" - ) { - # tripId: "b-downtown-on-working-days" - # date: "2019-06-08" - # fromStopSequence: 3 - # frequenciesRow: 1 - # frequenciesIt: 2 - connectionId - tripId - date - fromStopSequence - frequenciesRow - frequenciesIt - } -} diff --git a/test/sample-gtfs-feed-postgraphile-test.res.json b/test/sample-gtfs-feed-postgraphile-test.res.json deleted file mode 100644 index 76359a1..0000000 --- a/test/sample-gtfs-feed-postgraphile-test.res.json +++ /dev/null @@ -1,2316 +0,0 @@ -{ - "data": { - "stopByStopId": { - "translatedStopName": "Gleis 1", - "stopId": "airport-1", - "stopLoc": { - "geojson": { - "type": "Point", - "coordinates": [ - 13.5087, - 52.36396 - ] - }, - "latitude": 52.36396, - "longitude": 13.5087 - } - }, - "routeByRouteId": { - "agency": { - "agencyName": "Full Transit Agency", - "agencyEmail": "contact@fta.example.org" - }, - "routeShortName": "Babbage", - "routeLongName": "Charles Babbage Tram Line", - "at": "Tram-Linie Charles Babbage", - "de": null, - "trips": { - "nodes": [ - { - "tripId": "b-downtown-on-weekends" - }, - { - "tripId": "b-downtown-on-working-days" - }, - { - "tripId": "b-outbound-on-weekends" - } - ] - }, - "connections": { - "nodes": [ - { - "tripId": "b-downtown-on-weekends", - "fromStopSequence": 3, - "fromStopId": "lake", - "tDeparture": "2019-03-09T12:24:00+00:00", - "tArrival": "2019-03-09T12:30:00+00:00", - "toStopId": "center", - "toStopSequence": 5 - }, - { - "tripId": "b-downtown-on-weekends", - "fromStopSequence": 1, - "fromStopId": "airport", - "tDeparture": "2019-03-10T12:14:00+00:00", - "tArrival": "2019-03-10T12:22:00+00:00", - "toStopId": "lake", - "toStopSequence": 3 - }, - { - "tripId": "b-downtown-on-weekends", - "fromStopSequence": 3, - "fromStopId": "lake", - "tDeparture": "2019-03-10T12:24:00+00:00", - "tArrival": "2019-03-10T12:30:00+00:00", - "toStopId": "center", - "toStopSequence": 5 - } - ] - } - }, - "bOutboundOnWeekends": { - "translatedTripHeadsign": "Babbage (auswärts)" - }, - "aDowntownAllDay": { - "shape": { - "shape": { - "__typename": "GeometryLineString", - "geojson": { - "type": "LineString", - "coordinates": [ - [ - 13.510294914, - 52.364833832 - ], - [ - 13.510567665, - 52.364398956 - ], - [ - 13.510860443, - 52.363952637 - ], - [ - 13.511548042, - 52.362854004 - ], - [ - 13.511612892, - 52.362743378 - ], - [ - 13.511850357, - 52.362812042 - ], - [ - 13.513009071, - 52.363082886 - ], - [ - 13.513717651, - 52.363246918 - ], - [ - 13.514398575, - 52.363361359 - ], - [ - 13.516216278, - 52.363788605 - ], - [ - 13.516494751, - 52.363868713 - ], - [ - 13.516823769, - 52.364009857 - ], - [ - 13.516993523, - 52.364112854 - ], - [ - 13.517116547, - 52.364208221 - ], - [ - 13.517197609, - 52.364322662 - ], - [ - 13.517261505, - 52.364448547 - ], - [ - 13.517277718, - 52.364532471 - ], - [ - 13.517285347, - 52.364704132 - ], - [ - 13.517237663, - 52.365009308 - ], - [ - 13.517251968, - 52.365158081 - ], - [ - 13.517328262, - 52.365364075 - ], - [ - 13.517384529, - 52.365451813 - ], - [ - 13.517477036, - 52.365539551 - ], - [ - 13.517616272, - 52.365650177 - ], - [ - 13.517773628, - 52.365726471 - ], - [ - 13.518079758, - 52.365856171 - ], 
- [ - 13.518387794, - 52.365940094 - ], - [ - 13.528774261, - 52.368408203 - ], - [ - 13.529670715, - 52.368545532 - ], - [ - 13.530094147, - 52.368579865 - ], - [ - 13.5308218, - 52.368587494 - ], - [ - 13.531106949, - 52.368598938 - ], - [ - 13.531417847, - 52.368621826 - ], - [ - 13.531955719, - 52.36869812 - ], - [ - 13.532168388, - 52.368759155 - ], - [ - 13.532450676, - 52.368862152 - ], - [ - 13.53266716, - 52.368961334 - ], - [ - 13.532931328, - 52.369121552 - ], - [ - 13.533116341, - 52.369255066 - ], - [ - 13.533249855, - 52.36938858 - ], - [ - 13.533371925, - 52.369533539 - ], - [ - 13.533464432, - 52.369682312 - ], - [ - 13.533542633, - 52.369838715 - ], - [ - 13.533593178, - 52.370014191 - ], - [ - 13.533617973, - 52.370185852 - ], - [ - 13.533589363, - 52.370334625 - ], - [ - 13.533475876, - 52.370624542 - ], - [ - 13.533353806, - 52.370826721 - ], - [ - 13.533203125, - 52.371002197 - ], - [ - 13.532802582, - 52.371387482 - ], - [ - 13.532670021, - 52.37153244 - ], - [ - 13.532507896, - 52.371768951 - ], - [ - 13.532444, - 52.371963501 - ], - [ - 13.5324049, - 52.372131348 - ], - [ - 13.53239727, - 52.37229538 - ], - [ - 13.532422066, - 52.372528076 - ], - [ - 13.532460213, - 52.372646332 - ], - [ - 13.532538414, - 52.372817993 - ], - [ - 13.532709122, - 52.373149872 - ], - [ - 13.534140587, - 52.375667572 - ], - [ - 13.534313202, - 52.375961304 - ], - [ - 13.534439087, - 52.376140594 - ], - [ - 13.534526825, - 52.376251221 - ], - [ - 13.534785271, - 52.376514435 - ], - [ - 13.535042763, - 52.376712799 - ], - [ - 13.535244942, - 52.376853943 - ], - [ - 13.535474777, - 52.376983643 - ], - [ - 13.535713196, - 52.377109528 - ], - [ - 13.536309242, - 52.377346039 - ], - [ - 13.53663826, - 52.377441406 - ], - [ - 13.537053108, - 52.377536774 - ], - [ - 13.537810326, - 52.377681732 - ], - [ - 13.53807354, - 52.377750397 - ], - [ - 13.538312912, - 52.377830505 - ], - [ - 13.538555145, - 52.377925873 - ], - [ - 13.538812637, - 52.378055573 - ], - [ - 13.538974762, - 52.37815094 - ], - [ - 13.5391922, - 52.378314972 - ], - [ - 13.539357185, - 52.378479004 - ], - [ - 13.539421082, - 52.378543854 - ], - [ - 13.539493561, - 52.378623962 - ], - [ - 13.539569855, - 52.378723145 - ], - [ - 13.539703369, - 52.379005432 - ], - [ - 13.539748192, - 52.379161835 - ], - [ - 13.539772034, - 52.37940979 - ], - [ - 13.539751053, - 52.379619598 - ], - [ - 13.539697647, - 52.379798889 - ], - [ - 13.539621353, - 52.379974365 - ], - [ - 13.539505959, - 52.380153656 - ], - [ - 13.539352417, - 52.380329132 - ], - [ - 13.539167404, - 52.380493164 - ], - [ - 13.538882256, - 52.380710602 - ], - [ - 13.536517143, - 52.382324219 - ], - [ - 13.536241531, - 52.382499695 - ], - [ - 13.535950661, - 52.382644653 - ], - [ - 13.535591125, - 52.3828125 - ], - [ - 13.535319328, - 52.382923126 - ], - [ - 13.535028458, - 52.383018494 - ], - [ - 13.534606934, - 52.383136749 - ], - [ - 13.53421402, - 52.383220673 - ], - [ - 13.533993721, - 52.38325882 - ], - [ - 13.533719063, - 52.383296967 - ], - [ - 13.533379555, - 52.383335114 - ], - [ - 13.53301239, - 52.383358002 - ], - [ - 13.532653809, - 52.383365631 - ], - [ - 13.53222084, - 52.383361816 - ], - [ - 13.531785011, - 52.383354187 - ], - [ - 13.531435013, - 52.383361816 - ], - [ - 13.531114578, - 52.383388519 - ], - [ - 13.530774117, - 52.383441925 - ], - [ - 13.530474663, - 52.383522034 - ], - [ - 13.530198097, - 52.383605957 - ], - [ - 13.529940605, - 52.383716583 - ], - [ - 13.529669762, - 52.383857727 - ], - [ - 13.529401779, - 52.384044647 - ], - [ - 13.529109955, 
- 52.38432312 - ], - [ - 13.52870369, - 52.384784698 - ], - [ - 13.528428078, - 52.38508606 - ], - [ - 13.528366089, - 52.385158539 - ], - [ - 13.524540901, - 52.389453888 - ], - [ - 13.524550438, - 52.389503479 - ], - [ - 13.524573326, - 52.389541626 - ], - [ - 13.524604797, - 52.389583588 - ], - [ - 13.524658203, - 52.389625549 - ], - [ - 13.525242805, - 52.389953613 - ], - [ - 13.525495529, - 52.390113831 - ], - [ - 13.525518417, - 52.390159607 - ], - [ - 13.525501251, - 52.390201569 - ], - [ - 13.525468826, - 52.390254974 - ], - [ - 13.525419235, - 52.390304565 - ], - [ - 13.524431229, - 52.391159058 - ], - [ - 13.523122787, - 52.392383575 - ], - [ - 13.522995949, - 52.392505646 - ], - [ - 13.522948265, - 52.392536163 - ], - [ - 13.52287674, - 52.392559052 - ], - [ - 13.522799492, - 52.392566681 - ], - [ - 13.522711754, - 52.392566681 - ], - [ - 13.521859169, - 52.392444611 - ], - [ - 13.521745682, - 52.392436981 - ], - [ - 13.521669388, - 52.392440796 - ], - [ - 13.521622658, - 52.392456055 - ], - [ - 13.521595955, - 52.392478943 - ], - [ - 13.521548271, - 52.392578125 - ], - [ - 13.522637367, - 52.392738342 - ], - [ - 13.522878647, - 52.392772675 - ], - [ - 13.523015022, - 52.392837524 - ], - [ - 13.523111343, - 52.392879486 - ], - [ - 13.523198128, - 52.392917633 - ], - [ - 13.523303032, - 52.392993927 - ], - [ - 13.523317337, - 52.393058777 - ], - [ - 13.523306847, - 52.393119812 - ], - [ - 13.523284912, - 52.393169403 - ], - [ - 13.522663116, - 52.393768311 - ], - [ - 13.521858215, - 52.394523621 - ], - [ - 13.521655083, - 52.394649506 - ], - [ - 13.521375656, - 52.39491272 - ], - [ - 13.520638466, - 52.395599365 - ], - [ - 13.520013809, - 52.396232605 - ], - [ - 13.519786835, - 52.396499634 - ], - [ - 13.51952076, - 52.396839142 - ], - [ - 13.519312859, - 52.397209167 - ], - [ - 13.519210815, - 52.397247314 - ], - [ - 13.519133568, - 52.397315979 - ], - [ - 13.519043922, - 52.397338867 - ], - [ - 13.518992424, - 52.397354126 - ], - [ - 13.518731117, - 52.397266388 - ], - [ - 13.518521309, - 52.397186279 - ], - [ - 13.518030167, - 52.396968842 - ], - [ - 13.517698288, - 52.397338867 - ], - [ - 13.51756382, - 52.397563934 - ], - [ - 13.517389297, - 52.397800446 - ], - [ - 13.516566277, - 52.398674011 - ], - [ - 13.515673637, - 52.399570465 - ], - [ - 13.514561653, - 52.400661469 - ], - [ - 13.514300346, - 52.400932312 - ], - [ - 13.513332367, - 52.401851654 - ], - [ - 13.51246357, - 52.40272522 - ], - [ - 13.510783195, - 52.40435791 - ], - [ - 13.510543823, - 52.404605865 - ], - [ - 13.510230064, - 52.404914856 - ], - [ - 13.50899601, - 52.406147003 - ], - [ - 13.508612633, - 52.406547546 - ], - [ - 13.50774765, - 52.407478333 - ], - [ - 13.506917953, - 52.408348083 - ], - [ - 13.505527496, - 52.409721375 - ], - [ - 13.505458832, - 52.40978241 - ], - [ - 13.505138397, - 52.41009903 - ], - [ - 13.503731728, - 52.411491394 - ], - [ - 13.503533363, - 52.411678314 - ], - [ - 13.502279282, - 52.412883759 - ], - [ - 13.501524925, - 52.413482666 - ], - [ - 13.501321793, - 52.413619995 - ], - [ - 13.500832558, - 52.413936615 - ], - [ - 13.50038147, - 52.4141922 - ], - [ - 13.49997139, - 52.414409637 - ], - [ - 13.499858856, - 52.414455414 - ], - [ - 13.499188423, - 52.414749146 - ], - [ - 13.498696327, - 52.41493988 - ], - [ - 13.497921944, - 52.415218353 - ], - [ - 13.497368813, - 52.415431976 - ], - [ - 13.496650696, - 52.415706635 - ], - [ - 13.496446609, - 52.415782928 - ], - [ - 13.496009827, - 52.415969849 - ], - [ - 13.495700836, - 52.416107178 - ], - [ - 13.495515823, - 
52.416194916 - ], - [ - 13.495312691, - 52.416297913 - ], - [ - 13.494745255, - 52.41658783 - ], - [ - 13.49464035, - 52.416652679 - ], - [ - 13.494258881, - 52.416881561 - ], - [ - 13.493819237, - 52.417167664 - ], - [ - 13.493548393, - 52.417369843 - ], - [ - 13.493290901, - 52.417572021 - ], - [ - 13.493026733, - 52.417778015 - ], - [ - 13.492693901, - 52.418022156 - ], - [ - 13.492493629, - 52.418174744 - ], - [ - 13.492147446, - 52.418441772 - ], - [ - 13.490100861, - 52.420017242 - ], - [ - 13.489993095, - 52.420093536 - ], - [ - 13.489733696, - 52.420288086 - ], - [ - 13.489574432, - 52.420440674 - ], - [ - 13.48927021, - 52.420764923 - ], - [ - 13.489129066, - 52.420928955 - ], - [ - 13.488491058, - 52.421710968 - ], - [ - 13.488237381, - 52.421993256 - ], - [ - 13.487900734, - 52.422344208 - ], - [ - 13.487172127, - 52.422939301 - ], - [ - 13.486559868, - 52.423408508 - ], - [ - 13.486092567, - 52.423770905 - ], - [ - 13.48562336, - 52.424152374 - ], - [ - 13.485471725, - 52.424255371 - ], - [ - 13.485077858, - 52.424537659 - ], - [ - 13.484401703, - 52.425022125 - ], - [ - 13.483383179, - 52.425769806 - ], - [ - 13.483257294, - 52.425865173 - ], - [ - 13.482924461, - 52.426101685 - ], - [ - 13.482698441, - 52.426265717 - ], - [ - 13.480806351, - 52.427612305 - ], - [ - 13.479895592, - 52.428302765 - ], - [ - 13.47981739, - 52.428356171 - ], - [ - 13.47854805, - 52.42930603 - ], - [ - 13.478359222, - 52.429431915 - ], - [ - 13.478157997, - 52.429595947 - ], - [ - 13.478037834, - 52.429683685 - ], - [ - 13.47772789, - 52.429954529 - ], - [ - 13.477515221, - 52.430137634 - ], - [ - 13.476874352, - 52.430652618 - ], - [ - 13.47661972, - 52.430850983 - ], - [ - 13.476373672, - 52.431026459 - ], - [ - 13.475787163, - 52.431369781 - ], - [ - 13.475365639, - 52.431587219 - ], - [ - 13.474074364, - 52.4322052 - ], - [ - 13.473599434, - 52.432434082 - ], - [ - 13.473415375, - 52.43252182 - ], - [ - 13.472993851, - 52.432712555 - ], - [ - 13.4716959, - 52.433292389 - ], - [ - 13.471329689, - 52.433441162 - ], - [ - 13.469817162, - 52.434043884 - ], - [ - 13.469201088, - 52.434314728 - ], - [ - 13.469059944, - 52.434391022 - ], - [ - 13.468596458, - 52.434631348 - ], - [ - 13.466616631, - 52.435817719 - ], - [ - 13.466080666, - 52.436122894 - ], - [ - 13.465838432, - 52.436260223 - ], - [ - 13.465315819, - 52.436565399 - ], - [ - 13.464496613, - 52.437023163 - ], - [ - 13.463171005, - 52.43762207 - ], - [ - 13.462702751, - 52.437843323 - ], - [ - 13.462409019, - 52.437988281 - ], - [ - 13.46231842, - 52.438037872 - ], - [ - 13.462058067, - 52.438179016 - ], - [ - 13.460422516, - 52.439193726 - ], - [ - 13.460037231, - 52.439479828 - ], - [ - 13.459775925, - 52.43970108 - ], - [ - 13.459723473, - 52.439754486 - ], - [ - 13.459409714, - 52.440021515 - ], - [ - 13.459005356, - 52.440368652 - ], - [ - 13.458240509, - 52.441017151 - ], - [ - 13.457676888, - 52.441509247 - ], - [ - 13.456965446, - 52.442108154 - ], - [ - 13.456719398, - 52.442359924 - ], - [ - 13.456583023, - 52.442497253 - ], - [ - 13.456512451, - 52.442592621 - ], - [ - 13.456332207, - 52.442821503 - ], - [ - 13.456071854, - 52.443252563 - ], - [ - 13.455370903, - 52.444972992 - ], - [ - 13.455272675, - 52.44519043 - ], - [ - 13.455172539, - 52.4454422 - ], - [ - 13.454929352, - 52.445533752 - ], - [ - 13.454372406, - 52.445556641 - ], - [ - 13.452836037, - 52.44562149 - ], - [ - 13.451435089, - 52.445671082 - ], - [ - 13.449950218, - 52.445732117 - ], - [ - 13.449712753, - 52.445739746 - ], - [ - 13.449320793, - 52.44575882 - 
], - [ - 13.448624611, - 52.445781708 - ], - [ - 13.448477745, - 52.445789337 - ], - [ - 13.447191238, - 52.445838928 - ], - [ - 13.445914268, - 52.445884705 - ], - [ - 13.445550919, - 52.445896149 - ], - [ - 13.444639206, - 52.445934296 - ], - [ - 13.444497108, - 52.445941925 - ], - [ - 13.444350243, - 52.445953369 - ], - [ - 13.4439888, - 52.44600296 - ], - [ - 13.442544937, - 52.446834564 - ], - [ - 13.441972733, - 52.447113037 - ], - [ - 13.440879822, - 52.447547913 - ], - [ - 13.440397263, - 52.447731018 - ], - [ - 13.440110207, - 52.447826385 - ], - [ - 13.439696312, - 52.447929382 - ], - [ - 13.439285278, - 52.448001862 - ], - [ - 13.439059258, - 52.448059082 - ], - [ - 13.438909531, - 52.448108673 - ], - [ - 13.438746452, - 52.448192596 - ], - [ - 13.438674927, - 52.448131561 - ], - [ - 13.437895775, - 52.447433472 - ], - [ - 13.437681198, - 52.447235107 - ], - [ - 13.437045097, - 52.446689606 - ], - [ - 13.436873436, - 52.44651413 - ], - [ - 13.436362267, - 52.446037292 - ], - [ - 13.436190605, - 52.445854187 - ], - [ - 13.436362267, - 52.446037292 - ], - [ - 13.436873436, - 52.44651413 - ], - [ - 13.437045097, - 52.446689606 - ], - [ - 13.437681198, - 52.447235107 - ], - [ - 13.437660217, - 52.447479248 - ], - [ - 13.437644005, - 52.447570801 - ], - [ - 13.437449455, - 52.448303223 - ], - [ - 13.437498093, - 52.448410034 - ], - [ - 13.437361717, - 52.448432922 - ], - [ - 13.437185287, - 52.448471069 - ], - [ - 13.437046051, - 52.448516846 - ], - [ - 13.436974525, - 52.448551178 - ], - [ - 13.436769485, - 52.448654175 - ], - [ - 13.436707497, - 52.448688507 - ], - [ - 13.436658859, - 52.448745728 - ], - [ - 13.436600685, - 52.44877243 - ], - [ - 13.436264038, - 52.448947906 - ], - [ - 13.436333656, - 52.449050903 - ], - [ - 13.436501503, - 52.449287415 - ], - [ - 13.436527252, - 52.449321747 - ], - [ - 13.436580658, - 52.449398041 - ], - [ - 13.438248634, - 52.451702118 - ], - [ - 13.438462257, - 52.452030182 - ], - [ - 13.438505173, - 52.452144623 - ], - [ - 13.43860054, - 52.452201843 - ], - [ - 13.438648224, - 52.452415466 - ], - [ - 13.438677788, - 52.452579498 - ], - [ - 13.438673973, - 52.452697754 - ], - [ - 13.438651085, - 52.452781677 - ], - [ - 13.438504219, - 52.453048706 - ], - [ - 13.438376427, - 52.453136444 - ], - [ - 13.438240051, - 52.453483582 - ], - [ - 13.437959671, - 52.454143524 - ], - [ - 13.437572479, - 52.455108643 - ], - [ - 13.43737793, - 52.455593109 - ], - [ - 13.437252045, - 52.455905914 - ], - [ - 13.437194824, - 52.456039429 - ], - [ - 13.437150002, - 52.456150055 - ], - [ - 13.436961174, - 52.456607819 - ], - [ - 13.436709404, - 52.457176208 - ], - [ - 13.436512947, - 52.45759201 - ], - [ - 13.436303139, - 52.457977295 - ], - [ - 13.436096191, - 52.458377838 - ], - [ - 13.436008453, - 52.458568573 - ], - [ - 13.435801506, - 52.459049225 - ], - [ - 13.435427666, - 52.459831238 - ], - [ - 13.435299873, - 52.46018219 - ], - [ - 13.435123444, - 52.460720062 - ], - [ - 13.435070038, - 52.460891724 - ], - [ - 13.435009956, - 52.461063385 - ], - [ - 13.434931755, - 52.461303711 - ], - [ - 13.434843063, - 52.46149826 - ], - [ - 13.434798241, - 52.461856842 - ], - [ - 13.434743881, - 52.462051392 - ], - [ - 13.434661865, - 52.462287903 - ], - [ - 13.434613228, - 52.462429047 - ], - [ - 13.434565544, - 52.462532043 - ], - [ - 13.434479713, - 52.462696075 - ], - [ - 13.434398651, - 52.462814331 - ], - [ - 13.434216499, - 52.462982178 - ], - [ - 13.434041023, - 52.463176727 - ], - [ - 13.433871269, - 52.463363647 - ], - [ - 13.433573723, - 52.4637146 - ], - [ 
- 13.433339119, - 52.464027405 - ], - [ - 13.432988167, - 52.46452713 - ], - [ - 13.432909966, - 52.46465683 - ], - [ - 13.433052063, - 52.464736938 - ], - [ - 13.433185577, - 52.464847565 - ], - [ - 13.434347153, - 52.46578598 - ], - [ - 13.43439579, - 52.465824127 - ], - [ - 13.434561729, - 52.465961456 - ], - [ - 13.4347229, - 52.466087341 - ], - [ - 13.435704231, - 52.466880798 - ], - [ - 13.436393738, - 52.466960907 - ], - [ - 13.436722755, - 52.466999054 - ], - [ - 13.436852455, - 52.467014313 - ], - [ - 13.438138962, - 52.467159271 - ], - [ - 13.440406799, - 52.467430115 - ], - [ - 13.441808701, - 52.467594147 - ], - [ - 13.441857338, - 52.467605591 - ], - [ - 13.442008972, - 52.467670441 - ], - [ - 13.44198513, - 52.467803955 - ], - [ - 13.441916466, - 52.468208313 - ], - [ - 13.441903114, - 52.468284607 - ], - [ - 13.441795349, - 52.46887207 - ], - [ - 13.441762924, - 52.468978882 - ], - [ - 13.44181633, - 52.469089508 - ], - [ - 13.441812515, - 52.469108582 - ], - [ - 13.441781998, - 52.469345093 - ], - [ - 13.441745758, - 52.469604492 - ], - [ - 13.441641808, - 52.469726562 - ], - [ - 13.441613197, - 52.469841003 - ], - [ - 13.441549301, - 52.470172882 - ], - [ - 13.441507339, - 52.470413208 - ], - [ - 13.441498756, - 52.470462799 - ], - [ - 13.441394806, - 52.471038818 - ], - [ - 13.441366196, - 52.471206665 - ], - [ - 13.441405296, - 52.471279144 - ], - [ - 13.441369057, - 52.471508026 - ], - [ - 13.441458702, - 52.471515656 - ], - [ - 13.441511154, - 52.4715271 - ], - [ - 13.441576958, - 52.471569061 - ], - [ - 13.442343712, - 52.472263336 - ], - [ - 13.442426682, - 52.472301483 - ], - [ - 13.442523003, - 52.472324371 - ], - [ - 13.442516327, - 52.472366333 - ], - [ - 13.4425354, - 52.47240448 - ], - [ - 13.44308567, - 52.472888947 - ], - [ - 13.443483353, - 52.473255157 - ], - [ - 13.443502426, - 52.47328186 - ], - [ - 13.443569183, - 52.473423004 - ], - [ - 13.443605423, - 52.473564148 - ], - [ - 13.443622589, - 52.473712921 - ], - [ - 13.443624496, - 52.4737854 - ], - [ - 13.444513321, - 52.473873138 - ], - [ - 13.445295334, - 52.473960876 - ], - [ - 13.445151329, - 52.474128723 - ], - [ - 13.445291519, - 52.474163055 - ], - [ - 13.446117401, - 52.474380493 - ], - [ - 13.446929932, - 52.474597931 - ], - [ - 13.44698143, - 52.47460556 - ], - [ - 13.447024345, - 52.47460556 - ], - [ - 13.447067261, - 52.474601746 - ], - [ - 13.447439194, - 52.475803375 - ], - [ - 13.447550774, - 52.476112366 - ], - [ - 13.447616577, - 52.476333618 - ], - [ - 13.447663307, - 52.476425171 - ], - [ - 13.44804287, - 52.477684021 - ], - [ - 13.448085785, - 52.47782135 - ], - [ - 13.448119164, - 52.477935791 - ], - [ - 13.448202133, - 52.478160858 - ], - [ - 13.448659897, - 52.478504181 - ], - [ - 13.449481964, - 52.479129791 - ], - [ - 13.450310707, - 52.479755402 - ], - [ - 13.450602531, - 52.479976654 - ], - [ - 13.450725555, - 52.480072021 - ], - [ - 13.451802254, - 52.480880737 - ], - [ - 13.452273369, - 52.481243134 - ], - [ - 13.45246315, - 52.481388092 - ], - [ - 13.452612877, - 52.481498718 - ], - [ - 13.45324707, - 52.48197937 - ], - [ - 13.453961372, - 52.482517242 - ], - [ - 13.456096649, - 52.484138489 - ], - [ - 13.456254959, - 52.484260559 - ], - [ - 13.457948685, - 52.485546112 - ], - [ - 13.458347321, - 52.485839844 - ], - [ - 13.456992149, - 52.486808777 - ], - [ - 13.456432343, - 52.487201691 - ], - [ - 13.455653191, - 52.487758636 - ], - [ - 13.453977585, - 52.488941193 - ], - [ - 13.454572678, - 52.489376068 - ], - [ - 13.455964088, - 52.490432739 - ], - [ - 13.456983566, - 
52.491188049 - ], - [ - 13.457713127, - 52.491714478 - ], - [ - 13.457920074, - 52.491786957 - ], - [ - 13.458016396, - 52.491840363 - ], - [ - 13.45939827, - 52.492835999 - ], - [ - 13.459500313, - 52.492912292 - ], - [ - 13.460221291, - 52.493465424 - ], - [ - 13.460617065, - 52.493797302 - ], - [ - 13.460804939, - 52.493942261 - ], - [ - 13.461070061, - 52.494159698 - ], - [ - 13.461850166, - 52.494800568 - ], - [ - 13.462110519, - 52.49521637 - ], - [ - 13.462246895, - 52.495323181 - ], - [ - 13.46389389, - 52.496665955 - ], - [ - 13.464204788, - 52.496753693 - ], - [ - 13.464544296, - 52.497119904 - ], - [ - 13.464606285, - 52.497215271 - ], - [ - 13.464643478, - 52.497310638 - ], - [ - 13.464666367, - 52.497421265 - ], - [ - 13.464753151, - 52.497566223 - ], - [ - 13.464924812, - 52.497829437 - ], - [ - 13.465081215, - 52.49805069 - ], - [ - 13.465513229, - 52.498714447 - ], - [ - 13.465647697, - 52.49892807 - ], - [ - 13.465722084, - 52.499099731 - ], - [ - 13.465786934, - 52.499248505 - ], - [ - 13.465964317, - 52.499668121 - ], - [ - 13.466071129, - 52.499950409 - ], - [ - 13.466582298, - 52.501167297 - ], - [ - 13.466583252, - 52.501277924 - ], - [ - 13.466501236, - 52.501941681 - ], - [ - 13.466501236, - 52.502063751 - ], - [ - 13.466513634, - 52.502140045 - ], - [ - 13.466565132, - 52.502277374 - ], - [ - 13.466641426, - 52.502422333 - ], - [ - 13.466691971, - 52.502490997 - ], - [ - 13.466762543, - 52.502563477 - ], - [ - 13.466827393, - 52.502609253 - ], - [ - 13.466879845, - 52.502632141 - ], - [ - 13.466954231, - 52.502655029 - ], - [ - 13.467047691, - 52.502666473 - ], - [ - 13.467115402, - 52.502670288 - ], - [ - 13.467172623, - 52.502670288 - ], - [ - 13.467508316, - 52.502624512 - ], - [ - 13.468510628, - 52.502464294 - ], - [ - 13.469779968, - 52.502254486 - ], - [ - 13.469991684, - 52.502220154 - ], - [ - 13.471398354, - 52.502017975 - ], - [ - 13.474914551, - 52.501502991 - ], - [ - 13.475172043, - 52.5014534 - ], - [ - 13.475209236, - 52.501560211 - ], - [ - 13.475388527, - 52.501979828 - ], - [ - 13.477026939, - 52.50170517 - ], - [ - 13.477138519, - 52.50169754 - ], - [ - 13.477298737, - 52.50170517 - ], - [ - 13.477692604, - 52.501735687 - ], - [ - 13.478367805, - 52.501785278 - ], - [ - 13.48141098, - 52.502010345 - ], - [ - 13.483391762, - 52.502151489 - ], - [ - 13.484308243, - 52.502223969 - ], - [ - 13.484597206, - 52.502246857 - ], - [ - 13.486724854, - 52.502410889 - ], - [ - 13.487017632, - 52.502426147 - ], - [ - 13.487155914, - 52.502399445 - ], - [ - 13.488366127, - 52.502124786 - ], - [ - 13.48913765, - 52.501945496 - ], - [ - 13.490619659, - 52.501483917 - ], - [ - 13.490981102, - 52.501377106 - ], - [ - 13.492963791, - 52.500965118 - ], - [ - 13.493370056, - 52.500881195 - ], - [ - 13.494781494, - 52.500595093 - ], - [ - 13.495025635, - 52.500537872 - ], - [ - 13.495450974, - 52.500431061 - ], - [ - 13.495686531, - 52.500354767 - ], - [ - 13.495876312, - 52.500293732 - ], - [ - 13.496304512, - 52.500156403 - ], - [ - 13.497889519, - 52.499641418 - ] - ] - } - } - } - }, - "aOutboundAllDay20190301ArrDep": { - "arrivalDepartureId": "YS1vdXRib3VuZC1hbGwtZGF5:MjAxOS0wMy0wMQ==:Mw==:LTE=:LTE=", - "tripId": "a-outbound-all-day", - "date": "2019-03-01T00:00:00", - "stopSequence": 3, - "frequenciesRow": -1, - "frequenciesIt": -1 - }, - "bDowntownOnWorkingDays20190608ArrDep": { - "arrivalDepartureId": "Yi1kb3dudG93bi1vbi13b3JraW5nLWRheXM=:MjAxOS0wMy0wOA==:Mw==:MQ==:Mg==", - "tripId": "b-downtown-on-working-days", - "date": "2019-03-08T00:00:00", - 
"stopSequence": 3, - "frequenciesRow": 1, - "frequenciesIt": 2 - }, - "aOutboundAllDay20190301Con": { - "connectionId": "YS1vdXRib3VuZC1hbGwtZGF5:MjAxOS0wMy0wMQ==:Mw==:LTE=:LTE=", - "tripId": "a-outbound-all-day", - "date": "2019-03-01T00:00:00", - "fromStopSequence": 3, - "frequenciesRow": -1, - "frequenciesIt": -1 - }, - "bDowntownOnWorkingDays20190608Con": { - "connectionId": "Yi1kb3dudG93bi1vbi13b3JraW5nLWRheXM=:MjAxOS0wMy0wOA==:Mw==:MQ==:Mg==", - "tripId": "b-downtown-on-working-days", - "date": "2019-03-08T00:00:00", - "fromStopSequence": 3, - "frequenciesRow": 1, - "frequenciesIt": 2 - } - } -} From f3cde473e62f76f60358fe73256bbc076b66e8e6 Mon Sep 17 00:00:00 2001 From: Jannis R Date: Fri, 27 Jun 2025 00:41:00 +0200 Subject: [PATCH 04/16] DuckDB rewrite: minor tweaks --- .eslintrc.json | 2 +- cli.js | 4 ++-- docs/import-metadata.md | 2 +- docs/multiple-datasets.md | 4 ++-- example.sh | 3 ++- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.eslintrc.json b/.eslintrc.json index 8e4e9a2..d31fd86 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -12,7 +12,7 @@ "node_modules" ], "rules": { - "no-unused-vars": "off", + "no-unused-vars": "warn", "no-irregular-whitespace": "off" } } diff --git a/cli.js b/cli.js index a907cc7..1e44729 100755 --- a/cli.js +++ b/cli.js @@ -41,10 +41,10 @@ const { 'stops-without-level-id': { type: 'boolean', }, - 'lower-case-lang-codes': { + 'stops-location-index': { type: 'boolean', }, - 'stops-location-index': { + 'lower-case-lang-codes': { type: 'boolean', }, 'stats-by-route-date': { diff --git a/docs/import-metadata.md b/docs/import-metadata.md index b74a47e..bc016ad 100644 --- a/docs/import-metadata.md +++ b/docs/import-metadata.md @@ -15,4 +15,4 @@ SELECT gtfs_via_postgres_options() -- {"schema": "public", "silent": false, "importStart": 1681417454781, "importMetadata": true, … } SELECT (gtfs_via_postgres_options())['tripsWithoutShapeId'] -- true -``` \ No newline at end of file +``` diff --git a/docs/multiple-datasets.md b/docs/multiple-datasets.md index f1249be..57a7c99 100644 --- a/docs/multiple-datasets.md +++ b/docs/multiple-datasets.md @@ -1,8 +1,8 @@ -# importing multiple datasets into one DB +# working with multiple datasets Using `gtfs-via-postgres`, you can import more than one dataset into a single PostgreSQL database by importing them into separate [schemas](https://www.postgresql.org/docs/14/ddl-schemas.html). You can then run queries combine or compare data from them. -As an example, let's import two datasets ([Paris](https://en.wikipedia.org/wiki/Île-de-France_Mobilités)' and [Berlin](https://en.wikipedia.org/wiki/Verkehrsverbund_Berlin-Brandenburg)'s) into separate schemas: +As an example, let's compare two datasets from [Paris](https://en.wikipedia.org/wiki/Île-de-France_Mobilités) and [Berlin](https://en.wikipedia.org/wiki/Verkehrsverbund_Berlin-Brandenburg). 
```shell wget -U 'gtfs-via-postgres demo' -O paris.gtfs.zip 'https://eu.ftp.opendatasoft.com/stif/GTFS/IDFM-gtfs.zip' diff --git a/example.sh b/example.sh index dabc52d..2dd8368 100755 --- a/example.sh +++ b/example.sh @@ -1,6 +1,7 @@ #!/bin/sh set -e +set -u set -o pipefail 2>&1 echo "importing into PostgreSQL:" @@ -13,7 +14,7 @@ psql -c "$(cat <<- EOM SELECT trip_id, route_id, from_stop_id, t_departure, - stop_sequence, + from_stop_sequence, to_stop_id, t_arrival FROM connections WHERE trip_id = 'during-dst-1' From 81138e24fc4a77d78e7b2bd48078a943ffb7c73a Mon Sep 17 00:00:00 2001 From: Jannis R Date: Fri, 27 Jun 2025 10:43:29 +0200 Subject: [PATCH 05/16] DuckDB rewrite: adapt project metadata --- Dockerfile | 10 ++-- LICENSE-APACHE | 2 +- LICENSE-PROSPERITY.md | 2 +- docs/analysis/active-trips-by-hour.md | 2 +- .../feed-by-agency-route-stop-and-hour.md | 2 +- docs/analysis/feed-by-route-date.md | 2 +- docs/multiple-datasets.md | 4 +- package.json | 16 ++--- readme.md | 59 +++++++++---------- 9 files changed, 49 insertions(+), 50 deletions(-) diff --git a/Dockerfile b/Dockerfile index aa53467..38e989a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,10 +1,10 @@ FROM node:alpine -LABEL org.opencontainers.image.title="gtfs-via-postgres" -LABEL org.opencontainers.image.description="Process GTFS using PostgreSQL." +LABEL org.opencontainers.image.title="gtfs-via-duckdb" +LABEL org.opencontainers.image.description="Analyze GTFS datasets using DuckDB." LABEL org.opencontainers.image.authors="Jannis R " -LABEL org.opencontainers.image.documentation="https://github.com/public-transport/gtfs-via-postgres" -LABEL org.opencontainers.image.source="https://github.com/public-transport/gtfs-via-postgres" -LABEL org.opencontainers.image.revision="4.0.0" +LABEL org.opencontainers.image.documentation="https://github.com/public-transport/gtfs-via-duckdb" +LABEL org.opencontainers.image.source="https://github.com/public-transport/gtfs-via-duckdb" +LABEL org.opencontainers.image.revision="5.0.0" LABEL org.opencontainers.image.licenses="(Apache-2.0 AND Prosperity-3.0.0)" WORKDIR /app diff --git a/LICENSE-APACHE b/LICENSE-APACHE index 2b64243..09efbff 100644 --- a/LICENSE-APACHE +++ b/LICENSE-APACHE @@ -176,7 +176,7 @@ END OF TERMS AND CONDITIONS - Copyright 2020 gtfs-via-postgres contributors + Copyright 2020 gtfs-via-postgres & gtfs-via-duckdb contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/LICENSE-PROSPERITY.md b/LICENSE-PROSPERITY.md index ae979d3..3fb260c 100644 --- a/LICENSE-PROSPERITY.md +++ b/LICENSE-PROSPERITY.md @@ -2,7 +2,7 @@ Contributor: Jannis R -Source Code: https://github.com/public-transport/gtfs-via-postgres +Source Code: https://github.com/public-transport/gtfs-via-duckdb ## Purpose diff --git a/docs/analysis/active-trips-by-hour.md b/docs/analysis/active-trips-by-hour.md index 9983331..6071b75 100644 --- a/docs/analysis/active-trips-by-hour.md +++ b/docs/analysis/active-trips-by-hour.md @@ -2,7 +2,7 @@ Do you want to know how many trips are running at a specific point in time? -`gtfs-via-postgres` optionally provides a **(materialized) view `stats_active_trips_by_hour` to answer this. Use the `--stats-active-trips-by-hour` flag to enable it**: +`gtfs-via-duckdb` optionally provides a **(materialized) view `stats_active_trips_by_hour` to answer this. 
Use the `--stats-active-trips-by-hour` flag to enable it**: - If you run `gtfs-to-sql` with `--stats-active-trips-by-hour=view`, `stats_active_trips_by_hour` will be a "regular" non-materialized view. Use this option if you want to import the GTFS data quickly, and if you only query `stats_active_trips_by_hour` rarely or in time-uncritical scenarios. - If you pass `--stats-active-trips-by-hour=materialized-view`, the `stats_active_trips_by_hour` view will [be materialized](https://www.postgresql.org/docs/14/rules-materializedviews.html). Use this option if you need fast queries, and if you can tolerate significantly longer import times (a minute for small feeds, many hours for large feeds). diff --git a/docs/analysis/feed-by-agency-route-stop-and-hour.md b/docs/analysis/feed-by-agency-route-stop-and-hour.md index 521482d..0c1273d 100644 --- a/docs/analysis/feed-by-agency-route-stop-and-hour.md +++ b/docs/analysis/feed-by-agency-route-stop-and-hour.md @@ -1,5 +1,5 @@ # analysing a GTFS dataset by route ID, stop ID and/or hour -With the `--stats-by-route-and-stop-and-hour` option, `gtfs-via-postgres` provides a view `stats_by_agency_route_stop_hour`. Just like [`stats_by_route_id_and_date`](feed-by-route-and-date.md), it aggregates all arrivals by `agency_id`, `route_id`, `stop_id` and `effective_hour`. +With the `--stats-by-route-and-stop-and-hour` option, `gtfs-via-duckdb` provides a view `stats_by_agency_route_stop_hour`. Just like [`stats_by_route_id_and_date`](feed-by-route-and-date.md), it aggregates all arrivals by `agency_id`, `route_id`, `stop_id` and `effective_hour`. Note: As a materialized view, `stats_by_agency_route_stop_hour` takes up a significant amount of space, e.g. 13GB with the 2023-05-02 VBB GTFS feed. diff --git a/docs/analysis/feed-by-route-date.md b/docs/analysis/feed-by-route-date.md index afb08c7..6720a94 100644 --- a/docs/analysis/feed-by-route-date.md +++ b/docs/analysis/feed-by-route-date.md @@ -6,7 +6,7 @@ Are you trying to answer a question like those below? - Has the number of stop time events decreased, compared to the last dataset version? - Do specific routes stop running during certain time periods? -`gtfs-via-postgres` optionally provides a **(materialized) view `stats_by_route_date` to help with such SQL queries. Use the `--stats-by-route-date` flag to enable it** in the generated SQL: +`gtfs-via-duckdb` optionally provides a **(materialized) view `stats_by_route_date` to help with such SQL queries. Use the `--stats-by-route-date` flag to enable it** in the generated SQL: - If you run `gtfs-to-sql` with `--stats-by-route-date=view`, `stats_by_route_date` will be a "regular" non-materialized view. Use this option if you want to import the GTFS data quickly, and if you only query `stats_by_route_date` rarely or in time-uncritical scenarios. - If you pass `--stats-by-route-date=materialized-view`, the `stats_by_route_date` view will [be materialized](https://www.postgresql.org/docs/14/rules-materializedviews.html). Use this option if you need fast queries, and if you can tolerate significantly longer import times (3m for the 64mb 2023-03-05 SNCB/NMBS GTFS feed, 1h15m for the 540mb 2023-02-27 VBB GTFS feed). 
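As a sketch of how `stats_by_route_date` might be queried once it is enabled: the column names below are assumptions based on the description above (a per-route, per-date count of arrivals/departures), so check the generated view definition for the exact names.

```sql
-- all dates on which a route runs, with the number of
-- arrivals/departures on each date (column names assumed)
SELECT "date", nr_of_arrs_deps
FROM stats_by_route_date
WHERE route_id = 'some route ID'
ORDER BY "date"
```

Comparing two such result sets, e.g. from the current and the previous dataset version, helps answer the "has the number of stop time events decreased?" question above.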
diff --git a/docs/multiple-datasets.md b/docs/multiple-datasets.md index 57a7c99..608998d 100644 --- a/docs/multiple-datasets.md +++ b/docs/multiple-datasets.md @@ -5,13 +5,13 @@ Using `gtfs-via-postgres`, you can import more than one dataset into a single Po As an example, let's compare two datasets from [Paris](https://en.wikipedia.org/wiki/Île-de-France_Mobilités) and [Berlin](https://en.wikipedia.org/wiki/Verkehrsverbund_Berlin-Brandenburg). ```shell -wget -U 'gtfs-via-postgres demo' -O paris.gtfs.zip 'https://eu.ftp.opendatasoft.com/stif/GTFS/IDFM-gtfs.zip' +wget -U 'gtfs-via-duckdb demo' -O paris.gtfs.zip 'https://eu.ftp.opendatasoft.com/stif/GTFS/IDFM-gtfs.zip' unzip -d paris.gtfs paris.gtfs.zip gtfs-to-sql --require-dependencies \ --schema paris -- paris.gtfs/*.txt \ | sponge | psql -b -wget -U 'gtfs-via-postgres demo' -O berlin.gtfs.zip 'https://www.vbb.de/vbbgtfs' +wget -U 'gtfs-via-duckdb demo' -O berlin.gtfs.zip 'https://www.vbb.de/vbbgtfs' unzip -d berlin.gtfs berlin.gtfs.zip gtfs-to-sql --require-dependencies \ --schema berlin -- berlin.gtfs/*.txt \ diff --git a/package.json b/package.json index e4e818f..ab67dd6 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { - "name": "gtfs-via-postgres", - "description": "Process GTFS using PostgreSQL.", - "version": "4.10.4", + "name": "gtfs-via-duckdb", + "description": "Analyze GTFS datasets using DuckDB.", + "version": "5.0.0", "main": "lib/index.js", "bin": { "gtfs-to-sql": "cli.js" @@ -22,8 +22,8 @@ "public transport", "transit", "convert", - "postgres", - "postgresql", + "duckdb", + "data analysis", "sql" ], "author": "Jannis R ", @@ -32,12 +32,12 @@ "Magnus Burton ", "smohiudd " ], - "homepage": "https://github.com/public-transport/gtfs-via-postgres/tree/4.10.4", + "homepage": "https://github.com/public-transport/gtfs-via-duckdb/tree/5.0.0", "repository": { "type": "git", - "url": "git+https://github.com/public-transport/gtfs-via-postgres.git" + "url": "git+https://github.com/public-transport/gtfs-via-duckdb.git" }, - "bugs": "https://github.com/public-transport/gtfs-via-postgres/issues", + "bugs": "https://github.com/public-transport/gtfs-via-duckdb/issues", "license": "(Apache-2.0 AND Prosperity-3.0.0)", "funding": [ { diff --git a/readme.md b/readme.md index aab4c31..2f72696 100644 --- a/readme.md +++ b/readme.md @@ -1,11 +1,11 @@ -# gtfs-via-postgres +# gtfs-via-duckdb -**Import [GTFS Static/Schedule](https://gtfs.org/documentation/schedule/reference/) datasets into a [PostgreSQL database](https://www.postgresql.org)**, to allow for efficient querying and analysis. +**Import [GTFS Static/Schedule](https://gtfs.org/documentation/schedule/reference/) datasets into a [DuckDB database](https://duckdb.org)**, to allow for efficient querying and analysis. 
-[![npm version](https://img.shields.io/npm/v/gtfs-via-postgres.svg)](https://www.npmjs.com/package/gtfs-via-postgres) -[![binary build status](https://img.shields.io/github/actions/workflow/status/public-transport/gtfs-via-postgres/publish.yml?label=binary%20build)](https://github.com/public-transport/gtfs-via-postgres/actions) +[![npm version](https://img.shields.io/npm/v/gtfs-via-duckdb.svg)](https://www.npmjs.com/package/gtfs-via-duckdb) +[![binary build status](https://img.shields.io/github/actions/workflow/status/public-transport/gtfs-via-duckdb/publish.yml?label=binary%20build)](https://github.com/public-transport/gtfs-via-duckdb/actions) [![Prosperity/Apache license](https://img.shields.io/static/v1?label=license&message=Prosperity%2FApache&color=0997E8)](#license) -![minimum Node.js version](https://img.shields.io/node/v/gtfs-via-postgres.svg) +![minimum Node.js version](https://img.shields.io/node/v/gtfs-via-duckdb.svg) [![support me via GitHub Sponsors](https://img.shields.io/badge/support%20me-donate-fa7664.svg)](https://github.com/sponsors/derhuerst) [![chat with me on Twitter](https://img.shields.io/badge/chat%20with%20me-on%20Twitter-1da1f2.svg)](https://twitter.com/derhuerst) @@ -15,7 +15,7 @@ - 🚀 is carefully optimised to let PostgreSQL's query planner do its magic, yielding quick lookups even with large datasets (see [performance section](#performance)) - ✅ validates and imports `translations.txt` -To work with the time-related data (`stop_times` etc.), `gtfs-via-postgres` supports two "mental models": +To work with the time-related data (`stop_times` etc.), `gtfs-via-duckdb` supports two "mental models": - the time-*unexpanded* data that is almost directly taken from the GTFS Schedule data – This is useful if you want to do network analysis. - the time-*expanded* view that "applies" every trip's `stop_times` rows to all of its service days – This is useful for routing & queries from the traveller's perspective. @@ -24,14 +24,14 @@ To work with the time-related data (`stop_times` etc.), `gtfs-via-postgres` supp ## Installation ```shell -npm install -g gtfs-via-postgres +npm install -g gtfs-via-duckdb ``` Or use [`npx`](https://npmjs.com/package/npx). ✨ -There are also [prebuilt binaries](https://github.com/public-transport/gtfs-via-postgres/releases/latest) and [Docker images](https://github.com/public-transport/gtfs-via-postgres/pkgs/container/gtfs-via-postgres) available. +There are also [prebuilt binaries](https://github.com/public-transport/gtfs-via-duckdb/releases/latest) and [Docker images](https://github.com/public-transport/gtfs-via-duckdb/pkgs/container/gtfs-via-duckdb) available. -*Note:* `gtfs-via-postgres` **needs PostgreSQL >=14** to work, as it uses the [`WITH … AS NOT MATERIALIZED`](https://www.postgresql.org/docs/14/queries-with.html#id-1.5.6.12.7) syntax. You can check your PostgreSQL server's version with `psql -t -c 'SELECT version()'`. +*Note:* `gtfs-via-duckdb` **needs PostgreSQL >=14** to work, as it uses the [`WITH … AS NOT MATERIALIZED`](https://www.postgresql.org/docs/14/queries-with.html#id-1.5.6.12.7) syntax. You can check your PostgreSQL server's version with `psql -t -c 'SELECT version()'`. ## Getting Started @@ -72,12 +72,10 @@ env PGDATABASE=postgres psql -c 'create database vbb_2022_02_25' export PGDATABASE=vbb_2022_02_25 ``` -*Note*: `gtfs-via-postgres` generates SQL that contains the `CREATE EXTENSION postgis` instruction. 
For this to work, the PostgreSQL user you're connecting as needs the `CREATE` [permission](https://www.postgresql.org/docs/14/ddl-priv.html) on the database. Also, the `postgis` extension must either be marked as trusted (by putting `trusted = true` into `$(pg_config --sharedir)/extension/postgis.control`), or your user must be a superuser. - -Install `gtfs-via-postgres` and use it to import the GTFS data: +Install `gtfs-via-duckdb` and use it to import the GTFS data: ```sh -npm install -D gtfs-via-postgres +npm install -D gtfs-via-duckdb npm exec -- gtfs-to-sql --require-dependencies -- gtfs/*.csv | sponge | psql -b # agency # calendar @@ -93,7 +91,7 @@ npm exec -- gtfs-to-sql --require-dependencies -- gtfs/*.csv | sponge | psql -b Importing will take 10s to 10m, depending on the size of the feed. On an [M1 MacBook Air](https://en.wikipedia.org/wiki/MacBook_Air_(Apple_silicon)#Third_generation_(Retina_with_Apple_silicon)), importing the above feed takes about 4m; Importing the [260kb 2021-10-06 Amtrak feed](https://transitfeeds.com/p/amtrak/1136/20211006) takes 6s. -In addition to a table for each GTFS file, `gtfs-via-postgres` adds these views to help with real-world analysis: +In addition to a table for each GTFS file, `gtfs-via-duckdb` adds these views to help with real-world analysis: - `service_days` ([materialized](https://www.postgresql.org/docs/14/sql-creatematerializedview.html)) "applies" [`calendar_dates`](https://gtfs.org/documentation/schedule/reference/#calendar_datestxt) to [`calendar`](https://gtfs.org/documentation/schedule/reference/#calendartxt) to give you all days of operation for each "service" defined in [`calendar`](https://gtfs.org/documentation/schedule/reference/#calendartxt). - `arrivals_departures` "applies" [`stop_times`](https://gtfs.org/documentation/schedule/reference/#stop_timestxt)/[`frequencies`](https://gtfs.org/documentation/schedule/reference/#frequenciestxt) to [`trips`](https://gtfs.org/documentation/schedule/reference/#tripstxt) and `service_days` to give you all arrivals/departures at each stop with their *absolute* dates & times. It also resolves each stop's parent station ID & name. @@ -216,12 +214,12 @@ You can run queries with date+time values in any timezone (offset) and they will *Note:* Just like the `npm`-installed variant, the Docker integration too assumes that your GTFS dataset consists of individual files (i.e. unzipped). -Instead of installing via `npm`, you can use [the `ghcr.io/public-transport/gtfs-via-postgres` Docker image](https://github.com/public-transport/gtfs-via-postgres/pkgs/container/gtfs-via-postgres): +Instead of installing via `npm`, you can use [the `ghcr.io/public-transport/gtfs-via-duckdb` Docker image](https://github.com/public-transport/gtfs-via-duckdb/pkgs/container/gtfs-via-duckdb): ```shell # variant A: use Docker image just to convert GTFS to SQL docker run --rm --volume /path/to/gtfs:/gtfs \ - ghcr.io/public-transport/gtfs-via-postgres --require-dependencies -- '/gtfs/*.csv' \ + ghcr.io/public-transport/gtfs-via-duckdb --require-dependencies -- '/gtfs/*.csv' \ | sponge | psql -b ``` @@ -229,14 +227,14 @@ docker run --rm --volume /path/to/gtfs:/gtfs \ With the code above, the `psql -b` process will run *outside* of the Docker container, so your host machine needs access to PostgreSQL. -If you want to directly import the GTFS data *from within the Docker container*, you need add `psql` to the image and run it from inside. 
To do that, write a new Dockerfile that extends the `ghcr.io/public-transport/gtfs-via-postgres` image: +If you want to directly import the GTFS data *from within the Docker container*, you need add `psql` to the image and run it from inside. To do that, write a new Dockerfile that extends the `ghcr.io/public-transport/gtfs-via-duckdb` image: ```Dockerfile -FROM ghcr.io/public-transport/gtfs-via-postgres +FROM ghcr.io/public-transport/gtfs-via-duckdb ENV PGPORT=5432 PGUSER=postgres WORKDIR /gtfs -# pass all arguments into gtfs-via-postgres, pipe output into psql: -ENTRYPOINT ["/bin/sh", "-c", "gtfs-via-postgres $0 $@ | sponge | psql -b"] +# pass all arguments into gtfs-via-duckdb, pipe output into psql: +ENTRYPOINT ["/bin/sh", "-c", "gtfs-via-duckdb $0 $@ | sponge | psql -b"] ``` ```shell @@ -262,11 +260,11 @@ In the nested `SELECT` query, you can use features like `WHERE`, `ORDER BY` and ### Querying stops by location efficiently -If you want to find stops by (geo)location, run `gtfs-via-postgres` with `--stops-location-index`. This will create a [spatial index](https://postgis.net/workshops/postgis-intro/indexing.html) on `stops.stop_loc`, so that most [PostGIS functions & operators](https://postgis.net/docs/manual-3.2/reference.html#Measurement_Functions) make use of it. +If you want to find stops by (geo)location, run `gtfs-via-duckdb` with `--stops-location-index`. This will create a [spatial index](https://postgis.net/workshops/postgis-intro/indexing.html) on `stops.stop_loc`, so that most [PostGIS functions & operators](https://postgis.net/docs/manual-3.2/reference.html#Measurement_Functions) make use of it. ### more guides -The [`docs` directory](docs) contains more instructions on how to use `gtfs-via-postgres`. +The [`docs` directory](docs) contains more instructions on how to use `gtfs-via-duckdb`. ## Correctness vs. Speed regarding GTFS Time Values @@ -280,7 +278,7 @@ Let's consider two examples: - A `departure_time` of `26:59:00` with a trip running on `2021-03-01`: The time, applied to this specific date, "extends" into the following day, so it actually departs at `2021-03-02T02:59+01`. - A departure time of `03:01:00` with a trip running on `2021-03-28`: This is when the standard -> DST switch happens in the `Europe/Berlin` timezone. Because the dep. time refers to noon - 12h (*not* to midnight), it actually happens at `2021-03-28T03:01+02` which is *not* `3h1m` after `2021-03-28T00:00+01`. -`gtfs-via-postgres` always prioritizes correctness over speed. Because it follows the GTFS semantics, when filtering `arrivals_departures` by *absolute* departure date+time, it cannot automatically filter `service_days` (which is `calendar` and `calendar_dates` combined), because **even a date *before* the date of the desired departure time frame might still end up *within*, when combined with a `departure_time` of e.g. `27:30:00`**; Instead, it has to consider all `service_days` and apply the `departure_time` to all of them to check if they're within the range. +`gtfs-via-duckdb` always prioritizes correctness over speed. Because it follows the GTFS semantics, when filtering `arrivals_departures` by *absolute* departure date+time, it cannot automatically filter `service_days` (which is `calendar` and `calendar_dates` combined), because **even a date *before* the date of the desired departure time frame might still end up *within*, when combined with a `departure_time` of e.g. 
`27:30:00`**; Instead, it has to consider all `service_days` and apply the `departure_time` to all of them to check if they're within the range. However, if you determine your feed's largest `arrival_time`/`departure_time`, you can filter on `date` when querying `arrivals_departures`; This allows PostgreSQL to reduce the number of joins and calendar calculations by orders of magnitude, speeding up your queries significantly. `gtfs-via-postgres` provides two low-level helper functions `largest_arrival_time()` & `largest_departure_time()` for this, as well as two high-level helper functions `dates_filter_min(t_min)` & `dates_filter_max(t_max)` (see below). @@ -311,9 +309,9 @@ AND "date" <= dates_filter_max('2022-03-23T12:35+01') -- evaluates to 2023-03-23 ## Performance -With all use cases I could think of, `gtfs-via-postgres` is reasonably fast. If there's a particular kind of query that you think should be faster, please [open an Issue](https://github.com/public-transport/gtfs-via-postgres/issues/new)! +`gtfs-via-duckdb` is fast enough for most use cases I can think of. If there's a particular kind of query that you think should be faster, please [open an Issue](https://github.com/public-transport/gtfs-via-duckdb/issues/new)! -The following benchmarks were run with the [2022-07-01 VBB GTFS dataset](https://vbb-gtfs.jannisr.de/2022-07-01/) (41k `stops`, 6m `stop_times`, 207m arrivals/departures) using `gtfs-via-postgres@4.7.4` and PostgreSQL 14.7 on an [M2](https://en.wikipedia.org/wiki/Apple_M2) laptop running macOS 12.6.8; All measurements are in milliseconds. +The following benchmarks were run with the [2022-07-01 VBB GTFS dataset](https://vbb-gtfs.jannisr.de/2022-07-01/) (41k `stops`, 6m `stop_times`, 207m arrivals/departures) using `gtfs-via-duckdb@5.0.0` and PostgreSQL 14.7 on an [M2](https://en.wikipedia.org/wiki/Apple_M2) laptop running macOS 12.6.8; All measurements are in milliseconds. | query | avg | min | p25 | p50 | p75 | p95 | p99 | max | iterations | | - | - | - | - | - | - | - | - | - | - | @@ -341,7 +339,7 @@ The following benchmarks were run with the [2022-07-01 VBB GTFS dataset](https:/ ## Related Projects -There are some projects that are very similar to `gtfs-via-postgres`: +There are some projects that are very similar to `gtfs-via-duckdb`: ### Node-GTFS @@ -367,9 +365,9 @@ I don't use it because There are several forks of the [original outdated project](https://github.com/cbick/gtfs_SQL_importer); [fitnr's fork](https://github.com/fitnr/gtfs-sql-importer) seems to be the most recent one. -The project has a slightly different goal than `gtfs-via-postgres`: While `gtfs-sql-importer` is designed to import multiple versions of a GTFS dataset in an idempotent fashion, `gtfs-via-postgres` assumes that *one* (version of a) GTFS dataset is imported into *one* DB exactly once. +The project has a slightly different goal than `gtfs-via-duckdb`: While `gtfs-sql-importer` is designed to import multiple versions of a GTFS dataset in an idempotent fashion, `gtfs-via-duckdb` assumes that *one* (version of a) GTFS dataset is imported into *one* DB exactly once. -`gtfs-via-postgres` aims to provide more tools – e.g. the `arrivals_departures` & `connections` views – to help with the analysis of a GTFS dataset, whereas `gtfs-sql-importer` just imports the data. +`gtfs-via-duckdb` aims to provide more tools – e.g. the `arrivals_departures` & `connections` views – to help with the analysis of a GTFS dataset, whereas `gtfs-sql-importer` just imports the data. 
### other related projects @@ -382,11 +380,12 @@ The project has a slightly different goal than `gtfs-via-postgres`: While `gtfs- - [gtfs-lib](https://github.com/conveyal/gtfs-lib) – Java library & CLI for importing GTFS files into a PostgreSQL database. - [gtfs-schema](https://github.com/tyleragreen/gtfs-schema) – PostgreSQL schemas for GTFS feeds. (plain SQL) - [markusvalo/HSLtraffic](https://github.com/markusvalo/HSLtraffic) – Scripts to create a PostgreSQL database for HSL GTFS-data. (plain SQL) +- [smohiudd/gtfs-parquet-duckdb-wasm](https://github.com/smohiudd/gtfs-parquet-duckdb-wasm) – Test visualization of GTFS data using DuckDB-Wasm ([blog post](http://saadiqm.com/gtfs-parquet-duckdb-wasm/)) ## License -This project is dual-licensed: **My ([@derhuerst](https://github.com/derhuerst)) contributions are licensed under the [*Prosperity Public License*](https://prosperitylicense.com), [contributions of other people](https://github.com/public-transport/gtfs-via-postgres/graphs/contributors) are licensed as [Apache 2.0](https://apache.org/licenses/LICENSE-2.0)**. +This project is dual-licensed: **My ([@derhuerst](https://github.com/derhuerst)) contributions are licensed under the [*Prosperity Public License*](https://prosperitylicense.com), [contributions of other people](https://github.com/public-transport/gtfs-via-duckdb/graphs/contributors) are licensed as [Apache 2.0](https://apache.org/licenses/LICENSE-2.0)**. > This license allows you to use and share this software for noncommercial purposes for free and to try this software for commercial purposes for thirty days. @@ -397,6 +396,6 @@ This project is dual-licensed: **My ([@derhuerst](https://github.com/derhuerst)) ## Contributing -If you have a question or need support using `gtfs-via-postgres`, please double-check your code and setup first. If you think you have found a bug or want to propose a feature, use [the issues page](https://github.com/public-transport/gtfs-via-postgres/issues). +If you have a question or need support using `gtfs-via-duckdb`, please double-check your code and setup first. If you think you have found a bug or want to propose a feature, use [the issues page](https://github.com/public-transport/gtfs-via-duckdb/issues). By contributing, you agree to release your modifications under the [Apache 2.0 license](LICENSE-APACHE). 
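Following up on the readme's "Querying stops by location efficiently" section above, a minimal nearby-stops lookup might look like the following sketch. It assumes that `stops.stop_loc` is a PostGIS geography point (adjust the cast if your import uses geometry) and uses an arbitrary example coordinate.

```sql
-- the 5 stops closest to a given WGS84 coordinate; with
-- --stops-location-index, the spatial index on stop_loc is used
SELECT stop_id, stop_name
FROM stops
ORDER BY stop_loc <-> 'SRID=4326;POINT(13.4050 52.5200)'::geography
LIMIT 5
```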
From c5d91e5926d17c435859f8174cffa6ea604452a8 Mon Sep 17 00:00:00 2001 From: Jannis R Date: Fri, 27 Jun 2025 11:16:48 +0200 Subject: [PATCH 06/16] =?UTF-8?q?DuckDB=20rewrite:=20require=20Node.js=20v?= =?UTF-8?q?22=20=F0=9F=92=A5=F0=9F=92=9A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/publish.yml | 2 +- .github/workflows/smoke-test.yml | 2 +- .github/workflows/test.yml | 1 - package.json | 2 +- 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 342e01b..faf245d 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -21,7 +21,7 @@ jobs: - name: setup Node uses: actions/setup-node@v4 with: - node-version: 20.x + node-version: 22.x - run: npm install - name: build binaries diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml index 03c5ae4..e870bbe 100644 --- a/.github/workflows/smoke-test.yml +++ b/.github/workflows/smoke-test.yml @@ -30,7 +30,7 @@ jobs: - name: setup Node uses: actions/setup-node@v1 with: - node-version: '20.x' + node-version: '22.x' - name: install sponge (moreutils) run: sudo apt install -y moreutils diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5b5fc7a..daf5025 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -17,7 +17,6 @@ jobs: strategy: matrix: node-version: - - '20.x' - '22.x' - '24.x' postgis-docker-tag: diff --git a/package.json b/package.json index ab67dd6..34120df 100644 --- a/package.json +++ b/package.json @@ -54,7 +54,7 @@ } ], "engines": { - "node": ">=20.17" + "node": ">=22" }, "dependencies": { "csv-stringify": "^6.2.0", From 163fcd69b86a670c00429f3394a5d905644cf682 Mon Sep 17 00:00:00 2001 From: Jannis R Date: Fri, 27 Jun 2025 13:44:47 +0200 Subject: [PATCH 07/16] =?UTF-8?q?DuckDB=20rewrite:=20use=20ISO=208601=20ti?= =?UTF-8?q?mestamps=20including=20seconds=20=F0=9F=93=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- benchmark/arrs_deps_by_route_name_and_time.sql | 6 +++--- benchmark/arrs_deps_by_station_and_time.sql | 6 +++--- .../arrs_deps_by_station_and_time_seq_0.sql | 6 +++--- benchmark/arrs_deps_by_stop_and_time.sql | 6 +++--- benchmark/arrs_deps_by_time.sql | 6 +++--- benchmark/arrs_deps_by_time_manual.sql | 2 +- benchmark/connections_by_route_name_and_time.sql | 6 +++--- benchmark/connections_by_station_and_time.sql | 6 +++--- .../connections_by_station_and_time_seq_0.sql | 6 +++--- benchmark/connections_by_stop_and_time.sql | 6 +++--- benchmark/connections_by_time.sql | 6 +++--- benchmark/connections_by_time_manual.sql | 2 +- example.sh | 4 ++-- readme.md | 16 ++++++++-------- test/amtrak-gtfs-2021-10-06.sh | 10 +++++----- test/sample-gtfs-feed.sh | 2 +- 16 files changed, 48 insertions(+), 48 deletions(-) diff --git a/benchmark/arrs_deps_by_route_name_and_time.sql b/benchmark/arrs_deps_by_route_name_and_time.sql index 360f77f..a669107 100644 --- a/benchmark/arrs_deps_by_route_name_and_time.sql +++ b/benchmark/arrs_deps_by_route_name_and_time.sql @@ -1,6 +1,6 @@ SELECT * FROM arrivals_departures WHERE route_short_name = 'S1' -AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02' -AND date >= dates_filter_min('2022-08-09T07:10+02') -AND date <= dates_filter_max('2022-08-09T07:30+02') +AND t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' +AND date >= 
dates_filter_min('2022-08-09T07:10:00+02') +AND date <= dates_filter_max('2022-08-09T07:30:00+02') diff --git a/benchmark/arrs_deps_by_station_and_time.sql b/benchmark/arrs_deps_by_station_and_time.sql index b297b68..a73d5f7 100644 --- a/benchmark/arrs_deps_by_station_and_time.sql +++ b/benchmark/arrs_deps_by_station_and_time.sql @@ -1,6 +1,6 @@ SELECT * FROM arrivals_departures WHERE station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin) -AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02' -AND date >= dates_filter_min('2022-08-09T07:10+02') -AND date <= dates_filter_max('2022-08-09T07:30+02') +AND t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' +AND date >= dates_filter_min('2022-08-09T07:10:00+02') +AND date <= dates_filter_max('2022-08-09T07:30:00+02') diff --git a/benchmark/arrs_deps_by_station_and_time_seq_0.sql b/benchmark/arrs_deps_by_station_and_time_seq_0.sql index 2a2a20d..0b02f35 100644 --- a/benchmark/arrs_deps_by_station_and_time_seq_0.sql +++ b/benchmark/arrs_deps_by_station_and_time_seq_0.sql @@ -1,7 +1,7 @@ SELECT * FROM arrivals_departures WHERE station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin) -AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02' -AND date >= dates_filter_min('2022-08-09T07:10+02') -AND date <= dates_filter_max('2022-08-09T07:30+02') +AND t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' +AND date >= dates_filter_min('2022-08-09T07:10:00+02') +AND date <= dates_filter_max('2022-08-09T07:30:00+02') AND stop_sequence = 0 diff --git a/benchmark/arrs_deps_by_stop_and_time.sql b/benchmark/arrs_deps_by_stop_and_time.sql index 5b26ff6..26b4068 100644 --- a/benchmark/arrs_deps_by_stop_and_time.sql +++ b/benchmark/arrs_deps_by_stop_and_time.sql @@ -1,6 +1,6 @@ SELECT * FROM arrivals_departures WHERE stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. 
(Berlin) -AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02' -AND date >= dates_filter_min('2022-08-09T07:10+02') -AND date <= dates_filter_max('2022-08-09T07:30+02') +AND t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' +AND date >= dates_filter_min('2022-08-09T07:10:00+02') +AND date <= dates_filter_max('2022-08-09T07:30:00+02') diff --git a/benchmark/arrs_deps_by_time.sql b/benchmark/arrs_deps_by_time.sql index 1d01275..99d84f1 100644 --- a/benchmark/arrs_deps_by_time.sql +++ b/benchmark/arrs_deps_by_time.sql @@ -1,5 +1,5 @@ SELECT * FROM arrivals_departures -WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02' -AND date >= dates_filter_min('2022-08-09T07:10+02'::timestamp with time zone) -AND date <= dates_filter_max('2022-08-09T07:30+02'::timestamp with time zone) +WHERE t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' +AND date >= dates_filter_min('2022-08-09T07:10:00+02'::timestamp with time zone) +AND date <= dates_filter_max('2022-08-09T07:30:00+02'::timestamp with time zone) diff --git a/benchmark/arrs_deps_by_time_manual.sql b/benchmark/arrs_deps_by_time_manual.sql index 5c4dada..1762bac 100644 --- a/benchmark/arrs_deps_by_time_manual.sql +++ b/benchmark/arrs_deps_by_time_manual.sql @@ -1,5 +1,5 @@ SELECT * FROM arrivals_departures -WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02' +WHERE t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' AND date >= '2022-08-08' AND date <= '2022-08-09' diff --git a/benchmark/connections_by_route_name_and_time.sql b/benchmark/connections_by_route_name_and_time.sql index ca5bcc0..69fa862 100644 --- a/benchmark/connections_by_route_name_and_time.sql +++ b/benchmark/connections_by_route_name_and_time.sql @@ -1,6 +1,6 @@ SELECT * FROM connections WHERE route_short_name = 'S1' -AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02' -AND date >= dates_filter_min('2022-08-09T07:10+02') -AND date <= dates_filter_max('2022-08-09T07:30+02') +AND t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' +AND date >= dates_filter_min('2022-08-09T07:10:00+02') +AND date <= dates_filter_max('2022-08-09T07:30:00+02') diff --git a/benchmark/connections_by_station_and_time.sql b/benchmark/connections_by_station_and_time.sql index 861108e..769efb5 100644 --- a/benchmark/connections_by_station_and_time.sql +++ b/benchmark/connections_by_station_and_time.sql @@ -1,6 +1,6 @@ SELECT * FROM connections WHERE from_station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin) -AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02' -AND date >= dates_filter_min('2022-08-09T07:10+02') -AND date <= dates_filter_max('2022-08-09T07:30+02') +AND t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' +AND date >= dates_filter_min('2022-08-09T07:10:00+02') +AND date <= dates_filter_max('2022-08-09T07:30:00+02') diff --git a/benchmark/connections_by_station_and_time_seq_0.sql b/benchmark/connections_by_station_and_time_seq_0.sql index 7eaa73d..8a42b53 100644 --- a/benchmark/connections_by_station_and_time_seq_0.sql +++ b/benchmark/connections_by_station_and_time_seq_0.sql @@ -1,7 +1,7 @@ SELECT * FROM connections WHERE from_station_id = 'de:11000:900100001' -- S+U Friedrichstr. 
(Berlin) -AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02' -AND date >= dates_filter_min('2022-08-09T07:10+02') -AND date <= dates_filter_max('2022-08-09T07:30+02') +AND t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' +AND date >= dates_filter_min('2022-08-09T07:10:00+02') +AND date <= dates_filter_max('2022-08-09T07:30:00+02') AND from_stop_sequence = 0 diff --git a/benchmark/connections_by_stop_and_time.sql b/benchmark/connections_by_stop_and_time.sql index 7baf415..c4bbfc1 100644 --- a/benchmark/connections_by_stop_and_time.sql +++ b/benchmark/connections_by_stop_and_time.sql @@ -1,6 +1,6 @@ SELECT * FROM connections WHERE from_stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin) -AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02' -AND date >= dates_filter_min('2022-08-09T07:10+02') -AND date <= dates_filter_max('2022-08-09T07:30+02') +AND t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' +AND date >= dates_filter_min('2022-08-09T07:10:00+02') +AND date <= dates_filter_max('2022-08-09T07:30:00+02') diff --git a/benchmark/connections_by_time.sql b/benchmark/connections_by_time.sql index de4dff1..403cac9 100644 --- a/benchmark/connections_by_time.sql +++ b/benchmark/connections_by_time.sql @@ -1,7 +1,7 @@ SELECT * FROM connections -WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02' -AND date >= dates_filter_min('2022-08-09T07:10+02'::timestamp with time zone) -AND date <= dates_filter_max('2022-08-09T07:30+02'::timestamp with time zone) +WHERE t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' +AND date >= dates_filter_min('2022-08-09T07:10:00+02'::timestamp with time zone) +AND date <= dates_filter_max('2022-08-09T07:30:00+02'::timestamp with time zone) ORDER BY t_departure LIMIT 100 diff --git a/benchmark/connections_by_time_manual.sql b/benchmark/connections_by_time_manual.sql index c483d02..5372029 100644 --- a/benchmark/connections_by_time_manual.sql +++ b/benchmark/connections_by_time_manual.sql @@ -1,6 +1,6 @@ SELECT * FROM connections -WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02' +WHERE t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' AND date >= '2022-08-08' AND date <= '2022-08-09' ORDER BY t_departure diff --git a/example.sh b/example.sh index 2dd8368..bfef2d0 100755 --- a/example.sh +++ b/example.sh @@ -18,7 +18,7 @@ psql -c "$(cat <<- EOM to_stop_id, t_arrival FROM connections WHERE trip_id = 'during-dst-1' - AND t_departure > '2019-03-31T01:55+01' AND t_departure < '2019-03-31T03:00+02' + AND t_departure > '2019-03-31T01:55:00+01:00' AND t_departure < '2019-03-31T03:00:00+02:00' -- AND route_id = 'D' -- AND from_stop_id = 'airport' EOM)" @@ -31,7 +31,7 @@ psql -c "$(cat <<- EOM stop_sequence FROM arrivals_departures WHERE trip_id = 'during-dst-1' - AND t_departure > '2019-03-31T01:55+01' AND t_departure < '2019-03-31T03:00+02' + AND t_departure > '2019-03-31T01:55:00+01:00' AND t_departure < '2019-03-31T03:00:00+02:00' -- AND route_id = 'D' -- AND stop_id = 'airport' EOM)" diff --git a/readme.md b/readme.md index 2f72696..913aad0 100644 --- a/readme.md +++ b/readme.md @@ -101,13 +101,13 @@ In addition to a table for each GTFS file, `gtfs-via-duckdb` adds these views to - `stats_by_agency_route_stop_hour` provides the number of arrivals/departures by agency ID, route ID, stop ID & hour. 
– [read more](docs/analysis/feed-by-agency-route-stop-and-hour.md) - In contrast to `stats_by_route_date` & `stats_by_agency_route_stop_hour`, `stats_active_trips_by_hour` provides the number of *currently running* trips for each hour in the feeds period of time. -As an example, we're going to use the `arrivals_departures` view to query all *absolute* departures at `de:11000:900120003` (*S Ostkreuz Bhf (Berlin)*) between `2022-03-23T12:30+01` and `2022-03-23T12:35+01`: +As an example, we're going to use the `arrivals_departures` view to query all *absolute* departures at `de:11000:900120003` (*S Ostkreuz Bhf (Berlin)*) between `2022-03-23T12:30:00+01` and `2022-03-23T12:35:00+01`: ```sql SELECT * FROM arrivals_departures WHERE station_id = 'de:11000:900120003' -AND t_departure >= '2022-03-23T12:30+01' AND t_departure <= '2022-03-23T12:35+01' +AND t_departure >= '2022-03-23T12:30:00+01' AND t_departure <= '2022-03-23T12:35:00+01' ``` `route_id` | `route_short_name` | `route_type` | `trip_id` | `date` | `stop_sequence` | `t_arrival` | `t_departure` | `stop_id` | `stop_name` | `station_id` | `station_name` @@ -275,14 +275,14 @@ This means that, in order to determine all *absolute* points in time where a par Let's consider two examples: -- A `departure_time` of `26:59:00` with a trip running on `2021-03-01`: The time, applied to this specific date, "extends" into the following day, so it actually departs at `2021-03-02T02:59+01`. -- A departure time of `03:01:00` with a trip running on `2021-03-28`: This is when the standard -> DST switch happens in the `Europe/Berlin` timezone. Because the dep. time refers to noon - 12h (*not* to midnight), it actually happens at `2021-03-28T03:01+02` which is *not* `3h1m` after `2021-03-28T00:00+01`. +- A `departure_time` of `26:59:00` with a trip running on `2021-03-01`: The time, applied to this specific date, "extends" into the following day, so it actually departs at `2021-03-02T02:59:00+01`. +- A departure time of `03:01:00` with a trip running on `2021-03-28`: This is when the standard -> DST switch happens in the `Europe/Berlin` timezone. Because the dep. time refers to noon - 12h (*not* to midnight), it actually happens at `2021-03-28T03:01:00+02` which is *not* `3h1m` after `2021-03-28T00:00:00+01`. `gtfs-via-duckdb` always prioritizes correctness over speed. Because it follows the GTFS semantics, when filtering `arrivals_departures` by *absolute* departure date+time, it cannot automatically filter `service_days` (which is `calendar` and `calendar_dates` combined), because **even a date *before* the date of the desired departure time frame might still end up *within*, when combined with a `departure_time` of e.g. `27:30:00`**; Instead, it has to consider all `service_days` and apply the `departure_time` to all of them to check if they're within the range. However, if you determine your feed's largest `arrival_time`/`departure_time`, you can filter on `date` when querying `arrivals_departures`; This allows PostgreSQL to reduce the number of joins and calendar calculations by orders of magnitude, speeding up your queries significantly. `gtfs-via-postgres` provides two low-level helper functions `largest_arrival_time()` & `largest_departure_time()` for this, as well as two high-level helper functions `dates_filter_min(t_min)` & `dates_filter_max(t_max)` (see below). 
-For example, when querying all *absolute* departures at `de:11000:900120003` (*S Ostkreuz Bhf (Berlin)*) between `2022-03-23T12:30+01` and `2022-03-23T12:35+01` within the [2022-02-25 *VBB* feed](https://vbb-gtfs.jannisr.de/2022-02-25/), filtering by `date` speeds it up nicely (Apple M1, PostgreSQL 14.2): +For example, when querying all *absolute* departures at `de:11000:900120003` (*S Ostkreuz Bhf (Berlin)*) between `2022-03-23T12:30:00+01` and `2022-03-23T12:35:00+01` within the [2022-02-25 *VBB* feed](https://vbb-gtfs.jannisr.de/2022-02-25/), filtering by `date` speeds it up nicely (Apple M1, PostgreSQL 14.2): `station_id` filter | `date` filter | query time | nr of results -|-|-|- @@ -300,10 +300,10 @@ Using `dates_filter_min(t_min)` & `dates_filter_max(t_max)`, we can easily filte SELECT * FROM arrivals_departures -- filter by absolute departure date+time -WHERE t_departure >= '2022-03-23T12:30+01' AND t_departure <= '2022-03-23T12:35+01' +WHERE t_departure >= '2022-03-23T12:30:00+01' AND t_departure <= '2022-03-23T12:35:00+01' -- allow "cutoffs" by filtering by date -AND "date" >= dates_filter_min('2022-03-23T12:30+01') -- evaluates to 2023-03-22 -AND "date" <= dates_filter_max('2022-03-23T12:35+01') -- evaluates to 2023-03-23 +AND "date" >= dates_filter_min('2022-03-23T12:30:00+01') -- evaluates to 2023-03-22 +AND "date" <= dates_filter_max('2022-03-23T12:35:00+01') -- evaluates to 2023-03-23 ``` diff --git a/test/amtrak-gtfs-2021-10-06.sh b/test/amtrak-gtfs-2021-10-06.sh index 67df57d..79ef49b 100755 --- a/test/amtrak-gtfs-2021-10-06.sh +++ b/test/amtrak-gtfs-2021-10-06.sh @@ -76,7 +76,7 @@ SELECT nr_of_arrs FROM amtrak.stats_by_agency_route_stop_hour WHERE route_id = '40751' -- Acela AND stop_id = 'PHL' -- Philadelphia -AND effective_hour = '2022-07-24T09:00-05' +AND effective_hour = '2022-07-24 09:00:00-05:00' EOF ) acelaPhillyStat=$(psql --csv -t -c "$acelaPhillyStatQuery" | tail -n 1) @@ -87,16 +87,16 @@ fi nrOfActiveTripsQuery=$(cat << EOF SELECT nr_of_active_trips -FROM amtrak.stats_active_trips_by_hour -WHERE "hour" = '2021-11-26T04:00-05' +FROM stats_active_trips_by_hour +WHERE "hour" = '2021-11-26 04:00:00-05:00' EOF ) # Note: I'm not sure if 127 is correct, but it is in the right ballpark. 🙈 # The following query yields 150 connections, and it doesn't contain those who depart earlier and arrive later. # SELECT DISTINCT ON (trip_id) * # FROM amtrak.connections -# WHERE t_departure >= '2021-11-26T02:00-05' -# AND t_arrival <= '2021-11-26T06:00-05' +# WHERE t_departure >= '2021-11-26 02:00:00-05:00' +# AND t_arrival <= '2021-11-26 06:00:00-05:00' nrOfActiveTrips=$(psql --csv -t -c "$nrOfActiveTripsQuery" | tail -n 1) if [[ "$nrOfActiveTrips" != "127" ]]; then echo "unexpected no. 
of active trips at 2021-11-26T04:00-05: $nrOfActiveTrips" 1>&2 diff --git a/test/sample-gtfs-feed.sh b/test/sample-gtfs-feed.sh index a368ee6..59a4be4 100755 --- a/test/sample-gtfs-feed.sh +++ b/test/sample-gtfs-feed.sh @@ -93,7 +93,7 @@ connection_during_dst=$(cat << EOF extract(epoch from t_departure)::integer as dep FROM connections WHERE trip_id = 'during-dst-1' - AND t_departure = '2019-03-31T01:58+01' + AND t_departure = '2019-03-31T01:58:00+01:00' EOF ) dst1=$(psql --csv -t -c "$connection_during_dst" | head -n 1) From f0a724d7b06f5a9b891b1356c08a5833bc29a306 Mon Sep 17 00:00:00 2001 From: Jannis R Date: Fri, 27 Jun 2025 14:06:13 +0200 Subject: [PATCH 08/16] =?UTF-8?q?tests:=20don't=20depend=20on=20an=20impli?= =?UTF-8?q?cit=20row=20order=20=E2=9C=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/sample-gtfs-feed.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/sample-gtfs-feed.sh b/test/sample-gtfs-feed.sh index 59a4be4..8834739 100755 --- a/test/sample-gtfs-feed.sh +++ b/test/sample-gtfs-feed.sh @@ -229,6 +229,7 @@ frequencies_it_query=$(cat << EOF SELECT t_departure, stop_sequence, stop_id, frequencies_it FROM arrivals_departures WHERE trip_id = 'b-downtown-on-working-days' AND "date" = '2019-05-29' AND frequencies_it = 3 +ORDER BY t_departure EOF ) frequencies_it_rows="$(psql --csv -t -c "$frequencies_it_query")" @@ -273,6 +274,7 @@ SELECT stop_url, stop_url_lang FROM stops_translated WHERE stop_id LIKE 'airport%' +ORDER BY stop_id, stop_name_lang, stop_desc_lang EOF ) stops_translated_rows="$(psql --csv -t -c "$stops_translated_query")" From a0de79dc3cc295f3ef65e148c774b41b986525e5 Mon Sep 17 00:00:00 2001 From: Jannis R Date: Fri, 27 Jun 2025 14:10:22 +0200 Subject: [PATCH 09/16] =?UTF-8?q?DuckDB=20rewrite:=20remove=20non-default?= =?UTF-8?q?=20schema=20support=20=F0=9F=92=A5=F0=9F=93=9D=E2=9C=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cli.js | 10 -- docs/import-metadata.md | 2 +- docs/multiple-datasets.md | 34 +------ index.js | 46 +-------- lib/agency.js | 6 +- lib/calendar.js | 22 ++--- lib/calendar_dates.js | 14 +-- lib/feed_info.js | 8 +- lib/frequencies.js | 20 ++-- lib/import_metadata.js | 6 +- lib/levels.js | 4 +- lib/pathways.js | 14 +-- lib/prerequisites.js | 12 +-- lib/routes.js | 14 +-- lib/service_days.js | 16 ++-- lib/shapes.js | 12 +-- lib/stats_active_trips_by_hour.js | 28 +++--- lib/stats_by_agency_route_stop_hour.js | 12 +-- lib/stats_by_route_date.js | 14 +-- lib/stop_times.js | 128 ++++++++++++------------- lib/stops.js | 28 +++--- lib/transfers.js | 24 ++--- lib/translations.js | 122 +++++++++++------------ lib/trips.js | 24 ++--- readme.md | 6 -- test/amtrak-gtfs-2021-10-06.sh | 8 +- test/index.sh | 1 - test/multiple-schemas.sh | 79 --------------- 28 files changed, 271 insertions(+), 443 deletions(-) delete mode 100755 test/multiple-schemas.sh diff --git a/cli.js b/cli.js index 1e44729..32d3587 100755 --- a/cli.js +++ b/cli.js @@ -56,9 +56,6 @@ const { 'stats-active-trips-by-hour': { type: 'string', }, - 'schema': { - type: 'string', - }, 'import-metadata': { type: 'boolean', } @@ -109,12 +106,6 @@ Options: currently running trips over time, by hour. Like --stats-by-route-date, this flag accepts none, view & materialized-view. - --schema The schema to use for the database. 
Default: public - Even when importing into a schema other than \`public\`, - a function \`public.gtfs_via_postgres_import_version()\` - gets created, to ensure that multiple imports into the - same database are all made using the same version. See - also multiple-datasets.md in the docs. --import-metadata Create functions returning import metadata: - gtfs_data_imported_at (timestamp with time zone) - gtfs_via_postgres_version (text) @@ -155,7 +146,6 @@ const opt = { statsByRouteIdAndDate: flags['stats-by-route-date'] || 'none', statsByAgencyIdAndRouteIdAndStopAndHour: flags['stats-by-agency-route-stop-hour'] || 'none', statsActiveTripsByHour: flags['stats-active-trips-by-hour'] || 'none', - schema: flags['schema'] || 'public', importMetadata: !!flags['import-metadata'], } if ('stops-without-level-id' in flags) { diff --git a/docs/import-metadata.md b/docs/import-metadata.md index bc016ad..03df3e2 100644 --- a/docs/import-metadata.md +++ b/docs/import-metadata.md @@ -12,7 +12,7 @@ SELECT gtfs_via_postgres_version() -- 4.5.3 SELECT gtfs_via_postgres_options() --- {"schema": "public", "silent": false, "importStart": 1681417454781, "importMetadata": true, … } +-- {"silent": false, "importStart": 1681417454781, "importMetadata": true, … } SELECT (gtfs_via_postgres_options())['tripsWithoutShapeId'] -- true ``` diff --git a/docs/multiple-datasets.md b/docs/multiple-datasets.md index 608998d..40e533d 100644 --- a/docs/multiple-datasets.md +++ b/docs/multiple-datasets.md @@ -1,35 +1,3 @@ # working with multiple datasets -Using `gtfs-via-postgres`, you can import more than one dataset into a single PostgreSQL database by importing them into separate [schemas](https://www.postgresql.org/docs/14/ddl-schemas.html). You can then run queries combine or compare data from them. - -As an example, let's compare two datasets from [Paris](https://en.wikipedia.org/wiki/Île-de-France_Mobilités) and [Berlin](https://en.wikipedia.org/wiki/Verkehrsverbund_Berlin-Brandenburg). - -```shell -wget -U 'gtfs-via-duckdb demo' -O paris.gtfs.zip 'https://eu.ftp.opendatasoft.com/stif/GTFS/IDFM-gtfs.zip' -unzip -d paris.gtfs paris.gtfs.zip -gtfs-to-sql --require-dependencies \ - --schema paris -- paris.gtfs/*.txt \ - | sponge | psql -b - -wget -U 'gtfs-via-duckdb demo' -O berlin.gtfs.zip 'https://www.vbb.de/vbbgtfs' -unzip -d berlin.gtfs berlin.gtfs.zip -gtfs-to-sql --require-dependencies \ - --schema berlin -- berlin.gtfs/*.txt \ - | sponge | psql -b -``` - -We can now do queries across both datasets, for example finding the geographically furthest 2 stops: - -```sql --- warning: takes a long time to compute! -SELECT - paris.stop_id AS paris_stop_id, - berlin.stop_id AS berlin_stop_id -FROM - paris.stops paris, - berlin.stops berlin -ORDER BY paris.stop_loc <-> berlin.stop_loc DESC -LIMIT 100 -``` - -*Note:* During an import, a function `public.gtfs_via_postgres_import_version()` gets created that returns `gtfs-via-postgres`'s version. If that function already exists (because it has been created by a previous import), its return value is compared to `gtfs-via-postgres`'s version, and if these two versions are not equal, the second import will fail. This ensures that multiple imports into the same database can only be made using the exact same `gtfs-via-postgres` version. +Using `gtfs-via-postgres`, it is currently *not possible* to import more than one dataset into a single PostgreSQL database. 
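With non-default schemas gone and one dataset per database, the import-metadata functions mentioned above (created only if `--import-metadata` is passed) are one way to tell which import a database holds. A small usage sketch, reusing only the functions documented in `docs/import-metadata.md`; the jsonb subscript syntax follows the example there:

```sql
SELECT
  gtfs_via_postgres_version() AS imported_with_version,
  gtfs_data_imported_at() AS imported_at,
  (gtfs_via_postgres_options())['importMetadata'] AS import_metadata
```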
diff --git a/index.js b/index.js index 400fa71..f169d2f 100644 --- a/index.js +++ b/index.js @@ -25,7 +25,6 @@ const convertGtfsToSql = async function* (files, opt = {}) { statsByRouteIdAndDate: 'none', statsByAgencyIdAndRouteIdAndStopAndHour: 'none', statsActiveTripsByHour: 'none', - schema: 'public', importMetadata: false, ...opt, } @@ -142,52 +141,9 @@ ${inspect(opt, {compact: false}).split('\n').map(line => '-- ' + line).join('\n' \\set ON_ERROR_STOP on CREATE EXTENSION IF NOT EXISTS postgis; -${opt.schema !== 'public' ? `CREATE SCHEMA IF NOT EXISTS "${opt.schema}";` : ''} BEGIN; --- gtfs-via-postgres supports importing >1 GTFS datasets into 1 DB, each dataset within its own schema. See https://github.com/public-transport/gtfs-via-postgres/issues/51 for more information. --- Because almost all helper utilities (enums, functions, etc.) are schema-specific, they get imported more than once. In order to prevent subtle bugs due to incompatibilities among two schemas imported by different gtfs-via-postgres versions, we mock a "mutex" here by checking for public.gtfs_via_postgres_import_version()'s return value. - --- todo: this can be done more elegantly: just a "DO" block, "ASSERT" that the version matches, create gtfs_via_postgres_import_version() in the "EXCEPTION" block -CREATE FUNCTION pg_temp.get_gtfs_via_postgres_import_version() -RETURNS TEXT -AS $$ - DECLARE - res TEXT; - BEGIN - SELECT public.gtfs_via_postgres_import_version() INTO res; - RETURN res; - EXCEPTION - WHEN undefined_function THEN - -- do nothing, silence error - RETURN NULL; - END; -$$ -LANGUAGE plpgsql; - -DO $$ -BEGIN - IF EXISTS ( - SELECT version - FROM ( - SELECT pg_temp.get_gtfs_via_postgres_import_version() AS version - ) t - WHERE version != '${pkg.version}' - ) THEN - RAISE EXCEPTION 'existing GTFS data imported with an incompatible version of gtfs-via-postgres'; - END IF; -END -$$ -LANGUAGE plpgsql; - -CREATE OR REPLACE FUNCTION public.gtfs_via_postgres_import_version() -RETURNS TEXT -AS $$ - SELECT '${pkg.version}' -$$ -LANGUAGE sql; - -\n` +` const csv = new Stringifier({quoted: true}) const nrOfRowsByName = new Map() diff --git a/lib/agency.js b/lib/agency.js index 450230c..832987e 100644 --- a/lib/agency.js +++ b/lib/agency.js @@ -2,19 +2,19 @@ // https://gtfs.org/documentation/schedule/reference/#agencytxt const beforeAll = (opt) => `\ -CREATE TABLE "${opt.schema}".agency ( +CREATE TABLE agency ( agency_id TEXT PRIMARY KEY, agency_name TEXT NOT NULL, agency_url TEXT NOT NULL, agency_timezone TEXT NOT NULL - CONSTRAINT valid_timezone CHECK ("${opt.schema}".is_timezone(agency_timezone)), + CONSTRAINT valid_timezone CHECK (is_timezone(agency_timezone)), agency_lang TEXT, -- todo: validate? agency_phone TEXT, agency_fare_url TEXT, agency_email TEXT ); -COPY "${opt.schema}".agency ( +COPY agency ( agency_id, agency_name, agency_url, diff --git a/lib/calendar.js b/lib/calendar.js index 3727eca..f06be15 100644 --- a/lib/calendar.js +++ b/lib/calendar.js @@ -2,26 +2,26 @@ // https://gtfs.org/documentation/schedule/reference/#calendartxt const beforeAll = (opt) => `\ -CREATE TYPE "${opt.schema}".availability AS ENUM ( +CREATE TYPE availability AS ENUM ( 'not_available' -- 0 – Service is not available for Mondays in the date range. , 'available' -- 1 – Service is available for all Mondays in the date range. 
); -CREATE CAST ("${opt.schema}".availability AS text) WITH INOUT AS IMPLICIT; +CREATE CAST (availability AS text) WITH INOUT AS IMPLICIT; -CREATE TABLE "${opt.schema}".calendar ( +CREATE TABLE calendar ( service_id TEXT PRIMARY KEY, - monday "${opt.schema}".availability NOT NULL, - tuesday "${opt.schema}".availability NOT NULL, - wednesday "${opt.schema}".availability NOT NULL, - thursday "${opt.schema}".availability NOT NULL, - friday "${opt.schema}".availability NOT NULL, - saturday "${opt.schema}".availability NOT NULL, - sunday "${opt.schema}".availability NOT NULL, + monday availability NOT NULL, + tuesday availability NOT NULL, + wednesday availability NOT NULL, + thursday availability NOT NULL, + friday availability NOT NULL, + saturday availability NOT NULL, + sunday availability NOT NULL, start_date DATE NOT NULL, end_date DATE NOT NULL ); -COPY "${opt.schema}".calendar ( +COPY calendar ( service_id, monday, tuesday, diff --git a/lib/calendar_dates.js b/lib/calendar_dates.js index 924174f..5ee4deb 100644 --- a/lib/calendar_dates.js +++ b/lib/calendar_dates.js @@ -2,20 +2,20 @@ // https://gtfs.org/documentation/schedule/reference/#calendar_datestxt const beforeAll = (opt) => `\ -CREATE TYPE "${opt.schema}".exception_type_v AS ENUM ( +CREATE TYPE exception_type_v AS ENUM ( 'added' -- 1 – Service has been added for the specified date. , 'removed' -- 2 – Service has been removed for the specified date. ); -CREATE CAST ("${opt.schema}".exception_type_v AS text) WITH INOUT AS IMPLICIT; +CREATE CAST (exception_type_v AS text) WITH INOUT AS IMPLICIT; -CREATE TABLE "${opt.schema}".calendar_dates ( +CREATE TABLE calendar_dates ( service_id TEXT NOT NULL, "date" DATE NOT NULL, PRIMARY KEY (service_id, "date"), - exception_type "${opt.schema}".exception_type_v NOT NULL + exception_type exception_type_v NOT NULL ); -COPY "${opt.schema}".calendar_dates ( +COPY calendar_dates ( service_id, date, exception_type @@ -39,8 +39,8 @@ const formatCalendarDatesRow = (e) => { const afterAll = (opt) => `\ \\. 
-CREATE INDEX ON "${opt.schema}".calendar_dates (service_id); -CREATE INDEX ON "${opt.schema}".calendar_dates (exception_type); +CREATE INDEX ON calendar_dates (service_id); +CREATE INDEX ON calendar_dates (exception_type); ` module.exports = { diff --git a/lib/feed_info.js b/lib/feed_info.js index 6cc1168..ad08de1 100644 --- a/lib/feed_info.js +++ b/lib/feed_info.js @@ -6,16 +6,16 @@ const beforeAll = (opt) => `\ -- https://github.com/MobilityData/gtfs-validator/blob/31ff374800f7d7883fd9de91b71049c2a4de4e45/main/src/main/java/org/mobilitydata/gtfsvalidator/validator/MatchingFeedAndAgencyLangValidator.java#L82 -- https://docs.oracle.com/javase/7/docs/api/java/util/Locale.html -- related: https://github.com/google/transit/pull/98 -CREATE TABLE "${opt.schema}".feed_info ( +CREATE TABLE feed_info ( feed_publisher_name TEXT PRIMARY KEY, feed_publisher_url TEXT NOT NULL, feed_lang TEXT NOT NULL CONSTRAINT valid_feed_lang CHECK ( - "${opt.schema}".is_valid_lang_code(feed_lang) + is_valid_lang_code(feed_lang) ), default_lang TEXT CONSTRAINT valid_default_lang CHECK ( - default_lang IS NULL OR "${opt.schema}".is_valid_lang_code(default_lang) + default_lang IS NULL OR is_valid_lang_code(default_lang) ), feed_start_date DATE, feed_end_date DATE, @@ -24,7 +24,7 @@ CREATE TABLE "${opt.schema}".feed_info ( feed_contact_url TEXT ); -COPY "${opt.schema}".feed_info ( +COPY feed_info ( feed_publisher_name, feed_publisher_url, feed_lang, diff --git a/lib/frequencies.js b/lib/frequencies.js index 39f7d33..ba0663a 100644 --- a/lib/frequencies.js +++ b/lib/frequencies.js @@ -4,21 +4,21 @@ const {formatTime} = require('./util') // https://gtfs.org/documentation/schedule/reference/#frequenciestxt const beforeAll = (opt) => `\ -CREATE TYPE "${opt.schema}".exact_times_v AS ENUM ( +CREATE TYPE exact_times_v AS ENUM ( 'frequency_based' -- 0 or empty - Frequency-based trips. , 'schedule_based' -- 1 – Schedule-based trips with the exact same headway throughout the day. In this case the end_time value must be greater than the last desired trip start_time but less than the last desired trip start_time + headway_secs. ); -CREATE CAST ("${opt.schema}".exact_times_v AS text) WITH INOUT AS IMPLICIT; +CREATE CAST (exact_times_v AS text) WITH INOUT AS IMPLICIT; -CREATE TABLE "${opt.schema}".frequencies ( +CREATE TABLE frequencies ( -- Used to implement arrivals_departures & connections. Filled after COPY-ing, see below. frequencies_row INTEGER, trip_id TEXT NOT NULL, - FOREIGN KEY (trip_id) REFERENCES "${opt.schema}".trips, + FOREIGN KEY (trip_id) REFERENCES trips, start_time INTERVAL NOT NULL, end_time INTERVAL NOT NULL, headway_secs INT NOT NULL, - exact_times "${opt.schema}".exact_times_v, + exact_times exact_times_v, -- frequencies' primary key currently is just (trip_id, start_time) -- see also https://github.com/google/transit/issues/514 -- todo: add primary key? @@ -31,7 +31,7 @@ CREATE TABLE "${opt.schema}".frequencies ( ) ); -COPY "${opt.schema}".frequencies ( +COPY frequencies ( trip_id, start_time, end_time, @@ -67,7 +67,7 @@ const afterAll = (opt) => `\ \\. -- frequencies_row is used to implement arrivals_departures & connections. -UPDATE "${opt.schema}".frequencies +UPDATE frequencies -- This is ugly, but AFAICT there is no cleaner way. 
-- see also https://stackoverflow.com/a/4359354/1072129 SET frequencies_row = t.frequencies_row @@ -76,7 +76,7 @@ FROM ( -- order by all columns so that we don't implicitly depend on the file's order (row_number() OVER (PARTITION BY trip_id, start_time ORDER BY end_time, headway_secs, exact_times))::integer AS frequencies_row, trip_id, start_time - FROM "${opt.schema}".frequencies + FROM frequencies ) AS t -- self-join -- frequencies' primary is just (trip_id, start_time) @@ -87,8 +87,8 @@ FROM ( WHERE frequencies.trip_id = t.trip_id AND frequencies.start_time = t.start_time; -CREATE INDEX ON "${opt.schema}".frequencies (trip_id); -CREATE INDEX ON "${opt.schema}".frequencies (exact_times); +CREATE INDEX ON frequencies (trip_id); +CREATE INDEX ON frequencies (exact_times); ` module.exports = { diff --git a/lib/import_metadata.js b/lib/import_metadata.js index c93e769..36bf8d8 100644 --- a/lib/import_metadata.js +++ b/lib/import_metadata.js @@ -8,19 +8,19 @@ const afterAll = (opt) => { // todo: escape properly return `\ -CREATE OR REPLACE FUNCTION "${opt.schema}".gtfs_data_imported_at () +CREATE OR REPLACE FUNCTION gtfs_data_imported_at () RETURNS TIMESTAMP WITH TIME ZONE AS $$ SELECT '${new Date(opt.importStart).toISOString()}'::timestamp with time zone; $$ LANGUAGE SQL IMMUTABLE; -CREATE OR REPLACE FUNCTION "${opt.schema}".gtfs_via_postgres_version () +CREATE OR REPLACE FUNCTION gtfs_via_postgres_version () RETURNS TEXT AS $$ SELECT '${pkg.version}'; $$ LANGUAGE SQL IMMUTABLE; -CREATE OR REPLACE FUNCTION "${opt.schema}".gtfs_via_postgres_options () +CREATE OR REPLACE FUNCTION gtfs_via_postgres_options () RETURNS jsonb AS $$ SELECT '${JSON.stringify(opt).replace(/'/g, `''`)}'::jsonb; diff --git a/lib/levels.js b/lib/levels.js index ae72df2..5e276b2 100644 --- a/lib/levels.js +++ b/lib/levels.js @@ -4,13 +4,13 @@ const {formatTime} = require('./util') // https://gtfs.org/documentation/schedule/reference/#levelstxt const beforeAll = (opt) => `\ -CREATE TABLE "${opt.schema}".levels ( +CREATE TABLE levels ( level_id TEXT PRIMARY KEY, level_index DOUBLE PRECISION NOT NULL, level_name TEXT ); -COPY "${opt.schema}".levels ( +COPY levels ( level_id, level_index, level_name diff --git a/lib/pathways.js b/lib/pathways.js index bb00834..a89dd9c 100644 --- a/lib/pathways.js +++ b/lib/pathways.js @@ -4,7 +4,7 @@ const {formatTime} = require('./util') // https://gtfs.org/documentation/schedule/reference/#pathwaystxt const beforeAll = (opt) => `\ -CREATE TYPE "${opt.schema}".pathway_mode_v AS ENUM ( +CREATE TYPE pathway_mode_v AS ENUM ( 'walkway' -- 1 , 'stairs' -- 2 , 'moving_sidewalk_travelator' -- 3 – moving sidewalk/travelator @@ -14,15 +14,15 @@ CREATE TYPE "${opt.schema}".pathway_mode_v AS ENUM ( -- Fare gates may either separate paid areas of the station from unpaid ones, or separate different payment areas within the same station from each other. This information can be used to avoid routing passengers through stations using shortcuts that would require passengers to make unnecessary payments, like directing a passenger to walk through a subway platform to reach a busway. , 'exit_gate' -- 7 – Indicates a pathway exiting an area where proof-of-payment is required into an area where proof-of-payment is no longer required. 
); -CREATE CAST ("${opt.schema}".pathway_mode_v AS text) WITH INOUT AS IMPLICIT; +CREATE CAST (pathway_mode_v AS text) WITH INOUT AS IMPLICIT; -CREATE TABLE "${opt.schema}".pathways ( +CREATE TABLE pathways ( pathway_id TEXT PRIMARY KEY, from_stop_id TEXT NOT NULL, - FOREIGN KEY (from_stop_id) REFERENCES "${opt.schema}".stops (stop_id), + FOREIGN KEY (from_stop_id) REFERENCES stops (stop_id), to_stop_id TEXT NOT NULL, - FOREIGN KEY (to_stop_id) REFERENCES "${opt.schema}".stops (stop_id), - pathway_mode "${opt.schema}".pathway_mode_v NOT NULL, + FOREIGN KEY (to_stop_id) REFERENCES stops (stop_id), + pathway_mode pathway_mode_v NOT NULL, is_bidirectional BOOLEAN NOT NULL, length DOUBLE PRECISION, -- todo: add non-negative constraint traversal_time INTEGER, -- todo: add positive constraint @@ -33,7 +33,7 @@ CREATE TABLE "${opt.schema}".pathways ( reversed_signposted_as TEXT ); -COPY "${opt.schema}".pathways ( +COPY pathways ( pathway_id, from_stop_id, to_stop_id, diff --git a/lib/prerequisites.js b/lib/prerequisites.js index 2688e44..69c0ee5 100644 --- a/lib/prerequisites.js +++ b/lib/prerequisites.js @@ -7,7 +7,7 @@ const is_valid_lang_code = { -- https://www.postgresql.org/docs/current/infoschema-collations.html -- https://www.postgresql.org/docs/current/catalog-pg-collation.html -- todo [breaking]: rename to e.g. is_similar_to_bcp_47_tag? -CREATE OR REPLACE FUNCTION "${opt.schema}".is_bcp_47_tag( +CREATE OR REPLACE FUNCTION is_bcp_47_tag( input TEXT ) RETURNS BOOLEAN @@ -23,13 +23,13 @@ AS $$ $$ language sql STABLE; -- todo [breaking]: remove -CREATE OR REPLACE FUNCTION "${opt.schema}".is_valid_lang_code( +CREATE OR REPLACE FUNCTION is_valid_lang_code( input TEXT ) RETURNS BOOLEAN AS $$ -- todo: see also https://github.com/MobilityData/gtfs-validator/issues/1987 - SELECT "${opt.schema}".is_bcp_47_tag(input); + SELECT is_bcp_47_tag(input); $$ language sql STABLE; `, @@ -37,7 +37,7 @@ $$ language sql STABLE; const is_timezone = { beforeAll: (opt) => `\ -- https://justatheory.com/2007/11/postgres-timezone-validation/ -CREATE OR REPLACE FUNCTION "${opt.schema}".is_timezone( +CREATE OR REPLACE FUNCTION is_timezone( tz TEXT ) RETURNS BOOLEAN @@ -56,14 +56,14 @@ $$ language plpgsql STABLE; } const shape_exists = { beforeAll: (opt) => `\ -CREATE OR REPLACE FUNCTION "${opt.schema}".shape_exists( +CREATE OR REPLACE FUNCTION shape_exists( some_shape_id TEXT ) RETURNS BOOLEAN AS $$ SELECT EXISTS ( SELECT shape_id - FROM "${opt.schema}".shapes + FROM shapes WHERE shape_id = some_shape_id LIMIT 1 ); diff --git a/lib/routes.js b/lib/routes.js index b2249ac..ee6b82b 100644 --- a/lib/routes.js +++ b/lib/routes.js @@ -240,7 +240,7 @@ const beforeAll = (opt) => { const extRouteTypes = routeTypesSchemes[opt.routeTypesScheme] return `\ -CREATE TYPE "${opt.schema}".route_type_val AS ENUM ( +CREATE TYPE route_type_val AS ENUM ( -- basic types '0' -- 0 – Tram, Streetcar, Light rail. Any light rail or street level system within a metropolitan area. , '1' -- 1 – Subway, Metro. Any underground rail system within a metropolitan area. @@ -256,24 +256,24 @@ CREATE TYPE "${opt.schema}".route_type_val AS ENUM ( -- extended types ${extRouteTypes.map(([route_type, desc]) => `, '${route_type}' -- ${desc}`).join('\n')} ); -CREATE CAST ("${opt.schema}".route_type_val AS text) WITH INOUT AS IMPLICIT; +CREATE CAST (route_type_val AS text) WITH INOUT AS IMPLICIT; -CREATE TABLE "${opt.schema}".routes ( +CREATE TABLE routes ( route_id TEXT PRIMARY KEY, agency_id TEXT, - ${opt.routesWithoutAgencyId ? 
'' : `FOREIGN KEY (agency_id) REFERENCES "${opt.schema}".agency,`} + ${opt.routesWithoutAgencyId ? '' : `FOREIGN KEY (agency_id) REFERENCES agency,`} -- todo: Either route_short_name or route_long_name must be specified, or potentially both if appropriate. route_short_name TEXT, route_long_name TEXT, route_desc TEXT, - route_type "${opt.schema}".route_type_val NOT NULL, + route_type route_type_val NOT NULL, route_url TEXT, route_color TEXT, route_text_color TEXT, route_sort_order INT ); -COPY "${opt.schema}".routes ( +COPY routes ( route_id, agency_id, route_short_name, @@ -323,7 +323,7 @@ const formatRoutesRow = (r, opt, workingState) => { const afterAll = (opt) => `\ \\. -CREATE INDEX ON "${opt.schema}".routes (route_short_name); +CREATE INDEX ON routes (route_short_name); ` module.exports = { diff --git a/lib/service_days.js b/lib/service_days.js index f32bee7..ba9fa58 100644 --- a/lib/service_days.js +++ b/lib/service_days.js @@ -2,7 +2,7 @@ const afterAll = (opt) => `\ -- todo [breaking]: rename to service_dates? -CREATE MATERIALIZED VIEW "${opt.schema}".service_days AS +CREATE MATERIALIZED VIEW service_days AS SELECT base_days.service_id, base_days.date @@ -32,7 +32,7 @@ FROM ( end_date::TIMESTAMP, '1 day'::INTERVAL ) "date" - FROM "${opt.schema}".calendar + FROM calendar ) all_days_raw ) all_days WHERE (sunday = 'available' AND dow = 0) @@ -47,7 +47,7 @@ FROM ( -- "removed" exceptions LEFT JOIN ( SELECT * - FROM "${opt.schema}".calendar_dates + FROM calendar_dates WHERE exception_type = 'removed' ) removed ON base_days.service_id = removed.service_id @@ -56,17 +56,17 @@ WHERE removed.date IS NULL -- "added" exceptions UNION SELECT service_id, "date" -FROM "${opt.schema}".calendar_dates +FROM calendar_dates WHERE exception_type = 'added' ORDER BY service_id, "date"; -CREATE UNIQUE INDEX ON "${opt.schema}".service_days (service_id, date); +CREATE UNIQUE INDEX ON service_days (service_id, date); -CREATE INDEX ON "${opt.schema}".service_days (service_id); -CREATE INDEX ON "${opt.schema}".service_days (date); +CREATE INDEX ON service_days (service_id); +CREATE INDEX ON service_days (date); -- apparently the unique index (service_id, date) doesn't speed up queries -CREATE INDEX ON "${opt.schema}".service_days (service_id, date); +CREATE INDEX ON service_days (service_id, date); ` diff --git a/lib/shapes.js b/lib/shapes.js index dc4bd99..624f784 100644 --- a/lib/shapes.js +++ b/lib/shapes.js @@ -2,7 +2,7 @@ // https://gtfs.org/documentation/schedule/reference/#shapestxt const beforeAll = (opt) => `\ -CREATE TABLE "${opt.schema}".shapes ( +CREATE TABLE shapes ( id SERIAL PRIMARY KEY, shape_id TEXT, shape_pt_sequence INT, @@ -10,7 +10,7 @@ CREATE TABLE "${opt.schema}".shapes ( shape_dist_traveled REAL ); -COPY "${opt.schema}".shapes ( +COPY shapes ( shape_id, shape_pt_loc, shape_pt_sequence, @@ -30,10 +30,10 @@ const formatShapesRow = (s) => { const afterAll = (opt) => `\ \\. 
-CREATE INDEX shapes_by_shape_id ON "${opt.schema}".shapes (shape_id); -CREATE INDEX ON "${opt.schema}".shapes (shape_id, shape_pt_sequence); +CREATE INDEX shapes_by_shape_id ON shapes (shape_id); +CREATE INDEX ON shapes (shape_id, shape_pt_sequence); -CREATE OR REPLACE VIEW "${opt.schema}".shapes_aggregated AS +CREATE OR REPLACE VIEW shapes_aggregated AS SELECT shape_id, array_agg(shape_dist_traveled) AS distances_travelled, @@ -44,7 +44,7 @@ FROM ( shape_id, shape_dist_traveled, ST_AsText(shape_pt_loc)::geometry AS shape_pt_loc - FROM "${opt.schema}".shapes + FROM shapes ORDER by shape_id, shape_pt_sequence ) shapes GROUP BY shape_id; diff --git a/lib/stats_active_trips_by_hour.js b/lib/stats_active_trips_by_hour.js index 17287e0..cadaa38 100644 --- a/lib/stats_active_trips_by_hour.js +++ b/lib/stats_active_trips_by_hour.js @@ -12,18 +12,18 @@ const afterAll = (opt) => { : `CREATE OR REPLACE VIEW` return `\ -CREATE MATERIALIZED VIEW "${opt.schema}".feed_time_frame AS +CREATE MATERIALIZED VIEW feed_time_frame AS WITH dates AS ( SELECT min("date") AS min, max("date") AS max - FROM "${opt.schema}".service_days + FROM service_days ), date_offset AS ( SELECT greatest( - "${opt.schema}".largest_arrival_time(), - "${opt.schema}".largest_departure_time() + largest_arrival_time(), + largest_departure_time() ) AS o ), date_min_max AS ( @@ -34,22 +34,22 @@ WITH ), min_dep AS ( SELECT min("t_departure") AS t - FROM "${opt.schema}".arrivals_departures, date_min_max + FROM arrivals_departures, date_min_max WHERE date <= (SELECT min FROM date_min_max) ), min_arr AS ( SELECT min("t_arrival") AS t - FROM "${opt.schema}".arrivals_departures, date_min_max + FROM arrivals_departures, date_min_max WHERE date <= (SELECT min FROM date_min_max) ), max_dep AS ( SELECT min("t_departure") AS t - FROM "${opt.schema}".arrivals_departures, date_min_max + FROM arrivals_departures, date_min_max WHERE date >= (SELECT max FROM date_min_max) ), max_arr AS ( SELECT min("t_arrival") AS t - FROM "${opt.schema}".arrivals_departures, date_min_max + FROM arrivals_departures, date_min_max WHERE date >= (SELECT max FROM date_min_max) ) SELECT @@ -57,7 +57,7 @@ SELECT least(max_dep.t, max_arr.t) as max FROM min_dep, min_arr, max_dep, max_arr; -CREATE OR REPLACE FUNCTION "${opt.schema}".feed_time_series( +CREATE OR REPLACE FUNCTION feed_time_series( time_unit TEXT ) RETURNS SETOF timestamptz @@ -68,13 +68,13 @@ AS $$ date_trunc(time_unit, max), ('1 ' || time_unit)::interval ) as t - FROM "${opt.schema}".feed_time_frame + FROM feed_time_frame $$ LANGUAGE sql STABLE; -${createViewCmd} "${opt.schema}".stats_active_trips_by_hour AS +${createViewCmd} stats_active_trips_by_hour AS WITH all_hours AS NOT MATERIALIZED ( - SELECT "${opt.schema}".feed_time_series('hour') AS "hour" + SELECT feed_time_series('hour') AS "hour" ) SELECT DISTINCT ON ("hour") "hour", @@ -86,7 +86,7 @@ FROM ( FROM ( SELECT * FROM all_hours - LEFT JOIN "${opt.schema}".connections ON ( + LEFT JOIN connections ON ( date_trunc('hour', t_departure) <= "hour" AND date_trunc('hour', t_arrival) >= "hour" ) @@ -94,7 +94,7 @@ FROM ( ) cons; ${materialized ? 
`\ -CREATE INDEX ON "${opt.schema}".stats_active_trips_by_hour ("hour"); +CREATE INDEX ON stats_active_trips_by_hour ("hour"); ` : ''} ` } diff --git a/lib/stats_by_agency_route_stop_hour.js b/lib/stats_by_agency_route_stop_hour.js index 6993c7a..625cfbc 100644 --- a/lib/stats_by_agency_route_stop_hour.js +++ b/lib/stats_by_agency_route_stop_hour.js @@ -12,19 +12,19 @@ const afterAll = (opt) => { : `CREATE OR REPLACE VIEW` return `\ -${createViewCmd} "${opt.schema}".stats_by_agency_route_stop_hour AS +${createViewCmd} stats_by_agency_route_stop_hour AS SELECT DISTINCT ON (agency_id, route_id, stop_id, effective_hour) agency_id, route_id, stop_id, station_id, "date" as service_date, date_trunc('hour', t_arrival) AS effective_hour, count(*) OVER (PARTITION BY route_id, stop_id, date_trunc('hour', t_arrival)) AS nr_of_arrs -FROM "${opt.schema}".arrivals_departures; +FROM arrivals_departures; ${materialized ? `\ -CREATE INDEX ON "${opt.schema}".stats_by_agency_route_stop_hour (route_id); -CREATE INDEX ON "${opt.schema}".stats_by_agency_route_stop_hour (stop_id); -CREATE INDEX ON "${opt.schema}".stats_by_agency_route_stop_hour (station_id); -CREATE INDEX ON "${opt.schema}".stats_by_agency_route_stop_hour (effective_hour); +CREATE INDEX ON stats_by_agency_route_stop_hour (route_id); +CREATE INDEX ON stats_by_agency_route_stop_hour (stop_id); +CREATE INDEX ON stats_by_agency_route_stop_hour (station_id); +CREATE INDEX ON stats_by_agency_route_stop_hour (effective_hour); ` : ''} ` diff --git a/lib/stats_by_route_date.js b/lib/stats_by_route_date.js index 12ca5b0..6a4021a 100644 --- a/lib/stats_by_route_date.js +++ b/lib/stats_by_route_date.js @@ -12,14 +12,14 @@ const afterAll = (opt) => { : `CREATE OR REPLACE VIEW` return `\ -${createViewCmd} "${opt.schema}".stats_by_route_date AS +${createViewCmd} stats_by_route_date AS WITH arrs_deps_with_svc_date AS NOT MATERIALIZED ( SELECT route_id, stop_sequence_consec, "date"::date AS svc_date, EXTRACT(DOW FROM "date") AS svc_dow - FROM "${opt.schema}".arrivals_departures + FROM arrivals_departures ), by_svc_date AS NOT MATERIALIZED ( SELECT DISTINCT ON (route_id, svc_date) @@ -35,7 +35,7 @@ WITH route_id, stop_sequence_consec, coalesce(t_departure, t_arrival)::date AS effective_date, EXTRACT(DOW FROM coalesce(t_departure, t_arrival)) AS effective_dow - FROM "${opt.schema}".arrivals_departures + FROM arrivals_departures ), by_effective_date AS NOT MATERIALIZED ( SELECT DISTINCT ON (route_id, effective_date) @@ -57,10 +57,10 @@ SELECT FROM by_svc_date; ${materialized ? `\ -CREATE INDEX ON "${opt.schema}".stats_by_route_date (route_id); -CREATE INDEX ON "${opt.schema}".stats_by_route_date ("date"); -CREATE INDEX ON "${opt.schema}".stats_by_route_date (route_id, "date", is_effective); -CREATE INDEX ON "${opt.schema}".stats_by_route_date (route_id, dow, is_effective); +CREATE INDEX ON stats_by_route_date (route_id); +CREATE INDEX ON stats_by_route_date ("date"); +CREATE INDEX ON stats_by_route_date (route_id, "date", is_effective); +CREATE INDEX ON stats_by_route_date (route_id, dow, is_effective); ` : ''} ` } diff --git a/lib/stop_times.js b/lib/stop_times.js index cf40515..33c383e 100644 --- a/lib/stop_times.js +++ b/lib/stop_times.js @@ -4,40 +4,40 @@ const {formatTime} = require('./util') // https://gtfs.org/documentation/schedule/reference/#stop_timestxt const beforeAll = (opt) => `\ -CREATE TYPE "${opt.schema}".pickup_drop_off_type AS ENUM ( +CREATE TYPE pickup_drop_off_type AS ENUM ( 'regular' -- 0 or empty - Regularly scheduled pickup/dropoff. 
, 'not_available' -- 1 – No pickup/dropoff available. , 'call' -- 2 – Must phone agency to arrange pickup/dropoff. , 'driver' -- 3 – Must coordinate with driver to arrange pickup/dropoff. ); -CREATE CAST ("${opt.schema}".pickup_drop_off_type AS text) WITH INOUT AS IMPLICIT; +CREATE CAST (pickup_drop_off_type AS text) WITH INOUT AS IMPLICIT; -CREATE TYPE "${opt.schema}".timepoint_v AS ENUM ( +CREATE TYPE timepoint_v AS ENUM ( 'approximate' -- 0 – Times are considered approximate. , 'exact' -- 1 or empty - Times are considered exact. ); -CREATE CAST ("${opt.schema}".timepoint_v AS text) WITH INOUT AS IMPLICIT; +CREATE CAST (timepoint_v AS text) WITH INOUT AS IMPLICIT; -CREATE TABLE "${opt.schema}".stop_times ( +CREATE TABLE stop_times ( trip_id TEXT NOT NULL, - FOREIGN KEY (trip_id) REFERENCES "${opt.schema}".trips, + FOREIGN KEY (trip_id) REFERENCES trips, -- https://gist.github.com/derhuerst/574edc94981a21ef0ce90713f1cff7f6 arrival_time INTERVAL, departure_time INTERVAL, stop_id TEXT NOT NULL, - FOREIGN KEY (stop_id) REFERENCES "${opt.schema}".stops, + FOREIGN KEY (stop_id) REFERENCES stops, stop_sequence INT NOT NULL, stop_sequence_consec INT, stop_headsign TEXT, - pickup_type "${opt.schema}".pickup_drop_off_type, - drop_off_type "${opt.schema}".pickup_drop_off_type, + pickup_type pickup_drop_off_type, + drop_off_type pickup_drop_off_type, shape_dist_traveled REAL, - timepoint "${opt.schema}".timepoint_v, + timepoint timepoint_v, -- Used to implement frequencies.txt. Filled after COPY-ing, see below. trip_start_time INTERVAL ); -COPY "${opt.schema}".stop_times ( +COPY stop_times ( trip_id, arrival_time, departure_time, @@ -91,7 +91,7 @@ const afterAll = (opt) => `\ \\. -- trip_start_time is used to implement frequencies.txt. -UPDATE "${opt.schema}".stop_times +UPDATE stop_times -- This is ugly, but AFAICT there is no cleaner way. -- see also https://stackoverflow.com/a/4359354/1072129 SET trip_start_time = t.trip_start_time @@ -103,61 +103,61 @@ FROM ( first_value(arrival_time) OVER (PARTITION BY trip_id ORDER BY stop_sequence) ) AS trip_start_time, trip_id, stop_sequence - FROM "${opt.schema}".stop_times + FROM stop_times ) AS t -- self-join WHERE stop_times.trip_id = t.trip_id AND stop_times.stop_sequence = t.stop_sequence; -CREATE INDEX ON "${opt.schema}".stop_times (trip_id); -CREATE INDEX ON "${opt.schema}".stop_times (stop_id); +CREATE INDEX ON stop_times (trip_id); +CREATE INDEX ON stop_times (stop_id); -UPDATE "${opt.schema}".stop_times +UPDATE stop_times SET stop_sequence_consec = t.seq FROM ( SELECT row_number() OVER (PARTITION BY trip_id ORDER BY stop_sequence ASC)::integer - 1 AS seq, trip_id, stop_sequence - FROM "${opt.schema}".stop_times + FROM stop_times ) AS t -WHERE "${opt.schema}".stop_times.trip_id = t.trip_id -AND "${opt.schema}".stop_times.stop_sequence = t.stop_sequence; +WHERE stop_times.trip_id = t.trip_id +AND stop_times.stop_sequence = t.stop_sequence; -CREATE INDEX ON "${opt.schema}".stop_times (stop_sequence_consec); -CREATE INDEX ON "${opt.schema}".stop_times (trip_id, stop_sequence_consec); -CREATE INDEX ON "${opt.schema}".stop_times (arrival_time DESC NULLS LAST); -CREATE INDEX ON "${opt.schema}".stop_times (departure_time DESC NULLS LAST); +CREATE INDEX ON stop_times (stop_sequence_consec); +CREATE INDEX ON stop_times (trip_id, stop_sequence_consec); +CREATE INDEX ON stop_times (arrival_time DESC NULLS LAST); +CREATE INDEX ON stop_times (departure_time DESC NULLS LAST); -- todo: are these two necessary? 
-CREATE INDEX ON "${opt.schema}".stop_times (arrival_time); -CREATE INDEX ON "${opt.schema}".stop_times (departure_time); +CREATE INDEX ON stop_times (arrival_time); +CREATE INDEX ON stop_times (departure_time); -CREATE OR REPLACE FUNCTION "${opt.schema}".largest_departure_time () +CREATE OR REPLACE FUNCTION largest_departure_time () RETURNS interval AS $$ SELECT departure_time - FROM "${opt.schema}".stop_times + FROM stop_times WHERE EXISTS ( SELECT * - FROM "${opt.schema}".trips - JOIN "${opt.schema}".service_days ON service_days.service_id = trips.service_id + FROM trips + JOIN service_days ON service_days.service_id = trips.service_id WHERE trips.trip_id = stop_times.trip_id ) ORDER BY departure_time DESC NULLS LAST LIMIT 1; $$ LANGUAGE SQL IMMUTABLE; -CREATE OR REPLACE FUNCTION "${opt.schema}".largest_arrival_time () +CREATE OR REPLACE FUNCTION largest_arrival_time () RETURNS interval AS $$ SELECT arrival_time - FROM "${opt.schema}".stop_times + FROM stop_times WHERE EXISTS ( SELECT * - FROM "${opt.schema}".trips - JOIN "${opt.schema}".service_days ON service_days.service_id = trips.service_id + FROM trips + JOIN service_days ON service_days.service_id = trips.service_id WHERE trips.trip_id = stop_times.trip_id ) ORDER BY arrival_time DESC NULLS LAST LIMIT 1; $$ LANGUAGE SQL IMMUTABLE; -CREATE OR REPLACE FUNCTION "${opt.schema}".dates_filter_min ( +CREATE OR REPLACE FUNCTION dates_filter_min ( _timestamp TIMESTAMP WITH TIME ZONE ) RETURNS date AS $$ @@ -165,22 +165,22 @@ RETURNS date AS $$ 'day', _timestamp - GREATEST( - "${opt.schema}".largest_arrival_time(), - "${opt.schema}".largest_departure_time() + largest_arrival_time(), + largest_departure_time() ) -- we assume the DST <-> standard time shift is always <= 1h - '1 hour 1 second'::interval ); $$ LANGUAGE SQL IMMUTABLE; -- This function doesn't do much, we just provide it to match date_filter_min(). -CREATE OR REPLACE FUNCTION "${opt.schema}".dates_filter_max ( +CREATE OR REPLACE FUNCTION dates_filter_max ( _timestamp TIMESTAMP WITH TIME ZONE ) RETURNS date AS $$ SELECT date_trunc('day', _timestamp); $$ LANGUAGE SQL IMMUTABLE; -CREATE OR REPLACE VIEW "${opt.schema}".arrivals_departures AS +CREATE OR REPLACE VIEW arrivals_departures AS WITH stop_times_based AS NOT MATERIALIZED ( SELECT agency.agency_id, @@ -238,12 +238,12 @@ WITH stop_times_based AS NOT MATERIALIZED ( 'no_info_or_inherit' ) AS wheelchair_boarding FROM ( - "${opt.schema}".stop_times s - JOIN "${opt.schema}".stops ON s.stop_id = stops.stop_id - LEFT JOIN "${opt.schema}".stops stations ON stops.parent_station = stations.stop_id - JOIN "${opt.schema}".trips ON s.trip_id = trips.trip_id - JOIN "${opt.schema}".routes ON trips.route_id = routes.route_id - LEFT JOIN "${opt.schema}".agency ON ( + stop_times s + JOIN stops ON s.stop_id = stops.stop_id + LEFT JOIN stops stations ON stops.parent_station = stations.stop_id + JOIN trips ON s.trip_id = trips.trip_id + JOIN routes ON trips.route_id = routes.route_id + LEFT JOIN agency ON ( -- The GTFS spec allows routes.agency_id to be NULL if there is exactly one agency in the feed. -- Note: We implicitly rely on other parts of the code base to validate that agency has just one row! 
-- It seems that GTFS has allowed this at least since 2016: @@ -251,7 +251,7 @@ WITH stop_times_based AS NOT MATERIALIZED ( routes.agency_id IS NULL -- match first (and only) agency OR routes.agency_id = agency.agency_id -- match by ID ) - JOIN "${opt.schema}".service_days ON trips.service_id = service_days.service_id + JOIN service_days ON trips.service_id = service_days.service_id ) -- todo: this slows down slightly -- ORDER BY route_id, s.trip_id, "date", stop_sequence @@ -335,17 +335,17 @@ FROM ( frequencies.headway_secs, frequencies_row FROM stop_times_based - JOIN "${opt.schema}".frequencies ON frequencies.trip_id = stop_times_based.trip_id + JOIN frequencies ON frequencies.trip_id = stop_times_based.trip_id WHERE frequencies.exact_times = 'schedule_based' -- todo: is this correct? ) t ) t ) frequencies_based; -CREATE OR REPLACE FUNCTION "${opt.schema}".arrival_departure_by_arrival_departure_id(id TEXT) -RETURNS "${opt.schema}".arrivals_departures +CREATE OR REPLACE FUNCTION arrival_departure_by_arrival_departure_id(id TEXT) +RETURNS arrivals_departures AS $$ SELECT * - FROM "${opt.schema}".arrivals_departures + FROM arrivals_departures WHERE trip_id = convert_from(decode(split_part(id, ':', 1), 'base64'), 'UTF-8')::text AND "date" = (convert_from(decode(split_part(id, ':', 2), 'base64'), 'UTF-8')::text)::timestamp AND stop_sequence = (convert_from(decode(split_part(id, ':', 3), 'base64'), 'UTF-8')::text)::integer @@ -355,7 +355,7 @@ AS $$ LIMIT 1; $$ LANGUAGE SQL STABLE STRICT; -CREATE OR REPLACE VIEW "${opt.schema}".connections AS +CREATE OR REPLACE VIEW connections AS WITH stop_times_based AS NOT MATERIALIZED ( SELECT route_id, @@ -459,9 +459,9 @@ WITH stop_times_based AS NOT MATERIALIZED ( nullif(to_stations.wheelchair_boarding, 'no_info_or_inherit'), 'no_info_or_inherit' ) AS to_wheelchair_boarding - FROM "${opt.schema}".trips - LEFT JOIN "${opt.schema}".routes ON trips.route_id = routes.route_id - LEFT JOIN "${opt.schema}".agency ON ( + FROM trips + LEFT JOIN routes ON trips.route_id = routes.route_id + LEFT JOIN agency ON ( -- The GTFS spec allows routes.agency_id to be NULL if there is exactly one agency in the feed. -- Note: We implicitly rely on other parts of the code base to validate that agency has just one row! 
-- It seems that GTFS has allowed this at least since 2016: @@ -469,16 +469,16 @@ WITH stop_times_based AS NOT MATERIALIZED ( routes.agency_id IS NULL -- match first (and only) agency OR routes.agency_id = agency.agency_id -- match by ID ) - LEFT JOIN "${opt.schema}".stop_times ON trips.trip_id = stop_times.trip_id - LEFT JOIN "${opt.schema}".stops from_stops ON stop_times.stop_id = from_stops.stop_id - LEFT JOIN "${opt.schema}".stops from_stations ON from_stops.parent_station = from_stations.stop_id - INNER JOIN "${opt.schema}".stop_times to_stop_times ON stop_times.trip_id = to_stop_times.trip_id AND stop_times.stop_sequence_consec + 1 = to_stop_times.stop_sequence_consec - INNER JOIN "${opt.schema}".stops to_stops ON to_stop_times.stop_id = to_stops.stop_id - LEFT JOIN "${opt.schema}".stops to_stations ON to_stops.parent_station = to_stations.stop_id + LEFT JOIN stop_times ON trips.trip_id = stop_times.trip_id + LEFT JOIN stops from_stops ON stop_times.stop_id = from_stops.stop_id + LEFT JOIN stops from_stations ON from_stops.parent_station = from_stations.stop_id + INNER JOIN stop_times to_stop_times ON stop_times.trip_id = to_stop_times.trip_id AND stop_times.stop_sequence_consec + 1 = to_stop_times.stop_sequence_consec + INNER JOIN stops to_stops ON to_stop_times.stop_id = to_stops.stop_id + LEFT JOIN stops to_stations ON to_stops.parent_station = to_stations.stop_id ) trips JOIN ( SELECT * - FROM "${opt.schema}".service_days + FROM service_days ORDER BY service_id, "date" ) service_days ON trips.service_id = service_days.service_id ) @@ -583,17 +583,17 @@ FROM ( frequencies.headway_secs, frequencies_row FROM stop_times_based - JOIN "${opt.schema}".frequencies ON frequencies.trip_id = stop_times_based.trip_id + JOIN frequencies ON frequencies.trip_id = stop_times_based.trip_id WHERE frequencies.exact_times = 'schedule_based' -- todo: is this correct? ) t ) t ) frequencies_based; -CREATE OR REPLACE FUNCTION "${opt.schema}".connection_by_connection_id(id TEXT) -RETURNS "${opt.schema}".connections +CREATE OR REPLACE FUNCTION connection_by_connection_id(id TEXT) +RETURNS connections AS $$ SELECT * - FROM "${opt.schema}".connections + FROM connections WHERE trip_id = convert_from(decode(split_part(id, ':', 1), 'base64'), 'UTF-8')::text AND "date" = (convert_from(decode(split_part(id, ':', 2), 'base64'), 'UTF-8')::text)::timestamp AND from_stop_sequence = (convert_from(decode(split_part(id, ':', 3), 'base64'), 'UTF-8')::text)::integer diff --git a/lib/stops.js b/lib/stops.js index 53549b4..a76f5ba 100644 --- a/lib/stops.js +++ b/lib/stops.js @@ -2,14 +2,14 @@ // https://gtfs.org/documentation/schedule/reference/#stopstxt const beforeAll = (opt) => `\ -CREATE TYPE "${opt.schema}".location_type_val AS ENUM ( +CREATE TYPE location_type_val AS ENUM ( 'stop' -- 0 (or blank): Stop (or Platform). A location where passengers board or disembark from a transit vehicle. Is called a platform when defined within a parent_station. , 'station' -- 1 – Station. A physical structure or area that contains one or more platform. , 'entrance_exit' -- 2 – Entrance/Exit. A location where passengers can enter or exit a station from the street. If an entrance/exit belongs to multiple stations, it can be linked by pathways to both, but the data provider must pick one of them as parent. , 'node' -- 3 – Generic Node. A location within a station, not matching any other location_type, which can be used to link together pathways define in pathways.txt. , 'boarding_area' -- 4 – Boarding Area. 
A specific location on a platform, where passengers can board and/or alight vehicles. ); -CREATE CAST ("${opt.schema}".location_type_val AS text) WITH INOUT AS IMPLICIT; +CREATE CAST (location_type_val AS text) WITH INOUT AS IMPLICIT; -- For parentless stops: -- 0 or empty - No accessibility information for the stop. @@ -25,14 +25,14 @@ CREATE CAST ("${opt.schema}".location_type_val AS text) WITH INOUT AS IMPLICIT; -- 0 or empty - Station entrance will inherit its wheelchair_boarding behavior from the parent station, if specified for the parent. -- 1 - Station entrance is wheelchair accessible. -- 2 - No accessible path from station entrance to stops/platforms. -CREATE TYPE "${opt.schema}".wheelchair_boarding_val AS ENUM ( +CREATE TYPE wheelchair_boarding_val AS ENUM ( 'no_info_or_inherit' , 'accessible' , 'not_accessible' ); -CREATE CAST ("${opt.schema}".wheelchair_boarding_val AS text) WITH INOUT AS IMPLICIT; +CREATE CAST (wheelchair_boarding_val AS text) WITH INOUT AS IMPLICIT; -CREATE TABLE "${opt.schema}".stops ( +CREATE TABLE stops ( stop_id TEXT PRIMARY KEY, stop_code TEXT, -- todo: Required for locations which are stops (location_type=0), stations (location_type=1) or entrances/exits (location_type=2). Optional for locations which are generic nodes (location_type=3) or boarding areas (location_type=4). @@ -41,16 +41,16 @@ CREATE TABLE "${opt.schema}".stops ( stop_loc geography(POINT), -- stop_lat/stop_lon zone_id TEXT, stop_url TEXT, - location_type "${opt.schema}".location_type_val, + location_type location_type_val, parent_station TEXT, - stop_timezone TEXT CHECK ("${opt.schema}".is_timezone(stop_timezone)), - wheelchair_boarding "${opt.schema}".wheelchair_boarding_val, + stop_timezone TEXT CHECK (is_timezone(stop_timezone)), + wheelchair_boarding wheelchair_boarding_val, level_id TEXT, - ${opt.stopsWithoutLevelId ? '' : `FOREIGN KEY (level_id) REFERENCES "${opt.schema}".levels,`} + ${opt.stopsWithoutLevelId ? '' : `FOREIGN KEY (level_id) REFERENCES levels,`} platform_code TEXT ); -COPY "${opt.schema}".stops ( +COPY stops ( stop_id, stop_code, stop_name, @@ -108,12 +108,12 @@ const formatStopsRow = (s) => { const afterAll = (opt) => `\ \\. -ALTER TABLE "${opt.schema}".stops +ALTER TABLE stops ADD CONSTRAINT stops_parent_station_fkey -FOREIGN KEY (parent_station) REFERENCES "${opt.schema}".stops; +FOREIGN KEY (parent_station) REFERENCES stops; -CREATE INDEX ON "${opt.schema}".stops (parent_station); -${opt.stopsLocationIndex ? `CREATE INDEX ON "${opt.schema}".stops (stop_loc);` : ''} +CREATE INDEX ON stops (parent_station); +${opt.stopsLocationIndex ? `CREATE INDEX ON stops (stop_loc);` : ''} ` module.exports = { diff --git a/lib/transfers.js b/lib/transfers.js index c0c5be9..cfd550b 100644 --- a/lib/transfers.js +++ b/lib/transfers.js @@ -2,33 +2,33 @@ // https://gtfs.org/documentation/schedule/reference/#transferstxt const beforeAll = (opt) => `\ -CREATE TYPE "${opt.schema}".transfer_type_v AS ENUM ( +CREATE TYPE transfer_type_v AS ENUM ( 'recommended' -- 0 or empty - Recommended transfer point between routes. , 'timed' -- 1 - Timed transfer point between two routes. The departing vehicle is expected to wait for the arriving one and leave sufficient time for a rider to transfer between routes. , 'minimum_time' -- 2 – Transfer requires a minimum amount of time between arrival and departure to ensure a connection. The time required to transfer is specified by min_transfer_time. , 'impossible' -- 3 - Transfers are not possible between routes at the location. 
); -CREATE CAST ("${opt.schema}".transfer_type_v AS text) WITH INOUT AS IMPLICIT; +CREATE CAST (transfer_type_v AS text) WITH INOUT AS IMPLICIT; -CREATE TABLE "${opt.schema}".transfers ( +CREATE TABLE transfers ( id SERIAL PRIMARY KEY, from_stop_id TEXT, - FOREIGN KEY (from_stop_id) REFERENCES "${opt.schema}".stops, + FOREIGN KEY (from_stop_id) REFERENCES stops, to_stop_id TEXT, - FOREIGN KEY (to_stop_id) REFERENCES "${opt.schema}".stops, - transfer_type "${opt.schema}".transfer_type_v, + FOREIGN KEY (to_stop_id) REFERENCES stops, + transfer_type transfer_type_v, min_transfer_time INT, from_route_id TEXT, - FOREIGN KEY (from_route_id) REFERENCES "${opt.schema}".routes, + FOREIGN KEY (from_route_id) REFERENCES routes, to_route_id TEXT, - FOREIGN KEY (from_route_id) REFERENCES "${opt.schema}".routes, + FOREIGN KEY (from_route_id) REFERENCES routes, from_trip_id TEXT, - FOREIGN KEY (from_trip_id) REFERENCES "${opt.schema}".trips, + FOREIGN KEY (from_trip_id) REFERENCES trips, to_trip_id TEXT, - FOREIGN KEY (from_trip_id) REFERENCES "${opt.schema}".trips + FOREIGN KEY (from_trip_id) REFERENCES trips ); -ALTER TABLE "${opt.schema}".transfers +ALTER TABLE transfers ADD CONSTRAINT transfers_sig UNIQUE ( from_stop_id, @@ -39,7 +39,7 @@ UNIQUE ( to_trip_id ); -COPY "${opt.schema}".transfers ( +COPY transfers ( from_stop_id, to_stop_id, transfer_type, diff --git a/lib/translations.js b/lib/translations.js index ca08054..8c858aa 100644 --- a/lib/translations.js +++ b/lib/translations.js @@ -2,20 +2,20 @@ // https://gtfs.org/documentation/schedule/reference/#translationstxt const beforeAll = (opt) => `\ -CREATE OR REPLACE FUNCTION "${opt.schema}".table_exists( +CREATE OR REPLACE FUNCTION table_exists( t_name TEXT ) RETURNS BOOLEAN AS $$ SELECT EXISTS ( SELECT FROM pg_tables - WHERE schemaname = '${opt.schema}' + WHERE schemaname = 'main' AND tablename = t_name LIMIT 1 ); $$ LANGUAGE sql STABLE; -CREATE OR REPLACE FUNCTION "${opt.schema}".column_exists( +CREATE OR REPLACE FUNCTION column_exists( t_name TEXT, c_name TEXT ) @@ -23,29 +23,29 @@ RETURNS BOOLEAN AS $$ SELECT EXISTS ( SELECT FROM information_schema.columns - WHERE table_schema = '${opt.schema}' + WHERE table_schema = 'main' AND table_name = t_name AND column_name = c_name LIMIT 1 ); $$ LANGUAGE sql STABLE; -CREATE TABLE "${opt.schema}"._translations_ref_cols ( +CREATE TABLE _translations_ref_cols ( table_name TEXT PRIMARY KEY, -- todo: only check if columns exist when table exists? 
record_id_col TEXT NOT NULL CONSTRAINT valid_record_id_col CHECK ( - NOT "${opt.schema}".table_exists(table_name) + NOT table_exists(table_name) OR - "${opt.schema}".column_exists(table_name, record_id_col) + column_exists(table_name, record_id_col) ), record_sub_id_col TEXT CONSTRAINT valid_record_sub_id_col CHECK ( - NOT "${opt.schema}".table_exists(table_name) + NOT table_exists(table_name) OR record_sub_id_col IS NULL OR - "${opt.schema}".column_exists(table_name, record_sub_id_col) + column_exists(table_name, record_sub_id_col) ) ); @@ -86,7 +86,7 @@ CREATE TABLE "${opt.schema}"._translations_ref_cols ( -- > - start_time for frequencies.txt -- > - to_stop_id for transfers.txt -- https://gtfs.org/documentation/schedule/reference/#translationstxt -INSERT INTO "${opt.schema}"._translations_ref_cols ( +INSERT INTO _translations_ref_cols ( table_name, record_id_col, record_sub_id_col @@ -109,7 +109,7 @@ INSERT INTO "${opt.schema}"._translations_ref_cols ( ('transfers', 'from_stop_id', 'to_stop_id') ; -CREATE OR REPLACE FUNCTION "${opt.schema}".row_exists( +CREATE OR REPLACE FUNCTION row_exists( table_name TEXT, col_a_name TEXT, col_a_value TEXT, @@ -129,7 +129,7 @@ AS $$ WHERE %I = %L -- col_a_name, col_a_value LIMIT 1 ) - ', '${opt.schema}', table_name, col_a_name, col_a_value) + ', 'main', table_name, col_a_name, col_a_value) INTO STRICT result; RETURN result; ELSE @@ -141,7 +141,7 @@ AS $$ AND %I = %L -- col_b_name, col_b_value LIMIT 1 ) - ', '${opt.schema}', table_name, col_a_name, col_a_value, col_b_name, col_b_value) + ', 'main', table_name, col_a_name, col_a_value, col_b_name, col_b_value) INTO STRICT result; RETURN result; END IF; @@ -161,7 +161,7 @@ $$ LANGUAGE plpgsql STABLE; -- SELECT row_exists('stops', 'stop_name', 'Virchowstr. (Berlin)', 'non-existent', 'de:11000:900120017'); -- Virchowstr. (Berlin) with invalid column B, should fail -- todo: assert that it fails with 2 rows -CREATE OR REPLACE FUNCTION "${opt.schema}".is_valid_translation_ref( +CREATE OR REPLACE FUNCTION is_valid_translation_ref( _table_name TEXT, _field_name TEXT, _record_id TEXT, @@ -177,12 +177,12 @@ AS $$ BEGIN IF _record_id IS NOT NULL THEN SELECT record_id_col - FROM "${opt.schema}"._translations_ref_cols + FROM _translations_ref_cols WHERE table_name = _table_name LIMIT 1 INTO _record_id_col; SELECT record_sub_id_col - FROM "${opt.schema}"._translations_ref_cols + FROM _translations_ref_cols WHERE table_name = _table_name LIMIT 1 INTO _record_sub_id_col; @@ -193,7 +193,7 @@ AS $$ MESSAGE = format('record_sub_id must be NULL for %I but is %L', _table_name, _record_sub_id), ERRCODE = 'data_exception'; END IF; - SELECT "${opt.schema}".row_exists( + SELECT row_exists( _table_name, _record_id_col, _record_id, _record_sub_id_col, _record_sub_id @@ -201,7 +201,7 @@ AS $$ INTO STRICT result; RETURN result; ELSEIF _field_value IS NOT NULL THEN - SELECT "${opt.schema}".row_exists( + SELECT row_exists( _table_name, _field_name, _field_value, NULL, NULL @@ -224,7 +224,7 @@ $$ LANGUAGE plpgsql STABLE; -- related: https://github.com/google/transit/pull/98 -- https://gtfs.org/documentation/schedule/reference/#translationstxt -CREATE TABLE "${opt.schema}".translations ( +CREATE TABLE translations ( -- > Defines the table that contains the field to be translated. Allowed values are: -- > agency, stops, routes, trips, stop_times, pathways, levels, feed_info, attributions -- > Any file added to GTFS will have a table_name value equivalent to the file name, as listed above (i.e., not including the .txt file extension). 
@@ -233,16 +233,16 @@ CREATE TABLE "${opt.schema}".translations ( -- > Name of the field to be translated. […] Fields with other types should not be translated. field_name TEXT NOT NULL CONSTRAINT valid_field_name CHECK ( - NOT "${opt.schema}".table_exists(table_name) + NOT table_exists(table_name) OR - "${opt.schema}".column_exists(table_name, field_name) + column_exists(table_name, field_name) ), language TEXT NOT NULL CONSTRAINT valid_language CHECK ( - NOT "${opt.schema}".table_exists(table_name) + NOT table_exists(table_name) OR - "${opt.schema}".is_valid_lang_code(language) + is_valid_lang_code(language) ), translation TEXT NOT NULL, @@ -296,11 +296,11 @@ CREATE TABLE "${opt.schema}".translations ( ), CONSTRAINT valid_reference CHECK ( - NOT "${opt.schema}".table_exists(table_name) + NOT table_exists(table_name) OR table_name = 'feed_info' OR - "${opt.schema}".is_valid_translation_ref( + is_valid_translation_ref( table_name, field_name, record_id, @@ -322,7 +322,7 @@ CREATE TABLE "${opt.schema}".translations ( ) ); -COPY "${opt.schema}".translations ( +COPY translations ( table_name, field_name, language, @@ -349,7 +349,7 @@ const afterAll = (opt) => `\ \\. -- todo -CREATE INDEX ON "${opt.schema}".translations ( +CREATE INDEX ON translations ( table_name, field_name, language, @@ -358,7 +358,7 @@ CREATE INDEX ON "${opt.schema}".translations ( field_value ); -CREATE OR REPLACE VIEW "${opt.schema}".stops_translated AS +CREATE OR REPLACE VIEW stops_translated AS SELECT -- almost all columns, duh -- todo: find a way to use all columns without explicitly enumerating them here @@ -378,21 +378,21 @@ SELECT wheelchair_boarding, level_id, platform_code -FROM "${opt.schema}".stops s -LEFT JOIN "${opt.schema}".translations stop_n_t ON ( +FROM stops s +LEFT JOIN translations stop_n_t ON ( stop_n_t.table_name = 'stops' AND stop_n_t.field_name = 'stop_name' AND (s.stop_id = stop_n_t.record_id OR s.stop_name = stop_n_t.field_value) ) -LEFT JOIN "${opt.schema}".translations stop_d_t ON ( +LEFT JOIN translations stop_d_t ON ( stop_d_t.table_name = 'stops' AND stop_d_t.field_name = 'stop_desc' AND (s.stop_id = stop_d_t.record_id OR s.stop_name = stop_d_t.field_value) ) -LEFT JOIN "${opt.schema}".translations stop_u_t ON ( +LEFT JOIN translations stop_u_t ON ( stop_u_t.table_name = 'stops' AND stop_u_t.field_name = 'stop_url' AND (s.stop_id = stop_u_t.record_id OR s.stop_name = stop_u_t.field_value) ); -CREATE OR REPLACE VIEW "${opt.schema}".routes_translated AS +CREATE OR REPLACE VIEW routes_translated AS SELECT -- almost all columns, duh -- todo: find a way to use all columns without explicitly enumerating them here @@ -410,26 +410,26 @@ SELECT route_color, route_text_color, route_sort_order -FROM "${opt.schema}".routes r -LEFT JOIN "${opt.schema}".translations route_s_t ON ( +FROM routes r +LEFT JOIN translations route_s_t ON ( route_s_t.table_name = 'routes' AND route_s_t.field_name = 'route_short_name' AND (r.route_id = route_s_t.record_id OR r.route_short_name = route_s_t.field_value) ) -LEFT JOIN "${opt.schema}".translations route_l_t ON ( +LEFT JOIN translations route_l_t ON ( route_l_t.table_name = 'routes' AND route_l_t.field_name = 'route_long_name' AND (r.route_id = route_l_t.record_id OR r.route_long_name = route_l_t.field_value) ) -LEFT JOIN "${opt.schema}".translations route_d_t ON ( +LEFT JOIN translations route_d_t ON ( route_d_t.table_name = 'routes' AND route_d_t.field_name = 'route_desc' AND (r.route_id = route_d_t.record_id OR r.route_long_name = route_d_t.field_value) ) -LEFT JOIN 
"${opt.schema}".translations route_u_t ON ( +LEFT JOIN translations route_u_t ON ( route_u_t.table_name = 'routes' AND route_u_t.field_name = 'route_url' AND (r.route_id = route_u_t.record_id OR r.route_long_name = route_u_t.field_value) ); -- todo [breaking]: remove in favor of trip_headsign_translations & trip_short_name_translations -CREATE OR REPLACE VIEW "${opt.schema}".trips_translated AS +CREATE OR REPLACE VIEW trips_translated AS SELECT -- almost all columns, duh -- todo: find a way to use all columns without explicitly enumerating them here @@ -445,17 +445,17 @@ SELECT shape_id, wheelchair_accessible, bikes_allowed -FROM "${opt.schema}".trips t -LEFT JOIN "${opt.schema}".translations trip_s_t ON ( +FROM trips t +LEFT JOIN translations trip_s_t ON ( trip_s_t.table_name = 'trips' AND trip_s_t.field_name = 'trip_short_name' AND (t.trip_id = trip_s_t.record_id OR t.trip_headsign = trip_s_t.field_value) ) -LEFT JOIN "${opt.schema}".translations trip_h_t ON ( +LEFT JOIN translations trip_h_t ON ( trip_h_t.table_name = 'trips' AND trip_h_t.field_name = 'trip_headsign' AND (t.trip_id = trip_h_t.record_id OR t.trip_headsign = trip_h_t.field_value) ); -CREATE OR REPLACE VIEW "${opt.schema}".arrivals_departures_translated AS +CREATE OR REPLACE VIEW arrivals_departures_translated AS SELECT -- almost all columns, duh -- todo: find a way to use all columns without explicitly enumerating them here @@ -484,29 +484,29 @@ SELECT station_id, coalesce(station_t.translation, station_name) as station_name, station_t.language as station_name_lang -- todo: fall back to feed_info.feed_lang? -FROM "${opt.schema}".arrivals_departures ad -LEFT JOIN "${opt.schema}".translations route_s_t ON ( +FROM arrivals_departures ad +LEFT JOIN translations route_s_t ON ( route_s_t.table_name = 'routes' AND route_s_t.field_name = 'route_short_name' AND (ad.route_id = route_s_t.record_id OR ad.route_short_name = route_s_t.field_value) ) -LEFT JOIN "${opt.schema}".translations route_l_t ON ( +LEFT JOIN translations route_l_t ON ( route_l_t.table_name = 'routes' AND route_l_t.field_name = 'route_long_name' AND (ad.route_id = route_l_t.record_id OR ad.route_long_name = route_l_t.field_value) ) -LEFT JOIN "${opt.schema}".translations trip_t ON ( +LEFT JOIN translations trip_t ON ( trip_t.table_name = 'trips' AND trip_t.field_name = 'trip_headsign' AND (ad.trip_id = trip_t.record_id OR ad.trip_headsign = trip_t.field_value) ) -LEFT JOIN "${opt.schema}".translations stop_t ON ( +LEFT JOIN translations stop_t ON ( stop_t.table_name = 'stops' AND stop_t.field_name = 'stop_name' AND (ad.stop_id = stop_t.record_id OR ad.stop_name = stop_t.field_value) ) -LEFT JOIN "${opt.schema}".translations station_t ON ( +LEFT JOIN translations station_t ON ( station_t.table_name = 'stops' AND station_t.field_name = 'stop_name' AND station_t.language = stop_t.language AND (ad.station_id = station_t.record_id OR ad.station_name = station_t.field_value) ) -LEFT JOIN "${opt.schema}".translations stop_times_t ON ( +LEFT JOIN translations stop_times_t ON ( stop_times_t.table_name = 'stop_times' AND stop_times_t.field_name = 'stop_headsign' AND ( (ad.trip_id = stop_times_t.record_id AND ad.stop_sequence = stop_times_t.record_sub_id::integer) @@ -514,7 +514,7 @@ LEFT JOIN "${opt.schema}".translations stop_times_t ON ( ) ); -CREATE OR REPLACE VIEW "${opt.schema}".connections_translated AS +CREATE OR REPLACE VIEW connections_translated AS SELECT -- almost all columns, duh -- todo: find a way to use all columns without explicitly enumerating them here @@ 
-561,46 +561,46 @@ SELECT to_station_id, coalesce(to_station.translation, to_station_name) as to_station_name, to_station.language as to_station_name_lang -- todo: fall back to feed_info.feed_lang? -FROM "${opt.schema}".connections c -LEFT JOIN "${opt.schema}".translations route_s_t ON ( +FROM connections c +LEFT JOIN translations route_s_t ON ( route_s_t.table_name = 'routes' AND route_s_t.field_name = 'route_short_name' AND (c.route_id = route_s_t.record_id OR c.route_short_name = route_s_t.field_value) ) -LEFT JOIN "${opt.schema}".translations route_l_t ON ( +LEFT JOIN translations route_l_t ON ( route_l_t.table_name = 'routes' AND route_l_t.field_name = 'route_long_name' AND (c.route_id = route_l_t.record_id OR c.route_long_name = route_l_t.field_value) ) -LEFT JOIN "${opt.schema}".translations trip_t ON ( +LEFT JOIN translations trip_t ON ( trip_t.table_name = 'trips' AND trip_t.field_name = 'trip_headsign' AND (c.trip_id = trip_t.record_id OR c.trip_headsign = trip_t.field_value) ) -LEFT JOIN "${opt.schema}".translations from_stop ON ( +LEFT JOIN translations from_stop ON ( from_stop.table_name = 'stops' AND from_stop.field_name = 'stop_name' AND (c.from_stop_id = from_stop.record_id OR c.from_stop_name = from_stop.field_value) ) -LEFT JOIN "${opt.schema}".translations from_station ON ( +LEFT JOIN translations from_station ON ( from_station.table_name = 'stops' AND from_station.field_name = 'stop_name' AND from_station.language = from_stop.language AND (c.from_station_id = from_station.record_id OR c.from_station_name = from_station.field_value) ) -LEFT JOIN "${opt.schema}".translations to_stop ON ( +LEFT JOIN translations to_stop ON ( to_stop.table_name = 'stops' AND to_stop.field_name = 'stop_name' AND to_stop.language = from_stop.language AND (c.to_stop_id = to_stop.record_id OR c.to_stop_name = to_stop.field_value) ) -LEFT JOIN "${opt.schema}".translations to_station ON ( +LEFT JOIN translations to_station ON ( to_station.table_name = 'stops' AND to_station.field_name = 'stop_name' AND to_station.language = from_stop.language AND (c.to_station_id = to_station.record_id OR c.to_station_name = to_station.field_value) ) -LEFT JOIN "${opt.schema}".translations from_stop_times_t ON ( +LEFT JOIN translations from_stop_times_t ON ( from_stop_times_t.table_name = 'stop_times' AND from_stop_times_t.field_name = 'stop_headsign' AND ( (c.trip_id = from_stop_times_t.record_id AND c.from_stop_sequence = from_stop_times_t.record_sub_id::integer) OR c.from_stop_headsign = from_stop_times_t.field_value ) ) -LEFT JOIN "${opt.schema}".translations to_stop_times_t ON ( +LEFT JOIN translations to_stop_times_t ON ( to_stop_times_t.table_name = 'stop_times' AND to_stop_times_t.field_name = 'stop_headsign' AND ( (c.trip_id = to_stop_times_t.record_id AND c.to_stop_sequence = to_stop_times_t.record_sub_id::integer) diff --git a/lib/trips.js b/lib/trips.js index 599ad9e..ac72d2a 100644 --- a/lib/trips.js +++ b/lib/trips.js @@ -2,38 +2,38 @@ // https://gtfs.org/documentation/schedule/reference/#tripstxt const beforeAll = (opt) => `\ -CREATE TYPE "${opt.schema}".wheelchair_accessibility AS ENUM ( +CREATE TYPE wheelchair_accessibility AS ENUM ( 'unknown' -- 0 or empty - No accessibility information for the trip. , 'accessible' -- 1 – Vehicle being used on this particular trip can accommodate at least one rider in a wheelchair. , 'not_accessible' -- 2 – No riders in wheelchairs can be accommodated on this trip. 
); -CREATE CAST ("${opt.schema}".wheelchair_accessibility AS text) WITH INOUT AS IMPLICIT; +CREATE CAST (wheelchair_accessibility AS text) WITH INOUT AS IMPLICIT; -CREATE TYPE "${opt.schema}".bikes_allowance AS ENUM ( +CREATE TYPE bikes_allowance AS ENUM ( 'unknown' -- 0 or empty - No bike information for the trip. , 'allowed' -- 1 – Vehicle being used on this particular trip can accommodate at least one bicycle. , 'not_allowed' -- 2 – No bicycles are allowed on this trip. ); -CREATE CAST ("${opt.schema}".bikes_allowance AS text) WITH INOUT AS IMPLICIT; +CREATE CAST (bikes_allowance AS text) WITH INOUT AS IMPLICIT; -CREATE TABLE "${opt.schema}".trips ( +CREATE TABLE trips ( trip_id TEXT PRIMARY KEY, route_id TEXT NOT NULL, - FOREIGN KEY (route_id) REFERENCES "${opt.schema}".routes, - service_id TEXT NOT NULL, -- references "${opt.schema}".service_days.service_id + FOREIGN KEY (route_id) REFERENCES routes, + service_id TEXT NOT NULL, -- references service_days.service_id trip_headsign TEXT, trip_short_name TEXT, direction_id INT, block_id TEXT, shape_id TEXT, -- todo: add NOT NULL? - ${opt.tripsWithoutShapeId ? '' : `CONSTRAINT valid_shape_id CHECK ("${opt.schema}".shape_exists(shape_id)),`} + ${opt.tripsWithoutShapeId ? '' : `CONSTRAINT valid_shape_id CHECK (shape_exists(shape_id)),`} -- todo [breaking]: use 0/unknown for empty values - wheelchair_accessible "${opt.schema}".wheelchair_accessibility, + wheelchair_accessible wheelchair_accessibility, -- todo [breaking]: use 0/unknown for empty values - bikes_allowed "${opt.schema}".bikes_allowance + bikes_allowed bikes_allowance ); -COPY "${opt.schema}".trips ( +COPY trips ( trip_id, route_id, service_id, @@ -81,7 +81,7 @@ const formatTripsRow = (t) => { const afterAll = (opt) => `\ \\. -CREATE INDEX ON "${opt.schema}".trips (route_id); +CREATE INDEX ON trips (route_id); ` module.exports = { diff --git a/readme.md b/readme.md index 913aad0..34b0c28 100644 --- a/readme.md +++ b/readme.md @@ -184,12 +184,6 @@ Options: currently running trips over time, by hour. Like --stats-by-route-date, this flag accepts none, view & materialized-view. - --schema The schema to use for the database. Default: public - Even when importing into a schema other than `public`, - a function `public.gtfs_via_postgres_import_version()` - gets created, to ensure that multiple imports into the - same database are all made using the same version. See - also multiple-datasets.md in the docs. 
--import-metadata Create functions returning import metadata: - gtfs_data_imported_at (timestamp with time zone) - gtfs_via_postgres_version (text) diff --git a/test/amtrak-gtfs-2021-10-06.sh b/test/amtrak-gtfs-2021-10-06.sh index 79ef49b..809c8e2 100755 --- a/test/amtrak-gtfs-2021-10-06.sh +++ b/test/amtrak-gtfs-2021-10-06.sh @@ -14,7 +14,7 @@ ls -lh amtrak-gtfs-2021-10-06 psql -c 'create database amtrak_2021_10_06' export PGDATABASE='amtrak_2021_10_06' -../cli.js -d --trips-without-shape-id --schema amtrak \ +../cli.js -d --trips-without-shape-id \ --import-metadata \ --stats-by-route-date=view \ --stats-by-agency-route-stop-hour=view \ @@ -24,7 +24,7 @@ export PGDATABASE='amtrak_2021_10_06' query=$(cat << EOF select extract(epoch from t_arrival)::integer as t_arrival -from amtrak.arrivals_departures +from arrivals_departures where stop_id = 'BHM' -- Birmingham and date = '2021-11-26' order by t_arrival @@ -59,7 +59,7 @@ fi acelaStatQuery=$(cat << EOF SELECT nr_of_trips, nr_of_arrs_deps -FROM amtrak.stats_by_route_date +FROM stats_by_route_date WHERE route_id = '40751' -- Acela AND date = '2021-11-26' AND is_effective = True @@ -73,7 +73,7 @@ fi acelaPhillyStatQuery=$(cat << EOF SELECT nr_of_arrs -FROM amtrak.stats_by_agency_route_stop_hour +FROM stats_by_agency_route_stop_hour WHERE route_id = '40751' -- Acela AND stop_id = 'PHL' -- Philadelphia AND effective_hour = '2022-07-24 09:00:00-05:00' diff --git a/test/index.sh b/test/index.sh index eccf7f6..d8b7194 100755 --- a/test/index.sh +++ b/test/index.sh @@ -14,6 +14,5 @@ psql -t -c 'SELECT version()' ./routes-without-agency-id.sh ./stops-without-level-id.sh ./invalid-empty-agency-id.sh -./multiple-schemas.sh echo -e "\n\n✔︎ tests passing" diff --git a/test/multiple-schemas.sh b/test/multiple-schemas.sh deleted file mode 100755 index 7ddabbd..0000000 --- a/test/multiple-schemas.sh +++ /dev/null @@ -1,79 +0,0 @@ -#!/bin/bash - -set -e -set -u -set -o pipefail -cd "$(dirname $0)" -set -x - -env | grep '^PG' || true - -unzip -q -j -n amtrak-gtfs-2021-10-06.zip -d amtrak-gtfs-2021-10-06 -ls -lh amtrak-gtfs-2021-10-06 - -psql -c 'create database multiple_schemas' -export PGDATABASE='multiple_schemas' - -../cli.js -d --trips-without-shape-id \ - --schema one \ - -- amtrak-gtfs-2021-10-06/*.txt \ - | sponge | psql -b - -../cli.js -d --trips-without-shape-id \ - --schema two \ - -- amtrak-gtfs-2021-10-06/*.txt \ - | sponge | psql -b - -# https://dba.stackexchange.com/a/72656 -nr_of_unequal_stops=$(cat << EOF -SELECT count(*) -FROM one.stops a -FULL OUTER JOIN two.stops b ON ( - a.stop_id = b.stop_id -) -WHERE ( - a.stop_code IS DISTINCT FROM b.stop_code - OR a.stop_name IS DISTINCT FROM b.stop_name - OR a.stop_desc IS DISTINCT FROM b.stop_desc - OR a.stop_loc IS DISTINCT FROM b.stop_loc - OR a.zone_id IS DISTINCT FROM b.zone_id - OR a.stop_url IS DISTINCT FROM b.stop_url - OR a.location_type::TEXT IS DISTINCT FROM b.location_type::TEXT - OR a.parent_station IS DISTINCT FROM b.parent_station - OR a.stop_timezone IS DISTINCT FROM b.stop_timezone - OR a.wheelchair_boarding::TEXT IS DISTINCT FROM b.wheelchair_boarding::TEXT - OR a.level_id IS DISTINCT FROM b.level_id - OR a.platform_code IS DISTINCT FROM b.platform_code -) -EOF -) - -unequal_stops_1=$(psql --csv -t -c "$nr_of_unequal_stops" | head -n 1) -if [[ "$unequal_stops_1" -ne 0 ]]; then - 1>&2 echo "$unequal_stops_1 unequal stops between one.stops & two.stops" - exit 1 -fi - -# todo: assert that more tables are equal? 
- -# put an incompatible version -psql -c "$(cat << EOF -CREATE OR REPLACE FUNCTION public.gtfs_via_postgres_import_version() -RETURNS TEXT -AS \$\$ - SELECT '0.1.2' -\$\$ -LANGUAGE SQL -EOF -)" - -# expect another import to fail -if ../cli.js -d --trips-without-shape-id \ - --schema three \ - -- amtrak-gtfs-2021-10-06/*.txt \ - | sponge | psql -b; then - 1>&2 echo "re-import with incompatible version didn't fail" - exit 1 -fi - -echo 'works ✔' From 4f50a200da8a7ac62dec18adaf4902cedf2e672f Mon Sep 17 00:00:00 2001 From: Jannis R Date: Fri, 27 Jun 2025 14:57:16 +0200 Subject: [PATCH 10/16] =?UTF-8?q?DuckDB=20rewrite:=20port=20import=20logic?= =?UTF-8?q?=20to=20DuckDB,=20adapt=20tests,=20docs=20&=20CI=20=F0=9F=92=A5?= =?UTF-8?q?=F0=9F=93=9D=E2=9C=85=F0=9F=92=9A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/test.yml | 32 +- .gitignore | 5 + Dockerfile | 7 +- .../arrs_deps_by_route_name_and_time.sql | 4 +- benchmark/as-md.js | 56 +- benchmark/index.cjs | 125 ++ benchmark/index.sql | 110 -- benchmark/init.sh | 3 +- benchmark/stops_by_distance.sql | 2 +- cli.js | 37 +- docs/analysis/active-trips-by-hour.md | 2 +- docs/analysis/feed-by-route-date.md | 2 +- docs/import-metadata.md | 2 +- docs/multiple-datasets.md | 38 +- example.sh | 16 +- index.js | 115 +- lib/agency.js | 57 +- lib/calendar.js | 94 +- lib/calendar_dates.js | 75 +- lib/columns.js | 39 + lib/deps.js | 26 +- lib/feed_info.js | 75 +- lib/frequencies.js | 148 +- lib/get.js | 5 + lib/import_metadata.js | 39 +- lib/index.js | 5 +- lib/levels.js | 50 +- lib/pathways.js | 120 +- lib/prerequisites.js | 126 +- lib/routes.js | 107 +- lib/rows-count.js | 15 + lib/run.js | 5 + lib/service_days.js | 39 +- lib/shapes.js | 111 +- lib/stats_active_trips_by_hour.js | 46 +- lib/stats_by_agency_route_stop_hour.js | 26 +- lib/stats_by_route_date.js | 26 +- lib/stop_times.js | 567 ++++---- lib/stops.js | 208 ++- lib/transfers.js | 99 +- lib/translations.js | 1289 +++++++++-------- lib/trips.js | 117 +- lib/util.js | 12 - package.json | 10 +- readme.md | 112 +- test/amtrak-gtfs-2021-10-06.sh | 25 +- test/calendar-dates-only.sh | 15 +- test/index.sh | 3 +- test/invalid-empty-agency-id.sh | 9 +- test/multiple-datasets.sh | 138 ++ test/routes-without-agency-id.sh | 4 +- test/sample-gtfs-feed.sh | 56 +- test/stops-without-level-id.sh | 10 +- 53 files changed, 2405 insertions(+), 2059 deletions(-) create mode 100755 benchmark/index.cjs delete mode 100644 benchmark/index.sql create mode 100644 lib/columns.js create mode 100644 lib/get.js create mode 100644 lib/rows-count.js create mode 100644 lib/run.js delete mode 100644 lib/util.js create mode 100755 test/multiple-datasets.sh diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index daf5025..b805645 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -19,11 +19,6 @@ jobs: node-version: - '22.x' - '24.x' - postgis-docker-tag: - - '14-3.5-alpine' - - '15-3.5-alpine' - - '16-3.5-alpine' - - '17-3.5-alpine' steps: - name: checkout @@ -32,35 +27,14 @@ jobs: uses: actions/setup-node@v4 with: node-version: ${{ matrix.node-version }} - - name: install sponge (moreutils) - run: sudo apt install -y moreutils - - name: install & start PostgreSQL with PostGIS - # todo: currently, it uses mdillon, which doesn't have PostgreSQL 14 - # uses: huaxk/postgis-action@v1 - # with: - # postgresql version: '${{ matrix.postgis-docker-tag }}' - # postgresql password: password - # postgresql user: postgres - # postgresql db: 
postgres + - name: install DuckDB run: | - docker run -d \ - -e POSTGRES_USER=$PGUSER -e POSTGRES_PASSWORD=$PGPASSWORD -e POSTGRES_DB=$PGDATABASE \ - -p 5432:5432 postgis/postgis:${{ matrix.postgis-docker-tag }} \ - -c timezone=Europe/Berlin - env: - PGUSER: postgres - PGPASSWORD: password - PGDATABASE: postgres + curl -fsSL -U '${{ github.repository }} CI' 'https://install.duckdb.org' | sh + export PATH="$HOME/.duckdb/cli/latest:$PATH" - run: npm install - run: npm run lint - name: npm test run : npm test - env: - PGHOST: localhost - PGPORT: '5432' - PGUSER: postgres - PGPASSWORD: password - PGDATABASE: postgres diff --git a/.gitignore b/.gitignore index e9d5a74..62aa85a 100644 --- a/.gitignore +++ b/.gitignore @@ -10,8 +10,13 @@ pnpm-debug.log /shrinkwrap.yaml /test/amtrak-gtfs-2021-10-06 +/test/*.duckdb /*.gtfs /*.gtfs.zip /*.gtfs.tar.gz /*.gtfs.tar.zst + +/*.duckdb +/*.duckdb.gz +/*.duckdb.br diff --git a/Dockerfile b/Dockerfile index 38e989a..55b5bea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,16 +9,11 @@ LABEL org.opencontainers.image.licenses="(Apache-2.0 AND Prosperity-3.0.0)" WORKDIR /app -# Both moreutils (providing sponge) and postgresql-client (providing psql) are not required but come in handy for users. -RUN apk add --no-cache \ - postgresql-client \ - moreutils - ADD package.json /app RUN npm install --production && npm cache clean --force ADD . /app -RUN ln -s /app/cli.js /usr/local/bin/gtfs-via-postgres +RUN ln -s /app/cli.js /usr/local/bin/gtfs-via-duckdb VOLUME /gtfs WORKDIR /gtfs diff --git a/benchmark/arrs_deps_by_route_name_and_time.sql b/benchmark/arrs_deps_by_route_name_and_time.sql index a669107..88d63ad 100644 --- a/benchmark/arrs_deps_by_route_name_and_time.sql +++ b/benchmark/arrs_deps_by_route_name_and_time.sql @@ -2,5 +2,5 @@ SELECT * FROM arrivals_departures WHERE route_short_name = 'S1' AND t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' -AND date >= dates_filter_min('2022-08-09T07:10:00+02') -AND date <= dates_filter_max('2022-08-09T07:30:00+02') +AND date >= dates_filter_min('2022-08-09T07:10:00+02'::timestamp with time zone) +AND date <= dates_filter_max('2022-08-09T07:30:00+02'::timestamp with time zone) diff --git a/benchmark/as-md.js b/benchmark/as-md.js index 2765c3c..017d1d3 100755 --- a/benchmark/as-md.js +++ b/benchmark/as-md.js @@ -1,39 +1,31 @@ #!/usr/bin/env node -const {pipeline, Transform} = require('stream') -const csvParser = require('csv-parser') -const {ok} = require('assert') +const {createInterface} = require('node:readline') -let firstRow = true +const linewise = createInterface({ + input: process.stdin, + // Note: We use the crlfDelay option to recognize all instances of CR LF as a single line break. + crlfDelay: Infinity, +}) -pipeline( - process.stdin, - csvParser(), - new Transform({ - objectMode: true, - transform: function (row, _, cb) { - if (firstRow) { - firstRow = false +;(async () => { + let firstRow = true + for await (const line of linewise) { + const row = JSON.parse(line) - const keys = Object.keys(row).filter(key => key !== 'filename') - process.stdout.write(`| ${keys.join(' | ')} |\n`) - process.stdout.write(`| ${keys.map(_ => '-').join(' | ')} |\n`) - } + if (firstRow) { + firstRow = false - const formattedVals = Object.entries(row) - .map(([key, val]) => { - if (key === 'query') return '
' + val.replace(/\n/g, '
') + '
' - return val - }) - process.stdout.write(`| ${formattedVals.join(' | ')} |\n`) + const keys = Object.keys(row).filter(key => key !== 'filename') + process.stdout.write(`| ${keys.join(' | ')} |\n`) + process.stdout.write(`| ${keys.map(_ => '-').join(' | ')} |\n`) + } - cb() - }, - }), - process.stdout, - (err) => { - if (!err) return; - console.error(err) - process.exit(1) - }, -) + const formattedVals = Object.entries(row) + .map(([key, val]) => { + if (key === 'query') return '
' + val.replace(/\n/g, '
') + '
' + return typeof val === 'number' && !Number.isInteger(val) ? Math.round(val * 100) / 100 : val + }) + process.stdout.write(`| ${formattedVals.join(' | ')} |\n`) + } +})() diff --git a/benchmark/index.cjs b/benchmark/index.cjs new file mode 100755 index 0000000..0d42a33 --- /dev/null +++ b/benchmark/index.cjs @@ -0,0 +1,125 @@ +#!/usr/bin/env node + +const {parseArgs} = require('node:util') +const {readFile} = require('node:fs/promises') +const {DuckDBInstance} = require('@duckdb/node-api') +const {Bench: Benchmark} = require('tinybench') +const {basename} = require('node:path') + +// adapted from https://stackoverflow.com/a/55297611/1072129 +const quantile = (sorted, q) => { + const pos = (sorted.length - 1) * q + const base = Math.floor(pos) + const rest = pos - base + if (base + 1 < sorted.length) { + return sorted[base] + rest * (sorted[base + 1] - sorted[base]) + } else { + return sorted[base] + } +} + +const { + values: flags, + positionals: args, +} = parseArgs({ + options: { + 'help': { + type: 'boolean', + short: 'h', + }, + }, + allowPositionals: true, +}) + +if (flags.help) { + process.stdout.write(` +Usage: + benchmark [options] [--] ... +\n`) + process.exit(0) +} + +;(async () => { + +const [pathToDb, ...queryFiles] = args +if (!pathToDb) { + console.error('you must pass the path to a DuckDB db file') + process.exit(1) +} +if (queryFiles.length === 0) { + console.error('you must pass >0 SQL files') + process.exit(1) +} +const instance = await DuckDBInstance.create(pathToDb, { + access_mode: 'READ_ONLY', +}) +const db = await instance.connect() + +await db.run(`\ +INSTALL spatial; +LOAD spatial; +`) + +const queriesByName = new Map() +const benchmark = new Benchmark({ + // - The default minimum number of iterations is too high. + // - The default minimum time is too low. 
+ warmup: true, + warmupIterations: 1, + warmupTime: 5000, // 5s + iterations: 3, + time: 10000, // 10s +}) +await Promise.all( + queryFiles + .filter(queryFile => queryFile.slice(-9) !== '.skip.sql') + .map(async (queryFile) => { + const name = basename(queryFile) + const query = await readFile(queryFile, {encoding: 'utf8'}) + queriesByName.set(name, query) + benchmark.add(name, async () => { + await db.run(query) + }) + }), +) + +// do all queries once, to make sure they work +for (const [name, query] of queriesByName.entries()) { + try { + await db.run(query) + } catch (err) { + err.benchmark = name + err.query = query + throw err + } +} + +benchmark.addEventListener('cycle', (ev) => { + const {task} = ev + const query = queriesByName.get(task.name) + if ('error' in task.result) { + console.error(task.result) + process.exit(1) + } + const samples = Array.from(task.result.samples).sort() + console.log(JSON.stringify({ + query, + avg: task.result.latency.mean, + min: task.result.latency.min, + p25: quantile(samples, .25), + p50: task.result.latency.p50, + p75: task.result.latency.p75, + p95: quantile(samples, .95), + p99: task.result.latency.p99, + max: task.result.latency.max, + iterations: task.result.samples.length, + })) +}) + +await benchmark.run() + +})() +.catch((err) => { + console.error(err) + process.exit(1) +}) diff --git a/benchmark/index.sql b/benchmark/index.sql deleted file mode 100644 index 9d0371f..0000000 --- a/benchmark/index.sql +++ /dev/null @@ -1,110 +0,0 @@ -BEGIN; -CREATE TEMP TABLE _benchmark ( - filename TEXT, - query TEXT, - avg FLOAT, - min FLOAT, - p25 FLOAT, - p50 FLOAT, - p75 FLOAT, - p95 FLOAT, - p99 FLOAT, - max FLOAT, - iterations INTEGER -); - --- slightly modified from "How to benchmark PostgreSQL queries well" --- https://www.tangramvision.com/blog/how-to-benchmark-postgresql-queries-well#sql-function-with-clock_timestamp -CREATE OR REPLACE FUNCTION bench(_filename TEXT, _query TEXT, _iterations INTEGER) -RETURNS void -AS $$ -DECLARE - _warmup_iterations INTEGER; - _start TIMESTAMPTZ; - _end TIMESTAMPTZ; - _delta DOUBLE PRECISION; -BEGIN - CREATE TEMP TABLE IF NOT EXISTS _bench_results ( - elapsed DOUBLE PRECISION - ); - - -- Warm the cache - _warmup_iterations = GREATEST(3, _iterations / 10); - FOR i IN 1.._warmup_iterations LOOP - EXECUTE _query; - END LOOP; - - FOR i IN 1.._iterations LOOP - _start = clock_timestamp(); - EXECUTE _query; - _end = clock_timestamp(); - _delta = 1000 * (extract(epoch from _end) - extract(epoch from _start)); - INSERT INTO _bench_results VALUES (_delta); - END LOOP; - - INSERT INTO _benchmark - SELECT - _filename, - _query, - round(avg(elapsed)::numeric, 0), - min(elapsed), - round((percentile_cont(0.25) WITHIN GROUP (ORDER BY elapsed))::numeric, 0), - round((percentile_cont(0.50) WITHIN GROUP (ORDER BY elapsed))::numeric, 0), - round((percentile_cont(0.75) WITHIN GROUP (ORDER BY elapsed))::numeric, 0), - round((percentile_cont(0.95) WITHIN GROUP (ORDER BY elapsed))::numeric, 0), - round((percentile_cont(0.99) WITHIN GROUP (ORDER BY elapsed))::numeric, 0), - max(elapsed), - _iterations - FROM _bench_results; - - DROP TABLE _bench_results; -END -$$ -LANGUAGE plpgsql; - --- We aim for ~4s per benchmark, but take more time for slow benchmarks. --- Apple Silicon M2, most queries seem to be single-threaded. 
-\set query `cat arrs_deps_by_non_existent_stop.sql` -SELECT bench('arrs_deps_by_non_existent_stop.sql', :'query', 500); -\set query `cat arrs_deps_by_route_name_and_time.sql` -SELECT bench('arrs_deps_by_route_name_and_time.sql', :'query', 90); -\set query `cat arrs_deps_by_station_and_time.sql` -SELECT bench('arrs_deps_by_station_and_time.sql', :'query', 170); -\set query `cat arrs_deps_by_station_and_time_seq_0.sql` -SELECT bench('arrs_deps_by_station_and_time_seq_0.sql', :'query', 500); -\set query `cat arrs_deps_by_stop.sql` -SELECT bench('arrs_deps_by_stop.sql', :'query', 50); -\set query `cat arrs_deps_by_stop_and_time.sql` -SELECT bench('arrs_deps_by_stop_and_time.sql', :'query', 400); -\set query `cat arrs_deps_by_time.sql` -SELECT bench('arrs_deps_by_time.sql', :'query', 5); -\set query `cat arrs_deps_by_time_manual.sql` -SELECT bench('arrs_deps_by_time_manual.sql', :'query', 5); -\set query `cat arrs_deps_by_trip_and_date.sql` -SELECT bench('arrs_deps_by_trip_and_date.sql', :'query', 500); -\set query `cat connections_by_non_existent_stop.sql` -SELECT bench('connections_by_non_existent_stop.sql', :'query', 500); -\set query `cat connections_by_route_name_and_time.sql` -SELECT bench('connections_by_route_name_and_time.sql', :'query', 20); -\set query `cat connections_by_station_and_time.sql` -SELECT bench('connections_by_station_and_time.sql', :'query', 50); -\set query `cat connections_by_station_and_time_seq_0.sql` -SELECT bench('connections_by_station_and_time_seq_0.sql', :'query', 300); -\set query `cat connections_by_stop.sql` -SELECT bench('connections_by_stop.sql', :'query', 40); -\set query `cat connections_by_stop_and_time.sql` -SELECT bench('connections_by_stop_and_time.sql', :'query', 200); -\set query `cat connections_by_time.sql` -SELECT bench('connections_by_time.sql', :'query', 3); -\set query `cat connections_by_time_manual.sql` -SELECT bench('connections_by_time_manual.sql', :'query', 3); -\set query `cat connections_by_trip_and_date.sql` -SELECT bench('connections_by_trip_and_date.sql', :'query', 500); -\set query `cat stats_by_route_date.sql` -SELECT bench('stats_by_route_date.sql', :'query', 5); -\set query `cat stops_by_distance.sql` -SELECT bench('stops_by_distance.sql', :'query', 170); - -SELECT * FROM _benchmark; - -ROLLBACK; diff --git a/benchmark/init.sh b/benchmark/init.sh index 1465b89..a0f0a6e 100755 --- a/benchmark/init.sh +++ b/benchmark/init.sh @@ -12,4 +12,5 @@ env | grep '^PG' || true ../cli.js -d \ --stops-location-index --stats-by-route-date=view \ - ../vbb-2022-07-01.gtfs/*.csv | sponge | psql -b + vbb-2022-07-01.gtfs.duckdb \ + ../vbb-2022-07-01.gtfs/*.csv diff --git a/benchmark/stops_by_distance.sql b/benchmark/stops_by_distance.sql index fc112f9..ff351c6 100644 --- a/benchmark/stops_by_distance.sql +++ b/benchmark/stops_by_distance.sql @@ -1,4 +1,4 @@ SELECT * FROM stops -ORDER BY ST_Distance(stop_loc::geometry, ST_SetSRID(ST_MakePoint(9.7, 50.547), 4326)) ASC +ORDER BY ST_Distance(stop_loc::geometry, ST_Point(9.7, 50.547)) ASC LIMIT 100 diff --git a/cli.js b/cli.js index 32d3587..b5292bb 100755 --- a/cli.js +++ b/cli.js @@ -66,7 +66,7 @@ const { if (flags.help) { process.stdout.write(` Usage: - gtfs-to-sql [options] [--] ... + import-gtfs-into-duckdb [options] [--] ... Options: --silent -s Don't show files being converted. --require-dependencies -d Require files that the specified GTFS files depend @@ -108,11 +108,14 @@ Options: none, view & materialized-view. 
--import-metadata Create functions returning import metadata: - gtfs_data_imported_at (timestamp with time zone) - - gtfs_via_postgres_version (text) - - gtfs_via_postgres_options (jsonb) + - gtfs_via_duckdb_version (text) + - gtfs_via_duckdb_options (jsonb) +Notes: + If you just want to check if the GTFS data can be imported but don't care about the + resulting DuckDB database file, you can import into an in-memory database by specifying + \`:memory:\` as the . Examples: - gtfs-to-sql some-gtfs/*.txt | sponge | psql -b # import into PostgreSQL - gtfs-to-sql -u -- some-gtfs/*.txt | gzip >gtfs.sql.gz # generate a gzipped SQL dump + import-gtfs-into-duckdb some-gtfs.duckdb some-gtfs/*.txt [1] https://developers.google.com/transit/gtfs/reference/extended-route-types [2] https://groups.google.com/g/gtfs-changes/c/keT5rTPS7Y0/m/71uMz2l6ke0J @@ -126,11 +129,11 @@ if (flags.version) { } const {basename, extname} = require('path') -const {pipeline} = require('stream') const convertGtfsToSql = require('./index') -const DataError = require('./lib/data-error') -const files = args.map((file) => { +const [pathToDb] = args + +const files = args.slice(1).map((file) => { const name = basename(file, extname(file)) return {name, file} }) @@ -155,16 +158,8 @@ if ('lower-case-lang-codes' in flags) { opt.lowerCaseLanguageCodes = flags['lower-case-lang-codes'] } -pipeline( - convertGtfsToSql(files, opt), - process.stdout, - (err) => { - if (!err) return; - if (err instanceof DataError) { - console.error(String(err)) - } else if (err.code !== 'EPIPE') { - console.error(err) - } - process.exit(1) - } -) +convertGtfsToSql(pathToDb, files, opt) +.catch((err) => { + console.error(err) + process.exit(1) +}) diff --git a/docs/analysis/active-trips-by-hour.md b/docs/analysis/active-trips-by-hour.md index 6071b75..da64e3b 100644 --- a/docs/analysis/active-trips-by-hour.md +++ b/docs/analysis/active-trips-by-hour.md @@ -4,7 +4,7 @@ Do you want to know how many trips are running at a specific point in time? `gtfs-via-duckdb` optionally provides a **(materialized) view `stats_active_trips_by_hour` to answer this. Use the `--stats-active-trips-by-hour` flag to enable it**: -- If you run `gtfs-to-sql` with `--stats-active-trips-by-hour=view`, `stats_active_trips_by_hour` will be a "regular" non-materialized view. Use this option if you want to import the GTFS data quickly, and if you only query `stats_active_trips_by_hour` rarely or in time-uncritical scenarios. +- If you run `gtfs-to-duckdb` with `--stats-active-trips-by-hour=view`, `stats_active_trips_by_hour` will be a "regular" non-materialized view. Use this option if you want to import the GTFS data quickly, and if you only query `stats_active_trips_by_hour` rarely or in time-uncritical scenarios. - If you pass `--stats-active-trips-by-hour=materialized-view`, the `stats_active_trips_by_hour` view will [be materialized](https://www.postgresql.org/docs/14/rules-materializedviews.html). Use this option if you need fast queries, and if you can tolerate significantly longer import times (a minute for small feeds, many hours for large feeds). ## example: number of active trips over the course of a day diff --git a/docs/analysis/feed-by-route-date.md b/docs/analysis/feed-by-route-date.md index 6720a94..e7af15a 100644 --- a/docs/analysis/feed-by-route-date.md +++ b/docs/analysis/feed-by-route-date.md @@ -8,7 +8,7 @@ Are you trying to answer a question like those below? 
`gtfs-via-duckdb` optionally provides a **(materialized) view `stats_by_route_date` to help with such SQL queries. Use the `--stats-by-route-date` flag to enable it** in the generated SQL: -- If you run `gtfs-to-sql` with `--stats-by-route-date=view`, `stats_by_route_date` will be a "regular" non-materialized view. Use this option if you want to import the GTFS data quickly, and if you only query `stats_by_route_date` rarely or in time-uncritical scenarios. +- If you run `gtfs-to-duckdb` with `--stats-by-route-date=view`, `stats_by_route_date` will be a "regular" non-materialized view. Use this option if you want to import the GTFS data quickly, and if you only query `stats_by_route_date` rarely or in time-uncritical scenarios. - If you pass `--stats-by-route-date=materialized-view`, the `stats_by_route_date` view will [be materialized](https://www.postgresql.org/docs/14/rules-materializedviews.html). Use this option if you need fast queries, and if you can tolerate significantly longer import times (3m for the 64mb 2023-03-05 SNCB/NMBS GTFS feed, 1h15m for the 540mb 2023-02-27 VBB GTFS feed). `stats_by_route_date` has the following columns: diff --git a/docs/import-metadata.md b/docs/import-metadata.md index 03df3e2..740a028 100644 --- a/docs/import-metadata.md +++ b/docs/import-metadata.md @@ -1,6 +1,6 @@ # import metadata -If you run `gtfs-to-sql` with the `--import-metadata` option, it will create functions providing information about the imported feed as well as the import process. +If you run `gtfs-to-duckdb` with the `--import-metadata` option, it will create functions providing information about the imported feed as well as the import process. An example with the [2023-04-05 VBB GTFS feed](https://vbb-gtfs.jannisr.de/2023-04-05): diff --git a/docs/multiple-datasets.md b/docs/multiple-datasets.md index 40e533d..725e068 100644 --- a/docs/multiple-datasets.md +++ b/docs/multiple-datasets.md @@ -1,3 +1,39 @@ # working with multiple datasets -Using `gtfs-via-postgres`, it is currently *not possible* to import more than one dataset into a single PostgreSQL database. +Using [DuckDB's ability to attach databases to one session](https://duckdb.org/docs/stable/sql/statements/attach), you can run queries combining or comparing data from multiple GTFS datasets. + +As an example, let's compare two datasets from [Paris](https://en.wikipedia.org/wiki/Île-de-France_Mobilités) and [Berlin](https://en.wikipedia.org/wiki/Verkehrsverbund_Berlin-Brandenburg). + +First, we import each into its own database: + +```shell +wget -U 'gtfs-via-duckdb demo' -O paris.gtfs.zip 'https://eu.ftp.opendatasoft.com/stif/GTFS/IDFM-gtfs.zip' +unzip -d paris.gtfs paris.gtfs.zip +gtfs-to-duckdb --require-dependencies \ + paris.gtfs.duckdb \ + paris.gtfs/*.txt + +wget -U 'gtfs-via-duckdb demo' -O berlin.gtfs.zip 'https://www.vbb.de/vbbgtfs' +unzip -d berlin.gtfs berlin.gtfs.zip +gtfs-to-duckdb --require-dependencies \ + berlin.gtfs.duckdb \ + berlin.gtfs/*.txt +``` + +In a new DuckDB shell/session, we can now do queries across both datasets, for example finding the geographically furthest 2 stops: + +```sql +ATTACH 'paris.gtfs.duckdb' AS paris; +ATTACH 'berlin.gtfs.duckdb' AS berlin; + +-- warning: takes a long time to compute! +SELECT + paris.stop_id AS paris_stop_id, + berlin.stop_id AS berlin_stop_id +FROM + paris.stops paris, + berlin.stops berlin +-- todo: does this operator work in DuckDB? 
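+-- If `<->` turns out not to be supported by DuckDB's spatial extension, ST_Distance (as used in
+-- benchmark/stops_by_distance.sql) should work as a slower drop-in for the ordering, e.g.:
+-- ORDER BY ST_Distance(paris.stop_loc::geometry, berlin.stop_loc::geometry) DESC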
+ORDER BY paris.stop_loc <-> berlin.stop_loc DESC +LIMIT 100 +``` diff --git a/example.sh b/example.sh index bfef2d0..86ded12 100755 --- a/example.sh +++ b/example.sh @@ -4,13 +4,15 @@ set -e set -u set -o pipefail -2>&1 echo "importing into PostgreSQL:" +rm -f example.duckdb + +2>&1 echo "importing into example.duckdb:" ./cli.js --ignore-unsupported --require-dependencies --trips-without-shape-id --silent \ - node_modules/sample-gtfs-feed/gtfs/*.txt \ - | sponge | psql -b + example.duckdb \ + node_modules/sample-gtfs-feed/gtfs/*.txt 2>&1 echo "\nfetching a connection during DST switch:" -psql -c "$(cat <<- EOM +duckdb -csv -c "$(cat <<- EOM SELECT trip_id, route_id, from_stop_id, t_departure, @@ -21,10 +23,10 @@ psql -c "$(cat <<- EOM AND t_departure > '2019-03-31T01:55:00+01:00' AND t_departure < '2019-03-31T03:00:00+02:00' -- AND route_id = 'D' -- AND from_stop_id = 'airport' -EOM)" +EOM)" example.duckdb 2>&1 echo "\nfetching the departure at the same time:" -psql -c "$(cat <<- EOM +duckdb -csv -c "$(cat <<- EOM SELECT trip_id, route_id, stop_id, t_departure, @@ -34,4 +36,4 @@ psql -c "$(cat <<- EOM AND t_departure > '2019-03-31T01:55:00+01:00' AND t_departure < '2019-03-31T03:00:00+02:00' -- AND route_id = 'D' -- AND stop_id = 'airport' -EOM)" +EOM)" example.duckdb diff --git a/index.js b/index.js index f169d2f..317f3d9 100644 --- a/index.js +++ b/index.js @@ -1,16 +1,19 @@ 'use strict' -const debug = require('debug')('gtfs-via-postgres') -const {randomBytes} = require('crypto') +const createDebug = require('debug') const sequencify = require('sequencify') -const {inspect} = require('util') -const readCsv = require('gtfs-utils/read-csv') -const {Stringifier} = require('csv-stringify') +const {DuckDBInstance} = require('@duckdb/node-api') const formatters = require('./lib') const getDependencies = require('./lib/deps') -const pkg = require('./package.json') +const RUN = require('./lib/run.js') +const GET = require('./lib/get.js') + +const debug = createDebug('gtfs-via-duckdb') +const debugSql = createDebug('gtfs-via-duckdb:sql') + +const convertGtfsToSql = async (pathToDb, files, opt = {}) => { + debug('pathToDb', pathToDb) -const convertGtfsToSql = async function* (files, opt = {}) { opt = { silent: false, // todo [breaking]: make the default! @@ -46,21 +49,17 @@ const convertGtfsToSql = async function* (files, opt = {}) { debug('files', files) const fileNames = files.map(f => f.name) + opt.files = fileNames const deps = getDependencies(opt, fileNames) debug('deps', deps) const tasks = { // file name -> [dep name] - 'is_valid_lang_code': { + 'valid_lang_codes': { dep: [], }, - 'is_timezone': { + 'valid_timezones': { dep: [], }, - ...(tripsWithoutShapeId ? 
{} : { - 'shape_exists': { - dep: [...deps.shape_exists], - }, - }), // special handling of calendar/calendar_dates: // service_days relies on *both* calendar's & calendar_dates' tables to @@ -133,19 +132,35 @@ const convertGtfsToSql = async function* (files, opt = {}) { opt.importStart = Date.now() - yield `\ --- GTFS SQL dump generated by ${pkg.name} v${pkg.version} --- ${pkg.homepage} --- options: -${inspect(opt, {compact: false}).split('\n').map(line => '-- ' + line).join('\n')} - -\\set ON_ERROR_STOP on -CREATE EXTENSION IF NOT EXISTS postgis; -BEGIN; + const instance = await DuckDBInstance.create(pathToDb) + const db = await instance.connect() + db[RUN] = async (query, ...args) => { + debugSql('db[RUN]', query, ...args) + try { + return await db.run(query, ...args) + } catch (err) { + err.query = query + err.args = args + throw err + } + } + db[GET] = async (query, ...args) => { + debugSql('db[GET]', query, ...args) + try { + const result = await db.runAndReadAll(query, ...args) + return result.getRowObjects() + } catch (err) { + err.query = query + err.args = args + throw err + } + } -` + await db[RUN](` +-- todo +-- BEGIN TRANSACTION; +`) - const csv = new Stringifier({quoted: true}) const nrOfRowsByName = new Map() const workingState = { nrOfRowsByName, @@ -154,47 +169,29 @@ BEGIN; for (const name of order) { if (!silent) console.error(name) const task = tasks[name] - yield `-- ${name}\n-----------------\n\n` - - const { - beforeAll, - afterAll, - } = formatters[name] - - if ('string' === typeof beforeAll && beforeAll) { - yield beforeAll - } else if ('function' === typeof beforeAll) { - yield beforeAll(opt, workingState) - } - if (task.file) { - const {formatRow} = formatters[name] - let nrOfRows = 0 - for await (const rawRow of await readCsv(task.file)) { - const row = formatRow(rawRow, opt, workingState) - let formattedRow = null - csv.api.__transform(row, (_formattedRow) => { - formattedRow = _formattedRow - }) - yield formattedRow - nrOfRows++ - } + const importData = formatters[name] - nrOfRowsByName.set(name, nrOfRows) - // todo [breaking]: indent with \t - // todo [breaking]: print a summary of all files instead - if (!silent) console.error(` processed ${nrOfRows} rows`) + // calendar's & calendar_dates's importData() should run even if their respective files are not present. + // Also, the frequencies table is needed for stop_times's arrivals_departures & connections views. 
+ if (!task.file && importData.runDespiteMissingSrcFile !== true) { + console.error('skipping!') // todo: remove + continue } - if ('string' === typeof afterAll && afterAll) { - yield afterAll + ';\n' - } else if ('function' === typeof afterAll) { - yield afterAll(opt, workingState) + ';\n' + try { + await importData(db, task.file || null, opt, workingState) + } catch (err) { + err.gtfsFile = name + throw err } } - yield `\ -COMMIT;` + debug('workingState', workingState) + + // todo + // await db[RUN]('COMMIT') + debug('done!') } module.exports = convertGtfsToSql diff --git a/lib/agency.js b/lib/agency.js index 832987e..1208f08 100644 --- a/lib/agency.js +++ b/lib/agency.js @@ -1,50 +1,39 @@ 'use strict' +const RUN = require('./run.js') +const {queryNumberOfRows} = require('./rows-count.js') + // https://gtfs.org/documentation/schedule/reference/#agencytxt -const beforeAll = (opt) => `\ +const importData = async (db, pathToAgency, opt, workingState) => { + await db[RUN](`\ CREATE TABLE agency ( agency_id TEXT PRIMARY KEY, agency_name TEXT NOT NULL, agency_url TEXT NOT NULL, - agency_timezone TEXT NOT NULL - CONSTRAINT valid_timezone CHECK (is_timezone(agency_timezone)), + agency_timezone TEXT NOT NULL REFERENCES valid_timezones (tz), agency_lang TEXT, -- todo: validate? agency_phone TEXT, agency_fare_url TEXT, agency_email TEXT ); -COPY agency ( - agency_id, - agency_name, - agency_url, - agency_timezone, - agency_lang, - agency_phone, - agency_fare_url, - agency_email -) FROM STDIN csv; -` +INSERT INTO agency +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT * +FROM read_csv( + '${pathToAgency}', + header = true, + -- > This option allows you to specify the types that the sniffer will use when detecting CSV column types. + -- > default: SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR + -- We omit BOOLEAN because GTFS just uses integers for boolean-like fields (e.g. timepoint in trips.txt). + -- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently. + auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'] +); +`) -const formatAgencyRow = (a) => { - return [ - a.agency_id || null, - a.agency_name || null, - a.agency_url || null, - a.agency_timezone || null, - a.agency_lang || null, - a.agency_phone || null, - a.agency_fare_url || null, - a.agency_email || null, - ] + workingState.nrOfRowsByName.set('agency', await queryNumberOfRows(db, 'agency', opt)) } -const afterAll = `\ -\\. -` - -module.exports = { - beforeAll, - formatRow: formatAgencyRow, - afterAll, -} +module.exports = importData diff --git a/lib/calendar.js b/lib/calendar.js index f06be15..c8ecbe2 100644 --- a/lib/calendar.js +++ b/lib/calendar.js @@ -1,12 +1,16 @@ 'use strict' +const RUN = require('./run.js') +const {queryNumberOfRows} = require('./rows-count.js') + // https://gtfs.org/documentation/schedule/reference/#calendartxt -const beforeAll = (opt) => `\ +const importData = async (db, pathToCalendar, opt, workingState) => { + await db[RUN](`\ CREATE TYPE availability AS ENUM ( 'not_available' -- 0 – Service is not available for Mondays in the date range. , 'available' -- 1 – Service is available for all Mondays in the date range. 
); -CREATE CAST (availability AS text) WITH INOUT AS IMPLICIT; +-- CREATE CAST (availability AS text) WITH INOUT AS IMPLICIT; CREATE TABLE calendar ( service_id TEXT PRIMARY KEY, @@ -20,48 +24,54 @@ CREATE TABLE calendar ( start_date DATE NOT NULL, end_date DATE NOT NULL ); +`) -COPY calendar ( - service_id, - monday, - tuesday, - wednesday, - thursday, - friday, - saturday, - sunday, - start_date, - end_date -) FROM STDIN csv; -` - -const availability = (val) => { - if (val === '0') return 'not_available' - if (val === '1') return 'available' - throw new Error('invalid availability: ' + val) -} + if (pathToCalendar !== null) { + await db[RUN](`\ +INSERT INTO calendar +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT * REPLACE ( + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. + enum_range(NULL::availability)[monday + 1] AS monday, + enum_range(NULL::availability)[tuesday + 1] AS tuesday, + enum_range(NULL::availability)[wednesday + 1] AS wednesday, + enum_range(NULL::availability)[thursday + 1] AS thursday, + enum_range(NULL::availability)[friday + 1] AS friday, + enum_range(NULL::availability)[saturday + 1] AS saturday, + enum_range(NULL::availability)[sunday + 1] AS sunday, + array_slice(start_date, 0, 4) || '-' || array_slice(start_date, 5, 6) || '-' || array_slice(start_date, 7, 8) AS start_date, + array_slice(end_date, 0, 4) || '-' || array_slice(end_date, 5, 6) || '-' || array_slice(end_date, 7, 8) AS end_date +) +FROM read_csv( + '${pathToCalendar}', + header = true, + -- > This option allows you to specify the types that the sniffer will use when detecting CSV column types. + -- > default: SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR + -- We omit BOOLEAN because GTFS just uses integers for boolean-like fields (e.g. timepoint in trips.txt). + -- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently. + auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'], + types = { + service_id: 'TEXT', + monday: 'UINTEGER', + tuesday: 'UINTEGER', + wednesday: 'UINTEGER', + thursday: 'UINTEGER', + friday: 'UINTEGER', + saturday: 'UINTEGER', + sunday: 'UINTEGER', + start_date: 'TEXT', + end_date: 'TEXT' + } +); +`) + } -const formatCalendarRow = (c) => { - return [ - c.service_id || null, - c.monday ? availability(c.monday) : null, - c.tuesday ? availability(c.tuesday) : null, - c.wednesday ? availability(c.wednesday) : null, - c.thursday ? availability(c.thursday) : null, - c.friday ? availability(c.friday) : null, - c.saturday ? availability(c.saturday) : null, - c.sunday ? availability(c.sunday) : null, - c.start_date, - c.end_date, - ] + workingState.nrOfRowsByName.set('calendar', await queryNumberOfRows(db, 'calendar', opt)) } -const afterAll = `\ -\\. 
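The `enum_range()` indexing and `array_slice()` date-splitting used in `lib/calendar.js` above can be checked in isolation in a DuckDB shell. A minimal sketch with literal values (the `availability` type mirrors the one created by the importer; expected results are noted in the comments):

```sql
CREATE TYPE availability AS ENUM ('not_available', 'available');

SELECT
	-- GTFS encodes weekday availability as 0/1. enum_range() returns the enum's members as a
	-- 1-based list, so `value + 1` picks the matching member.
	enum_range(NULL::availability)[0 + 1] AS monday,  -- not_available
	enum_range(NULL::availability)[1 + 1] AS tuesday, -- available
	-- GTFS dates come as YYYYMMDD; slicing & re-joining yields an ISO 8601 date string.
	array_slice('20220809', 0, 4)
		|| '-' || array_slice('20220809', 5, 6)
		|| '-' || array_slice('20220809', 7, 8) AS start_date; -- 2022-08-09
```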
-` +importData.runDespiteMissingSrcFile = true -module.exports = { - beforeAll, - formatRow: formatCalendarRow, - afterAll, -} +module.exports = importData diff --git a/lib/calendar_dates.js b/lib/calendar_dates.js index 5ee4deb..beb8a67 100644 --- a/lib/calendar_dates.js +++ b/lib/calendar_dates.js @@ -1,50 +1,61 @@ 'use strict' +const RUN = require('./run.js') +const {queryNumberOfRows} = require('./rows-count.js') + // https://gtfs.org/documentation/schedule/reference/#calendar_datestxt -const beforeAll = (opt) => `\ +const importData = async (db, pathToCalendarDates, opt, workingState) => { + await db[RUN](`\ CREATE TYPE exception_type_v AS ENUM ( 'added' -- 1 – Service has been added for the specified date. , 'removed' -- 2 – Service has been removed for the specified date. ); -CREATE CAST (exception_type_v AS text) WITH INOUT AS IMPLICIT; +-- CREATE CAST (exception_type_v AS text) WITH INOUT AS IMPLICIT; CREATE TABLE calendar_dates ( service_id TEXT NOT NULL, "date" DATE NOT NULL, - PRIMARY KEY (service_id, "date"), + CONSTRAINT primary_key PRIMARY KEY (service_id, "date"), exception_type exception_type_v NOT NULL ); +`) + if (pathToCalendarDates !== null) { + await db[RUN](`\ +INSERT INTO calendar_dates +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT * REPLACE ( + array_slice(date, 0, 4) || '-' || array_slice(date, 5, 6) || '-' || array_slice(date, 7, 8) AS date, + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. + enum_range(NULL::exception_type_v)[exception_type] AS exception_type, +) +FROM read_csv( + '${pathToCalendarDates}', + header = true, + -- > This option allows you to specify the types that the sniffer will use when detecting CSV column types. + -- > default: SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR + -- We omit BOOLEAN because GTFS just uses integers for boolean-like fields (e.g. timepoint in trips.txt). + -- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently. + auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'], + types = { + service_id: 'TEXT', + date: 'TEXT', + exception_type: 'UINTEGER' + } +); +`) + } -COPY calendar_dates ( - service_id, - date, - exception_type -) FROM STDIN csv; -` - -const exceptionType = (val) => { - if (val === '1') return 'added' - if (val === '2') return 'removed' - throw new Error('invalid exception_type: ' + val) -} + await db[RUN](`\ +CREATE INDEX calendar_dates_service_id ON calendar_dates (service_id); +CREATE INDEX calendar_dates_exception_type ON calendar_dates (exception_type); +`) -const formatCalendarDatesRow = (e) => { - return [ - e.service_id || null, - e.date, - e.exception_type ? exceptionType(e.exception_type) : null, - ] + workingState.nrOfRowsByName.set('calendar_dates', await queryNumberOfRows(db, 'calendar_dates', opt)) } -const afterAll = (opt) => `\ -\\. 
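Both `lib/calendar.js` and `lib/calendar_dates.js` above rely on `INSERT INTO … BY NAME`, which matches the SELECT-ed columns to the target table by name instead of by position; this is what lets a GTFS file omit optional columns or order them differently. A self-contained sketch (hypothetical table, not part of the import code):

```sql
CREATE TABLE example_calendar_dates (
	service_id TEXT NOT NULL,
	"date" DATE NOT NULL,
	exception_type INTEGER -- stays NULL if the source provides no such column
);

-- The columns appear in a different order than in the table definition, and
-- exception_type is missing entirely; BY NAME matches whatever is there.
INSERT INTO example_calendar_dates BY NAME
SELECT
	DATE '2022-08-09' AS "date",
	'service-1' AS service_id;

SELECT * FROM example_calendar_dates;
-- service-1 | 2022-08-09 | NULL
```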
- -CREATE INDEX ON calendar_dates (service_id); -CREATE INDEX ON calendar_dates (exception_type); -` +importData.runDespiteMissingSrcFile = true -module.exports = { - beforeAll, - formatRow: formatCalendarDatesRow, - afterAll, -} +module.exports = importData diff --git a/lib/columns.js b/lib/columns.js new file mode 100644 index 0000000..f60547a --- /dev/null +++ b/lib/columns.js @@ -0,0 +1,39 @@ +'use strict' + +const GET = require('./get.js') +const {queryNumberOfRows} = require('./rows-count.js') + +// https://gtfs.org/documentation/schedule/reference/#stop_timestxt +const queryFileColumns = async (db, pathToFile) => { + const columns = await db[GET]( + `\ + DESCRIBE ( + SELECT * + FROM read_csv( + $1, + header = true + ) + LIMIT 1 + ) +`, + [pathToFile], + ) + return columns +} + +const queryIfColumnsExist = async (db, pathToFile, columns) => { + const res = Object.create(null) + const existing = new Set( + (await queryFileColumns(db, pathToFile)) + .map(col => col.column_name), + ) + for (const column of columns) { + res[column] = existing.has(column) + } + return res +} + +module.exports = { + queryFileColumns, + queryIfColumnsExist, +} diff --git a/lib/deps.js b/lib/deps.js index b2b9d66..b744c1c 100644 --- a/lib/deps.js +++ b/lib/deps.js @@ -7,14 +7,11 @@ const getDependencies = (opt, files) => { stopsWithoutLevelId, } = opt return { - shape_exists: [ - 'shapes', - ], agency: [ - 'is_timezone', + 'valid_timezones', ], stops: [ - 'is_timezone', + 'valid_timezones', ...(stopsWithoutLevelId ? [] : ['levels']), ], transfers: [ @@ -32,7 +29,7 @@ const getDependencies = (opt, files) => { trips: [ 'routes', 'service_days', - ...(tripsWithoutShapeId ? [] : ['shapes', 'shape_exists']), + ...(tripsWithoutShapeId ? [] : ['shapes']), ], frequencies: [ 'trips', @@ -41,10 +38,10 @@ const getDependencies = (opt, files) => { 'stops', ], feed_info: [ - 'is_valid_lang_code', + 'valid_lang_codes', ], translations: [ - 'is_valid_lang_code', + 'valid_lang_codes', // > table_name // > Defines the dataset table that contains the field to be translated. The following values are allowed: // > agency @@ -60,14 +57,14 @@ const getDependencies = (opt, files) => { // todo: respect opt.*! // these are soft dependencies, they are not depended upon, they must only be imported first // todo: only specify dependencies here if the files are not in use + + // these are required files anyways 'agency', 'stops', 'routes', 'trips', - ...(files.includes('stop_times') - ? ['stop_times'] - : [] - ), + 'stop_times', + // these are optional, so we only depend on them if they are present ...(files.includes('feed_info') ? ['feed_info'] : [] @@ -80,7 +77,12 @@ const getDependencies = (opt, files) => { ? ['levels'] : [] ), + ...(files.includes('calendar') ? ['calendar'] : []), + ...(files.includes('calendar_dates') ? ['calendar_dates'] : []), // not supported yet: attributions + // not supported yet: fare_attributes/fare_rules + // not supported yet: frequencies + // not supported yet: transfers ], } } diff --git a/lib/feed_info.js b/lib/feed_info.js index ad08de1..d865e3a 100644 --- a/lib/feed_info.js +++ b/lib/feed_info.js @@ -1,7 +1,10 @@ 'use strict' +const RUN = require('./run.js') + // https://gtfs.org/documentation/schedule/reference/#feed_infotxt -const beforeAll = (opt) => `\ +const importData = async (db, pathToFeedInfo, opt, workingState) => { + await db[RUN](`\ -- The MobilityData GTFS Validator just uses Java's Locale#toLanguageTag() to validate *_lang. 
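`lib/columns.js` above detects which optional columns a GTFS file actually contains by running `DESCRIBE` over a `read_csv()` subquery. The same check can be done by hand; a sketch with a hypothetical file path:

```sql
-- One result row per column the CSV sniffer found in the file (column_name, column_type, …).
DESCRIBE (
	SELECT *
	FROM read_csv('some-gtfs/frequencies.txt', header = true)
	LIMIT 1
);
-- If e.g. `exact_times` does not appear among the column_name values, the importer treats it
-- as absent and SELECTs NULL for it instead (see lib/frequencies.js below).
```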
-- https://github.com/MobilityData/gtfs-validator/blob/31ff374800f7d7883fd9de91b71049c2a4de4e45/main/src/main/java/org/mobilitydata/gtfsvalidator/validator/MatchingFeedAndAgencyLangValidator.java#L82 -- https://docs.oracle.com/javase/7/docs/api/java/util/Locale.html @@ -9,14 +12,10 @@ const beforeAll = (opt) => `\ CREATE TABLE feed_info ( feed_publisher_name TEXT PRIMARY KEY, feed_publisher_url TEXT NOT NULL, - feed_lang TEXT NOT NULL - CONSTRAINT valid_feed_lang CHECK ( - is_valid_lang_code(feed_lang) - ), - default_lang TEXT - CONSTRAINT valid_default_lang CHECK ( - default_lang IS NULL OR is_valid_lang_code(default_lang) - ), + feed_lang TEXT NOT NULL, + FOREIGN KEY (feed_lang) REFERENCES valid_lang_codes, + default_lang TEXT, + FOREIGN KEY (default_lang) REFERENCES valid_lang_codes, feed_start_date DATE, feed_end_date DATE, feed_version TEXT, @@ -24,39 +23,29 @@ CREATE TABLE feed_info ( feed_contact_url TEXT ); -COPY feed_info ( - feed_publisher_name, - feed_publisher_url, - feed_lang, - default_lang, - feed_start_date, - feed_end_date, - feed_version, - feed_contact_email, - feed_contact_url -) FROM STDIN csv; -` - -const formatFeedInfoRow = (i) => { - return [ - i.feed_publisher_name || null, - i.feed_publisher_url || null, - i.feed_lang || null, - i.default_lang || null, - i.feed_start_date || null, - i.feed_end_date || null, - i.feed_version || null, - i.feed_contact_email || null, - i.feed_contact_url || null, - ] +INSERT INTO feed_info +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT * REPLACE ( + ( + array_slice(feed_start_date, 0, 4) + || '-' || array_slice(feed_start_date, 5, 6) + || '-' || array_slice(feed_start_date, 7, 8) + ) AS feed_start_date, + ( + array_slice(feed_end_date, 0, 4) + || '-' || array_slice(feed_end_date, 5, 6) + || '-' || array_slice(feed_end_date, 7, 8) + ) AS feed_end_date +) +FROM read_csv( + '${pathToFeedInfo}', + header = true, + -- > Option to skip type detection for CSV parsing and assume all columns to be of type VARCHAR [a.k.a. TEXT]. + all_varchar = true +); +`) } -const afterAll = `\ -\\. -` - -module.exports = { - beforeAll, - formatRow: formatFeedInfoRow, - afterAll, -} +module.exports = importData diff --git a/lib/frequencies.js b/lib/frequencies.js index ba0663a..03b7971 100644 --- a/lib/frequencies.js +++ b/lib/frequencies.js @@ -1,98 +1,106 @@ 'use strict' -const {formatTime} = require('./util') +const RUN = require('./run.js') +const {queryIfColumnsExist} = require('./columns.js') +const {queryNumberOfRows} = require('./rows-count.js') // https://gtfs.org/documentation/schedule/reference/#frequenciestxt -const beforeAll = (opt) => `\ +const importData = async (db, pathToFrequencies, opt, workingState) => { + await db[RUN](`\ CREATE TYPE exact_times_v AS ENUM ( 'frequency_based' -- 0 or empty - Frequency-based trips. , 'schedule_based' -- 1 – Schedule-based trips with the exact same headway throughout the day. In this case the end_time value must be greater than the last desired trip start_time but less than the last desired trip start_time + headway_secs. ); -CREATE CAST (exact_times_v AS text) WITH INOUT AS IMPLICIT; +-- CREATE CAST (exact_times_v AS text) WITH INOUT AS IMPLICIT; CREATE TABLE frequencies ( - -- Used to implement arrivals_departures & connections. Filled after COPY-ing, see below. + -- Used to implement arrivals_departures & connections. Filled by the INSERT below. 
frequencies_row INTEGER, trip_id TEXT NOT NULL, FOREIGN KEY (trip_id) REFERENCES trips, start_time INTERVAL NOT NULL, + -- todo, once supported by DuckDB: PRIMARY KEY (trip_id, start_time) end_time INTERVAL NOT NULL, headway_secs INT NOT NULL, - exact_times exact_times_v, - -- frequencies' primary key currently is just (trip_id, start_time) - -- see also https://github.com/google/transit/issues/514 - -- todo: add primary key? - UNIQUE ( - trip_id, - start_time, - end_time, - headway_secs, - exact_times - ) + exact_times exact_times_v -- todo: NOT NULL & ifnull() + -- frequencies' primary is just (trip_id, start_time). however, the definition for the headway_secs field says: + -- > Multiple headways may be defined for the same trip, but must not overlap. New headways may start at the exact time the previous headway ends. + -- https://gtfs.org/documentation/schedule/reference/#frequenciestxt + -- todo: add a unique constraint once there is consensus in https://github.com/google/transit/issues/514 ); +`) -COPY frequencies ( - trip_id, - start_time, - end_time, - headway_secs, - exact_times -) FROM STDIN csv; -` - -const exactTimes = (val) => { - if (val === '0') return 'frequency_based' - if (val === '1') return 'schedule_based' - throw new Error('invalid exact_times: ' + val) -} + if (pathToFrequencies === null) { + // todo: keep? + // workingState.nrOfRowsByName.set('frequencies', 0) + return; + } -const formatFrequenciesRow = (f) => { - const startTime = f.start_time - ? formatTime(f.start_time) - : null - const endTime = f.end_time - ? formatTime(f.end_time) - : null - - return [ - f.trip_id || null, - startTime, - endTime, - f.headway_secs ? parseInt(f.headway_secs) : null, - f.exact_times ? exactTimes(f.exact_times) : null, - ] -} + // exact_times is optional, so the entire columns can be missing. + // It seems like, as of DuckDB v1.0.0, there is no way to assign default values to missing columns, neither with read_csv() nor with a nested subquery. + // todo: github ticket? + // This is why we check the file first and then programmatically determine the set of SELECT-ed columns below. + const { + exact_times: has_exact_times, + } = await queryIfColumnsExist(db, pathToFrequencies, [ + 'exact_times', + ]) -const afterAll = (opt) => `\ -\\. + await db[RUN](`\ +INSERT INTO frequencies +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT + ${has_exact_times ? `` : `NULL AS exact_times,`} + * + REPLACE ( + -- dummy entry in case no optional column is present + trip_id AS trip_id, + ${has_exact_times ? `\ + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. + -- Also, we explicitly cast until https://github.com/duckdb/duckdb/issues/17431 is resolved. + enum_range(NULL::exact_times_v)[exact_times::integer + 1] AS exact_times + ` : ``} + ), + row_number() OVER (PARTITION BY trip_id, exact_times) AS frequencies_row +FROM read_csv( + '${pathToFrequencies}', + header = true, + -- > This option allows you to specify the types that the sniffer will use when detecting CSV column types. + -- > default: SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR + -- We omit BOOLEAN because GTFS just uses integers for boolean-like fields (e.g. timepoint in trips.txt). + -- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently. 
+		auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'],
+		types = {
+			start_time: 'INTERVAL',
+			end_time: 'INTERVAL',
+			${has_exact_times ? `exact_times: 'INTEGER',` : ``}
+		}
+);
+`)
 
--- frequencies_row is used to implement arrivals_departures & connections.
-UPDATE frequencies
--- This is ugly, but AFAICT there is no cleaner way.
--- see also https://stackoverflow.com/a/4359354/1072129
-SET frequencies_row = t.frequencies_row
-FROM (
-	SELECT
-		-- order by all columns so that we don't implicitly depend on the file's order
-		(row_number() OVER (PARTITION BY trip_id, start_time ORDER BY end_time, headway_secs, exact_times))::integer AS frequencies_row,
-		trip_id, start_time
-	FROM frequencies
-) AS t
--- self-join
+	await db[RUN](`\
+-- We create the UNIQUE index *afterwards* to make the data import faster.
 -- frequencies' primary is just (trip_id, start_time)
 -- however, the definition for the headway_secs field says:
 -- > Multiple headways may be defined for the same trip, but must not overlap. New headways may start at the exact time the previous headway ends.
 -- https://gtfs.org/documentation/schedule/reference/#frequenciestxt
--- todo: add \`frequencies.exact_times::text = t.exact_times::text\`? once there is consensus in https://github.com/google/transit/issues/514
-WHERE frequencies.trip_id = t.trip_id
-AND frequencies.start_time = t.start_time;
+-- todo: add more columns once there is consensus in https://github.com/google/transit/issues/514
+CREATE UNIQUE INDEX frequencies_unique ON frequencies (
+	trip_id,
+	-- As of v1.0.0, DuckDB does not support UNIQUE indexes on INTERVAL columns yet, so we cast to a string.
+	(start_time::string)
+);
 
-CREATE INDEX ON frequencies (trip_id);
-CREATE INDEX ON frequencies (exact_times);
-`
+CREATE INDEX frequencies_trip_id ON frequencies (trip_id);
+CREATE INDEX frequencies_exact_times ON frequencies (exact_times);
+`)
 
-module.exports = {
-	beforeAll,
-	formatRow: formatFrequenciesRow,
-	afterAll,
+	workingState.nrOfRowsByName.set('frequencies', await queryNumberOfRows(db, 'frequencies', opt))
 }
+
+importData.runDespiteMissingSrcFile = true
+
+module.exports = importData
diff --git a/lib/get.js b/lib/get.js
new file mode 100644
index 0000000..9affca3
--- /dev/null
+++ b/lib/get.js
@@ -0,0 +1,5 @@
+'use strict'
+
+const GET = Symbol('get')
+
+module.exports = GET
\ No newline at end of file
diff --git a/lib/import_metadata.js b/lib/import_metadata.js
index 36bf8d8..8343191 100644
--- a/lib/import_metadata.js
+++ b/lib/import_metadata.js
@@ -1,33 +1,30 @@
 'use strict'
 
 const {strictEqual} = require('assert')
+const RUN = require('./run.js')
 
 const pkg = require('../package.json')
 
-const afterAll = (opt) => {
+const populateImportMetadata = async (db, _, opt) => {
 	strictEqual(typeof opt.importStart, 'number', 'opt.importStart must be a number')
 
 	// todo: escape properly
-	return `\
-CREATE OR REPLACE FUNCTION gtfs_data_imported_at ()
-RETURNS TIMESTAMP WITH TIME ZONE
-AS $$
-	SELECT '${new Date(opt.importStart).toISOString()}'::timestamp with time zone;
-$$ LANGUAGE SQL IMMUTABLE;
+	await db[RUN](`\
+CREATE OR REPLACE FUNCTION gtfs_data_imported_at (a)
+AS (
+	'${new Date(opt.importStart).toISOString()}'::timestamp with time zone
+);
 
-CREATE OR REPLACE FUNCTION gtfs_via_postgres_version ()
-RETURNS TEXT
-AS $$
-	SELECT '${pkg.version}';
-$$ LANGUAGE SQL IMMUTABLE;
+CREATE OR REPLACE FUNCTION gtfs_via_duckdb_version ()
+AS (
+	'${pkg.version}'::text
+);
 
-CREATE OR REPLACE FUNCTION gtfs_via_postgres_options ()
-RETURNS jsonb
-AS $$
-	SELECT
'${JSON.stringify(opt).replace(/'/g, `''`)}'::jsonb; -$$ LANGUAGE SQL IMMUTABLE; -` +CREATE OR REPLACE FUNCTION gtfs_via_duckdb_options () +AS ( + '${JSON.stringify(opt).replace(/'/g, `''`)}'::json +); +`) } +populateImportMetadata.runDespiteMissingSrcFile = true -module.exports = { - afterAll, -} +module.exports = populateImportMetadata diff --git a/lib/index.js b/lib/index.js index 9490a3c..c8e7431 100644 --- a/lib/index.js +++ b/lib/index.js @@ -1,8 +1,9 @@ 'use strict' module.exports = { - is_valid_lang_code: require('./prerequisites').is_valid_lang_code, - is_timezone: require('./prerequisites').is_timezone, + icu: require('./prerequisites').icu, + valid_lang_codes: require('./prerequisites').valid_lang_codes, + valid_timezones: require('./prerequisites').valid_timezones, shape_exists: require('./prerequisites').shape_exists, agency: require('./agency'), calendar: require('./calendar'), diff --git a/lib/levels.js b/lib/levels.js index 5e276b2..1e48772 100644 --- a/lib/levels.js +++ b/lib/levels.js @@ -1,36 +1,38 @@ 'use strict' -const {formatTime} = require('./util') +const RUN = require('./run.js') +const {queryNumberOfRows} = require('./rows-count.js') // https://gtfs.org/documentation/schedule/reference/#levelstxt -const beforeAll = (opt) => `\ +const importData = async (db, pathToLevels, opt, workingState) => { + await db[RUN](`\ CREATE TABLE levels ( level_id TEXT PRIMARY KEY, - level_index DOUBLE PRECISION NOT NULL, + level_index REAL NOT NULL, level_name TEXT ); -COPY levels ( - level_id, - level_index, - level_name -) FROM STDIN csv; -` - -const formatLevelsRow = (l) => { - return [ - l.level_id, - parseFloat(l.level_index), - l.level_name || null, - ] -} +INSERT INTO levels +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT * +FROM read_csv( + '${pathToLevels}', + header = true, + -- > This option allows you to specify the types that the sniffer will use when detecting CSV column types. + -- > default: SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR + -- We omit BOOLEAN because GTFS just uses integers for boolean-like fields (e.g. timepoint in trips.txt). + -- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently. + auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'], + types = { + level_index: 'REAL', + } +); -const afterAll = `\ -\\. -` +`) -module.exports = { - beforeAll, - formatRow: formatLevelsRow, - afterAll, + workingState.nrOfRowsByName.set('levels', await queryNumberOfRows(db, 'levels', opt)) } + +module.exports = importData diff --git a/lib/pathways.js b/lib/pathways.js index a89dd9c..8980d9b 100644 --- a/lib/pathways.js +++ b/lib/pathways.js @@ -1,9 +1,33 @@ 'use strict' -const {formatTime} = require('./util') +const RUN = require('./run.js') +const {queryIfColumnsExist} = require('./columns.js') +const {queryNumberOfRows} = require('./rows-count.js') // https://gtfs.org/documentation/schedule/reference/#pathwaystxt -const beforeAll = (opt) => `\ +const importData = async (db, pathToPathways, opt, workingState) => { + // Several columns are optional, so their columns may be missing in a `read_csv()` result. + // It seems like, as of DuckDB v1.0.0, there is no way to assign default values to missing columns, neither with read_csv() nor with a nested subquery. + // This is why we check the file first and then programmatically determine the set of SELECT-ed columns below. 
+ const { + length: has_length, + traversal_time: has_traversal_time, + stair_count: has_stair_count, + max_slope: has_max_slope, + min_width: has_min_width, + signposted_as: has_signposted_as, + reversed_signposted_as: has_reversed_signposted_as, + } = await queryIfColumnsExist(db, pathToPathways, [ + 'length', + 'traversal_time', + 'stair_count', + 'max_slope', + 'min_width', + 'signposted_as', + 'reversed_signposted_as', + ]) + + await db[RUN](`\ CREATE TYPE pathway_mode_v AS ENUM ( 'walkway' -- 1 , 'stairs' -- 2 @@ -14,7 +38,7 @@ CREATE TYPE pathway_mode_v AS ENUM ( -- Fare gates may either separate paid areas of the station from unpaid ones, or separate different payment areas within the same station from each other. This information can be used to avoid routing passengers through stations using shortcuts that would require passengers to make unnecessary payments, like directing a passenger to walk through a subway platform to reach a busway. , 'exit_gate' -- 7 – Indicates a pathway exiting an area where proof-of-payment is required into an area where proof-of-payment is no longer required. ); -CREATE CAST (pathway_mode_v AS text) WITH INOUT AS IMPLICIT; +-- CREATE CAST (pathway_mode_v AS text) WITH INOUT AS IMPLICIT; CREATE TABLE pathways ( pathway_id TEXT PRIMARY KEY, @@ -24,70 +48,44 @@ CREATE TABLE pathways ( FOREIGN KEY (to_stop_id) REFERENCES stops (stop_id), pathway_mode pathway_mode_v NOT NULL, is_bidirectional BOOLEAN NOT NULL, - length DOUBLE PRECISION, -- todo: add non-negative constraint + length REAL, -- todo: add non-negative constraint traversal_time INTEGER, -- todo: add positive constraint stair_count INTEGER, -- todo: add non-0 constraint - max_slope DOUBLE PRECISION, - min_width DOUBLE PRECISION, -- todo: add positive constraint + max_slope REAL, + min_width REAL, -- todo: add positive constraint signposted_as TEXT, reversed_signposted_as TEXT ); -COPY pathways ( - pathway_id, - from_stop_id, - to_stop_id, - pathway_mode, - is_bidirectional, - length, - traversal_time, - stair_count, - max_slope, - min_width, - signposted_as, - reversed_signposted_as -) FROM STDIN csv; -` - -const pathwayMode = (val) => { - if (val === '1') return 'walkway' - if (val === '2') return 'stairs' - if (val === '3') return 'moving_sidewalk_travelator' - if (val === '4') return 'escalator' - if (val === '5') return 'elevator' - if (val === '6') return 'fare_gate' - if (val === '7') return 'exit_gate' - throw new Error('invalid pathway_mode: ' + val) -} - -const formatPathwaysRow = (p) => { - let is_bidirectional - if (p.is_bidirectional === '0') is_bidirectional = 'false' - else if (p.is_bidirectional === '1') is_bidirectional = 'true' - else throw new Error('invalid is_bidirectional: ' + p.is_bidirectional) +INSERT INTO pathways +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT * REPLACE ( + -- todo: check that is_bidirectional is actually 0 or 1 + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. + enum_range(NULL::pathway_mode_v)[pathway_mode] AS pathway_mode +) +FROM read_csv( + '${pathToPathways}', + header = true, + all_varchar = true, + types = { + pathway_mode: 'INTEGER', + is_bidirectional: 'INTEGER' + ${has_length ? `, length: 'REAL'` : ``} + ${has_traversal_time ? `, traversal_time: 'INTEGER'` : ``} + ${has_stair_count ? 
`, stair_count: 'INTEGER'` : ``} + ${has_max_slope ? `, max_slope: 'REAL'` : ``} + ${has_min_width ? `, min_width: 'REAL'` : ``} + ${has_signposted_as ? `, signposted_as: 'TEXT'` : ``} + ${has_reversed_signposted_as ? `, reversed_signposted_as: 'TEXT'` : ``} + } +); +`) - return [ - p.pathway_id, - p.from_stop_id, - p.to_stop_id, - pathwayMode(p.pathway_mode), - is_bidirectional, - p.length, - p.traversal_time, - p.stair_count, - p.max_slope, - p.min_width, - p.signposted_as || null, - p.reversed_signposted_as || null, - ] + workingState.nrOfRowsByName.set('pathways', await queryNumberOfRows(db, 'pathways', opt)) } -const afterAll = (opt) => `\ -\\. -` - -module.exports = { - beforeAll, - formatRow: formatPathwaysRow, - afterAll, -} +module.exports = importData diff --git a/lib/prerequisites.js b/lib/prerequisites.js index 69c0ee5..54ec918 100644 --- a/lib/prerequisites.js +++ b/lib/prerequisites.js @@ -1,79 +1,69 @@ 'use strict' -const is_valid_lang_code = { - beforeAll: (opt) => `\ --- Unfortunately information_schema.collations.collation_name only has --- identifiers with "_", not with "-", so we use pg_collation instead. --- https://www.postgresql.org/docs/current/infoschema-collations.html --- https://www.postgresql.org/docs/current/catalog-pg-collation.html --- todo [breaking]: rename to e.g. is_similar_to_bcp_47_tag? -CREATE OR REPLACE FUNCTION is_bcp_47_tag( - input TEXT -) -RETURNS BOOLEAN -AS $$ - SELECT EXISTS ( - SELECT collctype - FROM pg_collation - WHERE ${opt.lowerCaseLanguageCodes ? `lower(collctype)` : `collctype`} = ${opt.lowerCaseLanguageCodes ? `lower(input)` : `input`} - OR ${opt.lowerCaseLanguageCodes ? `lower(collname)` : `collname`} = ${opt.lowerCaseLanguageCodes ? `lower(input)` : `input`} - OR ${opt.lowerCaseLanguageCodes ? `lower(collname)` : `collname`} = ${opt.lowerCaseLanguageCodes ? `lower(input)` : `input`} || '-x-icu' - LIMIT 1 - ); -$$ language sql STABLE; +const RUN = require('./run.js') --- todo [breaking]: remove -CREATE OR REPLACE FUNCTION is_valid_lang_code( - input TEXT -) -RETURNS BOOLEAN -AS $$ - -- todo: see also https://github.com/MobilityData/gtfs-validator/issues/1987 - SELECT is_bcp_47_tag(input); -$$ language sql STABLE; +const valid_lang_codes = async (db, _, opt) => { + await db[RUN](`\ +INSTALL icu; -- todo: make install optional? +LOAD icu; -`, +-- todo: once https://github.com/MobilityData/gtfs-validator/issues/1987 is solved, adapt this code +-- Unfortunately pragma_collations().collname only has +-- identifiers with "_", not with "-", so we use pg_collation instead. +-- see also https://duckdb.org/docs/sql/expressions/collations#icu-collations +-- todo: Also, entries like "de_DE" are missing. +CREATE TABLE valid_lang_codes ( + -- As of DuckDB v1.2.0, referring to this table via either a subquery or a plain foreign key doesn't work because + -- - subqueries are prohibited in CHECK constraints, and + -- - the foreign key doesn't seem to work with a NOCASE primary key. + -- This is why we use a case-sensitive primary key and unnest() to enumerate all (relevant) casings ourselves. 
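+	-- Note: language codes written with any other casing (e.g. "DE" or "de-de") will not be in this table, so foreign keys referencing it will reject them.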
+ lang_code TEXT PRIMARY KEY, +); +INSERT INTO valid_lang_codes +SELECT * +FROM ( + SELECT + unnest([ + collname, + CASE WHEN contains(collname, '-') THEN + concat_ws('-', split_part(collname, '-', 1), upper(split_part(collname, '-', 2))) + ELSE + NULL + END + ]) AS lang_code + FROM ( + SELECT + replace(collname, '_', '-') AS collname + FROM pragma_collations() + ) t +) t +WHERE lang_code IS NOT NULL; +`) } -const is_timezone = { - beforeAll: (opt) => `\ --- https://justatheory.com/2007/11/postgres-timezone-validation/ -CREATE OR REPLACE FUNCTION is_timezone( - tz TEXT -) -RETURNS BOOLEAN -AS $$ - DECLARE - date TIMESTAMPTZ; - BEGIN - date := now() AT TIME ZONE tz; - RETURN TRUE; - EXCEPTION WHEN invalid_parameter_value THEN - RETURN FALSE; - END; -$$ language plpgsql STABLE; +valid_lang_codes.runDespiteMissingSrcFile = true -`, -} -const shape_exists = { - beforeAll: (opt) => `\ -CREATE OR REPLACE FUNCTION shape_exists( - some_shape_id TEXT -) -RETURNS BOOLEAN -AS $$ - SELECT EXISTS ( - SELECT shape_id - FROM shapes - WHERE shape_id = some_shape_id - LIMIT 1 - ); -$$ language sql STABLE; +const valid_timezones = async (db, _, opt) => { + // DuckDB v0.10: "subqueries prohibited in CHECK constraints" + // > CONSTRAINT valid_timezone CHECK (is_timezone(agency_timezone)) + // or inlined: + // > CONSTRAINT valid_timezone CHECK (EXISTS(SELECT name FROM pg_timezone_names() WHERE name = agency_timezone)) + // so we create a helper table instead + await db[RUN](`\ +INSTALL icu; -- todo: make install optional? +LOAD icu; -`, +CREATE TABLE valid_timezones( + tz TEXT PRIMARY KEY +); +INSERT INTO valid_timezones ( + SELECT name AS tz + FROM pg_timezone_names() +); +`) } +valid_timezones.runDespiteMissingSrcFile = true module.exports = { - is_valid_lang_code, - is_timezone, - shape_exists, + valid_lang_codes, + valid_timezones, } diff --git a/lib/routes.js b/lib/routes.js index ee6b82b..522db29 100644 --- a/lib/routes.js +++ b/lib/routes.js @@ -1,6 +1,8 @@ 'use strict' -const DataError = require('./data-error') +// const DataError = require('./data-error') +const RUN = require('./run.js') +const {queryNumberOfRows} = require('./rows-count.js') // Google's "Extended GTFS Route Types" // https://developers.google.com/transit/gtfs/reference/extended-route-types @@ -233,13 +235,28 @@ const routeTypesSchemes = Object.assign(Object.create(null), { }) // https://gtfs.org/documentation/schedule/reference/#routestxt -const beforeAll = (opt) => { +const importData = async (db, pathToRoutes, opt, workingState) => { if (!(opt.routeTypesScheme in routeTypesSchemes)) { throw new Error(`invalid opt.routeTypesScheme, must be one of these: ${Object.keys(routeTypesSchemes).join(', ')}.`) } const extRouteTypes = routeTypesSchemes[opt.routeTypesScheme] - return `\ + // The GTFS spec allows routes.agency_id to be empty/null if there is exactly one agency in the feed. + // It seems that GTFS has allowed this at least since 2016: + // https://github.com/google/transit/blame/217e9bf/gtfs/spec/en/reference.md#L544-L554 + const exactly1Agency = workingState.nrOfRowsByName.get('agency') === 1 + // todo: throw special error indicating an error in the input data? does the foreign key constraint achieve this implicitly? old code: + // throw new DataError( + // 'routes', + // 'agency_id must not be empty/null', + // [ + // 'The GTFS spec allows routes.agency_id to be empty/null only if there is exactly one agency in the feed.' 
+ // ], + // ) + + const withAgencyFKey = !opt.routesWithoutAgencyId && !exactly1Agency + + await db[RUN](`\ CREATE TYPE route_type_val AS ENUM ( -- basic types '0' -- 0 – Tram, Streetcar, Light rail. Any light rail or street level system within a metropolitan area. @@ -256,12 +273,15 @@ CREATE TYPE route_type_val AS ENUM ( -- extended types ${extRouteTypes.map(([route_type, desc]) => `, '${route_type}' -- ${desc}`).join('\n')} ); -CREATE CAST (route_type_val AS text) WITH INOUT AS IMPLICIT; CREATE TABLE routes ( route_id TEXT PRIMARY KEY, - agency_id TEXT, - ${opt.routesWithoutAgencyId ? '' : `FOREIGN KEY (agency_id) REFERENCES agency,`} + -- As of DuckDB v1.3.0, a foreign key constraint does not enforce non-NULL values. + agency_id TEXT ${withAgencyFKey ? `NOT NULL` : ''}, + ${withAgencyFKey + ? '' + : `FOREIGN KEY (agency_id) REFERENCES agency,` + } -- todo: Either route_short_name or route_long_name must be specified, or potentially both if appropriate. route_short_name TEXT, route_long_name TEXT, @@ -273,61 +293,28 @@ CREATE TABLE routes ( route_sort_order INT ); -COPY routes ( - route_id, - agency_id, - route_short_name, - route_long_name, - route_desc, - route_type, - route_url, - route_color, - route_text_color, - route_sort_order -) FROM STDIN csv; -` -} - -const formatRoutesRow = (r, opt, workingState) => { - const agency_id = r.agency_id || null - if (agency_id === null && !opt.routesWithoutAgencyId) { - // The GTFS spec allows routes.agency_id to be empty/null if there is exactly one agency in the feed. - // It seems that GTFS has allowed this at least since 2016: - // https://github.com/google/transit/blame/217e9bf/gtfs/spec/en/reference.md#L544-L554 - if (workingState.nrOfRowsByName.get('agency') !== 1) { - // todo: throw special error indicating an error in the input data - throw new DataError( - 'routes', - 'agency_id must not be empty/null', - [ - 'The GTFS spec allows routes.agency_id to be empty/null only if there is exactly one agency in the feed.' - ], - ) - } +INSERT INTO routes +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT * +FROM read_csv( + '${pathToRoutes}', + header = true, + -- > This option allows you to specify the types that the sniffer will use when detecting CSV column types. + -- > default: SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR + -- We omit BOOLEAN because GTFS just uses integers for boolean-like fields (e.g. timepoint in trips.txt). + -- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently. + auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'], + types = { + route_type: 'TEXT', } +); - return [ - r.route_id || null, - agency_id, - r.route_short_name || null, - r.route_long_name || null, - r.route_desc || null, - r.route_type || null, - r.route_url || null, - r.route_color || null, - r.route_text_color || null, - r.route_sort_order ? parseInt(r.route_sort_order) : null, - ] -} - -const afterAll = (opt) => `\ -\\. 
- -CREATE INDEX ON routes (route_short_name); -` +CREATE INDEX routes_route_short_name ON routes (route_short_name); +`) -module.exports = { - beforeAll, - formatRow: formatRoutesRow, - afterAll, + workingState.nrOfRowsByName.set('routes', await queryNumberOfRows(db, 'routes', opt)) } + +module.exports = importData diff --git a/lib/rows-count.js b/lib/rows-count.js new file mode 100644 index 0000000..f88ebd2 --- /dev/null +++ b/lib/rows-count.js @@ -0,0 +1,15 @@ +'use strict' + +const GET = require('./get.js') + +const queryNumberOfRows = async (db, dbName, opt) => { + const [{count: nrOfRows}] = await db[GET](` + SELECT count(*) AS count + FROM "${dbName}" + `) + return nrOfRows +} + +module.exports = { + queryNumberOfRows, +} diff --git a/lib/run.js b/lib/run.js new file mode 100644 index 0000000..a155611 --- /dev/null +++ b/lib/run.js @@ -0,0 +1,5 @@ +'use strict' + +const RUN = Symbol('run') + +module.exports = RUN diff --git a/lib/service_days.js b/lib/service_days.js index ba9fa58..d83267e 100644 --- a/lib/service_days.js +++ b/lib/service_days.js @@ -1,8 +1,21 @@ 'use strict' -const afterAll = (opt) => `\ +const RUN = require('./run.js') + +// https://gtfs.org/documentation/schedule/reference/#calendar_datestxt +const importData = async (db, _, opt, workingState) => { + await db[RUN](`\ +-- DuckDB currently has no materialized views, only tables. +-- see https://github.com/duckdb/duckdb/discussions/3638#discussioncomment-2801284 +-- todo: what if i modify calendar/calendar_dates? define triggers? -- todo [breaking]: rename to service_dates? -CREATE MATERIALIZED VIEW service_days AS +CREATE TABLE service_days ( + service_id TEXT NOT NULL, + date TIMESTAMP NOT NULL, + PRIMARY KEY (service_id, date) +); + +INSERT INTO service_days SELECT base_days.service_id, base_days.date @@ -16,7 +29,7 @@ FROM ( SELECT service_id, "date", - extract(dow FROM "date") dow, + date_part('dow', "date") dow, sunday, monday, tuesday, @@ -27,11 +40,11 @@ FROM ( FROM ( SELECT *, - generate_series( + unnest(generate_series( start_date::TIMESTAMP, end_date::TIMESTAMP, '1 day'::INTERVAL - ) "date" + )) "date" FROM calendar ) all_days_raw ) all_days @@ -61,15 +74,15 @@ WHERE exception_type = 'added' ORDER BY service_id, "date"; -CREATE UNIQUE INDEX ON service_days (service_id, date); +CREATE UNIQUE INDEX service_days_unique_service_id_date ON service_days (service_id, date); -CREATE INDEX ON service_days (service_id); -CREATE INDEX ON service_days (date); +CREATE INDEX service_days_service_id ON service_days (service_id); +CREATE INDEX service_days_date ON service_days (date); -- apparently the unique index (service_id, date) doesn't speed up queries -CREATE INDEX ON service_days (service_id, date); +CREATE INDEX service_days_service_id_date ON service_days (service_id, date); +`) +} -` +importData.runDespiteMissingSrcFile = true -module.exports = { - afterAll, -} +module.exports = importData diff --git a/lib/shapes.js b/lib/shapes.js index 624f784..4e48a05 100644 --- a/lib/shapes.js +++ b/lib/shapes.js @@ -1,58 +1,75 @@ 'use strict' -// https://gtfs.org/documentation/schedule/reference/#shapestxt -const beforeAll = (opt) => `\ -CREATE TABLE shapes ( - id SERIAL PRIMARY KEY, - shape_id TEXT, - shape_pt_sequence INT, - shape_pt_loc geography(POINT), - shape_dist_traveled REAL -); +const GET = require('./get.js') +const {queryIfColumnsExist} = require('./columns.js') -COPY shapes ( - shape_id, - shape_pt_loc, - shape_pt_sequence, - shape_dist_traveled -) FROM STDIN csv; -` - -const formatShapesRow = (s) => { - 
return [ - s.shape_id || null, - `POINT(${parseFloat(s.shape_pt_lon)} ${parseFloat(s.shape_pt_lat)})`, - s.shape_pt_sequence ? parseInt(s.shape_pt_sequence) : null, - s.shape_dist_traveled ? parseInt(s.shape_dist_traveled) : null, - ] -} +// https://gtfs.org/documentation/schedule/reference/#shapestxt +const importData = async (db, pathToShapes, opt, workingState) => { + // shape_dist_traveled is optional, so the entire column can be missing. + // It seems like, as of DuckDB v1.0.0, there is no way to assign default values to missing columns, neither with read_csv() nor with a nested subquery. + // This is why we check the file first and then programmatically determine the set of SELECT-ed columns below. + const { + shape_dist_traveled: has_shape_dist_traveled, + } = await queryIfColumnsExist(db, pathToShapes, [ + 'shape_dist_traveled', + ]) -const afterAll = (opt) => `\ -\\. + // todo: why does extracting `Count` directly work here and not with other files? + const [ + {Count: nrOfShapes}, + ] = await db[GET](`\ +INSTALL spatial; -- todo: make install optional? +LOAD spatial; -CREATE INDEX shapes_by_shape_id ON shapes (shape_id); -CREATE INDEX ON shapes (shape_id, shape_pt_sequence); +CREATE TABLE shapes ( + shape_id TEXT PRIMARY KEY, + shape GEOMETRY, + distances_travelled REAL[] +); -CREATE OR REPLACE VIEW shapes_aggregated AS +INSERT INTO shapes +-- WITH +-- csv_columns AS ( +-- SELECT list(column_name) AS cols +-- FROM ( +-- DESCRIBE ( +-- SELECT * +-- FROM read_csv( +-- 'node_modules/sample-gtfs-feed/gtfs/shapes.txt', +-- header = true +-- ) +-- ) +-- ) columns +-- ), +-- table_columns AS ( +-- SELECT list(column_name) +-- FROM ( +-- DESCRIBE shapes +-- ) columns +-- ) +-- SELECT COLUMNS(x -> x IN (SELECT cols FROM csv_columns)) SELECT - shape_id, - array_agg(shape_dist_traveled) AS distances_travelled, - -- todo [breaking]: make this a geography! - ST_MakeLine(array_agg(shape_pt_loc)) AS shape + any_value(shape_id) AS shape_id, + ST_MakeLine(array_agg(ST_Point(shape_pt_lon, shape_pt_lat))) AS shape, + ${has_shape_dist_traveled ? `array_agg(shape_dist_traveled)` : `NULL`} AS distances_travelled FROM ( - SELECT - shape_id, - shape_dist_traveled, - ST_AsText(shape_pt_loc)::geometry AS shape_pt_loc - FROM shapes - ORDER by shape_id, shape_pt_sequence -) shapes + SELECT * + FROM read_csv( + '${pathToShapes}', + header = true, + -- > This option allows you to specify the types that the sniffer will use when detecting CSV column types. + -- > default: SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR + -- We omit BOOLEAN because GTFS just uses integers for boolean-like fields (e.g. timepoint in trips.txt). + -- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently. + auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'] + ) + ORDER BY shape_id, shape_pt_sequence +) t GROUP BY shape_id; +`) -` - -module.exports = { - beforeAll, - formatRow: formatShapesRow, - afterAll, + // Note: This is not the number of shapes.txt rows! 
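+	// Rather, it is the number of distinct shape_ids, since the INSERT above aggregates all shape points into one row per shape_id (GROUP BY shape_id).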
+ workingState.nrOfRowsByName.set('shapes', nrOfShapes) } + +module.exports = importData diff --git a/lib/stats_active_trips_by_hour.js b/lib/stats_active_trips_by_hour.js index cadaa38..9329217 100644 --- a/lib/stats_active_trips_by_hour.js +++ b/lib/stats_active_trips_by_hour.js @@ -1,9 +1,15 @@ 'use strict' -const afterAll = (opt) => { +const {fail} = require('assert') +const RUN = require('./run.js') + +const createStatsActiveTripsByHourView = async (db, _, opt) => { let materialized = false if (opt.statsActiveTripsByHour === 'materialized-view') { - materialized = true + // todo: support it once DuckDB supports materialized views + // see also https://github.com/duckdb/duckdb/discussions/3638 + fail('opt.statsActiveTripsByHour: materialized-view is currently not supported') + // materialized = true } else if (opt.statsActiveTripsByHour !== 'view') { throw new Error('invalid opt.statsActiveTripsByHour, must be one of these: none, view, materialized-view.') } @@ -11,8 +17,10 @@ const afterAll = (opt) => { ? `CREATE MATERIALIZED VIEW` : `CREATE OR REPLACE VIEW` - return `\ -CREATE MATERIALIZED VIEW feed_time_frame AS + await db[RUN](`\ +-- todo: use materialized view once DuckDB supports that +-- see also https://github.com/duckdb/duckdb/discussions/3638 +CREATE TABLE feed_time_frame AS WITH dates AS ( SELECT @@ -21,13 +29,13 @@ WITH FROM service_days ), date_offset AS ( - SELECT greatest( - largest_arrival_time(), - largest_departure_time() - ) AS o + SELECT + largest AS o + FROM largest_arr_dep_time ), date_min_max AS ( SELECT + -- todo date_trunc('day', min + o) AS min, date_trunc('day', max - o) AS max FROM dates, date_offset @@ -58,10 +66,9 @@ SELECT FROM min_dep, min_arr, max_dep, max_arr; CREATE OR REPLACE FUNCTION feed_time_series( - time_unit TEXT + time_unit ) -RETURNS SETOF timestamptz -AS $$ +AS ( SELECT generate_series( date_trunc(time_unit, min), @@ -69,12 +76,12 @@ AS $$ ('1 ' || time_unit)::interval ) as t FROM feed_time_frame -$$ LANGUAGE sql STABLE; +); ${createViewCmd} stats_active_trips_by_hour AS WITH all_hours AS NOT MATERIALIZED ( - SELECT feed_time_series('hour') AS "hour" + SELECT unnest(feed_time_series('hour')) AS "hour" ) SELECT DISTINCT ON ("hour") "hour", @@ -92,13 +99,14 @@ FROM ( ) ) t ) cons; +`) -${materialized ? 
`\ + if (materialized) { + await db[RUN](`\ CREATE INDEX ON stats_active_trips_by_hour ("hour"); -` : ''} -` +`) + } } +createStatsActiveTripsByHourView.runDespiteMissingSrcFile = true -module.exports = { - afterAll, -} +module.exports = createStatsActiveTripsByHourView diff --git a/lib/stats_by_agency_route_stop_hour.js b/lib/stats_by_agency_route_stop_hour.js index 625cfbc..e937019 100644 --- a/lib/stats_by_agency_route_stop_hour.js +++ b/lib/stats_by_agency_route_stop_hour.js @@ -1,9 +1,15 @@ 'use strict' -const afterAll = (opt) => { +const {fail} = require('assert') +const RUN = require('./run.js') + +const createStatsByAgencyIdAndRouteIdAndStopAndHourView = async (db, _, opt) => { let materialized = false if (opt.statsByAgencyIdAndRouteIdAndStopAndHour === 'materialized-view') { - materialized = true + // todo: support it once DuckDB supports materialized views + // see also https://github.com/duckdb/duckdb/discussions/3638 + fail('opt.statsByAgencyIdAndRouteIdAndStopAndHour: materialized-view is currently not supported') + // materialized = true } else if (opt.statsByAgencyIdAndRouteIdAndStopAndHour !== 'view') { throw new Error('invalid opt.statsByAgencyIdAndRouteIdAndStopAndHour, must be one of these: none, view, materialized-view.') } @@ -11,7 +17,7 @@ const afterAll = (opt) => { ? `CREATE MATERIALIZED VIEW` : `CREATE OR REPLACE VIEW` - return `\ + await db[RUN](`\ ${createViewCmd} stats_by_agency_route_stop_hour AS SELECT DISTINCT ON (agency_id, route_id, stop_id, effective_hour) agency_id, route_id, stop_id, station_id, @@ -19,17 +25,17 @@ SELECT DISTINCT ON (agency_id, route_id, stop_id, effective_hour) date_trunc('hour', t_arrival) AS effective_hour, count(*) OVER (PARTITION BY route_id, stop_id, date_trunc('hour', t_arrival)) AS nr_of_arrs FROM arrivals_departures; +`) -${materialized ? `\ + if (materialized) { + await db[RUN](`\ CREATE INDEX ON stats_by_agency_route_stop_hour (route_id); CREATE INDEX ON stats_by_agency_route_stop_hour (stop_id); CREATE INDEX ON stats_by_agency_route_stop_hour (station_id); CREATE INDEX ON stats_by_agency_route_stop_hour (effective_hour); -` : ''} - -` +`) + } } +createStatsByAgencyIdAndRouteIdAndStopAndHourView.runDespiteMissingSrcFile = true -module.exports = { - afterAll, -} +module.exports = createStatsByAgencyIdAndRouteIdAndStopAndHourView diff --git a/lib/stats_by_route_date.js b/lib/stats_by_route_date.js index 6a4021a..eaec758 100644 --- a/lib/stats_by_route_date.js +++ b/lib/stats_by_route_date.js @@ -1,9 +1,15 @@ 'use strict' -const afterAll = (opt) => { +const {fail} = require('assert') +const RUN = require('./run.js') + +const createStatsByRouteAndDateView = async (db, _, opt) => { let materialized = false if (opt.statsByRouteIdAndDate === 'materialized-view') { - materialized = true + // todo: support it once DuckDB supports materialized views + // see also https://github.com/duckdb/duckdb/discussions/3638 + fail('opt.statsByRouteIdAndDate: materialized-view is currently not supported') + // materialized = true } else if (opt.statsByRouteIdAndDate !== 'view') { throw new Error('invalid opt.statsByRouteIdAndDate, must be one of these: none, view, materialized-view.') } @@ -11,7 +17,7 @@ const afterAll = (opt) => { ? `CREATE MATERIALIZED VIEW` : `CREATE OR REPLACE VIEW` - return `\ + await db[RUN](`\ ${createViewCmd} stats_by_route_date AS WITH arrs_deps_with_svc_date AS NOT MATERIALIZED ( @@ -55,16 +61,16 @@ SELECT *, False AS is_effective FROM by_svc_date; - -${materialized ? 
`\ +`) + if (materialized) { + await db[RUN](`\ CREATE INDEX ON stats_by_route_date (route_id); CREATE INDEX ON stats_by_route_date ("date"); CREATE INDEX ON stats_by_route_date (route_id, "date", is_effective); CREATE INDEX ON stats_by_route_date (route_id, dow, is_effective); -` : ''} -` +`) + } } +createStatsByRouteAndDateView.runDespiteMissingSrcFile = true -module.exports = { - afterAll, -} +module.exports = createStatsByRouteAndDateView diff --git a/lib/stop_times.js b/lib/stop_times.js index 33c383e..a8c3607 100644 --- a/lib/stop_times.js +++ b/lib/stop_times.js @@ -1,22 +1,37 @@ 'use strict' -const {formatTime} = require('./util') +const RUN = require('./run.js') +const {queryIfColumnsExist} = require('./columns.js') +const {queryNumberOfRows} = require('./rows-count.js') // https://gtfs.org/documentation/schedule/reference/#stop_timestxt -const beforeAll = (opt) => `\ +const importData = async (db, pathToStopTimes, opt, workingState) => { + // timepoint & shape_dist_traveled are optional, so the entire columns can be missing. + // It seems like, as of DuckDB v1.0.0, there is no way to assign default values to missing columns, neither with read_csv() nor with a nested subquery. + // todo: github ticket? + // This is why we check the file first and then programmatically determine the set of SELECT-ed columns below. + const { + shape_dist_traveled: has_shape_dist_traveled, + timepoint: has_timepoint, + } = await queryIfColumnsExist(db, pathToStopTimes, [ + 'shape_dist_traveled', + 'timepoint', + ]) + + await db[RUN](`\ CREATE TYPE pickup_drop_off_type AS ENUM ( 'regular' -- 0 or empty - Regularly scheduled pickup/dropoff. , 'not_available' -- 1 – No pickup/dropoff available. , 'call' -- 2 – Must phone agency to arrange pickup/dropoff. , 'driver' -- 3 – Must coordinate with driver to arrange pickup/dropoff. ); -CREATE CAST (pickup_drop_off_type AS text) WITH INOUT AS IMPLICIT; +-- CREATE CAST (pickup_drop_off_type AS text) WITH INOUT AS IMPLICIT; CREATE TYPE timepoint_v AS ENUM ( 'approximate' -- 0 – Times are considered approximate. , 'exact' -- 1 or empty - Times are considered exact. ); -CREATE CAST (timepoint_v AS text) WITH INOUT AS IMPLICIT; +-- CREATE CAST (timepoint_v AS text) WITH INOUT AS IMPLICIT; CREATE TABLE stop_times ( trip_id TEXT NOT NULL, @@ -29,157 +44,122 @@ CREATE TABLE stop_times ( stop_sequence INT NOT NULL, stop_sequence_consec INT, stop_headsign TEXT, - pickup_type pickup_drop_off_type, - drop_off_type pickup_drop_off_type, + pickup_type pickup_drop_off_type, -- todo: NOT NULL & ifnull() + drop_off_type pickup_drop_off_type, -- todo: NOT NULL & ifnull() shape_dist_traveled REAL, timepoint timepoint_v, - -- Used to implement frequencies.txt. Filled after COPY-ing, see below. - trip_start_time INTERVAL + -- Used to implement frequencies.txt. Filled below. 
+ trip_start_time INTERVAL, + PRIMARY KEY (trip_id, stop_sequence) ); -COPY stop_times ( - trip_id, - arrival_time, - departure_time, - stop_id, - stop_sequence, - stop_headsign, - pickup_type, - drop_off_type, - shape_dist_traveled, - timepoint -) FROM STDIN csv; -` - -const pickupDropOffType = (val) => { - if (val === '0') return 'regular' - if (val === '1') return 'not_available' - if (val === '2') return 'call' - if (val === '3') return 'driver' - throw new Error('invalid/unsupported pickup_type/drop_off_type: ' + val) -} - -const timepoint = (val) => { - if (val === '0') return 'approximate' - if (val === '1') return 'exact' - throw new Error('invalid/unsupported timepoint_v: ' + val) -} - -const formatStopTimesRow = (s) => { - const arrTime = s.arrival_time - ? formatTime(s.arrival_time) - : null - const depTime = s.departure_time - ? formatTime(s.departure_time) - : null - - return [ - s.trip_id || null, - arrTime, - depTime, - s.stop_id || null, - s.stop_sequence ? parseInt(s.stop_sequence) : null, - s.stop_headsign || null, - s.pickup_type ? pickupDropOffType(s.pickup_type) : null, - s.drop_off_type ? pickupDropOffType(s.drop_off_type) : null, - s.shape_dist_traveled || null, - s.timepoint ? timepoint(s.timepoint) : null, - ] -} - -const afterAll = (opt) => `\ -\\. - --- trip_start_time is used to implement frequencies.txt. -UPDATE stop_times --- This is ugly, but AFAICT there is no cleaner way. --- see also https://stackoverflow.com/a/4359354/1072129 -SET trip_start_time = t.trip_start_time -FROM ( - SELECT - -- todo: is frequencies.txt relative to 1st arrival_time or departure_time? - coalesce( - first_value(departure_time) OVER (PARTITION BY trip_id ORDER BY stop_sequence), - first_value(arrival_time) OVER (PARTITION BY trip_id ORDER BY stop_sequence) - ) AS trip_start_time, - trip_id, stop_sequence - FROM stop_times -) AS t --- self-join -WHERE stop_times.trip_id = t.trip_id -AND stop_times.stop_sequence = t.stop_sequence; - -CREATE INDEX ON stop_times (trip_id); -CREATE INDEX ON stop_times (stop_id); - -UPDATE stop_times -SET stop_sequence_consec = t.seq -FROM ( - SELECT - row_number() OVER (PARTITION BY trip_id ORDER BY stop_sequence ASC)::integer - 1 AS seq, - trip_id, stop_sequence - FROM stop_times -) AS t -WHERE stop_times.trip_id = t.trip_id -AND stop_times.stop_sequence = t.stop_sequence; - -CREATE INDEX ON stop_times (stop_sequence_consec); -CREATE INDEX ON stop_times (trip_id, stop_sequence_consec); -CREATE INDEX ON stop_times (arrival_time DESC NULLS LAST); -CREATE INDEX ON stop_times (departure_time DESC NULLS LAST); --- todo: are these two necessary? -CREATE INDEX ON stop_times (arrival_time); -CREATE INDEX ON stop_times (departure_time); +INSERT INTO stop_times +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT + -- We stay compatible with PostgreSQL's row_number() here, which starts with 0. + row_number() OVER (PARTITION BY trip_id ORDER BY stop_sequence ASC) - 1 AS stop_sequence_consec, + ${has_shape_dist_traveled ? `` : `NULL AS shape_dist_traveled,`} + ${has_timepoint ? `` : `NULL AS timepoint,`} + * + REPLACE ( + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. 
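+		-- pickup_type/drop_off_type (and timepoint) values start at 0, so adding 1 maps them onto the 1-based enum positions.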
+ enum_range(NULL::pickup_drop_off_type)[drop_off_type + 1] AS drop_off_type, + enum_range(NULL::pickup_drop_off_type)[pickup_type + 1] AS pickup_type + ${has_timepoint ? `,enum_range(NULL::timepoint_v)[timepoint + 1] AS timepoint` : ''} + ), + -- todo: is frequencies.txt relative to 1st arrival_time or departure_time? + coalesce( + first_value(departure_time) OVER (PARTITION BY trip_id ORDER BY stop_sequence), + first_value(arrival_time) OVER (PARTITION BY trip_id ORDER BY stop_sequence) + ) AS trip_start_time +FROM read_csv( + '${pathToStopTimes}', + header = true, + all_varchar = true, + types = { + arrival_time: 'INTERVAL', + departure_time: 'INTERVAL', + stop_sequence: 'INTEGER', + pickup_type: 'INTEGER', + drop_off_type: 'INTEGER', + ${has_shape_dist_traveled ? `shape_dist_traveled: 'REAL',` : ``} + ${has_timepoint ? `timepoint: 'INTEGER',` : ``} + } +); -CREATE OR REPLACE FUNCTION largest_departure_time () -RETURNS interval AS $$ - SELECT departure_time - FROM stop_times - WHERE EXISTS ( - SELECT * - FROM trips - JOIN service_days ON service_days.service_id = trips.service_id - WHERE trips.trip_id = stop_times.trip_id - ) - ORDER BY departure_time DESC NULLS LAST - LIMIT 1; -$$ LANGUAGE SQL IMMUTABLE; -CREATE OR REPLACE FUNCTION largest_arrival_time () -RETURNS interval AS $$ - SELECT arrival_time - FROM stop_times - WHERE EXISTS ( - SELECT * - FROM trips - JOIN service_days ON service_days.service_id = trips.service_id - WHERE trips.trip_id = stop_times.trip_id +-- todo: are all of them beneficial/necessary? +CREATE INDEX stop_times_trip_id ON stop_times (trip_id); +CREATE INDEX stop_times_stop_id ON stop_times (stop_id); +CREATE INDEX stop_times_stop_sequence_consec ON stop_times (stop_sequence_consec); +CREATE INDEX stop_times_trip_id_stop_sequence_consec ON stop_times (trip_id, stop_sequence_consec); +-- As of DuckDB v1.0.0, indexes on INTERVAL columns are not supported yet. +-- todo: alternatively just change these columns to INTEGER? 
+-- CREATE INDEX stop_times_arrival_time ON stop_times (arrival_time); +-- CREATE INDEX stop_times_departure_time ON stop_times (departure_time); + +-- todo: use materialized view once DuckDB supports that +-- see also https://github.com/duckdb/duckdb/discussions/3638 +CREATE TABLE largest_arr_dep_time AS +WITH + largest_departure_time AS ( + SELECT departure_time + FROM stop_times stop_times + WHERE EXISTS ( + SELECT * + FROM trips trips + JOIN service_days service_days ON service_days.service_id = trips.service_id + WHERE trips.trip_id = stop_times.trip_id + ) + ORDER BY departure_time DESC + LIMIT 1 + ), + largest_arrival_time AS ( + SELECT arrival_time + FROM stop_times stop_times + WHERE EXISTS ( + SELECT * + FROM trips trips + JOIN service_days service_days ON service_days.service_id = trips.service_id + WHERE trips.trip_id = stop_times.trip_id + ) + ORDER BY arrival_time DESC + LIMIT 1 ) - ORDER BY arrival_time DESC NULLS LAST - LIMIT 1; -$$ LANGUAGE SQL IMMUTABLE; +SELECT + to_seconds(greatest( + epoch(arrival_time), + epoch(departure_time) + )) AS largest +FROM largest_departure_time, largest_arrival_time; + CREATE OR REPLACE FUNCTION dates_filter_min ( - _timestamp TIMESTAMP WITH TIME ZONE + _timestamp ) -RETURNS date AS $$ +AS ( SELECT date_trunc( 'day', - _timestamp - - GREATEST( - largest_arrival_time(), - largest_departure_time() - ) + _timestamp::TIMESTAMP WITH TIME ZONE + - largest -- we assume the DST <-> standard time shift is always <= 1h - '1 hour 1 second'::interval - ); -$$ LANGUAGE SQL IMMUTABLE; + )::DATE AS date_min + FROM largest_arr_dep_time +); -- This function doesn't do much, we just provide it to match date_filter_min(). CREATE OR REPLACE FUNCTION dates_filter_max ( - _timestamp TIMESTAMP WITH TIME ZONE + _timestamp ) -RETURNS date AS $$ - SELECT date_trunc('day', _timestamp); -$$ LANGUAGE SQL IMMUTABLE; +AS ( + SELECT date_trunc( + 'day', + _timestamp::TIMESTAMP WITH TIME ZONE + )::DATE AS date_max +); +-- todo: add "ORDER BY stop_sequence_consec ASC" without affecting performance? CREATE OR REPLACE VIEW arrivals_departures AS WITH stop_times_based AS NOT MATERIALIZED ( SELECT @@ -239,11 +219,11 @@ WITH stop_times_based AS NOT MATERIALIZED ( ) AS wheelchair_boarding FROM ( stop_times s - JOIN stops ON s.stop_id = stops.stop_id + JOIN stops stops ON s.stop_id = stops.stop_id LEFT JOIN stops stations ON stops.parent_station = stations.stop_id - JOIN trips ON s.trip_id = trips.trip_id - JOIN routes ON trips.route_id = routes.route_id - LEFT JOIN agency ON ( + JOIN trips trips ON s.trip_id = trips.trip_id + JOIN routes routes ON trips.route_id = routes.route_id + LEFT JOIN agency agency ON ( -- The GTFS spec allows routes.agency_id to be NULL if there is exactly one agency in the feed. -- Note: We implicitly rely on other parts of the code base to validate that agency has just one row! 
-- It seems that GTFS has allowed this at least since 2016: @@ -251,7 +231,7 @@ WITH stop_times_based AS NOT MATERIALIZED ( routes.agency_id IS NULL -- match first (and only) agency OR routes.agency_id = agency.agency_id -- match by ID ) - JOIN service_days ON trips.service_id = service_days.service_id + JOIN service_days service_days ON trips.service_id = service_days.service_id ) -- todo: this slows down slightly -- ORDER BY route_id, s.trip_id, "date", stop_sequence @@ -259,74 +239,72 @@ WITH stop_times_based AS NOT MATERIALIZED ( -- stop_times-based arrivals/departures SELECT ( - encode(trip_id::bytea, 'base64') - || ':' || encode(( + to_base64(encode(trip_id)) + || ':' || to_base64(encode( extract(ISOYEAR FROM "date") || '-' || lpad(extract(MONTH FROM "date")::text, 2, '0') || '-' || lpad(extract(DAY FROM "date")::text, 2, '0') - )::bytea, 'base64') - || ':' || encode((stop_sequence::text)::bytea, 'base64') + )) + || ':' || to_base64(encode(stop_sequence::text)) -- frequencies_row - || ':' || encode('-1'::bytea, 'base64') + || ':' || to_base64(encode('-1')) -- frequencies_it - || ':' || encode('-1'::bytea, 'base64') + || ':' || to_base64(encode('-1')) ) as arrival_departure_id, - stop_times_based.*, -- todo: expose local arrival/departure "wall clock time"? -1 AS frequencies_row, - -1 AS frequencies_it + -1 AS frequencies_it, + + stop_times_based.* + EXCLUDE ( + arrival_time, + departure_time + ) FROM stop_times_based -UNION ALL +UNION ALL BY NAME -- frequencies-based arrivals/departures SELECT ( - encode(trip_id::bytea, 'base64') - || ':' || encode(( + to_base64(encode(trip_id)) + || ':' || to_base64(encode( extract(ISOYEAR FROM "date") || '-' || lpad(extract(MONTH FROM "date")::text, 2, '0') || '-' || lpad(extract(DAY FROM "date")::text, 2, '0') - )::bytea, 'base64') - || ':' || encode((stop_sequence::text)::bytea, 'base64') - || ':' || encode((frequencies_row::text)::bytea, 'base64') - || ':' || encode((frequencies_it::text)::bytea, 'base64') + )) + || ':' || to_base64(encode(stop_sequence::text)) + || ':' || to_base64(encode(frequencies_row::text)) + || ':' || to_base64(encode(frequencies_it::text)) ) as arrival_departure_id, * FROM ( SELECT - *, - row_number() OVER (PARTITION BY trip_id, "date", frequencies_row, stop_sequence_consec ORDER BY t_departure ASC)::integer AS frequencies_it + row_number() OVER (PARTITION BY trip_id, "date", frequencies_row, stop_sequence_consec ORDER BY t_departure ASC) AS frequencies_it, + * FROM ( SELECT - -- stop_times_based.* except t_arrival & t_departure, duh - -- todo: find a way to use all columns without explicitly enumerating them here - agency_id, - route_id, route_short_name, route_long_name, route_type, - trip_id, direction_id, trip_headsign, wheelchair_accessible, bikes_allowed, - service_id, - shape_id, - "date", - stop_sequence, stop_sequence_consec, - stop_headsign, pickup_type, drop_off_type, shape_dist_traveled, timepoint, - tz, - arrival_time, -- todo [breaking]: this is misleading, remove it - generate_series( - t_arrival - trip_start_time + start_time, - t_arrival - trip_start_time + end_time, - INTERVAL '1 second' * headway_secs - ) as t_arrival, - departure_time, -- todo [breaking]: this is misleading, remove it - generate_series( - t_departure - trip_start_time + start_time, - t_departure - trip_start_time + end_time, - INTERVAL '1 second' * headway_secs - ) as t_departure, - trip_start_time, - stop_id, stop_name, - station_id, station_name, - wheelchair_boarding, - frequencies_row + frequencies_based.* + EXCLUDE ( + 
arrival_time, + departure_time, + start_time, + end_time, + trip_start_time, + headway_secs + ) + REPLACE ( + unnest(generate_series( + t_arrival - trip_start_time + start_time, + t_arrival - trip_start_time + end_time, + INTERVAL '1 second' * headway_secs + )) as t_arrival, + unnest(generate_series( + t_departure - trip_start_time + start_time, + t_departure - trip_start_time + end_time, + INTERVAL '1 second' * headway_secs + )) as t_departure, + ) FROM ( SELECT stop_times_based.*, @@ -335,26 +313,29 @@ FROM ( frequencies.headway_secs, frequencies_row FROM stop_times_based - JOIN frequencies ON frequencies.trip_id = stop_times_based.trip_id + JOIN frequencies frequencies ON frequencies.trip_id = stop_times_based.trip_id WHERE frequencies.exact_times = 'schedule_based' -- todo: is this correct? +) frequencies_based ) t -) t -) frequencies_based; - -CREATE OR REPLACE FUNCTION arrival_departure_by_arrival_departure_id(id TEXT) -RETURNS arrivals_departures -AS $$ - SELECT * - FROM arrivals_departures - WHERE trip_id = convert_from(decode(split_part(id, ':', 1), 'base64'), 'UTF-8')::text - AND "date" = (convert_from(decode(split_part(id, ':', 2), 'base64'), 'UTF-8')::text)::timestamp - AND stop_sequence = (convert_from(decode(split_part(id, ':', 3), 'base64'), 'UTF-8')::text)::integer - AND (convert_from(decode(split_part(id, ':', 4), 'base64'), 'UTF-8')::text)::integer = frequencies_row - AND (convert_from(decode(split_part(id, ':', 5), 'base64'), 'UTF-8')::text)::integer = frequencies_it - -- todo: what if there are >1 rows? - LIMIT 1; -$$ LANGUAGE SQL STABLE STRICT; - +) t; + +-- CREATE OR REPLACE FUNCTION arrival_departure_by_arrival_departure_id(id TEXT) +-- RETURNS arrivals_departures +-- AS $$ +-- SELECT * +-- FROM arrivals_departures arrivals_departures +-- WHERE trip_id = decode(from_base64(split_part(id, ':', 1))) +-- AND "date" = decode(from_base64(split_part(id, ':', 2)))::timestamp +-- AND stop_sequence = decode(from_base64(split_part(id, ':', 3)))::integer +-- AND decode(from_base64(split_part(id, ':', 4)))::integer = frequencies_row +-- AND decode(from_base64(split_part(id, ':', 5)))::integer = frequencies_it +-- -- todo: what if there are >1 rows? +-- LIMIT 1; +-- $$ LANGUAGE SQL STABLE STRICT; +`) + + await db[RUN](`\ +-- todo: add "ORDER BY stop_sequence_consec ASC" without affecting performance? CREATE OR REPLACE VIEW connections AS WITH stop_times_based AS NOT MATERIALIZED ( SELECT @@ -459,9 +440,9 @@ WITH stop_times_based AS NOT MATERIALIZED ( nullif(to_stations.wheelchair_boarding, 'no_info_or_inherit'), 'no_info_or_inherit' ) AS to_wheelchair_boarding - FROM trips - LEFT JOIN routes ON trips.route_id = routes.route_id - LEFT JOIN agency ON ( + FROM trips trips + LEFT JOIN routes routes ON trips.route_id = routes.route_id + LEFT JOIN agency agency ON ( -- The GTFS spec allows routes.agency_id to be NULL if there is exactly one agency in the feed. -- Note: We implicitly rely on other parts of the code base to validate that agency has just one row! 
-- It seems that GTFS has allowed this at least since 2016: @@ -469,7 +450,7 @@ WITH stop_times_based AS NOT MATERIALIZED ( routes.agency_id IS NULL -- match first (and only) agency OR routes.agency_id = agency.agency_id -- match by ID ) - LEFT JOIN stop_times ON trips.trip_id = stop_times.trip_id + LEFT JOIN stop_times stop_times ON trips.trip_id = stop_times.trip_id LEFT JOIN stops from_stops ON stop_times.stop_id = from_stops.stop_id LEFT JOIN stops from_stations ON from_stops.parent_station = from_stations.stop_id INNER JOIN stop_times to_stop_times ON stop_times.trip_id = to_stop_times.trip_id AND stop_times.stop_sequence_consec + 1 = to_stop_times.stop_sequence_consec @@ -478,103 +459,77 @@ WITH stop_times_based AS NOT MATERIALIZED ( ) trips JOIN ( SELECT * - FROM service_days + FROM service_days service_days ORDER BY service_id, "date" ) service_days ON trips.service_id = service_days.service_id ) -- stop_times-based connections SELECT ( - encode(trip_id::bytea, 'base64') - || ':' || encode(( + to_base64(encode(trip_id)) + || ':' || to_base64(encode( extract(ISOYEAR FROM "date") || '-' || lpad(extract(MONTH FROM "date")::text, 2, '0') || '-' || lpad(extract(DAY FROM "date")::text, 2, '0') - )::bytea, 'base64') - || ':' || encode((from_stop_sequence::text)::bytea, 'base64') + )) + || ':' || to_base64(encode(from_stop_sequence::text)) -- frequencies_row - || ':' || encode('-1'::bytea, 'base64') + || ':' || to_base64(encode('-1')) -- frequencies_it - || ':' || encode('-1'::bytea, 'base64') + || ':' || to_base64(encode('-1')) ) as connection_id, - stop_times_based.*, - -1 AS frequencies_row, - -1 AS frequencies_it + -1 AS frequencies_it, + + stop_times_based.* + EXCLUDE ( + arrival_time, + departure_time + ) FROM stop_times_based -UNION ALL +UNION ALL BY NAME -- frequencies-based connections SELECT ( - encode(trip_id::bytea, 'base64') - || ':' || encode(( + to_base64(encode(trip_id)) + || ':' || to_base64(encode( extract(ISOYEAR FROM "date") || '-' || lpad(extract(MONTH FROM "date")::text, 2, '0') || '-' || lpad(extract(DAY FROM "date")::text, 2, '0') - )::bytea, 'base64') - || ':' || encode((from_stop_sequence::text)::bytea, 'base64') - || ':' || encode((frequencies_row::text)::bytea, 'base64') - || ':' || encode((frequencies_it::text)::bytea, 'base64') + )) + || ':' || to_base64(encode(from_stop_sequence::text)) + || ':' || to_base64(encode(frequencies_row::text)) + || ':' || to_base64(encode(frequencies_it::text)) ) as connection_id, - - frequencies_based.* + * FROM ( SELECT - *, - row_number() OVER (PARTITION BY trip_id, "date", frequencies_row, from_stop_sequence_consec ORDER BY t_departure ASC)::integer AS frequencies_it + row_number() OVER (PARTITION BY trip_id, "date", frequencies_row, from_stop_sequence_consec ORDER BY t_departure ASC) AS frequencies_it, + * FROM ( SELECT - -- stop_times_based.* except t_arrival & t_departure, duh - -- todo: find a way to use all columns without explicitly enumerating them here - route_id, route_short_name, route_long_name, route_type, - trip_id, - service_id, - direction_id, - trip_headsign, - wheelchair_accessible, - bikes_allowed, - trip_start_time, - - from_stop_id, - from_stop_name, - from_station_id, - from_station_name, - from_wheelchair_boarding, - - from_stop_headsign, - from_pickup_type, - generate_series( - t_departure - trip_start_time + start_time, - t_departure - trip_start_time + end_time, - INTERVAL '1 second' * headway_secs - ) as t_departure, - departure_time, -- todo [breaking]: this is misleading, remove it - 
from_stop_sequence, - from_stop_sequence_consec, - from_timepoint, - - "date", - - to_timepoint, - to_stop_sequence, - to_stop_sequence_consec, - generate_series( - t_arrival - trip_start_time + start_time, - t_arrival - trip_start_time + end_time, - INTERVAL '1 second' * headway_secs - ) as t_arrival, - arrival_time, -- todo [breaking]: this is misleading, remove it - to_drop_off_type, - to_stop_headsign, - - to_stop_id, - to_stop_name, - to_station_id, - to_station_name, - to_wheelchair_boarding, - - frequencies_row + frequencies_based.* + EXCLUDE ( + arrival_time, + departure_time, + start_time, + end_time, + trip_start_time, + headway_secs + ) + REPLACE ( + unnest(generate_series( + t_departure - trip_start_time + start_time, + t_departure - trip_start_time + end_time, + INTERVAL '1 second' * headway_secs + )) as t_departure, + unnest(generate_series( + t_arrival - trip_start_time + start_time, + t_arrival - trip_start_time + end_time, + INTERVAL '1 second' * headway_secs + )) as t_arrival + ) FROM ( SELECT stop_times_based.*, @@ -583,32 +538,28 @@ FROM ( frequencies.headway_secs, frequencies_row FROM stop_times_based - JOIN frequencies ON frequencies.trip_id = stop_times_based.trip_id + JOIN frequencies frequencies ON frequencies.trip_id = stop_times_based.trip_id WHERE frequencies.exact_times = 'schedule_based' -- todo: is this correct? +) frequencies_based ) t -) t -) frequencies_based; - -CREATE OR REPLACE FUNCTION connection_by_connection_id(id TEXT) -RETURNS connections -AS $$ - SELECT * - FROM connections - WHERE trip_id = convert_from(decode(split_part(id, ':', 1), 'base64'), 'UTF-8')::text - AND "date" = (convert_from(decode(split_part(id, ':', 2), 'base64'), 'UTF-8')::text)::timestamp - AND from_stop_sequence = (convert_from(decode(split_part(id, ':', 3), 'base64'), 'UTF-8')::text)::integer - AND (convert_from(decode(split_part(id, ':', 4), 'base64'), 'UTF-8')::text)::integer = frequencies_row - AND (convert_from(decode(split_part(id, ':', 5), 'base64'), 'UTF-8')::text)::integer = frequencies_it - -- todo: what if there are >1 rows? - LIMIT 1; -$$ LANGUAGE SQL STABLE STRICT; -` - - - - -module.exports = { - beforeAll, - formatRow: formatStopTimesRow, - afterAll, +) t; + +-- CREATE OR REPLACE FUNCTION connection_by_connection_id(id TEXT) +-- RETURNS connections +-- AS $$ +-- SELECT * +-- FROM connections connections +-- WHERE trip_id = decode(from_base64(split_part(id, ':', 1))) +-- AND "date" = decode(from_base64(split_part(id, ':', 2)))::timestamp +-- AND from_stop_sequence = decode(from_base64(split_part(id, ':', 3)))::integer +-- AND decode(from_base64(split_part(id, ':', 4)))::integer = frequencies_row +-- AND decode(from_base64(split_part(id, ':', 5)))::integer = frequencies_it +-- -- todo: what if there are >1 rows? +-- LIMIT 1; +-- $$ LANGUAGE SQL STABLE STRICT; +`) + + workingState.nrOfRowsByName.set('stop_times', await queryNumberOfRows(db, 'stop_times', opt)) } + +module.exports = importData diff --git a/lib/stops.js b/lib/stops.js index a76f5ba..d207bba 100644 --- a/lib/stops.js +++ b/lib/stops.js @@ -1,7 +1,40 @@ 'use strict' +const RUN = require('./run.js') +const {queryIfColumnsExist} = require('./columns.js') +const {queryNumberOfRows} = require('./rows-count.js') + // https://gtfs.org/documentation/schedule/reference/#stopstxt -const beforeAll = (opt) => `\ +const importData = async (db, pathToStops, opt, workingState) => { + // Several columns are optional, so they may be missing in a `read_csv()` result. 
+ // It seems like, as of DuckDB v1.0.0, there is no way to assign default values to missing columns, neither with read_csv() nor with a nested subquery. + // todo: github ticket? + // This is why we check the file first and then programmatically determine the set of SELECT-ed columns below. + const { + stop_code: has_stop_code, + stop_desc: has_stop_desc, + zone_id: has_zone_id, + stop_url: has_stop_url, + location_type: has_location_type, + parent_station: has_parent_station, + stop_timezone: has_stop_timezone, + wheelchair_boarding: has_wheelchair_boarding, + level_id: has_level_id, + platform_code: has_platform_code, + } = await queryIfColumnsExist(db, pathToStops, [ + 'stop_code', + 'stop_desc', + 'zone_id', + 'stop_url', + 'location_type', + 'parent_station', + 'stop_timezone', + 'wheelchair_boarding', + 'level_id', + 'platform_code', + ]) + + await db[RUN](`\ CREATE TYPE location_type_val AS ENUM ( 'stop' -- 0 (or blank): Stop (or Platform). A location where passengers board or disembark from a transit vehicle. Is called a platform when defined within a parent_station. , 'station' -- 1 – Station. A physical structure or area that contains one or more platform. @@ -9,7 +42,7 @@ CREATE TYPE location_type_val AS ENUM ( , 'node' -- 3 – Generic Node. A location within a station, not matching any other location_type, which can be used to link together pathways define in pathways.txt. , 'boarding_area' -- 4 – Boarding Area. A specific location on a platform, where passengers can board and/or alight vehicles. ); -CREATE CAST (location_type_val AS text) WITH INOUT AS IMPLICIT; +-- CREATE CAST (location_type_val AS text) WITH INOUT AS IMPLICIT; -- For parentless stops: -- 0 or empty - No accessibility information for the stop. @@ -30,7 +63,10 @@ CREATE TYPE wheelchair_boarding_val AS ENUM ( , 'accessible' , 'not_accessible' ); -CREATE CAST (wheelchair_boarding_val AS text) WITH INOUT AS IMPLICIT; +-- CREATE CAST (wheelchair_boarding_val AS text) WITH INOUT AS IMPLICIT; + +INSTALL spatial; -- todo: make install optional? +LOAD spatial; CREATE TABLE stops ( stop_id TEXT PRIMARY KEY, @@ -38,86 +74,116 @@ CREATE TABLE stops ( -- todo: Required for locations which are stops (location_type=0), stations (location_type=1) or entrances/exits (location_type=2). Optional for locations which are generic nodes (location_type=3) or boarding areas (location_type=4). stop_name TEXT, stop_desc TEXT, - stop_loc geography(POINT), -- stop_lat/stop_lon + stop_loc GEOMETRY, -- stop_lat/stop_lon zone_id TEXT, stop_url TEXT, location_type location_type_val, parent_station TEXT, - stop_timezone TEXT CHECK (is_timezone(stop_timezone)), + -- In stops.txt, *any* row's parent_station might reference *any* other row. Essentially, stops.txt describes a tree. + -- As of DuckDB v1.0.0, it *seems* like adding a foreign key constraint here doesn't work, even if we order the stops to put parents before their children (see below). + -- todo: Report this with DuckDB? Alternatively, add the constraint after the import (see below). + -- FOREIGN KEY (parent_station) REFERENCES stops, + stop_timezone TEXT, + FOREIGN KEY (stop_timezone) REFERENCES valid_timezones, wheelchair_boarding wheelchair_boarding_val, level_id TEXT, ${opt.stopsWithoutLevelId ? 
'' : `FOREIGN KEY (level_id) REFERENCES levels,`} platform_code TEXT ); -COPY stops ( - stop_id, - stop_code, - stop_name, - stop_desc, - stop_loc, - zone_id, - stop_url, - location_type, - parent_station, - stop_timezone, - wheelchair_boarding, - level_id, - platform_code -) FROM STDIN csv; -` +INSERT INTO stops +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +WITH RECURSIVE + stops AS ( + SELECT + ${has_stop_code ? `` : `NULL AS stop_code,`} + ${has_stop_desc ? `` : `NULL AS stop_desc,`} + ${has_zone_id ? `` : `NULL AS zone_id,`} + ${has_stop_url ? `` : `NULL AS stop_url,`} + ${has_location_type ? `` : `NULL AS location_type,`} + ${has_parent_station ? `` : `NULL AS parent_station,`} + ${has_stop_timezone ? `` : `NULL AS stop_timezone,`} + ${has_wheelchair_boarding ? `` : `NULL AS wheelchair_boarding,`} + ${has_level_id ? `` : `NULL AS level_id,`} + ${has_platform_code ? `` : `NULL AS platform_code,`} + ST_Point(stop_lon, stop_lat) AS stop_loc, + * + EXCLUDE ( + stop_lat, stop_lon + ) + REPLACE ( + -- dummy entry in case no optional column is present + stop_id AS stop_id, + ${has_location_type ? ` + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. + enum_range(NULL::location_type_val)[location_type + 1] AS location_type, + ` : ``} + ${has_wheelchair_boarding ? ` + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. + enum_range(NULL::wheelchair_boarding_val)[ifnull(wheelchair_boarding, 0) + 1] AS wheelchair_boarding + ` : ``} + ) + FROM read_csv( + '${pathToStops}', + header = true, + -- > This option allows you to specify the types that the sniffer will use when detecting CSV column types. + -- > default: SQLNULL, BOOLEAN, BIGINT, DOUBLE, TIME, DATE, TIMESTAMP, VARCHAR + -- We omit BOOLEAN because GTFS just uses integers for boolean-like fields (e.g. timepoint in trips.txt). + -- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently. + auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'], + types = { + -- dummy entry in case no optional column is present + stop_id: 'TEXT', + ${has_stop_code ? `stop_code: 'TEXT',` : ``} + ${has_location_type ? `location_type: 'INTEGER',` : ``} + ${has_wheelchair_boarding ? `wheelchair_boarding: 'INTEGER',` : ``} + ${has_platform_code ? 
`platform_code: 'TEXT',` : ``} + } + ) + ), + -- order the stops to put parents before their children + stops_sorted_by_parents AS ( + ( + SELECT + *, + stop_id AS root_id, + 1 AS recursion_level + FROM stops + WHERE parent_station IS NULL + ) + UNION ALL + ( + SELECT + children.*, + parent.root_id, + parent.recursion_level + 1 + FROM stops children + JOIN stops_sorted_by_parents parent ON parent.stop_id = children.parent_station + ) + ) +SELECT * EXCLUDE ( + -- omit sorting helper columns + root_id, + recursion_level +) +FROM stops_sorted_by_parents +ORDER BY root_id, recursion_level, stop_id; -const locationType = (val) => { - if (val === '0') return 'stop' - if (val === '1') return 'station' - if (val === '2') return 'entrance_exit' - if (val === '3') return 'node' - if (val === '4') return 'boarding_area' - throw new Error('invalid/unsupported location_type: ' + val) -} +-- todo: DuckDB v1.0.0 doesn't support them yet: +-- > The ADD CONSTRAINT and DROP CONSTRAINT clauses are not yet supported in DuckDB. +-- ALTER TABLE stops +-- ADD CONSTRAINT stops_parent_station_fkey +-- FOREIGN KEY (parent_station) REFERENCES stops; -const wheelchairBoarding = (val) => { - if (val === '0') return 'no_info_or_inherit' - if (val === '1') return 'accessible' - if (val === '2') return 'not_accessible' - throw new Error('invalid/unsupported wheelchair_boarding: ' + val) -} +CREATE INDEX stops_parent_station ON stops (parent_station); +${opt.stopsLocationIndex ? `CREATE INDEX stops_stop_loc ON stops (stop_loc);` : ''} +`) -const formatStopsRow = (s) => { - return [ - s.stop_id || null, - s.stop_code || null, - s.stop_name || null, - s.stop_desc || null, - `POINT(${parseFloat(s.stop_lon)} ${parseFloat(s.stop_lat)})`, - s.zone_id || null, - s.stop_url || null, - s.location_type - ? locationType(s.location_type) - : null, - s.parent_station || null, - s.stop_timezone || null, - s.wheelchair_boarding - ? wheelchairBoarding(s.wheelchair_boarding) - : null, - s.level_id || null, - s.platform_code || null, - ] + workingState.nrOfRowsByName.set('stops', await queryNumberOfRows(db, 'stops', opt)) } -const afterAll = (opt) => `\ -\\. - -ALTER TABLE stops -ADD CONSTRAINT stops_parent_station_fkey -FOREIGN KEY (parent_station) REFERENCES stops; - -CREATE INDEX ON stops (parent_station); -${opt.stopsLocationIndex ? `CREATE INDEX ON stops (stop_loc);` : ''} -` - -module.exports = { - beforeAll, - formatRow: formatStopsRow, - afterAll, -} +module.exports = importData diff --git a/lib/transfers.js b/lib/transfers.js index cfd550b..b2844d1 100644 --- a/lib/transfers.js +++ b/lib/transfers.js @@ -1,17 +1,30 @@ 'use strict' +const RUN = require('./run.js') +const {queryIfColumnsExist} = require('./columns.js') +const {queryNumberOfRows} = require('./rows-count.js') + // https://gtfs.org/documentation/schedule/reference/#transferstxt -const beforeAll = (opt) => `\ +const importData = async (db, pathToTransfers, opt, workingState) => { + // min_transfer_time is optional, so the entire column can be missing. + // It seems like, as of DuckDB v1.0.0, there is no way to assign default values to missing columns, neither with read_csv() nor with a nested subquery. + // This is why we check the file first and then programmatically determine the set of SELECT-ed columns below. 
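+	// A possible way to do such a check (a sketch only, not necessarily how
+	// queryIfColumnsExist() is implemented): DESCRIBE the read_csv() result and
+	// look for the column name, e.g. with a hypothetical path:
+	//   DESCRIBE SELECT * FROM read_csv('transfers.txt', header = true);
+	// This returns one row per detected column (including a column_name field),
+	// so min_transfer_time only shows up if the CSV file actually has it.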
+ const { + min_transfer_time: has_min_transfer_time, + } = await queryIfColumnsExist(db, pathToTransfers, [ + 'min_transfer_time', + ]) + + await db[RUN](`\ CREATE TYPE transfer_type_v AS ENUM ( 'recommended' -- 0 or empty - Recommended transfer point between routes. , 'timed' -- 1 - Timed transfer point between two routes. The departing vehicle is expected to wait for the arriving one and leave sufficient time for a rider to transfer between routes. , 'minimum_time' -- 2 – Transfer requires a minimum amount of time between arrival and departure to ensure a connection. The time required to transfer is specified by min_transfer_time. , 'impossible' -- 3 - Transfers are not possible between routes at the location. ); -CREATE CAST (transfer_type_v AS text) WITH INOUT AS IMPLICIT; +-- CREATE CAST (transfer_type_v AS text) WITH INOUT AS IMPLICIT; CREATE TABLE transfers ( - id SERIAL PRIMARY KEY, from_stop_id TEXT, FOREIGN KEY (from_stop_id) REFERENCES stops, to_stop_id TEXT, @@ -25,59 +38,39 @@ CREATE TABLE transfers ( from_trip_id TEXT, FOREIGN KEY (from_trip_id) REFERENCES trips, to_trip_id TEXT, - FOREIGN KEY (from_trip_id) REFERENCES trips + FOREIGN KEY (from_trip_id) REFERENCES trips, + -- We're not using a primary key index here because several columns can be NULL. + UNIQUE ( + from_stop_id, + from_trip_id, + from_route_id, + to_stop_id, + to_trip_id, + to_route_id + ) ); -ALTER TABLE transfers -ADD CONSTRAINT transfers_sig -UNIQUE ( - from_stop_id, - to_stop_id, - from_route_id, - to_route_id, - from_trip_id, - to_trip_id +INSERT INTO transfers +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT * REPLACE ( + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. + enum_range(NULL::transfer_type_v)[transfer_type + 1] AS transfer_type +) +FROM read_csv( + '${pathToTransfers}', + header = true, + all_varchar = true, + types = { + transfer_type: 'INTEGER' + ${has_min_transfer_time ? `, min_transfer_time: 'INTEGER'` : ``} + } ); +`) -COPY transfers ( - from_stop_id, - to_stop_id, - transfer_type, - min_transfer_time, - from_route_id, - to_route_id, - from_trip_id, - to_trip_id -) FROM STDIN csv; -` - -const transferType = (val) => { - if (val === '0') return 'recommended' - if (val === '1') return 'timed' - if (val === '2') return 'minimum_time' - if (val === '3') return 'impossible' - throw new Error('invalid/unsupported transfer_type: ' + val) + workingState.nrOfRowsByName.set('frequencies', await queryNumberOfRows(db, 'frequencies', opt)) } -const formatTransfersRow = (t) => { - return [ - t.from_stop_id || null, - t.to_stop_id || null, - t.transfer_type ? transferType(t.transfer_type) : null, - t.min_transfer_time ? parseInt(t.min_transfer_time) : null, - t.from_route_id, - t.to_route_id, - t.from_trip_id, - t.to_trip_id, - ] -} - -const afterAll = (opt) => `\ -\\. 
-` - -module.exports = { - beforeAll, - formatRow: formatTransfersRow, - afterAll, -} +module.exports = importData diff --git a/lib/translations.js b/lib/translations.js index 8c858aa..c9d69f7 100644 --- a/lib/translations.js +++ b/lib/translations.js @@ -1,616 +1,743 @@ 'use strict' +const {strictEqual} = require('assert') +const RUN = require('./run.js') +const {queryNumberOfRows} = require('./rows-count.js') + +// > ## record_id +// > Defines the record that corresponds to the field to be translated. The value in record_id must be the first or only field of a table's primary key, as defined in the primary key attribute for each table and below: +// > - agency_id for agency +// > - stop_id for stops +// > - route_id for routes +// > - trip_id for trips +// > - trip_id for stop_times +// > - pathway_id for pathways +// > - level_id for levels +// > - attribution_id for attribution +// > Fields in tables not defined above should not be translated. However producers sometimes add extra fields that are outside the official specification and these unofficial fields may be translated. Below is the recommended way to use record_id for those tables: +// > - service_id for calendar +// > - service_id for calendar_dates +// > - fare_id for fare_attributes +// > - fare_id for fare_rules +// > - shape_id for shapes +// > - trip_id for frequencies +// > - from_stop_id for transfers +// > ## record_sub_id +// > Helps the record that contains the field to be translated when the table doesn’t have a unique ID. Therefore, the value in record_sub_id is the secondary ID of the table, as defined by the table below: +// > - None for agency.txt +// > - None for stops.txt +// > - None for routes.txt +// > - None for trips.txt +// > - stop_sequence for stop_times.txt +// > - None for pathways.txt +// > - None for levels.txt +// > - None for attributions.txt +// > Fields in tables not defined above should not be translated. However producers sometimes add extra fields that are outside the official specification and these unofficial fields may be translated. Below is the recommended way to use record_sub_id for those tables: +// > - None for calendar.txt +// > - date for calendar_dates.txt +// > - None for fare_attributes.txt +// > - route_id for fare_rules.txt +// > - None for shapes.txt +// > - start_time for frequencies.txt +// > - to_stop_id for transfers.txt // https://gtfs.org/documentation/schedule/reference/#translationstxt -const beforeAll = (opt) => `\ -CREATE OR REPLACE FUNCTION table_exists( - t_name TEXT -) -RETURNS BOOLEAN -AS $$ - SELECT EXISTS ( - SELECT FROM pg_tables - WHERE schemaname = 'main' - AND tablename = t_name - LIMIT 1 - ); -$$ LANGUAGE sql STABLE; - -CREATE OR REPLACE FUNCTION column_exists( - t_name TEXT, - c_name TEXT -) -RETURNS BOOLEAN -AS $$ - SELECT EXISTS ( - SELECT FROM information_schema.columns - WHERE table_schema = 'main' - AND table_name = t_name - AND column_name = c_name - LIMIT 1 - ); -$$ LANGUAGE sql STABLE; - -CREATE TABLE _translations_ref_cols ( - table_name TEXT PRIMARY KEY, - -- todo: only check if columns exist when table exists? 
- record_id_col TEXT NOT NULL - CONSTRAINT valid_record_id_col CHECK ( - NOT table_exists(table_name) - OR - column_exists(table_name, record_id_col) - ), - record_sub_id_col TEXT - CONSTRAINT valid_record_sub_id_col CHECK ( - NOT table_exists(table_name) - OR - record_sub_id_col IS NULL - OR - column_exists(table_name, record_sub_id_col) - ) -); - --- > ## record_id --- > Defines the record that corresponds to the field to be translated. The value in record_id must be the first or only field of a table's primary key, as defined in the primary key attribute for each table and below: --- > - agency_id for agency --- > - stop_id for stops --- > - route_id for routes --- > - trip_id for trips --- > - trip_id for stop_times --- > - pathway_id for pathways --- > - level_id for levels --- > - attribution_id for attribution --- > Fields in tables not defined above should not be translated. However producers sometimes add extra fields that are outside the official specification and these unofficial fields may be translated. Below is the recommended way to use record_id for those tables: --- > - service_id for calendar --- > - service_id for calendar_dates --- > - fare_id for fare_attributes --- > - fare_id for fare_rules --- > - shape_id for shapes --- > - trip_id for frequencies --- > - from_stop_id for transfers --- > ## record_sub_id --- > Helps the record that contains the field to be translated when the table doesn’t have a unique ID. Therefore, the value in record_sub_id is the secondary ID of the table, as defined by the table below: --- > - None for agency.txt --- > - None for stops.txt --- > - None for routes.txt --- > - None for trips.txt --- > - stop_sequence for stop_times.txt --- > - None for pathways.txt --- > - None for levels.txt --- > - None for attributions.txt --- > Fields in tables not defined above should not be translated. However producers sometimes add extra fields that are outside the official specification and these unofficial fields may be translated. 
Below is the recommended way to use record_sub_id for those tables: --- > - None for calendar.txt --- > - date for calendar_dates.txt --- > - None for fare_attributes.txt --- > - route_id for fare_rules.txt --- > - None for shapes.txt --- > - start_time for frequencies.txt --- > - to_stop_id for transfers.txt --- https://gtfs.org/documentation/schedule/reference/#translationstxt -INSERT INTO _translations_ref_cols ( - table_name, - record_id_col, - record_sub_id_col -) VALUES - -- todo: feed_info - ('agency', 'agency_id', NULL), - ('stops', 'stop_id', NULL), - ('routes', 'route_id', NULL), - ('trips', 'trip_id', NULL), - ('stop_times', 'trip_id', 'stop_sequence'), - ('pathways', 'pathway_id', NULL), - ('levels', 'level_id', NULL), - ('attribution', 'attribution_id', NULL), - ('calendar', 'service_id', NULL), - ('calendar_dates', 'service_id', 'date'), - ('fare_attributes', 'fare_id', NULL), - ('fare_rules', 'fare_id', 'route_id'), - ('shapes', 'shape_id', NULL), - ('frequencies', 'trip_id', 'start_time'), - ('transfers', 'from_stop_id', 'to_stop_id') -; - -CREATE OR REPLACE FUNCTION row_exists( - table_name TEXT, - col_a_name TEXT, - col_a_value TEXT, - col_b_name TEXT, - col_b_value TEXT -) -RETURNS BOOLEAN -AS $$ - DECLARE - result BOOLEAN; - BEGIN - IF col_b_name IS NULL THEN - EXECUTE format(' - SELECT EXISTS( - SELECT * - FROM %I.%I -- schema, table_name - WHERE %I = %L -- col_a_name, col_a_value - LIMIT 1 - ) - ', 'main', table_name, col_a_name, col_a_value) - INTO STRICT result; - RETURN result; - ELSE - EXECUTE format(' - SELECT EXISTS( - SELECT * - FROM %I.%I -- schema, table_name - WHERE %I = %L -- col_a_name, col_a_value - AND %I = %L -- col_b_name, col_b_value - LIMIT 1 - ) - ', 'main', table_name, col_a_name, col_a_value, col_b_name, col_b_value) - INTO STRICT result; - RETURN result; - END IF; - END; -$$ LANGUAGE plpgsql STABLE; - --- todo: assert that row_exists works as intended --- SELECT row_exists('stops', 'stop_id', 'de:11000:900120017::2', NULL, NULL); -- Virchowstr. (Berlin) --- SELECT row_exists('stops', 'stop_name', 'Virchowstr. (Berlin)', NULL, NULL); -- Virchowstr. (Berlin) --- SELECT row_exists('stops', 'stop_id', 'non-existent', NULL, NULL); --- SELECT row_exists('stops', 'stop_name', 'non-existent', NULL, NULL); --- SELECT row_exists('stops', 'stop_id', 'de:11000:900120017::2', 'parent_station', 'de:11000:900120017'); -- Virchowstr. (Berlin) with valid parent_station --- SELECT row_exists('stops', 'stop_name', 'Virchowstr. (Berlin)', 'parent_station', 'de:11000:900120017'); -- Virchowstr. (Berlin) with valid parent_station --- SELECT row_exists('stops', 'stop_id', 'de:11000:900120017::2', 'parent_station', 'non-existent'); -- Virchowstr. (Berlin) with invalid parent_station --- SELECT row_exists('stops', 'stop_name', 'Virchowstr. (Berlin)', 'parent_station', 'non-existent'); -- Virchowstr. (Berlin) with invalid parent_station --- SELECT row_exists('stops', 'stop_id', 'de:11000:900120017::2', 'non-existent', 'de:11000:900120017'); -- Virchowstr. (Berlin) with invalid column B, should fail --- SELECT row_exists('stops', 'stop_name', 'Virchowstr. (Berlin)', 'non-existent', 'de:11000:900120017'); -- Virchowstr. 
(Berlin) with invalid column B, should fail --- todo: assert that it fails with 2 rows - -CREATE OR REPLACE FUNCTION is_valid_translation_ref( - _table_name TEXT, - _field_name TEXT, - _record_id TEXT, - _record_sub_id TEXT, - _field_value TEXT -) -RETURNS BOOLEAN -AS $$ - DECLARE - _record_id_col TEXT; - _record_sub_id_col TEXT; - result BOOLEAN; - BEGIN - IF _record_id IS NOT NULL THEN - SELECT record_id_col - FROM _translations_ref_cols - WHERE table_name = _table_name - LIMIT 1 - INTO _record_id_col; - SELECT record_sub_id_col - FROM _translations_ref_cols - WHERE table_name = _table_name - LIMIT 1 - INTO _record_sub_id_col; - - IF _record_sub_id_col IS NULL AND _record_sub_id IS NOT NULL THEN - RAISE EXCEPTION - USING - MESSAGE = format('record_sub_id must be NULL for %I but is %L', _table_name, _record_sub_id), - ERRCODE = 'data_exception'; - END IF; - SELECT row_exists( - _table_name, - _record_id_col, _record_id, - _record_sub_id_col, _record_sub_id - ) - INTO STRICT result; - RETURN result; - ELSEIF _field_value IS NOT NULL THEN - SELECT row_exists( - _table_name, - _field_name, _field_value, - NULL, NULL - ) - INTO STRICT result; - RETURN result; - ELSE - RAISE EXCEPTION - USING - MESSAGE = 'Either record_id or field_value must be NOT NULL', - HINT = 'Refer to translations.txt the GTFS Static/Schedule reference.', - ERRCODE = 'data_exception'; - END IF; - END; -$$ LANGUAGE plpgsql STABLE; - --- The MobilityData GTFS Validator just uses Java's Locale#toLanguageTag() to validate "language". --- https://github.com/MobilityData/gtfs-validator/blob/a11b7489902dd54dc194af1f1515583406ba3716/main/src/main/java/org/mobilitydata/gtfsvalidator/table/GtfsTranslationSchema.java#L36 --- https://docs.oracle.com/javase/7/docs/api/java/util/Locale.html --- related: https://github.com/google/transit/pull/98 - --- https://gtfs.org/documentation/schedule/reference/#translationstxt -CREATE TABLE translations ( - -- > Defines the table that contains the field to be translated. Allowed values are: - -- > agency, stops, routes, trips, stop_times, pathways, levels, feed_info, attributions - -- > Any file added to GTFS will have a table_name value equivalent to the file name, as listed above (i.e., not including the .txt file extension). - table_name TEXT NOT NULL, - - -- > Name of the field to be translated. […] Fields with other types should not be translated. - field_name TEXT NOT NULL - CONSTRAINT valid_field_name CHECK ( - NOT table_exists(table_name) - OR - column_exists(table_name, field_name) - ), - - language TEXT NOT NULL - CONSTRAINT valid_language CHECK ( - NOT table_exists(table_name) - OR - is_valid_lang_code(language) - ), - - translation TEXT NOT NULL, - - -- > Defines the record that corresponds to the field to be translated. The value in record_id must be the first or only field of a table's primary key, as defined in the primary key attribute for each table and below […]. - -- > Fields in tables not defined above should not be translated. However producers sometimes add extra fields that are outside the official specification and these unofficial fields may be translated. […] - -- > Conditionally Required: - -- > - Forbidden if table_name is feed_info. - -- > - Forbidden if field_value is defined. - -- > - Required if field_value is empty. - record_id TEXT, - - -- > Helps the record that contains the field to be translated when the table doesn’t have a unique ID. 
Therefore, the value in record_sub_id is the secondary ID of the table, as defined by the table below: - -- > - None for agency.txt - -- > - None for stops.txt - -- > - None for routes.txt - -- > - None for trips.txt - -- > - stop_sequence for stop_times.txt - -- > - None for pathways.txt - -- > - None for levels.txt - -- > - None for attributions.txt - -- > Fields in tables not defined above should not be translated. However producers sometimes add extra fields that are outside the official specification and these unofficial fields may be translated. Below is the recommended way to use record_sub_id for those tables: - -- > - None for calendar.txt - -- > - date for calendar_dates.txt - -- > - None for fare_attributes.txt - -- > - route_id for fare_rules.txt - -- > - None for shapes.txt - -- > - start_time for frequencies.txt - -- > - to_stop_id for transfers.txt - -- > Conditionally Required: - -- > - Forbidden if table_name is feed_info. - -- > - Forbidden if field_value is defined. - -- > - Required if table_name=stop_times and record_id is defined. - record_sub_id TEXT, - - -- > Instead of […] using record_id and record_sub_id, this field can be used […]. When used, the translation will be applied when the fields identified by table_name and field_name contains the exact same value defined in field_value. - -- > The field must have exactly the value defined in field_value. If only a subset of the value matches field_value, the translation won’t be applied. - -- > Conditionally Required: - -- > - Forbidden if table_name is feed_info. - -- > - Forbidden if record_id is defined. - -- > - Required if record_id is empty. - -- todo: - -- > If two translation rules match the same record (one with field_value, and the other one with record_id), the rule with record_id takes precedence. 
- field_value TEXT, - - CONSTRAINT field_value_or_record_id CHECK ( - field_value IS NULL OR record_id IS NULL - ), - CONSTRAINT not_with_feed_info CHECK ( - field_value IS NULL OR table_name != 'feed_info' - ), +const supportedTranslationRefs = new Map([ + ['agency', { + src_table_name: 'agency', + record_id_column: 'agency_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }], + ['stops', { + src_table_name: 'stops', + record_id_column: 'stop_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }], + ['routes', { + src_table_name: 'routes', + record_id_column: 'route_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }], + ['trips', { + src_table_name: 'trips', + record_id_column: 'trip_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }], + ['stop_times', { + src_table_name: 'stop_times', + record_id_column: 'trip_id', + record_sub_id_column: 'stop_sequence', record_sub_id_column_type: 'INTEGER', + }], + ['pathways', { + src_table_name: 'pathways', + record_id_column: 'pathway_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }], + ['levels', { + src_table_name: 'levels', + record_id_column: 'level_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }], + // todo: attribution.txt is not supported yet + // ['attribution', { + // src_table_name: 'attribution', + // record_id_column: 'attribution_id', + // record_sub_id_column: null, record_sub_id_column_type: null, + // }], + ['calendar', { + src_table_name: 'calendar', + record_id_column: 'service_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }], + ['calendar_dates', { + src_table_name: 'calendar_dates', + record_id_column: 'service_id', + record_sub_id_column: 'date', record_sub_id_column_type: 'DATE', + }], + // todo: fare_attributes.txt & fare_rules.txt are not supported yet + // ['fare_attributes', { + // src_table_name: 'fare_attributes', + // record_id_column: 'fare_id', + // record_sub_id_column: null, record_sub_id_column_type: null, + // }], + // ['fare_rules', { + // src_table_name: 'fare_rules', + // record_id_column: 'fare_id', + // record_sub_id_column: 'route_id', record_sub_id_column_type: 'TEXT', + // }], + ['shapes', { + src_table_name: 'shapes', + record_id_column: 'shape_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }], + // frequencies.txt has no primary key and/or unique index yet because DuckDB doesn't support indexes on INTERVAL. See frequencies.js for more details. + // ['frequencies', { + // src_table_name: 'frequencies', + // record_id_column: 'trip_id', + // record_sub_id_column: 'start_time', record_sub_id_column_type: 'INTERVAL', + // }], + // transfers' rows are *not* unique on (from_stop_id, to_stop_id), so we cannot create a foreign key reference on the table. 
+ // todo: find a workaround + // ['transfers', { + // src_table_name: 'transfers', + // record_id_column: 'from_stop_id', + // record_sub_id_column: 'to_stop_id', record_sub_id_column_type: 'TEXT', + // }], + ['feed_info', { + src_table_name: 'feed_info', + record_id_column: null, + record_sub_id_column: null, record_sub_id_column_type: null, + }], +]) + +const _srcTableRefSql = (table_name) => { + return `_translations_${table_name}` +} - CONSTRAINT valid_reference CHECK ( - NOT table_exists(table_name) - OR - table_name = 'feed_info' - OR - is_valid_translation_ref( - table_name, - field_name, - record_id, - record_sub_id, - field_value +const _srcTablesSql = (pathToTranslations, table_name, translationRef) => { + const { + record_id_column, + record_sub_id_column, record_sub_id_column_type, + } = translationRef + + const hasCol = record_id_column !== null + const colRef = hasCol ? `"${record_id_column}"` : null + const hasSubCol = record_sub_id_column !== null + const subColRef = hasSubCol ? `"${record_sub_id_column}"` : null + const srcTableRef = _srcTableRefSql(table_name) + + return `\ +CREATE TABLE ${srcTableRef} ( + ${hasCol ? `record_id TEXT NOT NULL,` : ``} + ${hasSubCol ? `record_sub_id ${record_sub_id_column_type} NOT NULL,` : ``} +${hasCol ? `\ + FOREIGN KEY ( + record_id + ${hasSubCol ? `, record_sub_id` : ``} ) - ), - - -- > Primary key (table_name, field_name, language, record_id, record_sub_id, field_value) - -- https://gtfs.org/documentation/schedule/reference/#translationstxt - -- PostgreSQL doesn't allow NULL values for primary key columns, so we use UNIQUE. - UNIQUE ( - table_name, - field_name, - language, - record_id, - record_sub_id, - field_value - ) + REFERENCES ${table_name} ( + ${colRef} + ${hasSubCol ? `, ${subColRef}` : ``} + ),\ +` : ``} + field_name TEXT NOT NULL, -- todo: validate via all_columns helper view + language TEXT NOT NULL, -- todo: validate just like agency.agency_lang + translation TEXT NOT NULL ); -COPY translations ( - table_name, +INSERT INTO ${srcTableRef} +SELECT + ${hasCol ? `record_id,` : ``} + ${hasSubCol ? `record_sub_id,` : ``} field_name, language, - translation, - record_id, - record_sub_id, - field_value -) FROM STDIN csv; + translation +FROM read_csv( + '${pathToTranslations}', + header = true, + all_varchar = true +) +WHERE table_name = '${table_name}' +-- todo: support field_value-based translations +AND field_value IS NULL; ` - -const formatTranslationsRow = (t) => { - return [ - t.table_name || null, - t.field_name || null, - t.language || null, - t.translation || null, - t.record_id || null, - t.record_sub_id || null, - t.field_value || null, - ] } - -const afterAll = (opt) => `\ -\\. 
- --- todo -CREATE INDEX ON translations ( - table_name, - field_name, - language, - record_id, - record_sub_id, - field_value +strictEqual( + _srcTablesSql('foo/trans.txt', 'feed_info', { + record_id_column: null, + record_sub_id_column: null, record_sub_id_column_type: null, + }), + `\ +CREATE TABLE _translations_feed_info ( + + + + field_name TEXT NOT NULL, -- todo: validate via all_columns helper view + language TEXT NOT NULL, -- todo: validate just like agency.agency_lang + translation TEXT NOT NULL ); -CREATE OR REPLACE VIEW stops_translated AS +INSERT INTO _translations_feed_info SELECT - -- almost all columns, duh - -- todo: find a way to use all columns without explicitly enumerating them here - stop_id, - stop_code, - coalesce(stop_n_t.translation, stop_name) as stop_name, - stop_n_t.language as stop_name_lang, -- todo: fall back to feed_info.feed_lang? - coalesce(stop_d_t.translation, stop_desc) as stop_desc, - stop_d_t.language as stop_desc_lang, -- todo: fall back to feed_info.feed_lang? - stop_loc, - zone_id, - coalesce(stop_u_t.translation, stop_url) as stop_url, - stop_u_t.language as stop_url_lang, -- todo: fall back to feed_info.feed_lang? - location_type, - parent_station, - stop_timezone, - wheelchair_boarding, - level_id, - platform_code -FROM stops s -LEFT JOIN translations stop_n_t ON ( - stop_n_t.table_name = 'stops' AND stop_n_t.field_name = 'stop_name' - AND (s.stop_id = stop_n_t.record_id OR s.stop_name = stop_n_t.field_value) + + + field_name, + language, + translation +FROM read_csv( + 'foo/trans.txt', + header = true, + all_varchar = true ) -LEFT JOIN translations stop_d_t ON ( - stop_d_t.table_name = 'stops' AND stop_d_t.field_name = 'stop_desc' - AND (s.stop_id = stop_d_t.record_id OR s.stop_name = stop_d_t.field_value) +WHERE table_name = 'feed_info' +-- todo: support field_value-based translations +AND field_value IS NULL; +`, + '_srcTablesSql with feed_info.txt', ) -LEFT JOIN translations stop_u_t ON ( - stop_u_t.table_name = 'stops' AND stop_u_t.field_name = 'stop_url' - AND (s.stop_id = stop_u_t.record_id OR s.stop_name = stop_u_t.field_value) +strictEqual( + _srcTablesSql('foo/trans.txt', 'calendar_dates', { + record_id_column: 'service_id', + record_sub_id_column: 'date', record_sub_id_column_type: 'DATE', + }), + `\ +CREATE TABLE _translations_calendar_dates ( + record_id TEXT NOT NULL, + record_sub_id DATE NOT NULL, + FOREIGN KEY ( + record_id + , record_sub_id + ) + REFERENCES calendar_dates ( + "service_id" + , "date" + ), + field_name TEXT NOT NULL, -- todo: validate via all_columns helper view + language TEXT NOT NULL, -- todo: validate just like agency.agency_lang + translation TEXT NOT NULL ); -CREATE OR REPLACE VIEW routes_translated AS +INSERT INTO _translations_calendar_dates SELECT - -- almost all columns, duh - -- todo: find a way to use all columns without explicitly enumerating them here - route_id, - agency_id, - coalesce(route_s_t.translation, route_short_name) as route_short_name, - route_s_t.language as route_short_name_lang, -- todo: fall back to feed_info.feed_lang? - coalesce(route_l_t.translation, route_long_name) as route_long_name, - route_l_t.language as route_long_name_lang, -- todo: fall back to feed_info.feed_lang? - coalesce(route_d_t.translation, route_desc) as route_desc, - route_d_t.language as route_desc_lang, -- todo: fall back to feed_info.feed_lang? - route_type, - coalesce(route_u_t.translation, route_url) as route_url, - route_u_t.language as route_url_lang, -- todo: fall back to feed_info.feed_lang? 
- route_color, - route_text_color, - route_sort_order -FROM routes r -LEFT JOIN translations route_s_t ON ( - route_s_t.table_name = 'routes' AND route_s_t.field_name = 'route_short_name' - AND (r.route_id = route_s_t.record_id OR r.route_short_name = route_s_t.field_value) + record_id, + record_sub_id, + field_name, + language, + translation +FROM read_csv( + 'foo/trans.txt', + header = true, + all_varchar = true ) -LEFT JOIN translations route_l_t ON ( - route_l_t.table_name = 'routes' AND route_l_t.field_name = 'route_long_name' - AND (r.route_id = route_l_t.record_id OR r.route_long_name = route_l_t.field_value) +WHERE table_name = 'calendar_dates' +-- todo: support field_value-based translations +AND field_value IS NULL; +`, + '_srcTablesSql with calendar_dates.txt', ) -LEFT JOIN translations route_d_t ON ( - route_d_t.table_name = 'routes' AND route_d_t.field_name = 'route_desc' - AND (r.route_id = route_d_t.record_id OR r.route_long_name = route_d_t.field_value) + +const _selectToBeMergedSql = (table_name, translationRef) => { + const { + record_id_column, + record_sub_id_column, + } = translationRef + + const hasCol = record_id_column !== null + const hasSubCol = record_sub_id_column !== null + const srcTableRef = _srcTableRefSql(table_name) + + return `\ + SELECT + '${table_name}' AS table_name, + -- Some UNION-ed tables have non-TEXT record_id/record_sub_id columns (e.g. INTEGER). + -- Given that UNION ALL does implicit casts to match the *first* table, we force TEXT here so that we do not depend on their order. + ${hasCol ? `record_id::TEXT as record_id,` : ``} + ${hasSubCol ? `record_sub_id::TEXT as record_sub_id,` : ``} + * + ${hasCol ? `EXCLUDE ( + record_id + ${hasSubCol ? `, record_sub_id` : ``} + )` : ``} + FROM ${srcTableRef} +` +} +strictEqual( + _selectToBeMergedSql('agency', { + record_id_column: 'agency_id', + record_sub_id_column: null, record_sub_id_column_type: null, + }), + `\ + SELECT + 'agency' AS table_name, + -- Some UNION-ed tables have non-TEXT record_id/record_sub_id columns (e.g. INTEGER). + -- Given that UNION ALL does implicit casts to match the *first* table, we force TEXT here so that we do not depend on their order. + record_id::TEXT as record_id, + + * + EXCLUDE ( + record_id + + ) + FROM _translations_agency +`, + '_selectToBeMergedSql with agency.txt', +) +strictEqual( + _selectToBeMergedSql('calendar_dates', { + record_id_column: 'service_id', + record_sub_id_column: 'date', record_sub_id_column_type: 'DATE', + }), + `\ + SELECT + 'calendar_dates' AS table_name, + -- Some UNION-ed tables have non-TEXT record_id/record_sub_id columns (e.g. INTEGER). + -- Given that UNION ALL does implicit casts to match the *first* table, we force TEXT here so that we do not depend on their order. 
+ record_id::TEXT as record_id, + record_sub_id::TEXT as record_sub_id, + * + EXCLUDE ( + record_id + , record_sub_id + ) + FROM _translations_calendar_dates +`, + '_selectToBeMergedSql with calendar_dates.txt', ) -LEFT JOIN translations route_u_t ON ( - route_u_t.table_name = 'routes' AND route_u_t.field_name = 'route_url' - AND (r.route_id = route_u_t.record_id OR r.route_long_name = route_u_t.field_value) -); --- todo [breaking]: remove in favor of trip_headsign_translations & trip_short_name_translations -CREATE OR REPLACE VIEW trips_translated AS +const _translatedSql = (table_name, translatedCols) => { + const _transRefSql = (col) => `"trans_${col}"` + + const _sqls = Array.from(translatedCols.entries()) + .map(([col, translationRef]) => { + const { + src_table_name, + record_id_column, + record_sub_id_column, + } = translationRef + + const hasCol = record_id_column !== null + const colRef = hasCol ? `"${record_id_column}"` : null + const hasSubCol = record_sub_id_column !== null + const subColRef = hasSubCol ? `"${record_sub_id_column}"` : null + const srcTableRef = _srcTableRefSql(src_table_name) + const transRef = _transRefSql(col) + + return { + colLangSelect: `\ + ${transRef}.language AS "${col}_lang",`, + colReplace: `\ + coalesce(${transRef}.translation, "${col}") AS "${col}"`, + transJoin: `\ +LEFT JOIN ${srcTableRef} ${transRef} ON ( + ${transRef}.field_name = '${col}' + ${hasCol ? `AND data.${colRef} = ${transRef}.record_id` : ``} + ${hasSubCol ? `AND data.${subColRef} = ${transRef}.record_sub_id` : ``} +)`, + } + }) + + return `\ +CREATE VIEW ${table_name}_translated AS SELECT - -- almost all columns, duh - -- todo: find a way to use all columns without explicitly enumerating them here - trip_id, - route_id, - service_id, - coalesce(trip_h_t.translation, trip_headsign) as trip_headsign, - trip_h_t.language as trip_headsign_lang, -- todo: fall back to feed_info.feed_lang? - coalesce(trip_s_t.translation, trip_short_name) as trip_short_name, - trip_s_t.language as trip_short_name_lang, -- todo: fall back to feed_info.feed_lang? - direction_id, - block_id, - shape_id, - wheelchair_accessible, - bikes_allowed -FROM trips t -LEFT JOIN translations trip_s_t ON ( - trip_s_t.table_name = 'trips' AND trip_s_t.field_name = 'trip_short_name' - AND (t.trip_id = trip_s_t.record_id OR t.trip_headsign = trip_s_t.field_value) + -- todo: fall back to feed_info.feed_lang? +${_sqls.map(sql => sql.colLangSelect).join('\n')} + data.* + REPLACE ( +${_sqls.map(sql => sql.colReplace).join(',\n')} + ) +FROM ${table_name} data +${_sqls.map(sql => sql.transJoin).join('\n')}; +` +} +{ + const agencyRef = supportedTranslationRefs.get('agency') + strictEqual( + _translatedSql('agency', new Map([ + ['agency_name', agencyRef], + ['agency_url', agencyRef], + ])), + `\ +CREATE VIEW agency_translated AS +SELECT + -- todo: fall back to feed_info.feed_lang? 
+ "trans_agency_name".language AS "agency_name_lang", + "trans_agency_url".language AS "agency_url_lang", + data.* + REPLACE ( + coalesce("trans_agency_name".translation, "agency_name") AS "agency_name", + coalesce("trans_agency_url".translation, "agency_url") AS "agency_url" + ) +FROM agency data +LEFT JOIN _translations_agency "trans_agency_name" ON ( + "trans_agency_name".field_name = 'agency_name' + AND data."agency_id" = "trans_agency_name".record_id + ) -LEFT JOIN translations trip_h_t ON ( - trip_h_t.table_name = 'trips' AND trip_h_t.field_name = 'trip_headsign' - AND (t.trip_id = trip_h_t.record_id OR t.trip_headsign = trip_h_t.field_value) +LEFT JOIN _translations_agency "trans_agency_url" ON ( + "trans_agency_url".field_name = 'agency_url' + AND data."agency_id" = "trans_agency_url".record_id + ); - -CREATE OR REPLACE VIEW arrivals_departures_translated AS +`, + '_translatedSql with agency.txt', + ) +} +{ + const calendarDatesRef = supportedTranslationRefs.get('calendar_dates') + strictEqual( + _translatedSql('calendar_dates', new Map([ + ['foo', calendarDatesRef], + ['b-a-r', calendarDatesRef], + ])), + `\ +CREATE VIEW calendar_dates_translated AS SELECT - -- almost all columns, duh - -- todo: find a way to use all columns without explicitly enumerating them here - route_id, - coalesce(route_s_t.translation, route_short_name) as route_short_name, - route_s_t.language as route_short_name_lang, -- todo: fall back to feed_info.feed_lang? - coalesce(route_l_t.translation, route_long_name) as route_long_name, - route_l_t.language as route_long_name_lang, -- todo: fall back to feed_info.feed_lang? - route_type, - trip_id, direction_id, - coalesce(trip_t.translation, trip_headsign) as trip_headsign, - trip_t.language as trip_headsign_lang, -- todo: fall back to feed_info.feed_lang? - service_id, - shape_id, - "date", - stop_sequence, - coalesce(stop_times_t.translation, stop_headsign) as stop_headsign, - stop_times_t.language as stop_headsign_lang, -- todo: fall back to feed_info.feed_lang? - pickup_type, drop_off_type, shape_dist_traveled, timepoint, - tz, - arrival_time, t_arrival, - departure_time, t_departure, - stop_id, - coalesce(stop_t.translation, stop_name) as stop_name, - stop_t.language as stop_name_lang, -- todo: fall back to feed_info.feed_lang? - station_id, - coalesce(station_t.translation, station_name) as station_name, - station_t.language as station_name_lang -- todo: fall back to feed_info.feed_lang? 
-FROM arrivals_departures ad -LEFT JOIN translations route_s_t ON ( - route_s_t.table_name = 'routes' AND route_s_t.field_name = 'route_short_name' - AND (ad.route_id = route_s_t.record_id OR ad.route_short_name = route_s_t.field_value) -) -LEFT JOIN translations route_l_t ON ( - route_l_t.table_name = 'routes' AND route_l_t.field_name = 'route_long_name' - AND (ad.route_id = route_l_t.record_id OR ad.route_long_name = route_l_t.field_value) -) -LEFT JOIN translations trip_t ON ( - trip_t.table_name = 'trips' AND trip_t.field_name = 'trip_headsign' - AND (ad.trip_id = trip_t.record_id OR ad.trip_headsign = trip_t.field_value) -) -LEFT JOIN translations stop_t ON ( - stop_t.table_name = 'stops' AND stop_t.field_name = 'stop_name' - AND (ad.stop_id = stop_t.record_id OR ad.stop_name = stop_t.field_value) -) -LEFT JOIN translations station_t ON ( - station_t.table_name = 'stops' AND station_t.field_name = 'stop_name' - AND station_t.language = stop_t.language - AND (ad.station_id = station_t.record_id OR ad.station_name = station_t.field_value) -) -LEFT JOIN translations stop_times_t ON ( - stop_times_t.table_name = 'stop_times' AND stop_times_t.field_name = 'stop_headsign' - AND ( - (ad.trip_id = stop_times_t.record_id AND ad.stop_sequence = stop_times_t.record_sub_id::integer) - OR ad.stop_headsign = stop_times_t.field_value + -- todo: fall back to feed_info.feed_lang? + "trans_foo".language AS "foo_lang", + "trans_b-a-r".language AS "b-a-r_lang", + data.* + REPLACE ( + coalesce("trans_foo".translation, "foo") AS "foo", + coalesce("trans_b-a-r".translation, "b-a-r") AS "b-a-r" ) +FROM calendar_dates data +LEFT JOIN _translations_calendar_dates "trans_foo" ON ( + "trans_foo".field_name = 'foo' + AND data."service_id" = "trans_foo".record_id + AND data."date" = "trans_foo".record_sub_id +) +LEFT JOIN _translations_calendar_dates "trans_b-a-r" ON ( + "trans_b-a-r".field_name = 'b-a-r' + AND data."service_id" = "trans_b-a-r".record_id + AND data."date" = "trans_b-a-r".record_sub_id ); - -CREATE OR REPLACE VIEW connections_translated AS +`, + '_translatedSql with calendar_dates.txt', + ) +} +{ + const feedInfoRef = supportedTranslationRefs.get('feed_info') + strictEqual( + _translatedSql('feed_info', new Map([ + ['foo', { + ...feedInfoRef, + src_table_name: 'some-other-table', + }], + ['b-a-r', feedInfoRef], + ])), + `\ +CREATE VIEW feed_info_translated AS SELECT - -- almost all columns, duh - -- todo: find a way to use all columns without explicitly enumerating them here - route_id, - coalesce(route_s_t.translation, route_short_name) as route_short_name, - route_s_t.language as route_short_name_lang, -- todo: fall back to feed_info.feed_lang? - coalesce(route_l_t.translation, route_long_name) as route_long_name, - route_l_t.language as route_long_name_lang, -- todo: fall back to feed_info.feed_lang? - route_type, - trip_id, - service_id, - direction_id, - coalesce(trip_t.translation, trip_headsign) as trip_headsign, - trip_t.language as trip_headsign_lang, -- todo: fall back to feed_info.feed_lang? - - from_stop_id, - coalesce(from_stop.translation, from_stop_name) as from_stop_name, - from_stop.language as from_stop_name_lang, -- todo: fall back to feed_info.feed_lang? - from_station_id, - coalesce(from_station.translation, from_station_name) as from_station_name, - from_station.language as from_station_name_lang, -- todo: fall back to feed_info.feed_lang? 
- - coalesce(from_stop_times_t.translation, from_stop_headsign) as from_stop_headsign, - from_stop_times_t.language as from_stop_headsign_lang, -- todo: fall back to feed_info.feed_lang? - from_pickup_type, - t_departure, - departure_time, -- todo [breaking]: this is misleading, remove it - from_stop_sequence, - from_timepoint, - - "date", - - to_timepoint, - to_stop_sequence, - t_arrival, - arrival_time, -- todo [breaking]: this is misleading, remove it - to_drop_off_type, - coalesce(to_stop_times_t.translation, to_stop_headsign) as to_stop_headsign, - to_stop_times_t.language as to_stop_headsign_lang, -- todo: fall back to feed_info.feed_lang? - - to_stop_id, - coalesce(to_stop.translation, to_stop_name) as to_stop_name, - to_stop.language as to_stop_name_lang, -- todo: fall back to feed_info.feed_lang? - to_station_id, - coalesce(to_station.translation, to_station_name) as to_station_name, - to_station.language as to_station_name_lang -- todo: fall back to feed_info.feed_lang? -FROM connections c -LEFT JOIN translations route_s_t ON ( - route_s_t.table_name = 'routes' AND route_s_t.field_name = 'route_short_name' - AND (c.route_id = route_s_t.record_id OR c.route_short_name = route_s_t.field_value) -) -LEFT JOIN translations route_l_t ON ( - route_l_t.table_name = 'routes' AND route_l_t.field_name = 'route_long_name' - AND (c.route_id = route_l_t.record_id OR c.route_long_name = route_l_t.field_value) -) -LEFT JOIN translations trip_t ON ( - trip_t.table_name = 'trips' AND trip_t.field_name = 'trip_headsign' - AND (c.trip_id = trip_t.record_id OR c.trip_headsign = trip_t.field_value) -) -LEFT JOIN translations from_stop ON ( - from_stop.table_name = 'stops' AND from_stop.field_name = 'stop_name' - AND (c.from_stop_id = from_stop.record_id OR c.from_stop_name = from_stop.field_value) -) -LEFT JOIN translations from_station ON ( - from_station.table_name = 'stops' AND from_station.field_name = 'stop_name' - AND from_station.language = from_stop.language - AND (c.from_station_id = from_station.record_id OR c.from_station_name = from_station.field_value) -) -LEFT JOIN translations to_stop ON ( - to_stop.table_name = 'stops' AND to_stop.field_name = 'stop_name' - AND to_stop.language = from_stop.language - AND (c.to_stop_id = to_stop.record_id OR c.to_stop_name = to_stop.field_value) -) -LEFT JOIN translations to_station ON ( - to_station.table_name = 'stops' AND to_station.field_name = 'stop_name' - AND to_station.language = from_stop.language - AND (c.to_station_id = to_station.record_id OR c.to_station_name = to_station.field_value) -) -LEFT JOIN translations from_stop_times_t ON ( - from_stop_times_t.table_name = 'stop_times' AND from_stop_times_t.field_name = 'stop_headsign' - AND ( - (c.trip_id = from_stop_times_t.record_id AND c.from_stop_sequence = from_stop_times_t.record_sub_id::integer) - OR c.from_stop_headsign = from_stop_times_t.field_value + -- todo: fall back to feed_info.feed_lang? 
+ "trans_foo".language AS "foo_lang", + "trans_b-a-r".language AS "b-a-r_lang", + data.* + REPLACE ( + coalesce("trans_foo".translation, "foo") AS "foo", + coalesce("trans_b-a-r".translation, "b-a-r") AS "b-a-r" ) +FROM feed_info data +LEFT JOIN _translations_some-other-table "trans_foo" ON ( + "trans_foo".field_name = 'foo' + + ) -LEFT JOIN translations to_stop_times_t ON ( - to_stop_times_t.table_name = 'stop_times' AND to_stop_times_t.field_name = 'stop_headsign' - AND ( - (c.trip_id = to_stop_times_t.record_id AND c.to_stop_sequence = to_stop_times_t.record_sub_id::integer) - OR c.to_stop_headsign = to_stop_times_t.field_value - ) +LEFT JOIN _translations_feed_info "trans_b-a-r" ON ( + "trans_b-a-r".field_name = 'b-a-r' + + ); -` +`, + '_translatedSql with feed_info.txt', + ) +} + +// https://gtfs.org/documentation/schedule/reference/#translationstxt +const importData = async (db, pathToTranslations, opt, workingState) => { + const translationRefs = new Map( + supportedTranslationRefs.entries() + // If there is no such file/table, don't allow translations for it. + .filter(([table_name]) => opt.files.includes(table_name)) + ) + + const selectsToBeMerged = [] + for (const [table_name, translationRef] of translationRefs.entries()) { + await db[RUN](_srcTablesSql(pathToTranslations, table_name, translationRef)) + selectsToBeMerged.push(_selectToBeMergedSql(table_name, translationRef)) + } -module.exports = { - beforeAll, - formatRow: formatTranslationsRow, - afterAll, + await db[RUN](`\ +-- The MobilityData GTFS Validator just uses Java's Locale#toLanguageTag() to validate "language". +-- https://github.com/MobilityData/gtfs-validator/blob/a11b7489902dd54dc194af1f1515583406ba3716/main/src/main/java/org/mobilitydata/gtfsvalidator/table/GtfsTranslationSchema.java#L36 +-- https://docs.oracle.com/javase/7/docs/api/java/util/Locale.html +-- related: https://github.com/google/transit/pull/98 + +-- We mimick a true table with a view that UNIONs all individual _translations_* tables. 
+CREATE VIEW translations AS +${selectsToBeMerged.map(sql => `(${sql})`).join(`UNION ALL BY NAME`)}; +`) + + const agencyRef = supportedTranslationRefs.get('agency') + const stopsRef = supportedTranslationRefs.get('stops') + const routesRef = supportedTranslationRefs.get('routes') + const tripsRef = supportedTranslationRefs.get('trips') + const stopTimesRef = supportedTranslationRefs.get('stop_times') + const pathwaysRef = supportedTranslationRefs.get('pathways') + const levelsRef = supportedTranslationRefs.get('levels') + const feedInfoRef = supportedTranslationRefs.get('feed_info') + const preTranslatedColumns = new Map([ + ['agency', new Map([ + ['agency_name', agencyRef], + ['agency_url', agencyRef], + ['agency_phone', agencyRef], + ['agency_fare_url', agencyRef], + ['agency_email', agencyRef], + ])], + ['stops', new Map([ + ['stop_code', stopsRef], + ['stop_name', stopsRef], + // todo: not supported yet by stops.js + // ['tts_stop_name', stopsRef], + ['stop_desc', stopsRef], + ['stop_url', stopsRef], + ['platform_code', stopsRef], + ])], + ['routes', new Map([ + ['route_short_name', routesRef], + ['route_long_name', routesRef], + ['route_desc', routesRef], + ['route_url', routesRef], + ])], + ['trips', new Map([ + ['trip_headsign', tripsRef], + ['trip_short_name', tripsRef], + // todo: not supported yet by trips.js + // ['trip_desc', tripsRef], + // ['trip_url', tripsRef], + ])], + ['stop_times', new Map([ + ['stop_headsign', stopTimesRef], + ])], + // todo: fare_attributes.txt & fare_rules.txt are not supported yet + // todo: frequencies.txt (see above) + // todo: areas.txt is not supported yet + // todo: networks.txt is not supported yet + ['pathways', new Map([ + ['signposted_as', pathwaysRef], + ['reversed_signposted_as', pathwaysRef], + ])], + ['levels', new Map([ + ['level_name', levelsRef], + ])], + // todo: location_groups.txt is not supported yet + // todo: booking_rules.txt is not supported yet + ['feed_info', new Map([ + ['feed_publisher_name', feedInfoRef], + ['feed_publisher_url', feedInfoRef], + ['feed_version', feedInfoRef], + ['feed_contact_email', feedInfoRef], + ['feed_contact_url', feedInfoRef], + ])], + // todo: attribution.txt is not supported yet + + ]) + for (const [table_name, translatedCols] of preTranslatedColumns) { + if (!opt.files.includes(table_name)) { + // If there is no such file/table, don't allow translations for it. + continue + } + + await db[RUN](_translatedSql(table_name, translatedCols)) + } + + // *_translated for tables/views made up by gtfs-via-duckdb + { + await db[RUN](_translatedSql('arrivals_departures', new Map([ + ['route_short_name', routesRef], + ['route_long_name', routesRef], + ['trip_headsign', tripsRef], + ['stop_headsign', stopsRef], + ['stop_name', stopsRef], + // todo: ['station_name', stopsRef], + ]))) + } + // todo: connections + +// `\ +// -- CREATE OR REPLACE VIEW arrivals_departures_translated AS +// -- SELECT +// -- -- almost all columns, duh +// -- -- todo: find a way to use all columns without explicitly enumerating them here +// -- route_id, +// -- coalesce(route_s_t.translation, route_short_name) as route_short_name, +// -- route_s_t.language as route_short_name_lang, -- todo: fall back to feed_info.feed_lang? +// -- coalesce(route_l_t.translation, route_long_name) as route_long_name, +// -- route_l_t.language as route_long_name_lang, -- todo: fall back to feed_info.feed_lang? 
+// -- route_type, +// -- trip_id, direction_id, +// -- coalesce(trip_t.translation, trip_headsign) as trip_headsign, +// -- trip_t.language as trip_headsign_lang, -- todo: fall back to feed_info.feed_lang? +// -- service_id, +// -- shape_id, +// -- "date", +// -- stop_sequence, +// -- coalesce(stop_times_t.translation, stop_headsign) as stop_headsign, +// -- stop_times_t.language as stop_headsign_lang, -- todo: fall back to feed_info.feed_lang? +// -- pickup_type, drop_off_type, shape_dist_traveled, timepoint, +// -- tz, +// -- arrival_time, t_arrival, +// -- departure_time, t_departure, +// -- stop_id, +// -- coalesce(stop_t.translation, stop_name) as stop_name, +// -- stop_t.language as stop_name_lang, -- todo: fall back to feed_info.feed_lang? +// -- station_id, +// -- coalesce(station_t.translation, station_name) as station_name, +// -- station_t.language as station_name_lang -- todo: fall back to feed_info.feed_lang? +// -- FROM arrivals_departures ad +// -- LEFT JOIN translations route_s_t ON ( +// -- route_s_t.table_name = 'routes' AND route_s_t.field_name = 'route_short_name' +// -- AND (ad.route_id = route_s_t.record_id OR ad.route_short_name = route_s_t.field_value) +// -- ) +// -- LEFT JOIN translations route_l_t ON ( +// -- route_l_t.table_name = 'routes' AND route_l_t.field_name = 'route_long_name' +// -- AND (ad.route_id = route_l_t.record_id OR ad.route_long_name = route_l_t.field_value) +// -- ) +// -- LEFT JOIN translations trip_t ON ( +// -- trip_t.table_name = 'trips' AND trip_t.field_name = 'trip_headsign' +// -- AND (ad.trip_id = trip_t.record_id OR ad.trip_headsign = trip_t.field_value) +// -- ) +// -- LEFT JOIN translations stop_t ON ( +// -- stop_t.table_name = 'stops' AND stop_t.field_name = 'stop_name' +// -- AND (ad.stop_id = stop_t.record_id OR ad.stop_name = stop_t.field_value) +// -- ) +// -- LEFT JOIN translations station_t ON ( +// -- station_t.table_name = 'stops' AND station_t.field_name = 'stop_name' +// -- AND station_t.language = stop_t.language +// -- AND (ad.station_id = station_t.record_id OR ad.station_name = station_t.field_value) +// -- ) +// -- LEFT JOIN translations stop_times_t ON ( +// -- stop_times_t.table_name = 'stop_times' AND stop_times_t.field_name = 'stop_headsign' +// -- AND ( +// -- (ad.trip_id = stop_times_t.record_id AND ad.stop_sequence = stop_times_t.record_sub_id::integer) +// -- OR ad.stop_headsign = stop_times_t.field_value +// -- ) +// -- ); +// -- +// -- CREATE OR REPLACE VIEW connections_translated AS +// -- SELECT +// -- -- almost all columns, duh +// -- -- todo: find a way to use all columns without explicitly enumerating them here +// -- route_id, +// -- coalesce(route_s_t.translation, route_short_name) as route_short_name, +// -- route_s_t.language as route_short_name_lang, -- todo: fall back to feed_info.feed_lang? +// -- coalesce(route_l_t.translation, route_long_name) as route_long_name, +// -- route_l_t.language as route_long_name_lang, -- todo: fall back to feed_info.feed_lang? +// -- route_type, +// -- trip_id, +// -- service_id, +// -- direction_id, +// -- coalesce(trip_t.translation, trip_headsign) as trip_headsign, +// -- trip_t.language as trip_headsign_lang, -- todo: fall back to feed_info.feed_lang? +// -- +// -- from_stop_id, +// -- coalesce(from_stop.translation, from_stop_name) as from_stop_name, +// -- from_stop.language as from_stop_name_lang, -- todo: fall back to feed_info.feed_lang? 
+// -- from_station_id, +// -- coalesce(from_station.translation, from_station_name) as from_station_name, +// -- from_station.language as from_station_name_lang, -- todo: fall back to feed_info.feed_lang? +// -- +// -- coalesce(from_stop_times_t.translation, from_stop_headsign) as from_stop_headsign, +// -- from_stop_times_t.language as from_stop_headsign_lang, -- todo: fall back to feed_info.feed_lang? +// -- from_pickup_type, +// -- t_departure, +// -- departure_time, -- todo [breaking]: this is misleading, remove it +// -- from_stop_sequence, +// -- from_timepoint, +// -- +// -- "date", +// -- +// -- to_timepoint, +// -- to_stop_sequence, +// -- t_arrival, +// -- arrival_time, -- todo [breaking]: this is misleading, remove it +// -- to_drop_off_type, +// -- coalesce(to_stop_times_t.translation, to_stop_headsign) as to_stop_headsign, +// -- to_stop_times_t.language as to_stop_headsign_lang, -- todo: fall back to feed_info.feed_lang? +// -- +// -- to_stop_id, +// -- coalesce(to_stop.translation, to_stop_name) as to_stop_name, +// -- to_stop.language as to_stop_name_lang, -- todo: fall back to feed_info.feed_lang? +// -- to_station_id, +// -- coalesce(to_station.translation, to_station_name) as to_station_name, +// -- to_station.language as to_station_name_lang -- todo: fall back to feed_info.feed_lang? +// -- FROM connections c +// -- LEFT JOIN translations route_s_t ON ( +// -- route_s_t.table_name = 'routes' AND route_s_t.field_name = 'route_short_name' +// -- AND (c.route_id = route_s_t.record_id OR c.route_short_name = route_s_t.field_value) +// -- ) +// -- LEFT JOIN translations route_l_t ON ( +// -- route_l_t.table_name = 'routes' AND route_l_t.field_name = 'route_long_name' +// -- AND (c.route_id = route_l_t.record_id OR c.route_long_name = route_l_t.field_value) +// -- ) +// -- LEFT JOIN translations trip_t ON ( +// -- trip_t.table_name = 'trips' AND trip_t.field_name = 'trip_headsign' +// -- AND (c.trip_id = trip_t.record_id OR c.trip_headsign = trip_t.field_value) +// -- ) +// -- LEFT JOIN translations from_stop ON ( +// -- from_stop.table_name = 'stops' AND from_stop.field_name = 'stop_name' +// -- AND (c.from_stop_id = from_stop.record_id OR c.from_stop_name = from_stop.field_value) +// -- ) +// -- LEFT JOIN translations from_station ON ( +// -- from_station.table_name = 'stops' AND from_station.field_name = 'stop_name' +// -- AND from_station.language = from_stop.language +// -- AND (c.from_station_id = from_station.record_id OR c.from_station_name = from_station.field_value) +// -- ) +// -- LEFT JOIN translations to_stop ON ( +// -- to_stop.table_name = 'stops' AND to_stop.field_name = 'stop_name' +// -- AND to_stop.language = from_stop.language +// -- AND (c.to_stop_id = to_stop.record_id OR c.to_stop_name = to_stop.field_value) +// -- ) +// -- LEFT JOIN translations to_station ON ( +// -- to_station.table_name = 'stops' AND to_station.field_name = 'stop_name' +// -- AND to_station.language = from_stop.language +// -- AND (c.to_station_id = to_station.record_id OR c.to_station_name = to_station.field_value) +// -- ) +// -- LEFT JOIN translations from_stop_times_t ON ( +// -- from_stop_times_t.table_name = 'stop_times' AND from_stop_times_t.field_name = 'stop_headsign' +// -- AND ( +// -- (c.trip_id = from_stop_times_t.record_id AND c.from_stop_sequence = from_stop_times_t.record_sub_id::integer) +// -- OR c.from_stop_headsign = from_stop_times_t.field_value +// -- ) +// -- ) +// -- LEFT JOIN translations to_stop_times_t ON ( +// -- to_stop_times_t.table_name = 'stop_times' 
AND to_stop_times_t.field_name = 'stop_headsign' +// -- AND ( +// -- (c.trip_id = to_stop_times_t.record_id AND c.to_stop_sequence = to_stop_times_t.record_sub_id::integer) +// -- OR c.to_stop_headsign = to_stop_times_t.field_value +// -- ) +// -- ); +// `; + + workingState.nrOfRowsByName.set('translations', await queryNumberOfRows(db, 'translations', opt)) } + +module.exports = importData diff --git a/lib/trips.js b/lib/trips.js index ac72d2a..ee58fbd 100644 --- a/lib/trips.js +++ b/lib/trips.js @@ -1,91 +1,90 @@ 'use strict' +const RUN = require('./run.js') +const {queryIfColumnsExist} = require('./columns.js') +const {queryNumberOfRows} = require('./rows-count.js') + // https://gtfs.org/documentation/schedule/reference/#tripstxt -const beforeAll = (opt) => `\ +const importData = async (db, pathToTrips, opt, workingState) => { + // Several columns are optional, so they may be missing in a `read_csv()` result. + // It seems like, as of DuckDB v1.0.0, there is no way to assign default values to missing columns, neither with read_csv() nor with a nested subquery. + // todo: github ticket? + // This is why we check the file first and then programmatically determine the set of SELECT-ed columns below. + const { + wheelchair_accessible: has_wheelchair_accessible, + bikes_allowed: has_bikes_allowed, + } = await queryIfColumnsExist(db, pathToTrips, [ + 'wheelchair_accessible', + 'bikes_allowed', + ]) + + await db[RUN](`\ CREATE TYPE wheelchair_accessibility AS ENUM ( 'unknown' -- 0 or empty - No accessibility information for the trip. , 'accessible' -- 1 – Vehicle being used on this particular trip can accommodate at least one rider in a wheelchair. , 'not_accessible' -- 2 – No riders in wheelchairs can be accommodated on this trip. ); -CREATE CAST (wheelchair_accessibility AS text) WITH INOUT AS IMPLICIT; +-- CREATE CAST (wheelchair_accessibility AS text) WITH INOUT AS IMPLICIT; CREATE TYPE bikes_allowance AS ENUM ( 'unknown' -- 0 or empty - No bike information for the trip. , 'allowed' -- 1 – Vehicle being used on this particular trip can accommodate at least one bicycle. , 'not_allowed' -- 2 – No bicycles are allowed on this trip. ); -CREATE CAST (bikes_allowance AS text) WITH INOUT AS IMPLICIT; +-- CREATE CAST (bikes_allowance AS text) WITH INOUT AS IMPLICIT; CREATE TABLE trips ( trip_id TEXT PRIMARY KEY, route_id TEXT NOT NULL, FOREIGN KEY (route_id) REFERENCES routes, + -- todo: add foreign key constraint? service_id TEXT NOT NULL, -- references service_days.service_id trip_headsign TEXT, trip_short_name TEXT, direction_id INT, block_id TEXT, - shape_id TEXT, -- todo: add NOT NULL? - ${opt.tripsWithoutShapeId ? '' : `CONSTRAINT valid_shape_id CHECK (shape_exists(shape_id)),`} - -- todo [breaking]: use 0/unknown for empty values + shape_id TEXT, + ${opt.tripsWithoutShapeId ? 
'' : `FOREIGN KEY (shape_id) REFERENCES shapes,`} wheelchair_accessible wheelchair_accessibility, -- todo [breaking]: use 0/unknown for empty values bikes_allowed bikes_allowance ); -COPY trips ( - trip_id, - route_id, - service_id, - trip_headsign, - trip_short_name, - direction_id, - block_id, - shape_id, - wheelchair_accessible, - bikes_allowed -) FROM STDIN csv; -` - -const wheelchairAccessibility = (val) => { - if (val === '0') return 'unknown' - if (val === '1') return 'accessible' - if (val === '2') return 'not_accessible' - throw new Error('invalid wheelchair_accessibility: ' + val) -} - -const bikesAllowance = (val) => { - if (val === '0') return 'unknown' - if (val === '1') return 'allowed' - if (val === '2') return 'not_allowed' - throw new Error('invalid bikes_allowance: ' + val) -} +INSERT INTO trips +-- Matching by name allows the CSV file to have a different set and order of columns. +-- todo: handle the CSV file having *additional* columns +BY NAME +SELECT + ${has_wheelchair_accessible ? `` : `NULL AS wheelchair_accessible,`} + ${has_bikes_allowed ? `` : `NULL AS bikes_allowed,`} + * + REPLACE ( + -- dummy entry in case no optional column is present + trip_id AS trip_id, + ${has_wheelchair_accessible ? ` + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. + enum_range(NULL::wheelchair_accessibility)[wheelchair_accessible + 1] AS wheelchair_accessible, + ` : ``} + ${has_bikes_allowed ? ` + -- Casting an integer to an enum (using the index) is currently not possible, so we have to compute the availability index by hand using enum_range(). + -- DuckDB array/list indixes are 1-based. + enum_range(NULL::bikes_allowance)[bikes_allowed + 1] AS bikes_allowed + ` : ``} + ) +FROM read_csv( + '${pathToTrips}', + header = true, + all_varchar = true, + types = { + direction_id: 'INTEGER', + ${has_wheelchair_accessible ? `wheelchair_accessible: 'INTEGER',` : ``} + ${has_bikes_allowed ? `bikes_allowed: 'INTEGER',` : ``} + } +); +`) -const formatTripsRow = (t) => { - return [ - t.trip_id || null, - t.route_id || null, - t.service_id || null, - t.trip_headsign || null, - t.trip_short_name || null, - t.direction_id ? parseInt(t.direction_id) : null, - t.block_id || null, - t.shape_id || null, - t.wheelchair_accessible - ? wheelchairAccessibility(t.wheelchair_accessible) - : null, - t.bikes_allowed ? bikesAllowance(t.bikes_allowed) : null, - ] + workingState.nrOfRowsByName.set('trips', await queryNumberOfRows(db, 'trips', opt)) } -const afterAll = (opt) => `\ -\\. - -CREATE INDEX ON trips (route_id); -` - -module.exports = { - beforeAll, - formatRow: formatTripsRow, - afterAll, -} +module.exports = importData diff --git a/lib/util.js b/lib/util.js deleted file mode 100644 index affe20b..0000000 --- a/lib/util.js +++ /dev/null @@ -1,12 +0,0 @@ -'use strict' - -const parseTime = require('gtfs-utils/parse-time') - -const formatTime = (gtfsTime) => { - const {hours: h, minutes: m, seconds: s} = parseTime(gtfsTime) - return `${h} hours ${m} minutes ${s === null ? 
0 : s} seconds` -} - -module.exports = { - formatTime, -} diff --git a/package.json b/package.json index 34120df..7f4f480 100644 --- a/package.json +++ b/package.json @@ -1,10 +1,11 @@ { + "private": true, "name": "gtfs-via-duckdb", "description": "Analyze GTFS datasets using DuckDB.", "version": "5.0.0", "main": "lib/index.js", "bin": { - "gtfs-to-sql": "cli.js" + "gtfs-to-duckdb": "cli.js" }, "files": [ "cli.js", @@ -57,7 +58,7 @@ "node": ">=22" }, "dependencies": { - "csv-stringify": "^6.2.0", + "@duckdb/node-api": "^1.2.2-alpha.18", "debug": "^4.3.3", "gtfs-utils": "^5.1.0", "sequencify": "0.0.7" @@ -66,13 +67,14 @@ "@yao-pkg/pkg": "^6.6.0", "csv-parser": "^3.0.0", "eslint": "^8.33.0", - "sample-gtfs-feed": "^0.13.0" + "sample-gtfs-feed": "^0.13.0", + "tinybench": "^4.0.1" }, "scripts": { "test": "./test/index.sh", "lint": "eslint .", "benchmark": "./benchmark/run.sh", - "build-binaries": "pkg --public -t node22-macos-x64,node22-macos-arm64,node22-linux-x64,node22-linux-arm64 -o dist/gtfs-via-postgres cli.js && gzip -k --best dist/gtfs-via-postgres-*", + "build-binaries": "pkg --public -t node22-macos-x64,node22-macos-arm64,node22-linux-x64,node22-linux-arm64 -o dist/gtfs-via-duckdb cli.js && gzip -k --best dist/gtfs-via-duckdb-*", "prepublishOnly": "npm run lint && npm test" } } diff --git a/readme.md b/readme.md index 34b0c28..36a5942 100644 --- a/readme.md +++ b/readme.md @@ -11,8 +11,8 @@ - ✅ handles [daylight saving time correctly](#correctness-vs-speed-regarding-gtfs-time-values) but retains reasonable lookup performance - ✅ supports `frequencies.txt` -- ✨ joins `stop_times.txt`/`frequencies.txt`, `calendar.txt`/`calendar_dates.txt`, `trips.txt`, `route.txt` & `stops.txt` into [views](https://www.postgresql.org/docs/14/sql-createview.html) for straightforward data analysis (see below) -- 🚀 is carefully optimised to let PostgreSQL's query planner do its magic, yielding quick lookups even with large datasets (see [performance section](#performance)) +- ✨ joins `stop_times.txt`/`frequencies.txt`, `calendar.txt`/`calendar_dates.txt`, `trips.txt`, `route.txt` & `stops.txt` into [views](https://duckdb.org/docs/stable/sql/statements/create_view) for straightforward data analysis (see below) +- 🚀 is carefully optimised to let DuckDB's query planner do its magic, yielding quick lookups even with large datasets (see [performance section](#performance)) - ✅ validates and imports `translations.txt` To work with the time-related data (`stop_times` etc.), `gtfs-via-duckdb` supports two "mental models": @@ -31,11 +31,19 @@ Or use [`npx`](https://npmjs.com/package/npx). ✨ There are also [prebuilt binaries](https://github.com/public-transport/gtfs-via-duckdb/releases/latest) and [Docker images](https://github.com/public-transport/gtfs-via-duckdb/pkgs/container/gtfs-via-duckdb) available. -*Note:* `gtfs-via-duckdb` **needs PostgreSQL >=14** to work, as it uses the [`WITH … AS NOT MATERIALIZED`](https://www.postgresql.org/docs/14/queries-with.html#id-1.5.6.12.7) syntax. You can check your PostgreSQL server's version with `psql -t -c 'SELECT version()'`. +> [!NOTE] +> `gtfs-via-duckdb` **needs DuckDB >=1.2** and its [`icu`](https://duckdb.org/docs/stable/extensions/icu) and [`spatial`](https://duckdb.org/docs/stable/extensions/spatial/overview) extensions to work. ## Getting Started +Install the DuckDB [`icu`](https://duckdb.org/docs/stable/extensions/icu) and [`spatial`](https://duckdb.org/docs/stable/extensions/spatial/overview) extensions. 
+ +```shell +duckdb_cli -c 'INSTALL icu' +duckdb_cli -c 'INSTALL spatial' +``` + If you have a `.zip` GTFS feed, unzip it into individual files. We're going to use the [2022-07-01 *VBB* feed](https://vbb-gtfs.jannisr.de/2022-07-01/) as an example, which consists of individual files already. @@ -63,20 +71,12 @@ ls -lh gtfs # 16M trips.csv ``` -Depending on your specific setup, configure access to the PostgreSQL database via [`PG*` environment variables](https://www.postgresql.org/docs/14/libpq-envars.html): - -```sh -export PGUSER=postgres -export PGPASSWORD=password -env PGDATABASE=postgres psql -c 'create database vbb_2022_02_25' -export PGDATABASE=vbb_2022_02_25 -``` - Install `gtfs-via-duckdb` and use it to import the GTFS data: ```sh npm install -D gtfs-via-duckdb -npm exec -- gtfs-to-sql --require-dependencies -- gtfs/*.csv | sponge | psql -b +npm exec -- gtfs-to-duckdb --require-dependencies -- gtfs.duckdb gtfs/*.csv +# todo # agency # calendar # CREATE EXTENSION @@ -89,14 +89,14 @@ npm exec -- gtfs-to-sql --require-dependencies -- gtfs/*.csv | sponge | psql -b # COMMIT ``` -Importing will take 10s to 10m, depending on the size of the feed. On an [M1 MacBook Air](https://en.wikipedia.org/wiki/MacBook_Air_(Apple_silicon)#Third_generation_(Retina_with_Apple_silicon)), importing the above feed takes about 4m; Importing the [260kb 2021-10-06 Amtrak feed](https://transitfeeds.com/p/amtrak/1136/20211006) takes 6s. +Importing will take a few seconds to a few minutes, depending on the size of the feed. On an [M2](https://en.wikipedia.org/wiki/Apple_M2) laptop, importing the above feed takes about 30s. In addition to a table for each GTFS file, `gtfs-via-duckdb` adds these views to help with real-world analysis: -- `service_days` ([materialized](https://www.postgresql.org/docs/14/sql-creatematerializedview.html)) "applies" [`calendar_dates`](https://gtfs.org/documentation/schedule/reference/#calendar_datestxt) to [`calendar`](https://gtfs.org/documentation/schedule/reference/#calendartxt) to give you all days of operation for each "service" defined in [`calendar`](https://gtfs.org/documentation/schedule/reference/#calendartxt). +- `service_days` (table) "applies" [`calendar_dates`](https://gtfs.org/documentation/schedule/reference/#calendar_datestxt) to [`calendar`](https://gtfs.org/documentation/schedule/reference/#calendartxt) to give you all days of operation for each "service" defined in [`calendar`](https://gtfs.org/documentation/schedule/reference/#calendartxt). - `arrivals_departures` "applies" [`stop_times`](https://gtfs.org/documentation/schedule/reference/#stop_timestxt)/[`frequencies`](https://gtfs.org/documentation/schedule/reference/#frequenciestxt) to [`trips`](https://gtfs.org/documentation/schedule/reference/#tripstxt) and `service_days` to give you all arrivals/departures at each stop with their *absolute* dates & times. It also resolves each stop's parent station ID & name. - `connections` "applies" [`stop_times`](https://gtfs.org/documentation/schedule/reference/#stop_timestxt)/[`frequencies`](https://gtfs.org/documentation/schedule/reference/#frequenciestxt) to [`trips`](https://gtfs.org/documentation/schedule/reference/#tripstxt) and `service_days`, just like `arrivals_departures`, but gives you departure (at stop A) & arrival (at stop B) *pairs*. 
-- `shapes_aggregated` aggregates individual shape points in [`shapes`](https://gtfs.org/documentation/schedule/reference/#shapestxt) into a [PostGIS `LineString`](http://postgis.net/workshops/postgis-intro/geometries.html#linestrings). +- `shapes_aggregated` aggregates individual shape points in [`shapes`](https://gtfs.org/documentation/schedule/reference/#shapestxt) into a [`LineString`](https://duckdb.org/docs/stable/extensions/spatial/overview#the-geometry-type). - `stats_by_route_date` provides the number of arrivals/departures by route ID and date. – [read more](docs/analysis/feed-by-route-date.md) - `stats_by_agency_route_stop_hour` provides the number of arrivals/departures by agency ID, route ID, stop ID & hour. – [read more](docs/analysis/feed-by-agency-route-stop-and-hour.md) - In contrast to `stats_by_route_date` & `stats_by_agency_route_stop_hour`, `stats_active_trips_by_hour` provides the number of *currently running* trips for each hour in the feeds period of time. @@ -144,7 +144,7 @@ AND (stop_url_lang = 'de-CH' OR stop_url_lang IS NULL) ``` Usage: - gtfs-to-sql [options] [--] ... + import-gtfs-into-duckdb [options] [--] ... Options: --silent -s Don't show files being converted. --require-dependencies -d Require files that the specified GTFS files depend @@ -186,23 +186,23 @@ Options: none, view & materialized-view. --import-metadata Create functions returning import metadata: - gtfs_data_imported_at (timestamp with time zone) - - gtfs_via_postgres_version (text) - - gtfs_via_postgres_options (jsonb) + - gtfs_via_duckdb_version (text) + - gtfs_via_duckdb_options (jsonb) +Notes: + If you just want to check if the GTFS data can be imported but don't care about the + resulting DuckDB database file, you can import into an in-memory database by specifying + `:memory:` as the . Examples: - gtfs-to-sql some-gtfs/*.txt | sponge | psql -b # import into PostgreSQL - gtfs-to-sql -u -- some-gtfs/*.txt | gzip >gtfs.sql.gz # generate a gzipped SQL dump + import-gtfs-into-duckdb some-gtfs.duckdb some-gtfs/*.txt [1] https://developers.google.com/transit/gtfs/reference/extended-route-types [2] https://groups.google.com/g/gtfs-changes/c/keT5rTPS7Y0/m/71uMz2l6ke0J ``` -Some notable limitations mentioned in the [PostgreSQL 14 documentation on date/time types](https://www.postgresql.org/docs/14/datatype-datetime.html): - -> For `timestamp with time zone`, the internally stored value is always in UTC (Universal Coordinated Time, traditionally known as Greenwich Mean Time, GMT). An input value that has an explicit time zone specified is converted to UTC using the appropriate offset for that time zone. - -> When a `timestamp with time zone` value is output, it is always converted from UTC to the current `timezone` zone, and displayed as local time in that zone. To see the time in another time zone, either change `timezone` or use the `AT TIME ZONE` construct […]. - -You can run queries with date+time values in any timezone (offset) and they will be processed correctly, but the output will always be in the database timezone (offset), unless you have explicitly used `AT TIME ZONE`. +> [!TIP] +> DuckDB will always store `timestamp with time zone` values as microsends since the [Unix epoch](https://en.wikipedia.org/wiki/Unix_time) (similar to UTC). An input value with an explicit offset specified (e.g. `2022-03-04T05:06:07+08:00`) is converted to the internal representation using the offset. 
+> When the stored value is queried, it is always converted back into the current offset of the timezone specified by the `TimeZone` config. To see the time in another time zone, [change the `TimeZone` config](https://duckdb.org/docs/1.2/sql/data_types/timestamp#settings). +> TLDR: You can run queries with date+time values in any timezone (offset) and they will be processed correctly. ### With Docker @@ -210,51 +210,24 @@ You can run queries with date+time values in any timezone (offset) and they will Instead of installing via `npm`, you can use [the `ghcr.io/public-transport/gtfs-via-duckdb` Docker image](https://github.com/public-transport/gtfs-via-duckdb/pkgs/container/gtfs-via-duckdb): -```shell -# variant A: use Docker image just to convert GTFS to SQL -docker run --rm --volume /path/to/gtfs:/gtfs \ - ghcr.io/public-transport/gtfs-via-duckdb --require-dependencies -- '/gtfs/*.csv' \ - | sponge | psql -b -``` - *Note:* Remember to pass the `/gtfs/*.csv` glob as a string (with `'`), so that it gets evaluated *inside* the Docker container. -With the code above, the `psql -b` process will run *outside* of the Docker container, so your host machine needs access to PostgreSQL. - -If you want to directly import the GTFS data *from within the Docker container*, you need add `psql` to the image and run it from inside. To do that, write a new Dockerfile that extends the `ghcr.io/public-transport/gtfs-via-duckdb` image: - -```Dockerfile -FROM ghcr.io/public-transport/gtfs-via-duckdb -ENV PGPORT=5432 PGUSER=postgres -WORKDIR /gtfs -# pass all arguments into gtfs-via-duckdb, pipe output into psql: -ENTRYPOINT ["/bin/sh", "-c", "gtfs-via-duckdb $0 $@ | sponge | psql -b"] -``` - ```shell -# start PostgreSQL DB in another container "db" -docker run --name db -p 5432:5432 -e POSTGRES_PASSWORD=password postgis/postgis - -# variant B: use Docker image to convert GTFS to SQL and import it directly -docker build -t import-gtfs . # build helper Docker image from Dockerfile docker run --rm --volume /path/to/gtfs:/gtfs \ - --link db -e PGHOST=db -e PGPASSWORD=password \ - import-gtfs --require-dependencies -- '/gtfs/*.csv' + ghcr.io/public-transport/gtfs-via-duckdb --require-dependencies -- '/gtfs/*.csv' ``` ### Exporting data efficiently -If you want to export data from the database, use the [`COPY` command](https://www.postgresql.org/docs/14/sql-copy.html); On an [M1 MacBook Air](https://en.wikipedia.org/wiki/MacBook_Air_(Apple_silicon)#Third_generation_(Retina_with_Apple_silicon)), PostgreSQL 14 can export about 500k `connections` rows per second. +If you want to export data from the database, use the [`COPY` command](https://duckdb.org/docs/stable/sql/statements/copy). ```shell -psql -c 'COPY (SELECT * FROM connections) TO STDOUT csv HEADER' >connections.csv +duckdb -c 'COPY (SELECT * FROM connections) TO STDOUT csv HEADER' my-gtfs.duckdb >my-gtfs-connections.csv ``` -In the nested `SELECT` query, you can use features like `WHERE`, `ORDER BY` and `LIMIT`. Because `psql` passes on the exported data right away, you could stream it into another process. - ### Querying stops by location efficiently -If you want to find stops by (geo)location, run `gtfs-via-duckdb` with `--stops-location-index`. This will create a [spatial index](https://postgis.net/workshops/postgis-intro/indexing.html) on `stops.stop_loc`, so that most [PostGIS functions & operators](https://postgis.net/docs/manual-3.2/reference.html#Measurement_Functions) make use of it. 
+If you want to find stops by (geo)location, run `gtfs-via-duckdb` with `--stops-location-index`. This will create a [spatial index](https://duckdb.org/docs/stable/extensions/spatial/r-tree_indexes) on `stops.stop_loc`, so that most spatial queries can be done efficiently. ### more guides @@ -274,19 +247,19 @@ Let's consider two examples: `gtfs-via-duckdb` always prioritizes correctness over speed. Because it follows the GTFS semantics, when filtering `arrivals_departures` by *absolute* departure date+time, it cannot automatically filter `service_days` (which is `calendar` and `calendar_dates` combined), because **even a date *before* the date of the desired departure time frame might still end up *within*, when combined with a `departure_time` of e.g. `27:30:00`**; Instead, it has to consider all `service_days` and apply the `departure_time` to all of them to check if they're within the range. -However, if you determine your feed's largest `arrival_time`/`departure_time`, you can filter on `date` when querying `arrivals_departures`; This allows PostgreSQL to reduce the number of joins and calendar calculations by orders of magnitude, speeding up your queries significantly. `gtfs-via-postgres` provides two low-level helper functions `largest_arrival_time()` & `largest_departure_time()` for this, as well as two high-level helper functions `dates_filter_min(t_min)` & `dates_filter_max(t_max)` (see below). +However, if you determine your feed's largest `arrival_time`/`departure_time`, you can filter on `date` when querying `arrivals_departures`; This allows DuckDB to reduce the number of joins and calendar calculations by orders of magnitude, speeding up your queries significantly. `gtfs-via-duckdb` provides a low-level helper table `largest_arr_dep_time` for this, as well as two high-level helper functions `dates_filter_min(t_min)` & `dates_filter_max(t_max)` (see below). 
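+
+As a rough sketch of the idea behind these helpers (the actual definitions used by `gtfs-via-duckdb` may differ): `dates_filter_min(t_min)` subtracts the feed's largest `arrival_time`/`departure_time` offset (kept in the `largest_arr_dep_time` helper table) from `t_min` and truncates the result to a date, so that a service day starting *before* the queried time frame but reaching into it (e.g. via a `27:30:00` departure) is still included:
+
+```sql
+-- illustrative only; the column name `largest_departure_time` is an assumption
+SELECT (TIMESTAMPTZ '2022-03-23T12:30:00+01' - largest_departure_time)::date AS min_date
+FROM largest_arr_dep_time;
+```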
-For example, when querying all *absolute* departures at `de:11000:900120003` (*S Ostkreuz Bhf (Berlin)*) between `2022-03-23T12:30:00+01` and `2022-03-23T12:35:00+01` within the [2022-02-25 *VBB* feed](https://vbb-gtfs.jannisr.de/2022-02-25/), filtering by `date` speeds it up nicely (Apple M1, PostgreSQL 14.2): +For example, when querying all *absolute* departures at `de:11000:900120003` (*S Ostkreuz Bhf (Berlin)*) between `2022-03-23T12:30:00+01` and `2022-03-23T12:35:00+01` within the [2022-07-01 *VBB* feed](https://vbb-gtfs.jannisr.de/2022-07-01/), filtering by `date` speeds it up nicely (Apple M2, DuckDB v1.3.0): `station_id` filter | `date` filter | query time | nr of results -|-|-|- -`de:11000:900120003` | *none* | 230ms | ~574k -`de:11000:900120003` | `2022-03-13` >= `date` < `2022-04-08` | 105ms | ~51k -`de:11000:900120003` | `2022-03-23` >= `date` < `2022-03-24` | 55ms | ~2k -`de:11000:900120003` | `2022-03-22` > `date` < `2022-03-24` | 55ms | ~2k -*none* | *none* | 192s | 370m -*none* | `2022-03-13` >= `date` < `2022-04-08` | 34s | ~35m -*none* | `2022-03-22` > `date` < `2022-03-24` | 2.4s | ~1523k +`de:11000:900120003` | *none* | todo | ~todok +`de:11000:900120003` | `2022-03-13` >= `date` < `2022-04-08` | todo | ~todok +`de:11000:900120003` | `2022-03-23` >= `date` < `2022-03-24` | todo | ~todok +`de:11000:900120003` | `2022-03-22` > `date` < `2022-03-24` | todo | ~todok +*none* | *none* | todo | todom +*none* | `2022-03-13` >= `date` < `2022-04-08` | todo | ~todom +*none* | `2022-03-22` > `date` < `2022-03-24` | todo | ~todok Using `dates_filter_min(t_min)` & `dates_filter_max(t_max)`, we can easily filter by `date`. When filtering by `t_departure` (absolute departure date+time), `t_min` is the lower `t_departure` bound, whereas `t_max` is the upper bound. The VBB example above can be queried like this: @@ -305,7 +278,8 @@ AND "date" <= dates_filter_max('2022-03-23T12:35:00+01') -- evaluates to 2023-03 `gtfs-via-duckdb` is fast enough for most use cases I can think of. If there's a particular kind of query that you think should be faster, please [open an Issue](https://github.com/public-transport/gtfs-via-duckdb/issues/new)! -The following benchmarks were run with the [2022-07-01 VBB GTFS dataset](https://vbb-gtfs.jannisr.de/2022-07-01/) (41k `stops`, 6m `stop_times`, 207m arrivals/departures) using `gtfs-via-duckdb@5.0.0` and PostgreSQL 14.7 on an [M2](https://en.wikipedia.org/wiki/Apple_M2) laptop running macOS 12.6.8; All measurements are in milliseconds. +todo: re-run them +The following benchmarks were run with the [2022-07-01 VBB GTFS dataset](https://vbb-gtfs.jannisr.de/2022-07-01/) (41k `stops`, 6m `stop_times`, 207m arrivals/departures) using `gtfs-via-duckdb@5.0.0` and DuckDB v1.3 on an [M2](https://en.wikipedia.org/wiki/Apple_M2) laptop running macOS 14.7.6; All measurements are in milliseconds. 
| query | avg | min | p25 | p50 | p75 | p95 | p99 | max | iterations | | - | - | - | - | - | - | - | - | - | - | diff --git a/test/amtrak-gtfs-2021-10-06.sh b/test/amtrak-gtfs-2021-10-06.sh index 809c8e2..83a3f7f 100755 --- a/test/amtrak-gtfs-2021-10-06.sh +++ b/test/amtrak-gtfs-2021-10-06.sh @@ -11,16 +11,15 @@ env | grep '^PG' || true unzip -q -j -n amtrak-gtfs-2021-10-06.zip -d amtrak-gtfs-2021-10-06 ls -lh amtrak-gtfs-2021-10-06 -psql -c 'create database amtrak_2021_10_06' -export PGDATABASE='amtrak_2021_10_06' +path_to_db="$(mktemp -d -t gtfs)/amtrak-gtfs-2021-10-06.duckdb" ../cli.js -d --trips-without-shape-id \ --import-metadata \ --stats-by-route-date=view \ --stats-by-agency-route-stop-hour=view \ --stats-active-trips-by-hour=view \ - -- amtrak-gtfs-2021-10-06/*.txt \ - | sponge | psql -b + "$path_to_db" \ + -- amtrak-gtfs-2021-10-06/*.txt query=$(cat << EOF select extract(epoch from t_arrival)::integer as t_arrival @@ -32,26 +31,20 @@ EOF ) # 2021-11-26T15:15:00-05:00 -arr1=$(psql --csv -t -c "$query" | head -n 1) +arr1=$(duckdb -csv -noheader -c "$query" "$path_to_db" | head -n 1) if [[ "$arr1" != "1637957700" ]]; then echo "invalid 1st t_arrival: $arr1" 1>&2 exit 1 fi # 2021-11-27T13:45:00-05:00 -arrN=$(psql --csv -t -c "$query" | tail -n 1) +arrN=$(duckdb -csv -noheader -c "$query" "$path_to_db" | tail -n 1) if [[ "$arrN" != "1638038700" ]]; then echo "invalid 2nd t_arrival: $arrN" 1>&2 exit 1 fi -version=$(psql --csv -t -c "SELECT split_part(amtrak.gtfs_via_postgres_version(), '.', 1)" | tail -n 1) -if [[ "$version" != "4" ]]; then - echo "invalid gtfs_via_postgres_version(): $version" 1>&2 - exit 1 -fi - -fMin=$(psql --csv -t -c "SELECT amtrak.dates_filter_min('2021-11-27T13:45:00-06')" | tail -n 1) +fMin=$(duckdb -csv -noheader -c "SELECT dates_filter_min('2021-11-27T13:45:00-06')" "$path_to_db" | tail -n 1) if [[ "$fMin" != "2021-11-24" ]]; then echo "invalid dates_filter_min(…): $fMin" 1>&2 exit 1 @@ -65,7 +58,7 @@ AND date = '2021-11-26' AND is_effective = True EOF ) -acelaStat=$(psql --csv -t -c "$acelaStatQuery" | tail -n 1) +acelaStat=$(duckdb -csv -noheader -c "$acelaStatQuery" "$path_to_db" | tail -n 1) if [[ "$acelaStat" != "16,190" ]]; then echo "invalid stats for route 40751 (Acela) on 2021-11-26: $acelaStat" 1>&2 exit 1 @@ -79,7 +72,7 @@ AND stop_id = 'PHL' -- Philadelphia AND effective_hour = '2022-07-24 09:00:00-05:00' EOF ) -acelaPhillyStat=$(psql --csv -t -c "$acelaPhillyStatQuery" | tail -n 1) +acelaPhillyStat=$(duckdb -csv -noheader -c "$acelaPhillyStatQuery" "$path_to_db" | tail -n 1) if [[ "$acelaPhillyStat" != "2" ]]; then echo "invalid stats for route 40751 (Acela) at PHL (Philadelphia) on 2021-11-26: $acelaPhillyStat" 1>&2 exit 1 @@ -97,7 +90,7 @@ EOF # FROM amtrak.connections # WHERE t_departure >= '2021-11-26 02:00:00-05:00' # AND t_arrival <= '2021-11-26 06:00:00-05:00' -nrOfActiveTrips=$(psql --csv -t -c "$nrOfActiveTripsQuery" | tail -n 1) +nrOfActiveTrips=$(duckdb -csv -noheader -c "$nrOfActiveTripsQuery" "$path_to_db" | tail -n 1) if [[ "$nrOfActiveTrips" != "127" ]]; then echo "unexpected no. 
of active trips at 2021-11-26T04:00-05: $nrOfActiveTrips" 1>&2 exit 1 diff --git a/test/calendar-dates-only.sh b/test/calendar-dates-only.sh index 1014ad1..e11dc4f 100755 --- a/test/calendar-dates-only.sh +++ b/test/calendar-dates-only.sh @@ -8,12 +8,11 @@ set -x env | grep '^PG' || true -psql -c 'create database calendar_dates_only' -export PGDATABASE='calendar_dates_only' +path_to_db="$(mktemp -d -t gtfs)/calendar-dates-only.duckdb" ../cli.js -d --trips-without-shape-id -- \ - calendar-dates-only/*.txt \ - | sponge | psql -b + "$path_to_db" \ + calendar-dates-only/*.txt query=$(cat << EOF select extract(epoch from t_arrival)::integer as t_arrival @@ -24,14 +23,14 @@ EOF ) # 2019-07-15T15:30:00+02:00 -arr1=$(psql --csv -t -c "$query" | head -n 1) +arr1=$(duckdb -csv -noheader -c "$query" "$path_to_db" | head -n 1) if [[ "$arr1" != "1563197400" ]]; then echo "invalid 1st t_arrival: $arr1" 1>&2 exit 1 fi # 2019-07-20T15:30:00+02:00 -arrN=$(psql --csv -t -c "$query" | tail -n 1) +arrN=$(duckdb -csv -noheader -c "$query" "$path_to_db" | tail -n 1) if [[ "$arrN" != "1563629400" ]]; then echo "invalid 2nd t_arrival: $arrN" 1>&2 exit 1 @@ -43,7 +42,7 @@ from arrivals_departures where agency_id IS NULL EOF ) -agency_id_null_count="$(psql --csv -t -c "$agency_id_null")" +agency_id_null_count="$(duckdb -csv -noheader -c "$agency_id_null" "$path_to_db")" if [[ "$agency_id_null_count" != "0" ]]; then echo ">0 rows with agency_id = null" 1>&2 exit 1 @@ -57,7 +56,7 @@ FROM arrivals_departures ORDER BY stop_id, trip_id EOF ) -wheelchair_boarding_rows="$(psql --csv -t -c "$wheelchair_boarding_query")" +wheelchair_boarding_rows="$(duckdb -csv -noheader -c "$wheelchair_boarding_query" "$path_to_db")" wheelchair_boarding_expected="$(echo -e "airport,accessible\nairport-1,not_accessible\nlake,no_info_or_inherit\nmuseum,no_info_or_inherit")" if [[ "$wheelchair_boarding_rows" != "$wheelchair_boarding_expected" ]]; then echo "invalid wheelchair_boarding values" 1>&2 diff --git a/test/index.sh b/test/index.sh index d8b7194..d8512f0 100755 --- a/test/index.sh +++ b/test/index.sh @@ -6,7 +6,7 @@ set -o pipefail cd "$(dirname $0)" set -x -psql -t -c 'SELECT version()' +duckdb --version ./calendar-dates-only.sh ./sample-gtfs-feed.sh @@ -14,5 +14,6 @@ psql -t -c 'SELECT version()' ./routes-without-agency-id.sh ./stops-without-level-id.sh ./invalid-empty-agency-id.sh +./multiple-datasets.sh echo -e "\n\n✔︎ tests passing" diff --git a/test/invalid-empty-agency-id.sh b/test/invalid-empty-agency-id.sh index c4dbba8..9245389 100755 --- a/test/invalid-empty-agency-id.sh +++ b/test/invalid-empty-agency-id.sh @@ -9,8 +9,9 @@ set -x # Refer to https://github.com/public-transport/gtfs-via-postgres/issues/45 for context. # The "core" bug: A feed without routes.agency_id should be importable. -if ../cli.js -d --trips-without-shape-id -s -- \ - invalid-empty-agency-id/*.txt >/dev/null; then +# However, this only applies if there is exactly one route. If there are >1 routes, every route must have an agency_id. +if ../cli.js -d --trips-without-shape-id -s -- ':memory:' \ + invalid-empty-agency-id/*.txt; then echo "import didn't fail" 1>&2 exit 1 else @@ -20,6 +21,6 @@ fi # A related bug: With --routes-without-agency-id, lib/deps.js *does not* specify routes to depend on agency. # *In some cases*, this causes agency to be processed *after* routes, causing the routes processing to fail. 
# see also https://github.com/public-transport/gtfs-via-postgres/issues/45#issuecomment-1632649826 -../cli.js -d --routes-without-agency-id --trips-without-shape-id -s -- \ - invalid-empty-agency-id/*.txt >/dev/null +../cli.js -d --routes-without-agency-id --trips-without-shape-id -s -- ':memory:' \ + invalid-empty-agency-id/*.txt echo 'did not fail even with --routes-without-agency-id ✔' diff --git a/test/multiple-datasets.sh b/test/multiple-datasets.sh new file mode 100755 index 0000000..fd17846 --- /dev/null +++ b/test/multiple-datasets.sh @@ -0,0 +1,138 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail +cd "$(dirname $0)" +set -x + +env | grep '^PG' || true + +unzip -q -j -n amtrak-gtfs-2021-10-06.zip -d amtrak-gtfs-2021-10-06 +ls -lh amtrak-gtfs-2021-10-06 + +db_dir="$(mktemp -d -t gtfs)" +path_to_db1="$db_dir/multiple-schemas-1.duckdb" +path_to_db2="$db_dir/multiple-schemas-2.duckdb" + +shopt -s extglob + +../cli.js -d --trips-without-shape-id \ + "$path_to_db1" \ + -- amtrak-gtfs-2021-10-06/!(transfers).txt + +../cli.js -d --trips-without-shape-id \ + "$path_to_db2" \ + -- amtrak-gtfs-2021-10-06/*.txt + +shopt -u extglob + +query_prefix=$(cat << EOF +ATTACH DATABASE '$path_to_db1' AS one (READ_ONLY); +ATTACH DATABASE '$path_to_db2' AS two (READ_ONLY); +SET search_path = 'one,two'; +EOF +) + +tables_query=$(cat << EOF +$query_prefix +SELECT + (table_catalog || '.' || table_name) AS table_name +FROM information_schema.tables +WHERE table_schema = 'main' +AND table_catalog = ANY(['one', 'two']) +ORDER BY table_catalog, table_name; +EOF +) +tables_rows=$(duckdb -csv -noheader -c "$tables_query") +# note that one.transfers is missing +tables_expected=$(cat << EOF +one.agency +one.arrivals_departures +one.calendar +one.calendar_dates +one.connections +one.feed_info +one.frequencies +one.largest_arr_dep_time +one.routes +one.service_days +one.stop_times +one.stops +one.trips +one.valid_lang_codes +one.valid_timezones +two.agency +two.arrivals_departures +two.calendar +two.calendar_dates +two.connections +two.feed_info +two.frequencies +two.largest_arr_dep_time +two.routes +two.service_days +two.stop_times +two.stops +two.transfers +two.trips +two.valid_lang_codes +two.valid_timezones +EOF +) +if [[ "$tables_rows" != "$tables_expected" ]]; then + echo "unexpected list of tables" 1>&2 + exit 1 +fi + +# https://dba.stackexchange.com/a/72656 +nr_of_unequal_stops=$(cat << EOF +$query_prefix +SELECT count(*) +FROM one.stops a +FULL OUTER JOIN two.stops b ON ( + a.stop_id = b.stop_id +) +WHERE ( + a.stop_code IS DISTINCT FROM b.stop_code + OR a.stop_name IS DISTINCT FROM b.stop_name + OR a.stop_desc IS DISTINCT FROM b.stop_desc + OR a.stop_loc IS DISTINCT FROM b.stop_loc + OR a.zone_id IS DISTINCT FROM b.zone_id + OR a.stop_url IS DISTINCT FROM b.stop_url + OR a.location_type::TEXT IS DISTINCT FROM b.location_type::TEXT + OR a.parent_station IS DISTINCT FROM b.parent_station + OR a.stop_timezone IS DISTINCT FROM b.stop_timezone + OR a.wheelchair_boarding::TEXT IS DISTINCT FROM b.wheelchair_boarding::TEXT + OR a.level_id IS DISTINCT FROM b.level_id + OR a.platform_code IS DISTINCT FROM b.platform_code +) +EOF +) + +unequal_stops_1=$(duckdb -csv -noheader -c "$nr_of_unequal_stops" | head -n 1) +if [[ "$unequal_stops_1" -ne 0 ]]; then + 1>&2 echo "$unequal_stops_1 unequal stops between one.stops & two.stops" + exit 1 +fi + +# # put an incompatible version +# duckdb -c "$(cat << EOF +# CREATE OR REPLACE FUNCTION public.gtfs_via_duckdb_import_version() +# RETURNS TEXT +# AS \$\$ +# SELECT '0.1.2' +# 
\$\$ +# LANGUAGE SQL +# EOF +# )" + +# # expect another import to fail +# if ../cli.js -d --trips-without-shape-id \ +# "$path_to_db" \ +# -- amtrak-gtfs-2021-10-06/*.txt; then +# 1>&2 echo "re-import with incompatible version didn't fail" +# exit 1 +# fi + +echo 'works ✔' diff --git a/test/routes-without-agency-id.sh b/test/routes-without-agency-id.sh index 51e1530..c63e296 100755 --- a/test/routes-without-agency-id.sh +++ b/test/routes-without-agency-id.sh @@ -7,7 +7,7 @@ cd "$(dirname $0)" set -x ../cli.js -d --routes-without-agency-id -- \ - ../node_modules/sample-gtfs-feed/gtfs/*.txt \ - >/dev/null + ':memory:' \ + ../node_modules/sample-gtfs-feed/gtfs/*.txt echo 'works ✔' diff --git a/test/sample-gtfs-feed.sh b/test/sample-gtfs-feed.sh index 8834739..fe136a4 100755 --- a/test/sample-gtfs-feed.sh +++ b/test/sample-gtfs-feed.sh @@ -8,11 +8,15 @@ set -x env | grep '^PG' || true -psql -c 'create database sample_gtfs_feed' -export PGDATABASE='sample_gtfs_feed' +# path_to_db="sample-gtfs-feed.duckdb" +path_to_db="$(mktemp -d)/sample-gtfs-feed.duckdb" +# path_to_db=':memory:' +# todo: what about sample-gtfs-feed@0.13? # --lower-case-lang-codes: Even though sample-gtfs-feed@0.11.2 *does not* contain invalid-case language codes (e.g. de_aT or de-at), we check that with --lower-case-lang-codes valid ones are still accepted. ../cli.js -d --trips-without-shape-id --lower-case-lang-codes -- \ + "$path_to_db" \ + ../node_modules/sample-gtfs-feed/gtfs/feed_info.txt \ ../node_modules/sample-gtfs-feed/gtfs/agency.txt \ ../node_modules/sample-gtfs-feed/gtfs/calendar.txt \ ../node_modules/sample-gtfs-feed/gtfs/calendar_dates.txt \ @@ -23,8 +27,7 @@ export PGDATABASE='sample_gtfs_feed' ../node_modules/sample-gtfs-feed/gtfs/stop_times.txt \ ../node_modules/sample-gtfs-feed/gtfs/levels.txt \ ../node_modules/sample-gtfs-feed/gtfs/pathways.txt \ - ../node_modules/sample-gtfs-feed/gtfs/translations.txt \ - | sponge | psql -b + ../node_modules/sample-gtfs-feed/gtfs/translations.txt query=$(cat << EOF select extract(epoch from t_arrival)::integer as t_arrival @@ -34,18 +37,19 @@ order by t_arrival EOF ) -arr1=$(psql --csv -t -c "$query" | head -n 1) +arr1=$(duckdb -csv -noheader -c "$query" "$path_to_db" | head -n 1) if [[ "$arr1" != "1553993700" ]]; then echo "invalid 1st t_arrival: $arr1" 1>&2 exit 1 fi -arr2=$(psql --csv -t -c "$query" | head -n 2 | tail -n 1) +arr2=$(duckdb -csv -noheader -c "$query" "$path_to_db" | head -n 2 | tail -n 1) if [[ "$arr2" != "1553994180" ]]; then echo "invalid 2nd t_arrival: $arr2" 1>&2 exit 1 fi +# In sample-gtfs-feed@0.13, the frequencies-based arrivals/departures are earlier (from 8:00 until 8:59) than the stop_times-based ones (13:13), so across all service days, the earliest departure has to be a frequencies-based one. 
arrs_deps_b_downtown_on_working_days=$(cat << EOF SELECT stop_sequence, @@ -58,12 +62,12 @@ arrs_deps_b_downtown_on_working_days=$(cat << EOF LIMIT 2 EOF ) -freq_arr_dep1=$(psql --csv -t -c "$arrs_deps_b_downtown_on_working_days" | head -n 1) +freq_arr_dep1=$(duckdb -csv -noheader -c "$arrs_deps_b_downtown_on_working_days" "$path_to_db" | head -n 1) if [[ "$freq_arr_dep1" != "1,1552028340,1552028400,1,1" ]]; then echo "invalid/missing frequencies-based arrival/departure: $freq_arr_dep1" 1>&2 exit 1 fi -freq_arr_dep2=$(psql --csv -t -c "$arrs_deps_b_downtown_on_working_days" | head -n 2 | tail -n 1) +freq_arr_dep2=$(duckdb -csv -noheader -c "$arrs_deps_b_downtown_on_working_days" "$path_to_db" | head -n 2 | tail -n 1) if [[ "$freq_arr_dep2" != "1,1552028640,1552028700,1,2" ]]; then echo "invalid/missing frequencies-based arrival/departure: $freq_arr_dep2" 1>&2 exit 1 @@ -81,7 +85,7 @@ cons_b_downtown_on_working_days=$(cat << EOF LIMIT 1 EOF ) -freq_con1=$(psql --csv -t -c "$cons_b_downtown_on_working_days") +freq_con1=$(duckdb -csv -noheader -c "$cons_b_downtown_on_working_days" "$path_to_db") if [[ "$freq_con1" != "1,1552028400,3,1552028760" ]]; then echo "invalid/missing frequencies-based connection: $freq_con1" 1>&2 exit 1 @@ -96,7 +100,7 @@ connection_during_dst=$(cat << EOF AND t_departure = '2019-03-31T01:58:00+01:00' EOF ) -dst1=$(psql --csv -t -c "$connection_during_dst" | head -n 1) +dst1=$(duckdb -csv -noheader -c "$connection_during_dst" "$path_to_db" | head -n 1) if [[ "$dst1" != "0,1553993880" ]]; then echo "invalid/missing DST t_departure: $dst1" 1>&2 exit 1 @@ -113,8 +117,8 @@ airport_levels=$(cat << EOF LIMIT 1 EOF ) -lvl1=$(psql --csv -t -c "$airport_levels" | head -n 1) -if [[ "$lvl1" != "airport-level-0,0,ground level" ]]; then +lvl1=$(duckdb -csv -noheader -c "$airport_levels" "$path_to_db" | head -n 1) +if [[ "$lvl1" != 'airport-level-0,0.0,ground level' ]]; then echo "invalid/missing lowest airport-% level: $lvl1" 1>&2 exit 1 fi @@ -129,8 +133,8 @@ airportPathway=$(cat << EOF LIMIT 1 EOF ) -pw1=$(psql --csv -t -c "$airportPathway" | head -n 1) -if [[ "$pw1" != "escalator,f" ]]; then +pw1=$(duckdb -csv -noheader -c "$airportPathway" "$path_to_db" | head -n 1) +if [[ "$pw1" != 'escalator,false' ]]; then echo "invalid/missing DST t_departure: $pw1" 1>&2 exit 1 fi @@ -143,7 +147,7 @@ timepoint_exact=$(cat << EOF LIMIT 1 EOF ) -exact1=$(psql --csv -t -c "$timepoint_exact" | head -n 1) +exact1=$(duckdb -csv -noheader -c "$timepoint_exact" "$path_to_db" | head -n 1) if [[ "$exact1" != "exact" ]]; then echo "invalid/missing DST t_departure: $exact1" 1>&2 exit 1 @@ -157,7 +161,7 @@ stops_translations=$(cat << EOF AND record_id = 'airport-entrance' EOF ) -airport_entrance_translation=$(psql --csv -t -c "$stops_translations") +airport_entrance_translation=$(duckdb -csv -noheader -c "$stops_translations" "$path_to_db") if [[ "$airport_entrance_translation" != "Eingang,de-DE" ]]; then echo "invalid/missing stop translation: $airport_entrance_translation" 1>&2 exit 1 @@ -173,7 +177,7 @@ stops_translated=$(cat << EOF AND stop_id = 'airport-entrance' EOF ) -translated_airport_entrance=$(psql --csv -t -c "$stops_translated") +translated_airport_entrance=$(duckdb -csv -noheader -c "$stops_translated" "$path_to_db") if [[ "$translated_airport_entrance" != "airport-entrance,Eingang,de-DE" ]]; then echo "invalid/missing translated stop: $translated_airport_entrance" 1>&2 exit 1 @@ -187,10 +191,10 @@ WHERE route_id = ANY(ARRAY['A', 'B']) ORDER BY trip_id EOF ) 
-wheelchair_accessible_arrs_deps_rows="$(psql --csv -t -c "$wheelchair_accessible_arrs_deps_query")" +wheelchair_accessible_arrs_deps_rows="$(duckdb -csv -noheader -c "$wheelchair_accessible_arrs_deps_query" "$path_to_db")" wheelchair_accessible_arrs_deps_expected=$(cat << EOF -a-downtown-all-day, -a-outbound-all-day, +a-downtown-all-day,NULL +a-outbound-all-day,NULL b-downtown-on-weekends,accessible b-downtown-on-working-days,accessible b-outbound-on-weekends,unknown @@ -210,10 +214,10 @@ WHERE route_id = ANY(ARRAY['A', 'B']) ORDER BY trip_id EOF ) -bikes_allowed_arrs_deps_rows="$(psql --csv -t -c "$bikes_allowed_arrs_deps_query")" +bikes_allowed_arrs_deps_rows="$(duckdb -csv -noheader -c "$bikes_allowed_arrs_deps_query" "$path_to_db")" bikes_allowed_arrs_deps_expected=$(cat << EOF -a-downtown-all-day, -a-outbound-all-day, +a-downtown-all-day,NULL +a-outbound-all-day,NULL b-downtown-on-weekends,unknown b-downtown-on-working-days,unknown b-outbound-on-weekends,allowed @@ -232,7 +236,7 @@ WHERE trip_id = 'b-downtown-on-working-days' AND "date" = '2019-05-29' AND frequ ORDER BY t_departure EOF ) -frequencies_it_rows="$(psql --csv -t -c "$frequencies_it_query")" +frequencies_it_rows="$(duckdb -csv -noheader -c "$frequencies_it_query" "$path_to_db")" frequencies_it_expected=$(cat << EOF 2019-05-29 08:10:00+02,1,airport,3 2019-05-29 08:18:00+02,3,lake,3 @@ -254,7 +258,7 @@ ORDER BY t_departure ASC LIMIT 3 EOF ) -frequencies_it_connections_rows="$(psql --csv -t -c "$frequencies_it_connections_query")" +frequencies_it_connections_rows="$(duckdb -csv -noheader -c "$frequencies_it_connections_query" "$path_to_db")" frequencies_it_connections_expected=$(cat << EOF 1,2019-03-08 08:00:00+01,2019-03-08 08:06:00+01,1 1,2019-03-08 08:05:00+01,2019-03-08 08:11:00+01,2 @@ -277,7 +281,7 @@ WHERE stop_id LIKE 'airport%' ORDER BY stop_id, stop_name_lang, stop_desc_lang EOF ) -stops_translated_rows="$(psql --csv -t -c "$stops_translated_query")" +stops_translated_rows="$(duckdb -csv -noheader -nullvalue '' -c "$stops_translated_query" "$path_to_db")" stops_translated_expected=$(cat << EOF airport,International Airport (ABC),,train station at the Internationl Airport (ABC),,https://fta.example.org/stations/airport.html, airport-1,Gleis 1,de-DE,Platform 1,,, diff --git a/test/stops-without-level-id.sh b/test/stops-without-level-id.sh index 7431429..473a2b0 100755 --- a/test/stops-without-level-id.sh +++ b/test/stops-without-level-id.sh @@ -8,16 +8,16 @@ set -x shopt -s extglob -# When omitting levels.txt, --stops-without-level-id/opt.stopsWithoutLevelId should be true by default. +# Importing should work *without* levels.txt. # see also https://github.com/public-transport/gtfs-via-postgres/issues/43 ../cli.js -d -s -- \ - ../node_modules/sample-gtfs-feed/gtfs/!(levels).txt \ - | grep -c 'stopsWithoutLevelId: true' + ':memory:' \ + ../node_modules/sample-gtfs-feed/gtfs/!(levels).txt # Importing should work *with* --stops-without-level-id (and without levels.txt). 
# see also https://github.com/public-transport/gtfs-via-postgres/issues/43#issuecomment-1632657546 ../cli.js -d -s --stops-without-level-id -- \ - ../node_modules/sample-gtfs-feed/gtfs/!(levels).txt \ - >/dev/null + ':memory:' \ + ../node_modules/sample-gtfs-feed/gtfs/!(levels).txt echo 'works ✔' From ab401d7dc3fd360b87f1c94314a7c1f696e4ff89 Mon Sep 17 00:00:00 2001 From: Jannis R Date: Fri, 27 Jun 2025 15:30:01 +0200 Subject: [PATCH 11/16] =?UTF-8?q?benchmarks=20&=20readme:=20update=20to=20?= =?UTF-8?q?2025-05-21=20VBB=20GTFS=20dataset=20=F0=9F=93=9D=E2=9A=A1?= =?UTF-8?q?=EF=B8=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../arrs_deps_by_route_name_and_time.sql | 6 +-- benchmark/arrs_deps_by_station_and_time.sql | 6 +-- .../arrs_deps_by_station_and_time_seq_0.sql | 6 +-- benchmark/arrs_deps_by_stop_and_time.sql | 6 +-- benchmark/arrs_deps_by_time.sql | 6 +-- benchmark/arrs_deps_by_time_manual.sql | 6 +-- benchmark/arrs_deps_by_trip_and_date.sql | 4 +- .../connections_by_route_name_and_time.sql | 6 +-- benchmark/connections_by_station_and_time.sql | 8 ++-- .../connections_by_station_and_time_seq_0.sql | 10 ++-- benchmark/connections_by_stop_and_time.sql | 6 +-- benchmark/connections_by_time.sql | 6 +-- benchmark/connections_by_time_manual.sql | 5 +- benchmark/connections_by_trip_and_date.sql | 4 +- benchmark/init.sh | 8 ++-- benchmark/stats_by_route_date.sql | 2 +- readme.md | 48 +++++++++---------- 17 files changed, 71 insertions(+), 72 deletions(-) diff --git a/benchmark/arrs_deps_by_route_name_and_time.sql b/benchmark/arrs_deps_by_route_name_and_time.sql index 88d63ad..ced7804 100644 --- a/benchmark/arrs_deps_by_route_name_and_time.sql +++ b/benchmark/arrs_deps_by_route_name_and_time.sql @@ -1,6 +1,6 @@ SELECT * FROM arrivals_departures WHERE route_short_name = 'S1' -AND t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' -AND date >= dates_filter_min('2022-08-09T07:10:00+02'::timestamp with time zone) -AND date <= dates_filter_max('2022-08-09T07:30:00+02'::timestamp with time zone) +AND t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= dates_filter_min('2025-05-27T07:10:00+02'::timestamp with time zone) +AND date <= dates_filter_max('2025-05-27T07:30:00+02'::timestamp with time zone) diff --git a/benchmark/arrs_deps_by_station_and_time.sql b/benchmark/arrs_deps_by_station_and_time.sql index a73d5f7..f163fd6 100644 --- a/benchmark/arrs_deps_by_station_and_time.sql +++ b/benchmark/arrs_deps_by_station_and_time.sql @@ -1,6 +1,6 @@ SELECT * FROM arrivals_departures WHERE station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin) -AND t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' -AND date >= dates_filter_min('2022-08-09T07:10:00+02') -AND date <= dates_filter_max('2022-08-09T07:30:00+02') +AND t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= dates_filter_min('2025-05-27T07:10:00+02') +AND date <= dates_filter_max('2025-05-27T07:30:00+02') diff --git a/benchmark/arrs_deps_by_station_and_time_seq_0.sql b/benchmark/arrs_deps_by_station_and_time_seq_0.sql index 0b02f35..9bace6c 100644 --- a/benchmark/arrs_deps_by_station_and_time_seq_0.sql +++ b/benchmark/arrs_deps_by_station_and_time_seq_0.sql @@ -1,7 +1,7 @@ SELECT * FROM arrivals_departures WHERE station_id = 'de:11000:900100001' -- S+U Friedrichstr. 
(Berlin) -AND t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' -AND date >= dates_filter_min('2022-08-09T07:10:00+02') -AND date <= dates_filter_max('2022-08-09T07:30:00+02') +AND t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= dates_filter_min('2025-05-27T07:10:00+02') +AND date <= dates_filter_max('2025-05-27T07:30:00+02') AND stop_sequence = 0 diff --git a/benchmark/arrs_deps_by_stop_and_time.sql b/benchmark/arrs_deps_by_stop_and_time.sql index 26b4068..195a3aa 100644 --- a/benchmark/arrs_deps_by_stop_and_time.sql +++ b/benchmark/arrs_deps_by_stop_and_time.sql @@ -1,6 +1,6 @@ SELECT * FROM arrivals_departures WHERE stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin) -AND t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' -AND date >= dates_filter_min('2022-08-09T07:10:00+02') -AND date <= dates_filter_max('2022-08-09T07:30:00+02') +AND t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= dates_filter_min('2025-05-27T07:10:00+02') +AND date <= dates_filter_max('2025-05-27T07:30:00+02') diff --git a/benchmark/arrs_deps_by_time.sql b/benchmark/arrs_deps_by_time.sql index 99d84f1..f7158ed 100644 --- a/benchmark/arrs_deps_by_time.sql +++ b/benchmark/arrs_deps_by_time.sql @@ -1,5 +1,5 @@ SELECT * FROM arrivals_departures -WHERE t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' -AND date >= dates_filter_min('2022-08-09T07:10:00+02'::timestamp with time zone) -AND date <= dates_filter_max('2022-08-09T07:30:00+02'::timestamp with time zone) +WHERE t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND "date" >= dates_filter_min('2025-05-27T07:10:00+02'::timestamp with time zone) +AND "date" <= dates_filter_max('2025-05-27T07:30:00+02'::timestamp with time zone) diff --git a/benchmark/arrs_deps_by_time_manual.sql b/benchmark/arrs_deps_by_time_manual.sql index 1762bac..74e8a01 100644 --- a/benchmark/arrs_deps_by_time_manual.sql +++ b/benchmark/arrs_deps_by_time_manual.sql @@ -1,5 +1,5 @@ SELECT * FROM arrivals_departures -WHERE t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' -AND date >= '2022-08-08' -AND date <= '2022-08-09' +WHERE t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= '2025-05-25' +AND date <= '2025-05-27' diff --git a/benchmark/arrs_deps_by_trip_and_date.sql b/benchmark/arrs_deps_by_trip_and_date.sql index 89d4609..2a90f80 100644 --- a/benchmark/arrs_deps_by_trip_and_date.sql +++ b/benchmark/arrs_deps_by_trip_and_date.sql @@ -1,4 +1,4 @@ SELECT * FROM arrivals_departures -WHERE trip_id = '168977951' -AND date > '2022-08-08' AND date <= '2022-08-09' +WHERE trip_id = '262623609' -- route_id=10144_109, route_short_name=S2 +AND date = '2025-05-27' diff --git a/benchmark/connections_by_route_name_and_time.sql b/benchmark/connections_by_route_name_and_time.sql index 69fa862..feac3ae 100644 --- a/benchmark/connections_by_route_name_and_time.sql +++ b/benchmark/connections_by_route_name_and_time.sql @@ -1,6 +1,6 @@ SELECT * FROM connections WHERE route_short_name = 'S1' -AND t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' -AND date >= dates_filter_min('2022-08-09T07:10:00+02') -AND date <= dates_filter_max('2022-08-09T07:30:00+02') +AND t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= 
dates_filter_min('2025-05-27T07:10:00+02') +AND date <= dates_filter_max('2025-05-27T07:30:00+02') diff --git a/benchmark/connections_by_station_and_time.sql b/benchmark/connections_by_station_and_time.sql index 769efb5..6e68e61 100644 --- a/benchmark/connections_by_station_and_time.sql +++ b/benchmark/connections_by_station_and_time.sql @@ -1,6 +1,6 @@ SELECT * FROM connections -WHERE from_station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin) -AND t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' -AND date >= dates_filter_min('2022-08-09T07:10:00+02') -AND date <= dates_filter_max('2022-08-09T07:30:00+02') +WHERE from_station_id = 'de:11000:900194006' -- S Schöneweide/Sterndamm (Berlin) +AND t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= dates_filter_min('2025-05-27T07:10:00+02') +AND date <= dates_filter_max('2025-05-27T07:30:00+02') diff --git a/benchmark/connections_by_station_and_time_seq_0.sql b/benchmark/connections_by_station_and_time_seq_0.sql index 8a42b53..40c19b2 100644 --- a/benchmark/connections_by_station_and_time_seq_0.sql +++ b/benchmark/connections_by_station_and_time_seq_0.sql @@ -1,7 +1,7 @@ SELECT * FROM connections -WHERE from_station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin) -AND t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' -AND date >= dates_filter_min('2022-08-09T07:10:00+02') -AND date <= dates_filter_max('2022-08-09T07:30:00+02') -AND from_stop_sequence = 0 +WHERE from_station_id = 'de:11000:900194006' -- S Schöneweide/Sterndamm (Berlin) +AND t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= dates_filter_min('2025-05-27T07:10:00+02') +AND date <= dates_filter_max('2025-05-27T07:30:00+02') +AND from_stop_sequence_consec = 0 diff --git a/benchmark/connections_by_stop_and_time.sql b/benchmark/connections_by_stop_and_time.sql index c4bbfc1..e161f36 100644 --- a/benchmark/connections_by_stop_and_time.sql +++ b/benchmark/connections_by_stop_and_time.sql @@ -1,6 +1,6 @@ SELECT * FROM connections WHERE from_stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. 
(Berlin) -AND t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' -AND date >= dates_filter_min('2022-08-09T07:10:00+02') -AND date <= dates_filter_max('2022-08-09T07:30:00+02') +AND t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= dates_filter_min('2025-05-27T07:10:00+02') +AND date <= dates_filter_max('2025-05-27T07:30:00+02') diff --git a/benchmark/connections_by_time.sql b/benchmark/connections_by_time.sql index 403cac9..8b7205c 100644 --- a/benchmark/connections_by_time.sql +++ b/benchmark/connections_by_time.sql @@ -1,7 +1,7 @@ SELECT * FROM connections -WHERE t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' -AND date >= dates_filter_min('2022-08-09T07:10:00+02'::timestamp with time zone) -AND date <= dates_filter_max('2022-08-09T07:30:00+02'::timestamp with time zone) +WHERE t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= dates_filter_min('2025-05-27T07:10:00+02'::timestamp with time zone) +AND date <= dates_filter_max('2025-05-27T07:30:00+02'::timestamp with time zone) ORDER BY t_departure LIMIT 100 diff --git a/benchmark/connections_by_time_manual.sql b/benchmark/connections_by_time_manual.sql index 5372029..4a2dc73 100644 --- a/benchmark/connections_by_time_manual.sql +++ b/benchmark/connections_by_time_manual.sql @@ -1,7 +1,6 @@ SELECT * FROM connections -WHERE t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02' -AND date >= '2022-08-08' -AND date <= '2022-08-09' +WHERE t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= '2025-05-25' AND date <= '2025-05-27' ORDER BY t_departure LIMIT 100 diff --git a/benchmark/connections_by_trip_and_date.sql b/benchmark/connections_by_trip_and_date.sql index 93ef135..c5ece9b 100644 --- a/benchmark/connections_by_trip_and_date.sql +++ b/benchmark/connections_by_trip_and_date.sql @@ -1,4 +1,4 @@ SELECT * FROM connections -WHERE trip_id = '168977951' -AND date > '2022-08-08' AND date <= '2022-08-09' +WHERE trip_id = '262535123' -- route_id=17452_900 (M4) +AND date >= '2025-05-26' AND date <= '2025-06-01' diff --git a/benchmark/init.sh b/benchmark/init.sh index a0f0a6e..9c1e638 100755 --- a/benchmark/init.sh +++ b/benchmark/init.sh @@ -5,12 +5,12 @@ set -o pipefail cd "$(dirname "$0")" set -x -wget --compression auto -r --no-parent --no-directories -R .csv.gz -P ../vbb-2022-07-01.gtfs -N 'https://vbb-gtfs.jannisr.de/2022-07-01/' -ls -lh ../vbb-2022-07-01.gtfs +wget --compression auto -r --no-parent --no-directories -R .csv.gz,.csv.br -P ../vbb-2025-05-21.gtfs -N 'https://vbb-gtfs.jannisr.de/2025-05-21/' +ls -lh ../vbb-2025-05-21.gtfs env | grep '^PG' || true ../cli.js -d \ --stops-location-index --stats-by-route-date=view \ - vbb-2022-07-01.gtfs.duckdb \ - ../vbb-2022-07-01.gtfs/*.csv + vbb-2025-05-21.gtfs.duckdb \ + ../vbb-2025-05-21.gtfs/*.csv diff --git a/benchmark/stats_by_route_date.sql b/benchmark/stats_by_route_date.sql index a894e09..4f8b5dc 100644 --- a/benchmark/stats_by_route_date.sql +++ b/benchmark/stats_by_route_date.sql @@ -1,5 +1,5 @@ SELECT * FROM stats_by_route_date WHERE route_id = '17452_900' -- M4 -AND date >= '2022-08-08' AND date <= '2022-08-14' +AND date >= '2025-05-26' AND date <= '2025-06-01' AND is_effective = true diff --git a/readme.md b/readme.md index 36a5942..c197e5b 100644 --- a/readme.md +++ b/readme.md @@ -46,29 +46,29 @@ duckdb_cli -c 'INSTALL spatial' If you have a `.zip` 
GTFS feed, unzip it into individual files. -We're going to use the [2022-07-01 *VBB* feed](https://vbb-gtfs.jannisr.de/2022-07-01/) as an example, which consists of individual files already. +We're going to use the [2025-05-21 *VBB* feed](https://vbb-gtfs.jannisr.de/2025-05-21/) as an example, which consists of individual files already. ```sh wget --compression auto \ - -r --no-parent --no-directories -R .csv.gz \ - -P gtfs -N 'https://vbb-gtfs.jannisr.de/2022-07-01/' + -r --no-parent --no-directories -R .csv.gz -R .csv.br \ + -P gtfs -N 'https://vbb-gtfs.jannisr.de/2025-05-21/' # … # Downloaded 14 files in 20s. ls -lh gtfs -# 3.3K agency.csv -# 97K calendar.csv -# 1.1M calendar_dates.csv +# 3.2K agency.csv +# 107K calendar.csv +# 1.2M calendar_dates.csv # 2.5K datapackage.json # 64B frequencies.csv -# 5.9K levels.csv +# 6.1K levels.csv # 246B license -# 8.3M pathways.csv -# 49K routes.csv -# 146M shapes.csv -# 368M stop_times.csv -# 5.0M stops.csv -# 4.7M transfers.csv -# 16M trips.csv +# 8.9M pathways.csv +# 50K routes.csv +# 152M shapes.csv +# 383M stop_times.csv +# 7.0M stops.csv +# 3.0M transfers.csv +# 17M trips.csv ``` Install `gtfs-via-duckdb` and use it to import the GTFS data: @@ -112,14 +112,14 @@ AND t_departure >= '2022-03-23T12:30:00+01' AND t_departure <= '2022-03-23T12:35 `route_id` | `route_short_name` | `route_type` | `trip_id` | `date` | `stop_sequence` | `t_arrival` | `t_departure` | `stop_id` | `stop_name` | `station_id` | `station_name` -|-|-|-|-|-|-|-|-|-|-|- -`10148_109` | `S3` | `109` | `169035756` | `2022-03-23 00:00:00` | `19` | `2022-03-23 12:31:24+01` | `2022-03-23 12:32:12+01` | `de:11000:900120003:2:53` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` -`10148_109` | `S3` | `109` | `169035899` | `2022-03-23 00:00:00` | `10` | `2022-03-23 12:33:06+01` | `2022-03-23 12:33:54+01` | `de:11000:900120003:3:55` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` -`10162_109` | `S7` | `109` | `169128381` | `2022-03-23 00:00:00` | `19` | `2022-03-23 12:33:54+01` | `2022-03-23 12:34:42+01` | `de:11000:900120003:2:53` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` -`10162_109` | `S7` | `109` | `169128495` | `2022-03-23 00:00:00` | `9` | `2022-03-23 12:30:36+01` | `2022-03-23 12:31:24+01` | `de:11000:900120003:3:55` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` -`10223_109` | `S41` | `109` | `169054370` | `2022-03-23 00:00:00` | `21` | `2022-03-23 12:30:24+01` | `2022-03-23 12:31:12+01` | `de:11000:900120003:5:58` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` -`10227_109` | `S42` | `109` | `169071882` | `2022-03-23 00:00:00` | `6` | `2022-03-23 12:30:30+01` | `2022-03-23 12:31:12+01` | `de:11000:900120003:5:59` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` -`19040_100` | `RB14` | `100` | `178748721` | `2022-03-23 00:00:00` | `13` | `2022-03-23 12:30:00+01` | `2022-03-23 12:30:00+01` | `de:11000:900120003:1:50` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` -`22664_2` | `FEX` | `2` | `178748125` | `2022-03-23 00:00:00` | `1` | `2022-03-23 12:32:00+01` | `2022-03-23 12:34:00+01` | `de:11000:900120003:4:57` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` +`10148_109` | `S3` | `109` | `169035756` | `2022-03-23 00:00:00` | `19` | `2022-03-23 12:31:24+01` | `2022-03-23 12:32:12+01` | `de:11000:900120003:2` | `S Ostkreuz Bhf 
(Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` +`10148_109` | `S3` | `109` | `169035899` | `2022-03-23 00:00:00` | `10` | `2022-03-23 12:33:06+01` | `2022-03-23 12:33:54+01` | `de:11000:900120003:3` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` +`10162_109` | `S7` | `109` | `169128381` | `2022-03-23 00:00:00` | `19` | `2022-03-23 12:33:54+01` | `2022-03-23 12:34:42+01` | `de:11000:900120003:2` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` +`10162_109` | `S7` | `109` | `169128495` | `2022-03-23 00:00:00` | `9` | `2022-03-23 12:30:36+01` | `2022-03-23 12:31:24+01` | `de:11000:900120003:3` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` +`10223_109` | `S41` | `109` | `169054370` | `2022-03-23 00:00:00` | `21` | `2022-03-23 12:30:24+01` | `2022-03-23 12:31:12+01` | `de:11000:900120003:5` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` +`10227_109` | `S42` | `109` | `169071882` | `2022-03-23 00:00:00` | `6` | `2022-03-23 12:30:30+01` | `2022-03-23 12:31:12+01` | `de:11000:900120003:5` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` +`19040_100` | `RB14` | `100` | `178748721` | `2022-03-23 00:00:00` | `13` | `2022-03-23 12:30:00+01` | `2022-03-23 12:30:00+01` | `de:11000:900120003:1` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` +`22664_2` | `FEX` | `2` | `178748125` | `2022-03-23 00:00:00` | `1` | `2022-03-23 12:32:00+01` | `2022-03-23 12:34:00+01` | `de:11000:900120003:4` | `S Ostkreuz Bhf (Berlin)` | `de:11000:900120003` | `S Ostkreuz Bhf (Berlin)` ### translations @@ -249,7 +249,7 @@ Let's consider two examples: However, if you determine your feed's largest `arrival_time`/`departure_time`, you can filter on `date` when querying `arrivals_departures`; This allows DuckDB to reduce the number of joins and calendar calculations by orders of magnitude, speeding up your queries significantly. `gtfs-via-duckdb` provides a low-level helper table `largest_arr_dep_time` for this, as well as two high-level helper functions `dates_filter_min(t_min)` & `dates_filter_max(t_max)` (see below). -For example, when querying all *absolute* departures at `de:11000:900120003` (*S Ostkreuz Bhf (Berlin)*) between `2022-03-23T12:30:00+01` and `2022-03-23T12:35:00+01` within the [2022-07-01 *VBB* feed](https://vbb-gtfs.jannisr.de/2022-07-01/), filtering by `date` speeds it up nicely (Apple M2, DuckDB v1.3.0): +For example, when querying all *absolute* departures at `de:11000:900120003` (*S Ostkreuz Bhf (Berlin)*) between `2022-03-23T12:30:00+01` and `2022-03-23T12:35:00+01` within the [2025-05-21 *VBB* feed](https://vbb-gtfs.jannisr.de/2025-05-21/), filtering by `date` speeds it up nicely (Apple M2, DuckDB v1.3.0): `station_id` filter | `date` filter | query time | nr of results -|-|-|- @@ -279,7 +279,7 @@ AND "date" <= dates_filter_max('2022-03-23T12:35:00+01') -- evaluates to 2023-03 `gtfs-via-duckdb` is fast enough for most use cases I can think of. If there's a particular kind of query that you think should be faster, please [open an Issue](https://github.com/public-transport/gtfs-via-duckdb/issues/new)! 
todo: re-run them -The following benchmarks were run with the [2022-07-01 VBB GTFS dataset](https://vbb-gtfs.jannisr.de/2022-07-01/) (41k `stops`, 6m `stop_times`, 207m arrivals/departures) using `gtfs-via-duckdb@5.0.0` and DuckDB v1.3 on an [M2](https://en.wikipedia.org/wiki/Apple_M2) laptop running macOS 14.7.6; All measurements are in milliseconds. +The following benchmarks were run with the [2025-05-21 VBB GTFS dataset](https://vbb-gtfs.jannisr.de/2025-05-21/) (41k `stops`, 6m `stop_times`, 207m arrivals/departures) using `gtfs-via-duckdb@5.0.0` and DuckDB v1.3 on an [M2](https://en.wikipedia.org/wiki/Apple_M2) laptop running macOS 14.7.6; All measurements are in milliseconds. | query | avg | min | p25 | p50 | p75 | p95 | p99 | max | iterations | | - | - | - | - | - | - | - | - | - | - | From 032614cd2baedcc4a08f1308d91975ac468b1929 Mon Sep 17 00:00:00 2001 From: Jannis R Date: Fri, 27 Jun 2025 15:35:39 +0200 Subject: [PATCH 12/16] =?UTF-8?q?add=20benchmarks:=20arrs/deps=20with=20ma?= =?UTF-8?q?nual=20time=20filtering=20=E2=9A=A1=EF=B8=8F=F0=9F=93=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- benchmark/arrs_deps_by_station_and_time_manual.sql | 6 ++++++ benchmark/arrs_deps_by_station_and_time_seq_0_manual.sql | 7 +++++++ benchmark/arrs_deps_by_stop_and_time_manual.sql | 6 ++++++ 3 files changed, 19 insertions(+) create mode 100644 benchmark/arrs_deps_by_station_and_time_manual.sql create mode 100644 benchmark/arrs_deps_by_station_and_time_seq_0_manual.sql create mode 100644 benchmark/arrs_deps_by_stop_and_time_manual.sql diff --git a/benchmark/arrs_deps_by_station_and_time_manual.sql b/benchmark/arrs_deps_by_station_and_time_manual.sql new file mode 100644 index 0000000..3bca576 --- /dev/null +++ b/benchmark/arrs_deps_by_station_and_time_manual.sql @@ -0,0 +1,6 @@ +SELECT * +FROM arrivals_departures +WHERE station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin) +AND t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= '2025-05-25' +AND date <= '2025-05-27' diff --git a/benchmark/arrs_deps_by_station_and_time_seq_0_manual.sql b/benchmark/arrs_deps_by_station_and_time_seq_0_manual.sql new file mode 100644 index 0000000..5201d1f --- /dev/null +++ b/benchmark/arrs_deps_by_station_and_time_seq_0_manual.sql @@ -0,0 +1,7 @@ +SELECT * +FROM arrivals_departures +WHERE station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin) +AND t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= '2025-05-25' +AND date <= '2025-05-27' +AND stop_sequence = 0 diff --git a/benchmark/arrs_deps_by_stop_and_time_manual.sql b/benchmark/arrs_deps_by_stop_and_time_manual.sql new file mode 100644 index 0000000..5a71d6d --- /dev/null +++ b/benchmark/arrs_deps_by_stop_and_time_manual.sql @@ -0,0 +1,6 @@ +SELECT * +FROM arrivals_departures +WHERE stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. 
(Berlin) +AND t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02' +AND date >= '2025-05-25' +AND date <= '2025-05-27' From a3c7a33851f1d6a91d3eb1ac0479d0008461c0b1 Mon Sep 17 00:00:00 2001 From: Jannis R Date: Sat, 2 Aug 2025 17:07:08 +0200 Subject: [PATCH 13/16] DuckDB rewrite: add unique indexes for primary keys --- lib/agency.js | 3 +++ lib/levels.js | 2 ++ lib/pathways.js | 3 +++ lib/routes.js | 3 +++ lib/shapes.js | 6 ++++++ lib/stop_times.js | 3 +++ lib/stops.js | 3 +++ lib/trips.js | 3 +++ 8 files changed, 26 insertions(+) diff --git a/lib/agency.js b/lib/agency.js index 1208f08..3e80856 100644 --- a/lib/agency.js +++ b/lib/agency.js @@ -31,6 +31,9 @@ FROM read_csv( -- We omit DATE/TIME/TIMESTAMP because GTFS formats them differently. auto_type_candidates = ['NULL', 'BIGINT', 'DOUBLE', 'VARCHAR'] ); + +-- For a primary key, DuckDB doesn't create an index automatically. +CREATE UNIQUE INDEX agency_agency_id ON agency(agency_id); `) workingState.nrOfRowsByName.set('agency', await queryNumberOfRows(db, 'agency', opt)) diff --git a/lib/levels.js b/lib/levels.js index 1e48772..591f79d 100644 --- a/lib/levels.js +++ b/lib/levels.js @@ -30,6 +30,8 @@ FROM read_csv( } ); +-- For a primary key, DuckDB doesn't create an index automatically. +CREATE UNIQUE INDEX levels_level_id ON levels(level_id); `) workingState.nrOfRowsByName.set('levels', await queryNumberOfRows(db, 'levels', opt)) diff --git a/lib/pathways.js b/lib/pathways.js index 8980d9b..9f0837f 100644 --- a/lib/pathways.js +++ b/lib/pathways.js @@ -83,6 +83,9 @@ FROM read_csv( ${has_reversed_signposted_as ? `, reversed_signposted_as: 'TEXT'` : ``} } ); + +-- For a primary key, DuckDB doesn't create an index automatically. +CREATE UNIQUE INDEX pathways_pathway_id ON pathways(pathway_id); `) workingState.nrOfRowsByName.set('pathways', await queryNumberOfRows(db, 'pathways', opt)) diff --git a/lib/routes.js b/lib/routes.js index 522db29..7485f53 100644 --- a/lib/routes.js +++ b/lib/routes.js @@ -311,6 +311,9 @@ FROM read_csv( } ); +-- For a primary key, DuckDB doesn't create an index automatically. +CREATE UNIQUE INDEX routes_route_id ON routes(route_id); + CREATE INDEX routes_route_short_name ON routes (route_short_name); `) diff --git a/lib/shapes.js b/lib/shapes.js index 4e48a05..7f19d9f 100644 --- a/lib/shapes.js +++ b/lib/shapes.js @@ -1,6 +1,7 @@ 'use strict' const GET = require('./get.js') +const RUN = require('./run.js') const {queryIfColumnsExist} = require('./columns.js') // https://gtfs.org/documentation/schedule/reference/#shapestxt @@ -66,6 +67,11 @@ FROM ( ORDER BY shape_id, shape_pt_sequence ) t GROUP BY shape_id; +`) + + await db[RUN](`\ +-- For a primary key, DuckDB doesn't create an index automatically. +CREATE UNIQUE INDEX shapes_shape_id ON shapes(shape_id); `) // Note: This is not the number of shapes.txt rows! diff --git a/lib/stop_times.js b/lib/stop_times.js index a8c3607..db9d75e 100644 --- a/lib/stop_times.js +++ b/lib/stop_times.js @@ -90,6 +90,9 @@ FROM read_csv( } ); +-- For a primary key, DuckDB doesn't create an index automatically. +CREATE UNIQUE INDEX stop_times_trip_id_stop_sequence ON stop_times(trip_id, stop_sequence); + -- todo: are all of them beneficial/necessary? 
CREATE INDEX stop_times_trip_id ON stop_times (trip_id); CREATE INDEX stop_times_stop_id ON stop_times (stop_id); diff --git a/lib/stops.js b/lib/stops.js index d207bba..c43c33e 100644 --- a/lib/stops.js +++ b/lib/stops.js @@ -179,6 +179,9 @@ ORDER BY root_id, recursion_level, stop_id; -- ADD CONSTRAINT stops_parent_station_fkey -- FOREIGN KEY (parent_station) REFERENCES stops; +-- For a primary key, DuckDB doesn't create an index automatically. +CREATE UNIQUE INDEX stops_stop_id ON stops(stop_id); + CREATE INDEX stops_parent_station ON stops (parent_station); ${opt.stopsLocationIndex ? `CREATE INDEX stops_stop_loc ON stops (stop_loc);` : ''} `) diff --git a/lib/trips.js b/lib/trips.js index ee58fbd..d66f7a4 100644 --- a/lib/trips.js +++ b/lib/trips.js @@ -82,6 +82,9 @@ FROM read_csv( ${has_bikes_allowed ? `bikes_allowed: 'INTEGER',` : ``} } ); + +-- For a primary key, DuckDB doesn't create an index automatically. +CREATE UNIQUE INDEX trips_trip_id ON trips(trip_id); `) workingState.nrOfRowsByName.set('trips', await queryNumberOfRows(db, 'trips', opt)) From 9090469e889ac3a01033346f221322939be3e5b4 Mon Sep 17 00:00:00 2001 From: Jannis R Date: Fri, 11 Jul 2025 22:16:25 +0200 Subject: [PATCH 14/16] =?UTF-8?q?queryFileColumns():=20don't=20pass=20path?= =?UTF-8?q?=20to=20CSV=20via=20parameter=20=F0=9F=90=9B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This works around a crash I couldn't make sense of. --- lib/columns.js | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/columns.js b/lib/columns.js index f60547a..7629125 100644 --- a/lib/columns.js +++ b/lib/columns.js @@ -10,13 +10,15 @@ const queryFileColumns = async (db, pathToFile) => { DESCRIBE ( SELECT * FROM read_csv( - $1, + -- Using a parameter would be the proper & safer approach here, but it crashes DuckDB as of v1.3.2. + -- $1, + '${pathToFile}', header = true ) LIMIT 1 ) `, - [pathToFile], + // [pathToFile], ) return columns } From 1fdee9f7e9cfc6ec4e8629ebe9a8a380e9d2a548 Mon Sep 17 00:00:00 2001 From: Jannis R Date: Mon, 4 Aug 2025 16:54:39 +0200 Subject: [PATCH 15/16] DuckDB rewrite: @duckdb/node-api@1.3.2-alpha.25 --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 7f4f480..0d30a73 100644 --- a/package.json +++ b/package.json @@ -58,7 +58,7 @@ "node": ">=22" }, "dependencies": { - "@duckdb/node-api": "^1.2.2-alpha.18", + "@duckdb/node-api": "^1.3.2-alpha.25", "debug": "^4.3.3", "gtfs-utils": "^5.1.0", "sequencify": "0.0.7" From 53520341a0696c0018a1d6464f5315a30164e189 Mon Sep 17 00:00:00 2001 From: Jannis R Date: Wed, 13 Aug 2025 02:13:46 +0200 Subject: [PATCH 16/16] =?UTF-8?q?readme:=20update=20benchmark=20results=20?= =?UTF-8?q?=F0=9F=93=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- readme.md | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/readme.md b/readme.md index c197e5b..f8ce109 100644 --- a/readme.md +++ b/readme.md @@ -278,32 +278,30 @@ AND "date" <= dates_filter_max('2022-03-23T12:35:00+01') -- evaluates to 2023-03 `gtfs-via-duckdb` is fast enough for most use cases I can think of. If there's a particular kind of query that you think should be faster, please [open an Issue](https://github.com/public-transport/gtfs-via-duckdb/issues/new)! 
-todo: re-run them -The following benchmarks were run with the [2025-05-21 VBB GTFS dataset](https://vbb-gtfs.jannisr.de/2025-05-21/) (41k `stops`, 6m `stop_times`, 207m arrivals/departures) using `gtfs-via-duckdb@5.0.0` and DuckDB v1.3 on an [M2](https://en.wikipedia.org/wiki/Apple_M2) laptop running macOS 14.7.6; All measurements are in milliseconds. +The following benchmarks were run with the [2025-05-21 VBB GTFS dataset](https://vbb-gtfs.jannisr.de/2025-05-21/) (41k `stops`, 6m `stop_times`, 207m arrivals/departures) using `gtfs-via-duckdb@5.0.0` and DuckDB v1.3 on an [M2](https://en.wikipedia.org/wiki/Apple_M2) laptop running macOS 14.7.7; All measurements are in milliseconds. | query | avg | min | p25 | p50 | p75 | p95 | p99 | max | iterations | | - | - | - | - | - | - | - | - | - | - | -|
-| <pre>SELECT *<br>FROM stops<br>ORDER BY ST_Distance(stop_loc::geometry, ST_SetSRID(ST_MakePoint(9.7, 50.547), 4326)) ASC<br>LIMIT 100</pre> | 15 | 14.982 | 15 | 15 | 15 | 15 | 15 | 15.488 | 100 |
-| <pre>SELECT *<br>FROM arrivals_departures<br>WHERE route_short_name = 'S1'<br>AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'<br>AND date >= dates_filter_min('2022-08-09T07:10+02')<br>AND date <= dates_filter_max('2022-08-09T07:30+02')</pre> | 61 | 60.901 | 61 | 61 | 61 | 61 | 62 | 61.778 | 100 |
-| <pre>SELECT *<br>FROM arrivals_departures<br>WHERE station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin)<br>AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'<br>AND date >= dates_filter_min('2022-08-09T07:10+02')<br>AND date <= dates_filter_max('2022-08-09T07:30+02')</pre> | 33 | 33.129 | 33 | 33 | 33 | 33 | 33 | 33.342 | 40 |
-| <pre>SELECT *<br>FROM arrivals_departures<br>WHERE station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin)<br>AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'<br>AND date >= dates_filter_min('2022-08-09T07:10+02')<br>AND date <= dates_filter_max('2022-08-09T07:30+02')<br>AND stop_sequence = 0</pre> | 5 | 4.548 | 5 | 5 | 5 | 5 | 5 | 4.598 | 50 |
-| <pre>SELECT *<br>FROM arrivals_departures<br>WHERE stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)<br>AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'<br>AND date >= dates_filter_min('2022-08-09T07:10+02')<br>AND date <= dates_filter_max('2022-08-09T07:30+02')</pre> | 8 | 8.038 | 8 | 8 | 8 | 8 | 8 | 8.164 | 100 |
-| <pre>SELECT *<br>FROM arrivals_departures<br>WHERE trip_id = '168977951'<br>AND date > '2022-08-08' AND date <= '2022-08-09'</pre> | 2 | 1.878 | 2 | 2 | 2 | 2 | 2 | 1.911 | 100 |
-| <pre>SELECT count(*)<br>FROM arrivals_departures<br>WHERE stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)</pre> | 58 | 57.485 | 58 | 58 | 58 | 58 | 58 | 57.789 | 100 |
-| <pre>SELECT count(*)<br>FROM arrivals_departures<br>WHERE stop_id = 'definitely-non-existent'</pre> | 2 | 1.832 | 2 | 2 | 2 | 2 | 2 | 1.876 | 100 |
-| <pre>SELECT *<br>FROM arrivals_departures<br>WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'<br>AND date >= dates_filter_min('2022-08-09T07:10+02'::timestamp with time zone)<br>AND date <= dates_filter_max('2022-08-09T07:30+02'::timestamp with time zone)</pre> | 6310 | 6238.819 | 6241 | 6262 | 6311 | 6503 | 6560 | 6573.768 | 10 |
-| <pre>SELECT *<br>FROM arrivals_departures<br>WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'<br>AND date >= '2022-08-08'<br>AND date <= '2022-08-09'</pre> | 4931 | 4914.388 | 4925 | 4928 | 4937 | 4946 | 4948 | 4948.689 | 10 |
-| <pre>SELECT *<br>FROM connections<br>WHERE route_short_name = 'S1'<br>AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'<br>AND date >= dates_filter_min('2022-08-09T07:10+02')<br>AND date <= dates_filter_max('2022-08-09T07:30+02')</pre> | 164 | 163.018 | 163 | 164 | 164 | 164 | 165 | 166.568 | 100 |
-| <pre>SELECT *<br>FROM connections<br>WHERE from_station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin)<br>AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'<br>AND date >= dates_filter_min('2022-08-09T07:10+02')<br>AND date <= dates_filter_max('2022-08-09T07:30+02')</pre> | 59 | 58.137 | 58 | 58 | 59 | 60 | 61 | 61.461 | 40 |
-| <pre>SELECT *<br>FROM connections<br>WHERE from_station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin)<br>AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'<br>AND date >= dates_filter_min('2022-08-09T07:10+02')<br>AND date <= dates_filter_max('2022-08-09T07:30+02')<br>AND from_stop_sequence = 0</pre> | 7 | 7.439 | 7 | 7 | 7 | 7 | 7 | 7.49 | 50 |
-| <pre>SELECT *<br>FROM connections<br>WHERE from_stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)<br>AND t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'<br>AND date >= dates_filter_min('2022-08-09T07:10+02')<br>AND date <= dates_filter_max('2022-08-09T07:30+02')</pre> | 15 | 14.529 | 15 | 15 | 15 | 15 | 15 | 14.698 | 100 |
-| <pre>SELECT *<br>FROM connections<br>WHERE trip_id = '168977951'<br>AND date > '2022-08-08' AND date <= '2022-08-09'</pre> | 3 | 2.86 | 3 | 3 | 3 | 3 | 3 | 2.931 | 100 |
-| <pre>SELECT count(*)<br>FROM connections<br>WHERE from_stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)</pre> | 73 | 72.687 | 73 | 73 | 73 | 73 | 73 | 73.35 | 100 |
-| <pre>SELECT count(*)<br>FROM connections<br>WHERE from_stop_id = 'definitely-non-existent'</pre> | 3 | 3.428 | 3 | 3 | 3 | 3 | 4 | 3.525 | 100 |
-| <pre>SELECT *<br>FROM connections<br>WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'<br>AND date >= dates_filter_min('2022-08-09T07:10+02'::timestamp with time zone)<br>AND date <= dates_filter_max('2022-08-09T07:30+02'::timestamp with time zone)<br>ORDER BY t_departure<br>LIMIT 100</pre> | 13127 | 13056.841 | 13086 | 13125 | 13170 | 13194 | 13199 | 13200.027 | 7 |
-| <pre>SELECT *<br>FROM connections<br>WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'<br>AND date >= '2022-08-08'<br>AND date <= '2022-08-09'<br>ORDER BY t_departure<br>LIMIT 100</pre> | 6417 | 6237.932 | 6346 | 6394 | 6512 | 6562 | 6570 | 6571.455 | 7 |
-| <pre>SELECT *<br>FROM stats_by_route_date<br>WHERE route_id = '17452_900' -- M4<br>AND date >= '2022-08-08' AND date <= '2022-08-14'<br>AND is_effective = true</pre> | 2862 | 2853.972 | 2860 | 2863 | 2863 | 2867 | 2867 | 2866.798 | 10 |
-
+| <pre>SELECT *<br>FROM stops<br>ORDER BY ST_Distance(stop_loc::geometry, ST_Point(9.7, 50.547)) ASC<br>LIMIT 100</pre> | 6.35 | 5.91 | 5.98 | 6.25 | 6.6 | 6.86 | 8.41 | 10.05 | 1576 |
+| <pre>SELECT *<br>FROM arrivals_departures<br>WHERE route_short_name = 'S1'<br>AND t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02'<br>AND date >= dates_filter_min('2025-05-27T07:10:00+02'::timestamp with time zone)<br>AND date <= dates_filter_max('2025-05-27T07:30+02'::timestamp with time zone)</pre> | 305.15 | 260.52 | 303.8 | 307.73 | 312.2 | 320.64 | 326.84 | 328.44 | 33 |
+| <pre>SELECT *<br>FROM arrivals_departures<br>WHERE station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin)<br>AND t_departure >= '2025-05-27T07:10+02' AND t_departure <= '2025-05-27T07:30+02'<br>AND date >= dates_filter_min('2025-05-27T07:10+02')<br>AND date <= dates_filter_max('2025-05-27T07:30+02')</pre> | 129.43 | 119.85 | 126.19 | 128.62 | 131.84 | 138.44 | 140.46 | 142 | 78 |
+| <pre>SELECT *<br>FROM arrivals_departures<br>WHERE station_id = 'de:11000:900100001' -- S+U Friedrichstr. (Berlin)<br>AND t_departure >= '2025-05-27T07:10+02' AND t_departure <= '2025-05-27T07:30+02'<br>AND date >= dates_filter_min('2025-05-27T07:10+02')<br>AND date <= dates_filter_max('2025-05-27T07:30+02')<br>AND stop_sequence = 0</pre> | 81.42 | 65.73 | 79.48 | 82.11 | 84.33 | 87.26 | 89.64 | 102.97 | 123 |
+| <pre>SELECT *<br>FROM arrivals_departures<br>WHERE stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)<br>AND t_departure >= '2025-05-27T07:10+02' AND t_departure <= '2025-05-27T07:30+02'<br>AND date >= dates_filter_min('2025-05-27T07:10+02')<br>AND date <= dates_filter_max('2025-05-27T07:30+02')</pre> | 83.79 | 64.57 | 82.15 | 84.64 | 85.83 | 91.36 | 95.79 | 97.08 | 120 |
+| <pre>SELECT *<br>FROM arrivals_departures<br>WHERE trip_id = '262623609' -- route_id=10144_109, route_short_name=S2<br>AND date = '2025-05-27'</pre> | 14.25 | 12.38 | 13.42 | 13.98 | 14.84 | 16.12 | 18.98 | 21.77 | 702 |
+| <pre>SELECT count(*)<br>FROM arrivals_departures<br>WHERE stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)</pre> | 70.9 | 67.54 | 69.09 | 70.1 | 72.47 | 75.73 | 77.24 | 78.83 | 142 |
+| <pre>SELECT count(*)<br>FROM arrivals_departures<br>WHERE stop_id = 'definitely-non-existent'</pre> | 23.61 | 20.31 | 21.97 | 22.67 | 24.84 | 27.51 | 30.78 | 40.43 | 424 |
+| <pre>SELECT *<br>FROM arrivals_departures<br>WHERE t_departure >= '2025-05-27T07:10+02' AND t_departure <= '2025-05-27T07:30+02'<br>AND date >= '2025-05-25'<br>AND date <= '2025-05-27'</pre> | 1269.86 | 1139.03 | 1254.52 | 1272.09 | 1318.94 | 1329.66 | 1331.44 | 1331.89 | 8 |
+| <pre>SELECT *<br>FROM arrivals_departures<br>WHERE t_departure >= '2025-05-27T07:10:00+02' AND t_departure <= '2025-05-27T07:30:00+02'<br>AND "date" >= dates_filter_min('2025-05-27T07:10:00+02'::timestamp with time zone)<br>AND "date" <= dates_filter_max('2025-05-27T07:30:00+02'::timestamp with time zone)</pre> | 34148.21 | 32101.25 | 33459.12 | 34816.99 | 35171.69 | 35455.44 | 35512.2 | 35526.38 | 3 |
+| <pre>SELECT *<br>FROM connections<br>WHERE route_short_name = 'S1'<br>AND t_departure >= '2025-05-27T07:10+02' AND t_departure <= '2025-05-27T07:30+02'<br>AND date >= dates_filter_min('2025-05-27T07:10+02')<br>AND date <= dates_filter_max('2025-05-27T07:30+02')</pre> | 8697.84 | 8629.78 | 8673.26 | 8716.73 | 8731.86 | 8743.96 | 8746.39 | 8746.99 | 3 |
+| <pre>SELECT *<br>FROM connections<br>WHERE from_station_id = 'de:11000:900194006' -- S Schöneweide/Sterndamm (Berlin)<br>AND t_departure >= '2025-05-27T07:10+02' AND t_departure <= '2025-05-27T07:30+02'<br>AND date >= dates_filter_min('2025-05-27T07:10+02')<br>AND date <= dates_filter_max('2025-05-27T07:30+02')</pre> | 1154.01 | 1070.8 | 1115.77 | 1156.47 | 1168.38 | 1243.5 | 1281.37 | 1290.84 | 9 |
+| <pre>SELECT *<br>FROM connections<br>WHERE from_station_id = 'de:11000:900194006' -- S Schöneweide/Sterndamm (Berlin)<br>AND t_departure >= '2025-05-27T07:10+02' AND t_departure <= '2025-05-27T07:30+02'<br>AND date >= dates_filter_min('2025-05-27T07:10+02')<br>AND date <= dates_filter_max('2025-05-27T07:30+02')<br>AND from_stop_sequence_consec = 0</pre> | 482.23 | 454.29 | 466.55 | 467.45 | 475.64 | 555.32 | 571.05 | 574.98 | 21 |
+| <pre>SELECT *<br>FROM connections<br>WHERE from_stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)<br>AND t_departure >= '2025-05-27T07:10+02' AND t_departure <= '2025-05-27T07:30+02'<br>AND date >= dates_filter_min('2025-05-27T07:10+02')<br>AND date <= dates_filter_max('2025-05-27T07:30+02')</pre> | 885.14 | 835.29 | 869.24 | 875.76 | 909.79 | 922.32 | 923.64 | 923.97 | 12 |
+| <pre>SELECT *<br>FROM connections<br>WHERE trip_id = '262535123' -- route_id=17452_900 (M4)<br>AND date >= '2025-05-26' AND date <= '2025-06-01'</pre> | 19.31 | 15.83 | 18.02 | 18.99 | 20.27 | 22.76 | 24.78 | 27.96 | 519 |
+| <pre>SELECT count(*)<br>FROM connections<br>WHERE from_stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)</pre> | 341.42 | 263.96 | 340.65 | 346.83 | 350.72 | 355.91 | 358.76 | 359.65 | 30 |
+| <pre>SELECT count(*)<br>FROM connections<br>WHERE from_stop_id = 'definitely-non-existent'</pre> | 343.5 | 314.1 | 319.13 | 345.04 | 354.63 | 362.52 | 463.4 | 503.94 | 30 |
+| <pre>SELECT *<br>FROM connections<br>WHERE t_departure >= '2025-05-27T07:10+02' AND t_departure <= '2025-05-27T07:30+02'<br>AND date >= dates_filter_min('2025-05-27T07:10+02'::timestamp with time zone)<br>AND date <= dates_filter_max('2025-05-27T07:30+02'::timestamp with time zone)<br>ORDER BY t_departure<br>LIMIT 100</pre> | 1013055.35 | 986377.24 | 1026394.41 | 1009900.4 | 1026394.41 | 992028.36 | 1042228.66 | 1042888.42 | 3 |
+| <pre>SELECT *<br>FROM connections<br>WHERE t_departure >= '2025-05-27T07:10+02' AND t_departure <= '2025-05-27T07:30+02'<br>AND date >= '2025-05-25' AND date <= '2025-05-27'<br>ORDER BY t_departure<br>LIMIT 100</pre> | 16347.21 | 16250.36 | 16285.17 | 16319.98 | 16395.63 | 16456.16 | 16468.27 | 16471.29 | 3 |
+| <pre>SELECT *<br>FROM stats_by_route_date<br>WHERE route_id = '17452_900' -- M4<br>AND date >= '2025-05-26' AND date <= '2025-06-01'<br>AND is_effective = true</pre> | 4765.59 | 4704.49 | 4706.87 | 4709.25 | 4796.14 | 4865.64 | 4879.54 | 4883.02 | 3 |
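As a rough way to re-check individual numbers from the benchmark table above (not the harness that produced it), a single query can be timed by hand with the DuckDB CLI against the `vbb-2025-05-21.gtfs.duckdb` file that `benchmark/init.sh` creates. This is only a sketch: the CLI binary name (`duckdb`) and the exact `.timer` output format are assumptions that may differ on your setup.

```sh
# Illustrative only: time one benchmark query by hand.
# Assumes benchmark/init.sh has already built vbb-2025-05-21.gtfs.duckdb.
duckdb vbb-2025-05-21.gtfs.duckdb <<'SQL'
INSTALL spatial; -- no-op if the spatial extension is already installed
LOAD spatial;
.timer on
SELECT *
FROM arrivals_departures
WHERE trip_id = '262623609' -- route_id=10144_109, route_short_name=S2
AND date = '2025-05-27';
SQL
```

## Related Projects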