
Commit 32092bf

DuckDB rewrite: port import logic to DuckDB, adapt tests, docs & CI πŸ’₯πŸ“βœ…πŸ’š
1 parent: a0de79d


53 files changed: +2404 -2059 lines

.github/workflows/test.yml

Lines changed: 2 additions & 29 deletions
@@ -19,11 +19,6 @@ jobs:
         node-version:
         - '22.x'
         - '24.x'
-        postgis-docker-tag:
-        - '14-3.5-alpine'
-        - '15-3.5-alpine'
-        - '16-3.5-alpine'
-        - '17-3.5-alpine'
 
     steps:
     - name: checkout
@@ -32,35 +27,13 @@ jobs:
       uses: actions/setup-node@v4
       with:
         node-version: ${{ matrix.node-version }}
-    - name: install sponge (moreutils)
-      run: sudo apt install -y moreutils
 
-    - name: install & start PostgreSQL with PostGIS
-      # todo: currently, it uses mdillon, which doesn't have PostgreSQL 14
-      # uses: huaxk/postgis-action@v1
-      # with:
-      #   postgresql version: '${{ matrix.postgis-docker-tag }}'
-      #   postgresql password: password
-      #   postgresql user: postgres
-      #   postgresql db: postgres
+    - name: install DuckDB
       run: |
-        docker run -d \
-          -e POSTGRES_USER=$PGUSER -e POSTGRES_PASSWORD=$PGPASSWORD -e POSTGRES_DB=$PGDATABASE \
-          -p 5432:5432 postgis/postgis:${{ matrix.postgis-docker-tag }} \
-          -c timezone=Europe/Berlin
-      env:
-        PGUSER: postgres
-        PGPASSWORD: password
-        PGDATABASE: postgres
+        curl -U '${{ github.repository }} CI' 'https://install.duckdb.org' | sh
 
     - run: npm install
 
     - run: npm run lint
     - name: npm test
       run: npm test
-      env:
-        PGHOST: localhost
-        PGPORT: '5432'
-        PGUSER: postgres
-        PGPASSWORD: password
-        PGDATABASE: postgres

.gitignore

Lines changed: 5 additions & 0 deletions
@@ -10,8 +10,13 @@ pnpm-debug.log
 /shrinkwrap.yaml
 
 /test/amtrak-gtfs-2021-10-06
+/test/*.duckdb
 
 /*.gtfs
 /*.gtfs.zip
 /*.gtfs.tar.gz
 /*.gtfs.tar.zst
+
+/*.duckdb
+/*.duckdb.gz
+/*.duckdb.br

Dockerfile

Lines changed: 1 addition & 6 deletions
@@ -9,16 +9,11 @@ LABEL org.opencontainers.image.licenses="(Apache-2.0 AND Prosperity-3.0.0)"
 
 WORKDIR /app
 
-# Both moreutils (providing sponge) and postgresql-client (providing psql) are not required but come in handy for users.
-RUN apk add --no-cache \
-	postgresql-client \
-	moreutils
-
 ADD package.json /app
 RUN npm install --production && npm cache clean --force
 
 ADD . /app
-RUN ln -s /app/cli.js /usr/local/bin/gtfs-via-postgres
+RUN ln -s /app/cli.js /usr/local/bin/gtfs-via-duckdb
 
 VOLUME /gtfs
 WORKDIR /gtfs

benchmark/arrs_deps_by_route_name_and_time.sql

Lines changed: 2 additions & 2 deletions
@@ -2,5 +2,5 @@ SELECT *
 FROM arrivals_departures
 WHERE route_short_name = 'S1'
 AND t_departure >= '2022-08-09T07:10:00+02' AND t_departure <= '2022-08-09T07:30:00+02'
-AND date >= dates_filter_min('2022-08-09T07:10:00+02')
-AND date <= dates_filter_max('2022-08-09T07:30:00+02')
+AND date >= dates_filter_min('2022-08-09T07:10:00+02'::timestamp with time zone)
+AND date <= dates_filter_max('2022-08-09T07:30:00+02'::timestamp with time zone)
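
The new explicit casts pin down the macros' argument type: DuckDB apparently no longer accepts the bare string literal the way PostgreSQL did, so the benchmark now passes a proper timestamp with time zone. A minimal sketch of exercising such a query from Node.js, using the @duckdb/node-api client this commit adopts; the database file name vbb.duckdb is an assumption, and the sketch presumes the imported file defines the dates_filter_min() macro:

// Sketch only: assumes ./vbb.duckdb was produced by this tool and thus
// defines the dates_filter_min() macro used in the query above.
const {DuckDBInstance} = require('@duckdb/node-api')

;(async () => {
	const instance = await DuckDBInstance.create('./vbb.duckdb', {
		access_mode: 'READ_ONLY',
	})
	const db = await instance.connect()
	// The explicit cast makes the macro receive a TIMESTAMP WITH TIME ZONE
	// instead of a VARCHAR literal.
	const reader = await db.runAndReadAll(`
		SELECT dates_filter_min('2022-08-09T07:10:00+02'::timestamp with time zone) AS min_date
	`)
	console.log(reader.getRowObjects())
})()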

benchmark/as-md.js

Lines changed: 24 additions & 32 deletions
@@ -1,39 +1,31 @@
 #!/usr/bin/env node
 
-const {pipeline, Transform} = require('stream')
-const csvParser = require('csv-parser')
-const {ok} = require('assert')
+const {createInterface} = require('node:readline')
 
-let firstRow = true
+const linewise = createInterface({
+	input: process.stdin,
+	// Note: We use the crlfDelay option to recognize all instances of CR LF as a single line break.
+	crlfDelay: Infinity,
+})
 
-pipeline(
-	process.stdin,
-	csvParser(),
-	new Transform({
-		objectMode: true,
-		transform: function (row, _, cb) {
-			if (firstRow) {
-				firstRow = false
+;(async () => {
+	let firstRow = true
+	for await (const line of linewise) {
+		const row = JSON.parse(line)
 
-				const keys = Object.keys(row).filter(key => key !== 'filename')
-				process.stdout.write(`| ${keys.join(' | ')} |\n`)
-				process.stdout.write(`| ${keys.map(_ => '-').join(' | ')} |\n`)
-			}
+		if (firstRow) {
+			firstRow = false
 
-			const formattedVals = Object.entries(row)
-				.map(([key, val]) => {
-					if (key === 'query') return '<pre>' + val.replace(/\n/g, '<br>') + '</pre>'
-					return val
-				})
-			process.stdout.write(`| ${formattedVals.join(' | ')} |\n`)
+			const keys = Object.keys(row).filter(key => key !== 'filename')
+			process.stdout.write(`| ${keys.join(' | ')} |\n`)
+			process.stdout.write(`| ${keys.map(_ => '-').join(' | ')} |\n`)
+		}
 
-			cb()
-		},
-	}),
-	process.stdout,
-	(err) => {
-		if (!err) return;
-		console.error(err)
-		process.exit(1)
-	},
-)
+		const formattedVals = Object.entries(row)
+			.map(([key, val]) => {
+				if (key === 'query') return '<pre>' + val.replace(/\n/g, '<br>') + '</pre>'
+				return typeof val === 'number' && !Number.isInteger(val) ? Math.round(val * 100) / 100 : val
+			})
+		process.stdout.write(`| ${formattedVals.join(' | ')} |\n`)
+	}
+})()
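
For illustration (not part of the commit): the rewritten script reads NDJSON from stdin, one benchmark result per line as printed by benchmark/index.cjs, and emits one Markdown table row per result. A hypothetical round-trip of its formatting step:

// Hypothetical input line, mimicking a benchmark/index.cjs result.
const row = JSON.parse('{"query":"SELECT 1","avg":12.345}')
// Formatting logic copied from the script above: queries become <pre> cells,
// non-integer numbers are rounded to two decimal places.
const formattedVals = Object.entries(row)
	.map(([key, val]) => {
		if (key === 'query') return '<pre>' + val.replace(/\n/g, '<br>') + '</pre>'
		return typeof val === 'number' && !Number.isInteger(val) ? Math.round(val * 100) / 100 : val
	})
process.stdout.write(`| ${formattedVals.join(' | ')} |\n`)
// prints: | <pre>SELECT 1</pre> | 12.35 |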

benchmark/index.cjs

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
+#!/usr/bin/env node
+
+const {parseArgs} = require('node:util')
+const {readFile} = require('node:fs/promises')
+const {DuckDBInstance} = require('@duckdb/node-api')
+const {Bench: Benchmark} = require('tinybench')
+const {basename} = require('node:path')
+
+// adapted from https://stackoverflow.com/a/55297611/1072129
+const quantile = (sorted, q) => {
+	const pos = (sorted.length - 1) * q
+	const base = Math.floor(pos)
+	const rest = pos - base
+	if (base + 1 < sorted.length) {
+		return sorted[base] + rest * (sorted[base + 1] - sorted[base])
+	} else {
+		return sorted[base]
+	}
+}
+
+const {
+	values: flags,
+	positionals: args,
+} = parseArgs({
+	options: {
+		'help': {
+			type: 'boolean',
+			short: 'h',
+		},
+	},
+	allowPositionals: true,
+})
+
+if (flags.help) {
+	process.stdout.write(`
+Usage:
+    benchmark [options] [--] <db-file> <sql-file> ...
+\n`)
+	process.exit(0)
+}
+
+;(async () => {
+
+const [pathToDb, ...queryFiles] = args
+if (!pathToDb) {
+	console.error('you must pass the path to a DuckDB db file')
+	process.exit(1)
+}
+if (queryFiles.length === 0) {
+	console.error('you must pass >0 SQL files')
+	process.exit(1)
+}
+const instance = await DuckDBInstance.create(pathToDb, {
+	access_mode: 'READ_ONLY',
+})
+const db = await instance.connect()
+
+await db.run(`\
+INSTALL spatial;
+LOAD spatial;
+`)
+
+const queriesByName = new Map()
+const benchmark = new Benchmark({
+	// - The default minimum number of iterations is too high.
+	// - The default minimum time is too low.
+	warmup: true,
+	warmupIterations: 1,
+	warmupTime: 5000, // 5s
+	iterations: 3,
+	time: 10000, // 10s
+})
+await Promise.all(
+	queryFiles
+	.filter(queryFile => queryFile.slice(-9) !== '.skip.sql')
+	.map(async (queryFile) => {
+		const name = basename(queryFile)
+		const query = await readFile(queryFile, {encoding: 'utf8'})
+		queriesByName.set(name, query)
+		benchmark.add(name, async () => {
+			await db.run(query)
+		})
+	}),
+)
+
+// do all queries once, to make sure they work
+for (const [name, query] of queriesByName.entries()) {
+	try {
+		await db.run(query)
+	} catch (err) {
+		err.benchmark = name
+		err.query = query
+		throw err
+	}
+}
+
+benchmark.addEventListener('cycle', (ev) => {
+	const {task} = ev
+	const query = queriesByName.get(task.name)
+	if ('error' in task.result) {
+		console.error(task.result)
+		process.exit(1)
+	}
+	const samples = Array.from(task.result.samples).sort()
+	console.log(JSON.stringify({
+		query,
+		avg: task.result.latency.mean,
+		min: task.result.latency.min,
+		p25: quantile(samples, .25),
+		p50: task.result.latency.p50,
+		p75: task.result.latency.p75,
+		p95: quantile(samples, .95),
+		p99: task.result.latency.p99,
+		max: task.result.latency.max,
+		iterations: task.result.samples.length,
+	}))
+})
+
+await benchmark.run()
+
+})()
+.catch((err) => {
+	console.error(err)
+	process.exit(1)
+})
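
Per its usage text, the runner is invoked as benchmark [options] [--] <db-file> <sql-file> .... Its quantile() helper linearly interpolates between neighbouring samples and expects input sorted in ascending numeric order; note that Array.prototype.sort() without a comparator sorts lexicographically, so an explicit (a, b) => a - b comparator would be needed for strictly correct p25/p95 values. A worked example of the interpolation, with the helper copied verbatim from the file above:

const assert = require('node:assert')
// Copied from benchmark/index.cjs above.
const quantile = (sorted, q) => {
	const pos = (sorted.length - 1) * q
	const base = Math.floor(pos)
	const rest = pos - base
	if (base + 1 < sorted.length) {
		return sorted[base] + rest * (sorted[base + 1] - sorted[base])
	} else {
		return sorted[base]
	}
}
// For [1, 2, 3, 4] and q = 0.25: pos = 3 * 0.25 = 0.75, base = 0, rest = 0.75,
// so the result is 1 + 0.75 * (2 - 1) = 1.75.
assert.strictEqual(quantile([1, 2, 3, 4], 0.25), 1.75)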

benchmark/index.sql

Lines changed: 0 additions & 110 deletions
This file was deleted.

benchmark/init.sh

Lines changed: 2 additions & 1 deletion
@@ -12,4 +12,5 @@ env | grep '^PG' || true
 
 ../cli.js -d \
 	--stops-location-index --stats-by-route-date=view \
-	../vbb-2022-07-01.gtfs/*.csv | sponge | psql -b
+	vbb-2022-07-01.gtfs.duckdb \
+	../vbb-2022-07-01.gtfs/*.csv
