From 9a6a6f09d40c38e2a9db5911ff5c40fe4ee0d2d9 Mon Sep 17 00:00:00 2001 From: 0xf1e Date: Tue, 23 Sep 2025 06:36:28 +0200 Subject: [PATCH 01/13] feat: add secret protection --- .gitignore | 2 + .pre-commit-config.yaml | 7 +++ .secrets.baseline | 131 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 140 insertions(+) create mode 100644 .gitignore create mode 100644 .pre-commit-config.yaml create mode 100644 .secrets.baseline diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d99c904 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.env +.token diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..7ffa266 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,7 @@ +repos: + - repo: https://github.com/Yelp/detect-secrets + rev: v1.5.0 + hooks: + - id: detect-secrets + args: ['--baseline', '.secrets.baseline'] + additional_dependencies: ['gibberish-detector'] diff --git a/.secrets.baseline b/.secrets.baseline new file mode 100644 index 0000000..9223b7a --- /dev/null +++ b/.secrets.baseline @@ -0,0 +1,131 @@ +{ + "version": "1.5.0", + "plugins_used": [ + { + "name": "ArtifactoryDetector" + }, + { + "name": "AWSKeyDetector" + }, + { + "name": "AzureStorageKeyDetector" + }, + { + "name": "Base64HighEntropyString", + "limit": 4.5 + }, + { + "name": "BasicAuthDetector" + }, + { + "name": "CloudantDetector" + }, + { + "name": "DiscordBotTokenDetector" + }, + { + "name": "GitHubTokenDetector" + }, + { + "name": "GitLabTokenDetector" + }, + { + "name": "HexHighEntropyString", + "limit": 3.0 + }, + { + "name": "IbmCloudIamDetector" + }, + { + "name": "IbmCosHmacDetector" + }, + { + "name": "IPPublicDetector" + }, + { + "name": "JwtTokenDetector" + }, + { + "name": "KeywordDetector", + "keyword_exclude": "" + }, + { + "name": "MailchimpDetector" + }, + { + "name": "NpmDetector" + }, + { + "name": "OpenAIDetector" + }, + { + "name": "PrivateKeyDetector" + }, + { + "name": "PypiTokenDetector" + }, + { + "name": "SendGridDetector" + }, + { + "name": "SlackDetector" + }, + { + "name": "SoftlayerDetector" + }, + { + "name": "SquareOAuthDetector" + }, + { + "name": "StripeDetector" + }, + { + "name": "TelegramBotTokenDetector" + }, + { + "name": "TwilioKeyDetector" + } + ], + "filters_used": [ + { + "path": "detect_secrets.filters.allowlist.is_line_allowlisted" + }, + { + "path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies", + "min_level": 2 + }, + { + "path": "detect_secrets.filters.gibberish.should_exclude_secret", + "limit": 3.7 + }, + { + "path": "detect_secrets.filters.heuristic.is_indirect_reference" + }, + { + "path": "detect_secrets.filters.heuristic.is_likely_id_string" + }, + { + "path": "detect_secrets.filters.heuristic.is_lock_file" + }, + { + "path": "detect_secrets.filters.heuristic.is_not_alphanumeric_string" + }, + { + "path": "detect_secrets.filters.heuristic.is_potential_uuid" + }, + { + "path": "detect_secrets.filters.heuristic.is_prefixed_with_dollar_sign" + }, + { + "path": "detect_secrets.filters.heuristic.is_sequential_string" + }, + { + "path": "detect_secrets.filters.heuristic.is_swagger_file" + }, + { + "path": "detect_secrets.filters.heuristic.is_templated_secret" + } + ], + "results": {}, + "generated_at": "2025-09-23T04:15:11Z" +} From 9ee423088d49030aa6ff3986bcb543183656e5ae Mon Sep 17 00:00:00 2001 From: 0xf1e Date: Tue, 23 Sep 2025 07:07:01 +0200 Subject: [PATCH 02/13] feat: add rudimentary pipeline script --- .gitignore | 2 ++ pipeline.sh | 11 +++++++++++ 2 files changed, 13 insertions(+) create mode 100755 pipeline.sh diff --git a/.gitignore b/.gitignore index d99c904..452734c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ .env .token +.local +.aider* diff --git a/pipeline.sh b/pipeline.sh new file mode 100755 index 0000000..dd371c2 --- /dev/null +++ b/pipeline.sh @@ -0,0 +1,11 @@ +mkdir -p .local/raw_data .local/metrics + +chmod o+w .local/raw_data .local/metrics + +podman run --rm \ + -e GITHUB_TOKEN_FILE=/app/token.txt \ + -v ./.token:/app/token.txt \ + -v $(pwd)/.local/raw_data:/app/output \ + codeberg.org/0xf1e/project-health-analyzer:latest + +podman run --rm -it --name duckdb-importer -v "$(pwd)"/.local/raw_data:/raw_data -v "$(pwd)"/.local/metrics:/metrics docker.io/duckdb/duckdb duckdb /metrics/project_health.db -c "INSTALL json; LOAD json; CREATE OR REPLACE TABLE commits AS SELECT * FROM read_json_auto('/raw_data/commits.jsonl');" From cdd20e4dbea06a9befced418699893c8bdf5c8bc Mon Sep 17 00:00:00 2001 From: "0xf1e (aider)" Date: Tue, 23 Sep 2025 07:10:08 +0200 Subject: [PATCH 03/13] feat: load all JSONL files from raw_data into corresponding tables --- pipeline.sh | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pipeline.sh b/pipeline.sh index dd371c2..e5ea287 100755 --- a/pipeline.sh +++ b/pipeline.sh @@ -8,4 +8,13 @@ podman run --rm \ -v $(pwd)/.local/raw_data:/app/output \ codeberg.org/0xf1e/project-health-analyzer:latest -podman run --rm -it --name duckdb-importer -v "$(pwd)"/.local/raw_data:/raw_data -v "$(pwd)"/.local/metrics:/metrics docker.io/duckdb/duckdb duckdb /metrics/project_health.db -c "INSTALL json; LOAD json; CREATE OR REPLACE TABLE commits AS SELECT * FROM read_json_auto('/raw_data/commits.jsonl');" +podman run --rm -it --name duckdb-importer -v "$(pwd)"/.local/raw_data:/raw_data -v "$(pwd)"/.local/metrics:/metrics docker.io/duckdb/duckdb duckdb /metrics/project_health.db -c " + INSTALL json; + LOAD json; + CREATE OR REPLACE TABLE commits AS SELECT * FROM read_json_auto('/raw_data/commits.jsonl'); + CREATE OR REPLACE TABLE license AS SELECT * FROM read_json_auto('/raw_data/license.jsonl'); + CREATE OR REPLACE TABLE contributors AS SELECT * FROM read_json_auto('/raw_data/contributors.jsonl'); + CREATE OR REPLACE TABLE releases AS SELECT * FROM read_json_auto('/raw_data/releases.jsonl'); + CREATE OR REPLACE TABLE issues AS SELECT * FROM read_json_auto('/raw_data/issues.jsonl'); + CREATE OR REPLACE TABLE root_md_files AS SELECT * FROM read_json_auto('/raw_data/root_md_files.jsonl'); +" From 5339f563404db7004f8d1121c8faf5cb622e39bf Mon Sep 17 00:00:00 2001 From: "0xf1e (aider)" Date: Tue, 23 Sep 2025 07:11:28 +0200 Subject: [PATCH 04/13] refactor: prepend 'raw_' to each table name in duckdb import script --- pipeline.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pipeline.sh b/pipeline.sh index e5ea287..ef419bd 100755 --- a/pipeline.sh +++ b/pipeline.sh @@ -11,10 +11,10 @@ podman run --rm \ podman run --rm -it --name duckdb-importer -v "$(pwd)"/.local/raw_data:/raw_data -v "$(pwd)"/.local/metrics:/metrics docker.io/duckdb/duckdb duckdb /metrics/project_health.db -c " INSTALL json; LOAD json; - CREATE OR REPLACE TABLE commits AS SELECT * FROM read_json_auto('/raw_data/commits.jsonl'); - CREATE OR REPLACE TABLE license AS SELECT * FROM read_json_auto('/raw_data/license.jsonl'); - CREATE OR REPLACE TABLE contributors AS SELECT * FROM read_json_auto('/raw_data/contributors.jsonl'); - CREATE OR REPLACE TABLE releases AS SELECT * FROM read_json_auto('/raw_data/releases.jsonl'); - CREATE OR REPLACE TABLE issues AS SELECT * FROM read_json_auto('/raw_data/issues.jsonl'); - CREATE OR REPLACE TABLE root_md_files AS SELECT * FROM read_json_auto('/raw_data/root_md_files.jsonl'); + CREATE OR REPLACE TABLE raw_commits AS SELECT * FROM read_json_auto('/raw_data/commits.jsonl'); + CREATE OR REPLACE TABLE raw_license AS SELECT * FROM read_json_auto('/raw_data/license.jsonl'); + CREATE OR REPLACE TABLE raw_contributors AS SELECT * FROM read_json_auto('/raw_data/contributors.jsonl'); + CREATE OR REPLACE TABLE raw_releases AS SELECT * FROM read_json_auto('/raw_data/releases.jsonl'); + CREATE OR REPLACE TABLE raw_issues AS SELECT * FROM read_json_auto('/raw_data/issues.jsonl'); + CREATE OR REPLACE TABLE raw_root_md_files AS SELECT * FROM read_json_auto('/raw_data/root_md_files.jsonl'); " From 99da562fbada6289446a6e065a90ce5d5fd0c11c Mon Sep 17 00:00:00 2001 From: "0xf1e (aider)" Date: Tue, 23 Sep 2025 07:22:03 +0200 Subject: [PATCH 05/13] feat: move SQL script to separate pipeline.sql file for duckdb loading --- pipeline.sh | 15 +++++---------- pipeline.sql | 8 ++++++++ 2 files changed, 13 insertions(+), 10 deletions(-) create mode 100644 pipeline.sql diff --git a/pipeline.sh b/pipeline.sh index ef419bd..744300f 100755 --- a/pipeline.sh +++ b/pipeline.sh @@ -8,13 +8,8 @@ podman run --rm \ -v $(pwd)/.local/raw_data:/app/output \ codeberg.org/0xf1e/project-health-analyzer:latest -podman run --rm -it --name duckdb-importer -v "$(pwd)"/.local/raw_data:/raw_data -v "$(pwd)"/.local/metrics:/metrics docker.io/duckdb/duckdb duckdb /metrics/project_health.db -c " - INSTALL json; - LOAD json; - CREATE OR REPLACE TABLE raw_commits AS SELECT * FROM read_json_auto('/raw_data/commits.jsonl'); - CREATE OR REPLACE TABLE raw_license AS SELECT * FROM read_json_auto('/raw_data/license.jsonl'); - CREATE OR REPLACE TABLE raw_contributors AS SELECT * FROM read_json_auto('/raw_data/contributors.jsonl'); - CREATE OR REPLACE TABLE raw_releases AS SELECT * FROM read_json_auto('/raw_data/releases.jsonl'); - CREATE OR REPLACE TABLE raw_issues AS SELECT * FROM read_json_auto('/raw_data/issues.jsonl'); - CREATE OR REPLACE TABLE raw_root_md_files AS SELECT * FROM read_json_auto('/raw_data/root_md_files.jsonl'); -" +podman run --rm -it --name duckdb-importer \ + -v "$(pwd)"/.local/raw_data:/raw_data \ + -v "$(pwd)"/.local/metrics:/metrics \ + -v "$(pwd)"/pipeline.sql:/app/pipeline.sql \ + docker.io/duckdb/duckdb duckdb /metrics/project_health.db -init /app/pipeline.sql diff --git a/pipeline.sql b/pipeline.sql new file mode 100644 index 0000000..2b9eb87 --- /dev/null +++ b/pipeline.sql @@ -0,0 +1,8 @@ +INSTALL json; +LOAD json; +CREATE OR REPLACE TABLE raw_commits AS SELECT * FROM read_json_auto('/raw_data/commits.jsonl'); +CREATE OR REPLACE TABLE raw_license AS SELECT * FROM read_json_auto('/raw_data/license.jsonl'); +CREATE OR REPLACE TABLE raw_contributors AS SELECT * FROM read_json_auto('/raw_data/contributors.jsonl'); +CREATE OR REPLACE TABLE raw_releases AS SELECT * FROM read_json_auto('/raw_data/releases.jsonl'); +CREATE OR REPLACE TABLE raw_issues AS SELECT * FROM read_json_auto('/raw_data/issues.jsonl'); +CREATE OR REPLACE TABLE raw_root_md_files AS SELECT * FROM read_json_auto('/raw_data/root_md_files.jsonl'); From 7e97f82e176a45833b25380feec353fb2e89baf2 Mon Sep 17 00:00:00 2001 From: "0xf1e (aider)" Date: Tue, 23 Sep 2025 07:23:14 +0200 Subject: [PATCH 06/13] fix: use -f instead of --init for duckdb command --- pipeline.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline.sh b/pipeline.sh index 744300f..cb884b9 100755 --- a/pipeline.sh +++ b/pipeline.sh @@ -12,4 +12,4 @@ podman run --rm -it --name duckdb-importer \ -v "$(pwd)"/.local/raw_data:/raw_data \ -v "$(pwd)"/.local/metrics:/metrics \ -v "$(pwd)"/pipeline.sql:/app/pipeline.sql \ - docker.io/duckdb/duckdb duckdb /metrics/project_health.db -init /app/pipeline.sql + docker.io/duckdb/duckdb duckdb /metrics/project_health.db -f /app/pipeline.sql From 0300f460ff5caf0d254da0083c808d4424bdf62a Mon Sep 17 00:00:00 2001 From: "0xf1e (aider)" Date: Tue, 23 Sep 2025 07:30:45 +0200 Subject: [PATCH 07/13] feat: add Kubernetes pod declaration for project health pipeline --- pipeline.yaml | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 pipeline.yaml diff --git a/pipeline.yaml b/pipeline.yaml new file mode 100644 index 0000000..e930f60 --- /dev/null +++ b/pipeline.yaml @@ -0,0 +1,52 @@ +apiVersion: v1 +kind: Pod +metadata: + name: project-health-pipeline +spec: + volumes: + - name: raw-data-volume + hostPath: + path: ./.local/raw_data + type: Directory + - name: metrics-volume + hostPath: + path: ./.local/metrics + type: Directory + - name: token-volume + hostPath: + path: ./.token + type: File + - name: sql-volume + hostPath: + path: ./pipeline.sql + type: File + containers: + - name: health-analyzer + image: codeberg.org/0xf1e/project-health-analyzer:latest + env: + - name: GITHUB_TOKEN_FILE + value: /app/token.txt + volumeMounts: + - name: raw-data-volume + mountPath: /app/output + - name: token-volume + mountPath: /app/token.txt + subPath: token.txt + resources: {} + - name: duckdb-importer + image: docker.io/duckdb/duckdb + command: + - duckdb + - /metrics/project_health.db + - -f + - /app/pipeline.sql + volumeMounts: + - name: raw-data-volume + mountPath: /raw_data + - name: metrics-volume + mountPath: /metrics + - name: sql-volume + mountPath: /app/pipeline.sql + subPath: pipeline.sql + resources: {} + restartPolicy: Never From 1a5f9a18cb97b491cf72ab42071b433ff91de732 Mon Sep 17 00:00:00 2001 From: "0xf1e (aider)" Date: Tue, 23 Sep 2025 07:31:05 +0200 Subject: [PATCH 08/13] fix: move health-analyzer to init container to run first --- pipeline.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipeline.yaml b/pipeline.yaml index e930f60..99617f4 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -20,7 +20,7 @@ spec: hostPath: path: ./pipeline.sql type: File - containers: + initContainers: - name: health-analyzer image: codeberg.org/0xf1e/project-health-analyzer:latest env: @@ -33,6 +33,7 @@ spec: mountPath: /app/token.txt subPath: token.txt resources: {} + containers: - name: duckdb-importer image: docker.io/duckdb/duckdb command: From 87145a46165477ba1eed7f36f3cddb14cf5ea759 Mon Sep 17 00:00:00 2001 From: 0xf1e Date: Tue, 23 Sep 2025 07:33:21 +0200 Subject: [PATCH 09/13] fix: move volumes definition to correct location in pipeline.yaml --- pipeline.yaml | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/pipeline.yaml b/pipeline.yaml index 99617f4..c444030 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -3,23 +3,6 @@ kind: Pod metadata: name: project-health-pipeline spec: - volumes: - - name: raw-data-volume - hostPath: - path: ./.local/raw_data - type: Directory - - name: metrics-volume - hostPath: - path: ./.local/metrics - type: Directory - - name: token-volume - hostPath: - path: ./.token - type: File - - name: sql-volume - hostPath: - path: ./pipeline.sql - type: File initContainers: - name: health-analyzer image: codeberg.org/0xf1e/project-health-analyzer:latest @@ -51,3 +34,20 @@ spec: subPath: pipeline.sql resources: {} restartPolicy: Never + volumes: + - name: raw-data-volume + hostPath: + path: ./.local/raw_data + type: Directory + - name: metrics-volume + hostPath: + path: ./.local/metrics + type: Directory + - name: token-volume + hostPath: + path: ./.token + type: File + - name: sql-volume + hostPath: + path: ./pipeline.sql + type: File From 1908bdc1a328993153f938a7b4520f5e87c51010 Mon Sep 17 00:00:00 2001 From: "0xf1e (aider)" Date: Tue, 23 Sep 2025 07:35:29 +0200 Subject: [PATCH 10/13] fix: remove incorrect subPath from volume mounts in pipeline.yaml --- pipeline.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/pipeline.yaml b/pipeline.yaml index c444030..dfaf3dc 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -14,7 +14,6 @@ spec: mountPath: /app/output - name: token-volume mountPath: /app/token.txt - subPath: token.txt resources: {} containers: - name: duckdb-importer @@ -31,7 +30,6 @@ spec: mountPath: /metrics - name: sql-volume mountPath: /app/pipeline.sql - subPath: pipeline.sql resources: {} restartPolicy: Never volumes: From 9ce1ddff075437bedfd668359df6c2d9749c6ae7 Mon Sep 17 00:00:00 2001 From: "0xf1e (aider)" Date: Tue, 23 Sep 2025 07:39:09 +0200 Subject: [PATCH 11/13] feat: convert existing containers to initContainers and add nginx todo container --- pipeline.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pipeline.yaml b/pipeline.yaml index dfaf3dc..24d1c6e 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -15,7 +15,6 @@ spec: - name: token-volume mountPath: /app/token.txt resources: {} - containers: - name: duckdb-importer image: docker.io/duckdb/duckdb command: @@ -31,6 +30,12 @@ spec: - name: sql-volume mountPath: /app/pipeline.sql resources: {} + containers: + - name: todo + image: nginx:latest + ports: + - containerPort: 80 + resources: {} restartPolicy: Never volumes: - name: raw-data-volume From ef9d814e838d8faa66485905fd0f717e88dfbedd Mon Sep 17 00:00:00 2001 From: 0xf1e Date: Tue, 23 Sep 2025 07:55:28 +0200 Subject: [PATCH 12/13] feat: pipeline.sh runs podman kube play --- pipeline.sh | 12 ++---------- pipeline.yaml | 3 --- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/pipeline.sh b/pipeline.sh index cb884b9..15b28a5 100755 --- a/pipeline.sh +++ b/pipeline.sh @@ -2,14 +2,6 @@ mkdir -p .local/raw_data .local/metrics chmod o+w .local/raw_data .local/metrics -podman run --rm \ - -e GITHUB_TOKEN_FILE=/app/token.txt \ - -v ./.token:/app/token.txt \ - -v $(pwd)/.local/raw_data:/app/output \ - codeberg.org/0xf1e/project-health-analyzer:latest +podman kube play pipeline.yaml -podman run --rm -it --name duckdb-importer \ - -v "$(pwd)"/.local/raw_data:/raw_data \ - -v "$(pwd)"/.local/metrics:/metrics \ - -v "$(pwd)"/pipeline.sql:/app/pipeline.sql \ - docker.io/duckdb/duckdb duckdb /metrics/project_health.db -f /app/pipeline.sql +podman kube down pipeline.yaml diff --git a/pipeline.yaml b/pipeline.yaml index 24d1c6e..79fce08 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -33,9 +33,6 @@ spec: containers: - name: todo image: nginx:latest - ports: - - containerPort: 80 - resources: {} restartPolicy: Never volumes: - name: raw-data-volume From 67a16acb696dd6dccf5a66993c9e02e4c04d0bb6 Mon Sep 17 00:00:00 2001 From: "0xf1e (aider)" Date: Tue, 23 Sep 2025 07:58:09 +0200 Subject: [PATCH 13/13] fix: convert pipeline.sh to proper bash script with execution flags --- pipeline.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pipeline.sh b/pipeline.sh index 15b28a5..9fe65e4 100755 --- a/pipeline.sh +++ b/pipeline.sh @@ -1,3 +1,7 @@ +#!/bin/bash + +set -euxo pipefail + mkdir -p .local/raw_data .local/metrics chmod o+w .local/raw_data .local/metrics