Add the project-health pipeline scaffold.

  .gitignore               ignore local secrets and pipeline working data
  .pre-commit-config.yaml  run detect-secrets against .secrets.baseline
  .secrets.baseline        detect-secrets baseline (no findings recorded)
  pipeline.sh              create the host dirs, run the pod, tear it down
  pipeline.sql             load the raw *.jsonl files into DuckDB tables
  pipeline.yaml            pod definition consumed by `podman kube play`

NOTE(review): pipeline.sh and pipeline.yaml were edited after this patch was
generated, so the blob hashes on their "index" lines are stale. Regenerate the
patch with `git diff` if exact hashes matter (e.g. for `git apply --3way`).

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..452734c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.env
+.token
+.local
+.aider*
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..7ffa266
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,7 @@
+repos:
+  - repo: https://github.com/Yelp/detect-secrets
+    rev: v1.5.0
+    hooks:
+      - id: detect-secrets
+        args: ['--baseline', '.secrets.baseline']
+        additional_dependencies: ['gibberish-detector']
diff --git a/.secrets.baseline b/.secrets.baseline
new file mode 100644
index 0000000..9223b7a
--- /dev/null
+++ b/.secrets.baseline
@@ -0,0 +1,131 @@
+{
+  "version": "1.5.0",
+  "plugins_used": [
+    {
+      "name": "ArtifactoryDetector"
+    },
+    {
+      "name": "AWSKeyDetector"
+    },
+    {
+      "name": "AzureStorageKeyDetector"
+    },
+    {
+      "name": "Base64HighEntropyString",
+      "limit": 4.5
+    },
+    {
+      "name": "BasicAuthDetector"
+    },
+    {
+      "name": "CloudantDetector"
+    },
+    {
+      "name": "DiscordBotTokenDetector"
+    },
+    {
+      "name": "GitHubTokenDetector"
+    },
+    {
+      "name": "GitLabTokenDetector"
+    },
+    {
+      "name": "HexHighEntropyString",
+      "limit": 3.0
+    },
+    {
+      "name": "IbmCloudIamDetector"
+    },
+    {
+      "name": "IbmCosHmacDetector"
+    },
+    {
+      "name": "IPPublicDetector"
+    },
+    {
+      "name": "JwtTokenDetector"
+    },
+    {
+      "name": "KeywordDetector",
+      "keyword_exclude": ""
+    },
+    {
+      "name": "MailchimpDetector"
+    },
+    {
+      "name": "NpmDetector"
+    },
+    {
+      "name": "OpenAIDetector"
+    },
+    {
+      "name": "PrivateKeyDetector"
+    },
+    {
+      "name": "PypiTokenDetector"
+    },
+    {
+      "name": "SendGridDetector"
+    },
+    {
+      "name": "SlackDetector"
+    },
+    {
+      "name": "SoftlayerDetector"
+    },
+    {
+      "name": "SquareOAuthDetector"
+    },
+    {
+      "name": "StripeDetector"
+    },
+    {
+      "name": "TelegramBotTokenDetector"
+    },
+    {
+      "name": "TwilioKeyDetector"
+    }
+  ],
+  "filters_used": [
+    {
+      "path": "detect_secrets.filters.allowlist.is_line_allowlisted"
+    },
+    {
+      "path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies",
+      "min_level": 2
+    },
+    {
+      "path": "detect_secrets.filters.gibberish.should_exclude_secret",
+      "limit": 3.7
+    },
+    {
+      "path": "detect_secrets.filters.heuristic.is_indirect_reference"
+    },
+    {
+      "path": "detect_secrets.filters.heuristic.is_likely_id_string"
+    },
+    {
+      "path": "detect_secrets.filters.heuristic.is_lock_file"
+    },
+    {
+      "path": "detect_secrets.filters.heuristic.is_not_alphanumeric_string"
+    },
+    {
+      "path": "detect_secrets.filters.heuristic.is_potential_uuid"
+    },
+    {
+      "path": "detect_secrets.filters.heuristic.is_prefixed_with_dollar_sign"
+    },
+    {
+      "path": "detect_secrets.filters.heuristic.is_sequential_string"
+    },
+    {
+      "path": "detect_secrets.filters.heuristic.is_swagger_file"
+    },
+    {
+      "path": "detect_secrets.filters.heuristic.is_templated_secret"
+    }
+  ],
+  "results": {},
+  "generated_at": "2025-09-23T04:15:11Z"
+}
diff --git a/pipeline.sh b/pipeline.sh
new file mode 100755
index 0000000..9fe65e4
--- /dev/null
+++ b/pipeline.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+#
+# Runs the project-health pipeline pod once, then tears it down again.
+# Run from the directory that contains pipeline.yaml; all paths are relative.
+
+set -euxo pipefail
+
+# Host directories backing the raw-data and metrics volumes in pipeline.yaml.
+mkdir -p .local/raw_data .local/metrics
+
+# Presumably needed so the (non-root) container users can write to the mounts.
+chmod o+w .local/raw_data .local/metrics
+
+# Tear the pod down on every exit path. Without this, a failing `kube play`
+# aborts the script via `set -e` and leaves the pod and its containers behind.
+trap 'podman kube down pipeline.yaml' EXIT
+
+podman kube play pipeline.yaml
diff --git a/pipeline.sql b/pipeline.sql
new file mode 100644
index 0000000..2b9eb87
--- /dev/null
+++ b/pipeline.sql
@@ -0,0 +1,8 @@
+INSTALL json;
+LOAD json;
+CREATE OR REPLACE TABLE raw_commits AS SELECT * FROM read_json_auto('/raw_data/commits.jsonl');
+CREATE OR REPLACE TABLE raw_license AS SELECT * FROM read_json_auto('/raw_data/license.jsonl');
+CREATE OR REPLACE TABLE raw_contributors AS SELECT * FROM read_json_auto('/raw_data/contributors.jsonl');
+CREATE OR REPLACE TABLE raw_releases AS SELECT * FROM read_json_auto('/raw_data/releases.jsonl');
+CREATE OR REPLACE TABLE raw_issues AS SELECT * FROM read_json_auto('/raw_data/issues.jsonl');
+CREATE OR REPLACE TABLE raw_root_md_files AS SELECT * FROM read_json_auto('/raw_data/root_md_files.jsonl');
diff --git a/pipeline.yaml b/pipeline.yaml
new file mode 100644
index 0000000..79fce08
--- /dev/null
+++ b/pipeline.yaml
@@ -0,0 +1,59 @@
+# Pod for the project-health pipeline; pipeline.sh runs it with `podman kube play`.
+apiVersion: v1
+kind: Pod
+metadata:
+  name: project-health-pipeline
+spec:
+  initContainers:
+  # Step 1: presumably produces the *.jsonl files that pipeline.sql reads; its
+  # /app/output directory is the same host directory step 2 sees as /raw_data.
+  - name: health-analyzer
+    image: codeberg.org/0xf1e/project-health-analyzer:latest
+    env:
+    - name: GITHUB_TOKEN_FILE
+      value: /app/token.txt
+    volumeMounts:
+    - name: raw-data-volume
+      mountPath: /app/output
+    - name: token-volume
+      mountPath: /app/token.txt
+    resources: {}
+  # Step 2: runs pipeline.sql against /metrics/project_health.db.
+  - name: duckdb-importer
+    image: docker.io/duckdb/duckdb
+    command:
+    - duckdb
+    - /metrics/project_health.db
+    - -f
+    - /app/pipeline.sql
+    volumeMounts:
+    - name: raw-data-volume
+      mountPath: /raw_data
+    - name: metrics-volume
+      mountPath: /metrics
+    - name: sql-volume
+      mountPath: /app/pipeline.sql
+    resources: {}
+  containers:
+  # Placeholder main container. The image is fully qualified like the others so
+  # the pull does not depend on the host's short-name registry configuration.
+  - name: todo
+    image: docker.io/library/nginx:latest
+  restartPolicy: Never
+  volumes:
+  - name: raw-data-volume
+    hostPath:
+      path: ./.local/raw_data
+      type: Directory
+  - name: metrics-volume
+    hostPath:
+      path: ./.local/metrics
+      type: Directory
+  - name: token-volume
+    hostPath:
+      path: ./.token
+      type: File
+  - name: sql-volume
+    hostPath:
+      path: ./pipeline.sql
+      type: File