Add anomaly detector input feature visualization

JohT · JohT · commit 639a31e8a8c1 · 2025-11-23T10:46:32.000+01:00
diff --git a/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb b/domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb
@@ -67,6 +67,7 @@
     "from optuna import Study, create_study\n",
     "\n",
     "import shap # Explainable AI tool\n",
+    "import umap\n",
     "\n",
     "import matplotlib.pyplot as plot"
    ]
@@ -921,6 +922,7 @@
     "    cluster_label_column: str = \"clusterLabel\",\n",
     "    cluster_medoid_column: str = \"clusterMedoid\",\n",
     "    cluster_size_column: str = \"clusterSize\",\n",
+    "    cluster_color_map: str = \"tab20\",\n",
     "    anomaly_label_column: str = \"anomalyLabel\",\n",
     "    anomaly_score_column: str = \"anomalyScore\",\n",
     "    size_column: str = \"articleRank\",\n",
@@ -982,7 +984,7 @@
     "        y=cluster_noise[y_position_column],\n",
     "        s=cluster_noise[size_column] * 60 + 2,\n",
     "        color='lightgrey',\n",
-    "        alpha=0.4,\n",
+    "        alpha=0.3,\n",
     "        label='Noise'\n",
     "    )\n",
     "\n",
@@ -992,8 +994,8 @@
     "        y=cluster_non_noise[y_position_column],\n",
     "        s=cluster_non_noise[size_column] * 60 + 2,\n",
     "        c=cluster_non_noise[cluster_label_column],\n",
-    "        cmap='tab20',\n",
-    "        alpha=0.7,\n",
+    "        cmap=cluster_color_map,\n",
+    "        alpha=0.5,\n",
     "        label='Clusters'\n",
     "    )\n",
     "\n",
@@ -1103,12 +1105,94 @@
     "plot_anomalies(java_package_anomaly_detection_features, title_prefix=\"Java Package Anomalies\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "77dee89a",
+   "metadata": {},
+   "source": [
+    "#### 1.4b Plot features with highlighted top anomalies in a 2D scatter plot (UMAP reduction)\n",
+    "\n",
+    "This plot visualizes the input features used by the Isolation Forest anomaly detector in a 2D scatter plot. Dimensionality reduction is performed with UMAP to illustrate how the detector \"sees\" the data.\n",
+    "\n",
+    "- Red: detected anomalies  \n",
+    "- Lightgrey: code units labeled as noise by HDBSCAN  \n",
+    "- Greys: cluster labels  \n",
+    "- Size: Article Rank (larger = more important)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c30a29f8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def prepare_features_for_2d_visualization(features: np.ndarray, anomaly_detection_results: pd.DataFrame) -> pd.DataFrame:\n",
+    "    \"\"\"\n",
+    "    Reduces the dimensionality of the features down to two dimensions for 2D visualization using UMAP.\n",
+    "    see https://umap-learn.readthedocs.io\n",
+    "\n",
+    "    Parameters:\n",
+    "    - features: Feature matrix with one row per sample (anything with a 'shape' that UMAP accepts).\n",
+    "    - anomaly_detection_results: DataFrame with one row per sample, in the same row order as the features.\n",
+    "\n",
+    "    Returns:\n",
+    "    - A copy of anomaly_detection_results with the additional columns 'featureVisualizationX'\n",
+    "      and 'featureVisualizationY'. The DataFrame that was passed in is left untouched.\n",
+    "    - If there are no features, anomaly_detection_results is returned as is (without the additional columns).\n",
+    "\n",
+    "    Raises:\n",
+    "    - ValueError if the number of feature rows differs from the number of result rows.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Check if features are empty. 'shape[0]' is used instead of 'len()',\n",
+    "    # because 'len()' raises a TypeError for scipy sparse matrices.\n",
+    "    if features is None or features.shape[0] == 0:\n",
+    "        print(\"No feature data available\")\n",
+    "        return anomaly_detection_results\n",
+    "\n",
+    "    # Check if features and anomaly_detection_results have compatible lengths\n",
+    "    if features.shape[0] != anomaly_detection_results.shape[0]:\n",
+    "        raise ValueError(\"Features and anomaly_detection_results must have the same number of samples.\")\n",
+    "\n",
+    "    # Use UMAP to reduce the dimensionality to 2D for visualization.\n",
+    "    # The random_state is fixed to get the same layout on every run.\n",
+    "    umap_reducer = umap.UMAP(n_components=2, min_dist=0.3, random_state=47, n_jobs=1)\n",
+    "\n",
+    "    # The embedding has one row per sample and one column per target dimension.\n",
+    "    # np.asarray only ensures plain ndarray indexing for the column selection below.\n",
+    "    feature_coordinates = np.asarray(umap_reducer.fit_transform(features))\n",
+    "\n",
+    "    # Return a new DataFrame instead of writing into the passed one. This keeps the function\n",
+    "    # free of side effects and avoids a SettingWithCopyWarning when a sliced DataFrame is passed.\n",
+    "    return anomaly_detection_results.assign(\n",
+    "        featureVisualizationX=feature_coordinates[:, 0],\n",
+    "        featureVisualizationY=feature_coordinates[:, 1],\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f6af9eb9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add the 2D UMAP coordinates (featureVisualizationX/Y) of the prepared features to the Java package results\n",
+    "java_package_anomaly_detection_features = prepare_features_for_2d_visualization(\n",
+    "    features=java_package_anomaly_detection_features_prepared,\n",
+    "    anomaly_detection_results=java_package_anomaly_detection_features,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7a679562",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Plot the Java packages at their UMAP-reduced feature coordinates, clusters drawn with the 'Greys' color map\n",
+    "plot_anomalies(\n",
+    "    java_package_anomaly_detection_features,\n",
+    "    title_prefix=\"Java Package Anomalies (2D Feature Visualization)\",\n",
+    "    x_position_column=\"featureVisualizationX\",\n",
+    "    y_position_column=\"featureVisualizationY\",\n",
+    "    cluster_color_map=\"Greys\",\n",
+    "    annotate_top_n_anomalies=5,\n",
+    "    annotate_top_n_clusters=5,\n",
+    "    annotate_top_n_non_anomalies=0,\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "0f1b08b6",
    "metadata": {},
    "source": [
-    "#### 1.4b Plot anomalies solely based on embeddings"
+    "#### 1.4c Plot anomalies solely based on embeddings"
    ]
   },
   {
@@ -1914,12 +1998,59 @@
     "plot_anomalies(java_type_anomaly_detection_features, title_prefix=\"Java Type Anomalies\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "6eb52ab0",
+   "metadata": {},
+   "source": [
+    "#### 2.4b Plot features with highlighted top anomalies in a 2D scatter plot (UMAP reduction)\n",
+    "\n",
+    "This plot visualizes the input features used by the Isolation Forest anomaly detector in a 2D scatter plot. Dimensionality reduction is performed with UMAP to illustrate how the detector \"sees\" the data.\n",
+    "\n",
+    "- Red: detected anomalies  \n",
+    "- Lightgrey: code units labeled as noise by HDBSCAN  \n",
+    "- Greys: cluster labels  \n",
+    "- Size: Article Rank (larger = more important)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "129cced0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add the 2D UMAP coordinates (featureVisualizationX/Y) of the prepared features to the Java type results\n",
+    "java_type_anomaly_detection_features = prepare_features_for_2d_visualization(\n",
+    "    features=java_type_anomaly_detection_features_prepared,\n",
+    "    anomaly_detection_results=java_type_anomaly_detection_features,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f05ef08c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Plot the Java types at their UMAP-reduced feature coordinates, clusters drawn with the 'Greys' color map\n",
+    "plot_anomalies(\n",
+    "    java_type_anomaly_detection_features,\n",
+    "    title_prefix=\"Java Type Anomalies (2D Feature Visualization)\",\n",
+    "    x_position_column=\"featureVisualizationX\",\n",
+    "    y_position_column=\"featureVisualizationY\",\n",
+    "    cluster_color_map=\"Greys\",\n",
+    "    annotate_top_n_anomalies=5,\n",
+    "    annotate_top_n_clusters=5,\n",
+    "    annotate_top_n_non_anomalies=0,\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "05275be7",
    "metadata": {},
    "source": [
-    "#### 2.4.b Plot anomalies solely based on embeddings"
+    "#### 2.4c Plot anomalies solely based on embeddings"
    ]
   },
   {