Skip to content

Commit 639a31e

Browse files
committed
Add anomaly detector input feature visualization
1 parent 7666eb5 commit 639a31e

File tree

1 file changed

+136
-5
lines changed

1 file changed

+136
-5
lines changed

domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb

Lines changed: 136 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
"from optuna import Study, create_study\n",
6868
"\n",
6969
"import shap # Explainable AI tool\n",
70+
"import umap\n",
7071
"\n",
7172
"import matplotlib.pyplot as plot"
7273
]
@@ -921,6 +922,7 @@
921922
" cluster_label_column: str = \"clusterLabel\",\n",
922923
" cluster_medoid_column: str = \"clusterMedoid\",\n",
923924
" cluster_size_column: str = \"clusterSize\",\n",
925+
" cluster_color_map: str = \"tab20\",\n",
924926
" anomaly_label_column: str = \"anomalyLabel\",\n",
925927
" anomaly_score_column: str = \"anomalyScore\",\n",
926928
" size_column: str = \"articleRank\",\n",
@@ -982,7 +984,7 @@
982984
" y=cluster_noise[y_position_column],\n",
983985
" s=cluster_noise[size_column] * 60 + 2,\n",
984986
" color='lightgrey',\n",
985-
" alpha=0.4,\n",
987+
" alpha=0.3,\n",
986988
" label='Noise'\n",
987989
" )\n",
988990
"\n",
@@ -992,8 +994,8 @@
992994
" y=cluster_non_noise[y_position_column],\n",
993995
" s=cluster_non_noise[size_column] * 60 + 2,\n",
994996
" c=cluster_non_noise[cluster_label_column],\n",
995-
" cmap='tab20',\n",
996-
" alpha=0.7,\n",
997+
" cmap=cluster_color_map,\n",
998+
" alpha=0.5,\n",
997999
" label='Clusters'\n",
9981000
" )\n",
9991001
"\n",
@@ -1103,12 +1105,94 @@
11031105
"plot_anomalies(java_package_anomaly_detection_features, title_prefix=\"Java Package Anomalies\")"
11041106
]
11051107
},
1108+
{
1109+
"cell_type": "markdown",
1110+
"id": "77dee89a",
1111+
"metadata": {},
1112+
"source": [
1113+
"#### 1.4b Plot features with highlighted top anomalies in a 2D scatter plot (UMAP reduction)\n",
1114+
"\n",
1115+
"This plot visualizes the input features used by the Isolation Forest anomaly detector in a 2D scatter plot. Dimensionality reduction is performed with UMAP to illustrate how the detector \"sees\" the data.\n",
1116+
"\n",
1117+
"- Red: detected anomalies \n",
1118+
"- Lightgrey: code units labeled as noise by HDBSCAN \n",
1119+
"- Greys: cluster labels \n",
1120+
"- Size: Article Rank (larger = more important)"
1121+
]
1122+
},
1123+
{
1124+
"cell_type": "code",
1125+
"execution_count": null,
1126+
"id": "c30a29f8",
1127+
"metadata": {},
1128+
"outputs": [],
1129+
"source": [
def prepare_features_for_2d_visualization(features: np.ndarray, anomaly_detection_results: pd.DataFrame) -> pd.DataFrame:
    """
    Reduces the dimensionality of the features down to two dimensions for 2D visualization using UMAP.
    see https://umap-learn.readthedocs.io

    Parameters:
        features: Feature matrix with one row per sample. Dense arrays and
            SciPy sparse matrices are both accepted.
        anomaly_detection_results: DataFrame with exactly one row per sample in `features`.

    Returns:
        The same DataFrame object that was passed in. When feature data is available,
        the columns 'featureVisualizationX' and 'featureVisualizationY' are added to it
        in place. When `features` is None or has no rows, it is returned unchanged.

    Raises:
        ValueError: If `features` and `anomaly_detection_results` differ in their number of rows.
    """

    if features is None:
        print("No feature data available")
        return anomaly_detection_results

    # Take the row count from the shape instead of len(): len() raises a TypeError
    # for SciPy sparse matrices ("sparse matrix length is ambiguous").
    sample_count = np.shape(features)[0]
    if sample_count == 0:
        print("No feature data available")
        return anomaly_detection_results

    # Check if features and anomaly_detection_results have compatible lengths
    if sample_count != anomaly_detection_results.shape[0]:
        raise ValueError("Features and anomaly_detection_results must have the same number of samples.")

    # Use UMAP to reduce the dimensionality to 2D for visualization.
    # The fixed random_state keeps the layout reproducible between runs.
    # NOTE(review): n_jobs=1 presumably matches UMAP's single-threaded mode for seeded runs - confirm.
    umap_reducer = umap.UMAP(n_components=2, min_dist=0.3, random_state=47, n_jobs=1)
    two_dimensional_features = umap_reducer.fit_transform(features)

    # Normalize the embedding to a plain numpy array before slicing its columns
    feature_coordinates = np.asarray(two_dimensional_features)

    # Written in place: the caller's DataFrame receives the two coordinate columns
    anomaly_detection_results['featureVisualizationX'] = feature_coordinates[:, 0]
    anomaly_detection_results['featureVisualizationY'] = feature_coordinates[:, 1]

    return anomaly_detection_results
1156+
]
1157+
},
1158+
{
1159+
"cell_type": "code",
1160+
"execution_count": null,
1161+
"id": "f6af9eb9",
1162+
"metadata": {},
1163+
"outputs": [],
1164+
"source": [
1165+
"java_package_anomaly_detection_features = prepare_features_for_2d_visualization(\n",
1166+
" java_package_anomaly_detection_features_prepared,\n",
1167+
" java_package_anomaly_detection_features\n",
1168+
")"
1169+
]
1170+
},
1171+
{
1172+
"cell_type": "code",
1173+
"execution_count": null,
1174+
"id": "7a679562",
1175+
"metadata": {},
1176+
"outputs": [],
1177+
"source": [
1178+
"plot_anomalies(\n",
1179+
" java_package_anomaly_detection_features,\n",
1180+
" title_prefix=\"Java Package Anomalies (2D Feature Visualization)\",\n",
1181+
" x_position_column='featureVisualizationX',\n",
1182+
" y_position_column='featureVisualizationY',\n",
1183+
" annotate_top_n_non_anomalies=0,\n",
1184+
" annotate_top_n_clusters=5,\n",
1185+
" annotate_top_n_anomalies=5,\n",
1186+
" cluster_color_map=\"Greys\"\n",
1187+
")"
1188+
]
1189+
},
11061190
{
11071191
"cell_type": "markdown",
11081192
"id": "0f1b08b6",
11091193
"metadata": {},
11101194
"source": [
1111-
"#### 1.4b Plot anomalies solely based on embeddings"
1195+
"#### 1.4c Plot anomalies solely based on embeddings"
11121196
]
11131197
},
11141198
{
@@ -1914,12 +1998,59 @@
19141998
"plot_anomalies(java_type_anomaly_detection_features, title_prefix=\"Java Type Anomalies\")"
19151999
]
19162000
},
2001+
{
2002+
"cell_type": "markdown",
2003+
"id": "6eb52ab0",
2004+
"metadata": {},
2005+
"source": [
2006+
"#### 2.4b Plot features with highlighted top anomalies in a 2D scatter plot (UMAP reduction)\n",
2007+
"\n",
2008+
"This plot visualizes the input features used by the Isolation Forest anomaly detector in a 2D scatter plot. Dimensionality reduction is performed with UMAP to illustrate how the detector \"sees\" the data.\n",
2009+
"\n",
2010+
"- Red: detected anomalies \n",
2011+
"- Lightgrey: code units labeled as noise by HDBSCAN \n",
2012+
"- Greys: cluster labels \n",
2013+
"- Size: Article Rank (larger = more important)"
2014+
]
2015+
},
2016+
{
2017+
"cell_type": "code",
2018+
"execution_count": null,
2019+
"id": "129cced0",
2020+
"metadata": {},
2021+
"outputs": [],
2022+
"source": [
2023+
"java_type_anomaly_detection_features = prepare_features_for_2d_visualization(\n",
2024+
" java_type_anomaly_detection_features_prepared,\n",
2025+
" java_type_anomaly_detection_features\n",
2026+
")"
2027+
]
2028+
},
2029+
{
2030+
"cell_type": "code",
2031+
"execution_count": null,
2032+
"id": "f05ef08c",
2033+
"metadata": {},
2034+
"outputs": [],
2035+
"source": [
2036+
"plot_anomalies(\n",
2037+
" java_type_anomaly_detection_features,\n",
2038+
" title_prefix=\"Java Type Anomalies (2D Feature Visualization)\",\n",
2039+
" x_position_column='featureVisualizationX',\n",
2040+
" y_position_column='featureVisualizationY',\n",
2041+
" annotate_top_n_non_anomalies=0,\n",
2042+
" annotate_top_n_clusters=5,\n",
2043+
" annotate_top_n_anomalies=5,\n",
2044+
" cluster_color_map=\"Greys\"\n",
2045+
")"
2046+
]
2047+
},
19172048
{
19182049
"cell_type": "markdown",
19192050
"id": "05275be7",
19202051
"metadata": {},
19212052
"source": [
1922-
"#### 2.4.b Plot anomalies solely based on embeddings"
2053+
"#### 2.4c Plot anomalies solely based on embeddings"
19232054
]
19242055
},
19252056
{

0 commit comments

Comments
 (0)