|
67 | 67 | "from optuna import Study, create_study\n", |
68 | 68 | "\n", |
69 | 69 | "import shap # Explainable AI tool\n", |
| 70 | + "import umap\n", |
70 | 71 | "\n", |
71 | 72 | "import matplotlib.pyplot as plot" |
72 | 73 | ] |
|
921 | 922 | " cluster_label_column: str = \"clusterLabel\",\n", |
922 | 923 | " cluster_medoid_column: str = \"clusterMedoid\",\n", |
923 | 924 | " cluster_size_column: str = \"clusterSize\",\n", |
| 925 | + " cluster_color_map: str = \"tab20\",\n", |
924 | 926 | " anomaly_label_column: str = \"anomalyLabel\",\n", |
925 | 927 | " anomaly_score_column: str = \"anomalyScore\",\n", |
926 | 928 | " size_column: str = \"articleRank\",\n", |
|
982 | 984 | " y=cluster_noise[y_position_column],\n", |
983 | 985 | " s=cluster_noise[size_column] * 60 + 2,\n", |
984 | 986 | " color='lightgrey',\n", |
985 | | - " alpha=0.4,\n", |
| 987 | + " alpha=0.3,\n", |
986 | 988 | " label='Noise'\n", |
987 | 989 | " )\n", |
988 | 990 | "\n", |
|
992 | 994 | " y=cluster_non_noise[y_position_column],\n", |
993 | 995 | " s=cluster_non_noise[size_column] * 60 + 2,\n", |
994 | 996 | " c=cluster_non_noise[cluster_label_column],\n", |
995 | | - " cmap='tab20',\n", |
996 | | - " alpha=0.7,\n", |
| 997 | + " cmap=cluster_color_map,\n", |
| 998 | + " alpha=0.5,\n", |
997 | 999 | " label='Clusters'\n", |
998 | 1000 | " )\n", |
999 | 1001 | "\n", |
|
1103 | 1105 | "plot_anomalies(java_package_anomaly_detection_features, title_prefix=\"Java Package Anomalies\")" |
1104 | 1106 | ] |
1105 | 1107 | }, |
| 1108 | + { |
| 1109 | + "cell_type": "markdown", |
| 1110 | + "id": "77dee89a", |
| 1111 | + "metadata": {}, |
| 1112 | + "source": [ |
| 1113 | + "#### 1.4b Plot features with highlighted top anomalies in a 2D scatter plot (UMAP reduction)\n", |
| 1114 | + "\n", |
| 1115 | + "This plot visualizes the input features used by the Isolation Forest anomaly detector in a 2D scatter plot. Dimensionality reduction is performed with UMAP to illustrate how the detector \"sees\" the data.\n", |
| 1116 | + "\n", |
| 1117 | + "- Red: detected anomalies \n", |
| 1118 | + "- Lightgrey: code units labeled as noise by HDBSCAN \n", |
| 1119 | + "- Greys: cluster labels \n", |
| 1120 | + "- Size: Article Rank (larger = more important)" |
| 1121 | + ] |
| 1122 | + }, |
| 1123 | + { |
| 1124 | + "cell_type": "code", |
| 1125 | + "execution_count": null, |
| 1126 | + "id": "c30a29f8", |
| 1127 | + "metadata": {}, |
| 1128 | + "outputs": [], |
| 1129 | + "source": [ |
| 1130 | + "def prepare_features_for_2d_visualization(features: np.ndarray, anomaly_detection_results: pd.DataFrame) -> pd.DataFrame:\n", |
| 1131 | + " \"\"\"\n", |
| 1132 | + " Reduces the dimensionality of the features down to two dimensions for 2D visualization using UMAP.\n", |
| 1133 | + " see https://umap-learn.readthedocs.io\n", |
| 1134 | + " \"\"\"\n", |
| 1135 | + "\n", |
| 1136 | + " # Check if features are empty\n", |
| 1137 | + " if features is None or len(features) == 0:\n", |
| 1138 | + " print(\"No feature data available\")\n", |
| 1139 | + " return anomaly_detection_results\n", |
| 1140 | + "\n", |
| 1141 | + " # Check if features and anomaly_detection_results have compatible lengths\n", |
| 1142 | + " if features.shape[0] != anomaly_detection_results.shape[0]:\n", |
| 1143 | + " raise ValueError(\"Features and anomaly_detection_results must have the same number of samples.\")\n", |
| 1144 | + "\n", |
| 1145 | + " # Use UMAP to reduce the dimensionality to 2D for visualization\n", |
| 1146 | + " umap_reducer = umap.UMAP(n_components=2, min_dist=0.3, random_state=47, n_jobs=1)\n", |
| 1147 | + " two_dimensional_features = umap_reducer.fit_transform(features)\n", |
| 1148 | + " \n", |
| 1149 | + " # Convert to dense numpy array (works for both sparse and dense input)\n", |
| 1150 | + " feature_coordinates = np.asarray(two_dimensional_features)\n", |
| 1151 | + "\n", |
| 1152 | + " anomaly_detection_results['featureVisualizationX'] = feature_coordinates[:, 0]\n", |
| 1153 | + " anomaly_detection_results['featureVisualizationY'] = feature_coordinates[:, 1]\n", |
| 1154 | + "\n", |
| 1155 | + " return anomaly_detection_results" |
| 1156 | + ] |
| 1157 | + }, |
| 1158 | + { |
| 1159 | + "cell_type": "code", |
| 1160 | + "execution_count": null, |
| 1161 | + "id": "f6af9eb9", |
| 1162 | + "metadata": {}, |
| 1163 | + "outputs": [], |
| 1164 | + "source": [ |
| 1165 | + "java_package_anomaly_detection_features = prepare_features_for_2d_visualization(\n", |
| 1166 | + " java_package_anomaly_detection_features_prepared,\n", |
| 1167 | + " java_package_anomaly_detection_features\n", |
| 1168 | + ")" |
| 1169 | + ] |
| 1170 | + }, |
| 1171 | + { |
| 1172 | + "cell_type": "code", |
| 1173 | + "execution_count": null, |
| 1174 | + "id": "7a679562", |
| 1175 | + "metadata": {}, |
| 1176 | + "outputs": [], |
| 1177 | + "source": [ |
| 1178 | + "plot_anomalies(\n", |
| 1179 | + " java_package_anomaly_detection_features,\n", |
| 1180 | + " title_prefix=\"Java Package Anomalies (2D Feature Visualization)\",\n", |
| 1181 | + " x_position_column='featureVisualizationX',\n", |
| 1182 | + " y_position_column='featureVisualizationY',\n", |
| 1183 | + " annotate_top_n_non_anomalies=0,\n", |
| 1184 | + " annotate_top_n_clusters=5,\n", |
| 1185 | + " annotate_top_n_anomalies=5,\n", |
| 1186 | + " cluster_color_map=\"Greys\"\n", |
| 1187 | + ")" |
| 1188 | + ] |
| 1189 | + }, |
1106 | 1190 | { |
1107 | 1191 | "cell_type": "markdown", |
1108 | 1192 | "id": "0f1b08b6", |
1109 | 1193 | "metadata": {}, |
1110 | 1194 | "source": [ |
1111 | | - "#### 1.4b Plot anomalies solely based on embeddings" |
| 1195 | + "#### 1.4c Plot anomalies solely based on embeddings" |
1112 | 1196 | ] |
1113 | 1197 | }, |
1114 | 1198 | { |
|
1914 | 1998 | "plot_anomalies(java_type_anomaly_detection_features, title_prefix=\"Java Type Anomalies\")" |
1915 | 1999 | ] |
1916 | 2000 | }, |
| 2001 | + { |
| 2002 | + "cell_type": "markdown", |
| 2003 | + "id": "6eb52ab0", |
| 2004 | + "metadata": {}, |
| 2005 | + "source": [ |
| 2006 | + "#### 2.4b Plot features with highlighted top anomalies in a 2D scatter plot (UMAP reduction)\n", |
| 2007 | + "\n", |
| 2008 | + "This plot visualizes the input features used by the Isolation Forest anomaly detector in a 2D scatter plot. Dimensionality reduction is performed with UMAP to illustrate how the detector \"sees\" the data.\n", |
| 2009 | + "\n", |
| 2010 | + "- Red: detected anomalies \n", |
| 2011 | + "- Lightgrey: code units labeled as noise by HDBSCAN \n", |
| 2012 | + "- Greys: cluster labels \n", |
| 2013 | + "- Size: Article Rank (larger = more important)" |
| 2014 | + ] |
| 2015 | + }, |
| 2016 | + { |
| 2017 | + "cell_type": "code", |
| 2018 | + "execution_count": null, |
| 2019 | + "id": "129cced0", |
| 2020 | + "metadata": {}, |
| 2021 | + "outputs": [], |
| 2022 | + "source": [ |
| 2023 | + "java_type_anomaly_detection_features = prepare_features_for_2d_visualization(\n", |
| 2024 | + " java_type_anomaly_detection_features_prepared,\n", |
| 2025 | + " java_type_anomaly_detection_features\n", |
| 2026 | + ")" |
| 2027 | + ] |
| 2028 | + }, |
| 2029 | + { |
| 2030 | + "cell_type": "code", |
| 2031 | + "execution_count": null, |
| 2032 | + "id": "f05ef08c", |
| 2033 | + "metadata": {}, |
| 2034 | + "outputs": [], |
| 2035 | + "source": [ |
| 2036 | + "plot_anomalies(\n", |
| 2037 | + " java_type_anomaly_detection_features,\n", |
| 2038 | + " title_prefix=\"Java Type Anomalies (2D Feature Visualization)\",\n", |
| 2039 | + " x_position_column='featureVisualizationX',\n", |
| 2040 | + " y_position_column='featureVisualizationY',\n", |
| 2041 | + " annotate_top_n_non_anomalies=0,\n", |
| 2042 | + " annotate_top_n_clusters=5,\n", |
| 2043 | + " annotate_top_n_anomalies=5,\n", |
| 2044 | + " cluster_color_map=\"Greys\"\n", |
| 2045 | + ")" |
| 2046 | + ] |
| 2047 | + }, |
1917 | 2048 | { |
1918 | 2049 | "cell_type": "markdown", |
1919 | 2050 | "id": "05275be7", |
1920 | 2051 | "metadata": {}, |
1921 | 2052 | "source": [ |
1922 | | - "#### 2.4.b Plot anomalies solely based on embeddings" |
| 2053 | + "#### 2.4c Plot anomalies solely based on embeddings" |
1923 | 2054 | ] |
1924 | 2055 | }, |
1925 | 2056 | { |
|
0 commit comments