microsoft
diff --git a/‎.azureDevOps/MLOps_Engineer/Infrastructure/Azure_ML_Pipelines/components/databricks/listclusters.py‎
Lines changed: 138 additions & 0 deletions b/‎.azureDevOps/MLOps_Engineer/Infrastructure/Azure_ML_Pipelines/components/databricks/listclusters.py‎
Lines changed: 138 additions & 0 deletions
diff --git a/‎.azureDevOps/MLOps_Engineer/Infrastructure/Azure_ML_Pipelines/components/dependencies/conda.yaml‎
Lines changed: 17 additions & 0 deletions b/‎.azureDevOps/MLOps_Engineer/Infrastructure/Azure_ML_Pipelines/components/dependencies/conda.yaml‎
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,138 @@
+import os
+import argparse
+import pandas as pd
+from sklearn.model_selection import train_test_split
+import logging
+import mlflow
+import requests
+import os
+
+#parser.add_argument("--test_train_ratio", type=float, required=False, default=0.25)
+
+def main():
+    """Main function of the script."""
+
+    # input and output arguments
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--data", type=str, help="path to input data")
+    parser.add_argument("--test_train_ratio", type=float, required=False, default=0.25)
+    parser.add_argument("--train_data", type=str, help="path to train data")
+    parser.add_argument("--test_data", type=str, help="path to test data")
+
+    args = parser.parse_args()
+    # Start Logging
+    mlflow.start_run()
+
+    print(" ".join(f"{k}={v}" for k, v in vars(args).items()))
+
+    print("input data:", args.data)
+
+    credit_df = pd.read_excel(args.data, header=1, index_col=0)
+
+    mlflow.log_metric("num_samples", credit_df.shape[0])
+    mlflow.log_metric("num_features", credit_df.shape[1] - 1)
+
+    credit_train_df, credit_test_df = train_test_split(
+        credit_df,
+        test_size=args.test_train_ratio,
+    )
+
+    # output paths are mounted as folder, therefore, we are adding a filename to the path
+    credit_train_df.to_csv(os.path.join(args.train_data, "data.csv"), index=False)
+
+    credit_test_df.to_csv(os.path.join(args.test_data, "data.csv"), index=False)
+
+    # Stop Logging
+    mlflow.end_run()
+
+
+
+    # Retrieve Tokens 
+
+
+def createManagementToken(tokenRequestBody, tokenRequestHeaders, tokenBaseURL):
+        """
+            Uses Our Service Principal Credentials To Generate Azure Active Directory Tokens
+        """
+
+        tokenRequestBody['resource'] = 'https://management.core.windows.net/'
+        
+        response = requests.get(tokenBaseURL, headers=tokenRequestHeaders, data=tokenRequestBody)
+        
+        if response.status_code == 200:
+            print(response.status_code)
+        
+        else:
+            raise Exception(response.text)
+        
+        return response.json()['access_token']
+
+def createBearerToken(tokenRequestBody, tokenRequestHeaders, tokenBaseURL):
+        """
+            Uses Our Service Principal Credentials To Generate Azure Active Directory Tokens
+        """
+        
+        tokenRequestBody['resource'] = '2ff814a6-3304-4ab8-85cb-cd0e6f879c1d'
+        
+        response = requests.get(tokenBaseURL, headers=tokenRequestHeaders, data=tokenRequestBody)
+        
+        if response.status_code == 200:
+            print(response.status_code)
+        
+        else:
+            raise Exception(response.text)
+        
+        return response.json()['access_token']
+
+
+
+def listClusters(DBRKS_REQ_HEADERS, DATABRICKS_INSTANCE):
+    """
+        Returns a Json object containing a list of existing Databricks Clusters.
+    """
+
+    response = requests.get('https://' + DATABRICKS_INSTANCE + '/api/2.0/clusters/list', headers=DBRKS_REQ_HEADERS)
+
+    if response.status_code != 200:
+        raise Exception(response.content)
+
+    else:
+        return response.json()
+
+
+
+if __name__ == "__main__":
+
+    # The sp credentials need to come in from key vault 
+
+    tokenRequestBody = {
+        'grant_type': 'client_credentials',
+        'client_id': '841ba6d9-a509-44ee-bf40-c0876b4ac6bb',
+        'client_secret': 'IQG8Q~hQDGO5eFcRos~YN9waI0gE-Gsx8sMx5bJQ'
+    } 
+    tokenRequestHeaders = {'Content-Type': 'application/x-www-form-urlencoded'}
+    tokenBaseURL = 'https://login.microsoftonline.com/' + '16b3c013-d300-468d-ac64-7eda0820b6d3' + '/oauth2/token'
+
+    DBRKS_BEARER_TOKEN = createBearerToken(tokenRequestBody=tokenRequestBody, 
+                                    tokenRequestHeaders=tokenRequestHeaders, 
+                                    tokenBaseURL=tokenBaseURL
+                    )
+    
+    DBRKS_MANAGEMENT_TOKEN = createManagementToken(tokenRequestBody=tokenRequestBody,
+                                            tokenRequestHeaders=tokenRequestHeaders,
+                                            tokenBaseURL=tokenBaseURL
+                    )
+
+
+    DBRKS_REQ_HEADERS = {
+    'Authorization': f'Bearer {DBRKS_BEARER_TOKEN}',
+    'X-Databricks-Azure-SP-Management-Token': f'{DBRKS_MANAGEMENT_TOKEN}',
+    'X-Databricks-Azure-Workspace-Resource-Id': '/subscriptions/2a834239-8f89-42e1-8cf1-c3c10090f51c/resourceGroups/databricks-sandbox-rg/providers/Microsoft.Databricks/workspaces/dbxwssandbox-eco3',
+    'Content-Type': 'application/json'
+}
+    DATABRICKS_INSTANCE = "adb-2041102092454885.5.azuredatabricks.net"
+
+    existingClusters = listClusters(DBRKS_REQ_HEADERS, DATABRICKS_INSTANCE)
+
+    print(existingClusters)
@@ -0,0 +1,17 @@
+name: model-env
+channels:
+  - conda-forge
+dependencies:
+  - python=3.8
+  - numpy=1.21.2
+  - pip=21.2.4
+  - scikit-learn=0.24.2
+  - scipy=1.7.1
+  - pandas>=1.1,<1.2
+  - pip:
+    - inference-schema[numpy-support]==1.3.0
+    - xlrd==2.0.1
+    - mlflow==1.26.1
+    - azureml-mlflow==1.42.0
+    - pandas
+    - requests