Skip to content

Commit d476cdf

Browse files
authored
Merge pull request #13 from ciaran28/featureone
Featureone
2 parents 9e683f7 + 64bb312 commit d476cdf

File tree

56 files changed

+525
-2
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+525
-2
lines changed
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
import os
2+
import argparse
3+
import pandas as pd
4+
from sklearn.model_selection import train_test_split
5+
import logging
6+
import mlflow
7+
import requests
8+
import os
9+
10+
#parser.add_argument("--test_train_ratio", type=float, required=False, default=0.25)
11+
12+
def main():
    """Main function of the script."""

    # input and output arguments
    cli = argparse.ArgumentParser()
    cli.add_argument("--data", type=str, help="path to input data")
    cli.add_argument("--test_train_ratio", type=float, required=False, default=0.25)
    cli.add_argument("--train_data", type=str, help="path to train data")
    cli.add_argument("--test_data", type=str, help="path to test data")
    args = cli.parse_args()

    # Start Logging
    mlflow.start_run()

    print(" ".join(f"{k}={v}" for k, v in vars(args).items()))
    print("input data:", args.data)

    # header=1 / index_col=0: the first spreadsheet row is a banner and the
    # first column is a row id in the source Excel file.
    frame = pd.read_excel(args.data, header=1, index_col=0)

    mlflow.log_metric("num_samples", frame.shape[0])
    mlflow.log_metric("num_features", frame.shape[1] - 1)

    train_frame, test_frame = train_test_split(frame, test_size=args.test_train_ratio)

    # output paths are mounted as folder, therefore, we are adding a filename to the path
    train_frame.to_csv(os.path.join(args.train_data, "data.csv"), index=False)
    test_frame.to_csv(os.path.join(args.test_data, "data.csv"), index=False)

    # Stop Logging
    mlflow.end_run()
50+
51+
# Retrieve Tokens
52+
53+
54+
def createManagementToken(tokenRequestBody, tokenRequestHeaders, tokenBaseURL):
    """
    Uses our service-principal credentials to generate an Azure Active Directory
    token for the Azure management resource (https://management.core.windows.net/).

    Args:
        tokenRequestBody: dict with 'grant_type', 'client_id', 'client_secret'.
            The caller's dict is NOT mutated; a copy is posted.
        tokenRequestHeaders: headers for the token request
            (expects application/x-www-form-urlencoded).
        tokenBaseURL: the AAD /oauth2/token endpoint URL for the tenant.

    Returns:
        The 'access_token' string from the token response.

    Raises:
        Exception: with the response text when the endpoint does not return 200.
    """
    # Copy so we do not clobber the shared request-body dict the caller may
    # reuse for other resources.
    body = dict(tokenRequestBody)
    body['resource'] = 'https://management.core.windows.net/'

    # BUG FIX: the OAuth2 token endpoint requires POST (RFC 6749 §3.2);
    # the original issued a GET with a request body, which AAD rejects.
    response = requests.post(tokenBaseURL, headers=tokenRequestHeaders, data=body)

    if response.status_code == 200:
        print(response.status_code)
    else:
        raise Exception(response.text)

    return response.json()['access_token']
70+
71+
def createBearerToken(tokenRequestBody, tokenRequestHeaders, tokenBaseURL):
    """
    Uses our service-principal credentials to generate an Azure Active Directory
    token for the Azure Databricks resource application
    (2ff814a6-3304-4ab8-85cb-cd0e6f879c1d is the well-known AzureDatabricks
    first-party application id).

    Args:
        tokenRequestBody: dict with 'grant_type', 'client_id', 'client_secret'.
            The caller's dict is NOT mutated; a copy is posted.
        tokenRequestHeaders: headers for the token request
            (expects application/x-www-form-urlencoded).
        tokenBaseURL: the AAD /oauth2/token endpoint URL for the tenant.

    Returns:
        The 'access_token' string from the token response.

    Raises:
        Exception: with the response text when the endpoint does not return 200.
    """
    # Copy so we do not clobber the shared request-body dict the caller may
    # reuse for other resources.
    body = dict(tokenRequestBody)
    body['resource'] = '2ff814a6-3304-4ab8-85cb-cd0e6f879c1d'

    # BUG FIX: the OAuth2 token endpoint requires POST (RFC 6749 §3.2);
    # the original issued a GET with a request body, which AAD rejects.
    response = requests.post(tokenBaseURL, headers=tokenRequestHeaders, data=body)

    if response.status_code == 200:
        print(response.status_code)
    else:
        raise Exception(response.text)

    return response.json()['access_token']
87+
88+
89+
90+
def listClusters(DBRKS_REQ_HEADERS, DATABRICKS_INSTANCE):
    """
    Returns a Json object containing a list of existing Databricks Clusters.
    """
    endpoint = f'https://{DATABRICKS_INSTANCE}/api/2.0/clusters/list'
    response = requests.get(endpoint, headers=DBRKS_REQ_HEADERS)

    # Success path first; anything other than 200 surfaces the raw payload.
    if response.status_code == 200:
        return response.json()
    raise Exception(response.content)
102+
103+
104+
105+
if __name__ == "__main__":

    # SECURITY FIX: the original hard-coded the service-principal client id
    # and client secret in source control. The sp credentials need to come in
    # from key vault: populate these environment variables from Key Vault or
    # pipeline secrets, and fail fast with a clear error when they are absent.
    tenant_id = os.environ.get('AZURE_TENANT_ID', '16b3c013-d300-468d-ac64-7eda0820b6d3')
    client_id = os.environ.get('AZURE_CLIENT_ID')
    client_secret = os.environ.get('AZURE_CLIENT_SECRET')
    if not client_id or not client_secret:
        raise Exception(
            "AZURE_CLIENT_ID and AZURE_CLIENT_SECRET must be set in the "
            "environment (source them from Key Vault); refusing to run with "
            "hard-coded credentials."
        )

    # Client-credentials grant body; each token helper adds its own 'resource'.
    tokenRequestBody = {
        'grant_type': 'client_credentials',
        'client_id': client_id,
        'client_secret': client_secret
    }
    tokenRequestHeaders = {'Content-Type': 'application/x-www-form-urlencoded'}
    tokenBaseURL = 'https://login.microsoftonline.com/' + tenant_id + '/oauth2/token'

    # Databricks-resource token used as the bearer credential.
    DBRKS_BEARER_TOKEN = createBearerToken(tokenRequestBody=tokenRequestBody,
                                           tokenRequestHeaders=tokenRequestHeaders,
                                           tokenBaseURL=tokenBaseURL
                                           )

    # Azure-management token required alongside the bearer token when
    # authenticating to a Databricks workspace as a service principal.
    DBRKS_MANAGEMENT_TOKEN = createManagementToken(tokenRequestBody=tokenRequestBody,
                                                   tokenRequestHeaders=tokenRequestHeaders,
                                                   tokenBaseURL=tokenBaseURL
                                                   )

    DBRKS_REQ_HEADERS = {
        'Authorization': f'Bearer {DBRKS_BEARER_TOKEN}',
        'X-Databricks-Azure-SP-Management-Token': f'{DBRKS_MANAGEMENT_TOKEN}',
        'X-Databricks-Azure-Workspace-Resource-Id': '/subscriptions/2a834239-8f89-42e1-8cf1-c3c10090f51c/resourceGroups/databricks-sandbox-rg/providers/Microsoft.Databricks/workspaces/dbxwssandbox-eco3',
        'Content-Type': 'application/json'
    }
    DATABRICKS_INSTANCE = "adb-2041102092454885.5.azuredatabricks.net"

    existingClusters = listClusters(DBRKS_REQ_HEADERS, DATABRICKS_INSTANCE)

    print(existingClusters)
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
name: model-env
2+
channels:
3+
- conda-forge
4+
dependencies:
5+
- python=3.8
6+
- numpy=1.21.2
7+
- pip=21.2.4
8+
- scikit-learn=0.24.2
9+
- scipy=1.7.1
10+
- pandas>=1.1,<1.2
11+
- pip:
12+
- inference-schema[numpy-support]==1.3.0
13+
- xlrd==2.0.1
14+
- mlflow== 1.26.1
15+
- azureml-mlflow==1.42.0
16+
- pandas
17+
- requests

0 commit comments

Comments
 (0)