diff --git a/.env.example b/.env.example index a3662d20..d85e4586 100644 --- a/.env.example +++ b/.env.example @@ -30,7 +30,7 @@ TRAINING_PIPELINE_NAME = 'Training Pipeline' MODEL_PATH = '' EVALUATE_SCRIPT_PATH = 'evaluate/evaluate_model.py' REGISTER_SCRIPT_PATH = 'register/register_model.py' -SOURCES_DIR_TRAIN = 'code' +SOURCES_DIR_TRAIN = 'diabetes_regression' DATASET_NAME = 'diabetes_ds' DATASTORE_NAME = 'datablobstore' DATAFILE_NAME = 'diabetes.csv' diff --git a/.pipelines/azdo-pr-build-train.yml b/.pipelines/azdo-pr-build-train.yml index 24231b2a..76337ab5 100644 --- a/.pipelines/azdo-pr-build-train.yml +++ b/.pipelines/azdo-pr-build-train.yml @@ -11,7 +11,7 @@ container: mcr.microsoft.com/mlops/python:latest variables: -- template: azdo-variables.yml +- template: diabetes_regression-variables.yml - group: devopsforai-aml-vg diff --git a/.pipelines/azdo-ci-build-train.yml b/.pipelines/diabetes_regression-ci-build-train.yml similarity index 96% rename from .pipelines/azdo-ci-build-train.yml rename to .pipelines/diabetes_regression-ci-build-train.yml index e6a7ea18..b8889b87 100644 --- a/.pipelines/azdo-ci-build-train.yml +++ b/.pipelines/diabetes_regression-ci-build-train.yml @@ -4,14 +4,14 @@ trigger: include: - master paths: - exclude: - - docs/ - - environment_setup/ - - ml_service/util/create_scoring_image.* - - ml_service/util/smoke_test_scoring_service.py + include: + - diabetes_regression/ + - ml_service/pipelines/diabetes_regression_build_train_pipeline.py + - ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r.py + - ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r_on_dbricks.py variables: -- template: azdo-variables.yml +- template: diabetes_regression-variables.yml - group: devopsforai-aml-vg diff --git a/.pipelines/azdo-ci-image.yml b/.pipelines/diabetes_regression-ci-image.yml similarity index 81% rename from .pipelines/azdo-ci-image.yml rename to .pipelines/diabetes_regression-ci-image.yml index 
b5375ad5..04c02253 100644 --- a/.pipelines/azdo-ci-image.yml +++ b/.pipelines/diabetes_regression-ci-image.yml @@ -7,10 +7,10 @@ trigger: include: - ml_service/util/create_scoring_image.py - ml_service/util/Dockerfile - - code/scoring/ + - diabetes_regression/scoring/ exclude: - - code/scoring/deployment_config_aci.yml - - code/scoring/deployment_config_aks.yml + - diabetes_regression/scoring/deployment_config_aci.yml + - diabetes_regression/scoring/deployment_config_aks.yml pool: vmImage: 'ubuntu-latest' diff --git a/.pipelines/azdo-variables.yml b/.pipelines/diabetes_regression-variables.yml similarity index 87% rename from .pipelines/azdo-variables.yml rename to .pipelines/diabetes_regression-variables.yml index 0691e673..0186de32 100644 --- a/.pipelines/azdo-variables.yml +++ b/.pipelines/diabetes_regression-variables.yml @@ -15,7 +15,7 @@ variables: value: lowpriority # Training Config - name: BUILD_TRAIN_SCRIPT - value: build_train_pipeline.py + value: diabetes_regression_build_train_pipeline.py - name: TRAIN_SCRIPT_PATH value: training/train.py - name: MODEL_NAME @@ -24,7 +24,7 @@ variables: value: '1' # AML Pipeline Config - name: TRAINING_PIPELINE_NAME - value: 'Training-Pipeline' + value: 'diabetes-Training-Pipeline' - name: MODEL_PATH value: '' - name: EVALUATE_SCRIPT_PATH @@ -32,9 +32,9 @@ variables: - name: REGISTER_SCRIPT_PATH value: register/register_model.py - name: SOURCES_DIR_TRAIN - value: code + value: diabetes_regression - name: IMAGE_NAME - value: 'mltrained' + value: 'diabetestrained' # Optional. 
Used by a training pipeline with R on Databricks - name: DB_CLUSTER_ID value: '' diff --git a/code/evaluate/evaluate_model.py b/diabetes_regression/evaluate/evaluate_model.py similarity index 99% rename from code/evaluate/evaluate_model.py rename to diabetes_regression/evaluate/evaluate_model.py index 640442a1..2218137f 100644 --- a/code/evaluate/evaluate_model.py +++ b/diabetes_regression/evaluate/evaluate_model.py @@ -36,7 +36,7 @@ load_dotenv() sources_dir = os.environ.get("SOURCES_DIR_TRAIN") if (sources_dir is None): - sources_dir = 'code' + sources_dir = 'diabetes_regression' path_to_util = os.path.join(".", sources_dir, "util") sys.path.append(os.path.abspath(path_to_util)) # NOQA: E402 from model_helper import get_model_by_tag diff --git a/code/register/register_model.py b/diabetes_regression/register/register_model.py similarity index 100% rename from code/register/register_model.py rename to diabetes_regression/register/register_model.py diff --git a/code/scoring/conda_dependencies.yml b/diabetes_regression/scoring/conda_dependencies.yml similarity index 100% rename from code/scoring/conda_dependencies.yml rename to diabetes_regression/scoring/conda_dependencies.yml diff --git a/code/scoring/deployment_config_aci.yml b/diabetes_regression/scoring/deployment_config_aci.yml similarity index 100% rename from code/scoring/deployment_config_aci.yml rename to diabetes_regression/scoring/deployment_config_aci.yml diff --git a/code/scoring/deployment_config_aks.yml b/diabetes_regression/scoring/deployment_config_aks.yml similarity index 100% rename from code/scoring/deployment_config_aks.yml rename to diabetes_regression/scoring/deployment_config_aks.yml diff --git a/code/scoring/inference_config.yml b/diabetes_regression/scoring/inference_config.yml similarity index 100% rename from code/scoring/inference_config.yml rename to diabetes_regression/scoring/inference_config.yml diff --git a/code/scoring/score.py b/diabetes_regression/scoring/score.py similarity 
index 100% rename from code/scoring/score.py rename to diabetes_regression/scoring/score.py diff --git a/code/scoring/scoreA.py b/diabetes_regression/scoring/scoreA.py similarity index 100% rename from code/scoring/scoreA.py rename to diabetes_regression/scoring/scoreA.py diff --git a/code/scoring/scoreB.py b/diabetes_regression/scoring/scoreB.py similarity index 100% rename from code/scoring/scoreB.py rename to diabetes_regression/scoring/scoreB.py diff --git a/code/training/R/r_train.r b/diabetes_regression/training/R/r_train.r similarity index 100% rename from code/training/R/r_train.r rename to diabetes_regression/training/R/r_train.r diff --git a/code/training/R/train_with_r.py b/diabetes_regression/training/R/train_with_r.py similarity index 100% rename from code/training/R/train_with_r.py rename to diabetes_regression/training/R/train_with_r.py diff --git a/code/training/R/train_with_r_on_databricks.py b/diabetes_regression/training/R/train_with_r_on_databricks.py similarity index 100% rename from code/training/R/train_with_r_on_databricks.py rename to diabetes_regression/training/R/train_with_r_on_databricks.py diff --git a/code/training/R/weight_data.csv b/diabetes_regression/training/R/weight_data.csv similarity index 100% rename from code/training/R/weight_data.csv rename to diabetes_regression/training/R/weight_data.csv diff --git a/code/training/train.py b/diabetes_regression/training/train.py similarity index 100% rename from code/training/train.py rename to diabetes_regression/training/train.py diff --git a/code/util/model_helper.py b/diabetes_regression/util/model_helper.py similarity index 100% rename from code/util/model_helper.py rename to diabetes_regression/util/model_helper.py diff --git a/docs/code_description.md b/docs/code_description.md index 472e781b..5a1af307 100644 --- a/docs/code_description.md +++ b/docs/code_description.md @@ -15,31 +15,29 @@ ### Pipelines - `.pipelines/azdo-base-pipeline.yml` : a pipeline template used by 
ci-build-train pipeline and pr-build-train pipelines. It contains steps performing linting, data and unit testing. -- `.pipelines/azdo-ci-build-train.yml` : a pipeline triggered when the code is merged into **master**. It performs linting, data integrity testing, unit testing, building and publishing an ML pipeline. +- `.pipelines/diabetes_regression-ci-build-train.yml` : a pipeline triggered when the code is merged into **master**. It performs linting, data integrity testing, unit testing, building and publishing an ML pipeline. - `.pipelines/azdo-pr-build-train.yml` : a pipeline triggered when a **pull request** to the **master** branch is created. It performs linting, data integrity testing and unit testing only. ### ML Services -- `ml_service/pipelines/build_train_pipeline.py` : builds and publishes an ML training pipeline. It uses Python on ML Compute. -- `ml_service/pipelines/build_train_pipeline_with_r.py` : builds and publishes an ML training pipeline. It uses R on ML Compute. -- `ml_service/pipelines/build_train_pipeline_with_r_on_dbricks.py` : builds and publishes an ML training pipeline. It uses R on Databricks Compute. +- `ml_service/pipelines/diabetes_regression_build_train_pipeline.py` : builds and publishes an ML training pipeline. It uses Python on ML Compute. +- `ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r.py` : builds and publishes an ML training pipeline. It uses R on ML Compute. +- `ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r_on_dbricks.py` : builds and publishes an ML training pipeline. It uses R on Databricks Compute. - `ml_service/pipelines/run_train_pipeline.py` : invokes a published ML training pipeline (Python on ML Compute) via REST API. - `ml_service/pipelines/verify_train_pipeline.py` : determines whether the evaluate_model.py step of the training pipeline registered a new model. - `ml_service/util` : contains common utility functions used to build and publish an ML training pipeline. 
### Code -- `code/training/train.py` : a training step of an ML training pipeline. -- `code/evaluate/evaluate_model.py` : an evaluating step of an ML training pipeline which registers a new trained model if evaluation shows the new model is more performant than the previous one. -- `code/evaluate/register_model.py` : (LEGACY) registers a new trained model if evaluation shows the new model is more performant than the previous one. -- `code/training/R/r_train.r` : training a model with R basing on a sample dataset (weight_data.csv). -- `code/training/R/train_with_r.py` : a python wrapper (ML Pipeline Step) invoking R training script on ML Compute -- `code/training/R/train_with_r_on_databricks.py` : a python wrapper (ML Pipeline Step) invoking R training script on Databricks Compute -- `code/training/R/weight_data.csv` : a sample dataset used by R script (r_train.r) to train a model +- `diabetes_regression/training/train.py` : a training step of an ML training pipeline. +- `diabetes_regression/evaluate/evaluate_model.py` : an evaluating step of an ML training pipeline which registers a new trained model if evaluation shows the new model is more performant than the previous one. +- `diabetes_regression/register/register_model.py` : (LEGACY) registers a new trained model if evaluation shows the new model is more performant than the previous one. +- `diabetes_regression/training/R/r_train.r` : training a model with R based on a sample dataset (weight_data.csv).
+- `diabetes_regression/training/R/train_with_r.py` : a python wrapper (ML Pipeline Step) invoking R training script on ML Compute +- `diabetes_regression/training/R/train_with_r_on_databricks.py` : a python wrapper (ML Pipeline Step) invoking R training script on Databricks Compute +- `diabetes_regression/training/R/weight_data.csv` : a sample dataset used by R script (r_train.r) to train a model ### Scoring -- code/scoring/score.py : a scoring script which is about to be packed into a Docker Image along with a model while being deployed to QA/Prod environment. -- code/scoring/conda_dependencies.yml : contains a list of dependencies required by score.py to be installed in a deployable Docker Image -- code/scoring/inference_config.yml, deployment_config_aci.yml, deployment_config_aks.yml : configuration files for the [AML Model Deploy](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.private-vss-services-azureml&ssr=false#overview) pipeline task for ACI and AKS deployment targets. - - +- `diabetes_regression/scoring/score.py` : a scoring script which is about to be packed into a Docker Image along with a model while being deployed to QA/Prod environment. +- `diabetes_regression/scoring/conda_dependencies.yml` : contains a list of dependencies required by score.py to be installed in a deployable Docker Image +- `diabetes_regression/scoring/inference_config.yml`, `deployment_config_aci.yml`, `deployment_config_aks.yml` : configuration files for the [AML Model Deploy](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.private-vss-services-azureml&ssr=false#overview) pipeline task for ACI and AKS deployment targets.
diff --git a/docs/getting_started.md b/docs/getting_started.md index a46d5304..3d372ab3 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -87,7 +87,7 @@ Please be aware that the local environment also needs access to the Azure subscr ### Azure DevOps configuration -For using Azure DevOps Pipelines all other variables are stored in the file `.pipelines/azdo-variables.yml`. Using the default values as a starting point, adjust the variables to suit your requirements. +For using Azure DevOps Pipelines all other variables are stored in the file `.pipelines/diabetes_regression-variables.yml`. Using the default values as a starting point, adjust the variables to suit your requirements. Up until now you should have: @@ -131,7 +131,7 @@ Install the **Azure Machine Learning** extension to your organization from the [marketplace](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.vss-services-azureml), so that you can set up a service connection to your AML workspace. -Create a service connection to your ML workspace via the [Azure DevOps Azure ML task instructions](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.vss-services-azureml) to be able to execute the Azure ML training pipeline. The connection name specified here needs to be used for the value of the `WORKSPACE_SVC_CONNECTION` set in the variable group below. +Create a service connection to your ML workspace via the [Azure DevOps Azure ML task instructions](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.vss-services-azureml) to be able to execute the Azure ML training pipeline. The connection name specified here needs to be used for the value of the `WORKSPACE_SVC_CONNECTION` set in the variable group above. **Note:** Creating service connection with Azure Machine Learning workspace scope requires 'Owner' or 'User Access Administrator' permissions on the Workspace. 
You must also have sufficient permissions to register an application with @@ -154,7 +154,7 @@ environments, or alternatively to Azure App Service. ### Set up the Pipeline In your [Azure DevOps](https://dev.azure.com) project create and run a new build -pipeline referring to the [azdo-ci-build-train.yml](../.pipelines/azdo-ci-build-train.yml) +pipeline referring to the [diabetes_regression-ci-build-train.yml](../.pipelines/diabetes_regression-ci-build-train.yml) pipeline definition in your forked repository: ![configure ci build pipeline](./images/ci-build-pipeline-configure.png) @@ -174,7 +174,7 @@ Great, you now have the build pipeline set up which automatically triggers every **Note:** The build pipeline also supports building and publishing ML pipelines using R to train a model. This is enabled -by changing the `build-train-script` pipeline variable to either `build_train_pipeline_with_r.py`, or `build_train_pipeline_with_r_on_dbricks.py`. For pipeline training a model with R on Databricks you'll need +by changing the `build-train-script` pipeline variable to either `diabetes_regression_build_train_pipeline_with_r.py`, or `diabetes_regression_build_train_pipeline_with_r_on_dbricks.py`. For pipeline training a model with R on Databricks you'll need to manually create a Databricks cluster and attach it to the ML Workspace as a compute (Values DB_CLUSTER_ID and DATABRICKS_COMPUTE_NAME variables should be specified). @@ -189,7 +189,7 @@ Wait until the pipeline finishes and verify that there is a new model in the **M ![trained model](./images/trained-model.png) -To disable the automatic trigger of the training pipeline, change the `auto-trigger-training` variable as listed in the `.pipelines\azdo-ci-build-train.yml` pipeline to `false`. This can also be overridden at runtime execution of the pipeline.
+To disable the automatic trigger of the training pipeline, change the `auto-trigger-training` variable as listed in the `.pipelines/diabetes_regression-ci-build-train.yml` pipeline to `false`. This can also be overridden at runtime execution of the pipeline. ### Deploy the Model to Azure Kubernetes Service diff --git a/ml_service/pipelines/build_train_pipeline.py b/ml_service/pipelines/diabetes_regression_build_train_pipeline.py similarity index 100% rename from ml_service/pipelines/build_train_pipeline.py rename to ml_service/pipelines/diabetes_regression_build_train_pipeline.py diff --git a/ml_service/pipelines/build_train_pipeline_with_r.py b/ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r.py similarity index 97% rename from ml_service/pipelines/build_train_pipeline_with_r.py rename to ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r.py index eea6d4c6..509224e7 100644 --- a/ml_service/pipelines/build_train_pipeline_with_r.py +++ b/ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r.py @@ -43,7 +43,7 @@ def main(): name="Train Model", script_name="train_with_r.py", compute_target=aml_compute, - source_directory="code/training/R", + source_directory="diabetes_regression/training/R", runconfig=run_config, allow_reuse=False, ) diff --git a/ml_service/pipelines/build_train_pipeline_with_r_on_dbricks.py b/ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r_on_dbricks.py similarity index 96% rename from ml_service/pipelines/build_train_pipeline_with_r_on_dbricks.py rename to ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r_on_dbricks.py index 7d1891c7..7e435659 100644 --- a/ml_service/pipelines/build_train_pipeline_with_r_on_dbricks.py +++ b/ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r_on_dbricks.py @@ -32,7 +32,7 @@ def main(): name="DBPythonInLocalMachine", num_workers=1, python_script_name="train_with_r_on_databricks.py", -
source_directory="code/training/R", + source_directory="diabetes_regression/training/R", run_name='DB_Python_R_demo', existing_cluster_id=e.db_cluster_id, compute_target=aml_compute, diff --git a/ml_service/pipelines/verify_train_pipeline.py b/ml_service/pipelines/verify_train_pipeline.py index e141b8ff..2824accb 100644 --- a/ml_service/pipelines/verify_train_pipeline.py +++ b/ml_service/pipelines/verify_train_pipeline.py @@ -15,7 +15,7 @@ def main(): load_dotenv() sources_dir = os.environ.get("SOURCES_DIR_TRAIN") if (sources_dir is None): - sources_dir = 'code' + sources_dir = 'diabetes_regression' path_to_util = os.path.join(".", sources_dir, "util") sys.path.append(os.path.abspath(path_to_util)) # NOQA: E402 from model_helper import get_model_by_tag diff --git a/ml_service/util/create_scoring_image.py b/ml_service/util/create_scoring_image.py index dadaf12b..4b3887fe 100644 --- a/ml_service/util/create_scoring_image.py +++ b/ml_service/util/create_scoring_image.py @@ -28,7 +28,7 @@ model = Model(ws, name=e.model_name, version=e.model_version) sources_dir = e.sources_directory_train if (sources_dir is None): - sources_dir = 'code' + sources_dir = 'diabetes_regression' path_to_scoring = os.path.join(".", sources_dir, "scoring") cwd = os.getcwd() os.chdir(path_to_scoring) diff --git a/tests/unit/code_test.py b/tests/unit/code_test.py index 06654b2f..c7b10182 100644 --- a/tests/unit/code_test.py +++ b/tests/unit/code_test.py @@ -3,7 +3,8 @@ import numpy as np from azureml.core.run import Run from unittest.mock import Mock -sys.path.append(os.path.abspath("./code/training")) # NOQA: E402 +sys.path.append(os.path.abspath( + "./diabetes_regression/training")) # NOQA: E402 from train import train_model