diff --git a/.gitignore b/.gitignore index b327895b..3a5a8879 100644 --- a/.gitignore +++ b/.gitignore @@ -104,5 +104,4 @@ venv.bak/ # mypy .mypy_cache/ -aml_config/config.json .DS_Store diff --git a/.pipelines/azdo-ci-build-train.yml b/.pipelines/azdo-ci-build-train.yml index eab1a743..09f90909 100644 --- a/.pipelines/azdo-ci-build-train.yml +++ b/.pipelines/azdo-ci-build-train.yml @@ -7,9 +7,7 @@ trigger: pool: vmImage: 'ubuntu-latest' -container: - image: mlopscr.azurecr.io/public/mlops/mlopspython:latest - endpoint: acrconnection +container: mcr.microsoft.com/mlops/python:latest variables: diff --git a/.pipelines/azdo-pr-build-train.yml b/.pipelines/azdo-pr-build-train.yml index 6bbf7387..8bf6ca56 100644 --- a/.pipelines/azdo-pr-build-train.yml +++ b/.pipelines/azdo-pr-build-train.yml @@ -7,9 +7,7 @@ pr: pool: vmImage: 'ubuntu-latest' -container: - image: mlopscr.azurecr.io/public/mlops/mlopspython:latest - endpoint: acrconnection +container: mcr.microsoft.com/mlops/python:latest variables: diff --git a/README.md b/README.md index 29ebd646..e09b85b1 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,6 @@ [![Build Status](https://dev.azure.com/customai/DevopsForAI-AML/_apis/build/status/Microsoft.MLOpsPython?branchName=master)](https://dev.azure.com/customai/DevopsForAI-AML/_build/latest?definitionId=25&branchName=master) -### Author: Praneet Solanki | Richin Jain MLOps will help you to understand how to build the Continuous Integration and Continuous Delivery pipeline for a ML/AI project. We will be using the Azure DevOps Project for build and release/deployment pipelines along with Azure ML services for model retraining pipeline, model management and operationalization. 
@@ -25,20 +24,15 @@ To deploy this solution in your subscription, follow the manual instructions in This reference architecture shows how to implement continuous integration (CI), continuous delivery (CD), and retraining pipeline for an AI application using Azure DevOps and Azure Machine Learning. The solution is built on the scikit-learn diabetes dataset but can be easily adapted for any AI scenario and other popular build systems such as Jenkins and Travis. -![Architecture](/docs/images/Architecture_DevOps_AI.png) +![Architecture](/docs/images/main-flow.png) ## Architecture Flow ### Train Model 1. Data Scientist writes/updates the code and push it to git repo. This triggers the Azure DevOps build pipeline (continuous integration). -2. Once the Azure DevOps build pipeline is triggered, it runs following types of tasks: - - Run for new code: Every time new code is committed to the repo, the build pipeline performs data sanity tests and unit tests on the new code. - - One-time run: These tasks runs only for the first time the build pipeline runs. It will programatically create an [Azure ML Service Workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace), provision [Azure ML Compute](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute) (used for model training compute), and publish an [Azure ML Pipeline](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-ml-pipelines). This published Azure ML pipeline is the model training/retraining pipeline. - - > Note: The Publish Azure ML pipeline task currently runs for every code change - -3. The Azure ML Retraining pipeline is triggered once the Azure DevOps build pipeline completes. All the tasks in this pipeline runs on Azure ML Compute created earlier. Following are the tasks in this pipeline: +2. 
Once the Azure DevOps build pipeline is triggered, it performs code quality checks, data sanity tests, unit tests, builds an [Azure ML Pipeline](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-ml-pipelines) and publishes it in an [Azure ML Service Workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace). +3. The [Azure ML Pipeline](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-ml-pipelines) is triggered once the Azure DevOps build pipeline completes. All the tasks in this pipeline runs on Azure ML Compute. Following are the tasks in this pipeline: - **Train Model** task executes model training script on Azure ML Compute. It outputs a [model](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-azure-machine-learning-architecture#model) file which is stored in the [run history](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-azure-machine-learning-architecture#run). @@ -50,16 +44,8 @@ This reference architecture shows how to implement continuous integration (CI), Once you have registered your ML model, you can use Azure ML + Azure DevOps to deploy it. -The **Package Model** task packages the new model along with the scoring file and its python dependencies into a [docker image](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-azure-machine-learning-architecture#image) and pushes it to [Azure Container Registry](https://docs.microsoft.com/en-us/azure/container-registry/container-registry-intro). This image is used to deploy the model as [web service](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-azure-machine-learning-architecture#web-service). - -The **Deploy Model** task handles deploying your Azure ML model to the cloud (ACI or AKS). -This pipeline deploys the model scoring image into Staging/QA and PROD environments. 
- - In the Staging/QA environment, one task creates an [Azure Container Instance](https://docs.microsoft.com/en-us/azure/container-instances/container-instances-overview) and deploys the scoring image as a [web service](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-azure-machine-learning-architecture#web-service) on it. - -The second task invokes the web service by calling its REST endpoint with dummy data. +[Azure DevOps release pipeline](https://docs.microsoft.com/en-us/azure/devops/pipelines/release/?view=azure-devops) packages the new model along with the scoring file and its python dependencies into a [docker image](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-azure-machine-learning-architecture#image) and pushes it to [Azure Container Registry](https://docs.microsoft.com/en-us/azure/container-registry/container-registry-intro). This image is used to deploy the model as [web service](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-azure-machine-learning-architecture#web-service) across QA and Prod environments. The QA environment is running on top of [Azure Container Instances (ACI)](https://azure.microsoft.com/en-us/services/container-instances/) and the Prod environment is built with [Azure Kubernetes Service (AKS)](https://docs.microsoft.com/en-us/azure/aks/intro-kubernetes). -5. The deployment in production is a [gated release](https://docs.microsoft.com/en-us/azure/devops/pipelines/release/approvals/gates?view=azure-devops). This means that once the model web service deployment in the Staging/QA environment is successful, a notification is sent to approvers to manually review and approve the release. Once the release is approved, the model scoring web service is deployed to [Azure Kubernetes Service(AKS)](https://docs.microsoft.com/en-us/azure/aks/intro-kubernetes) and the deployment is tested.
### Repo Details diff --git a/docs/code_description.md b/docs/code_description.md index 45fb7bb7..ef131408 100644 --- a/docs/code_description.md +++ b/docs/code_description.md @@ -2,59 +2,37 @@ ### Environment Setup -- requirements.txt : It consist of list of python packages which are needed by the train.py to run successfully on host agent (locally). +- `environment_setup/requirements.txt` : It consists of a list of python packages which are needed by the train.py to run successfully on host agent (locally). -- install_requirements.sh : This script prepare the python environment i.e. install the Azure ML SDK and the packages specified in requirements.txt +- `environment_setup/install_requirements.sh` : This script prepares the python environment i.e. installs the Azure ML SDK and the packages specified in requirements.txt -### Config Files -All the scripts inside the ./aml_config are config files. These are the files where you need to provide details about the subscription, resource group, workspace, conda dependencies, remote vm, AKS etc. +- `environment_setup/iac-*.yml, arm-templates` : Infrastructure as Code pipelines to create and delete required resources along with corresponding arm-templates. -- config.json : This is a mandatory config file. Provide the subscription id, resource group name, workspace name and location where you want to create Azure ML services workspace. If you have already created the workspace, provide the existing workspace details in here. +- `environment_setup/Dockerfile` : Dockerfile of a building agent containing Python 3.6 and all required packages. -- conda_dependencies.yml : This is a mandatory file. This files contains the list of dependencies which are needed by the training/scoring script to run. This file is used to prepare environment for the local run(user managed/system managed) and docker run(local/remote).
+- `environment_setup/docker-image-pipeline.yml` : An AzDo pipeline building and pushing [microsoft/mlopspython](https://hub.docker.com/_/microsoft-mlops-python) image. -- security_config.json : This file contains the credentials to the remove vm where we want to train the model. This config is used by the script 02-AttachTrainingVM.py to attach remote vm as a compute to the workspace. Attaching remote vm to workspace is one time operation. It is recommended not to publish this file with credentials populated in it. You can put the credentials, run the 02-AttachTrainingVM.py manually and clear the credentials before pushing it to git. +### Pipelines -- aks_webservice.json : This is an optional config. If you already have an AKS attached to your workspace, then provide the details in this file. If not, you do not have to check in this file to git. +- `.pipelines/azdo-base-pipeline.yml` : a pipeline template used by ci-build-train pipeline and pr-build-train pipelines. It contains steps performing linting, data and unit testing. +- `.pipelines/azdo-ci-build-train.yml` : a pipeline triggered when the code is merged into **master**. It performs linting, data integrity testing, unit testing, building and publishing an ML pipeline. +- `.pipelines/azdo-pr-build-train.yml` : a pipeline triggered when a **pull request** to the **master** branch is created. It performs linting, data integrity testing and unit testing only. -### Build Pipeline Scripts +### ML Services -The script under ./aml_service are used in build pipeline. All the scripts starting with 0 are the one time run scripts. These are the scripts which need to be run only once. There is no harm of running these scripts every time in build pipeline. +- `ml_service/pipelines/build_train_pipeline.py` : builds and publishes an ML training pipeline. +- `ml_service/pipelines/run_train_pipeline.py` : invokes a published ML training pipeline via REST API.
+- `ml_service/util` : contains common utility functions used to build and publish an ML training pipeline. -- 00-WorkSpace.py : This is a onetime run script. It reads the workspace details from ./aml_config/config.json file and create (if workspace not available) or get (existing workspace). +### Code -- 01-Experiment.py : This is a onetime run script. It registers the root directory as project. It is not included as a step in build pipeline. +- `code/training/train.py` : a training step of an ML training pipeline. +- `code/evaluate/evaluate_model.py` : an evaluating step of an ML training pipeline. +- `code/evaluate/register_model.py` : registers a new trained model if evaluation shows the new model is more performant than the previous one. -- 02-AttachTrainingVM.py : This is a onetime run script. It attaches a remote VM to the workspace. It reads the config from ./aml_config/security_config.json. It is not included as a step in build pipeline. +### Scoring +- code/scoring/score.py : a scoring script which is about to be packed into a Docker Image along with a model while being deployed to QA/Prod environment. +- code/scoring/conda_dependencies.yml : contains a list of dependencies required by score.py to be installed in a deployable Docker Image +- code/scoring/inference_config.yml, deployment_config_aci.yml, deployment_config_aks.yml : configuration files for the [AML Model Deploy](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.private-vss-services-azureml&ssr=false#overview) pipeline task for ACI and AKS deployment targets. -- 10-TrainOnLocal.py : This scripts triggers the run of ./training/train.py script on the local compute(Host agent in case of build pipeline). If you are training on remote vm, you do not need this script in build pipeline. All the training scripts (1x) generates an output file aml_config/run_id.json which records the run_id and run history name of the training run.
run_id.json is used by 20-RegisterModel.py to get the trained model. - -- 11-TrainOnLocalEnv.py : Its functionality is same as 10-TrainOnLocal.py, the only difference is that it creates a virtual environment on local compute and run training script on virtual env. - -- 12-TrainOnVM.py : As we want to train the model on remote VM, this script is included as a task in build pipeline. It submits the training job on remote vm. - -- 15.EvaluateModel.py : It gets the metrics of latest model trained and compares it with the model in production. If the production model still performs better, all below scripts are skipped. - -- 20-RegisterModel.py : It gets the run id from training steps output json and registers the model associated with that run along with tags. This scripts outputs a model.json file which contains model name and version. This script included as build task. - -- 30-CreateScoringImage.py : This takes the model details from last step, creates a scoring webservice docker image and publish the image to ACR. This script included as build task. It writes the image name and version to image.json file. - -### Deployment/Release Scripts -File under the directory ./aml_service starting with 5x and 6x are used in release pipeline. They are basically to deploy the docker image on AKS and ACI and publish webservice on them. - -- 50-deployOnAci.py : This script reads the image.json which is published as an artifact from build pipeline, create aci cluster and deploy the scoring web service on it. It writes the scoring service details to aci_webservice.json - -- 51-deployOnAks.py : This script reads the image.json which is published as an artifact from build pipeline, create aks cluster and deploy the scoring web service on it. If the aks_webservice.json file was checked in with existing aks details, it will update the existing webservice with new Image. 
It writes the scoring service details to aks_webservice.json - -- 60-AciWebServiceTest.py : Reads the ACI info from aci_webservice.json and test it with sample data. - -- 61-AksWebServiceTest.py : Reads the AKS info from aks_webservice.json and test it with sample data. - -### Training/Scoring Scripts - -- /code/training/train.py : This is the model training code. It uploads the model file to AML Service run id once the training is successful. This script is submitted as run job by all the 1x scripts. - -- /code/scoring/score.py : This is the score file used to create the webservice docker image. There is a conda_dependencies.yml in this directory which is exactly same as the one in aml_config. These two files are needed by the 30-CreateScoringImage.py scripts to be in same root directory while creating the image. - -**Note: In CICD Pipeline, please make sure that the working directory is the root directory of the repo.** diff --git a/docs/getting_started.md b/docs/getting_started.md index 2f0de306..994e220d 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -10,211 +10,205 @@ We use Azure DevOps for running our build(CI), retraining trigger and release (C If you already have Azure DevOps account, create a [new project](https://docs.microsoft.com/en-us/azure/devops/organizations/projects/create-project?view=azure-devops). -#### Enable Azure DevOps Preview -The steps below uses the latest DevOps features. Thus, please enable the feature **New YAML pipeline creation experience** by following the instructions [here](https://docs.microsoft.com/en-us/azure/devops/project/navigation/preview-features?view=azure-devops). -**Note:** Make sure you have the right permissions in Azure DevOps to do so. - -### 3. Create Service Principal to Login to Azure and create resources +### 3. 
Create Service Principal to Login to Azure To create service principal, register an application entity in Azure Active Directory (Azure AD) and grant it the Contributor or Owner role of the subscription or the resource group where the web service belongs to. See [how to create service principal](https://docs.microsoft.com/en-us/azure/active-directory/develop/howto-create-service-principal-portal) and assign permissions to manage Azure resource. -Please make note the following values after creating a service principal, we will need them in subsequent steps -- Azure subscription id (subscriptionid) -- Service principal username (spidentity)([application id](https://docs.microsoft.com/en-us/azure/active-directory/develop/howto-create-service-principal-portal#get-application-id-and-authentication-key)) -- Service principal password (spsecret) ([auth_key](https://docs.microsoft.com/en-us/azure/active-directory/develop/howto-create-service-principal-portal#get-application-id-and-authentication-key)) -- Service principal [tenant id](https://docs.microsoft.com/en-us/azure/active-directory/develop/howto-create-service-principal-portal#get-tenant-id) (sptenant) - -**Note:** You must have sufficient permissions to register an application with your Azure AD tenant, and assign the application to a role in your Azure subscription. Contact your subscription adminstator if you don't have the permissions. Normally a subscription admin can create a Service principal and can provide you the details. - -### 3(b). Configure local development environment variables +Please make note of the following values after creating a service principal, we will need them in subsequent steps +- Application (client) ID +- Directory (tenant) ID +- Application Secret -For local development, this project makes use of [python-dotenv](https://pypi.org/project/python-dotenv/). This pip package allows you to use a `.env` file to manage your environment variables at runtime. 
-The .env.example file is a template. To run this code locally, create a file in the root of this project titled `.env`, and add in the key-value pairs for each of the environment variables found in the `.env.example`, as well as any environment variables needed for your custom scripts that will run on the build agent. - -### 4. Store secret in Key Vault and link it as variable group in Azure DevOps to be used by piplines. -Our pipeline require the following variables to autheticate with Azure. -- spidentity -- spsecret -- sptenant -- subscriptionid +**Note:** You must have sufficient permissions to register an application with your Azure AD tenant, and assign the application to a role in your Azure subscription. Contact your subscription adminstator if you don't have the permissions. Normally a subscription admin can create a Service principal and can provide you the details. -We noted the value of these variables in previous steps. -**NOTE:** These values should be treated as secret as they allow access to your subscription. +### 4. Create a Variable Group We make use of variable group inside Azure DevOps to store variables and their values that we want to make available across multiple pipelines. You can either store the values directly in [Azure DevOps](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups?view=azure-devops&tabs=designer#create-a-variable-group) or connect to an Azure Key Vault in your subscription. Please refer to the documentation [here](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups?view=azure-devops&tabs=designer#create-a-variable-group) to learn more about how to create a variable group and [link](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/variable-groups?view=azure-devops&tabs=designer#use-a-variable-group) it to your pipeline. -Please name your variable group **``AzureKeyVaultSecrets``**, we are using this name within our build yaml file. 
- -Up until now you shouls have +Please name your variable group **``devopsforai-aml-vg``** as we are using this name within our build yaml file. + +The varibale group should contain the following variables: + +| Variable Name | Suggested Value | +| --- | --- | +| AML_COMPUTE_CLUSTER_CPU_SKU | STANDARD_DS2_V2 | +| AML_COMPUTE_CLUSTER_NAME | train-cluster | +| AML_WORKSPACE_NAME | mlops-AML-WS | +| BASE_NAME | mlops | +| EVALUATE_SCRIPT_PATH | evaluate/evaluate_model.py | +| EXPERIMENT_NAME | mlopspython | +| LOCATION | centralus | +| MODEL_NAME | sklearn_regression_model.pkl | +| REGISTER_SCRIPT_PATH | register/register_model.py | +| RESOURCE_GROUP | mlops-AML-RG | +| SOURCES_DIR_TRAIN | code | +| SP_APP_ID | | +| SP_APP_SECRET | | +| SUBSCRIPTION_ID | | +| TENANT_ID | | +| TRAIN_SCRIPT_PATH | training/train.py | + +Mark **SP_APP_SECRET** variable as a secret one. + +Make sure to select the **Allow access to all pipelines** checkbox in the variable group configuration. + +Up until now you should have: - Forked (or cloned) the repo - Created a devops account or use an existing one - Got service principal details and subscription id -- Set them as variable group within devops +- A variable group with all configuration values -We now have 3 pipelines that we would set up -- **Build Pipeline (azure-pipelines.yml)**: Runs tests and sets up infrastructure -- **Retraining trigger pipeline(/template/retraining-template.json)**: This pipeline triggers Azure ML Pipeline (training/retraining) which trains a new model and publishes model image, if new model performs better -- **Release pipeline(/template/release-template.json)**: This pipeline deploys and tests model image as web service in QA and Prod environment +### 5. Create resources +The easiest way to create all required resources (Resource Group, ML Workspace, Container Registry, Storage Account, etc.) 
is to leverage an "Infrastructure as Code" [pipeline coming in this repository](../environment_setup/iac-create-environment.yml). This **IaC** pipeline takes care of all required resources basing on these [ARM templates](../environment_setup/arm-templates/cloud-environment.json). The pipeline requires an **Azure Resource Manager** service connection: +![create service connection](./images/create-rm-service-connection.png) -### 5. Set up Build Pipeline -1. Select your devops organization and project by clicking dev.azure.com -2. Once you are in the right devops project, click Pipelines on the left hand menu and select Builds -3. Click **New pipeline** to create new pipeline - ![new build pipeline](./images/new-build-pipeline1.png) -4. On the Connect option page, select **GitHub** - ![build connnect step](./images/build-connect.png) - -5. On the Select option page, select the GitHub repository where you forked the code. -![select repo](./images/build-selectrepo.png) +Give the connection name **``AzureResourceConnection``** as it is referred by the pipeline definition. -6. Authorize Azure Pipelines to access your git account -![select repo](./images/Install_Azure_pipeline.png) +In your DevOps project create a build pipeline from your forked **GitHub** repository: -7. Since the repository contains azure-pipelines.yml at the root level, Azure DevOps recognizes it and auto imports it. Click **Run** and this will start the build pipeline. -![select repo](./images/build-createpipeline1.png) +![build connnect step](./images/build-connect.png) -8. Your build run would look similar to the following image -![select repo](./images/build-run.png) +Refer to an **Existing Azure Pipelines YAML file**: -Great, you now have the build pipeline setup, you can either manually trigger it or it gets automatically triggered everytime there is a change in the master branch. 
+![configure step](./images/select-iac-pipeline.png) +Having done that, run the pipeline: -**Note:** The build pipeline will perform basic test on the code and provision infrastructure on azure. This can take around 10 mins to complete. +![iac run](./images/run-iac-pipeline.png) -### 6. Set up Retraining trigger release pipeline +Check out created resources in the [Azure Portal](https://portal.azure.com): -**Note:** For setting up release pipelines, first download the [release-pipelines](../release-pipelines) to your local filesystem so you can import it. +![created resources](./images/created-resources.png) -**Also Note:** If this is the first time you are creating a release pipeline, you would see the following option, click on **New Pipeline** -![import release pipeline](./images/release-new-pipeline.png) +Alternatively, you can also use a [cleaning pipeline](../environment_setup/iac-remove-environment.yml) that removes resources created for this project or you can just delete a resource group in the [Azure Portal](https://portal.azure.com). -To enable the option to **Import release pipeline**, we must have atleast one release pipeline so let's create one with an empty job. -![import release pipeline](./images/release-empty-job.png) -On the next screen, click on **Save** and then click **Ok** to save the empty release pipeline. -![import release pipeline](./images/release-save-empty.png) +### 6. Set up Build Pipeline -**Steps** +In your [Azure DevOps](https://dev.azure.com) project create and run a new build pipeline referring to [azdo-ci-build-train.yml](../.pipelines/azdo-ci-build-train.yml) pipeline in your forked **GitHub** repository: -1. Select the Release tab from the menu on the left, then click the New dropdown on top and click on **Import Release pipeline** -![import release pipeline](./images/release-import.png) -1.
On the next screen, navigate to **release-pipelines** folder and select **retrainingtrigger.json** pipeline file, click import. You should now see the following screen. Under Stages click on the Retrain stage, where it shows the red error sign. -![release retraining triggger](./images/release-retrainingtrigger.png) +Name the pipeline **ci-build**. Once the pipeline is finished, explore the execution logs: - Click on agent job and then from the drop down for Agent Pool on the right side select **Hosted Ubuntu 1604** agent to execute your run and click **Save** button on top right. -![release retraining agent](./images/release-retrainingagent.png) +![ci build logs](./images/ci-build-logs.png) -1. We would now link the variable group we created earlier to this release pipeline. To do so click on the **Variables** tab, then click on **Variable** groups and then select **Link variable group** and select the variable group that we created in previous step and click **Link** followed by **Save** button. -![release retraining artifact](./images/release-link-vg.png) -1. We want the retraining pipeline to be triggered every time build pipeline is complete. To create this dependency, we will link the artifact from build pipeline as a trigger for retraining trigger release pipeline. To do so, click on the **pipeline** tab and then select **Add an artifact** option under Artifacts. -![release pipeline view](./images/release-retrainingpipeline.png) +and check out a published training pipeline in the **mlops-AML-WS** workspace in [Azure Portal](https://ms.portal.azure.com/): -1. This will open up a pop up window, on this screen: - - for source type, select **Build** - - for project, select your project in Azure DevOps that you created in previous steps. - - For Source select the source build pipeline.
If you have forked the git repo, the build pipeline may named ``yourgitusername.MLOpsPython`` - - In the Source alias, replace the auto-populated value with - **``DevOpsForAI``** - - Field **Default version** will get auto populated **Latest**, you can leave them as it is. - - Click on **Add**, and then **Save** the pipeline - ![release retraining artifact](./images/release-retrainingartifact.png) +![training pipeline](./images/training-pipeline.png) -1. Artifact is now added for retraining trigger pipeline, hit the **save** button on top right and then click **ok**. -1. To trigger this pipeline every time build pipeline executes, click on the lighting sign to enable the **Continous Deployment Trigger**, click **Save**. - ![release retraining artifact](./images/release-retrainingtrigger1.png) - -2. If you want to run this pipeline on a schedule, you can set one by clicking on **Schedule set** in Artifacts section. -![release retraining artifact](./images/release-retrainingartifactsuccess.png) +Great, you now have the build pipeline setup, you can either manually trigger it or it gets automatically triggered everytime there is a change in the master branch. The pipeline performs linting, unit testing, builds and publishes an **ML Training Pipeline** in an **ML Workspace** -1. For the first time, we will manually trigger this pipeline. - - Click Releases option on the left hand side and navigate to the release pipeline you just created. - ![release retraining artifact](./images/release-createarelease.png) - - Click **Create Release** - ![release create ](./images/release-create.png) - - On the next screen click on **Create** button, this creates a manual release for you. +### 7. Train the Model - **Note**: This release pipeline will call the published AML pipeline. The AML pipeline will train the model and package it into image. It will take around 10 mins to complete. The next steps need this pipeline to complete successfully. 
At this point, you can go to the Azure Portal AML WOrkspace resource created inside resource group "DevOps_AzureML_Demo" and click on the **Pipeline** tab to see the running pipeline. +The next step is to invoke the training pipeline created in the previous step. It can be done with a **Release Pipeline**: -### 7. Set up release (Deployment) pipeline +![invoke training pipeline](./images/invoke-training-pipeline.png) -**Note:** For setting up release pipelines, first download the [release-pipelines](../release-pipelines) to your local filesystem so you can import it. +An artifact of this pipeline will be the result of the build pipeline **ci-build**: -**Also Note:** Before creating this pipeline, make sure that the build pipeline, retraining trigger release pipeline and AML retraining pipeline have been executed, as they will be creating resources during their run like docker images that we will deploy as part of this pipeline. So it is important for them to have successful runs before the setup here. +![artifact invoke pipeline](./images/artifact-invoke-pipeline.png) -Let's set up the release deployment pipeline now. -1. As done in previous step, Select the Release tab from the menu on the left, then click the New dropdown on top and click on **Import Release pipeline** -![import release pipeline](./images/release-import.png) -1. On the next screen, navigate to **release-pipelines** folder and select **releasedeployment.json** pipeline file, click import. You should now see the following screen. Under Stages click on the QA environment's **view stage task", where it shows the red error sign.
-![release retraining triggger](./images/release-deployment.png) +![retrain pipeline vg](./images/retrain-pipeline-vg.png) - Click on agent job and then from the drop down for Agent Pool on the right side select **Hosted Ubuntu 1604** agent to execute your run and click **Save** button on top right. -![release retraining agent](./images/release-deploymentqaagent.png) +Add an empty stage with name **Invoke Training Pipeline** and make sure that the **Agent Specification** is **ubuntu-16.04**: - Follow the same steps for **Prod Environment** and select **Hosted Ubuntu 1604** for agent pool and save the pipeline. - ![release retraining agent](./images/release-deploymentprodagent.png) +![agent specification](./images/agent-specification.png) -1. We would now link the variable group we created earlier to this release pipeline. To do so click on the **Variables** tab, then click on **Variable** groups and then select **Link variable group** and select the variable group that we created in previous step and click **Link** followed by **Save** button. -![release retraining artifact](./images/release-link-vg.png) +Add a command line step **Run Training Pipeline** with the following script: -1. We now need to add artifact that will trigger this pipeline. We will add two artifacts: - - Build pipeline output as artifact since that contains our configuration and code files that we require in this pipeline. - - ACR artifact to trigger this pipeline everytime there is a new image that gets published to Azure container registry (ACR) as part of retraining pipeline. 
+```bash +docker run -v $(System.DefaultWorkingDirectory)/_ci-build/mlops-pipelines/ml_service/pipelines:/pipelines \ +-w=/pipelines -e MODEL_NAME=$MODEL_NAME -e EXPERIMENT_NAME=$EXPERIMENT_NAME \ +-e TENANT_ID=$TENANT_ID -e SP_APP_ID=$SP_APP_ID -e SP_APP_SECRET=$SP_APP_SECRET \ +mcr.microsoft.com/mlops/python:latest python run_train_pipeline.py +``` - Here are the steps to add build output as artifact +This release pipeline should be automatically triggered (continuous deployment) whenever a new **ML training pipeline** is published by the **AzDo builder pipeline**. It can also be triggered manually or configured to run on a scheduled basis. Create a new release to trigger the pipeline manually: - - Click on pipeline tab to go back to pipeline view and click **Add an artifact**. This will open a pop up window - - for source type, select **Build** - - for project, select your project in Azure DevOps that you created in previous steps. - - For Source select the source build pipeline. If you have forked the git repo, the build pipeline may named ``yourgitusername.DevOpsForAI`` - - In the Source alias, replace the auto-populated value with - **``DevOpsForAI``** - - Field **Devault version** will get auto populated **Latest**, you can leave them as it is. - - Click on **Add**, and then **Save** the pipeline - ![release retraining artifact](./images/release-retrainingartifact.png) +![create release](./images/create-release.png) - **Here are the steps to add [Azure ML Model as an artifact](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.vss-services-azureml)** +Once the release pipeline is completed, check out in the **ML Workspace** that the training pipeline is running: +![running training pipeline](./images/running-training-pipeline.png) - - Install the Azure Machine Learning extension for your DevOps organization from [here](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.vss-services-azureml). 
You need to have admin rights to install it. +The training pipeline will train, evaluate and register a new model. Wait until it is fininshed and make sure there is a new model in the **ML Workspace**: - - Create Service Connection - 1. Go to your DevOps project and click on Project settings on bottom left corner - 2. Under Project Settings -> Pipelines, click on Service connections, click on "New service connection" and select Azure Resource Manager - ![release retraining agent](./images/service-connection.png) - - 3. Provide following info and click Ok once done: - ![release retraining agent](./images/service-connection-add.png) - - - - Click on pipeline tab to go back to pipeline view and click **Add an artifact**. This will open a pop up window - - For Source type, click on **more artifact types** dropdown and select **AzureML Model Artifact** - - For **Service Endpoint**, select an existing endpoint **MLOpsPython**, if you don't see anything in the dropdown, click on **Manage** and [create new **Azure Resource Manager**](https://docs.microsoft.com/en-us/azure/devops/pipelines/library/service-endpoints?view=azure-devops#create-a-service-connection) service connection for your subscription. - ![release retraining agent](./images/model-artifact.png) - **Note:** You must have sufficient privileges to create a service connection, if not contact your subscription adminstrator. - - For Model Names, select **sklearn_regression_model.pkl**, this is the name of the newly trained model and if the previous pipelines executed properly you will see this model name in the drop down. - - For Default version, keep it to **Latest version** - - For Source alias, keep the default generated name. - - Click Add - - Click on lighting sign to enable the **Continous Deployment Trigger**, click **Save**. - ![release retraining artifact](./images/model-artifact-cd-trigger.png) +![trained model](./images/trained-model.png) + +Good! Now we have a trained model. + +### 8. 
Deploy the Model + +The final step is to deploy the model across environments with a release pipeline. There will be a **``QA``** environment running on [Azure Container Instances](https://azure.microsoft.com/en-us/services/container-instances/) and a **``Prod``** environment running on [Azure Kubernetes Service](https://azure.microsoft.com/en-us/services/kubernetes-service). + +![deploy model](./images/deploy-model.png) + + +This pipeline leverages the **Azure Machine Learning** extension that should be installed in your organization from the [marketplace](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.vss-services-azureml). +The pipeline consumes two artifacts: the result of the **Build Pipeline** as it contains configuration files and the **model** trained and registered by the ML training pipeline. -1. We now have QA environment continously deployed each time there is a new ml model registered in AML Model Management. You can select pre-deployment conditions for prod environment, normally you don't want it to be auto deployed, so select manual only trigger here. +Configuration of a code **_ci-build** artifact is similar to what we did in the previous chapter. - ![release retraining artifact](./images/release-deploymentprodtrigger.png) +In order to configure a model artifact there should be a service connection to **mlops-AML-WS** workspace: - To deploy a release manually, follow the document [here](https://docs.microsoft.com/en-us/azure/devops/pipelines/get-started-designer?view=azure-devops&tabs=new-nav#deploy-a-release) +![workspace connection](./images/workspace-connection.png) +Add an artifact to the pipeline and select **AzureML Model Artifact** source type. Select the **Service Endpoint** and **Model Names** from the drop down lists: -Congratulations, you now have three pipelines set up end to end. - - Build pipeline: triggered on code change to master branch on GitHub. 
- - Release Trigger pipeline: triggered on build pipeline execution and registers a new ML model to AML Model Management if better than previous one. - - Release Deployment pipeline: QA environment is auto triggered when there is a new model. - Prod is manual only and user decides when to release to this environment. +![model artifact](./images/model-artifact.png) + +Create a stage **QA (ACI)** and add a single task to the job **Azure ML Model Deploy**: + +![deploy aci](./images/deploy-aci.png) + +Specify task parameters as it is shown in the table below: + + +| Parameter | Value | +| --- | --- | +| Display Name | Azure ML Model Deploy | +| Azure ML Workspace | mlops-AML-WS | +| Inference config Path | `$(System.DefaultWorkingDirectory)/_ci-build/mlops-pipelines/code/scoring/inference_config.yml` | +| Model Deployment Target | Azure Container Instance | +| Deployment Name | mlopspython-aci | +| Deployment Configuration file | `$(System.DefaultWorkingDirectory)/_ci-build/mlops-pipelines/code/scoring/deployment_config_aci.yml` | +| Overwrite existing deployment | X | + + +In a similar way create a stage **Prod (AKS** and add a single task to the job **Azure ML Model Deploy**: + +![deploy aks](./images/deploy-aks.png) + +Specify task parameters as it is shown in the table below: + +| Parameter | Value | +| --- | --- | +| Display Name | Azure ML Model Deploy | +| Azure ML Workspace | mlops-AML-WS | +| Inference config Path | `$(System.DefaultWorkingDirectory)/_ci-build/mlops-pipelines/code/scoring/inference_config.yml` | +| Model Deployment Target | Azure Kubernetes Service | +| Select AKS Cluster for Deployment | YOUR_DEPLOYMENT_K8S_CLUSTER | +| Deployment Name | mlopspython-aks | +| Deployment Configuration file | `$(System.DefaultWorkingDirectory)/_ci-build/mlops-pipelines/code/scoring/deployment_config_aks.yml` | +| Overwrite existing deployment | X | + +**Note:** Creating of a Kubernetes cluster on AKS is out of scope of this tutorial, so you should take care of 
it on your own. + +Save the pipeline and craete a release to trigger it manually. Once the pipeline exection is finished, check out deployments in the **mlops-AML-WS** workspace. + + + +Congratulations! You have three pipelines set up end to end: + - Build pipeline: triggered on code change to master branch on GitHub, performs linting, unit testing and publishing a trainig pipeline + - Release Trigger pipeline: runs a published training pipeline to trian, evaluate and register a model + - Release Deployment pipeline: deploys a model to QA (ACI) and Prod (AKS) environemts + diff --git a/docs/images/agent-specification.png b/docs/images/agent-specification.png new file mode 100644 index 00000000..c71c3b68 Binary files /dev/null and b/docs/images/agent-specification.png differ diff --git a/docs/images/artifact-invoke-pipeline.png b/docs/images/artifact-invoke-pipeline.png new file mode 100644 index 00000000..2a6dcebf Binary files /dev/null and b/docs/images/artifact-invoke-pipeline.png differ diff --git a/docs/images/build-connect.png b/docs/images/build-connect.png index f5d9d61a..79553d80 100644 Binary files a/docs/images/build-connect.png and b/docs/images/build-connect.png differ diff --git a/docs/images/ci-build-logs.png b/docs/images/ci-build-logs.png new file mode 100644 index 00000000..726f70ac Binary files /dev/null and b/docs/images/ci-build-logs.png differ diff --git a/docs/images/ci-build-pipeline-configure.png b/docs/images/ci-build-pipeline-configure.png new file mode 100644 index 00000000..d593d1dc Binary files /dev/null and b/docs/images/ci-build-pipeline-configure.png differ diff --git a/docs/images/create-release.png b/docs/images/create-release.png new file mode 100644 index 00000000..15069b5d Binary files /dev/null and b/docs/images/create-release.png differ diff --git a/docs/images/create-rm-service-connection.png b/docs/images/create-rm-service-connection.png new file mode 100644 index 00000000..629d3c2a Binary files /dev/null and 
b/docs/images/create-rm-service-connection.png differ diff --git a/docs/images/created-resources.png b/docs/images/created-resources.png new file mode 100644 index 00000000..d5136ee8 Binary files /dev/null and b/docs/images/created-resources.png differ diff --git a/docs/images/deploy-aci.png b/docs/images/deploy-aci.png new file mode 100644 index 00000000..0270143b Binary files /dev/null and b/docs/images/deploy-aci.png differ diff --git a/docs/images/deploy-aks.png b/docs/images/deploy-aks.png new file mode 100644 index 00000000..96d83b8b Binary files /dev/null and b/docs/images/deploy-aks.png differ diff --git a/docs/images/deploy-model.png b/docs/images/deploy-model.png new file mode 100644 index 00000000..8a4cbd06 Binary files /dev/null and b/docs/images/deploy-model.png differ diff --git a/docs/images/invoke-training-pipeline.png b/docs/images/invoke-training-pipeline.png new file mode 100644 index 00000000..21619ae3 Binary files /dev/null and b/docs/images/invoke-training-pipeline.png differ diff --git a/docs/images/main-flow.png b/docs/images/main-flow.png new file mode 100644 index 00000000..a49f7440 Binary files /dev/null and b/docs/images/main-flow.png differ diff --git a/docs/images/model-artifact.png b/docs/images/model-artifact.png index 0681a556..b89390b4 100644 Binary files a/docs/images/model-artifact.png and b/docs/images/model-artifact.png differ diff --git a/docs/images/retrain-pipeline-vg.png b/docs/images/retrain-pipeline-vg.png new file mode 100644 index 00000000..4aa30e9f Binary files /dev/null and b/docs/images/retrain-pipeline-vg.png differ diff --git a/docs/images/run-iac-pipeline.png b/docs/images/run-iac-pipeline.png new file mode 100644 index 00000000..15771246 Binary files /dev/null and b/docs/images/run-iac-pipeline.png differ diff --git a/docs/images/running-training-pipeline.png b/docs/images/running-training-pipeline.png new file mode 100644 index 00000000..0d3af93e Binary files /dev/null and 
b/docs/images/running-training-pipeline.png differ diff --git a/docs/images/select-iac-pipeline.png b/docs/images/select-iac-pipeline.png new file mode 100644 index 00000000..e165ccc8 Binary files /dev/null and b/docs/images/select-iac-pipeline.png differ diff --git a/docs/images/trained-model.png b/docs/images/trained-model.png new file mode 100644 index 00000000..3753fd7d Binary files /dev/null and b/docs/images/trained-model.png differ diff --git a/docs/images/training-pipeline.png b/docs/images/training-pipeline.png new file mode 100644 index 00000000..cbdaf048 Binary files /dev/null and b/docs/images/training-pipeline.png differ diff --git a/docs/images/workspace-connection.png b/docs/images/workspace-connection.png new file mode 100644 index 00000000..570a724e Binary files /dev/null and b/docs/images/workspace-connection.png differ diff --git a/environment_setup/iac-create-environment.yml b/environment_setup/iac-create-environment.yml index af2c2fcd..57e5a106 100644 --- a/environment_setup/iac-create-environment.yml +++ b/environment_setup/iac-create-environment.yml @@ -10,21 +10,19 @@ pool: vmImage: 'ubuntu-latest' variables: - baseName: $[coalesce(variables['baseNameOverride'], 'mlops')] - location: $[coalesce(variables['locationOverride'], 'centralus')] - azuresub: $[coalesce(variables['azuresubOverride'], 'AzureResourceConnection')] +- group: devopsforai-aml-vg steps: - task: AzureResourceGroupDeployment@2 inputs: - azureSubscription: $(azuresub) + azureSubscription: 'AzureResourceConnection' action: 'Create Or Update Resource Group' - resourceGroupName: '$(baseName)-AML-RG' - location: $(location) + resourceGroupName: '$(RESOURCE_GROUP)' + location: $(LOCATION) templateLocation: 'Linked artifact' csmFile: '$(Build.SourcesDirectory)/environment_setup/arm-templates/cloud-environment.json' - overrideParameters: '-baseName $(baseName)' + overrideParameters: '-baseName $(BASE_NAME)' deploymentMode: 'Incremental' displayName: 'Deploy MLOps resources to Azure' 
diff --git a/environment_setup/iac-remove-environment.yml b/environment_setup/iac-remove-environment.yml index 152efa59..67626223 100644 --- a/environment_setup/iac-remove-environment.yml +++ b/environment_setup/iac-remove-environment.yml @@ -10,18 +10,16 @@ pool: vmImage: 'ubuntu-latest' variables: - baseName: $[coalesce(variables['baseNameOverride'], 'mlops')] - location: $[coalesce(variables['locationOverride'], 'centralus')] - azuresub: $[coalesce(variables['azuresubOverride'], 'AzureResourceConnection')] +- group: devopsforai-aml-vg steps: - task: AzureResourceGroupDeployment@2 inputs: - azureSubscription: $(azuresub) + azureSubscription: 'AzureResourceConnection' action: 'DeleteRG' - resourceGroupName: '$(baseName)-AML-RG' - location: $(location) + resourceGroupName: '$(RESOURCE_GROUP)' + location: $(LOCATION) displayName: 'Delete resources in Azure' \ No newline at end of file diff --git a/environment_setup/requirements.txt b/environment_setup/requirements.txt index 1a3c1785..8a086c4d 100644 --- a/environment_setup/requirements.txt +++ b/environment_setup/requirements.txt @@ -4,4 +4,5 @@ azureml>=0.2 azureml-sdk>=1.0 python-dotenv>=0.10.3 flake8 -flake8_formatter_junit_xml \ No newline at end of file +flake8_formatter_junit_xml +azure-cli==2.0.71 \ No newline at end of file