diff --git a/bootstrap/README.md b/bootstrap/README.md index d5db8a62..027512bf 100644 --- a/bootstrap/README.md +++ b/bootstrap/README.md @@ -4,7 +4,7 @@ To use this existing project structure and scripts for your new ML project, you Bootstrapping will prepare a directory structure for your project which includes: -* renaming files and folders from the base project name `diabetes` to your project name +* renaming files and folders from the base project name `diabetes_regression` to your project name * fixing imports and absolute path based on your project name * deleting and cleaning up some directories diff --git a/diabetes_regression/training/test_train.py b/diabetes_regression/training/test_train.py index d121ecbc..e1a79781 100644 --- a/diabetes_regression/training/test_train.py +++ b/diabetes_regression/training/test_train.py @@ -10,7 +10,7 @@ def test_train_model(): reg_model = train_model(data, {"alpha": 1.2}) preds = reg_model.predict([[1], [2]]) - np.testing.assert_equal(preds, [9.93939393939394, 9.03030303030303]) + np.testing.assert_almost_equal(preds, [9.93939393939394, 9.03030303030303]) def test_get_model_metrics(): diff --git a/docs/custom_model.md b/docs/custom_model.md index 6515bcbe..bce1fb8a 100644 --- a/docs/custom_model.md +++ b/docs/custom_model.md @@ -2,8 +2,8 @@ This document provides steps to follow when using this repository as a template to train models and deploy the models with real-time inference in Azure ML with your own scripts and data. -1. Follow the MLOpsPython [Getting Started](https://github.com/microsoft/MLOpsPython/blob/master/docs/getting_started.md) guide -1. Follow the MLOpsPython [bootstrap instructions](https://github.com/microsoft/MLOpsPython/blob/master/bootstrap/README.md) to create your project starting point +1. Follow the MLOpsPython [Getting Started](getting_started.md) guide +1. Follow the MLOpsPython [bootstrap instructions](../bootstrap/README.md) to create your project starting point 1. Configure training data 1. [If necessary] Convert your ML experimental code into production ready code 1. Replace the training code @@ -13,11 +13,13 @@ This document provides steps to follow when using this repository as a template ## Follow the Getting Started guide -Follow the [Getting Started](https://github.com/microsoft/MLOpsPython/blob/master/docs/getting_started.md) guide to set up the infrastructure and pipelines to execute MLOpsPython. +Follow the [Getting Started](getting_started.md) guide to set up the infrastructure and pipelines to execute MLOpsPython. + +Take a look at the [Repo Details](code_description.md) document for a description of the structure of this repository. ## Follow the Bootstrap instructions -The [Bootstrap from MLOpsPython repository](https://github.com/microsoft/MLOpsPython/blob/master/bootstrap/README.md) guide will help you to quickly prepare the repository for your project. +The [Bootstrap from MLOpsPython repository](../bootstrap/README.md) guide will help you to quickly prepare the repository for your project. **Note:** Since the bootstrap script will rename the `diabetes_regression` folder to the project name of your choice, we'll refer to your project as `[project name]` when paths are involved. diff --git a/experimentation/Diabetes Ridge Regression Training.ipynb b/experimentation/Diabetes Ridge Regression Training.ipynb index 5e507f96..fa192115 100644 --- a/experimentation/Diabetes Ridge Regression Training.ipynb +++ b/experimentation/Diabetes Ridge Regression Training.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -24,7 +24,8 @@ "from sklearn.linear_model import Ridge\n", "from sklearn.metrics import mean_squared_error\n", "from sklearn.model_selection import train_test_split\n", - "import joblib" + "import joblib\n", + "import pandas as pd" ] }, { @@ -36,16 +37,21 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "X, y = load_diabetes(return_X_y=True)" + "sample_data = load_diabetes()\n", + "\n", + "df = pd.DataFrame(\n", + " data=sample_data.data,\n", + " columns=sample_data.feature_names)\n", + "df['Y'] = sample_data.target" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -57,29 +63,12 @@ } ], "source": [ - "print(X.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(442,)\n" - ] - } - ], - "source": [ - "print(y.shape)" + "print(df.shape)" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -103,16 +92,17 @@ " \n", " \n", " \n", - " 0\n", - " 1\n", - " 2\n", - " 3\n", - " 4\n", - " 5\n", - " 6\n", - " 7\n", - " 8\n", - " 9\n", + " age\n", + " sex\n", + " bmi\n", + " bp\n", + " s1\n", + " s2\n", + " s3\n", + " s4\n", + " s5\n", + " s6\n", + " Y\n", " \n", " \n", " \n", @@ -128,19 +118,21 @@ " 4.420000e+02\n", " 4.420000e+02\n", " 4.420000e+02\n", + " 442.000000\n", " \n", " \n", " mean\n", - " -3.639623e-16\n", - " 1.309912e-16\n", - " -8.013951e-16\n", - " 1.289818e-16\n", - " -9.042540e-17\n", - " 1.301121e-16\n", - " -4.563971e-16\n", - " 3.863174e-16\n", - " -3.848103e-16\n", - " -3.398488e-16\n", + " -3.634285e-16\n", + " 1.308343e-16\n", + " -8.045349e-16\n", + " 1.281655e-16\n", + " -8.835316e-17\n", + " 1.327024e-16\n", + " -4.574646e-16\n", + " 3.777301e-16\n", + " -3.830854e-16\n", + " -3.412882e-16\n", + " 152.133484\n", " \n", " \n", " std\n", @@ -154,6 +146,7 @@ " 4.761905e-02\n", " 4.761905e-02\n", " 4.761905e-02\n", + " 77.093005\n", " \n", " \n", " min\n", @@ -167,6 +160,7 @@ " -7.639450e-02\n", " -1.260974e-01\n", " -1.377672e-01\n", + " 25.000000\n", " \n", " \n", " 25%\n", @@ -180,6 +174,7 @@ " -3.949338e-02\n", " -3.324879e-02\n", " -3.317903e-02\n", + " 87.000000\n", " \n", " \n", " 50%\n", @@ -193,6 +188,7 @@ " -2.592262e-03\n", " -1.947634e-03\n", " -1.077698e-03\n", + " 140.500000\n", " \n", " \n", " 75%\n", @@ -206,6 +202,7 @@ " 3.430886e-02\n", " 3.243323e-02\n", " 2.791705e-02\n", + " 211.500000\n", " \n", " \n", " max\n", @@ -219,15 +216,16 @@ " 1.852344e-01\n", " 1.335990e-01\n", " 1.356118e-01\n", + " 346.000000\n", " \n", " \n", "\n", "" ], "text/plain": [ - " 0 1 2 3 4 \\\n", + " age sex bmi bp s1 \\\n", "count 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 \n", - "mean -3.639623e-16 1.309912e-16 -8.013951e-16 1.289818e-16 -9.042540e-17 \n", + "mean -3.634285e-16 1.308343e-16 -8.045349e-16 1.281655e-16 -8.835316e-17 \n", "std 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 \n", "min -1.072256e-01 -4.464164e-02 -9.027530e-02 -1.123996e-01 -1.267807e-01 \n", "25% -3.729927e-02 -4.464164e-02 -3.422907e-02 -3.665645e-02 -3.424784e-02 \n", @@ -235,26 +233,35 @@ "75% 3.807591e-02 5.068012e-02 3.124802e-02 3.564384e-02 2.835801e-02 \n", "max 1.107267e-01 5.068012e-02 1.705552e-01 1.320442e-01 1.539137e-01 \n", "\n", - " 5 6 7 8 9 \n", - "count 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 \n", - "mean 1.301121e-16 -4.563971e-16 3.863174e-16 -3.848103e-16 -3.398488e-16 \n", - "std 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 \n", - "min -1.156131e-01 -1.023071e-01 -7.639450e-02 -1.260974e-01 -1.377672e-01 \n", - "25% -3.035840e-02 -3.511716e-02 -3.949338e-02 -3.324879e-02 -3.317903e-02 \n", - "50% -3.819065e-03 -6.584468e-03 -2.592262e-03 -1.947634e-03 -1.077698e-03 \n", - "75% 2.984439e-02 2.931150e-02 3.430886e-02 3.243323e-02 2.791705e-02 \n", - "max 1.987880e-01 1.811791e-01 1.852344e-01 1.335990e-01 1.356118e-01 " + " s2 s3 s4 s5 s6 \\\n", + "count 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 \n", + "mean 1.327024e-16 -4.574646e-16 3.777301e-16 -3.830854e-16 -3.412882e-16 \n", + "std 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 \n", + "min -1.156131e-01 -1.023071e-01 -7.639450e-02 -1.260974e-01 -1.377672e-01 \n", + "25% -3.035840e-02 -3.511716e-02 -3.949338e-02 -3.324879e-02 -3.317903e-02 \n", + "50% -3.819065e-03 -6.584468e-03 -2.592262e-03 -1.947634e-03 -1.077698e-03 \n", + "75% 2.984439e-02 2.931150e-02 3.430886e-02 3.243323e-02 2.791705e-02 \n", + "max 1.987880e-01 1.811791e-01 1.852344e-01 1.335990e-01 1.356118e-01 \n", + "\n", + " Y \n", + "count 442.000000 \n", + "mean 152.133484 \n", + "std 77.093005 \n", + "min 25.000000 \n", + "25% 87.000000 \n", + "50% 140.500000 \n", + "75% 211.500000 \n", + "max 346.000000 " ] }, - "execution_count": 8, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import pandas as pd\n", - "features = pd.DataFrame(X)\n", - "features.describe()" + "# All data in a single dataframe\n", + "df.describe()" ] }, { @@ -266,11 +273,15 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)\n", + "X = df.drop('Y', axis=1).values\n", + "y = df['Y'].values\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, random_state=0)\n", "data = {\"train\": {\"X\": X_train, \"y\": y_train},\n", " \"test\": {\"X\": X_test, \"y\": y_test}}" ] @@ -284,7 +295,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -294,16 +305,19 @@ " normalize=False, random_state=None, solver='auto', tol=0.001)" ] }, - "execution_count": 4, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "alpha = 0.5\n", + "# experiment parameters\n", + "args = {\n", + " \"alpha\": 0.5\n", + "}\n", "\n", - "reg = Ridge(alpha=alpha)\n", - "reg.fit(data[\"train\"][\"X\"], data[\"train\"][\"y\"])" + "reg_model = Ridge(**args)\n", + "reg_model.fit(data[\"train\"][\"X\"], data[\"train\"][\"y\"])" ] }, { @@ -315,20 +329,22 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "mse: 3298.9096058070622\n" + "{'mse': 3298.9096058070622}\n" ] } ], "source": [ - "preds = reg.predict(data[\"test\"][\"X\"])\n", - "print(\"mse: \", mean_squared_error(preds, y_test))" + "preds = reg_model.predict(data[\"test\"][\"X\"])\n", + "mse = mean_squared_error(preds, y_test)\n", + "metrics = {\"mse\": mse}\n", + "print(metrics)" ] }, { @@ -363,9 +379,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python (storedna)", + "display_name": "Python 3", "language": "python", - "name": "storedna" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -377,7 +393,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.7.4" } }, "nbformat": 4,