From 2dea4774090cfe170c8053817553a752c4ab6f70 Mon Sep 17 00:00:00 2001 From: Rupesh Sigdel Date: Sun, 6 Apr 2025 00:34:12 +0545 Subject: [PATCH 1/6] Initialize DVC --- .dvc/.gitignore | 3 +++ .dvc/config | 0 .dvcignore | 3 +++ 3 files changed, 6 insertions(+) create mode 100644 .dvc/.gitignore create mode 100644 .dvc/config create mode 100644 .dvcignore diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 00000000..528f30c7 --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 00000000..e69de29b diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 00000000..51973055 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore From 42d584c5821850bfe8887163ccceaaa03de1230f Mon Sep 17 00:00:00 2001 From: Rupesh Sigdel Date: Sun, 6 Apr 2025 00:40:13 +0545 Subject: [PATCH 2/6] Track data and models with DVC --- data/.gitignore | 1 + models/.gitignore | 2 ++ 2 files changed, 3 insertions(+) create mode 100644 data/.gitignore create mode 100644 models/.gitignore diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 00000000..43ee6de4 --- /dev/null +++ b/data/.gitignore @@ -0,0 +1 @@ +/raw_data.csv diff --git a/models/.gitignore b/models/.gitignore new file mode 100644 index 00000000..5605a1d1 --- /dev/null +++ b/models/.gitignore @@ -0,0 +1,2 @@ +/model.pkldel.pkl +/model.pkl From 1fca8650b0a386253eb3889263d218d994436e6e Mon Sep 17 00:00:00 2001 From: Rupesh Sigdel Date: Sun, 6 Apr 2025 12:11:35 +0545 Subject: [PATCH 3/6] Initialize DVC and add project files --- .github/workflows/train.yml | 17 +++ .gitignore | 156 +++++++++------------ data/processed/train_data.csv | 3 + data/raw_data.csv.dvc | 5 + requirements.txt | 6 + src/monitor.py | 28 ++++ src/train.py | 249 ++++++++++++++++++++++++++++++++++ tests/test_train.py | 8 ++ 8 files changed, 381 insertions(+), 91 deletions(-) create mode 100644 .github/workflows/train.yml create mode 100644 data/processed/train_data.csv create mode 100644 data/raw_data.csv.dvc create mode 100644 requirements.txt create mode 100644 src/monitor.py create mode 100644 src/train.py create mode 100644 tests/test_train.py diff --git a/.github/workflows/train.yml b/.github/workflows/train.yml new file mode 100644 index 00000000..502feefb --- /dev/null +++ b/.github/workflows/train.yml @@ -0,0 +1,17 @@ +name: Train Model +on: [push] +jobs: + train: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + - run: pip install -r requirements.txt + - run: python -m pytest tests/ + - run: python src/train.py + - uses: actions/upload-artifact@v3 + with: + name: model + path: mlruns/ \ No newline at end of file diff --git a/.gitignore b/.gitignore index 3ab04e2f..02ada6a8 100644 --- a/.gitignore +++ b/.gitignore @@ -3,88 +3,44 @@ __pycache__/ *.py[cod] *$py.class -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST -venv/ - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -.hypothesis/ -.pytest_cache/ -*-testresults.xml -test-output.xml - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# pyenv -.python-version - -# celery beat schedule file -celerybeat-schedule - -# SageMath parsed files -*.sage.py - -# Environments +# Project-specific ignores +/data/raw/ +/data/interim/ +/models/ +/mlruns/ +/mlartifacts/ + +# Exceptions - keep these directories (empty placeholder files will be tracked) +!/data/processed/ +!/data/processed/.gitkeep +!/models/.gitkeep +!/notebooks/ +!/notebooks/.gitkeep + +# Keep all source code +!/src/ +!/src/*.py + +# Keep documentation and configs +!*.md +!LICENSE +!README.md +!OUTLINE +!TIMELINE +!requirements.txt +!setup.py + +# DVC files (track these) +!.dvc/ +!data/.dvc +!models/.dvc + +# IDE and environment files +.vscode/ +.idea/ +*.swp +*.swo +*~ .env .venv env/ @@ -94,18 +50,36 @@ env.bak/ venv.bak/ *.vscode condaenv.* +pipenv +Pipfile* +poetry.lock -# Spyder project settings -.spyderproject -.spyproject +# Build and distribution files +build/ +dist/ +*.egg-info/ +*.egg +*.whl -# Rope project settings -.ropeproject +# Logs and debug files +*.log +logs/ +debug/ -# mkdocs documentation -/site +# Test and coverage reports +.coverage +htmlcov/ +.pytest_cache/ +test-results.xml -# mypy -.mypy_cache/ +# Jupyter +.ipynb_checkpoints/ +# OS-specific .DS_Store +Thumbs.db + +# Python cache +.mypy_cache/ +.python-version + diff --git a/data/processed/train_data.csv b/data/processed/train_data.csv new file mode 100644 index 00000000..19620ce2 --- /dev/null +++ b/data/processed/train_data.csv @@ -0,0 +1,3 @@ +feature1,feature2,target +1.2,3.4,0 +5.6,7.8,1 diff --git a/data/raw_data.csv.dvc b/data/raw_data.csv.dvc new file mode 100644 index 00000000..1b7cdaf5 --- /dev/null +++ b/data/raw_data.csv.dvc @@ -0,0 +1,5 @@ +outs: +- md5: fd253016ea5e108529ecfa57303ea9dd + size: 48 + hash: md5 + path: raw_data.csv diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..ab9652f1 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +mlflow +scikit-learn +evidently +pandas +dvc +pytest \ No newline at end of file diff --git a/src/monitor.py b/src/monitor.py new file mode 100644 index 00000000..d8a1465a --- /dev/null +++ b/src/monitor.py @@ -0,0 +1,28 @@ +from evidently import ColumnMapping +from evidently.report import Report +from evidently.metric_preset import DataDriftPreset +from evidently.metrics import ClassificationQualityMetric +import pandas as pd +import logging + +def generate_report(current_data: pd.DataFrame, + reference_data: pd.DataFrame, + target_col: str = "target"): + """Generate data drift and quality report""" + column_mapping = ColumnMapping( + target=target_col, + numerical_features=current_data.select_dtypes(include='number').columns.tolist() + ) + + report = Report(metrics=[ + DataDriftPreset(), + ClassificationQualityMetric() + ]) + + report.run( + current_data=current_data, + reference_data=reference_data, + column_mapping=column_mapping + ) + + return report \ No newline at end of file diff --git a/src/train.py b/src/train.py new file mode 100644 index 00000000..5b690b05 --- /dev/null +++ b/src/train.py @@ -0,0 +1,249 @@ +import mlflow +import pandas as pd +from pathlib import Path +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import train_test_split +from sklearn.metrics import (accuracy_score, f1_score, + precision_score, recall_score, + classification_report) +from mlflow.models.signature import infer_signature +from mlflow.tracking import MlflowClient +import logging +import json +from datetime import datetime + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler('training.log'), + logging.StreamHandler() + ] +) +logger = logging.getLogger(__name__) + +# Constants +BASE_DIR = Path(__file__).parent.parent +DATA_PATH = BASE_DIR / "data" / "processed" / "train_data.csv" +MODEL_NAME = "Fraud_Detection_Model" +VALIDATION_THRESHOLD = 0.9 +CONFIG = { + "data": { + "test_size": 0.2, + "random_state": 42, + "target_col": "target" + }, + "model": { + "type": "RandomForestClassifier", + "params": { + "n_estimators": 150, + "max_depth": 8, + "min_samples_split": 2, + "random_state": 42, + "class_weight": "balanced" + } + } +} + +def load_and_validate_data(): + """Load and validate input data""" + logger.info(f"Loading data from {DATA_PATH}") + try: + data = pd.read_csv(DATA_PATH) + + # Validate data + assert CONFIG['data']['target_col'] in data.columns, \ + f"Target column {CONFIG['data']['target_col']} not found" + assert len(data) > 100, "Insufficient data samples" + + X = data.drop(CONFIG['data']['target_col'], axis=1) + y = data[CONFIG['data']['target_col']] + + return train_test_split( + X, y, + test_size=CONFIG['data']['test_size'], + random_state=CONFIG['data']['random_state'], + stratify=y + ) + except Exception as e: + logger.error(f"Data loading failed: {str(e)}") + raise + +def evaluate_model(model, X_test, y_test): + """Comprehensive model evaluation""" + y_pred = model.predict(X_test) + y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None + + metrics = { + "accuracy": accuracy_score(y_test, y_pred), + "f1_score": f1_score(y_test, y_pred, average='weighted'), + "precision": precision_score(y_test, y_pred, average='weighted'), + "recall": recall_score(y_test, y_pred, average='weighted'), + "classification_report": classification_report(y_test, y_pred, output_dict=True) + } + + # Log class distribution + metrics["class_distribution"] = dict(pd.Series(y_test).value_counts()) + + return metrics, y_pred + +def setup_mlflow(): + """Configure MLflow tracking""" + mlflow.set_tracking_uri(f"file:{str(BASE_DIR / 'mlruns')}") + mlflow.set_experiment("Fraud_Detection") + + # Enable autologging + mlflow.sklearn.autolog( + log_input_examples=True, + log_model_signatures=True, + log_models=True + ) + +def register_and_promote_model(client, run_id, metrics): + """Handle model versioning and promotion""" + try: + # Get the newly created version + new_version = client.get_latest_versions(MODEL_NAME, stages=["None"])[0] + + # Add comprehensive metadata + client.set_model_version_tag( + name=MODEL_NAME, + version=new_version.version, + key="validation_status", + value="Pending" + ) + + client.set_model_version_tag( + name=MODEL_NAME, + version=new_version.version, + key="deployment_ready", + value=str(metrics["accuracy"] >= VALIDATION_THRESHOLD).lower() + ) + + # Evaluate promotion criteria + if metrics["accuracy"] >= VALIDATION_THRESHOLD: + promote_model(client, new_version, metrics) + else: + client.set_model_version_tag( + name=MODEL_NAME, + version=new_version.version, + key="validation_status", + value="Rejected" + ) + logger.warning(f"Model accuracy {metrics['accuracy']:.2f} below threshold {VALIDATION_THRESHOLD}") + + except Exception as e: + logger.error(f"Model registration failed: {str(e)}") + raise + +def promote_model(client, new_version, metrics): + """Promote model through staging to production""" + try: + # Transition to Staging + client.transition_model_version_stage( + name=MODEL_NAME, + version=new_version.version, + stage="Staging" + ) + client.set_registered_model_alias( + name=MODEL_NAME, + alias="Challenger", + version=new_version.version + ) + + # Check against current champion + try: + champion_version = client.get_model_version_by_alias(MODEL_NAME, "Champion") + champion_run = client.get_run(champion_version.run_id) + champion_metrics = champion_run.data.metrics + + if metrics["accuracy"] > champion_metrics["accuracy"]: + # Archive old champion + client.transition_model_version_stage( + name=MODEL_NAME, + version=champion_version.version, + stage="Archived" + ) + + # Promote new champion + client.transition_model_version_stage( + name=MODEL_NAME, + version=new_version.version, + stage="Production" + ) + client.set_registered_model_alias( + name=MODEL_NAME, + alias="Champion", + version=new_version.version + ) + logger.info(f"New champion! Version {new_version.version} promoted to Production") + + except Exception as e: + logger.warning(f"No existing champion found: {str(e)}") + # First deployment - promote directly to Production + client.transition_model_version_stage( + name=MODEL_NAME, + version=new_version.version, + stage="Production" + ) + client.set_registered_model_alias( + name=MODEL_NAME, + alias="Champion", + version=new_version.version + ) + + except Exception as e: + logger.error(f"Model promotion failed: {str(e)}") + raise + +def train_and_register(): + """End-to-end training and registration pipeline""" + try: + # Setup tracking + setup_mlflow() + + # Load data + X_train, X_test, y_train, y_test = load_and_validate_data() + + # Train model + with mlflow.start_run(run_name=f"challenger_{datetime.now().strftime('%Y%m%d_%H%M%S')}"): + # Log config + mlflow.log_dict(CONFIG, "config.json") + + # Initialize model + model = RandomForestClassifier(**CONFIG['model']['params']) + + # Train + model.fit(X_train, y_train) + + # Evaluate + metrics, y_pred = evaluate_model(model, X_test, y_test) + + # Log metrics + mlflow.log_metrics(metrics) + mlflow.log_text( + json.dumps(metrics['classification_report'], indent=2), + "classification_report.json" + ) + + # Log model + signature = infer_signature(X_train, y_pred) + mlflow.sklearn.log_model( + sk_model=model, + artifact_path="model", + signature=signature, + input_example=X_train.iloc[:1], + registered_model_name=MODEL_NAME + ) + + # Register and promote + client = MlflowClient() + register_and_promote_model(client, mlflow.active_run().info.run_id, metrics) + + except Exception as e: + logger.error(f"Training pipeline failed: {str(e)}", exc_info=True) + raise + +if __name__ == "__main__": + train_and_register() \ No newline at end of file diff --git a/tests/test_train.py b/tests/test_train.py new file mode 100644 index 00000000..3d5966af --- /dev/null +++ b/tests/test_train.py @@ -0,0 +1,8 @@ +# tests/test_train.py +import pytest +from train import load_and_validate_data + +def test_data_loading(): + X_train, X_test, y_train, y_test = load_and_validate_data() + assert len(X_train) > 0 + assert len(y_test) > 0 \ No newline at end of file From 87241cc105701ceb3113a62c865259dd40c0cd41 Mon Sep 17 00:00:00 2001 From: Rupesh Sigdel Date: Sun, 6 Apr 2025 23:05:43 +0545 Subject: [PATCH 4/6] Fix workflow: add DVC, debug outputs --- .github/workflows/train.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/train.yml b/.github/workflows/train.yml index 502feefb..d4239944 100644 --- a/.github/workflows/train.yml +++ b/.github/workflows/train.yml @@ -10,8 +10,9 @@ jobs: python-version: '3.10' - run: pip install -r requirements.txt - run: python -m pytest tests/ - - run: python src/train.py - - uses: actions/upload-artifact@v3 + - run: python src/train.py || exit 1 + - run: ls -R mlruns/ + - uses: actions/upload-artifact@v4 with: name: model path: mlruns/ \ No newline at end of file From 678e7106f64eed61bc8f461d1d3fdd7b569c7b86 Mon Sep 17 00:00:00 2001 From: Rupesh Sigdel Date: Sun, 6 Apr 2025 23:54:43 +0545 Subject: [PATCH 5/6] Update DVC tracking and add test infrastructure --- setup.py | 7 +++++++ src/__init__.py | 0 src/train.py | 46 +++++++++++++++++++++++---------------------- tests/__init__.py | 0 tests/conftest.py | 28 +++++++++++++++++++++++++++ tests/test_train.py | 22 ++++++++++++++++++---- 6 files changed, 77 insertions(+), 26 deletions(-) create mode 100644 setup.py create mode 100644 src/__init__.py create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..cc19cfcf --- /dev/null +++ b/setup.py @@ -0,0 +1,7 @@ +from setuptools import setup, find_packages + +setup( + name="mlops", + version="0.1", + packages=find_packages(), +) \ No newline at end of file diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/train.py b/src/train.py index 5b690b05..8fd103e1 100644 --- a/src/train.py +++ b/src/train.py @@ -47,28 +47,30 @@ } def load_and_validate_data(): - """Load and validate input data""" - logger.info(f"Loading data from {DATA_PATH}") - try: - data = pd.read_csv(DATA_PATH) - - # Validate data - assert CONFIG['data']['target_col'] in data.columns, \ - f"Target column {CONFIG['data']['target_col']} not found" - assert len(data) > 100, "Insufficient data samples" - - X = data.drop(CONFIG['data']['target_col'], axis=1) - y = data[CONFIG['data']['target_col']] - - return train_test_split( - X, y, - test_size=CONFIG['data']['test_size'], - random_state=CONFIG['data']['random_state'], - stratify=y - ) - except Exception as e: - logger.error(f"Data loading failed: {str(e)}") - raise + data = pd.read_csv(DATA_PATH) + + # Check for any target column if not specified + target_col = CONFIG['data'].get('target_col', 'target') + if target_col not in data.columns: + # Try common target column names + for col in ['target', 'label', 'class']: + if col in data.columns: + target_col = col + break + else: + raise ValueError(f"No target column found in {DATA_PATH}") + + # Adjust sample size check + min_samples = CONFIG['data'].get('min_samples', 10) + if len(data) < min_samples: + logger.warning(f"Dataset has only {len(data)} samples (min {min_samples})") + + return train_test_split( + data.drop(target_col, axis=1), + data[target_col], + test_size=CONFIG['data'].get('test_size', 0.2), + random_state=CONFIG['data'].get('random_state', 42) + ) def evaluate_model(model, X_test, y_test): """Comprehensive model evaluation""" diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..a1290976 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,28 @@ +import pytest +import pandas as pd +import numpy as np +from copy import deepcopy +from src.train import CONFIG + +@pytest.fixture +def sample_data(): + """Generate test data with correct column names""" + return pd.DataFrame({ + 'feature1': np.random.normal(0, 1, 200), + 'feature2': np.random.uniform(0, 1, 200), + 'target': np.random.randint(0, 2, 200) + }) + +@pytest.fixture +def original_config(): + """Preserve original configuration""" + return deepcopy(CONFIG) + +@pytest.fixture(autouse=True) +def restore_config(original_config): + """Auto-restore config after each test""" + yield + CONFIG.clear() + CONFIG.update(original_config) + + diff --git a/tests/test_train.py b/tests/test_train.py index 3d5966af..4e7b6542 100644 --- a/tests/test_train.py +++ b/tests/test_train.py @@ -1,8 +1,22 @@ -# tests/test_train.py -import pytest -from train import load_and_validate_data +from src.train import load_and_validate_data -def test_data_loading(): +# tests/test_train.py +def test_data_loading(sample_data, tmp_path, monkeypatch): + # Save test data + test_path = tmp_path / "test_data.csv" + sample_data.to_csv(test_path, index=False) + + # Temporarily patch the configuration + from src.train import CONFIG, DATA_PATH + CONFIG['data']['target_col'] = 'target' # Must match your fixture column + CONFIG['data']['min_samples'] = 5 # Lower threshold for tests + + # Use monkeypatch to safely modify DATA_PATH + monkeypatch.setattr('src.train.DATA_PATH', str(test_path)) + + # Import AFTER patching + from src.train import load_and_validate_data X_train, X_test, y_train, y_test = load_and_validate_data() + assert len(X_train) > 0 assert len(y_test) > 0 \ No newline at end of file From cf37bc44599ef1b2c3a6a0146e41f3bdf7e9cfde Mon Sep 17 00:00:00 2001 From: Rupesh Sigdel Date: Mon, 7 Apr 2025 00:38:53 +0545 Subject: [PATCH 6/6] Improve model training with robust data validation and metric handling --- src/train.py | 49 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/src/train.py b/src/train.py index 8fd103e1..a8b96dfa 100644 --- a/src/train.py +++ b/src/train.py @@ -1,9 +1,10 @@ import mlflow +import numpy as np import pandas as pd from pathlib import Path from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split -from sklearn.metrics import (accuracy_score, f1_score, +from sklearn.metrics import (accuracy_score, f1_score, log_loss, precision_score, recall_score, classification_report) from mlflow.models.signature import infer_signature @@ -65,6 +66,13 @@ def load_and_validate_data(): if len(data) < min_samples: logger.warning(f"Dataset has only {len(data)} samples (min {min_samples})") + class_counts = data[CONFIG['data']['target_col']].value_counts() + if len(class_counts) < 2: + raise ValueError(f"Need at least 2 classes, found {class_counts.index.tolist()}") + + if len(data) < CONFIG['data'].get('min_samples', 100): + logger.warning(f"Dataset small ({len(data)} samples)") + return train_test_split( data.drop(target_col, axis=1), data[target_col], @@ -73,20 +81,34 @@ def load_and_validate_data(): ) def evaluate_model(model, X_test, y_test): - """Comprehensive model evaluation""" y_pred = model.predict(X_test) - y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None + + if hasattr(model, "predict_proba"): + proba = model.predict_proba(X_test) + y_proba = proba[:, -1] + else: + y_proba = None + + report = classification_report(y_test, y_pred, output_dict=True, zero_division=0) metrics = { "accuracy": accuracy_score(y_test, y_pred), "f1_score": f1_score(y_test, y_pred, average='weighted'), "precision": precision_score(y_test, y_pred, average='weighted'), "recall": recall_score(y_test, y_pred, average='weighted'), - "classification_report": classification_report(y_test, y_pred, output_dict=True) + "classification_report": report # Keep the full report } - # Log class distribution - metrics["class_distribution"] = dict(pd.Series(y_test).value_counts()) + class_metrics = { + f"class_{k}_{metric}": v + for k, v in report.items() + if isinstance(v, dict) + for metric, v in v.items() + } + metrics.update(class_metrics) + + if len(np.unique(y_test)) > 1 and y_proba is not None: + metrics["log_loss"] = log_loss(y_test, y_proba) return metrics, y_pred @@ -223,11 +245,16 @@ def train_and_register(): metrics, y_pred = evaluate_model(model, X_test, y_test) # Log metrics - mlflow.log_metrics(metrics) - mlflow.log_text( - json.dumps(metrics['classification_report'], indent=2), - "classification_report.json" - ) + mlflow.log_metrics({ + k: v for k, v in metrics.items() + if not k.endswith('_report') and isinstance(v, (int, float)) + }) + + if 'classification_report' in metrics: + mlflow.log_text( + json.dumps(metrics['classification_report'], indent=2), + "classification_report.json" + ) # Log model signature = infer_signature(X_train, y_pred)