diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 00000000..528f30c7 --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 00000000..e69de29b diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 00000000..51973055 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/.github/workflows/train.yml b/.github/workflows/train.yml new file mode 100644 index 00000000..d4239944 --- /dev/null +++ b/.github/workflows/train.yml @@ -0,0 +1,18 @@ +name: Train Model +on: [push] +jobs: + train: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + - run: pip install -r requirements.txt + - run: python -m pytest tests/ + - run: python src/train.py || exit 1 + - run: ls -R mlruns/ + - uses: actions/upload-artifact@v4 + with: + name: model + path: mlruns/ \ No newline at end of file diff --git a/.gitignore b/.gitignore index 3ab04e2f..02ada6a8 100644 --- a/.gitignore +++ b/.gitignore @@ -3,88 +3,44 @@ __pycache__/ *.py[cod] *$py.class -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST -venv/ - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. 
-*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -.hypothesis/ -.pytest_cache/ -*-testresults.xml -test-output.xml - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# pyenv -.python-version - -# celery beat schedule file -celerybeat-schedule - -# SageMath parsed files -*.sage.py - -# Environments +# Project-specific ignores +/data/raw/ +/data/interim/ +/models/ +/mlruns/ +/mlartifacts/ + +# Exceptions - keep these directories (empty placeholder files will be tracked) +!/data/processed/ +!/data/processed/.gitkeep +!/models/.gitkeep +!/notebooks/ +!/notebooks/.gitkeep + +# Keep all source code +!/src/ +!/src/*.py + +# Keep documentation and configs +!*.md +!LICENSE +!README.md +!OUTLINE +!TIMELINE +!requirements.txt +!setup.py + +# DVC files (track these) +!.dvc/ +!data/.dvc +!models/.dvc + +# IDE and environment files +.vscode/ +.idea/ +*.swp +*.swo +*~ .env .venv env/ @@ -94,18 +50,36 @@ env.bak/ venv.bak/ *.vscode condaenv.* +pipenv +Pipfile* +poetry.lock -# Spyder project settings -.spyderproject -.spyproject +# Build and distribution files +build/ +dist/ +*.egg-info/ +*.egg +*.whl -# Rope project settings -.ropeproject +# Logs and debug files +*.log +logs/ +debug/ -# mkdocs documentation -/site +# Test and coverage reports +.coverage +htmlcov/ +.pytest_cache/ +test-results.xml -# mypy -.mypy_cache/ +# Jupyter +.ipynb_checkpoints/ +# OS-specific .DS_Store +Thumbs.db + +# Python cache +.mypy_cache/ +.python-version + diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 00000000..43ee6de4 --- /dev/null +++ b/data/.gitignore @@ -0,0 +1 @@ +/raw_data.csv diff --git 
"""Evidently-based monitoring: build a data-drift / classification-quality
report comparing current production data against a reference window."""
from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset
from evidently.metrics import ClassificationQualityMetric


import pandas as pd


def generate_report(current_data: pd.DataFrame,
                    reference_data: pd.DataFrame,
                    target_col: str = "target",
                    prediction_col: str = None):
    """Generate a data-drift and classification-quality report.

    Args:
        current_data: latest (production) data window.
        reference_data: baseline data, e.g. the training/validation set.
        target_col: name of the ground-truth label column.
        prediction_col: name of the model-prediction column. If None,
            evidently's default column name ("prediction") is assumed.
            NOTE(review): ClassificationQualityMetric needs a prediction
            column to exist under one of these names — confirm callers
            provide it.

    Returns:
        The evidently Report, already run; callers can use e.g.
        ``.as_dict()`` or ``.save_html()`` on it.
    """
    # BUG FIX: exclude the target (and prediction) columns from
    # numerical_features — the original listed *every* numeric column, so
    # the label itself was analysed as a drifting input feature.
    feature_cols = [
        col for col in current_data.select_dtypes(include='number').columns
        if col not in (target_col, prediction_col)
    ]

    mapping_kwargs = {
        "target": target_col,
        "numerical_features": feature_cols,
    }
    # Only override evidently's default prediction column when one is
    # explicitly given (passing prediction=None would *disable* it).
    if prediction_col is not None:
        mapping_kwargs["prediction"] = prediction_col
    column_mapping = ColumnMapping(**mapping_kwargs)

    report = Report(metrics=[
        DataDriftPreset(),
        ClassificationQualityMetric()
    ])

    report.run(
        current_data=current_data,
        reference_data=reference_data,
        column_mapping=column_mapping
    )

    return report
"""End-to-end training pipeline for the fraud-detection model.

Trains a RandomForest on the processed dataset, logs the run to a local
MLflow tracking store, registers the model, and promotes it through
Staging to Production when it outperforms the current "Champion".
"""
import mlflow
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, f1_score, log_loss,
                             precision_score, recall_score,
                             classification_report)
from mlflow.models.signature import infer_signature
from mlflow.tracking import MlflowClient
import logging
import json
from datetime import datetime

# Configure logging.
# NOTE: writes 'training.log' into the current working directory as an
# import-time side effect (kept for compatibility with existing runs).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('training.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Constants
BASE_DIR = Path(__file__).parent.parent
DATA_PATH = BASE_DIR / "data" / "processed" / "train_data.csv"
MODEL_NAME = "Fraud_Detection_Model"
# Minimum accuracy a new version must reach before it may be promoted.
VALIDATION_THRESHOLD = 0.9
CONFIG = {
    "data": {
        "test_size": 0.2,
        "random_state": 42,
        "target_col": "target"
    },
    "model": {
        "type": "RandomForestClassifier",
        "params": {
            "n_estimators": 150,
            "max_depth": 8,
            "min_samples_split": 2,
            "random_state": 42,
            "class_weight": "balanced"
        }
    }
}


def load_and_validate_data():
    """Load the training CSV, resolve the target column, run sanity checks
    and return a train/test split.

    Returns:
        (X_train, X_test, y_train, y_test) as produced by train_test_split.

    Raises:
        ValueError: if no target column can be found, or fewer than two
            classes are present in the labels.
    """
    data = pd.read_csv(DATA_PATH)

    # Resolve the target column: configured name first, then common
    # fallbacks if it is absent from the file.
    target_col = CONFIG['data'].get('target_col', 'target')
    if target_col not in data.columns:
        for col in ['target', 'label', 'class']:
            if col in data.columns:
                target_col = col
                break
        else:
            raise ValueError(f"No target column found in {DATA_PATH}")

    # Warn once on small datasets. (The original warned twice, with
    # inconsistent defaults of 10 and 100 — deduplicated here.)
    min_samples = CONFIG['data'].get('min_samples', 10)
    if len(data) < min_samples:
        logger.warning(f"Dataset has only {len(data)} samples (min {min_samples})")

    # BUG FIX: index with the *resolved* target_col. The original used
    # CONFIG['data']['target_col'], which raises KeyError when the key is
    # missing and reads the wrong column when a fallback name was used.
    class_counts = data[target_col].value_counts()
    if len(class_counts) < 2:
        raise ValueError(f"Need at least 2 classes, found {class_counts.index.tolist()}")

    return train_test_split(
        data.drop(target_col, axis=1),
        data[target_col],
        test_size=CONFIG['data'].get('test_size', 0.2),
        random_state=CONFIG['data'].get('random_state', 42)
    )


def evaluate_model(model, X_test, y_test):
    """Compute evaluation metrics for a fitted classifier.

    Returns:
        (metrics, y_pred): ``metrics`` is a flat dict of scalar scores plus
        the full per-class classification report; ``y_pred`` are the
        test-set predictions.
    """
    y_pred = model.predict(X_test)

    proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None

    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "f1_score": f1_score(y_test, y_pred, average='weighted'),
        "precision": precision_score(y_test, y_pred, average='weighted'),
        "recall": recall_score(y_test, y_pred, average='weighted'),
        "classification_report": report  # Keep the full report
    }

    # Flatten per-class report entries into scalar metrics.
    # (The original comprehension shadowed `v` in its inner loop.)
    class_metrics = {
        f"class_{label}_{metric_name}": value
        for label, class_report in report.items()
        if isinstance(class_report, dict)
        for metric_name, value in class_report.items()
    }
    metrics.update(class_metrics)

    if proba is not None and len(np.unique(y_test)) > 1:
        # BUG FIX: log_loss takes the positive-class column for binary
        # problems but the full probability matrix for multiclass; the
        # original always passed the last column.
        y_proba = proba[:, -1] if proba.shape[1] == 2 else proba
        metrics["log_loss"] = log_loss(y_test, y_proba, labels=model.classes_)

    return metrics, y_pred


def setup_mlflow():
    """Configure MLflow tracking against a local file store."""
    mlflow.set_tracking_uri(f"file:{str(BASE_DIR / 'mlruns')}")
    mlflow.set_experiment("Fraud_Detection")

    # Enable autologging.
    # NOTE(review): autolog(log_models=True) plus the explicit
    # mlflow.sklearn.log_model() in train_and_register logs the model
    # artifact twice; only the explicit one is registered — confirm this
    # duplication is intended.
    mlflow.sklearn.autolog(
        log_input_examples=True,
        log_model_signatures=True,
        log_models=True
    )


def register_and_promote_model(client, run_id, metrics):
    """Tag the newly registered version and promote it if it passes
    validation.

    Args:
        client: an MlflowClient.
        run_id: id of the training run (currently informational only; the
            newest "None"-stage version is assumed to belong to it).
        metrics: metrics dict from evaluate_model; must contain "accuracy".

    Raises:
        Re-raises any registry error after logging it.
    """
    try:
        # Get the newly created version.
        # NOTE(review): get_latest_versions(stages=...) is deprecated in
        # newer MLflow; matching the version by run_id would be more robust.
        new_version = client.get_latest_versions(MODEL_NAME, stages=["None"])[0]

        # Add comprehensive metadata.
        client.set_model_version_tag(
            name=MODEL_NAME,
            version=new_version.version,
            key="validation_status",
            value="Pending"
        )

        client.set_model_version_tag(
            name=MODEL_NAME,
            version=new_version.version,
            key="deployment_ready",
            value=str(metrics["accuracy"] >= VALIDATION_THRESHOLD).lower()
        )

        # Evaluate promotion criteria.
        if metrics["accuracy"] >= VALIDATION_THRESHOLD:
            promote_model(client, new_version, metrics)
        else:
            client.set_model_version_tag(
                name=MODEL_NAME,
                version=new_version.version,
                key="validation_status",
                value="Rejected"
            )
            logger.warning(f"Model accuracy {metrics['accuracy']:.2f} below threshold {VALIDATION_THRESHOLD}")

    except Exception as e:
        logger.error(f"Model registration failed: {str(e)}")
        raise


def promote_model(client, new_version, metrics):
    """Promote a validated version: Staging first, then Production if it
    beats the current Champion (or no Champion exists yet)."""
    try:
        # Transition to Staging and mark it as the Challenger.
        client.transition_model_version_stage(
            name=MODEL_NAME,
            version=new_version.version,
            stage="Staging"
        )
        client.set_registered_model_alias(
            name=MODEL_NAME,
            alias="Challenger",
            version=new_version.version
        )

        # Compare against the current Champion, if any.
        try:
            champion_version = client.get_model_version_by_alias(MODEL_NAME, "Champion")
            champion_run = client.get_run(champion_version.run_id)
            champion_metrics = champion_run.data.metrics

            if metrics["accuracy"] > champion_metrics["accuracy"]:
                # Archive old champion.
                client.transition_model_version_stage(
                    name=MODEL_NAME,
                    version=champion_version.version,
                    stage="Archived"
                )

                # Promote new champion.
                client.transition_model_version_stage(
                    name=MODEL_NAME,
                    version=new_version.version,
                    stage="Production"
                )
                client.set_registered_model_alias(
                    name=MODEL_NAME,
                    alias="Champion",
                    version=new_version.version
                )
                logger.info(f"New champion! Version {new_version.version} promoted to Production")

        except Exception as e:
            # First deployment — no champion yet; promote directly.
            logger.warning(f"No existing champion found: {str(e)}")
            client.transition_model_version_stage(
                name=MODEL_NAME,
                version=new_version.version,
                stage="Production"
            )
            client.set_registered_model_alias(
                name=MODEL_NAME,
                alias="Champion",
                version=new_version.version
            )

    except Exception as e:
        logger.error(f"Model promotion failed: {str(e)}")
        raise


def train_and_register():
    """End-to-end pipeline: setup tracking, load data, train, evaluate,
    log, register and (conditionally) promote the model."""
    try:
        # Setup tracking
        setup_mlflow()

        # Load data
        X_train, X_test, y_train, y_test = load_and_validate_data()

        # Train model
        with mlflow.start_run(run_name=f"challenger_{datetime.now().strftime('%Y%m%d_%H%M%S')}"):
            # Log config
            mlflow.log_dict(CONFIG, "config.json")

            # Initialize and train the model
            model = RandomForestClassifier(**CONFIG['model']['params'])
            model.fit(X_train, y_train)

            # Evaluate
            metrics, y_pred = evaluate_model(model, X_test, y_test)

            # Log scalar metrics only; the nested report goes out as text.
            mlflow.log_metrics({
                k: v for k, v in metrics.items()
                if not k.endswith('_report') and isinstance(v, (int, float))
            })

            if 'classification_report' in metrics:
                mlflow.log_text(
                    json.dumps(metrics['classification_report'], indent=2),
                    "classification_report.json"
                )

            # Log and register the model
            signature = infer_signature(X_train, y_pred)
            mlflow.sklearn.log_model(
                sk_model=model,
                artifact_path="model",
                signature=signature,
                input_example=X_train.iloc[:1],
                registered_model_name=MODEL_NAME
            )

            # Register metadata and promote if it passes validation
            client = MlflowClient()
            register_and_promote_model(client, mlflow.active_run().info.run_id, metrics)

    except Exception as e:
        logger.error(f"Training pipeline failed: {str(e)}", exc_info=True)
        raise


if __name__ == "__main__":
    train_and_register()
"""Tests for src.train (combined view of tests/conftest.py and
tests/test_train.py)."""
import pytest
import pandas as pd
import numpy as np
from copy import deepcopy

from src.train import CONFIG, load_and_validate_data


@pytest.fixture
def sample_data():
    """200-row frame matching the schema src.train expects."""
    return pd.DataFrame({
        'feature1': np.random.normal(0, 1, 200),
        'feature2': np.random.uniform(0, 1, 200),
        'target': np.random.randint(0, 2, 200)
    })


@pytest.fixture
def original_config():
    """Snapshot of CONFIG taken before the test body runs."""
    return deepcopy(CONFIG)


@pytest.fixture(autouse=True)
def restore_config(original_config):
    """Auto-restore CONFIG in place after each test (tests mutate it)."""
    yield
    CONFIG.clear()
    CONFIG.update(original_config)


def test_data_loading(sample_data, tmp_path, monkeypatch):
    """load_and_validate_data reads the patched CSV and returns a split."""
    # Save test data
    test_path = tmp_path / "test_data.csv"
    sample_data.to_csv(test_path, index=False)

    # Point the loader at the temporary CSV. load_and_validate_data reads
    # the module-global DATA_PATH at *call* time, so no re-import is needed
    # (the original imported the function a second time "AFTER patching",
    # which was a no-op). pd.read_csv accepts a Path directly.
    monkeypatch.setattr('src.train.DATA_PATH', test_path)
    CONFIG['data']['target_col'] = 'target'  # matches the fixture column
    CONFIG['data']['min_samples'] = 5        # lower threshold for tests

    X_train, X_test, y_train, y_test = load_and_validate_data()

    assert len(X_train) > 0
    assert len(y_test) > 0