From 2dea4774090cfe170c8053817553a752c4ab6f70 Mon Sep 17 00:00:00 2001
From: Rupesh Sigdel <rupeshcgdl2060@gmail.com>
Date: Sun, 6 Apr 2025 00:34:12 +0545
Subject: [PATCH 1/6] Initialize DVC

---
 .dvc/.gitignore | 3 +++
 .dvc/config     | 0
 .dvcignore      | 3 +++
 3 files changed, 6 insertions(+)
 create mode 100644 .dvc/.gitignore
 create mode 100644 .dvc/config
 create mode 100644 .dvcignore

diff --git a/.dvc/.gitignore b/.dvc/.gitignore
new file mode 100644
index 00000000..528f30c7
--- /dev/null
+++ b/.dvc/.gitignore
@@ -0,0 +1,3 @@
+/config.local
+/tmp
+/cache
diff --git a/.dvc/config b/.dvc/config
new file mode 100644
index 00000000..e69de29b
diff --git a/.dvcignore b/.dvcignore
new file mode 100644
index 00000000..51973055
--- /dev/null
+++ b/.dvcignore
@@ -0,0 +1,3 @@
+# Add patterns of files dvc should ignore, which could improve
+# the performance. Learn more at
+# https://dvc.org/doc/user-guide/dvcignore

From 42d584c5821850bfe8887163ccceaaa03de1230f Mon Sep 17 00:00:00 2001
From: Rupesh Sigdel <rupeshcgdl2060@gmail.com>
Date: Sun, 6 Apr 2025 00:40:13 +0545
Subject: [PATCH 2/6] Track data and models with DVC

---
 data/.gitignore   | 1 +
 models/.gitignore | 2 ++
 2 files changed, 3 insertions(+)
 create mode 100644 data/.gitignore
 create mode 100644 models/.gitignore

diff --git a/data/.gitignore b/data/.gitignore
new file mode 100644
index 00000000..43ee6de4
--- /dev/null
+++ b/data/.gitignore
@@ -0,0 +1 @@
+/raw_data.csv
diff --git a/models/.gitignore b/models/.gitignore
new file mode 100644
index 00000000..5605a1d1
--- /dev/null
+++ b/models/.gitignore
@@ -0,0 +1,2 @@
+/model.pkldel.pkl
+/model.pkl

From 1fca8650b0a386253eb3889263d218d994436e6e Mon Sep 17 00:00:00 2001
From: Rupesh Sigdel <rupeshcgdl2060@gmail.com>
Date: Sun, 6 Apr 2025 12:11:35 +0545
Subject: [PATCH 3/6] Initialize DVC and add project files

---
 .github/workflows/train.yml   |  17 +++
 .gitignore                    | 156 +++++++++------------
 data/processed/train_data.csv |   3 +
 data/raw_data.csv.dvc         |   5 +
 requirements.txt              |   6 +
 src/monitor.py                |  28 ++++
 src/train.py                  | 249 ++++++++++++++++++++++++++++++++++
 tests/test_train.py           |   8 ++
 8 files changed, 381 insertions(+), 91 deletions(-)
 create mode 100644 .github/workflows/train.yml
 create mode 100644 data/processed/train_data.csv
 create mode 100644 data/raw_data.csv.dvc
 create mode 100644 requirements.txt
 create mode 100644 src/monitor.py
 create mode 100644 src/train.py
 create mode 100644 tests/test_train.py

diff --git a/.github/workflows/train.yml b/.github/workflows/train.yml
new file mode 100644
index 00000000..502feefb
--- /dev/null
+++ b/.github/workflows/train.yml
@@ -0,0 +1,17 @@
+name: Train Model
+on: [push]
+jobs:
+  train:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+      - run: pip install -r requirements.txt
+      - run: python -m pytest tests/
+      - run: python src/train.py
+      - uses: actions/upload-artifact@v3
+        with:
+          name: model
+          path: mlruns/
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 3ab04e2f..02ada6a8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,88 +3,44 @@ __pycache__/
 *.py[cod]
 *$py.class
 
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-venv/
-
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-.hypothesis/
-.pytest_cache/
-*-testresults.xml
-test-output.xml
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# pyenv
-.python-version
-
-# celery beat schedule file
-celerybeat-schedule
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
+# Project-specific ignores
+/data/raw/
+/data/interim/
+/models/*
+/mlruns/
+/mlartifacts/
+
+# Exceptions - re-included placeholders (needs `/models/*` above: git cannot re-include under an excluded dir)
+!/data/processed/
+!/data/processed/.gitkeep
+!/models/.gitkeep
+!/notebooks/
+!/notebooks/.gitkeep
+
+# Keep all source code
+!/src/
+!/src/*.py
+
+# Keep documentation and configs
+!*.md
+!LICENSE
+!README.md
+!OUTLINE
+!TIMELINE
+!requirements.txt
+!setup.py
+
+# DVC files (track these): the .dvc/ config dir and the *.dvc pointer files
+!.dvc/
+!data/*.dvc
+!models/*.dvc
+
+# IDE and environment files
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
 .env
 .venv
 env/
@@ -94,18 +50,36 @@ env.bak/
 venv.bak/
 *.vscode
 condaenv.*
+pipenv
+Pipfile*
+poetry.lock
 
-# Spyder project settings
-.spyderproject
-.spyproject
+# Build and distribution files
+build/
+dist/
+*.egg-info/
+*.egg
+*.whl
 
-# Rope project settings
-.ropeproject
+# Logs and debug files
+*.log
+logs/
+debug/
 
-# mkdocs documentation
-/site
+# Test and coverage reports
+.coverage
+htmlcov/
+.pytest_cache/
+test-results.xml
 
-# mypy
-.mypy_cache/
+# Jupyter
+.ipynb_checkpoints/
 
+# OS-specific
 .DS_Store
+Thumbs.db
+
+# Python cache
+.mypy_cache/
+.python-version
+
diff --git a/data/processed/train_data.csv b/data/processed/train_data.csv
new file mode 100644
index 00000000..19620ce2
--- /dev/null
+++ b/data/processed/train_data.csv
@@ -0,0 +1,3 @@
+﻿feature1,feature2,target
+1.2,3.4,0
+5.6,7.8,1
diff --git a/data/raw_data.csv.dvc b/data/raw_data.csv.dvc
new file mode 100644
index 00000000..1b7cdaf5
--- /dev/null
+++ b/data/raw_data.csv.dvc
@@ -0,0 +1,5 @@
+outs:
+- md5: fd253016ea5e108529ecfa57303ea9dd
+  size: 48
+  hash: md5
+  path: raw_data.csv
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..ab9652f1
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+mlflow
+scikit-learn
+evidently
+pandas
+dvc
+pytest
\ No newline at end of file
diff --git a/src/monitor.py b/src/monitor.py
new file mode 100644
index 00000000..d8a1465a
--- /dev/null
+++ b/src/monitor.py
@@ -0,0 +1,28 @@
+from evidently import ColumnMapping
+from evidently.report import Report
+from evidently.metric_preset import DataDriftPreset
+from evidently.metrics import ClassificationQualityMetric
+import pandas as pd
+import logging
+
+def generate_report(current_data: pd.DataFrame, 
+                   reference_data: pd.DataFrame,
+                   target_col: str = "target"):
+    """Generate data drift and quality report"""
+    column_mapping = ColumnMapping(
+        target=target_col,
+        numerical_features=current_data.select_dtypes(include='number').columns.tolist()
+    )
+    
+    report = Report(metrics=[
+        DataDriftPreset(),
+        ClassificationQualityMetric()
+    ])
+    
+    report.run(
+        current_data=current_data,
+        reference_data=reference_data,
+        column_mapping=column_mapping
+    )
+    
+    return report
\ No newline at end of file
diff --git a/src/train.py b/src/train.py
new file mode 100644
index 00000000..5b690b05
--- /dev/null
+++ b/src/train.py
@@ -0,0 +1,249 @@
+import mlflow
+import pandas as pd
+from pathlib import Path
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import (accuracy_score, f1_score, 
+                           precision_score, recall_score, 
+                           classification_report)
+from mlflow.models.signature import infer_signature
+from mlflow.tracking import MlflowClient
+import logging
+import json
+from datetime import datetime
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler('training.log'),
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
+
+# Constants
+BASE_DIR = Path(__file__).parent.parent
+DATA_PATH = BASE_DIR / "data" / "processed" / "train_data.csv"
+MODEL_NAME = "Fraud_Detection_Model"
+VALIDATION_THRESHOLD = 0.9
+CONFIG = {
+    "data": {
+        "test_size": 0.2,
+        "random_state": 42,
+        "target_col": "target"
+    },
+    "model": {
+        "type": "RandomForestClassifier",
+        "params": {
+            "n_estimators": 150,
+            "max_depth": 8,
+            "min_samples_split": 2,
+            "random_state": 42,
+            "class_weight": "balanced"
+        }
+    }
+}
+
+def load_and_validate_data():
+    """Load and validate input data"""
+    logger.info(f"Loading data from {DATA_PATH}")
+    try:
+        data = pd.read_csv(DATA_PATH)
+        
+        # Validate data
+        assert CONFIG['data']['target_col'] in data.columns, \
+            f"Target column {CONFIG['data']['target_col']} not found"
+        assert len(data) > 100, "Insufficient data samples"
+        
+        X = data.drop(CONFIG['data']['target_col'], axis=1)
+        y = data[CONFIG['data']['target_col']]
+        
+        return train_test_split(
+            X, y,
+            test_size=CONFIG['data']['test_size'],
+            random_state=CONFIG['data']['random_state'],
+            stratify=y
+        )
+    except Exception as e:
+        logger.error(f"Data loading failed: {str(e)}")
+        raise
+
+def evaluate_model(model, X_test, y_test):
+    """Comprehensive model evaluation"""
+    y_pred = model.predict(X_test)
+    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
+    
+    metrics = {
+        "accuracy": accuracy_score(y_test, y_pred),
+        "f1_score": f1_score(y_test, y_pred, average='weighted'),
+        "precision": precision_score(y_test, y_pred, average='weighted'),
+        "recall": recall_score(y_test, y_pred, average='weighted'),
+        "classification_report": classification_report(y_test, y_pred, output_dict=True)
+    }
+    
+    # Log class distribution
+    metrics["class_distribution"] = dict(pd.Series(y_test).value_counts())
+    
+    return metrics, y_pred
+
+def setup_mlflow():
+    """Configure MLflow tracking"""
+    mlflow.set_tracking_uri(f"file:{str(BASE_DIR / 'mlruns')}")
+    mlflow.set_experiment("Fraud_Detection")
+    
+    # Enable autologging
+    mlflow.sklearn.autolog(
+        log_input_examples=True,
+        log_model_signatures=True,
+        log_models=True
+    )
+
+def register_and_promote_model(client, run_id, metrics):
+    """Handle model versioning and promotion"""
+    try:
+        # Get the newly created version
+        new_version = client.get_latest_versions(MODEL_NAME, stages=["None"])[0]
+        
+        # Add comprehensive metadata
+        client.set_model_version_tag(
+            name=MODEL_NAME,
+            version=new_version.version,
+            key="validation_status",
+            value="Pending"
+        )
+        
+        client.set_model_version_tag(
+            name=MODEL_NAME,
+            version=new_version.version,
+            key="deployment_ready",
+            value=str(metrics["accuracy"] >= VALIDATION_THRESHOLD).lower()
+        )
+        
+        # Evaluate promotion criteria
+        if metrics["accuracy"] >= VALIDATION_THRESHOLD:
+            promote_model(client, new_version, metrics)
+        else:
+            client.set_model_version_tag(
+                name=MODEL_NAME,
+                version=new_version.version,
+                key="validation_status",
+                value="Rejected"
+            )
+            logger.warning(f"Model accuracy {metrics['accuracy']:.2f} below threshold {VALIDATION_THRESHOLD}")
+            
+    except Exception as e:
+        logger.error(f"Model registration failed: {str(e)}")
+        raise
+
+def promote_model(client, new_version, metrics):
+    """Promote model through staging to production"""
+    try:
+        # Transition to Staging
+        client.transition_model_version_stage(
+            name=MODEL_NAME,
+            version=new_version.version,
+            stage="Staging"
+        )
+        client.set_registered_model_alias(
+            name=MODEL_NAME,
+            alias="Challenger",
+            version=new_version.version
+        )
+        
+        # Check against current champion
+        try:
+            champion_version = client.get_model_version_by_alias(MODEL_NAME, "Champion")
+            champion_run = client.get_run(champion_version.run_id)
+            champion_metrics = champion_run.data.metrics
+            
+            if metrics["accuracy"] > champion_metrics["accuracy"]:
+                # Archive old champion
+                client.transition_model_version_stage(
+                    name=MODEL_NAME,
+                    version=champion_version.version,
+                    stage="Archived"
+                )
+                
+                # Promote new champion
+                client.transition_model_version_stage(
+                    name=MODEL_NAME,
+                    version=new_version.version,
+                    stage="Production"
+                )
+                client.set_registered_model_alias(
+                    name=MODEL_NAME,
+                    alias="Champion",
+                    version=new_version.version
+                )
+                logger.info(f"New champion! Version {new_version.version} promoted to Production")
+                
+        except Exception as e:
+            logger.warning(f"No existing champion found: {str(e)}")
+            # First deployment - promote directly to Production
+            client.transition_model_version_stage(
+                name=MODEL_NAME,
+                version=new_version.version,
+                stage="Production"
+            )
+            client.set_registered_model_alias(
+                name=MODEL_NAME,
+                alias="Champion",
+                version=new_version.version
+            )
+            
+    except Exception as e:
+        logger.error(f"Model promotion failed: {str(e)}")
+        raise
+
+def train_and_register():
+    """End-to-end training and registration pipeline"""
+    try:
+        # Setup tracking
+        setup_mlflow()
+        
+        # Load data
+        X_train, X_test, y_train, y_test = load_and_validate_data()
+        
+        # Train model
+        with mlflow.start_run(run_name=f"challenger_{datetime.now().strftime('%Y%m%d_%H%M%S')}"):
+            # Log config
+            mlflow.log_dict(CONFIG, "config.json")
+            
+            # Initialize model
+            model = RandomForestClassifier(**CONFIG['model']['params'])
+            
+            # Train
+            model.fit(X_train, y_train)
+            
+            # Evaluate
+            metrics, y_pred = evaluate_model(model, X_test, y_test)
+            
+            # Log metrics
+            mlflow.log_metrics(metrics)
+            mlflow.log_text(
+                json.dumps(metrics['classification_report'], indent=2),
+                "classification_report.json"
+            )
+            
+            # Log model
+            signature = infer_signature(X_train, y_pred)
+            mlflow.sklearn.log_model(
+                sk_model=model,
+                artifact_path="model",
+                signature=signature,
+                input_example=X_train.iloc[:1],
+                registered_model_name=MODEL_NAME
+            )
+            
+            # Register and promote
+            client = MlflowClient()
+            register_and_promote_model(client, mlflow.active_run().info.run_id, metrics)
+            
+    except Exception as e:
+        logger.error(f"Training pipeline failed: {str(e)}", exc_info=True)
+        raise
+
+if __name__ == "__main__":
+    train_and_register()
\ No newline at end of file
diff --git a/tests/test_train.py b/tests/test_train.py
new file mode 100644
index 00000000..3d5966af
--- /dev/null
+++ b/tests/test_train.py
@@ -0,0 +1,8 @@
+# tests/test_train.py
+import pytest
+from train import load_and_validate_data
+
+def test_data_loading():
+    X_train, X_test, y_train, y_test = load_and_validate_data()
+    assert len(X_train) > 0
+    assert len(y_test) > 0
\ No newline at end of file

From 87241cc105701ceb3113a62c865259dd40c0cd41 Mon Sep 17 00:00:00 2001
From: Rupesh Sigdel <rupeshcgdl2060@gmail.com>
Date: Sun, 6 Apr 2025 23:05:43 +0545
Subject: [PATCH 4/6] Fix workflow: add DVC, debug outputs

---
 .github/workflows/train.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/train.yml b/.github/workflows/train.yml
index 502feefb..d4239944 100644
--- a/.github/workflows/train.yml
+++ b/.github/workflows/train.yml
@@ -10,8 +10,9 @@ jobs:
           python-version: '3.10'
       - run: pip install -r requirements.txt
       - run: python -m pytest tests/
-      - run: python src/train.py
-      - uses: actions/upload-artifact@v3
+      - run: python src/train.py || exit 1
+      - run: ls -R mlruns/ 
+      - uses: actions/upload-artifact@v4
         with:
           name: model
           path: mlruns/
\ No newline at end of file

From 678e7106f64eed61bc8f461d1d3fdd7b569c7b86 Mon Sep 17 00:00:00 2001
From: Rupesh Sigdel <rupeshcgdl2060@gmail.com>
Date: Sun, 6 Apr 2025 23:54:43 +0545
Subject: [PATCH 5/6] Update DVC tracking and add test infrastructure

---
 setup.py            |  7 +++++++
 src/__init__.py     |  0
 src/train.py        | 46 +++++++++++++++++++++++----------------------
 tests/__init__.py   |  0
 tests/conftest.py   | 28 +++++++++++++++++++++++++++
 tests/test_train.py | 22 ++++++++++++++++++----
 6 files changed, 77 insertions(+), 26 deletions(-)
 create mode 100644 setup.py
 create mode 100644 src/__init__.py
 create mode 100644 tests/__init__.py
 create mode 100644 tests/conftest.py

diff --git a/setup.py b/setup.py
new file mode 100644
index 00000000..cc19cfcf
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,7 @@
+from setuptools import setup, find_packages
+
+setup(
+    name="mlops",
+    version="0.1",
+    packages=find_packages(),
+)
\ No newline at end of file
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/train.py b/src/train.py
index 5b690b05..8fd103e1 100644
--- a/src/train.py
+++ b/src/train.py
@@ -47,28 +47,30 @@
 }
 
 def load_and_validate_data():
-    """Load and validate input data"""
-    logger.info(f"Loading data from {DATA_PATH}")
-    try:
-        data = pd.read_csv(DATA_PATH)
-        
-        # Validate data
-        assert CONFIG['data']['target_col'] in data.columns, \
-            f"Target column {CONFIG['data']['target_col']} not found"
-        assert len(data) > 100, "Insufficient data samples"
-        
-        X = data.drop(CONFIG['data']['target_col'], axis=1)
-        y = data[CONFIG['data']['target_col']]
-        
-        return train_test_split(
-            X, y,
-            test_size=CONFIG['data']['test_size'],
-            random_state=CONFIG['data']['random_state'],
-            stratify=y
-        )
-    except Exception as e:
-        logger.error(f"Data loading failed: {str(e)}")
-        raise
+    data = pd.read_csv(DATA_PATH)
+    
+    # Check for any target column if not specified
+    target_col = CONFIG['data'].get('target_col', 'target')
+    if target_col not in data.columns:
+        # Try common target column names
+        for col in ['target', 'label', 'class']:
+            if col in data.columns:
+                target_col = col
+                break
+        else:
+            raise ValueError(f"No target column found in {DATA_PATH}")
+
+    # Adjust sample size check
+    min_samples = CONFIG['data'].get('min_samples', 10)
+    if len(data) < min_samples:
+        logger.warning(f"Dataset has only {len(data)} samples (min {min_samples})")
+    
+    return train_test_split(
+        data.drop(target_col, axis=1),
+        data[target_col],
+        test_size=CONFIG['data'].get('test_size', 0.2),
+        random_state=CONFIG['data'].get('random_state', 42)
+    )
 
 def evaluate_model(model, X_test, y_test):
     """Comprehensive model evaluation"""
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 00000000..a1290976
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,28 @@
+import pytest
+import pandas as pd
+import numpy as np
+from copy import deepcopy
+from src.train import CONFIG
+
+@pytest.fixture
+def sample_data():
+    """Generate test data with correct column names"""
+    return pd.DataFrame({
+        'feature1': np.random.normal(0, 1, 200),
+        'feature2': np.random.uniform(0, 1, 200),
+        'target': np.random.randint(0, 2, 200) 
+    })
+
+@pytest.fixture
+def original_config():
+    """Preserve original configuration"""
+    return deepcopy(CONFIG)
+
+@pytest.fixture(autouse=True)
+def restore_config(original_config):
+    """Auto-restore config after each test"""
+    yield
+    CONFIG.clear()
+    CONFIG.update(original_config)
+
+
diff --git a/tests/test_train.py b/tests/test_train.py
index 3d5966af..4e7b6542 100644
--- a/tests/test_train.py
+++ b/tests/test_train.py
@@ -1,8 +1,22 @@
-# tests/test_train.py
-import pytest
-from train import load_and_validate_data
+from src.train import load_and_validate_data
 
-def test_data_loading():
+# tests/test_train.py
+def test_data_loading(sample_data, tmp_path, monkeypatch):
+    # Save test data
+    test_path = tmp_path / "test_data.csv"
+    sample_data.to_csv(test_path, index=False)
+    
+    # Temporarily patch the configuration
+    from src.train import CONFIG, DATA_PATH
+    CONFIG['data']['target_col'] = 'target'  # Must match your fixture column
+    CONFIG['data']['min_samples'] = 5  # Lower threshold for tests
+    
+    # Use monkeypatch to safely modify DATA_PATH
+    monkeypatch.setattr('src.train.DATA_PATH', str(test_path))
+    
+    # Import AFTER patching
+    from src.train import load_and_validate_data
     X_train, X_test, y_train, y_test = load_and_validate_data()
+    
     assert len(X_train) > 0
     assert len(y_test) > 0
\ No newline at end of file

From cf37bc44599ef1b2c3a6a0146e41f3bdf7e9cfde Mon Sep 17 00:00:00 2001
From: Rupesh Sigdel <rupeshcgdl2060@gmail.com>
Date: Mon, 7 Apr 2025 00:38:53 +0545
Subject: [PATCH 6/6] Improve model training with robust data validation and
 metric handling

---
 src/train.py | 49 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 38 insertions(+), 11 deletions(-)

diff --git a/src/train.py b/src/train.py
index 8fd103e1..a8b96dfa 100644
--- a/src/train.py
+++ b/src/train.py
@@ -1,9 +1,10 @@
 import mlflow
+import numpy as np
 import pandas as pd
 from pathlib import Path
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import train_test_split
-from sklearn.metrics import (accuracy_score, f1_score, 
+from sklearn.metrics import (accuracy_score, f1_score, log_loss,
                            precision_score, recall_score, 
                            classification_report)
 from mlflow.models.signature import infer_signature
@@ -65,6 +66,13 @@ def load_and_validate_data():
     if len(data) < min_samples:
         logger.warning(f"Dataset has only {len(data)} samples (min {min_samples})")
     
+    class_counts = data[target_col].value_counts()  # use the resolved column, not the raw CONFIG key
+    if len(class_counts) < 2:
+        raise ValueError(f"Need at least 2 classes, found {class_counts.index.tolist()}")
+    
+    if len(data) < CONFIG['data'].get('min_samples', 100):
+        logger.warning(f"Dataset small ({len(data)} samples)")
+    
     return train_test_split(
         data.drop(target_col, axis=1),
         data[target_col],
@@ -73,20 +81,34 @@ def load_and_validate_data():
     )
 
 def evaluate_model(model, X_test, y_test):
-    """Comprehensive model evaluation"""
     y_pred = model.predict(X_test)
-    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
+    
+    if hasattr(model, "predict_proba"):
+        proba = model.predict_proba(X_test)
+        y_proba = proba[:, -1]
+    else:
+        y_proba = None
+    
+    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
     
     metrics = {
         "accuracy": accuracy_score(y_test, y_pred),
         "f1_score": f1_score(y_test, y_pred, average='weighted'),
         "precision": precision_score(y_test, y_pred, average='weighted'),
         "recall": recall_score(y_test, y_pred, average='weighted'),
-        "classification_report": classification_report(y_test, y_pred, output_dict=True)
+        "classification_report": report  # Keep the full report
     }
     
-    # Log class distribution
-    metrics["class_distribution"] = dict(pd.Series(y_test).value_counts())
+    class_metrics = {  # flatten per-class/average sub-dicts of the report into scalar metrics
+        f"class_{label}_{metric}": value
+        for label, scores in report.items()
+        if isinstance(scores, dict)
+        for metric, value in scores.items()
+    }
+    metrics.update(class_metrics)
+    
+    if len(np.unique(y_test)) > 1 and y_proba is not None:  # full matrix + labels stays valid for >2 classes
+        metrics["log_loss"] = log_loss(y_test, proba, labels=model.classes_)
     
     return metrics, y_pred
 
@@ -223,11 +245,16 @@ def train_and_register():
             metrics, y_pred = evaluate_model(model, X_test, y_test)
             
             # Log metrics
-            mlflow.log_metrics(metrics)
-            mlflow.log_text(
-                json.dumps(metrics['classification_report'], indent=2),
-                "classification_report.json"
-            )
+            mlflow.log_metrics({
+                k: v for k, v in metrics.items() 
+                if not k.endswith('_report') and isinstance(v, (int, float))
+            })
+
+            if 'classification_report' in metrics:
+                mlflow.log_text(
+                    json.dumps(metrics['classification_report'], indent=2),
+                    "classification_report.json"
+                )
             
             # Log model
             signature = infer_signature(X_train, y_pred)