# mlops/solution/train.py

import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, classification_report
import rich  # <!-- дебаг через rich - моя guilty pleasure, очень уж люблю на красивые выводы смотреть
from rich import print as rpint
from rich.console import Console
from rich import box
from rich.table import Table
from rich.markdown import Markdown

console = Console()


# Helper transformers that route each kind of column to its own pipeline branch

# Selects the text column that is fed to the TF-IDF vectorizers
class TextExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks the ``full_text`` column.

    Used as the first step of the text branch so that the downstream
    vectorizers receive a single column of strings.
    """

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X['full_text']`` with missing values replaced by ``''``."""
        text_column = X['full_text']
        return text_column.fillna('')

# Selects the numeric column that is fed to StandardScaler
class NumberExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks the ``amount`` column.

    The double brackets keep the result two-dimensional (a one-column
    frame), which is the shape the scaler that follows it is given.
    """

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[['amount']]`` with missing values replaced by ``0``."""
        amount_frame = X[['amount']]
        return amount_frame.fillna(0)


def _build_dataset(tx, terminals, receipts):
    """Join the raw tables into the feature frame and the target series.

    Args:
        tx: transactions frame; read here for ``transaction_id``,
            ``terminal_id``, ``amount`` and ``true_mcc``.
        terminals: terminals frame; read here for ``terminal_id``,
            ``terminal_name`` and ``terminal_description``.
        receipts: receipts frame; read here for ``transaction_id`` and
            ``item_name``.

    Returns:
        ``(X, y)`` where ``X`` has the columns ``full_text`` and ``amount``
        and ``y`` is the ``true_mcc`` column.
    """
    # Glue the item names of every receipt line into one string per
    # transaction.
    receipts_agg = receipts.groupby('transaction_id')['item_name'].apply(
        lambda names: ' '.join(str(name) for name in names)
    ).reset_index()

    # One wide frame to work with: transactions + terminal texts + items.
    # Left joins keep transactions without a matching terminal or receipt.
    df = tx.merge(
        terminals[['terminal_id', 'terminal_name', 'terminal_description']],
        on='terminal_id',
        how='left',
    )
    df = df.merge(receipts_agg, on='transaction_id', how='left')

    # Text field for TF-IDF. The transaction id is deliberately left out:
    # the author tried including it and the model fixated on it.
    # NOTE(review): with classic object-dtype columns, astype(str) renders
    # missing values as the literal text "nan", in which case the
    # fillna('') in TextExtractor never fires. Left untouched because it
    # changes the features the saved model expects; confirm against the
    # inference-side code before changing.
    df['full_text'] = (
        df['terminal_name'].astype(str) + " " +
        df['terminal_description'].astype(str) + " " +
        df['item_name'].astype(str)
    ).str.lower()

    return df[['full_text', 'amount']], df['true_mcc']


def _build_pipeline():
    """Assemble the unfitted feature-extraction + classifier pipeline."""
    text_branch = Pipeline([
        ('extract', TextExtractor()),
        ('tfidf_union', FeatureUnion([
            # Word uni/bi-grams with stop-word removal.
            # NOTE(review): only English stop words are removed; confirm
            # that this matches the language of the texts.
            ('word', TfidfVectorizer(
                ngram_range=(1, 2),
                analyzer='word',
                stop_words='english',
                max_features=5000,
            )),
            # Character n-grams inside word boundaries, to tolerate typos.
            ('char', TfidfVectorizer(
                ngram_range=(2, 5),
                analyzer='char_wb',
                max_features=10000,
            )),
        ])),
    ])
    numeric_branch = Pipeline([
        ('extract', NumberExtractor()),
        ('scaler', StandardScaler()),
    ])
    return Pipeline([
        ('features', FeatureUnion([
            ('text_branch', text_branch),
            ('numeric_branch', numeric_branch),
        ])),
        # Logistic regression as the classifier; several C values were
        # tried and a middle-of-the-road one was kept.
        ('clf', LogisticRegression(C=1.0, max_iter=1000)),
    ])


def _report_validation(y_true, y_pred, probs):
    """Print accuracy, mean top-class probability and the per-class report."""
    acc = accuracy_score(y_true, y_pred)
    # Average of the highest predicted probability per sample.
    conf = np.mean(np.max(probs, axis=1))

    table = Table(box=box.ROUNDED, title="Отчёт")
    table.add_column("Метрика", justify="center", style="yellow")
    table.add_column("Значение", justify="center", style="yellow")
    table.add_row("Accuracy", f"{acc:.4f}")
    table.add_row("Avg Confidence", f"{conf:.4f}")
    console.print(table, justify="center")

    console.print("[yellow]Репорт по классам[/yellow]", justify="center")
    console.print(classification_report(y_true, y_pred), justify="center")


def train_model(data_dir='data', model_path='solution/model/mcc_model.pkl'):
    """Train the MCC classifier, print hold-out metrics and save the model.

    Steps: read the CSVs, build the text/amount features, fit on 80% of the
    rows and report metrics on the remaining 20%, then refit on all rows
    and dump the pipeline with joblib.

    Args:
        data_dir: directory containing ``transactions.csv``,
            ``terminals.csv`` and ``receipts.csv``.
        model_path: file the fitted pipeline is written to; its parent
            directory is created when missing.

    Returns:
        The pipeline fitted on all rows, or ``None`` when an input file is
        missing (the problem is logged instead of raised).
    """
    console.log(f"[yellow]Грузим данные из {data_dir}...[/yellow]")
    try:
        tx = pd.read_csv(os.path.join(data_dir, 'transactions.csv'))
        terminals = pd.read_csv(os.path.join(data_dir, 'terminals.csv'))
        receipts = pd.read_csv(os.path.join(data_dir, 'receipts.csv'))
    except FileNotFoundError as e:
        console.log(
            f"Файлы для обучения не найдены :( \n {e}", style="white on red")
        return None

    console.log("[yellow]Предобрабатываем данные...[/yellow]")
    X, y = _build_dataset(tx, terminals, receipts)
    pipeline = _build_pipeline()

    # Validation on a hold-out split.
    console.log(
        "[yellow]Оцениваем качество на валидационной выборке...[/yellow]")
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
    except ValueError:
        # A stratified split is rejected when some class is too rare to be
        # represented on both sides; a plain random split still gives a
        # usable estimate instead of aborting the whole run.
        console.log(
            "[yellow]Стратифицированное разбиение не удалось, "
            "делим выборку без стратификации[/yellow]")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

    pipeline.fit(X_train, y_train)
    _report_validation(
        y_test, pipeline.predict(X_test), pipeline.predict_proba(X_test))

    # Metrics have been reported; refit on every row and persist the model.
    console.log("[yellow]Учимся на всем, что есть...[/yellow]")
    pipeline.fit(X, y)

    model_dir = os.path.dirname(model_path)
    if model_dir:  # a bare file name has no directory to create
        os.makedirs(model_dir, exist_ok=True)
    # NOTE(review): the pickle refers to TextExtractor/NumberExtractor by
    # module path; when this file is run as a script that module is
    # ``__main__``, so the loading code must expose the classes under the
    # same name.
    joblib.dump(pipeline, model_path)
    console.log(Markdown(f"Сохранили модель в **{model_path}**"))
    return pipeline


if __name__ == "__main__":
    with console.status("Учим модель..."):
        trained = train_model()
    if trained is None:
        # Training was aborted (already logged); do not announce success
        # and make the failure visible to the calling shell.
        raise SystemExit(1)
    console.print(Markdown("*Модель готова :)*"))