Initial commit

NightyStudios 2026-01-21 22:08:24 +03:00
commit 74df71d82d
Signed by: temmie
GPG Key ID: A9459339CC5FF071
7 changed files with 404 additions and 0 deletions

dih.py (new file, 93 lines)

@@ -0,0 +1,93 @@
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier


class MCCPredictor:
    def __init__(self, model_path='mcc_classifier.cbm'):
        self.model = CatBoostClassifier()
        self.model.load_model(model_path)
        # Column order must EXACTLY match the one used at training time
        self.feature_order = [
            'terminal_name', 'terminal_description', 'terminal_city', 'items_text', 'text',
            'amount', 'items_count', 'items_total_price', 'items_max_price', 'items_min_price',
            'terminal_id'
        ]

    def _preprocess_json(self, data):
        """Convert an incoming JSON payload into the flat structure the model expects."""
        # 1. Aggregate the data from the items list
        items = data.get('items', [])
        item_names = [str(i.get('name', '')) for i in items]
        item_prices = [float(i.get('price', 0)) for i in items]

        items_text = " ".join(item_names)
        items_count = len(items)
        items_total_price = sum(item_prices)
        items_max_price = max(item_prices) if item_prices else 0
        items_min_price = min(item_prices) if item_prices else 0

        # 2. Build the concatenated 'text' column
        # Important: use the exact same format that was used in train.csv
        combined_text = f"{data.get('terminal_name', '')} {data.get('terminal_description', '')} {data.get('city', '')} items {items_text}"

        # 3. Assemble the final flat dictionary
        flat_data = {
            'terminal_name': str(data.get('terminal_name', '')),
            'terminal_description': str(data.get('terminal_description', '')),
            'terminal_city': str(data.get('city', '')),  # city -> terminal_city
            'items_text': items_text,
            'text': combined_text.lower(),
            'amount': float(data.get('amount', 0)),
            'items_count': float(items_count),
            'items_total_price': float(items_total_price),
            'items_max_price': float(items_max_price),
            'items_min_price': float(items_min_price),
            'terminal_id': 'unknown'  # the request carries no terminal ID, so use a placeholder
        }
        return flat_data

    def predict(self, raw_json):
        # If a single transaction arrived, wrap it in a list
        if isinstance(raw_json, dict):
            raw_json = [raw_json]

        # Preprocess every transaction in the list
        processed_data = [self._preprocess_json(t) for t in raw_json]
        df = pd.DataFrame(processed_data)

        # Enforce the training column order
        df = df[self.feature_order]

        # Prediction
        mcc_codes = self.model.predict(df)
        probs = self.model.predict_proba(df)

        results = []
        for i in range(len(raw_json)):
            results.append({
                "transaction_id": raw_json[i].get('transaction_id'),
                "mcc": int(mcc_codes[i][0]),
                "confidence": round(float(np.max(probs[i])), 4)
            })
        return results


# --- SMOKE TEST ---
if __name__ == "__main__":
    predictor = MCCPredictor('mcc_classifier.cbm')
    request_data = {
        "transaction_id": "TX00001116",
        "terminal_name": "STORE001",
        "terminal_description": "common common common thing",
        "city": "NYC",
        "amount": 272.80,
        "items": [
            {"name": "basic loyalty", "price": 58.20},
            {"name": "Bringiong item lifes", "price": 28.99},
            {"name": "regular item basic item", "price": 56.91}
        ]
    }
    res = predictor.predict(request_data)
    print(res)
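
Since predict wraps a lone dict into a list, batch scoring needs no extra code. A minimal sketch, assuming the predictor above has been constructed; tx_a and tx_b are hypothetical payloads in the same JSON shape as request_data:

# Hypothetical batch call; tx_a and tx_b are placeholder payloads, not real transactions.
tx_a = {"transaction_id": "TX_A", "terminal_name": "STORE001",
        "terminal_description": "common thing", "city": "NYC",
        "amount": 10.00, "items": [{"name": "basic item", "price": 10.00}]}
tx_b = {"transaction_id": "TX_B", "terminal_name": "STORE002",
        "terminal_description": "another thing", "city": "NYC",
        "amount": 5.00, "items": []}  # empty items is safe: max/min fall back to 0
batch = predictor.predict([tx_a, tx_b])
# batch is a list of {"transaction_id", "mcc", "confidence"} dicts,
# one per input transaction, in the same order.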

mcc_classifier.cbm (new binary file, content not shown)

prepare_data.py (new file, 100 lines)

@@ -0,0 +1,100 @@
import re

import pandas as pd

DATA_DIR = "data"


# ---------- Text cleanup ----------
def clean_text(s: str, max_len=1000):
    if not isinstance(s, str):
        return ""
    s = s.lower()
    # --- strip a leading t" or t' prefix ---
    s = re.sub(r'^t["\']', '', s)
    # --- normalize quotes ---
    s = s.replace('"', ' ').replace("'", " ").replace('`', ' ')
    # drop non-ASCII (emoji, CJK characters, etc.)
    s = re.sub(r'[^\x00-\x7F]+', ' ', s)
    # keep only letters and digits
    s = re.sub(r'[^a-z0-9\s]', ' ', s)
    # collapse consecutive repeated words
    words = s.split()
    dedup = []
    prev = None
    for w in words:
        if w != prev:
            dedup.append(w)
        prev = w
    s = " ".join(dedup)
    # normalize whitespace and truncate
    s = re.sub(r'\s+', ' ', s).strip()
    return s[:max_len]


# ---------- LOAD ----------
transactions = pd.read_csv(f"{DATA_DIR}/transactions.csv")
terminals = pd.read_csv(f"{DATA_DIR}/terminals.csv")
receipts = pd.read_csv(f"{DATA_DIR}/receipts.csv")

# ---------- CLEAN TEXT ----------
for col in ["terminal_name", "terminal_description", "terminal_city"]:
    if col == "terminal_city":
        terminals[col] = terminals[col].astype(str).apply(clean_text)
    else:
        terminals[col] = terminals[col].apply(clean_text)
receipts["item_name"] = receipts["item_name"].apply(clean_text)

# ---------- AGGREGATE RECEIPTS ----------
receipt_agg = receipts.groupby("transaction_id").agg(
    items_text=("item_name", lambda x: " ".join(x)),
    items_count=("item_name", "count"),
    items_total_price=("item_price", "sum"),
    items_max_price=("item_price", "max"),
    items_min_price=("item_price", "min"),
).reset_index()

# ---------- MERGE WITH TRANSACTIONS ----------
df = transactions[["transaction_id", "terminal_id", "amount", "true_mcc"]].merge(
    terminals[["terminal_id", "terminal_name", "terminal_description", "terminal_city"]],
    on="terminal_id",
    how="left"
)
df = df.merge(receipt_agg, on="transaction_id", how="left")

# ---------- FILL NA ----------
for col in ["items_text", "terminal_name", "terminal_description", "terminal_city"]:
    df[col] = df[col].fillna("")
for col in ["items_count", "items_total_price", "items_max_price", "items_min_price"]:
    df[col] = df[col].fillna(0)

# ---------- BUILD FINAL TEXT ----------
df["text"] = (
    df["terminal_name"] + " " +
    df["terminal_description"] + " " +
    df["terminal_city"] + " " +
    " items " + df["items_text"] + " items " +
    df["items_text"]
)
df["text"] = df["text"].apply(clean_text)

# ---------- FINAL CHECK ----------
print("rows:", len(df))
print("unique tx:", df["transaction_id"].nunique())
print(df["true_mcc"].value_counts())
assert len(df) == df["transaction_id"].nunique()
assert df["text"].str.len().min() > 0

# ---------- SAVE ----------
df.to_csv("train.csv", index=False)
print("saved train.csv")

req.txt (new file, 49 lines)

@@ -0,0 +1,49 @@
asttokens==3.0.1
catboost==1.2.8
comm==0.2.3
contourpy==1.3.3
cycler==0.12.1
debugpy==1.8.19
decorator==5.2.1
executing==2.2.1
fonttools==4.61.1
graphviz==0.21
ipykernel==7.1.0
ipython==9.9.0
ipython-pygments-lexers==1.1.1
jedi==0.19.2
joblib==1.5.3
jupyter-client==8.8.0
jupyter-core==5.9.1
kiwisolver==1.4.9
matplotlib==3.10.8
matplotlib-inline==0.2.1
narwhals==2.15.0
nest-asyncio==1.6.0
numpy==2.4.1
packaging==25.0
pandas==2.3.3
parso==0.8.5
pexpect==4.9.0
pillow==12.1.0
pip==25.0.1
platformdirs==4.5.1
plotly==6.5.2
prompt-toolkit==3.0.52
psutil==7.2.1
ptyprocess==0.7.0
pure-eval==0.2.3
pygments==2.19.2
pyparsing==3.3.2
python-dateutil==2.9.0.post0
pytz==2025.2
pyzmq==27.1.0
scikit-learn==1.8.0
scipy==1.17.0
six==1.17.0
stack-data==0.6.3
threadpoolctl==3.6.0
tornado==6.5.4
traitlets==5.14.3
tzdata==2025.3
wcwidth==0.2.14

Binary file not shown.

test.py (new file, 38 lines)

@@ -0,0 +1,38 @@
import joblib
import pandas as pd

# Import our custom classes so that joblib can deserialize them
from train import TextExtractor, NumberExtractor

# 1. Load the model
model = joblib.load('solution/model/mcc_model.pkl')

# 2. Prepare test data (in the shape it will arrive at the API)
test_json = {
    "transaction_id": "TX00001116",
    "terminal_name": "STORE001",
    "terminal_description": "common common common thing",
    "city": "NYC",
    "amount": 272.80,
    "items": [
        {"name": "basic loyalty", "price": 58.20},
        {"name": "Bringiong item lifes", "price": 28.99}
    ]
}

# 3. Convert into the format the pipeline understands (same as in main.py)
items_str = " ".join([i['name'] for i in test_json['items']])
full_text = f"{test_json['terminal_name']} {test_json['terminal_description']} {items_str}".lower()

input_df = pd.DataFrame([{
    'full_text': full_text,
    'amount': test_json['amount']
}])

# 4. Run the prediction
prediction = model.predict(input_df)[0]
confidence = model.predict_proba(input_df).max()

print(f"Transaction ID: {test_json['transaction_id']}")
print(f"Predicted MCC: {prediction}")
print(f"Confidence: {confidence:.4f}")

train.py (new file, 124 lines)

@@ -0,0 +1,124 @@
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler


# --- 1. Custom transformers for the pipeline ---
class TextExtractor(BaseEstimator, TransformerMixin):
    """Extracts the text column for TF-IDF."""
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X['full_text'].fillna('')


class NumberExtractor(BaseEstimator, TransformerMixin):
    """Extracts the numeric 'amount' column."""
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Return a DataFrame (2D array) for StandardScaler
        return X[['amount']].fillna(0)


def train_model():
    print("Loading data...")
    # File paths (assuming the script runs from the repo root, next to data/)
    try:
        tx = pd.read_csv('data/transactions.csv')
        terminals = pd.read_csv('data/terminals.csv')
        receipts = pd.read_csv('data/receipts.csv')
    except FileNotFoundError as e:
        print(f"Error: data files not found in data/. {e}")
        return

    # --- 2. Preprocessing and feature assembly ---
    print("Preprocessing...")
    # Aggregate item names into one string per transaction
    receipts_agg = receipts.groupby('transaction_id')['item_name'].apply(
        lambda x: ' '.join(str(i) for i in x)
    ).reset_index()

    # Join transactions with terminal data and receipts
    df = tx.merge(terminals[['terminal_id', 'terminal_name', 'terminal_description']], on='terminal_id', how='left')
    df = df.merge(receipts_agg, on='transaction_id', how='left')

    # Build a single text field (transaction_id is deliberately excluded to avoid leakage!)
    df['full_text'] = (
        df['terminal_name'].astype(str) + " " +
        df['terminal_description'].astype(str) + " " +
        df['item_name'].astype(str)
    ).str.lower()

    X = df[['full_text', 'amount']]
    y = df['true_mcc']

    # --- 3. Build the pipeline ---
    pipeline = Pipeline([
        ('features', FeatureUnion([
            # TEXT branch
            ('text_branch', Pipeline([
                ('extract', TextExtractor()),
                ('tfidf_union', FeatureUnion([
                    # Words (semantics)
                    ('word', TfidfVectorizer(
                        ngram_range=(1, 2),
                        analyzer='word',
                        stop_words='english',
                        max_features=5000
                    )),
                    # Characters (robust to typos)
                    ('char', TfidfVectorizer(
                        ngram_range=(2, 5),
                        analyzer='char_wb',
                        max_features=10000
                    ))
                ]))
            ])),
            # NUMERIC branch
            ('numeric_branch', Pipeline([
                ('extract', NumberExtractor()),
                ('scaler', StandardScaler())
            ]))
        ])),
        # Classifier
        ('clf', LogisticRegression(C=1.0, max_iter=1000))
    ])

    # --- 4. Quality check (validation) ---
    print("Evaluating on a validation split...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    probs = pipeline.predict_proba(X_test)

    acc = accuracy_score(y_test, y_pred)
    conf = np.mean(np.max(probs, axis=1))

    print("\n[RESULTS]")
    print(f"Accuracy: {acc:.4f}")
    print(f"Average Confidence: {conf:.4f}")
    print("\nPer-class report:")
    print(classification_report(y_test, y_pred))

    # --- 5. Final training and saving ---
    print("\nFinal training on the full dataset...")
    pipeline.fit(X, y)
    os.makedirs('solution/model', exist_ok=True)
    joblib.dump(pipeline, 'solution/model/mcc_model.pkl')
    print("Model saved to solution/model/mcc_model.pkl")


if __name__ == "__main__":
    train_model()
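
A scikit-learn Pipeline exposes classes_ from its final estimator, so predict_proba columns can be mapped back to MCC labels after loading. A minimal sketch, assuming train.py has already written solution/model/mcc_model.pkl; the sample row is an invented placeholder:

import joblib
import pandas as pd
# The custom transformers must be importable for joblib to unpickle the pipeline.
from train import TextExtractor, NumberExtractor  # noqa: F401

pipe = joblib.load('solution/model/mcc_model.pkl')
sample = pd.DataFrame([{'full_text': 'store one grocery milk', 'amount': 9.99}])
proba = pipe.predict_proba(sample)[0]
# Print the top-3 candidate MCCs with their probabilities
for mcc, p in sorted(zip(pipe.classes_, proba), key=lambda t: -t[1])[:3]:
    print(mcc, round(float(p), 4))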