commit 74df71d82dd08c5b7ff5ca225a78adc35dfe0c5a
Author: NightyStudios
Date:   Wed Jan 21 22:08:24 2026 +0300

    Initial commit

diff --git a/dih.py b/dih.py
new file mode 100644
index 0000000..a4cbb20
--- /dev/null
+++ b/dih.py
@@ -0,0 +1,94 @@
+import pandas as pd
+import numpy as np
+from catboost import CatBoostClassifier
+
+class MCCPredictor:
+    def __init__(self, model_path='mcc_classifier.cbm'):
+        self.model = CatBoostClassifier()
+        self.model.load_model(model_path)
+
+        # Column order must match the training data EXACTLY
+        self.feature_order = [
+            'terminal_name', 'terminal_description', 'terminal_city', 'items_text', 'text',
+            'amount', 'items_count', 'items_total_price', 'items_max_price', 'items_min_price',
+            'terminal_id'
+        ]
+
+    def _preprocess_json(self, data):
+        """Convert incoming JSON into the flat structure the model expects"""
+
+        # 1. Aggregate the items list
+        items = data.get('items', [])
+        item_names = [str(i.get('name', '')) for i in items]
+        item_prices = [float(i.get('price', 0)) for i in items]
+
+        items_text = " ".join(item_names)
+        items_count = len(items)
+        items_total_price = sum(item_prices)
+        items_max_price = max(item_prices) if item_prices else 0
+        items_min_price = min(item_prices) if item_prices else 0
+
+        # 2. Build the same concatenated 'text' column as in train.csv.
+        # Important: prepare_data.py appends items_text twice (and then runs clean_text); mirror the duplication here to avoid train/serve skew
+        combined_text = f"{data.get('terminal_name', '')} {data.get('terminal_description', '')} {data.get('city', '')} items {items_text} items {items_text}"
+
+        # 3. Assemble the final flat dict
+        flat_data = {
+            'terminal_name': str(data.get('terminal_name', '')),
+            'terminal_description': str(data.get('terminal_description', '')),
+            'terminal_city': str(data.get('city', '')),  # city -> terminal_city
+            'items_text': items_text,
+            'text': combined_text.lower(),
+            'amount': float(data.get('amount', 0)),
+            'items_count': float(items_count),
+            'items_total_price': float(items_total_price),
+            'items_max_price': float(items_max_price),
+            'items_min_price': float(items_min_price),
+            'terminal_id': 'unknown'  # the request carries no terminal ID, so use a placeholder
+        }
+        return flat_data
+
+    def predict(self, raw_json):
+        # A single transaction gets wrapped in a list
+        if isinstance(raw_json, dict):
+            raw_json = [raw_json]
+
+        # Preprocess every transaction in the list
+        processed_data = [self._preprocess_json(t) for t in raw_json]
+        df = pd.DataFrame(processed_data)
+
+        # Enforce the training-time column order
+        df = df[self.feature_order]
+
+        # Predict
+        mcc_codes = self.model.predict(df)
+        probs = self.model.predict_proba(df)
+
+        results = []
+        for i in range(len(raw_json)):
+            results.append({
+                "transaction_id": raw_json[i].get('transaction_id'),
+                "mcc": int(mcc_codes[i][0]),
+                "confidence": round(float(np.max(probs[i])), 4)
+            })
+        return results
+
+# --- TEST ---
+if __name__ == "__main__":
+    predictor = MCCPredictor('mcc_classifier.cbm')
+
+    request_data = {
+        "transaction_id": "TX00001116",
+        "terminal_name": "STORE001",
+        "terminal_description": "common common common thing",
+        "city": "NYC",
+        "amount": 272.80,
+        "items": [
+            {"name": "basic loyalty", "price": 58.20},
+            {"name": "Bringiong item lifes", "price": 28.99},
+            {"name": "regular item basic item", "price": 56.91}
+        ]
+    }
+
+    res = predictor.predict(request_data)
+    print(res)
\ No newline at end of file
diff --git a/mcc_classifier.cbm b/mcc_classifier.cbm
new file mode 100644
index 0000000..479d500
Binary files /dev/null and b/mcc_classifier.cbm differ
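test.py below notes that its input arrives "as it will come in the API", but the commit ships no service layer. The following is a minimal serving sketch, not part of this commit: Flask, the /predict route, and the port are assumptions, and it relies on dih.py keeping its demo block under a __main__ guard so the import has no side effects.

# Hypothetical serving layer (assumption, not in this commit)
from flask import Flask, jsonify, request

from dih import MCCPredictor  # side-effect free thanks to dih.py's __main__ guard

app = Flask(__name__)
predictor = MCCPredictor('mcc_classifier.cbm')

@app.route('/predict', methods=['POST'])
def predict():
    # Accepts a single transaction object or a list of them,
    # mirroring MCCPredictor.predict's own input handling
    payload = request.get_json(force=True)
    return jsonify(predictor.predict(payload))

if __name__ == '__main__':
    app.run(port=8000)  # port choice is arbitrary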
diff --git a/prepare_data.py b/prepare_data.py
new file mode 100644
index 0000000..ce801df
--- /dev/null
+++ b/prepare_data.py
@@ -0,0 +1,102 @@
+import pandas as pd
+import re
+
+DATA_DIR = "data"
+
+# ---------- Text cleaning ----------
+def clean_text(s: str, max_len=1000):
+    if not isinstance(s, str):
+        return ""
+
+    s = s.lower()
+
+    # --- strip a leading t" or t' prefix ---
+    s = re.sub(r'^t["\']', '', s)
+
+    # --- normalize quote characters ---
+    s = s.replace('"', ' ').replace("'", " ").replace('`', ' ')
+
+    # drop non-ASCII (emoji, CJK characters, etc.)
+    s = re.sub(r'[^\x00-\x7F]+', ' ', s)
+
+    # keep only letters and digits
+    s = re.sub(r'[^a-z0-9\s]', ' ', s)
+
+    # drop consecutive repeated words
+    words = s.split()
+    dedup = []
+    prev = None
+    for w in words:
+        if w != prev:
+            dedup.append(w)
+        prev = w
+    s = " ".join(dedup)
+
+    # normalize whitespace and truncate
+    s = re.sub(r'\s+', ' ', s).strip()
+    return s[:max_len]
+
+
+# ---------- LOAD ----------
+transactions = pd.read_csv(f"{DATA_DIR}/transactions.csv")
+terminals = pd.read_csv(f"{DATA_DIR}/terminals.csv")
+receipts = pd.read_csv(f"{DATA_DIR}/receipts.csv")
+
+# ---------- CLEAN TEXT ----------
+for col in ["terminal_name", "terminal_description", "terminal_city"]:
+    # terminal_city may contain non-string values, so cast it first
+    if col == "terminal_city":
+        terminals[col] = terminals[col].astype(str).apply(clean_text)
+    else:
+        terminals[col] = terminals[col].apply(clean_text)
+
+receipts["item_name"] = receipts["item_name"].apply(clean_text)
+
+# ---------- AGGREGATE RECEIPTS ----------
+receipt_agg = receipts.groupby("transaction_id").agg(
+    items_text=("item_name", lambda x: " ".join(x)),
+    items_count=("item_name", "count"),
+    items_total_price=("item_price", "sum"),
+    items_max_price=("item_price", "max"),
+    items_min_price=("item_price", "min"),
+).reset_index()
+
+# ---------- MERGE WITH TRANSACTIONS ----------
+df = transactions[["transaction_id", "terminal_id", "amount", "true_mcc"]].merge(
+    terminals[["terminal_id", "terminal_name", "terminal_description", "terminal_city"]],
+    on="terminal_id",
+    how="left"
+)
+
+df = df.merge(receipt_agg, on="transaction_id", how="left")
+
+# ---------- FILL NA ----------
+for col in ["items_text", "terminal_name", "terminal_description", "terminal_city"]:
+    df[col] = df[col].fillna("")
+
+for col in ["items_count", "items_total_price", "items_max_price", "items_min_price"]:
+    df[col] = df[col].fillna(0)
+
+# ---------- BUILD FINAL TEXT ----------
+# items_text is included twice, which upweights receipt tokens relative to the terminal fields
+df["text"] = (
+    df["terminal_name"] + " " +
+    df["terminal_description"] + " " +
+    df["terminal_city"] + " " +
+    " items " + df["items_text"] + " items " +
+    df["items_text"]
+)
+
+df["text"] = df["text"].apply(clean_text)
+
+# ---------- FINAL CHECK ----------
+print("rows:", len(df))
+print("unique tx:", df["transaction_id"].nunique())
+print(df["true_mcc"].value_counts())
+
+assert len(df) == df["transaction_id"].nunique()
+assert df["text"].str.len().min() > 0
+
+# ---------- SAVE ----------
+df.to_csv("train.csv", index=False)
+print("saved train.csv")
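prepare_data.py produces train.csv and dih.py consumes mcc_classifier.cbm, but the CatBoost training script itself is not in the commit. A minimal sketch of that missing step follows; the text/categorical feature split is inferred from dih.py's feature_order, and the hyperparameters are placeholders, not values recovered from the shipped model.

# Hypothetical CatBoost training step (assumption; the real script is not committed)
import pandas as pd
from catboost import CatBoostClassifier, Pool

df = pd.read_csv("train.csv")

text_features = ['terminal_name', 'terminal_description', 'terminal_city', 'items_text', 'text']
numeric_features = ['amount', 'items_count', 'items_total_price', 'items_max_price', 'items_min_price']
cat_features = ['terminal_id']

# This order is exactly what dih.py's feature_order must reproduce at inference time
feature_order = text_features + numeric_features + cat_features

X = df[feature_order]
y = df['true_mcc']

train_pool = Pool(X, y, cat_features=cat_features, text_features=text_features)

model = CatBoostClassifier(iterations=500, learning_rate=0.1, verbose=100)  # placeholder hyperparameters
model.fit(train_pool)
model.save_model('mcc_classifier.cbm')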
diff --git a/req.txt b/req.txt
new file mode 100644
index 0000000..a78edd6
--- /dev/null
+++ b/req.txt
@@ -0,0 +1,49 @@
+asttokens==3.0.1
+catboost==1.2.8
+comm==0.2.3
+contourpy==1.3.3
+cycler==0.12.1
+debugpy==1.8.19
+decorator==5.2.1
+executing==2.2.1
+fonttools==4.61.1
+graphviz==0.21
+ipykernel==7.1.0
+ipython==9.9.0
+ipython-pygments-lexers==1.1.1
+jedi==0.19.2
+joblib==1.5.3
+jupyter-client==8.8.0
+jupyter-core==5.9.1
+kiwisolver==1.4.9
+matplotlib==3.10.8
+matplotlib-inline==0.2.1
+narwhals==2.15.0
+nest-asyncio==1.6.0
+numpy==2.4.1
+packaging==25.0
+pandas==2.3.3
+parso==0.8.5
+pexpect==4.9.0
+pillow==12.1.0
+pip==25.0.1
+platformdirs==4.5.1
+plotly==6.5.2
+prompt-toolkit==3.0.52
+psutil==7.2.1
+ptyprocess==0.7.0
+pure-eval==0.2.3
+pygments==2.19.2
+pyparsing==3.3.2
+python-dateutil==2.9.0.post0
+pytz==2025.2
+pyzmq==27.1.0
+scikit-learn==1.8.0
+scipy==1.17.0
+six==1.17.0
+stack-data==0.6.3
+threadpoolctl==3.6.0
+tornado==6.5.4
+traitlets==5.14.3
+tzdata==2025.3
+wcwidth==0.2.14
diff --git a/solution/model/mcc_model.pkl b/solution/model/mcc_model.pkl
new file mode 100644
index 0000000..39376e6
Binary files /dev/null and b/solution/model/mcc_model.pkl differ
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..c738614
--- /dev/null
+++ b/test.py
@@ -0,0 +1,38 @@
+import joblib
+import pandas as pd
+# Import our classes so joblib can deserialize them
+from train import TextExtractor, NumberExtractor
+
+# 1. Load the model
+model = joblib.load('solution/model/mcc_model.pkl')
+
+# 2. Prepare test data (as it will arrive via the API)
+test_json = {
+    "transaction_id": "TX00001116",
+    "terminal_name": "STORE001",
+    "terminal_description": "common common common thing",
+    "city": "NYC",
+    "amount": 272.80,
+    "items": [
+        {"name": "basic loyalty", "price": 58.20},
+        {"name": "Bringiong item lifes", "price": 28.99}
+    ]
+}
+
+
+# 3. Convert to the format the pipeline understands (as in main.py)
+items_str = " ".join([i['name'] for i in test_json['items']])
+full_text = f"{test_json['terminal_name']} {test_json['terminal_description']} {items_str}".lower()
+
+input_df = pd.DataFrame([{
+    'full_text': full_text,
+    'amount': test_json['amount']
+}])
+
+# 4. Predict
+prediction = model.predict(input_df)[0]
+confidence = model.predict_proba(input_df).max()
+
+print(f"Transaction ID: {test_json['transaction_id']}")
+print(f"Predicted MCC: {prediction}")
+print(f"Confidence: {confidence:.4f}")
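In test.py, predict_proba(input_df).max() works only because there is a single row, so the global maximum equals the row maximum. For batch scoring the maximum must be taken per row and mapped back to its class label; a small sketch, reusing test.py's model and input_df names (not part of the commit):

# Batch-safe variant of test.py's scoring
import numpy as np

probs = model.predict_proba(input_df)             # shape: (n_rows, n_classes)
best = probs.argmax(axis=1)                       # index of the top class per row
labels = model.classes_[best]                     # MCC codes aligned with proba columns
confidences = probs[np.arange(len(probs)), best]  # per-row confidence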
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..cbd201e
--- /dev/null
+++ b/train.py
@@ -0,0 +1,124 @@
+import pandas as pd
+import numpy as np
+import joblib
+import os
+from sklearn.model_selection import train_test_split
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.pipeline import Pipeline, FeatureUnion
+from sklearn.preprocessing import StandardScaler
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.metrics import accuracy_score, classification_report
+
+# --- 1. Custom transformers for the pipeline ---
+
+class TextExtractor(BaseEstimator, TransformerMixin):
+    """Extracts the text column for TF-IDF"""
+    def fit(self, X, y=None): return self
+    def transform(self, X):
+        return X['full_text'].fillna('')
+
+class NumberExtractor(BaseEstimator, TransformerMixin):
+    """Extracts the numeric 'amount' column"""
+    def fit(self, X, y=None): return self
+    def transform(self, X):
+        # Return a DataFrame (2-D array) so StandardScaler gets the shape it expects
+        return X[['amount']].fillna(0)
+
+def train_model():
+    print("Loading data...")
+    # File paths (assumes the script runs from the repo root, next to data/)
+    try:
+        tx = pd.read_csv('data/transactions.csv')
+        terminals = pd.read_csv('data/terminals.csv')
+        receipts = pd.read_csv('data/receipts.csv')
+    except FileNotFoundError as e:
+        print(f"Error: data files not found in data/. {e}")
+        return
+
+    # --- 2. Preprocessing and feature assembly ---
+
+    print("Preprocessing...")
+    # Aggregate item names into one string per transaction
+    receipts_agg = receipts.groupby('transaction_id')['item_name'].apply(
+        lambda x: ' '.join(str(i) for i in x)
+    ).reset_index()
+
+    # Join transactions with terminal data and receipts
+    df = tx.merge(terminals[['terminal_id', 'terminal_name', 'terminal_description']], on='terminal_id', how='left')
+    df = df.merge(receipts_agg, on='transaction_id', how='left')
+
+    # Build a single text field (transaction_id excluded to avoid leakage; fillna('') keeps NaNs from becoming literal "nan" tokens)
+    df['full_text'] = (
+        df['terminal_name'].fillna('').astype(str) + " " +
+        df['terminal_description'].fillna('').astype(str) + " " +
+        df['item_name'].fillna('').astype(str)
+    ).str.lower()
+
+    X = df[['full_text', 'amount']]
+    y = df['true_mcc']
+
+    # --- 3. Build the pipeline ---
+
+    pipeline = Pipeline([
+        ('features', FeatureUnion([
+            # TEXT branch
+            ('text_branch', Pipeline([
+                ('extract', TextExtractor()),
+                ('tfidf_union', FeatureUnion([
+                    # Words (capture meaning)
+                    ('word', TfidfVectorizer(
+                        ngram_range=(1, 2),
+                        analyzer='word',
+                        stop_words='english',
+                        max_features=5000
+                    )),
+                    # Characters (robust to typos)
+                    ('char', TfidfVectorizer(
+                        ngram_range=(2, 5),
+                        analyzer='char_wb',
+                        max_features=10000
+                    ))
+                ]))
+            ])),
+            # NUMERIC branch
+            ('numeric_branch', Pipeline([
+                ('extract', NumberExtractor()),
+                ('scaler', StandardScaler())
+            ]))
+        ])),
+        # Classifier
+        ('clf', LogisticRegression(C=1.0, max_iter=1000))
+    ])
+
+    # --- 4. Validation ---
+
+    print("Evaluating on a held-out validation split...")
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, random_state=42, stratify=y
+    )
+
+    pipeline.fit(X_train, y_train)
+    y_pred = pipeline.predict(X_test)
+    probs = pipeline.predict_proba(X_test)
+
+    acc = accuracy_score(y_test, y_pred)
+    conf = np.mean(np.max(probs, axis=1))
+
+    print("\n[RESULTS]")
+    print(f"Accuracy: {acc:.4f}")
+    print(f"Average Confidence: {conf:.4f}")
+    print("\nPer-class report:")
+    print(classification_report(y_test, y_pred))
+
+    # --- 5. Final training and saving ---
+
+    print("\nFinal training on the full dataset...")
+    pipeline.fit(X, y)
+
+    os.makedirs('solution/model', exist_ok=True)
+    joblib.dump(pipeline, 'solution/model/mcc_model.pkl')
+    print("Model saved to solution/model/mcc_model.pkl")
+
+if __name__ == "__main__":
+    train_model()
\ No newline at end of file
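Step 3 of test.py builds the pipeline's input frame inline, and any future service code would have to repeat that logic verbatim. A hypothetical shared helper could keep the two in sync; the json_to_features name is mine, not part of the commit:

# Hypothetical helper producing the input frame train.py's pipeline expects
import pandas as pd

def json_to_features(tx: dict) -> pd.DataFrame:
    """Convert a raw transaction JSON into the one-row frame for the sklearn pipeline."""
    items_str = " ".join(str(i.get("name", "")) for i in tx.get("items", []))
    full_text = f"{tx.get('terminal_name', '')} {tx.get('terminal_description', '')} {items_str}".lower()
    return pd.DataFrame([{"full_text": full_text, "amount": float(tx.get("amount", 0))}])

# Usage, mirroring test.py:
#   model = joblib.load('solution/model/mcc_model.pkl')
#   prediction = model.predict(json_to_features(test_json))[0]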