Initial commit

NightyStudios 2026-01-21 22:08:24 +03:00
commit 74df71d82d
Signed by: temmie
GPG Key ID: A9459339CC5FF071
7 changed files with 404 additions and 0 deletions

dih.py (new file, 93 lines)

@@ -0,0 +1,93 @@
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier


class MCCPredictor:
    def __init__(self, model_path='mcc_classifier.cbm'):
        self.model = CatBoostClassifier()
        self.model.load_model(model_path)
        # Column order must EXACTLY match the one used at training time
        self.feature_order = [
            'terminal_name', 'terminal_description', 'terminal_city', 'items_text', 'text',
            'amount', 'items_count', 'items_total_price', 'items_max_price', 'items_min_price',
            'terminal_id'
        ]

    def _preprocess_json(self, data):
        """Convert an incoming JSON payload into the flat structure the model expects."""
        # 1. Aggregate the data from the items list
        items = data.get('items', [])
        item_names = [str(i.get('name', '')) for i in items]
        item_prices = [float(i.get('price', 0)) for i in items]

        items_text = " ".join(item_names)
        items_count = len(items)
        items_total_price = sum(item_prices)
        items_max_price = max(item_prices) if item_prices else 0
        items_min_price = min(item_prices) if item_prices else 0

        # 2. Build the concatenated 'text' column
        # Important: use the exact same format that was used in train.csv
        combined_text = f"{data.get('terminal_name', '')} {data.get('terminal_description', '')} {data.get('city', '')} items {items_text}"

        # 3. Assemble the final flat dictionary
        flat_data = {
            'terminal_name': str(data.get('terminal_name', '')),
            'terminal_description': str(data.get('terminal_description', '')),
            'terminal_city': str(data.get('city', '')),  # city -> terminal_city
            'items_text': items_text,
            'text': combined_text.lower(),
            'amount': float(data.get('amount', 0)),
            'items_count': float(items_count),
            'items_total_price': float(items_total_price),
            'items_max_price': float(items_max_price),
            'items_min_price': float(items_min_price),
            'terminal_id': 'unknown'  # the request carries no terminal ID, so use a placeholder
        }
        return flat_data

    def predict(self, raw_json):
        # If a single transaction arrived, wrap it in a list
        if isinstance(raw_json, dict):
            raw_json = [raw_json]

        # Preprocess every transaction in the list
        processed_data = [self._preprocess_json(t) for t in raw_json]
        df = pd.DataFrame(processed_data)

        # Enforce the training column order
        df = df[self.feature_order]

        # Prediction
        mcc_codes = self.model.predict(df)
        probs = self.model.predict_proba(df)

        results = []
        for i in range(len(raw_json)):
            results.append({
                "transaction_id": raw_json[i].get('transaction_id'),
                "mcc": int(mcc_codes[i][0]),
                "confidence": round(float(np.max(probs[i])), 4)
            })
        return results


# --- SMOKE TEST ---
if __name__ == "__main__":
    predictor = MCCPredictor('mcc_classifier.cbm')
    request_data = {
        "transaction_id": "TX00001116",
        "terminal_name": "STORE001",
        "terminal_description": "common common common thing",
        "city": "NYC",
        "amount": 272.80,
        "items": [
            {"name": "basic loyalty", "price": 58.20},
            {"name": "Bringiong item lifes", "price": 28.99},
            {"name": "regular item basic item", "price": 56.91}
        ]
    }
    res = predictor.predict(request_data)
    print(res)
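
Since predict wraps a lone dict into a list, batch scoring needs no extra code. A minimal sketch, assuming the predictor above has been constructed; tx_a and tx_b are hypothetical payloads in the same JSON shape as request_data:

# Hypothetical batch call; tx_a and tx_b are placeholder payloads, not real transactions.
tx_a = {"transaction_id": "TX_A", "terminal_name": "STORE001",
        "terminal_description": "common thing", "city": "NYC",
        "amount": 10.00, "items": [{"name": "basic item", "price": 10.00}]}
tx_b = {"transaction_id": "TX_B", "terminal_name": "STORE002",
        "terminal_description": "another thing", "city": "NYC",
        "amount": 5.00, "items": []}  # empty items is safe: max/min fall back to 0
batch = predictor.predict([tx_a, tx_b])
# batch is a list of {"transaction_id", "mcc", "confidence"} dicts,
# one per input transaction, in the same order.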

mcc_classifier.cbm (new binary file, content not shown)

prepare_data.py (new file, 100 lines)

@@ -0,0 +1,100 @@
import re

import pandas as pd

DATA_DIR = "data"


# ---------- Text cleanup ----------
def clean_text(s: str, max_len=1000):
    if not isinstance(s, str):
        return ""
    s = s.lower()
    # --- strip a leading t" or t' prefix ---
    s = re.sub(r'^t["\']', '', s)
    # --- normalize quotes ---
    s = s.replace('"', ' ').replace("'", " ").replace('`', ' ')
    # drop non-ASCII (emoji, CJK characters, etc.)
    s = re.sub(r'[^\x00-\x7F]+', ' ', s)
    # keep only letters and digits
    s = re.sub(r'[^a-z0-9\s]', ' ', s)
    # collapse consecutive repeated words
    words = s.split()
    dedup = []
    prev = None
    for w in words:
        if w != prev:
            dedup.append(w)
        prev = w
    s = " ".join(dedup)
    # normalize whitespace and truncate
    s = re.sub(r'\s+', ' ', s).strip()
    return s[:max_len]


# ---------- LOAD ----------
transactions = pd.read_csv(f"{DATA_DIR}/transactions.csv")
terminals = pd.read_csv(f"{DATA_DIR}/terminals.csv")
receipts = pd.read_csv(f"{DATA_DIR}/receipts.csv")

# ---------- CLEAN TEXT ----------
for col in ["terminal_name", "terminal_description", "terminal_city"]:
    if col == "terminal_city":
        terminals[col] = terminals[col].astype(str).apply(clean_text)
    else:
        terminals[col] = terminals[col].apply(clean_text)
receipts["item_name"] = receipts["item_name"].apply(clean_text)

# ---------- AGGREGATE RECEIPTS ----------
receipt_agg = receipts.groupby("transaction_id").agg(
    items_text=("item_name", lambda x: " ".join(x)),
    items_count=("item_name", "count"),
    items_total_price=("item_price", "sum"),
    items_max_price=("item_price", "max"),
    items_min_price=("item_price", "min"),
).reset_index()

# ---------- MERGE WITH TRANSACTIONS ----------
df = transactions[["transaction_id", "terminal_id", "amount", "true_mcc"]].merge(
    terminals[["terminal_id", "terminal_name", "terminal_description", "terminal_city"]],
    on="terminal_id",
    how="left"
)
df = df.merge(receipt_agg, on="transaction_id", how="left")

# ---------- FILL NA ----------
for col in ["items_text", "terminal_name", "terminal_description", "terminal_city"]:
    df[col] = df[col].fillna("")
for col in ["items_count", "items_total_price", "items_max_price", "items_min_price"]:
    df[col] = df[col].fillna(0)

# ---------- BUILD FINAL TEXT ----------
df["text"] = (
    df["terminal_name"] + " " +
    df["terminal_description"] + " " +
    df["terminal_city"] + " " +
    " items " + df["items_text"] + " items " +
    df["items_text"]
)
df["text"] = df["text"].apply(clean_text)

# ---------- FINAL CHECK ----------
print("rows:", len(df))
print("unique tx:", df["transaction_id"].nunique())
print(df["true_mcc"].value_counts())
assert len(df) == df["transaction_id"].nunique()
assert df["text"].str.len().min() > 0

# ---------- SAVE ----------
df.to_csv("train.csv", index=False)
print("saved train.csv")

req.txt (new file, 49 lines)

@@ -0,0 +1,49 @@
asttokens==3.0.1
catboost==1.2.8
comm==0.2.3
contourpy==1.3.3
cycler==0.12.1
debugpy==1.8.19
decorator==5.2.1
executing==2.2.1
fonttools==4.61.1
graphviz==0.21
ipykernel==7.1.0
ipython==9.9.0
ipython-pygments-lexers==1.1.1
jedi==0.19.2
joblib==1.5.3
jupyter-client==8.8.0
jupyter-core==5.9.1
kiwisolver==1.4.9
matplotlib==3.10.8
matplotlib-inline==0.2.1
narwhals==2.15.0
nest-asyncio==1.6.0
numpy==2.4.1
packaging==25.0
pandas==2.3.3
parso==0.8.5
pexpect==4.9.0
pillow==12.1.0
pip==25.0.1
platformdirs==4.5.1
plotly==6.5.2
prompt-toolkit==3.0.52
psutil==7.2.1
ptyprocess==0.7.0
pure-eval==0.2.3
pygments==2.19.2
pyparsing==3.3.2
python-dateutil==2.9.0.post0
pytz==2025.2
pyzmq==27.1.0
scikit-learn==1.8.0
scipy==1.17.0
six==1.17.0
stack-data==0.6.3
threadpoolctl==3.6.0
tornado==6.5.4
traitlets==5.14.3
tzdata==2025.3
wcwidth==0.2.14

Binary file not shown.

test.py (new file, 38 lines)

@@ -0,0 +1,38 @@
import joblib
import pandas as pd

# Import our custom classes so that joblib can deserialize them
from train import TextExtractor, NumberExtractor

# 1. Load the model
model = joblib.load('solution/model/mcc_model.pkl')

# 2. Prepare test data (in the shape it will arrive at the API)
test_json = {
    "transaction_id": "TX00001116",
    "terminal_name": "STORE001",
    "terminal_description": "common common common thing",
    "city": "NYC",
    "amount": 272.80,
    "items": [
        {"name": "basic loyalty", "price": 58.20},
        {"name": "Bringiong item lifes", "price": 28.99}
    ]
}

# 3. Convert into the format the pipeline understands (same as in main.py)
items_str = " ".join([i['name'] for i in test_json['items']])
full_text = f"{test_json['terminal_name']} {test_json['terminal_description']} {items_str}".lower()

input_df = pd.DataFrame([{
    'full_text': full_text,
    'amount': test_json['amount']
}])

# 4. Run the prediction
prediction = model.predict(input_df)[0]
confidence = model.predict_proba(input_df).max()

print(f"Transaction ID: {test_json['transaction_id']}")
print(f"Predicted MCC: {prediction}")
print(f"Confidence: {confidence:.4f}")

train.py (new file, 124 lines)

@@ -0,0 +1,124 @@
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler


# --- 1. Custom transformers for the pipeline ---
class TextExtractor(BaseEstimator, TransformerMixin):
    """Extracts the text column for TF-IDF."""
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X['full_text'].fillna('')


class NumberExtractor(BaseEstimator, TransformerMixin):
    """Extracts the numeric 'amount' column."""
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Return a DataFrame (2D array) for StandardScaler
        return X[['amount']].fillna(0)


def train_model():
    print("Loading data...")
    # File paths (assuming the script runs from the repo root, next to data/)
    try:
        tx = pd.read_csv('data/transactions.csv')
        terminals = pd.read_csv('data/terminals.csv')
        receipts = pd.read_csv('data/receipts.csv')
    except FileNotFoundError as e:
        print(f"Error: data files not found in data/. {e}")
        return

    # --- 2. Preprocessing and feature assembly ---
    print("Preprocessing...")
    # Aggregate item names into one string per transaction
    receipts_agg = receipts.groupby('transaction_id')['item_name'].apply(
        lambda x: ' '.join(str(i) for i in x)
    ).reset_index()

    # Join transactions with terminal data and receipts
    df = tx.merge(terminals[['terminal_id', 'terminal_name', 'terminal_description']], on='terminal_id', how='left')
    df = df.merge(receipts_agg, on='transaction_id', how='left')

    # Build a single text field (transaction_id is deliberately excluded to avoid leakage!)
    df['full_text'] = (
        df['terminal_name'].astype(str) + " " +
        df['terminal_description'].astype(str) + " " +
        df['item_name'].astype(str)
    ).str.lower()

    X = df[['full_text', 'amount']]
    y = df['true_mcc']

    # --- 3. Build the pipeline ---
    pipeline = Pipeline([
        ('features', FeatureUnion([
            # TEXT branch
            ('text_branch', Pipeline([
                ('extract', TextExtractor()),
                ('tfidf_union', FeatureUnion([
                    # Words (semantics)
                    ('word', TfidfVectorizer(
                        ngram_range=(1, 2),
                        analyzer='word',
                        stop_words='english',
                        max_features=5000
                    )),
                    # Characters (robust to typos)
                    ('char', TfidfVectorizer(
                        ngram_range=(2, 5),
                        analyzer='char_wb',
                        max_features=10000
                    ))
                ]))
            ])),
            # NUMERIC branch
            ('numeric_branch', Pipeline([
                ('extract', NumberExtractor()),
                ('scaler', StandardScaler())
            ]))
        ])),
        # Classifier
        ('clf', LogisticRegression(C=1.0, max_iter=1000))
    ])

    # --- 4. Quality check (validation) ---
    print("Evaluating on a validation split...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    probs = pipeline.predict_proba(X_test)

    acc = accuracy_score(y_test, y_pred)
    conf = np.mean(np.max(probs, axis=1))

    print("\n[RESULTS]")
    print(f"Accuracy: {acc:.4f}")
    print(f"Average Confidence: {conf:.4f}")
    print("\nPer-class report:")
    print(classification_report(y_test, y_pred))

    # --- 5. Final training and saving ---
    print("\nFinal training on the full dataset...")
    pipeline.fit(X, y)
    os.makedirs('solution/model', exist_ok=True)
    joblib.dump(pipeline, 'solution/model/mcc_model.pkl')
    print("Model saved to solution/model/mcc_model.pkl")


if __name__ == "__main__":
    train_model()
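
A scikit-learn Pipeline exposes classes_ from its final estimator, so predict_proba columns can be mapped back to MCC labels after loading. A minimal sketch, assuming train.py has already written solution/model/mcc_model.pkl; the sample row is an invented placeholder:

import joblib
import pandas as pd
# The custom transformers must be importable for joblib to unpickle the pipeline.
from train import TextExtractor, NumberExtractor  # noqa: F401

pipe = joblib.load('solution/model/mcc_model.pkl')
sample = pd.DataFrame([{'full_text': 'store one grocery milk', 'amount': 9.99}])
proba = pipe.predict_proba(sample)[0]
# Print the top-3 candidate MCCs with their probabilities
for mcc, p in sorted(zip(pipe.classes_, proba), key=lambda t: -t[1])[:3]:
    print(mcc, round(float(p), 4))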