# mlops/prepare_data.py
# (revision timestamp shown by the code viewer: 2026-01-21 21:08:24 +02:00)
import pandas as pd
import re
# Directory (relative path) that holds the input CSV files read below.
DATA_DIR = "data"
# ---------- Text cleaning ----------
def clean_text(s: object, max_len: int = 1000) -> str:
    """Normalize free text into a lowercase ASCII alphanumeric token string.

    Processing steps, in order:

    1. Anything that is not a ``str`` (e.g. a float NaN) yields ``""``.
    2. The text is lowercased.
    3. A leading ``t"`` or ``t'`` prefix is dropped.
    4. Only runs of ``a-z`` / ``0-9`` are kept as words; every other
       character (quotes, punctuation, non-ASCII such as emoji or CJK,
       whitespace) acts as a word separator.
    5. Immediately repeated words are collapsed ("a a b a" -> "a b a").
    6. Words are joined with single spaces and the result is cut to
       ``max_len`` characters.

    Args:
        s: Value to clean; non-string values are treated as empty text.
        max_len: Maximum length of the returned string.

    Returns:
        The cleaned text, with no leading or trailing whitespace.
    """
    if not isinstance(s, str):
        return ""

    # Lowercase first so the prefix pattern and the a-z class see one case.
    s = re.sub(r'^t["\']', '', s.lower())

    # Maximal alphanumeric runs: equivalent to blanking every other character
    # and then splitting on whitespace.
    tokens = re.findall(r'[a-z0-9]+', s)

    # Drop a word when it equals the word immediately before it.
    deduped = [w for prev, w in zip([None, *tokens], tokens) if w != prev]

    # The cut may land right after a separator; strip it so the result never
    # ends with a space.
    return " ".join(deduped)[:max_len].rstrip()
# ---------- LOAD ----------
# Read the three input tables from DATA_DIR, in this fixed order.
_csv_paths = (
    f"{DATA_DIR}/transactions.csv",
    f"{DATA_DIR}/terminals.csv",
    f"{DATA_DIR}/receipts.csv",
)
transactions, terminals, receipts = map(pd.read_csv, _csv_paths)
# ---------- CLEAN TEXT ----------
# Terminal name/description: clean_text already maps non-string values
# (including missing ones) to "".
for col in ["terminal_name", "terminal_description"]:
    terminals[col] = terminals[col].apply(clean_text)

# City values are stringified before cleaning so non-string values are kept.
# Missing values must be handled first: a blanket astype(str) would turn NaN
# into the literal text "nan", which then survives cleaning and the later
# fillna("") and leaks into the final text.
terminals["terminal_city"] = terminals["terminal_city"].apply(
    lambda v: "" if pd.isna(v) else clean_text(str(v))
)

receipts["item_name"] = receipts["item_name"].apply(clean_text)
# ---------- AGGREGATE RECEIPTS ----------
# Collapse receipt lines to one row per transaction_id: the item names joined
# into a single string, plus count / sum / max / min statistics.
_receipt_aggregations = {
    "items_text": pd.NamedAgg(
        column="item_name",
        aggfunc=lambda names: " ".join(names),
    ),
    "items_count": pd.NamedAgg(column="item_name", aggfunc="count"),
    "items_total_price": pd.NamedAgg(column="item_price", aggfunc="sum"),
    "items_max_price": pd.NamedAgg(column="item_price", aggfunc="max"),
    "items_min_price": pd.NamedAgg(column="item_price", aggfunc="min"),
}
_receipts_by_tx = receipts.groupby("transaction_id")
receipt_agg = _receipts_by_tx.agg(**_receipt_aggregations)
# Turn the transaction_id group key back into a regular column.
receipt_agg = receipt_agg.reset_index()
# ---------- MERGE WITH TRANSACTIONS ----------
# Left joins keep every transaction row even when no terminal or receipt
# record matches; the unmatched columns are filled with NaN.
_tx_columns = ["transaction_id", "terminal_id", "amount", "true_mcc"]
_terminal_columns = [
    "terminal_id",
    "terminal_name",
    "terminal_description",
    "terminal_city",
]
df = pd.merge(
    transactions[_tx_columns],
    terminals[_terminal_columns],
    on="terminal_id",
    how="left",
)
df = pd.merge(df, receipt_agg, on="transaction_id", how="left")
# ---------- FILL NA ----------
# Per-column replacement for missing values: empty string for the text
# columns, zero for the receipt statistics.
_fill_values = {
    "items_text": "",
    "terminal_name": "",
    "terminal_description": "",
    "terminal_city": "",
    "items_count": 0,
    "items_total_price": 0,
    "items_max_price": 0,
    "items_min_price": 0,
}
df = df.fillna(value=_fill_values)
# ---------- BUILD FINAL TEXT ----------
# Terminal fields first, then the receipt item names twice, each copy
# introduced by the literal marker " items ".
_terminal_part = (
    df["terminal_name"] + " " + df["terminal_description"] + " " + df["terminal_city"]
)
_items_part = " items " + df["items_text"] + " items " + df["items_text"]
_raw_text = _terminal_part + " " + _items_part
# Re-clean the combined string: collapses the doubled spaces and immediate
# word repeats, and caps the length at clean_text's default max_len.
df["text"] = _raw_text.apply(clean_text)
# ---------- FINAL CHECK ----------
_n_rows = len(df)
_n_unique_tx = df["transaction_id"].nunique()
print("rows:", _n_rows)
print("unique tx:", _n_unique_tx)
print(df["true_mcc"].value_counts())

# These checks guard the file written afterwards, so they are raised
# explicitly: plain `assert` statements are removed when Python runs with -O.
# AssertionError is kept as the exception type so failure behavior is unchanged.
if _n_rows != _n_unique_tx:
    raise AssertionError(
        f"expected one row per transaction_id, got {_n_rows} rows "
        f"for {_n_unique_tx} unique ids"
    )

# On an empty frame min() is NaN and `NaN > 0` is False, so an empty frame
# is rejected here as well.
# NOTE(review): the text column is built with a literal "items" marker, so
# for a non-empty frame this check looks unable to fail; confirm whether it
# was meant to detect rows with no descriptive text.
_min_text_len = df["text"].str.len().min()
if not _min_text_len > 0:
    raise AssertionError(
        f"every row must have non-empty text, minimum length is {_min_text_len}"
    )
# ---------- SAVE ----------
# Write the prepared frame to the current working directory, without the
# DataFrame index column.
_output_csv = "train.csv"
df.to_csv(_output_csv, index=False)
print("saved train.csv")