101 lines
3.0 KiB
Python
101 lines
3.0 KiB
Python
|
|
import pandas as pd
|
||
|
|
import re
|
||
|
|
|
||
|
|
# Directory that holds the input CSV files read by this script.
DATA_DIR = "data"
|
||
|
|
|
||
|
|
# ---------- TEXT CLEANING ----------
|
||
|
|
def clean_text(s: object, max_len: int = 1000) -> str:
    """Normalize free text into lowercase ASCII words separated by single spaces.

    Steps, in order: lowercase; drop a leading ``t"`` / ``t'`` prefix; turn
    every character that is not ``a-z``, ``0-9`` or whitespace into a space;
    collapse words repeated back to back; join the words with single spaces;
    truncate to ``max_len`` characters.

    Args:
        s: Value to clean. Anything that is not a ``str`` (e.g. ``None`` or
            NaN) yields ``""``.
        max_len: Maximum length of the returned string.

    Returns:
        The cleaned text, with no leading or trailing whitespace.
    """
    if not isinstance(s, str):
        return ""

    s = s.lower()

    # Drop a leading t" or t' prefix.
    s = re.sub(r'^t["\']', '', s)

    # Keep only ASCII letters, digits and whitespace. Quotes, backticks,
    # punctuation, emoji and other non-ASCII characters all fall outside this
    # class, so one pass covers them; any whitespace kept here is normalized
    # by the split/join below.
    s = re.sub(r'[^a-z0-9\s]', ' ', s)

    # Drop words that repeat the word immediately before them.
    words = s.split()
    deduped = [word for prev, word in zip([None, *words], words) if word != prev]

    # Joining with single spaces already normalizes whitespace. Strip again
    # after truncating, because the cut can land right after a space.
    return " ".join(deduped)[:max_len].rstrip()
|
||
|
|
|
||
|
|
|
||
|
|
# ---------- LOAD ----------
|
||
|
|
transactions = pd.read_csv(f"{DATA_DIR}/transactions.csv")
terminals = pd.read_csv(f"{DATA_DIR}/terminals.csv")
receipts = pd.read_csv(f"{DATA_DIR}/receipts.csv")

# ---------- CLEAN TEXT ----------
# clean_text maps every non-string value (including NaN) to "".
for col in ["terminal_name", "terminal_description"]:
    terminals[col] = terminals[col].apply(clean_text)

# terminal_city values are stringified before cleaning so that non-string
# cities are kept. Missing values are mapped to "" explicitly: casting the
# whole column with astype(str) would turn NaN into the literal "nan", which
# survives clean_text and is not caught by the later fillna("").
terminals["terminal_city"] = terminals["terminal_city"].apply(
    lambda city: "" if pd.isna(city) else clean_text(str(city))
)

receipts["item_name"] = receipts["item_name"].apply(clean_text)
|
||
|
|
|
||
|
|
# ---------- AGGREGATE RECEIPTS ----------
|
||
|
|
# One row per transaction_id: item names joined with spaces, plus the
# count of items and the sum / max / min of item_price.
receipt_features = {
    "items_text": pd.NamedAgg(column="item_name", aggfunc=" ".join),
    "items_count": pd.NamedAgg(column="item_name", aggfunc="count"),
    "items_total_price": pd.NamedAgg(column="item_price", aggfunc="sum"),
    "items_max_price": pd.NamedAgg(column="item_price", aggfunc="max"),
    "items_min_price": pd.NamedAgg(column="item_price", aggfunc="min"),
}
receipt_agg = (
    receipts.groupby("transaction_id")
    .agg(**receipt_features)
    .reset_index()
)
|
||
|
|
|
||
|
|
# ---------- MERGE WITH TRANSACTIONS ----------
|
||
|
|
# Left-join terminal attributes onto each transaction, then the per-transaction
# receipt aggregates; transactions without a match keep NaN in the added columns.
df = (
    transactions[["transaction_id", "terminal_id", "amount", "true_mcc"]]
    .merge(
        terminals[["terminal_id", "terminal_name", "terminal_description", "terminal_city"]],
        how="left",
        on="terminal_id",
    )
    .merge(receipt_agg, how="left", on="transaction_id")
)
|
||
|
|
|
||
|
|
# ---------- FILL NA ----------
|
||
|
|
# Values left missing by the left joins: text columns become "",
# receipt statistics become 0.
for columns, filler in (
    (["items_text", "terminal_name", "terminal_description", "terminal_city"], ""),
    (["items_count", "items_total_price", "items_max_price", "items_min_price"], 0),
):
    df[columns] = df[columns].fillna(filler)
|
||
|
|
|
||
|
|
# ---------- BUILD FINAL TEXT ----------
|
||
|
|
# Terminal fields separated by spaces, followed by the receipt item text
# twice, each copy introduced by the literal " items " marker.
terminal_part = (
    df["terminal_name"] + " " + df["terminal_description"] + " " + df["terminal_city"] + " "
)
items_part = " items " + df["items_text"]

# Run the concatenation through clean_text once more so the final text is
# normalized and length-capped as a whole.
df["text"] = (terminal_part + items_part + items_part).apply(clean_text)
|
||
|
|
|
||
|
|
# ---------- FINAL CHECK ----------
|
||
|
|
row_count = len(df)
unique_tx_count = df["transaction_id"].nunique()

print("rows:", row_count)
print("unique tx:", unique_tx_count)
print(df["true_mcc"].value_counts())

# Validate with explicit raises rather than `assert`, which is stripped when
# Python runs with -O and would let a bad dataset be written silently.
if row_count != unique_tx_count:
    raise ValueError(
        f"expected one row per transaction_id, got {row_count} rows "
        f"for {unique_tx_count} unique ids"
    )

# `not (x > 0)` also rejects NaN, which min() returns for an empty frame.
shortest_text = df["text"].str.len().min()
if not shortest_text > 0:
    raise ValueError(f"empty or missing text found (min length: {shortest_text})")

# ---------- SAVE ----------
df.to_csv("train.csv", index=False)
print("saved train.csv")
|