"""Build ``train.csv``: one row per transaction with a cleaned text feature.

The script reads ``transactions.csv``, ``terminals.csv`` and ``receipts.csv``
from ``DATA_DIR``, normalises the free-text columns with :func:`clean_text`,
aggregates receipt items per transaction, joins everything onto the
transactions table and writes the result to ``train.csv``.
"""
import itertools
import re

import pandas as pd

DATA_DIR = "data"

# Patterns are compiled once because clean_text runs for every cell.
_PREFIX_RE = re.compile(r'^t["\']')            # leading t" or t' artefact
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')   # emoji, CJK, accented letters...
_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')     # punctuation, quotes, symbols


# ---------- Text cleaning ----------
def clean_text(s: object, max_len: int = 1000) -> str:
    """Normalise a raw text value into lowercase ASCII words.

    Steps, in order: lowercase; drop a leading ``t"`` / ``t'`` prefix;
    replace non-ASCII runs and every character other than ``a-z``, ``0-9``
    or whitespace with a space; collapse consecutive repeated words;
    join the words with single spaces; truncate to ``max_len`` characters.

    Args:
        s: Value to clean. Anything that is not a ``str`` (e.g. NaN, None,
            numbers) yields an empty string.
        max_len: Maximum length of the returned string.

    Returns:
        The cleaned text, with no leading or trailing whitespace.
    """
    if not isinstance(s, str):
        return ""

    s = s.lower()
    s = _PREFIX_RE.sub('', s)
    s = _NON_ASCII_RE.sub(' ', s)
    # Quotes and backticks are not in [a-z0-9\s], so this pass also
    # replaces them with spaces; no separate quote handling is needed.
    s = _NON_ALNUM_RE.sub(' ', s)

    # groupby with no key groups runs of equal items, so taking each group
    # key drops consecutive duplicate words ("coffee coffee shop").
    words = [word for word, _ in itertools.groupby(s.split())]

    # split()/join already leaves single spaces and no outer whitespace.
    # rstrip() after slicing removes the space left behind when the cut
    # lands right after a word.
    return " ".join(words)[:max_len].rstrip()


# The pipeline only runs when the file is executed as a script; importing the
# module just exposes clean_text and DATA_DIR. The statements stay at this
# level (not inside a function) so the intermediate frames remain module
# globals, as they were before.
if __name__ == "__main__":
    # ---------- LOAD ----------
    transactions = pd.read_csv(f"{DATA_DIR}/transactions.csv")
    terminals = pd.read_csv(f"{DATA_DIR}/terminals.csv")
    receipts = pd.read_csv(f"{DATA_DIR}/receipts.csv")

    # ---------- CLEAN TEXT ----------
    for col in ["terminal_name", "terminal_description", "terminal_city"]:
        values = terminals[col]
        if col == "terminal_city":
            # City values are stringified before cleaning. astype(str) alone
            # would turn missing values into the literal text "nan", which
            # would then leak into the final text feature, so missing
            # entries are mapped to "" instead.
            values = values.astype(str).where(values.notna(), "")
        terminals[col] = values.apply(clean_text)

    receipts["item_name"] = receipts["item_name"].apply(clean_text)

    # ---------- AGGREGATE RECEIPTS ----------
    receipt_agg = receipts.groupby("transaction_id").agg(
        items_text=("item_name", lambda x: " ".join(x)),
        items_count=("item_name", "count"),
        items_total_price=("item_price", "sum"),
        items_max_price=("item_price", "max"),
        items_min_price=("item_price", "min"),
    ).reset_index()

    # ---------- MERGE WITH TRANSACTIONS ----------
    df = transactions[
        ["transaction_id", "terminal_id", "amount", "true_mcc"]
    ].merge(
        terminals[
            ["terminal_id", "terminal_name", "terminal_description",
             "terminal_city"]
        ],
        on="terminal_id",
        how="left",
    )
    df = df.merge(receipt_agg, on="transaction_id", how="left")

    # ---------- FILL NA ----------
    # Left joins leave NaN for transactions without a terminal row or
    # without receipts.
    for col in ["items_text", "terminal_name", "terminal_description",
                "terminal_city"]:
        df[col] = df[col].fillna("")

    for col in ["items_count", "items_total_price", "items_max_price",
                "items_min_price"]:
        df[col] = df[col].fillna(0)

    # ---------- BUILD FINAL TEXT ----------
    # The items text is appended twice, each time after an "items" marker.
    df["text"] = (
        df["terminal_name"] + " " +
        df["terminal_description"] + " " +
        df["terminal_city"] + " " +
        " items " + df["items_text"] +
        " items " + df["items_text"]
    )
    df["text"] = df["text"].apply(clean_text)

    # ---------- FINAL CHECK ----------
    n_rows = len(df)
    n_unique_tx = df["transaction_id"].nunique()
    print("rows:", n_rows)
    print("unique tx:", n_unique_tx)
    print(df["true_mcc"].value_counts())

    # Explicit raises instead of `assert` so the checks survive `python -O`.
    if n_rows != n_unique_tx:
        raise AssertionError(
            f"expected one row per transaction: {n_rows} rows, "
            f"{n_unique_tx} unique transaction_id values"
        )
    if not df["text"].str.len().min() > 0:
        raise AssertionError("found a row with empty text (or no rows at all)")

    # ---------- SAVE ----------
    df.to_csv("train.csv", index=False)
    print("saved train.csv")