import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix, precision_score, recall_score,
    roc_auc_score, average_precision_score, precision_recall_curve
)
# 1) Load a tiny customer churn CSV called churn.csv.
df = pd.read_csv("churn.csv")
# 2) Quick, safe checks: missing values and class balance.
missing_share = df.isna().mean().sort_values(ascending=False)
class_share = df["churn"].value_counts(normalize=True).rename("share")
print("Missing share (top 5):\n", missing_share.head(5), "\n")
print("Class share:\n", class_share, "\n")
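# Sanity check (a minimal sketch): the rest of the script assumes a binary 0/1
# label named "churn". Note that XGBoost's tree learners handle NaN features
# natively (each split learns a default direction), so we skip imputation here.
assert set(df["churn"].dropna().unique()) <= {0, 1}, "expected a 0/1 churn label"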
# 3) Split the data into train / validation / test (60-20-20), stratified on the label.
X = df.drop(columns=["churn"])
y = df["churn"]
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.20, stratify=y, random_state=13)
X_tr, X_va, y_tr, y_va = train_test_split(X_tr, y_tr, test_size=0.25, stratify=y_tr, random_state=13)
# scale_pos_weight = negatives / positives upweights the minority (churn) class in the loss.
neg, pos = int((y_tr == 0).sum()), int((y_tr == 1).sum())
spw = neg / max(pos, 1)
print(f"Shapes -> train {X_tr.shape}, val {X_va.shape}, test {X_te.shape}")
print(f"Class balance in train -> neg {neg}, pos {pos}, scale_pos_weight {spw:.2f}\n")
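# Optional check (a minimal sketch): stratified splits should keep the positive
# share nearly identical across the three sets; large drift signals a bad split.
for name, ys in [("train", y_tr), ("val", y_va), ("test", y_te)]:
    print(name, "positive share:", round(float(ys.mean()), 3))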
# Wrap as DMatrix (fast internal format)
feat_names = list(X.columns)
dtr = xgb.DMatrix(X_tr, label=y_tr, feature_names=feat_names)
dva = xgb.DMatrix(X_va, label=y_va, feature_names=feat_names)
dte = xgb.DMatrix(X_te, label=y_te, feature_names=feat_names)
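# Memory note (a hedged aside, assumes xgboost >= 1.7): with tree_method="hist",
# QuantileDMatrix pre-bins the training data and can cut memory use substantially.
# dtr = xgb.QuantileDMatrix(X_tr, label=y_tr, feature_names=feat_names)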
# 4) Train XGBoost with early stopping using the Booster API.
params = dict(
    objective="binary:logistic",
    eval_metric="aucpr",   # PR AUC suits the imbalanced churn label better than accuracy
    tree_method="hist",
    max_depth=5,
    eta=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    scale_pos_weight=spw,
)
bst = xgb.train(params, dtr, num_boost_round=4000, evals=[(dva, "val")],
                early_stopping_rounds=200, verbose_eval=False)
print("Best trees (baseline):", bst.best_iteration)
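# If you also want the full validation curve (a sketch; nothing below needs it):
# xgb.train accepts an `evals_result` dict that collects the per-round "aucpr"
# history for every dataset in `evals`. Here we retrain to the best round found
# above purely to capture that history.
history = {}
_ = xgb.train(params, dtr, num_boost_round=bst.best_iteration + 1,
              evals=[(dva, "val")], evals_result=history, verbose_eval=False)
print("Val aucpr at best round:", round(history["val"]["aucpr"][-1], 4))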
# 5) Choose a practical decision threshold from validation scores, "a line in the sand".
p_va = bst.predict(dva, iteration_range=(0, bst.best_iteration + 1))
pre, rec, thr = precision_recall_curve(y_va, p_va)
f1 = 2 * pre * rec / np.clip(pre + rec, 1e-9, None)
t_best = float(thr[np.argmax(f1[:-1])])  # thr has one fewer entry than pre/rec
print("Chosen threshold t_best (validation F1):", round(t_best, 3), "\n")
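# F1 is one choice; a business may prefer an expected-value threshold instead.
# A hedged sketch with illustrative numbers (not from the data): assume a saved
# churner is worth 100 and each retention offer costs 10, then pick the
# candidate threshold that maximizes net value on the validation set.
value_saved, offer_cost = 100.0, 10.0  # hypothetical economics, adjust to taste
yv = y_va.to_numpy()

def net_value(t):
    flag = p_va >= t
    return value_saved * float((flag & (yv == 1)).sum()) - offer_cost * float(flag.sum())

t_ev = float(max(np.unique(np.round(p_va, 3)), key=net_value))
print("Cost-aware threshold (illustrative):", round(t_ev, 3))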
# 6) Explain results on the test set in plain terms: confusion matrix, precision, recall, ROC AUC, PR AUC.
p_te = bst.predict(dte, iteration_range=(0, bst.best_iteration + 1))
pred = (p_te >= t_best).astype(int)
cm = confusion_matrix(y_te, pred)
print("Confusion matrix:\n", cm)
print("Precision:", round(precision_score(y_te, pred), 3))
print("Recall   :", round(recall_score(y_te, pred), 3))
print("ROC AUC  :", round(roc_auc_score(y_te, p_te), 3))
print("PR AUC   :", round(average_precision_score(y_te, p_te), 3), "\n")
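# The same numbers in plain words (a small sketch): sklearn orders a binary
# confusion matrix as [[tn, fp], [fn, tp]].
tn, fp, fn, tp = cm.ravel()
print(f"Missed churners (fn): {fn}, needless offers (fp): {fp}\n")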
# 7) See which columns mattered most.
# (Hint: customers who call the call centre a lot usually have a problem and are likely to quit.)
imp = pd.Series(bst.get_score(importance_type="gain")).sort_values(ascending=False)
print("Top features by importance (gain):\n", imp.head(10), "\n")
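# Gain ranks features globally; for per-customer explanations (a sketch),
# Booster.predict with pred_contribs=True returns SHAP-style contributions,
# one column per feature plus a final bias column.
contribs = bst.predict(dte, pred_contribs=True,
                       iteration_range=(0, bst.best_iteration + 1))
mean_abs = pd.Series(np.abs(contribs[:, :-1]).mean(axis=0), index=feat_names)
print("Mean |contribution| (top 5):\n", mean_abs.sort_values(ascending=False).head(5), "\n")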
# 8) Add two business rules with monotonic constraints:
#    risk must not fall as debt_ratio rises (+1), nor rise as tenure_months grows (-1).
cons = [0] * len(feat_names)
if "debt_ratio" in feat_names:
    cons[feat_names.index("debt_ratio")] = 1    # non-decreasing
if "tenure_months" in feat_names:
    cons[feat_names.index("tenure_months")] = -1  # non-increasing
mono = "(" + ",".join(map(str, cons)) + ")"
params_cons = params.copy()
params_cons.update({"monotone_constraints": mono, "max_bin": 512})
bst_cons = xgb.train(params_cons, dtr, num_boost_round=4000, evals=[(dva, "val")],
                     early_stopping_rounds=200, verbose_eval=False)
print("Best trees (constrained):", bst_cons.best_iteration)
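# Spot-check the constraint (a sketch): sweep debt_ratio on one validation row,
# holding everything else fixed; the constrained score must never decrease.
if "debt_ratio" in feat_names:
    sweep = pd.concat([X_va.iloc[[0]]] * 25, ignore_index=True)
    sweep["debt_ratio"] = np.linspace(X_va["debt_ratio"].min(),
                                      X_va["debt_ratio"].max(), 25)
    ps = bst_cons.predict(xgb.DMatrix(sweep, feature_names=feat_names),
                          iteration_range=(0, bst_cons.best_iteration + 1))
    print("Monotone in debt_ratio:", bool(np.all(np.diff(ps) >= -1e-9)))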
# 9) Compare the quality of bst_cons and bst in a few lines.
p_cons = bst_cons.predict(dte, iteration_range=(0, bst_cons.best_iteration + 1))
print("PR AUC baseline vs constrained:", round(average_precision_score(y_te, p_te), 3),
      "vs", round(average_precision_score(y_te, p_cons), 3))
print("ROC AUC baseline vs constrained:", round(roc_auc_score(y_te, p_te), 3),
      "vs", round(roc_auc_score(y_te, p_cons), 3), "\n")
# 10) Save both models in the portable UBJSON format.
bst.save_model("easy_xgb_base.ubj")
bst_cons.save_model("easy_xgb_cons.ubj")
print("Saved models: easy_xgb_base.ubj, easy_xgb_cons.ubj")
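# Reloading later (a sketch): UBJSON files round-trip through Booster.load_model.
# The decision threshold t_best is not stored in the model, so persist it separately.
loaded = xgb.Booster()
loaded.load_model("easy_xgb_base.ubj")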