Steps Summary #


  1. Split the sample data into training and validation data
  2. Train the model with the training data
  3. Evaluate the model with validation data
  4. Find the optimal tree size (max leaf nodes) using MAE (Mean Absolute Error)
  5. Create a final model by training on all the data using the optimal tree size found above


Code #



import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Load the training data, then pick the prediction target (SalePrice)
# and a fixed set of numeric feature columns.
home_data = pd.read_csv('train.csv')
y = home_data.SalePrice
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = home_data[features]

# Split into validation and training data (fixed seed for reproducibility)
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Specify Model
iowa_model = DecisionTreeRegressor(random_state=1)
# Fit Model
iowa_model.fit(train_X, train_y)

# Make validation predictions and calculate mean absolute error.
# FIX: mean_absolute_error expects (y_true, y_pred); the original passed
# them swapped. MAE is symmetric so the value was unaffected, but the
# corrected order matches the sklearn signature and the get_mae helper
# below (and matters for asymmetric metrics such as MAPE).
val_predictions = iowa_model.predict(val_X)
val_mae = mean_absolute_error(val_y, val_predictions)
print("Validation MAE: {:,.0f}".format(val_mae))
 



* Mean Absolute Error
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a decision tree capped at *max_leaf_nodes* on the training
    split and return its mean absolute error on the validation split."""
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)
    validation_predictions = tree.predict(val_X)
    return mean_absolute_error(val_y, validation_predictions)
  


* get best tree size (max leaf nodes)
# Candidate caps on the number of leaf nodes to compare.
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# Score each candidate on the held-out validation split, then keep the
# one whose validation MAE is smallest.
mae_map = {}
for leaf_cap in candidate_max_leaf_nodes:
    mae_map[leaf_cap] = get_mae(leaf_cap, train_X, val_X, train_y, val_y)
print(mae_map)
best_tree_size = min(mae_map, key=mae_map.get)
print(f"best: {best_tree_size}")


* get final model fitted with best tree size
# Retrain on the full dataset with the winning leaf-node cap so no rows
# are wasted on a validation holdout; DecisionTreeRegressor.fit returns
# the fitted estimator itself, so chaining yields the same final_model.
final_model = DecisionTreeRegressor(random_state=1, max_leaf_nodes=best_tree_size).fit(X, y)
  
Valid XHTML 1.0! Valid CSS! powered by MoniWiki
last modified 2023-02-19 11:54:32
Processing time 0.0077 sec