Steps Summary #


  1. Split the sample data into training and validation data
  2. Train the model with the training data
  3. Evaluate the model with validation data
  4. Find the optimal tree size (max leaf nodes) using MAE (Mean Absolute Error)
  5. Create a final model by training on all the data using the optimal tree size found above


Code #



import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Load the training data, then pick the prediction target (SalePrice)
# and a fixed set of numeric feature columns.
home_data = pd.read_csv('train.csv')
y = home_data.SalePrice
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = home_data[features]

# Split into validation and training data (fixed seed for reproducibility)
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Specify Model
iowa_model = DecisionTreeRegressor(random_state=1)
# Fit Model
iowa_model.fit(train_X, train_y)

# Make validation predictions and calculate mean absolute error.
# FIX: mean_absolute_error expects (y_true, y_pred); the original passed
# them swapped. MAE is symmetric so the value was unaffected, but the
# corrected order matches the sklearn signature and the get_mae helper
# below (and matters for asymmetric metrics such as MAPE).
val_predictions = iowa_model.predict(val_X)
val_mae = mean_absolute_error(val_y, val_predictions)
print("Validation MAE: {:,.0f}".format(val_mae))
 



* Mean Absolute Error
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a decision tree capped at *max_leaf_nodes* on the training
    split and return its mean absolute error on the validation split."""
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)
    validation_predictions = tree.predict(val_X)
    return mean_absolute_error(val_y, validation_predictions)
  


* get best tree size (max leaf nodes)
# Candidate caps on the number of leaf nodes to compare.
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# Score each candidate on the held-out validation split, then keep the
# one whose validation MAE is smallest.
mae_map = {}
for leaf_cap in candidate_max_leaf_nodes:
    mae_map[leaf_cap] = get_mae(leaf_cap, train_X, val_X, train_y, val_y)
print(mae_map)
best_tree_size = min(mae_map, key=mae_map.get)
print(f"best: {best_tree_size}")


* get final model fitted with best tree size
# Retrain on the full dataset with the winning leaf-node cap so no rows
# are wasted on a validation holdout; DecisionTreeRegressor.fit returns
# the fitted estimator itself, so chaining yields the same final_model.
final_model = DecisionTreeRegressor(random_state=1, max_leaf_nodes=best_tree_size).fit(X, y)
  
Valid XHTML 1.0! Valid CSS! powered by MoniWiki
last modified 2023-02-19 11:54:32
Processing time 0.0077 sec