Keras random forests regression model

with house price data

1 package

1.1 download package

Code

import os
import subprocess
import sys

# Install into the interpreter that is running this document: a bare `pip`
# found on PATH may belong to a different Python environment.
# check_call raises CalledProcessError on a non-zero exit status, whereas
# os.system only returned the status, which was ignored here.
subprocess.check_call(
    [sys.executable, '-m', 'pip', 'install', 'tensorflow_decision_forests']
)

1.2 load package

Code

import os
# Keep using Keras 2
os.environ['TF_USE_LEGACY_KERAS'] = '1'

import tensorflow_decision_forests as tfdf

import numpy as np
import pandas as pd
import tensorflow as tf
import tf_keras
import math

Code

# Report which TensorFlow Decision Forests release is installed.
print(f"Found TensorFlow Decision Forests v{tfdf.__version__}")

Found TensorFlow Decision Forests v1.9.0

2 data

data downloaded from Kaggle

2.1 read data

Code

# Load the training table from disk and report its shape.
train_file_path = "data/train.csv"
dataset_df = pd.read_csv(train_file_path)
print(f"Full train dataset shape is {dataset_df.shape}")

Full train dataset shape is (1460, 81)

Code

dataset_df.head(3)

	Id	MSSubClass	MSZoning	LotFrontage	LotArea	Street	Alley	LotShape	LandContour	Utilities	...	PoolQC	Fence	MiscFeature	MoSold	YrSold	SaleType	SaleCondition	SalePrice
0	1	60	RL	65.0	8450	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	2	2008	WD	Normal	208500
1	2	20	RL	80.0	9600	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	5	2007	WD	Normal	181500
2	3	60	RL	68.0	11250	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	9	2008	WD	Normal	223500

3 rows × 81 columns

Code

# Remove the 'Id' column, then preview the first three rows.
dataset_df = dataset_df.drop(columns='Id')
dataset_df.head(3)

	MSSubClass	MSZoning	LotFrontage	LotArea	Street	Alley	LotShape	LandContour	Utilities	LotConfig	...	PoolQC	Fence	MiscFeature	MoSold	YrSold	SaleType	SaleCondition	SalePrice
0	60	RL	65.0	8450	Pave	NaN	Reg	Lvl	AllPub	Inside	...	NaN	NaN	NaN	2	2008	WD	Normal	208500
1	20	RL	80.0	9600	Pave	NaN	Reg	Lvl	AllPub	FR2	...	NaN	NaN	NaN	5	2007	WD	Normal	181500
2	60	RL	68.0	11250	Pave	NaN	IR1	Lvl	AllPub	Inside	...	NaN	NaN	NaN	9	2008	WD	Normal	223500

3 rows × 80 columns

Code

#dataset_df.info()

2.2 data preparation

Code

import numpy as np


def split_dataset(dataset, test_ratio=0.30, seed=None):
    """Randomly split ``dataset`` into a training part and a held-out part.

    Each row is independently assigned to the held-out part with
    probability ``test_ratio``, so the resulting sizes are approximate
    rather than exact.

    Args:
        dataset: Object supporting ``len()`` and boolean-mask indexing
            (for example a pandas DataFrame or a NumPy array).
        test_ratio: Probability, within ``[0, 1]``, that a row goes to the
            held-out part.
        seed: Optional seed for a reproducible split. When ``None`` (the
            default) NumPy's global random state is used, exactly as
            before this parameter existed.

    Returns:
        A ``(train, test)`` tuple of the two parts.

    Raises:
        ValueError: If ``test_ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= test_ratio <= 1.0:
        raise ValueError(
            f"test_ratio must be within [0, 1], got {test_ratio!r}"
        )

    n_rows = len(dataset)
    if seed is None:
        # Keep drawing from the global state so that callers relying on
        # np.random.seed(...) see unchanged behaviour.
        draws = np.random.rand(n_rows)
    else:
        draws = np.random.default_rng(seed).random(n_rows)

    test_indices = draws < test_ratio
    return dataset[~test_indices], dataset[test_indices]

# Split once, then report how many rows landed in each part.
train_ds_pd, valid_ds_pd = split_dataset(dataset_df)
print(
    f"{len(train_ds_pd)} examples in training, "
    f"{len(valid_ds_pd)} examples in testing."
)

1029 examples in training, 431 examples in testing.

Code

# Convert both pandas splits into TensorFlow datasets for a regression task,
# using the 'SalePrice' column as the label.
label = 'SalePrice'
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    train_ds_pd,
    label=label,
    task=tfdf.keras.Task.REGRESSION,
)
valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    valid_ds_pd,
    label=label,
    task=tfdf.keras.Task.REGRESSION,
)

3 model

3.1 define model

Code

# Specify the model: a random forest configured for the regression task.
model_1 = tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION)

Use /var/folders/v3/pzt9c47n1nbcsmybsg_w0lhw0000gn/T/tmptkal5dhv as temporary training directory

3.2 compile model

Code

# Request mean squared error ("mse") as the metric reported by the model.
model_1.compile(metrics=["mse"])

3.3 Train the model

Code

# Train the random forest on the training dataset.
model_1.fit(train_ds)

Reading training dataset...
Training dataset read in 0:00:01.731521. Found 1029 examples.
Training model...
Model trained in 0:00:00.564812
Compiling model...
Model compiled.

<tf_keras.src.callbacks.History at 0x2860a47d0>

3.4 Evaluate the model

Code

# Evaluate on the validation dataset; return_dict=True makes the result a
# dict keyed by metric name instead of a plain list of values.
evaluation = model_1.evaluate(valid_ds, return_dict=True)
# Emit a blank line after the evaluation progress output.
print()

1/1 [==============================] - ETA: 0s - loss: 0.0000e+00 - mse: 786470720.00001/1 [==============================] - 2s 2s/step - loss: 0.0000e+00 - mse: 786470720.0000

Code

# Read the MSE by its metric name. Looping over every item and keeping the
# last value only yields the MSE while "mse" happens to be the final key;
# the dict also holds "loss", so any change in key order or any additional
# metric would silently produce the wrong number.
mse = evaluation["mse"]

RMSE

Code

import math
# RMSE is the square root of the MSE taken from the evaluation results.
math.sqrt(mse)

28044.085294407447

Code

import matplotlib.pyplot as plt

# Plot how the RMSE recorded in the training logs evolves as trees are added.
logs = model_1.make_inspector().training_logs()
tree_counts = [entry.num_trees for entry in logs]
rmse_values = [entry.evaluation.rmse for entry in logs]
plt.plot(tree_counts, rmse_values)
plt.xlabel("Number of trees")
plt.ylabel("RMSE (out-of-bag)")
plt.show()

4 reference:

https://colab.research.google.com/github/tensorflow/decision-forests/blob/main/documentation/tutorials/beginner_colab.ipynb#scrollTo=xUy4ULEMtDXB

---
title: "Keras random forests regression model"
subtitle: "with house price data"
execute:
  warning: false
  error: false
format:
  html:
    toc: true
    toc-location: right
    code-fold: show
    code-tools: true
    number-sections: true
    code-block-bg: true
    code-block-border-left: "#31BAE9"
---

# package

## download package

```{python}
#| eval: false
import os
os.system('pip install tensorflow_decision_forests')
```

## load package

```{python}
import os
# Keep using Keras 2
os.environ['TF_USE_LEGACY_KERAS'] = '1'

import tensorflow_decision_forests as tfdf

import numpy as np
import pandas as pd
import tensorflow as tf
import tf_keras
import math
```

```{python}
# Check the version of TensorFlow Decision Forests
print("Found TensorFlow Decision Forests v" + tfdf.__version__)
```

# data

data downloaded from [kaggle](https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data)

## read data

```{python}
train_file_path = "data/train.csv"
dataset_df = pd.read_csv(train_file_path)
print("Full train dataset shape is {}".format(dataset_df.shape))
```

```{python}
dataset_df.head(3)
```

```{python}
dataset_df = dataset_df.drop('Id', axis=1)
dataset_df.head(3)
```

```{python}
#dataset_df.info()
```

## data preparation

```{python}
import numpy as np
def split_dataset(dataset, test_ratio=0.30):
  test_indices = np.random.rand(len(dataset)) < test_ratio
  return dataset[~test_indices], dataset[test_indices]

train_ds_pd, valid_ds_pd = split_dataset(dataset_df)
print("{} examples in training, {} examples in testing.".format(
    len(train_ds_pd), len(valid_ds_pd)))
```

```{python}
label = 'SalePrice'
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_ds_pd, label=label, task = tfdf.keras.Task.REGRESSION)
valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(valid_ds_pd, label=label, task = tfdf.keras.Task.REGRESSION)
```

# model

## define model

```{python}
# Specify the model.
model_1 = tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION)
```

## compile model

```{python}
model_1.compile(metrics=["mse"])
```

## Train the model

```{python}
model_1.fit(train_ds)
```

## Evaluate the model

```{python}
evaluation = model_1.evaluate(valid_ds, return_dict=True)
print()
```

```{python}
for name, value in evaluation.items():
  mse=value
```

RMSE

```{python}
import math
math.sqrt(mse)
```

```{python}
import matplotlib.pyplot as plt
logs = model_1.make_inspector().training_logs()
plt.plot([log.num_trees for log in logs], [log.evaluation.rmse for log in logs])
plt.xlabel("Number of trees")
plt.ylabel("RMSE (out-of-bag)")
plt.show()
```

# reference:

https://colab.research.google.com/github/tensorflow/decision-forests/blob/main/documentation/tutorials/beginner_colab.ipynb#scrollTo=xUy4ULEMtDXB