Methods for handling missing values

  1. Drop column
  2. Imputation
  3. Extension to imputation

# Read the training set and pull the target column out of it.
data = pd.read_csv('housing-price-train.csv')
y = data['Price']

# Everything except the target is a candidate feature; object-dtype
# columns are excluded so that only the numerical features remain.
predictors = data.drop(columns=['Price'])
X = predictors.select_dtypes(exclude=['object'])

# Hold out 20% of the rows for validation; random_state pins the split
# so it is reproducible between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

  • method 1. Drop columns with Missing Values
# Collect every column that has at least one missing value in the
# training split. Only the training split is inspected; the same list is
# then applied to both splits so they end up with identical columns.
cols_with_missing = [col for col in X_train.columns
                     if X_train[col].isnull().any()]

# Drop those columns from the training and validation data alike.
reduced_X_train = X_train.drop(cols_with_missing, axis='columns')
reduced_X_valid = X_valid.drop(cols_with_missing, axis='columns')

  • method 2. Imputation
# Fill missing values with a SimpleImputer using its default settings.
my_imputer = SimpleImputer()

# The imputer is fitted on the training split only and then reused,
# already fitted, to transform the validation split.
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))  # same as fit() and transform()
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

  • method 3. Extension to Imputation
# Work on copies so the original splits are left unmodified.
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# Columns that have at least one missing value in the training split.
cols_with_missing = [col for col in X_train.columns
                     if X_train[col].isnull().any()]

# For each such column, add a boolean indicator column recording which
# rows were missing before imputation overwrites that information.
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()

# Fit the imputer on the training split, then reuse it for validation.
my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus))

# Imputation removed column names; put them back. Assign to `.columns`
# so the imputed DataFrames themselves are kept.
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_valid_plus.columns = X_valid_plus.columns

  • etc. Loading the data, dropping rows with a missing target, keeping numerical predictors
# Load the raw training data.
X_full = pd.read_csv('housing-price-train.csv')

# Rows without a target (y) value are removed in place.
X_full.dropna(subset=['SalePrice'], axis=0, inplace=True)

# Separate the target from the predictors (features, columns): take the
# target first, then drop its column from the frame in place.
y = X_full['SalePrice']
X_full.drop(columns=['SalePrice'], inplace=True)

# Use only numerical predictors by excluding object-dtype columns.
X = X_full.select_dtypes(exclude=['object'])

Valid XHTML 1.0! Valid CSS! powered by MoniWiki
last modified 2023-02-20 22:52:53
Processing time 0.0061 sec