convenc › 쇠고기유통추적 › CleanArchitecture-2020 › 나는아마존에서미래를다녔다-박정준-201907 › 모바일사이트의특징 › Missing Value Handling
ML Study >
methods #
- Drop column
- Imputation
- Extension to imputation
# Load the housing data and separate the target column from the features.
data = pd.read_csv('housing-price-train.csv')
y = data['Price']

# Keep only the numerical feature columns (object-dtype columns are excluded).
predictors = data.drop(columns=['Price'])
X = predictors.select_dtypes(exclude=['object'])

# Divide the data into training (80%) and validation (20%) subsets.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)
- method 1. Drop columns with Missing Values
# Method 1: drop every column that has at least one missing value.
# The column list is derived from the training split only and then applied to
# both splits, so the same columns are removed from each.
cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]
reduced_X_train = X_train.drop(cols_with_missing, axis='columns')
reduced_X_valid = X_valid.drop(cols_with_missing, axis='columns')
- method 2. Imputation
# Method 2: fill in missing values with SimpleImputer's default strategy.
my_imputer = SimpleImputer()

# Fit on the training split only, then reuse the fitted imputer on the
# validation split. fit_transform() is the same as fit() followed by transform().
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns
- method 3. Extension to Imputation
# Method 3: impute, but also record which entries were originally missing.
# Work on copies so the original splits are not modified.
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]

# Add one boolean indicator column per feature that has missing values.
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()

my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus))

# Imputation removed column names; put them back. Assign to .columns: rebinding
# the variable itself would replace the imputed DataFrame with an Index.
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_valid_plus.columns = X_valid_plus.columns
- etc
# Load the raw training data.
X_full = pd.read_csv('housing-price-train.csv')

# Remove rows with a missing target (y).
X_full = X_full.dropna(axis='index', subset=['SalePrice'])

# Separate the target from the predictors (features, columns).
y = X_full['SalePrice']
X_full = X_full.drop(columns=['SalePrice'])

# Use only numerical predictors.
X = X_full.select_dtypes(exclude=['object'])
dropna() : https://cosmosproject.tistory.com/308