Source code for WORC.featureprocessing.OneHotEncoderWrapper
#!/usr/bin/env python# Copyright 2020 Biomedical Imaging Group Rotterdam, Departments of# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands## Licensed under the Apache License, Version 2.0 (the "License");# you may not use this file except in compliance with the License.# You may obtain a copy of the License at## http://www.apache.org/licenses/LICENSE-2.0## Unless required by applicable law or agreed to in writing, software# distributed under the License is distributed on an "AS IS" BASIS,# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.# See the License for the specific language governing permissions and# limitations under the License.importnumpyasnpfromsklearn.preprocessingimportOneHotEncoder
class OneHotEncoderWrapper(object):
    """Module for OneHotEncoding features.

    Only the columns whose label contains one of the substrings in
    ``feature_labels_tofit`` are one-hot encoded; every other column is
    passed through untouched. In the transformed array the untouched
    columns come first, followed by the one-hot encoded columns.
    """

    def __init__(self, feature_labels_tofit, handle_unknown='ignore',
                 verbose=False):
        """Init preprocessor of features.

        Parameters
        ----------
        feature_labels_tofit: list of str, mandatory
            Substrings to look for in the feature labels. A feature is
            one-hot encoded when at least one of them occurs in its label.
        handle_unknown: str, default 'ignore'
            Forwarded as ``handle_unknown`` to the OneHotEncoder created
            in ``fit``.
        verbose: bool, default False
            When True, print progress information while fitting.
        """
        # Initiate variables
        self.handle_unknown = handle_unknown
        self.verbose = verbose
        self.feature_labels_tofit = feature_labels_tofit

    def fit(self, X, feature_labels, y=None):
        """Fit OneHotEncoder for labels in feature_labels.

        Parameters
        ----------
        X: numpy array, mandatory
            Two-dimensional array, indexed as ``X[:, columns]``.
        feature_labels: list of str, mandatory
            Label of each column of X, in column order.
        y: optional
            Not used by this method.

        After fitting, ``self.encoded_feature_labels`` holds the labels of
        the columns that ``transform`` produces. ``self.encoder`` is the
        fitted OneHotEncoder, or None when no label matched.
        """
        self.selectcolumns = list()
        self.selectlabels = list()
        self.skipcolumns = list()
        for num, label in enumerate(feature_labels):
            if any(fl in label for fl in self.feature_labels_tofit):
                # This feature needs to be one hot encoded
                self.selectcolumns.append(num)
                self.selectlabels.append(label)
            else:
                # This feature needs to be skipped from onehotencoding
                self.skipcolumns.append(num)

        if self.verbose:
            print(f'\t Fitting one-hot-encoder for features {self.selectlabels}.')

        if len(self.selectcolumns) == 0:
            if self.verbose:
                print('\t No features selected, skip one-hot-encoding')

            self.encoder = None
            # transform() is a pass-through in this case, so the labels of
            # its output are simply the input labels. Without this the
            # attribute would not exist after fitting.
            self.encoded_feature_labels = list(feature_labels)
            return

        # Gather skipped feature labels and selected feature values and labels
        labels_array = np.asarray(feature_labels)
        skipped_feature_labels = list(labels_array[self.skipcolumns])
        select_feature_labels = list(labels_array[self.selectcolumns])
        select_feature_values = X[:, self.selectcolumns]

        # Apply the onehotencoding
        self.encoder = OneHotEncoder(handle_unknown=self.handle_unknown)
        self.encoder.fit(select_feature_values)

        # Adjust feature labels: skipped features keep their label, each
        # encoded feature gets one label per category, suffixed with the
        # index of that category
        self.encoded_feature_labels = skipped_feature_labels
        for fl, cat in zip(select_feature_labels, self.encoder.categories_):
            self.encoded_feature_labels.extend(
                fl + f'_{c}' for c in range(cat.shape[0])
            )

        if self.verbose:
            print(f'\t Encoded feature labels: {self.encoded_feature_labels}.')

    def transform(self, inputarray):
        """Transform feature array.

        One-hot encode the columns selected by the fit function and leave
        the other columns as they are.

        Parameters
        ----------
        inputarray: numpy array, mandatory
            Two-dimensional array with the same column order as the array
            used in the fit function.

        Returns
        -------
        outputarray: numpy array
            The input itself when no features were selected during
            fitting; otherwise the skipped columns followed by the one-hot
            encoded columns.
        """
        if self.encoder is None:
            # No features encoded
            return inputarray

        # Gather skipped feature values and selected ones
        skipped_feature_values = inputarray[:, self.skipcolumns]
        select_feature_values = inputarray[:, self.selectcolumns]

        # Transform selected features; the encoder output is converted to
        # a dense array before recombining
        encoded_feature_values = \
            self.encoder.transform(select_feature_values).toarray()

        # Recombine both
        outputarray = np.concatenate(
            (skipped_feature_values, encoded_feature_values), axis=1)
        return outputarray
def test():
    """Test OneHotEncoderWrapper object.

    Fits the wrapper on a small training array, transforms both the
    training array and a test array, and prints inputs, outputs and the
    resulting feature labels.
    """
    # Column labels, and the substrings that mark a column for encoding
    # ('0' occurs in 'Numeric0')
    feature_labels = ['Gender', 'Numeric0', 'Numeric1']
    feature_labels_tofit = ['Gender', '0']

    # Data: the test array contains values that do not occur in training
    train_values = np.asarray(
        [['Male', 1, 5], ['Female', 3, 6], ['Female', 2, 7]])
    test_values = np.asarray([['Male', 2, 7], ['Unknown', 10, 10]])

    # Fit on the training data, then transform both arrays
    wrapper = OneHotEncoderWrapper(
        feature_labels_tofit=feature_labels_tofit, verbose=True)
    wrapper.fit(train_values, feature_labels)
    train_encoded = wrapper.transform(train_values)
    test_encoded = wrapper.transform(test_values)

    # Print results
    print("X_train:")
    print(f"Input: {train_values}.")
    print(f"Output: {train_encoded}.")

    print("X_test:")
    print(f"Input: {test_values}.")
    print(f"Output: {test_encoded}.")

    print("Encoded feature labels:")
    print(wrapper.encoded_feature_labels)