遗传特征选择器节点¶
FSSNode 从预定义的特征子集列表中进行选择,而 GeneticFeatureSelectorNode 使用进化算法从头开始优化新的特征子集。这在没有预定义特征分组的情况下非常有用。
初始化 GeneticFeatureSelectorNode 只需传入数据集中特征的总数(即列数)。
在这些示例中,我们创建一个虚拟数据集,其中前六列是信息丰富的,其余列是非信息性的。
In [1]
已复制!
# Imports are listed once each; sklearn.model_selection is imported explicitly
# because train_test_split is used below and `import sklearn.datasets` alone
# does not guarantee that submodule is loaded.
import numpy as np
import pandas as pd
import sklearn.datasets
import sklearn.model_selection
from sklearn.linear_model import LogisticRegression

import tpot
from tpot.search_spaces.nodes import *
from tpot.search_spaces.pipelines import *
from tpot.config import get_search_space

# Binary classification problem with six features, all of them informative.
X, y = sklearn.datasets.make_classification(
    n_samples=1000,
    n_features=6,
    n_informative=6,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=2,
    weights=None,
    flip_y=0.01,
    class_sep=1.0,
    hypercube=True,
    shift=0.0,
    scale=1.0,
    shuffle=True,
    random_state=None,
)
# Append six columns of uniform random noise as uninformative features.
X = np.hstack([X, np.random.rand(X.shape[0], 6)])
# Columns a-f are the informative features; g-l are the uninformative noise columns.
X = pd.DataFrame(X, columns=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l'])
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, train_size=0.75, test_size=0.25
)
# Last expression of the cell: the notebook displays the first rows of X.
X.head()
import tpot from tpot.search_spaces.nodes import * from tpot.search_spaces.pipelines import * import tpot import sklearn.datasets from sklearn.linear_model import LogisticRegression import numpy as np import pandas as pd import tpot import sklearn.datasets from sklearn.linear_model import LogisticRegression import numpy as np from tpot.search_spaces.nodes import * from tpot.search_spaces.pipelines import * from tpot.config import get_search_space X, y = sklearn.datasets.make_classification(n_samples=1000, n_features=6, n_informative=6, n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01, class_sep=1.0, hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None) X = np.hstack([X, np.random.rand(X.shape[0],6)]) # 添加六个非信息性特征 X = pd.DataFrame(X, columns=['a','b','c','d','e','f','g','h','i', 'j', 'k', 'l']) # a, b ,c 其余非信息性 X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25) X.head()
/opt/anaconda3/envs/tpotenv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
Out[1]
a | b | c | d | e | f | g | h | i | j | k | l | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.431031 | 1.889841 | 0.403235 | 0.130347 | 1.245051 | -3.356587 | 0.254612 | 0.477369 | 0.145158 | 0.633607 | 0.200373 | 0.037735 |
1 | 0.016308 | -1.035908 | -1.625176 | 1.803391 | 0.442258 | -0.844052 | 0.141507 | 0.024659 | 0.769976 | 0.658990 | 0.971987 | 0.570931 |
2 | 3.769469 | 0.209185 | -1.303033 | 4.077509 | 2.935603 | 1.243487 | 0.088988 | 0.377935 | 0.019007 | 0.923725 | 0.760895 | 0.316752 |
3 | -2.583292 | 0.172831 | -1.531697 | -0.078774 | 1.656190 | 0.475652 | 0.741539 | 0.179612 | 0.993759 | 0.624101 | 0.290679 | 0.946652 |
4 | -0.833504 | 3.209340 | -0.928798 | 0.345765 | 1.599057 | 0.242801 | 0.359656 | 0.697036 | 0.643063 | 0.198362 | 0.725530 | 0.974992 |
In [2]
已复制!
# Feature-selector search space; n_features is the number of columns in X.
gfs_sp = GeneticFeatureSelectorNode(n_features=X.shape[1])
gfs_sp = GeneticFeatureSelectorNode(n_features=X.shape[1])
每个 GeneticFeatureSelectorNode 将选择一个新的特征子集
In [3]
已复制!
# Generate one individual from the search space and export it as a selector.
selector = gfs_sp.generate().export_pipeline()
# By default sklearn selectors return numpy arrays; this makes the selector
# return pandas DataFrames instead.
selector.set_output(transform="pandas")
selector.fit(X_train, y_train)
# Last expression of the cell: the notebook displays the selected columns.
selector.transform(X_train)
selector = gfs_sp.generate().export_pipeline() selector.set_output(transform="pandas") # 默认情况下 sklearn 选择器返回 numpy 数组。这将使其返回 pandas 数据帧 selector.fit(X_train, y_train) selector.transform(X_train)
Out[3]
b | j | |
---|---|---|
89 | 0.067735 | 0.839366 |
897 | -0.175982 | 0.050951 |
824 | -0.503185 | 0.826335 |
305 | 2.775297 | 0.877498 |
774 | 3.143969 | 0.429360 |
... | ... | ... |
310 | 1.402502 | 0.506769 |
333 | 2.384090 | 0.047125 |
259 | 5.262763 | 0.500726 |
30 | 1.107717 | 0.768569 |
757 | 3.606505 | 0.557151 |
750 行 × 2 列
In [4]
已复制!
# Generate another individual from the same search space; each generated
# selector chooses its own feature subset.
selector = gfs_sp.generate().export_pipeline()
# By default sklearn selectors return numpy arrays; this makes the selector
# return pandas DataFrames instead.
selector.set_output(transform="pandas")
selector.fit(X_train, y_train)
# Last expression of the cell: the notebook displays the selected columns.
selector.transform(X_train)
selector = gfs_sp.generate().export_pipeline() selector.set_output(transform="pandas") # 默认情况下 sklearn 选择器返回 numpy 数组。这将使其返回 pandas 数据帧 selector.fit(X_train, y_train) selector.transform(X_train)
Out[4]
k | |
---|---|
89 | 0.179639 |
897 | 0.430166 |
824 | 0.354605 |
305 | 0.949369 |
774 | 0.499857 |
... | ... |
310 | 0.624468 |
333 | 0.995309 |
259 | 0.138835 |
30 | 0.548930 |
757 | 0.643055 |
750 行 × 1 列
变异和交叉操作可以向已学到的特征集中添加特征,或从中移除特征。
In [5]
已复制!
# Keep a reference to the individual itself (not only the exported selector)
# so that it can be mutated in the next cell.
selector_ind = gfs_sp.generate()
selector = selector_ind.export_pipeline()
# selector.mask indexes X.columns, giving the names of the selected features.
selected_features = X.columns[selector.mask]
print("selected features: ", selected_features)
selector_ind = gfs_sp.generate() selector = selector_ind.export_pipeline() selected_features = X.columns[selector.mask] print("所选特征: ", selected_features)
selected features: Index(['a', 'j'], dtype='object')
In [6]
已复制!
# Mutate the individual, then export it again to see the changed feature set.
selector_ind.mutate()
selector = selector_ind.export_pipeline()
# selector.mask indexes X.columns, giving the names of the selected features.
selected_features = X.columns[selector.mask]
print("selected features: ", selected_features)
selector_ind.mutate() selector = selector_ind.export_pipeline() selected_features = X.columns[selector.mask] print("所选特征: ", selected_features)
selected features: Index(['a', 'h', 'j'], dtype='object')
训练¶
In [7]
已复制!
import tpot
import sklearn.datasets
import sklearn.metrics  # imported explicitly: get_scorer is used below
from sklearn.linear_model import LogisticRegression
import numpy as np
from tpot.search_spaces.nodes import *
from tpot.search_spaces.pipelines import *

# Search space: genetic feature selector followed by a random-forest classifier.
gfs_sp = GeneticFeatureSelectorNode(n_features=X.shape[1])
classifiers_sp = get_search_space('RandomForestClassifier')
final_classification_search_space = SequentialPipeline([gfs_sp, classifiers_sp])

est = tpot.TPOTEstimator(
    population_size=32,
    generations=10,
    # Two objectives, paired with the weights below: ROC AUC gets a positive
    # weight and the complexity scorer a negative one.
    scorers=["roc_auc_ovr", tpot.objectives.complexity_scorer],
    scorers_weights=[1.0, -1.0],
    n_jobs=32,
    classification=True,
    search_space=final_classification_search_space,
    verbose=1,
)

# Fit on the training split, then report ROC AUC on the held-out test split.
scorer = sklearn.metrics.get_scorer('roc_auc_ovo')
est.fit(X_train, y_train)
print(scorer(est, X_test, y_test))
import tpot import sklearn.datasets from sklearn.linear_model import LogisticRegression import numpy as np from tpot.search_spaces.nodes import * from tpot.search_spaces.pipelines import * gfs_sp = GeneticFeatureSelectorNode(n_features=X.shape[1]) classifiers_sp = get_search_space('RandomForestClassifier') final_classification_search_space = SequentialPipeline([gfs_sp, classifiers_sp]) est = tpot.TPOTEstimator( population_size=32, generations=10, scorers=["roc_auc_ovr", tpot.objectives.complexity_scorer], scorers_weights=[1.0, -1.0], n_jobs=32, classification=True, search_space = final_classification_search_space, verbose=1, ) scorer = sklearn.metrics.get_scorer('roc_auc_ovo') est.fit(X_train, y_train) print(scorer(est, X_test, y_test))
/Users/ketrong/Desktop/tpotvalidation/tpot/tpot/tpot_estimator/estimator.py:456: UserWarning: Both generations and max_time_mins are set. TPOT will terminate when the first condition is met. warnings.warn("Both generations and max_time_mins are set. TPOT will terminate when the first condition is met.") Generation: 100%|██████████| 10/10 [00:53<00:00, 5.33s/it]
0.9458645653148825
In [8]
已复制!
# Display the fitted pipeline stored on the estimator after fit().
est.fitted_pipeline_
est.fitted_pipeline_
Out[8]
Pipeline(steps=[('maskselector', MaskSelector(mask=array([ True, True, True, True, True, True, False, False, True, False, True, True]))), ('randomforestclassifier', RandomForestClassifier(class_weight='balanced', criterion='entropy', max_features=0.487196536075, min_samples_leaf=5, min_samples_split=3, n_estimators=128, n_jobs=1))])在 Jupyter 环境中,请重新运行此单元格以显示 HTML 表示或信任该笔记本。
在 GitHub 上,HTML 表示无法渲染,请尝试使用 nbviewer.org 加载此页面。
Pipeline(steps=[('maskselector', MaskSelector(mask=array([ True, True, True, True, True, True, False, False, True, False, True, True]))), ('randomforestclassifier', RandomForestClassifier(class_weight='balanced', criterion='entropy', max_features=0.487196536075, min_samples_leaf=5, min_samples_split=3, n_estimators=128, n_jobs=1))])
MaskSelector(mask=array([ True, True, True, True, True, True, False, False, True, False, True, True]))
RandomForestClassifier(class_weight='balanced', criterion='entropy', max_features=0.487196536075, min_samples_leaf=5, min_samples_split=3, n_estimators=128, n_jobs=1)
In [9]
已复制!
# The first step of the fitted pipeline is the mask selector; its boolean mask
# indexes X.columns to give the names of the selected features.
selected_features = X.columns[est.fitted_pipeline_.steps[0][1].mask]
print("selected features: ", selected_features)
selected_features = X.columns[est.fitted_pipeline_.steps[0][1].mask] print("所选特征: ", selected_features)
selected features: Index(['a', 'b', 'c', 'd', 'e', 'f', 'i', 'k', 'l'], dtype='object')
用于最小化所选特征数量的自定义目标函数¶
我们可以创建一个自定义目标函数,该函数返回每个流水线选择的特征数量。other_objective_functions 参数用于不需要拟合流水线且不需要交叉验证的目标函数。由于我们知道选择器实例从其参数中获取特征,而不是通过拟合,因此我们可以为 other_objective_functions 参数创建一个目标函数。我们将权重设置为 -1,因为我们希望最小化选择的特征数量。我们还为其指定一个名称,以便在 evaluated_individuals 数据框中更轻松地访问它。
In [10]
已复制!
def number_of_selected_features(est):
    """Return how many features the pipeline's first step selects.

    Parameters
    ----------
    est : pipeline-like object
        Must expose ``steps`` as a sequence of ``(name, step)`` pairs whose
        first step has a boolean ``mask`` attribute.

    Returns
    -------
    The number of true entries in the first step's mask.
    """
    # The mask is read from the unfitted step, so no fitting is needed here.
    return sum(est.steps[0][1].mask)
import sklearn.metrics  # imported explicitly: get_scorer is used below

# Search space: genetic feature selector followed by a random-forest classifier.
gfs_sp = GeneticFeatureSelectorNode(n_features=X.shape[1])
classifiers_sp = get_search_space('RandomForestClassifier')
final_classification_search_space = SequentialPipeline([gfs_sp, classifiers_sp])

est = tpot.TPOTEstimator(
    population_size=32,
    generations=10,
    scorers=["roc_auc_ovr", tpot.objectives.complexity_scorer],
    scorers_weights=[1.0, -1.0],
    # Extra objective that needs neither fitting nor cross-validation.
    # The weight is -1 because the number of selected features should be
    # minimized; the name makes its column easy to find in
    # evaluated_individuals.
    other_objective_functions=[number_of_selected_features],
    other_objective_functions_weights=[-1],
    objective_function_names=["Number of selected features"],
    n_jobs=32,
    classification=True,
    search_space=final_classification_search_space,
    verbose=2,
)

# Fit on the training split, then report ROC AUC on the held-out test split.
scorer = sklearn.metrics.get_scorer('roc_auc_ovo')
est.fit(X_train, y_train)
print(scorer(est, X_test, y_test))
def number_of_selected_features(est): return sum(est.steps[0][1].mask) gfs_sp = GeneticFeatureSelectorNode(n_features=X.shape[1]) classifiers_sp = get_search_space('RandomForestClassifier') final_classification_search_space = SequentialPipeline([gfs_sp, classifiers_sp]) est = tpot.TPOTEstimator( population_size=32, generations=10, scorers=["roc_auc_ovr", tpot.objectives.complexity_scorer], scorers_weights=[1.0, -1.0], other_objective_functions=[number_of_selected_features], other_objective_functions_weights = [-1], objective_function_names = ["Number of selected features"], n_jobs=32, classification=True, search_space = final_classification_search_space, verbose=2, ) scorer = sklearn.metrics.get_scorer('roc_auc_ovo') est.fit(X_train, y_train) print(scorer(est, X_test, y_test))
/Users/ketrong/Desktop/tpotvalidation/tpot/tpot/tpot_estimator/estimator.py:456: UserWarning: Both generations and max_time_mins are set. TPOT will terminate when the first condition is met. warnings.warn("Both generations and max_time_mins are set. TPOT will terminate when the first condition is met.") Generation: 100%|██████████| 10/10 [00:47<00:00, 4.73s/it]
0.9414440386956244
In [11]
已复制!
# The first step of the fitted pipeline is the mask selector; its boolean mask
# indexes X.columns to give the names of the selected features.
selected_features = X.columns[est.fitted_pipeline_.steps[0][1].mask]
print("selected features: ", selected_features)
selected_features = X.columns[est.fitted_pipeline_.steps[0][1].mask] print("所选特征: ", selected_features)
selected features: Index(['b', 'c', 'd', 'e', 'f', 'g'], dtype='object')
In [12]
已复制!
import seaborn as sns
import matplotlib.pyplot as plt
# DataFrame of the individuals evaluated during the search.
df = est.evaluated_individuals
# Column named through objective_function_names when the estimator was built.
col1 = "Number of selected features"
# NOTE(review): presumably the column produced by the "roc_auc_ovr" scorer - confirm.
col2 = "roc_auc_score"
# Multiple orange dots show because the pareto front in this case is actually 3D along the auroc score, number of features, and complexity.
# First figure: every evaluated pipeline. Rows whose Pareto_Front value is not 1
# are drawn as 'other'; no NaN replacement is performed in this cell.
fig, ax = plt.subplots(figsize=(5,5))
sns.scatterplot(df[df['Pareto_Front']!=1], x=col1, y=col2, label='other', ax=ax)
sns.scatterplot(df[df['Pareto_Front']==1], x=col1, y=col2, label='Pareto Front', ax=ax)
ax.title.set_text('Performance of all pipelines')
# Log scale on the y axis.
ax.set_yscale('log')
plt.show()
# Second figure: only the rows on the Pareto front (Pareto_Front == 1).
fig, ax = plt.subplots(figsize=(10,5))
sns.scatterplot(df[df['Pareto_Front']==1], x=col1, y=col2, label='Pareto Front', ax=ax)
ax.title.set_text('Performance of only the Pareto Front')
# Log scale on the y axis is left disabled for this figure.
# ax.set_yscale('log')
plt.show()
import seaborn as sns import matplotlib.pyplot as plt df = est.evaluated_individuals col1 = "Number of selected features" col2 = "roc_auc_score" # 显示多个橙色点是因为在这种情况下,帕累托前沿实际上在 auroc 分数、特征数量和复杂度三个维度上是 3D 的。 # 将帕累托前沿中的 nan 替换为 0 fig, ax = plt.subplots(figsize=(5,5)) sns.scatterplot(df[df['Pareto_Front']!=1], x=col1, y=col2, label='other', ax=ax) sns.scatterplot(df[df['Pareto_Front']==1], x=col1, y=col2, label='Pareto Front', ax=ax) ax.title.set_text('Performance of all pipelines') # y 轴对数刻度 ax.set_yscale('log') plt.show() # 将帕累托前沿中的 nan 替换为 0 fig, ax = plt.subplots(figsize=(10,5)) sns.scatterplot(df[df['Pareto_Front']==1], x=col1, y=col2, label='Pareto Front', ax=ax) ax.title.set_text('Performance of only the Pareto Front') # y 轴对数刻度 # ax.set_yscale('log') plt.show()
In [13]
已复制!
# "linear" template search space for classification, placed after the genetic
# feature selector in a sequential pipeline.
linear_search_space = tpot.config.template_search_spaces.get_template_search_spaces("linear", classification=True)
gfs_and_linear_search_space = SequentialPipeline([gfs_sp, linear_search_space])
# The estimator below is intentionally left commented out; this cell only
# samples one pipeline from the combined search space to show its structure.
# est = tpot.TPOTEstimator(
# population_size=32,
# generations=10,
# scorers=["roc_auc_ovr", tpot.objectives.complexity_scorer],
# scorers_weights=[1.0, -1.0],
# other_objective_functions=[number_of_selected_features],
# other_objective_functions_weights = [-1],
# objective_function_names = ["Number of selected features"],
# n_jobs=32,
# classification=True,
# search_space = gfs_and_linear_search_space,
# verbose=2,
# )
# Generate one individual with rng=1 and display the exported pipeline.
gfs_and_linear_search_space.generate(rng=1).export_pipeline()
linear_search_space = tpot.config.template_search_spaces.get_template_search_spaces("linear", classification=True) gfs_and_linear_search_space = SequentialPipeline([gfs_sp, linear_search_space]) # est = tpot.TPOTEstimator( # population_size=32, # generations=10, # scorers=["roc_auc_ovr", tpot.objectives.complexity_scorer], # scorers_weights=[1.0, -1.0], # other_objective_functions=[number_of_selected_features], # other_objective_functions_weights = [-1], # objective_function_names = ["Number of selected features"], # n_jobs=32, # classification=True, # search_space = gfs_and_linear_search_space, # verbose=2, # ) gfs_and_linear_search_space.generate(rng=1).export_pipeline()
Out[13]
Pipeline(steps=[('maskselector', MaskSelector(mask=array([False, False, True, False, False, False, False, False, False, True, False, False]))), ('pipeline', Pipeline(steps=[('normalizer', Normalizer(norm='l1')), ('selectpercentile', SelectPercentile(percentile=74.2561844719571)), ('featureunion-1', FeatureUnion(transformer_list=[('featureunion', FeatureUnion(transformer_list=[('binarizer', Binarizer(threshold=0.0935770250992))])), ('passthrough', Passthrough())])), ('featureunion-2', FeatureUnion(transformer_list=[('skiptransformer', SkipTransformer()), ('passthrough', Passthrough())])), ('adaboostclassifier', AdaBoostClassifier(algorithm='SAMME', learning_rate=0.9665397922726, n_estimators=320))]))])在 Jupyter 环境中,请重新运行此单元格以显示 HTML 表示或信任该笔记本。
在 GitHub 上,HTML 表示无法渲染,请尝试使用 nbviewer.org 加载此页面。
Pipeline(steps=[('maskselector', MaskSelector(mask=array([False, False, True, False, False, False, False, False, False, True, False, False]))), ('pipeline', Pipeline(steps=[('normalizer', Normalizer(norm='l1')), ('selectpercentile', SelectPercentile(percentile=74.2561844719571)), ('featureunion-1', FeatureUnion(transformer_list=[('featureunion', FeatureUnion(transformer_list=[('binarizer', Binarizer(threshold=0.0935770250992))])), ('passthrough', Passthrough())])), ('featureunion-2', FeatureUnion(transformer_list=[('skiptransformer', SkipTransformer()), ('passthrough', Passthrough())])), ('adaboostclassifier', AdaBoostClassifier(algorithm='SAMME', learning_rate=0.9665397922726, n_estimators=320))]))])
MaskSelector(mask=array([False, False, True, False, False, False, False, False, False, True, False, False]))
Pipeline(steps=[('normalizer', Normalizer(norm='l1')), ('selectpercentile', SelectPercentile(percentile=74.2561844719571)), ('featureunion-1', FeatureUnion(transformer_list=[('featureunion', FeatureUnion(transformer_list=[('binarizer', Binarizer(threshold=0.0935770250992))])), ('passthrough', Passthrough())])), ('featureunion-2', FeatureUnion(transformer_list=[('skiptransformer', SkipTransformer()), ('passthrough', Passthrough())])), ('adaboostclassifier', AdaBoostClassifier(algorithm='SAMME', learning_rate=0.9665397922726, n_estimators=320))])
Normalizer(norm='l1')
SelectPercentile(percentile=74.2561844719571)
FeatureUnion(transformer_list=[('featureunion', FeatureUnion(transformer_list=[('binarizer', Binarizer(threshold=0.0935770250992))])), ('passthrough', Passthrough())])
Binarizer(threshold=0.0935770250992)
Passthrough()
FeatureUnion(transformer_list=[('skiptransformer', SkipTransformer()), ('passthrough', Passthrough())])
SkipTransformer()
Passthrough()
AdaBoostClassifier(algorithm='SAMME', learning_rate=0.9665397922726, n_estimators=320)
进阶使用¶
如果您想进行更高级的设置,可以组合更多搜索空间,以便为每个特征集设置独特的预处理流水线。下面是一个示例:
In [14]
已复制!
# Union over the "all_transformers" search space, built with max_estimators=4.
dynamic_transformers = DynamicUnionPipeline(get_search_space("all_transformers"), max_estimators=4)
# Combine the transformer union with a Passthrough branch.
dynamic_transformers_with_passthrough = tpot.search_spaces.pipelines.UnionPipeline([
dynamic_transformers,
tpot.config.get_search_space("Passthrough")],
)
# Linear pipeline of the step above, built with max_length=4.
multi_step_engineering = DynamicLinearPipeline(dynamic_transformers_with_passthrough, max_length=4)
# Genetic feature selector followed by the feature-engineering steps.
gfs_engineering_search_space = SequentialPipeline([gfs_sp, multi_step_engineering])
# Union of selector + engineering branches, so that each selected feature set
# can get its own preprocessing pipeline.
union_fss_engineering_search_space = DynamicUnionPipeline(gfs_engineering_search_space)
classification_search_space = get_search_space('classifiers')
# Final search space: the branches above followed by a classifier.
final_fancy_search_space = SequentialPipeline([union_fss_engineering_search_space, classification_search_space])
# Generate one individual with rng=1 and display the exported pipeline.
final_fancy_search_space.generate(rng=1).export_pipeline()
dynamic_transformers = DynamicUnionPipeline(get_search_space("all_transformers"), max_estimators=4) dynamic_transformers_with_passthrough = tpot.search_spaces.pipelines.UnionPipeline([ dynamic_transformers, tpot.config.get_search_space("Passthrough")], ) multi_step_engineering = DynamicLinearPipeline(dynamic_transformers_with_passthrough, max_length=4) gfs_engineering_search_space = SequentialPipeline([gfs_sp, multi_step_engineering]) union_fss_engineering_search_space = DynamicUnionPipeline(gfs_engineering_search_space) classification_search_space = get_search_space('classifiers') final_fancy_search_space = SequentialPipeline([union_fss_engineering_search_space, classification_search_space]) final_fancy_search_space.generate(rng=1).export_pipeline()
Out[14]
Pipeline(steps=[('featureunion', FeatureUnion(transformer_list=[('pipeline', Pipeline(steps=[('maskselector', MaskSelector(mask=array([False, True, False, False, False, False, False, False, True, False, False, False]))), ('pipeline', Pipeline(steps=[('featureunion-1', FeatureUnion(transformer_list=[('featureunion', FeatureUnion(transformer_list=[('robustscaler', Robu... FeatureUnion(transformer_list=[('featureunion', FeatureUnion(transformer_list=[('normalizer', Normalizer(norm='l1')), ('nystroem', Nystroem(gamma=0.5186832611359, kernel='polynomial', n_components=3))])), ('passthrough', Passthrough())]))]))]))])), ('sgdclassifier', SGDClassifier(alpha=0.0024802032445, eta0=0.2824117602653, l1_ratio=0.281711265998, loss='modified_huber', n_jobs=1, penalty='elasticnet'))])在 Jupyter 环境中,请重新运行此单元格以显示 HTML 表示或信任该笔记本。
在 GitHub 上,HTML 表示无法渲染,请尝试使用 nbviewer.org 加载此页面。
Pipeline(steps=[('featureunion', FeatureUnion(transformer_list=[('pipeline', Pipeline(steps=[('maskselector', MaskSelector(mask=array([False, True, False, False, False, False, False, False, True, False, False, False]))), ('pipeline', Pipeline(steps=[('featureunion-1', FeatureUnion(transformer_list=[('featureunion', FeatureUnion(transformer_list=[('robustscaler', Robu... FeatureUnion(transformer_list=[('featureunion', FeatureUnion(transformer_list=[('normalizer', Normalizer(norm='l1')), ('nystroem', Nystroem(gamma=0.5186832611359, kernel='polynomial', n_components=3))])), ('passthrough', Passthrough())]))]))]))])), ('sgdclassifier', SGDClassifier(alpha=0.0024802032445, eta0=0.2824117602653, l1_ratio=0.281711265998, loss='modified_huber', n_jobs=1, penalty='elasticnet'))])
FeatureUnion(transformer_list=[('pipeline', Pipeline(steps=[('maskselector', MaskSelector(mask=array([False, True, False, False, False, False, False, False, True, False, False, False]))), ('pipeline', Pipeline(steps=[('featureunion-1', FeatureUnion(transformer_list=[('featureunion', FeatureUnion(transformer_list=[('robustscaler', RobustScaler(quantile_range=(0.18740... FeatureAgglomeration(linkage='complete', metric='l2', n_clusters=28))])), ('passthrough', Passthrough())])), ('featureunion-2', FeatureUnion(transformer_list=[('featureunion', FeatureUnion(transformer_list=[('normalizer', Normalizer(norm='l1')), ('nystroem', Nystroem(gamma=0.5186832611359, kernel='polynomial', n_components=3))])), ('passthrough', Passthrough())]))]))]))])
MaskSelector(mask=array([False, True, False, False, False, False, False, False, True, False, False, False]))
Pipeline(steps=[('featureunion-1', FeatureUnion(transformer_list=[('featureunion', FeatureUnion(transformer_list=[('robustscaler', RobustScaler(quantile_range=(0.1874078711948, 0.7642865555088))), ('featureagglomeration', FeatureAgglomeration(linkage='complete', metric='l2', n_clusters=28))])), ('passthrough', Passthrough())])), ('featureunion-2', FeatureUnion(transformer_list=[('featureunion', FeatureUnion(transformer_list=[('normalizer', Normalizer(norm='l1')), ('nystroem', Nystroem(gamma=0.5186832611359, kernel='polynomial', n_components=3))])), ('passthrough', Passthrough())]))])
FeatureUnion(transformer_list=[('featureunion', FeatureUnion(transformer_list=[('robustscaler', RobustScaler(quantile_range=(0.1874078711948, 0.7642865555088))), ('featureagglomeration', FeatureAgglomeration(linkage='complete', metric='l2', n_clusters=28))])), ('passthrough', Passthrough())])
RobustScaler(quantile_range=(0.1874078711948, 0.7642865555088))
FeatureAgglomeration(linkage='complete', metric='l2', n_clusters=28)
Passthrough()
FeatureUnion(transformer_list=[('featureunion', FeatureUnion(transformer_list=[('normalizer', Normalizer(norm='l1')), ('nystroem', Nystroem(gamma=0.5186832611359, kernel='polynomial', n_components=3))])), ('passthrough', Passthrough())])
Normalizer(norm='l1')
Nystroem(gamma=0.5186832611359, kernel='polynomial', n_components=3)
Passthrough()
SGDClassifier(alpha=0.0024802032445, eta0=0.2824117602653, l1_ratio=0.281711265998, loss='modified_huber', n_jobs=1, penalty='elasticnet')