遗传特征选择器节点¶

FSSNode 从预定义的特征子集列表中进行选择，而 GeneticFeatureSelectorNode 使用进化算法从头开始优化新的特征子集。这在没有预定义特征分组的情况下非常有用。

初始化 GeneticFeatureSelectorNode 只需传入数据集中特征的总数（即列数）。

在这些示例中，我们创建一个虚拟数据集，其中前六列是信息丰富的，其余列是非信息性的。

In [1]

已复制！





import tpot
from tpot.search_spaces.nodes import *
from tpot.search_spaces.pipelines import *
import tpot
import sklearn.datasets
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd
import tpot
import sklearn.datasets
from sklearn.linear_model import LogisticRegression
import numpy as np
from tpot.search_spaces.nodes import *
from tpot.search_spaces.pipelines import *
from tpot.config import get_search_space


# Build a toy binary-classification dataset: six informative columns followed
# by six columns of random noise, then hold out 25% of the rows for testing.

# Imported explicitly: `import sklearn.datasets` alone does not guarantee that
# the `sklearn.model_selection` submodule is loaded.
import sklearn.model_selection

X, y = sklearn.datasets.make_classification(
    n_samples=1000,
    n_features=6,
    n_informative=6,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=2,
    weights=None,
    flip_y=0.01,
    class_sep=1.0,
    hypercube=True,
    shift=0.0,
    scale=1.0,
    shuffle=True,
    random_state=None,  # unseeded: the generated data differs between runs
)
# Append six uninformative features drawn from np.random.rand (also unseeded).
X = np.hstack([X, np.random.rand(X.shape[0], 6)])
# Columns a-f are the informative features; g-l are the uninformative ones.
X = pd.DataFrame(
    X,
    columns=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l'],
)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, train_size=0.75, test_size=0.25
)

X.head()
import tpot from tpot.search_spaces.nodes import * from tpot.search_spaces.pipelines import * import tpot import sklearn.datasets from sklearn.linear_model import LogisticRegression import numpy as np import pandas as pd import tpot import sklearn.datasets from sklearn.linear_model import LogisticRegression import numpy as np from tpot.search_spaces.nodes import * from tpot.search_spaces.pipelines import * from tpot.config import get_search_space X, y = sklearn.datasets.make_classification(n_samples=1000, n_features=6, n_informative=6, n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01, class_sep=1.0, hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None) X = np.hstack([X, np.random.rand(X.shape[0],6)]) # 添加六个非信息性特征 X = pd.DataFrame(X, columns=['a','b','c','d','e','f','g','h','i', 'j', 'k', 'l']) # a, b ,c 其余非信息性 X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25) X.head()

/opt/anaconda3/envs/tpotenv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

Out[1]

	a	b	c	d	e	f	g	h	i	j	k	l
0	0.431031	1.889841	0.403235	0.130347	1.245051	-3.356587	0.254612	0.477369	0.145158	0.633607	0.200373	0.037735
1	0.016308	-1.035908	-1.625176	1.803391	0.442258	-0.844052	0.141507	0.024659	0.769976	0.658990	0.971987	0.570931
2	3.769469	0.209185	-1.303033	4.077509	2.935603	1.243487	0.088988	0.377935	0.019007	0.923725	0.760895	0.316752
3	-2.583292	0.172831	-1.531697	-0.078774	1.656190	0.475652	0.741539	0.179612	0.993759	0.624101	0.290679	0.946652
4	-0.833504	3.209340	-0.928798	0.345765	1.599057	0.242801	0.359656	0.697036	0.643063	0.198362	0.725530	0.974992

In [2]

已复制！

# Create the feature-selector search space; n_features is the number of columns in X.
gfs_sp = GeneticFeatureSelectorNode(n_features=X.shape[1])
gfs_sp = GeneticFeatureSelectorNode(n_features=X.shape[1])

每次对 GeneticFeatureSelectorNode 调用 generate() 都会生成一个新的个体，并选出一个新的特征子集。

In [3]

已复制！

# Draw one individual from the search space and export it as a selector object.
selector = gfs_sp.generate().export_pipeline()
# By default sklearn selectors return numpy arrays; this makes the selector
# return pandas DataFrames instead, so the selected column names stay visible.
selector.set_output(transform="pandas")
selector.fit(X_train, y_train)
# Notebook display expression: the training rows restricted to the selected columns.
selector.transform(X_train)
selector = gfs_sp.generate().export_pipeline() selector.set_output(transform="pandas") # 默认情况下 sklearn 选择器返回 numpy 数组。这将使其返回 pandas 数据帧 selector.fit(X_train, y_train) selector.transform(X_train)

Out[3]

	b	j
89	0.067735	0.839366
897	-0.175982	0.050951
824	-0.503185	0.826335
305	2.775297	0.877498
774	3.143969	0.429360
...	...	...
310	1.402502	0.506769
333	2.384090	0.047125
259	5.262763	0.500726
30	1.107717	0.768569
757	3.606505	0.557151

750 行 × 2 列

In [4]

已复制！

# A second, independent draw from the same search space; generate() is called
# again, so the exported selector may pick a different set of columns.
selector = gfs_sp.generate().export_pipeline()
# By default sklearn selectors return numpy arrays; this makes the selector
# return pandas DataFrames instead, so the selected column names stay visible.
selector.set_output(transform="pandas")
selector.fit(X_train, y_train)
# Notebook display expression: the training rows restricted to the selected columns.
selector.transform(X_train)
selector = gfs_sp.generate().export_pipeline() selector.set_output(transform="pandas") # 默认情况下 sklearn 选择器返回 numpy 数组。这将使其返回 pandas 数据帧 selector.fit(X_train, y_train) selector.transform(X_train)

Out[4]

	k
89	0.179639
897	0.430166
824	0.354605
305	0.949369
774	0.499857
...	...
310	0.624468
333	0.995309
259	0.138835
30	0.548930
757	0.643055

750 行 × 1 列

变异和交叉可以在已选特征集中添加或移除特征。

In [5]

已复制！

# Keep a reference to the individual itself (not only the exported selector)
# so that it can be mutated later.
selector_ind = gfs_sp.generate()
selector = selector_ind.export_pipeline()
# The selector's mask indexes X.columns, yielding the names of the selected columns.
selected_features = X.columns[selector.mask]

print("selected features: ", selected_features)
selector_ind = gfs_sp.generate() selector = selector_ind.export_pipeline() selected_features = X.columns[selector.mask] print("所选特征: ", selected_features)

selected features:  Index(['a', 'j'], dtype='object')

In [6]

已复制！

# mutate() is called for its effect on selector_ind (its return value is not
# used); the selector is re-exported afterwards to read the updated mask.
selector_ind.mutate()
selector = selector_ind.export_pipeline()
selected_features = X.columns[selector.mask]
print("selected features: ", selected_features)
selector_ind.mutate() selector = selector_ind.export_pipeline() selected_features = X.columns[selector.mask] print("所选特征: ", selected_features)

selected features:  Index(['a', 'h', 'j'], dtype='object')

训练¶

In [7]

已复制！





import tpot
import sklearn.datasets
from sklearn.linear_model import LogisticRegression
import numpy as np
from tpot.search_spaces.nodes import *
from tpot.search_spaces.pipelines import *

# Search space: a genetic feature selector feeding a RandomForestClassifier.
gfs_sp = GeneticFeatureSelectorNode(n_features=X.shape[1])
classifiers_sp = get_search_space('RandomForestClassifier')
final_classification_search_space = SequentialPipeline([gfs_sp, classifiers_sp])

# Two objectives are optimised: "roc_auc_ovr" with weight 1.0 and the
# complexity scorer with weight -1.0 (a negative weight means "minimise").
# NOTE(review): n_jobs=32 is hard-coded — presumably sized for the author's
# machine; adjust it to the cores actually available.
# NOTE(review): the recorded run warns that both generations and max_time_mins
# are set although only generations is passed here — presumably max_time_mins
# has a non-None default; confirm against the TPOTEstimator signature.
est = tpot.TPOTEstimator(  population_size=32,
                            generations=10, 
                            scorers=["roc_auc_ovr", tpot.objectives.complexity_scorer],
                            scorers_weights=[1.0, -1.0],
                            n_jobs=32,
                            classification=True,
                            search_space = final_classification_search_space,
                            verbose=1,
                            )


# Scorer used for the held-out evaluation below.
# NOTE(review): this is 'roc_auc_ovo' while the search optimises 'roc_auc_ovr'
# — confirm the mismatch is intended.
# NOTE(review): sklearn.metrics is not imported explicitly in the visible
# imports; presumably it is loaded as a side effect of another import.
scorer = sklearn.metrics.get_scorer('roc_auc_ovo')

est.fit(X_train, y_train)
print(scorer(est, X_test, y_test))
import tpot import sklearn.datasets from sklearn.linear_model import LogisticRegression import numpy as np from tpot.search_spaces.nodes import * from tpot.search_spaces.pipelines import * gfs_sp = GeneticFeatureSelectorNode(n_features=X.shape[1]) classifiers_sp = get_search_space('RandomForestClassifier') final_classification_search_space = SequentialPipeline([gfs_sp, classifiers_sp]) est = tpot.TPOTEstimator( population_size=32, generations=10, scorers=["roc_auc_ovr", tpot.objectives.complexity_scorer], scorers_weights=[1.0, -1.0], n_jobs=32, classification=True, search_space = final_classification_search_space, verbose=1, ) scorer = sklearn.metrics.get_scorer('roc_auc_ovo') est.fit(X_train, y_train) print(scorer(est, X_test, y_test))

/Users/ketrong/Desktop/tpotvalidation/tpot/tpot/tpot_estimator/estimator.py:456: UserWarning: Both generations and max_time_mins are set. TPOT will terminate when the first condition is met.
  warnings.warn("Both generations and max_time_mins are set. TPOT will terminate when the first condition is met.")
Generation: 100%|██████████| 10/10 [00:53<00:00,  5.33s/it]

0.9458645653148825

In [8]

已复制！

# Notebook display expression: show the pipeline stored on the fitted estimator.
est.fitted_pipeline_
est.fitted_pipeline_

Out[8]

Pipeline(steps=[('maskselector',
                 MaskSelector(mask=array([ True,  True,  True,  True,  True,  True, False, False,  True,
       False,  True,  True]))),
                ('randomforestclassifier',
                 RandomForestClassifier(class_weight='balanced',
                                        criterion='entropy',
                                        max_features=0.487196536075,
                                        min_samples_leaf=5, min_samples_split=3,
                                        n_estimators=128, n_jobs=1))])

在 Jupyter 环境中，请重新运行此单元格以显示 HTML 表示或信任该笔记本。
在 GitHub 上，HTML 表示无法渲染，请尝试使用 nbviewer.org 加载此页面。

In [9]

已复制！

# steps[0][1] is the estimator of the first pipeline step (the mask selector);
# its mask indexes X.columns to recover the names of the selected columns.
selected_features = X.columns[est.fitted_pipeline_.steps[0][1].mask]
print("selected features: ", selected_features)
selected_features = X.columns[est.fitted_pipeline_.steps[0][1].mask] print("所选特征: ", selected_features)

selected features:  Index(['a', 'b', 'c', 'd', 'e', 'f', 'i', 'k', 'l'], dtype='object')

用于最小化所选特征数量的自定义目标函数¶

我们可以创建一个自定义目标函数，该函数返回每个流水线选择的特征数量。other_objective_functions 参数用于不需要拟合流水线且不需要交叉验证的目标函数。由于我们知道选择器实例从其参数中获取特征，而不是通过拟合，因此我们可以为 other_objective_functions 参数创建一个目标函数。我们将权重设置为 -1，因为我们希望最小化选择的特征数量。我们还为其指定一个名称，以便在 evaluated_individuals 数据框中更轻松地访问它。

In [10]

已复制！





def number_of_selected_features(est):
    """Return how many features the pipeline's first step selects.

    Parameters
    ----------
    est : pipeline-like object
        Must expose ``steps`` as a sequence of ``(name, estimator)`` pairs
        whose first estimator has a boolean ``mask`` attribute.

    Returns
    -------
    int
        The number of True entries in the first step's mask.
    """
    # Only the mask is read, so the pipeline does not need to be fitted.
    _, selector = est.steps[0]
    # count_nonzero counts the True entries in one vectorised pass; int()
    # yields a plain Python int rather than a NumPy scalar.
    return int(np.count_nonzero(selector.mask))

# Search space: a genetic feature selector feeding a RandomForestClassifier.
gfs_sp = GeneticFeatureSelectorNode(n_features=X.shape[1])
classifiers_sp = get_search_space('RandomForestClassifier')
final_classification_search_space = SequentialPipeline([gfs_sp, classifiers_sp])

# In addition to the two cross-validated scorers, number_of_selected_features
# is passed as an extra objective with weight -1 so that fewer selected
# features is preferred; objective_function_names gives that objective the
# label "Number of selected features".
# NOTE(review): n_jobs=32 is hard-coded — presumably sized for the author's
# machine; adjust it to the cores actually available.
# NOTE(review): the recorded run warns that both generations and max_time_mins
# are set although only generations is passed here — presumably max_time_mins
# has a non-None default; confirm against the TPOTEstimator signature.
est = tpot.TPOTEstimator(  
                           population_size=32,
                           generations=10, 
                           scorers=["roc_auc_ovr", tpot.objectives.complexity_scorer],
                           scorers_weights=[1.0, -1.0],
                           other_objective_functions=[number_of_selected_features],
                           other_objective_functions_weights = [-1],
                           objective_function_names = ["Number of selected features"],

                           n_jobs=32,
                           classification=True,
                           search_space = final_classification_search_space,
                           verbose=2,
                            )

# Scorer used for the held-out evaluation below.
# NOTE(review): this is 'roc_auc_ovo' while the search optimises 'roc_auc_ovr'
# — confirm the mismatch is intended.
scorer = sklearn.metrics.get_scorer('roc_auc_ovo')

est.fit(X_train, y_train)
print(scorer(est, X_test, y_test))
def number_of_selected_features(est): return sum(est.steps[0][1].mask) gfs_sp = GeneticFeatureSelectorNode(n_features=X.shape[1]) classifiers_sp = get_search_space('RandomForestClassifier') final_classification_search_space = SequentialPipeline([gfs_sp, classifiers_sp]) est = tpot.TPOTEstimator( population_size=32, generations=10, scorers=["roc_auc_ovr", tpot.objectives.complexity_scorer], scorers_weights=[1.0, -1.0], other_objective_functions=[number_of_selected_features], other_objective_functions_weights = [-1], objective_function_names = ["Number of selected features"], n_jobs=32, classification=True, search_space = final_classification_search_space, verbose=2, ) scorer = sklearn.metrics.get_scorer('roc_auc_ovo') est.fit(X_train, y_train) print(scorer(est, X_test, y_test))

/Users/ketrong/Desktop/tpotvalidation/tpot/tpot/tpot_estimator/estimator.py:456: UserWarning: Both generations and max_time_mins are set. TPOT will terminate when the first condition is met.
  warnings.warn("Both generations and max_time_mins are set. TPOT will terminate when the first condition is met.")
Generation: 100%|██████████| 10/10 [00:47<00:00,  4.73s/it]

0.9414440386956244

In [11]

已复制！

# steps[0][1] is the estimator of the first pipeline step (the feature
# selector); its mask indexes X.columns to recover the selected column names.
selected_features = X.columns[est.fitted_pipeline_.steps[0][1].mask]
print("selected features: ", selected_features)
selected_features = X.columns[est.fitted_pipeline_.steps[0][1].mask] print("所选特征: ", selected_features)

selected features:  Index(['b', 'c', 'd', 'e', 'f', 'g'], dtype='object')

In [12]

已复制！





import seaborn as sns
import matplotlib.pyplot as plt

# Table of evaluated pipelines, read from the fitted estimator.
df = est.evaluated_individuals
# x axis: the custom objective, under the name it was given via objective_function_names.
col1 = "Number of selected features"
# y axis: presumably the column holding the "roc_auc_ovr" scorer's values — verify the column name.
col2 = "roc_auc_score"

# Multiple orange dots appear because the Pareto front in this case is actually
# 3D: it spans the AUROC score, the number of features, and the complexity.

# Plot 1: all evaluated pipelines, Pareto-front members versus the rest.
# Rows with a missing Pareto_Front value fall into 'other', because NaN != 1 is True.
fig, ax = plt.subplots(figsize=(5,5))
sns.scatterplot(df[df['Pareto_Front']!=1], x=col1, y=col2, label='other', ax=ax)
sns.scatterplot(df[df['Pareto_Front']==1], x=col1, y=col2, label='Pareto Front', ax=ax)
ax.title.set_text('Performance of all pipelines')
# Use a logarithmic y axis for this plot.
ax.set_yscale('log')
plt.show()

# Plot 2: only the pipelines on the Pareto front (rows where Pareto_Front == 1).
fig, ax = plt.subplots(figsize=(10,5))
sns.scatterplot(df[df['Pareto_Front']==1], x=col1, y=col2, label='Pareto Front', ax=ax)
ax.title.set_text('Performance of only the Pareto Front')
# Log scaling is disabled for this plot:
# ax.set_yscale('log')
plt.show()
import seaborn as sns import matplotlib.pyplot as plt df = est.evaluated_individuals col1 = "Number of selected features" col2 = "roc_auc_score" # 显示多个橙色点是因为在这种情况下，帕累托前沿实际上在 auroc 分数、特征数量和复杂度三个维度上是 3D 的。 # 将帕累托前沿中的 nan 替换为 0 fig, ax = plt.subplots(figsize=(5,5)) sns.scatterplot(df[df['Pareto_Front']!=1], x=col1, y=col2, label='other', ax=ax) sns.scatterplot(df[df['Pareto_Front']==1], x=col1, y=col2, label='Pareto Front', ax=ax) ax.title.set_text('Performance of all pipelines') # y 轴对数刻度 ax.set_yscale('log') plt.show() # 将帕累托前沿中的 nan 替换为 0 fig, ax = plt.subplots(figsize=(10,5)) sns.scatterplot(df[df['Pareto_Front']==1], x=col1, y=col2, label='Pareto Front', ax=ax) ax.title.set_text('Performance of only the Pareto Front') # y 轴对数刻度 # ax.set_yscale('log') plt.show()

No description has been provided for this image

其他示例¶

与所有搜索空间一样，GeneticFeatureSelectorNode 可以与任何其他搜索空间结合使用。

您也可以将其与现有的预构建模板配对使用，例如

进阶使用¶

如果您想进行更高级的设置，可以组合更多搜索空间，以便为每个特征集设置独特的预处理流水线。这是一个示例