AMLTK 搜索空间解析器示例
AMLTK (https://github.com/automl/amltk) 提供了一个用于开发 AutoML 系统的框架。该系统的一个组件是搜索空间定义。
TPOT 提供了一个名为 tpot.utils.tpot_parser 的函数,可以将使用 AMLTK API 定义的搜索空间转换为 TPOT 使用的搜索空间类。这样用户只需定义一个搜索空间,即可同时供两种算法使用,从而便于进行更好的比较。下面是一些在 AMLTK 中定义搜索空间并在 TPOT 中使用它们的示例。
注意:此功能仍处于实验阶段,TPOT 尚未完全支持 AMLTK API 的所有功能。(例如,使用 amltk.pipeline.Split 按分类特征与数值特征自动拆分列的功能,目前尚未在解析器中实现。)
输入 [1]
已复制!
from sklearn.compose import make_column_selector
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from amltk.pipeline import Choice, Component, Sequential, Split
import tpot
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer
import tpot
import numpy as np
import sklearn
import sklearn.datasets
import pandas as pd
# Create a dummy pandas dataset with both categorical and numerical columns.
# The submodule is imported explicitly here because `import sklearn` and
# `import sklearn.datasets` do not guarantee `sklearn.model_selection` is loaded.
import sklearn.model_selection

X, y = sklearn.datasets.make_classification(
    n_samples=100,
    n_features=5,
    n_informative=3,
    n_classes=2,
    random_state=42,
)
X = pd.DataFrame(X, columns=[f"num_{i}" for i in range(5)])

# Add 5 categorical columns. A seeded generator keeps them reproducible,
# consistent with the fixed random_state used for the numerical features.
rng = np.random.default_rng(42)
for i in range(5):
    X[f"cat_{i}"] = rng.choice(["A", "B", "C"], size=100)

# Make sure the labels are a flat 1-D array.
y = y.flatten()

# 50/50 train/test split, seeded so reruns of the notebook see the same rows.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, test_size=0.5, random_state=42
)
# TODO: implement parser support for column selection through `Split(config=...)`.
# Once that is supported, the preprocessing step could be declared like this:
#
# select_categories = make_column_selector(dtype_include=object)
# select_numerical = make_column_selector(dtype_include=np.number)
# split_imputation = Split(
#     {
#         "categories": [SimpleImputer(strategy="constant", fill_value="missing"), OneHotEncoder(drop="first")],
#         "numerics": Component(SimpleImputer, space={"strategy": ["mean", "median"]}),
#     },
#     config={"categories": select_categories, "numerics": select_numerical},  # not yet supported
#     name="feature_preprocessing",
# )
# split_imputation

# Current approach: each branch of the Split starts with its own
# ColumnTransformer that passes through only the columns chosen by the
# selector, so the column routing happens inside the branch itself.
select_categories = make_column_selector(dtype_include=object)
select_numerical = make_column_selector(dtype_include=np.number)
cat_selector = make_column_transformer(("passthrough", select_categories))
num_selector = make_column_transformer(("passthrough", select_numerical))

# Categorical branch: fixed steps (no hyperparameters to search).
categorical_steps = [
    cat_selector,
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(drop="first", sparse_output=False),
]

# Numerical branch: the imputation strategy is a searchable hyperparameter.
numeric_steps = [
    num_selector,
    Component(SimpleImputer, space={"strategy": ["mean", "median"]}),
]

split_imputation = Split(
    {"categories": categorical_steps, "numerics": numeric_steps},
    name="split_imputation",
)

# Display the AMLTK node.
split_imputation
from sklearn.compose import make_column_selector import numpy as np from sklearn.ensemble import RandomForestClassifier from sklearn.impute import SimpleImputer from sklearn.preprocessing import OneHotEncoder from sklearn.svm import SVC from amltk.pipeline import Choice, Component, Sequential, Split import tpot from sklearn.preprocessing import FunctionTransformer from sklearn.compose import make_column_transformer import tpot import numpy as np import sklearn import sklearn.datasets import pandas as pd # create dummy pandas dataset with both categorical and numerical columns X, y = sklearn.datasets.make_classification(n_samples=100, n_features=5, n_informative=3, n_classes=2, random_state=42) X = pd.DataFrame(X, columns=[f"num_{i}" for i in range(5)]) # add 5 categorical columns for i in range(5): X[f"cat_{i}"] = np.random.choice(["A", "B", "C"], size=100) y = y.flatten() # train test split X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.5) # TODO: implement support for this condition # select_categories = make_column_selector(dtype_include=object) # select_numerical = make_column_selector(dtype_include=np.number) # split_imputation = Split( # { # "categories": [SimpleImputer(strategy="constant", fill_value="missing"), OneHotEncoder(drop="first")], # "numerics": Component(SimpleImputer, space={"strategy": ["mean", "median"]}), # }, # config={"categories": select_categories, "numerics": select_numerical}, #not yet supported # name="feature_preprocessing", # ) # split_imputation select_categories = make_column_selector(dtype_include=object) select_numerical = make_column_selector(dtype_include=np.number) cat_selector = make_column_transformer(("passthrough", select_categories)) num_selector = make_column_transformer(("passthrough", select_numerical)) split_imputation = Split( { "categories": [cat_selector,SimpleImputer(strategy="constant", fill_value="missing"), OneHotEncoder(drop="first", sparse_output=False)], 
"numerics": [num_selector, Component(SimpleImputer, space={"strategy": ["mean", "median"]})], }, name="split_imputation", ) split_imputation
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) Cell In[1], line 7 5 from sklearn.preprocessing import OneHotEncoder 6 from sklearn.svm import SVC ----> 7 from amltk.pipeline import Choice, Component, Sequential, Split 8 import tpot 9 from sklearn.preprocessing import FunctionTransformer ModuleNotFoundError: No module named 'amltk'
输入 [ ]
已复制!
from tpot.builtin_modules import Passthrough, ZeroCount
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold, SelectKBest
# Feature selection step: a Choice between two selectors, each with its own
# searchable hyperparameter range.
variance_selector = Component(VarianceThreshold, space={"threshold": (0.1, 1)})
k_best_selector = Component(SelectKBest, space={"k": (1, 10)})

selectors = Choice(variance_selector, k_best_selector, name="selectors")
# Feature construction step: three named branches inside one Split.
# Only the polynomial branch has a searchable hyperparameter (`degree`).
transformer_branches = {
    "passthrough": Passthrough(),
    "polynomial": Component(PolynomialFeatures, space={"degree": [2, 3]}),
    "zerocount": ZeroCount(),
}

transformers = Split(
    transformer_branches,
    # config={"categories": select_categories, "numerics": select_numerical},
    name="transformers",
)
# Final step: our pipeline can choose between two different estimators.
# NOTE(review): the pipelines printed on this page show RandomForestClassifier
# without max_depth=3 -- confirm that tpot_parser honors `config`.
estimator_choice = Choice(
    Component(
        RandomForestClassifier,
        space={"n_estimators": (10, 100), "criterion": ["gini", "log_loss"]},
        config={"max_depth": 3},
    ),
    Component(SVC, space={"kernel": ["linear", "rbf", "poly"]}),
    name="estimator",
)

# Chain the steps in order: preprocessing -> selection -> transformation -> estimator.
pipeline = (
    Sequential(name="my_pipeline")
    >> split_imputation
    # >> Component(SimpleImputer, space={"strategy": ["mean", "median"]})  # Choose either mean or median
    >> selectors
    >> transformers
    >> estimator_choice
)

# Display the amltk Pipeline
pipeline
from tpot.builtin_modules import Passthrough, ZeroCount from sklearn.preprocessing import PolynomialFeatures from sklearn.decomposition import PCA from sklearn.feature_selection import VarianceThreshold, SelectKBest selectors = Choice( Component(VarianceThreshold, space={"threshold": (0.1,1)}), Component(SelectKBest, space={"k": (1, 10)}), name="selectors", ) transformers = Split( { "passthrough": Passthrough(), "polynomial": Component(PolynomialFeatures, space={"degree": [2, 3]}), "zerocount" : ZeroCount(), }, # config={"categories": select_categories, "numerics": select_numerical}, name="transformers", ) pipeline = ( Sequential(name="my_pipeline") >> split_imputation # >> Component(SimpleImputer, space={"strategy": ["mean", "median"]}) # Choose either mean or median >> selectors >> transformers >> Choice( # Our pipeline can choose between two different estimators Component( RandomForestClassifier, space={"n_estimators": (10, 100), "criterion": ["gini", "log_loss"]}, config={"max_depth": 3}, ), Component(SVC, space={"kernel": ["linear", "rbf", "poly"]}), name="estimator", ) ) # Display the amltk Pipeline pipeline
╭─ Sequential(my_pipeline) ───────────────────────────────────────────────────────────────────────────────────────╮ │ ╭─ Split(split_imputation) ───────────────────────────────────────────────────────────────────────────────────╮ │ │ │ ╭─ Sequential(categories) ─────────────────────────╮ ╭─ Sequential(numerics) ─────────────────────────────╮ │ │ │ │ │ ╭─ Fixed(ColumnTransformer) ───────────────────╮ │ │ ╭─ Fixed(ColumnTransformer) ─────────────────────╮ │ │ │ │ │ │ │ item ColumnTransformer(transformers=[('pass… │ │ │ │ item ColumnTransformer(transformers=[('passth… │ │ │ │ │ │ │ │ 'passthrough', │ │ │ │ 'passthrough', │ │ │ │ │ │ │ │ <skle… │ │ │ │ <sklear… │ │ │ │ │ │ │ │ object at 0x7d354d946290>)]) │ │ │ │ object at 0x7d34edf94fa0>)]) │ │ │ │ │ │ │ ╰──────────────────────────────────────────────╯ │ │ ╰────────────────────────────────────────────────╯ │ │ │ │ │ │ ↓ │ │ ↓ │ │ │ │ │ │ ╭─ Fixed(SimpleImputer) ───────────────────────╮ │ │ ╭─ Component(SimpleImputer) ─────────────╮ │ │ │ │ │ │ │ item SimpleImputer(fill_value='missing', │ │ │ │ item class SimpleImputer(...) │ │ │ │ │ │ │ │ strategy='constant') │ │ │ │ space {'strategy': ['mean', 'median']} │ │ │ │ │ │ │ ╰──────────────────────────────────────────────╯ │ │ ╰────────────────────────────────────────╯ │ │ │ │ │ │ ↓ │ ╰────────────────────────────────────────────────────╯ │ │ │ │ │ ╭─ Fixed(OneHotEncoder) ───────────────────────╮ │ │ │ │ │ │ │ item OneHotEncoder(drop='first', │ │ │ │ │ │ │ │ sparse_output=False) │ │ │ │ │ │ │ ╰──────────────────────────────────────────────╯ │ │ │ │ │ ╰──────────────────────────────────────────────────╯ │ │ │ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ │ │ ↓ │ │ ╭─ Choice(selectors) ─────────────────────────────────────────────────────╮ │ │ │ ╭─ Component(SelectKBest) ─────╮ ╭─ Component(VarianceThreshold) ─────╮ │ │ │ │ │ item class SelectKBest(...) │ │ item class VarianceThreshold(...) 
│ │ │ │ │ │ space {'k': (1, 10)} │ │ space {'threshold': (0.1, 1)} │ │ │ │ │ ╰──────────────────────────────╯ ╰────────────────────────────────────╯ │ │ │ ╰─────────────────────────────────────────────────────────────────────────╯ │ │ ↓ │ │ ╭─ Split(transformers) ─────────────────────────────────────────────────────────────────────────────────╮ │ │ │ ╭─ Sequential(passthrough) ─╮ ╭─ Sequential(polynomial) ────────────────╮ ╭─ Sequential(zerocount) ─╮ │ │ │ │ │ ╭─ Fixed(Passthrough) ─╮ │ │ ╭─ Component(PolynomialFeatures) ─────╮ │ │ ╭─ Fixed(ZeroCount) ─╮ │ │ │ │ │ │ │ item Passthrough() │ │ │ │ item class PolynomialFeatures(...) │ │ │ │ item ZeroCount() │ │ │ │ │ │ │ ╰──────────────────────╯ │ │ │ space {'degree': [2, 3]} │ │ │ ╰────────────────────╯ │ │ │ │ │ ╰───────────────────────────╯ │ ╰─────────────────────────────────────╯ │ ╰─────────────────────────╯ │ │ │ │ ╰─────────────────────────────────────────╯ │ │ │ ╰───────────────────────────────────────────────────────────────────────────────────────────────────────╯ │ │ ↓ │ │ ╭─ Choice(estimator) ─────────────────────────────────────────────────────────────────────────────────────────╮ │ │ │ ╭─ Component(RandomForestClassifier) ──────────╮ ╭─ Component(SVC) ────────────────────────────╮ │ │ │ │ │ item class RandomForestClassifier(...) │ │ item class SVC(...) │ │ │ │ │ │ config {'max_depth': 3} │ │ space {'kernel': ['linear', 'rbf', 'poly']} │ │ │ │ │ │ space { │ ╰─────────────────────────────────────────────╯ │ │ │ │ │ 'n_estimators': (10, 100), │ │ │ │ │ │ 'criterion': [ │ │ │ │ │ │ 'gini', │ │ │ │ │ │ 'log_loss' │ │ │ │ │ │ ] │ │ │ │ │ │ } │ │ │ │ │ ╰──────────────────────────────────────────────╯ │ │ │ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ │ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
输出 [ ]
输入 [ ]
已复制!
# Convert the AMLTK pipeline into a TPOT search space.
tpot_search_space = tpot.utils.tpot_parser(pipeline)

# Sample one candidate from the TPOT search space and display the
# pipeline it exports.
sampled_individual = tpot_search_space.generate()
sampled_individual.export_pipeline()
#convert to tpot search space tpot_search_space = tpot.utils.tpot_parser(pipeline) # sample a pipeline from the tpot search space tpot_search_space.generate().export_pipeline()
输出 [ ]
Pipeline(steps=[('featureunion-1', FeatureUnion(transformer_list=[('pipeline-1', Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('passthrough', 'passthrough', <sklearn.compose._column_transformer.make_column_selector object at 0x7d354d946290>)])), ('simpleimputer', SimpleImputer(fill_value='missing', strategy='constant')), ('onehotencode... VarianceThreshold(threshold=0.6738938110936)), ('featureunion-2', FeatureUnion(transformer_list=[('pipeline-1', Pipeline(steps=[('passthrough', Passthrough())])), ('pipeline-2', Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=3))])), ('pipeline-3', Pipeline(steps=[('zerocount', ZeroCount())]))])), ('randomforestclassifier', RandomForestClassifier(n_estimators=16))])在 Jupyter 环境中,请重新运行此单元格以显示 HTML 表示或信任该笔记本。
在 GitHub 上,HTML 表示无法渲染,请尝试使用 nbviewer.org 加载此页面。
Pipeline(steps=[('featureunion-1', FeatureUnion(transformer_list=[('pipeline-1', Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('passthrough', 'passthrough', <sklearn.compose._column_transformer.make_column_selector object at 0x7d354d946290>)])), ('simpleimputer', SimpleImputer(fill_value='missing', strategy='constant')), ('onehotencode... VarianceThreshold(threshold=0.6738938110936)), ('featureunion-2', FeatureUnion(transformer_list=[('pipeline-1', Pipeline(steps=[('passthrough', Passthrough())])), ('pipeline-2', Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=3))])), ('pipeline-3', Pipeline(steps=[('zerocount', ZeroCount())]))])), ('randomforestclassifier', RandomForestClassifier(n_estimators=16))])
FeatureUnion(transformer_list=[('pipeline-1', Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('passthrough', 'passthrough', <sklearn.compose._column_transformer.make_column_selector object at 0x7d354d946290>)])), ('simpleimputer', SimpleImputer(fill_value='missing', strategy='constant')), ('onehotencoder', OneHotEncoder(drop='first', sparse_output=False))])), ('pipeline-2', Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('passthrough', 'passthrough', <sklearn.compose._column_transformer.make_column_selector object at 0x7d34edf94fa0>)])), ('simpleimputer', SimpleImputer(strategy='median'))]))])
ColumnTransformer(transformers=[('passthrough', 'passthrough', <sklearn.compose._column_transformer.make_column_selector object at 0x7d354d946290>)])
<sklearn.compose._column_transformer.make_column_selector object at 0x7d354d946290>
passthrough
SimpleImputer(fill_value='missing', strategy='constant')
OneHotEncoder(drop='first', sparse_output=False)
ColumnTransformer(transformers=[('passthrough', 'passthrough', <sklearn.compose._column_transformer.make_column_selector object at 0x7d34edf94fa0>)])
<sklearn.compose._column_transformer.make_column_selector object at 0x7d34edf94fa0>
passthrough
SimpleImputer(strategy='median')
VarianceThreshold(threshold=0.6738938110936)
FeatureUnion(transformer_list=[('pipeline-1', Pipeline(steps=[('passthrough', Passthrough())])), ('pipeline-2', Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=3))])), ('pipeline-3', Pipeline(steps=[('zerocount', ZeroCount())]))])
Passthrough()
PolynomialFeatures(degree=3)
ZeroCount()
RandomForestClassifier(n_estimators=16)
输入 [ ]
已复制!
# Run a small TPOT search (population of 10, 2 generations) over the
# converted search space, scored by ROC AUC with 5-fold cross-validation.
est = tpot.TPOTEstimator(
    search_space=tpot_search_space,  # converted search space goes here
    classification=True,
    scorers=["roc_auc"],
    scorers_weights=[1],
    cv=5,
    population_size=10,
    generations=2,
    max_eval_time_mins=60 * 5,
    n_jobs=10,
    verbose=5,
)

est.fit(X_train, y_train)
est = tpot.TPOTEstimator( scorers = ["roc_auc"], scorers_weights = [1], classification = True, cv = 5, search_space = tpot_search_space, #converted search space goes here population_size= 10, generations = 2, max_eval_time_mins = 60*5, verbose = 5, n_jobs=10, ) est.fit(X_train, y_train)
Generation: 50%|█████ | 1/2 [00:02<00:02, 2.60s/it]
Generation: 1 Best roc_auc_score score: 0.976
Generation: 100%|██████████| 2/2 [00:03<00:00, 1.57s/it] 2024-09-09 17:25:40,301 - distributed.scheduler - ERROR - Removing worker 'tcp://127.0.0.1:39897' caused the cluster to lose scattered data, which can't be recovered: {'ndarray-3f2f44921e6e9cc40ef07cfcd8ae90fb', 'DataFrame-5551f84174fd651642ff10eb71e30b22'} (stimulus_id='handle-worker-cleanup-1725927940.3010821')
Generation: 2 Best roc_auc_score score: 0.984
输出 [ ]
TPOTEstimator(classification=True, generations=2, max_eval_time_mins=300, n_jobs=10, population_size=10, scorers=['roc_auc'], scorers_weights=[1], search_space=<tpot.search_spaces.pipelines.sequential.SequentialPipeline object at 0x7d34ec1efbb0>, verbose=5)在 Jupyter 环境中,请重新运行此单元格以显示 HTML 表示或信任该笔记本。
在 GitHub 上,HTML 表示无法渲染,请尝试使用 nbviewer.org 加载此页面。
TPOTEstimator(classification=True, generations=2, max_eval_time_mins=300, n_jobs=10, population_size=10, scorers=['roc_auc'], scorers_weights=[1], search_space=<tpot.search_spaces.pipelines.sequential.SequentialPipeline object at 0x7d34ec1efbb0>, verbose=5)
输入 [ ]
已复制!
# Display the pipeline held in `fitted_pipeline_` after the fit() call above.
est.fitted_pipeline_
est.fitted_pipeline_
输出 [ ]
Pipeline(steps=[('featureunion-1', FeatureUnion(transformer_list=[('pipeline-1', Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('passthrough', 'passthrough', <sklearn.compose._column_transformer.make_column_selector object at 0x7d34eb307cd0>)])), ('simpleimputer', SimpleImputer(fill_value='missing', strategy='constant')), ('onehotencode... VarianceThreshold(threshold=0.1557560591318)), ('featureunion-2', FeatureUnion(transformer_list=[('pipeline-1', Pipeline(steps=[('passthrough', Passthrough())])), ('pipeline-2', Pipeline(steps=[('polynomialfeatures', PolynomialFeatures())])), ('pipeline-3', Pipeline(steps=[('zerocount', ZeroCount())]))])), ('randomforestclassifier', RandomForestClassifier(criterion='log_loss', n_estimators=80))])在 Jupyter 环境中,请重新运行此单元格以显示 HTML 表示或信任该笔记本。
在 GitHub 上,HTML 表示无法渲染,请尝试使用 nbviewer.org 加载此页面。
Pipeline(steps=[('featureunion-1', FeatureUnion(transformer_list=[('pipeline-1', Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('passthrough', 'passthrough', <sklearn.compose._column_transformer.make_column_selector object at 0x7d34eb307cd0>)])), ('simpleimputer', SimpleImputer(fill_value='missing', strategy='constant')), ('onehotencode... VarianceThreshold(threshold=0.1557560591318)), ('featureunion-2', FeatureUnion(transformer_list=[('pipeline-1', Pipeline(steps=[('passthrough', Passthrough())])), ('pipeline-2', Pipeline(steps=[('polynomialfeatures', PolynomialFeatures())])), ('pipeline-3', Pipeline(steps=[('zerocount', ZeroCount())]))])), ('randomforestclassifier', RandomForestClassifier(criterion='log_loss', n_estimators=80))])
FeatureUnion(transformer_list=[('pipeline-1', Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('passthrough', 'passthrough', <sklearn.compose._column_transformer.make_column_selector object at 0x7d34eb307cd0>)])), ('simpleimputer', SimpleImputer(fill_value='missing', strategy='constant')), ('onehotencoder', OneHotEncoder(drop='first', sparse_output=False))])), ('pipeline-2', Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('passthrough', 'passthrough', <sklearn.compose._column_transformer.make_column_selector object at 0x7d34eb307d30>)])), ('simpleimputer', SimpleImputer(strategy='median'))]))])
ColumnTransformer(transformers=[('passthrough', 'passthrough', <sklearn.compose._column_transformer.make_column_selector object at 0x7d34eb307cd0>)])
<sklearn.compose._column_transformer.make_column_selector object at 0x7d34eb307cd0>
passthrough
SimpleImputer(fill_value='missing', strategy='constant')
OneHotEncoder(drop='first', sparse_output=False)
ColumnTransformer(transformers=[('passthrough', 'passthrough', <sklearn.compose._column_transformer.make_column_selector object at 0x7d34eb307d30>)])
<sklearn.compose._column_transformer.make_column_selector object at 0x7d34eb307d30>
passthrough
SimpleImputer(strategy='median')
VarianceThreshold(threshold=0.1557560591318)
FeatureUnion(transformer_list=[('pipeline-1', Pipeline(steps=[('passthrough', Passthrough())])), ('pipeline-2', Pipeline(steps=[('polynomialfeatures', PolynomialFeatures())])), ('pipeline-3', Pipeline(steps=[('zerocount', ZeroCount())]))])
Passthrough()
PolynomialFeatures()
ZeroCount()
RandomForestClassifier(criterion='log_loss', n_estimators=80)
输入 [ ]
已复制!
# Predict class labels for the held-out half of the data.
est.predict(X_test)
est.predict(X_test)
输出 [ ]
array([1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0])