注意
前往結尾以下載完整範例程式碼。
轉換具有 XGBoost 模型的管道¶
sklearn-onnx 只將 scikit-learn 模型轉換為 ONNX,但許多函式庫實作 scikit-learn API,以便將其模型包含在 scikit-learn 管道中。此範例考慮包含 XGBoost 模型的管道。sklearn-onnx 只要知道與 *XGBClassifier* 相關聯的轉換器,就可以轉換整個管道。讓我們看看如何操作。
訓練 XGBoost 分類器¶
import numpy
import onnxruntime as rt
from sklearn.datasets import load_iris, load_diabetes, make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier, XGBRegressor, DMatrix, train as train_xgb
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import convert_sklearn, to_onnx, update_registered_converter
from skl2onnx.common.shape_calculator import (
calculate_linear_classifier_output_shapes,
calculate_linear_regressor_output_shapes,
)
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost
from onnxmltools.convert import convert_xgboost as convert_xgboost_booster
# Load the iris dataset and keep only the first two feature columns.
data = load_iris()
X, y = data.data[:, :2], data.target

# Shuffle the rows with one shared index permutation so that features and
# labels stay aligned.
ind = numpy.arange(X.shape[0])
numpy.random.shuffle(ind)
X, y = X[ind, :].copy(), y[ind].copy()

# Standardize the features, then fit an XGBoost classifier (n_estimators=3).
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("xgb", XGBClassifier(n_estimators=3)),
    ]
)
pipe.fit(X, y)
# Attempt the conversion before any XGBoost converter is registered.
# The failure is expected, so the exception is printed instead of re-raised
# and the script keeps running.
try:
    convert_sklearn(
        pipe,
        "pipeline_xgboost",
        [("input", FloatTensorType([None, 2]))],
        target_opset={"": 12, "ai.onnx.ml": 2},
    )
except Exception as e:
    print(e)

# The error message reports that no converter was found for XGBoost
# models. By default, sklearn-onnx only handles scikit-learn models, but
# it can be extended to any model following the scikit-learn API as long
# as it knows a converter for every model used in the pipeline. That is
# why a converter has to be registered.
註冊 XGBClassifier 的轉換器¶
轉換器實作於 onnxmltools 中:onnxmltools…XGBoost.py;形狀計算器則位於 onnxmltools…Classifier.py。
# Register the onnxmltools XGBoost converter for XGBClassifier, paired with
# the linear-classifier shape calculator. The `options` mapping lists the
# values given for the "nocl" and "zipmap" converter options.
update_registered_converter(
    XGBClassifier,
    "XGBoostXGBClassifier",
    shape_fct=calculate_linear_classifier_output_shapes,
    convert_fct=convert_xgboost,
    options={"nocl": [True, False], "zipmap": [True, False, "columns"]},
)
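再次轉換¶
註冊轉換器之後,再次以相同的參數呼叫 convert_sklearn 轉換管道,並將產生的 ONNX 模型儲存為 pipeline_xgboost.onnx,供下方的 onnxruntime 載入。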
比較預測結果¶
使用 XGBoost 的預測。
predict [0 2 1 1 2]
predict_proba [[0.69600695 0.1526681 0.15132491]]
使用 onnxruntime 的預測。
# Open the saved ONNX file with onnxruntime, restricted to the CPU provider.
sess = rt.InferenceSession(
    "pipeline_xgboost.onnx",
    providers=["CPUExecutionProvider"],
)

# Run the first five rows, cast to float32, and request every output (None).
pred_onx = sess.run(
    None,
    {"input": X[:5].astype(numpy.float32)},
)

# Output 0 is printed as the labels; the first row of output 1 as the
# probabilities.
print("predict", pred_onx[0])
print("predict_proba", pred_onx[1][:1])
predict [0 2 1 1 2]
predict_proba [{0: 0.6960069537162781, 1: 0.15266810357570648, 2: 0.15132491290569305}]
與 XGBRegressor 相同的範例¶
# Register the same onnxmltools converter for XGBRegressor, this time with
# the linear-regressor shape calculator.
update_registered_converter(
    XGBRegressor,
    "XGBoostXGBRegressor",
    shape_fct=calculate_linear_regressor_output_shapes,
    convert_fct=convert_xgboost,
)
# Load the diabetes regression dataset and split it in half; the test
# targets are not needed and are discarded.
data = load_diabetes()
x, y = data.data, data.target
X_train, X_test, y_train, _ = train_test_split(x, y, test_size=0.5)

# Standardize the features, then fit an XGBoost regressor (n_estimators=3).
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("xgb", XGBRegressor(n_estimators=3)),
    ]
)
pipe.fit(X_train, y_train)

# Reference predictions from the fitted pipeline for the first five test rows.
print("predict", pipe.predict(X_test[:5]))
predict [167.95638 209.4882 112.31891 127.7238 126.65028]
ONNX
# Convert the regressor pipeline, passing the float32 training data as the
# sample input.
onx = to_onnx(
    pipe,
    X_train.astype(numpy.float32),
    target_opset={"": 12, "ai.onnx.ml": 2},
)

# Load the serialized model directly from memory rather than from a file.
sess = rt.InferenceSession(
    onx.SerializeToString(),
    providers=["CPUExecutionProvider"],
)

# Score the first five test rows and flatten the first output for printing.
pred_onx = sess.run(None, {"X": X_test[:5].astype(numpy.float32)})
print("predict", pred_onx[0].ravel())
predict [167.95638 209.4882 112.31891 127.7238 126.65028]
可能會出現一些差異。在這種情況下,您應該閱讀〈切換到浮點數時的問題〉。
與 Booster 相同¶
Booster 無法插入管道中。它需要不同的轉換函式,因為它不遵循 scikit-learn API。
# Build a small synthetic classification problem with fixed seeds so the
# run is reproducible.
x, y = make_classification(
    n_samples=100,
    n_features=5,
    n_informative=3,
    n_classes=2,
    random_state=42,
)
X_train, X_test, y_train, _ = train_test_split(x, y, test_size=0.5, random_state=42)

# Train a raw Booster through the native XGBoost API on a DMatrix.
dtrain = DMatrix(X_train, label=y_train)
# NOTE(review): num_class is 3 although make_classification above generates
# only 2 classes -- confirm this mismatch is intended.
param = {"objective": "multi:softmax", "num_class": 3}
bst = train_xgb(param, dtrain, num_boost_round=10)

# Declare one float input with as many columns as the training data.
initial_type = [("float_input", FloatTensorType([None, X_train.shape[1]]))]
# Convert the Booster with the dedicated onnxmltools function. The converter
# may raise AssertionError when the installed XGBoost and onnxmltools
# versions are incompatible; in that case the message is printed and the
# inference step below is skipped.
try:
    onx = convert_xgboost_booster(bst, "name", initial_types=initial_type)
except AssertionError as e:
    print("XGBoost is too recent or onnxmltools too old.", e)
    cont = False
else:
    # Only reached when the conversion succeeded.
    cont = True

if cont:
    # Load the converted model from memory and run it on the CPU provider.
    sess = rt.InferenceSession(
        onx.SerializeToString(), providers=["CPUExecutionProvider"]
    )
    # Look up the input and first output names instead of hard-coding them.
    input_name = sess.get_inputs()[0].name
    label_name = sess.get_outputs()[0].name
    # Request only the first output for the whole float32 test set.
    pred_onx = sess.run([label_name], {input_name: X_test.astype(numpy.float32)})[0]
    print(pred_onx)
[0 0 1 1 0 1 0 1 0 1 0 0 1 1 1 0 0 1 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 1 1 1 1
0 1 1 1 0 0 1 1 0 0 0 1 0]
指令碼總執行時間:(0 分鐘 0.126 秒)