轉換帶有 ColumnTransformer 的管線

scikit-learn 最近發布了 ColumnTransformer,讓使用者可以定義複雜的管線,其中每個欄位都可以使用不同的轉換器進行預處理。sklearn-onnx 在這種情況下仍然有效,如 轉換複雜管線 一節所示。

建立和訓練複雜的管線

我們重複使用範例 Column Transformer with Mixed Types 中實作的管線。有一個變更,因為 ONNX-ML Imputer 不處理字串類型。這不能成為最終 ONNX 管線的一部分,必須移除。尋找下方以 --- 開頭的註解。

import os
import pprint
import pandas as pd
import numpy as np
from numpy.testing import assert_almost_equal
import onnx
from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer
import onnxruntime as rt
import matplotlib.pyplot as plt
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import skl2onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType, StringTensorType
from skl2onnx.common.data_types import Int64TensorType

# Download the Titanic CSV and separate the target column from the features.
titanic_url = (
    "https://raw.githubusercontent.com/amueller/"
    "scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv"
)
data = pd.read_csv(titanic_url)
X = data.drop("survived", axis=1)
y = data["survived"]
print(data.dtypes)

# The ONNX-ML Imputer does not handle strings, so missing categorical
# values are filled here, before the pipeline is built and converted.
# The filled column is assigned back to ``X``: calling
# ``fillna(..., inplace=True)`` on ``X[cat]`` is a chained assignment that
# recent pandas versions warn about and that no longer updates ``X`` once
# copy-on-write is enabled.
for cat in ["embarked", "sex", "pclass"]:
    X[cat] = X[cat].fillna("missing")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Numeric columns: fill missing values with the median, then standardize.
numeric_features = ["age", "fare"]
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: one-hot encode, ignoring categories unseen during fit.
categorical_features = ["embarked", "sex", "pclass"]
categorical_steps = [
    # --- SimpleImputer is not available for strings in ONNX-ML specifications.
    # ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own preprocessing pipeline.
column_transformers = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Full model: preprocessing followed by a logistic regression classifier.
model_steps = [
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(solver="lbfgs")),
]
clf = Pipeline(steps=model_steps)


clf.fit(X_train, y_train)
pclass         int64
survived       int64
name          object
sex           object
age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'fare']),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['embarked', 'sex',
                                                   'pclass'])])),
                ('classifier', LogisticRegression())])
在 Jupyter 環境中,請重新執行此儲存格以顯示 HTML 表示法或信任筆記本。
在 GitHub 上,HTML 表示法無法呈現,請嘗試使用 nbviewer.org 載入此頁面。


定義 ONNX 圖的輸入

sklearn-onnx 不知道訓練模型時使用了哪些特徵,但它需要知道每個特徵對應的名稱。我們只需重複使用 DataFrame 的欄位定義。

pclass         int64
name          object
sex           object
age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object

轉換後。

def convert_dataframe_schema(df, drop=None):
    """Describe the columns of *df* as ONNX graph inputs.

    Each kept column becomes a ``(column_name, tensor_type)`` pair whose
    tensor type has shape ``[None, 1]``. ``int64`` columns map to
    ``Int64TensorType``, ``float64`` columns to ``FloatTensorType`` and
    every other dtype to ``StringTensorType``.

    :param df: DataFrame whose ``columns`` and ``dtypes`` are inspected
    :param drop: optional container of column names to leave out
    :return: list of ``(name, tensor_type)`` pairs in column order
    """

    def _tensor_type(dtype):
        # Only the int64 and float64 dtypes get a numeric tensor type.
        if dtype == "int64":
            return Int64TensorType([None, 1])
        if dtype == "float64":
            return FloatTensorType([None, 1])
        return StringTensorType([None, 1])

    excluded = () if drop is None else drop
    return [
        (name, _tensor_type(dtype))
        for name, dtype in zip(df.columns, df.dtypes)
        if name not in excluded
    ]


# Declare every column of the training DataFrame as an ONNX graph input.
initial_inputs = convert_dataframe_schema(X_train)

pprint.pprint(initial_inputs)
[('pclass', Int64TensorType(shape=[None, 1])),
 ('name', StringTensorType(shape=[None, 1])),
 ('sex', StringTensorType(shape=[None, 1])),
 ('age', FloatTensorType(shape=[None, 1])),
 ('sibsp', Int64TensorType(shape=[None, 1])),
 ('parch', Int64TensorType(shape=[None, 1])),
 ('ticket', StringTensorType(shape=[None, 1])),
 ('fare', FloatTensorType(shape=[None, 1])),
 ('cabin', StringTensorType(shape=[None, 1])),
 ('embarked', StringTensorType(shape=[None, 1])),
 ('boat', StringTensorType(shape=[None, 1])),
 ('body', FloatTensorType(shape=[None, 1])),
 ('home.dest', StringTensorType(shape=[None, 1]))]

將單個欄位合併到向量中並不是計算預測的最有效方式。可以在將管線轉換為圖形之前完成。

將管線轉換為 ONNX

# First attempt: every DataFrame column is declared as a graph input.
# A conversion error is printed instead of raised so the script keeps going.
try:
    model_onnx = convert_sklearn(
        clf, "pipeline_titanic", initial_inputs, target_opset=12
    )
except Exception as e:
    print(e)

如果圖形很小,則預測會更有效。這就是為什麼轉換器會檢查是否有未使用的輸入。它們需要從圖形輸入中移除。

# Columns that are in neither numeric_features nor categorical_features;
# they are left out of the declared graph inputs.
to_drop = {"parch", "sibsp", "cabin", "ticket", "name", "body", "home.dest", "boat"}
initial_inputs = convert_dataframe_schema(X_train, to_drop)
# Second attempt, again printing any conversion error instead of raising it.
try:
    model_onnx = convert_sklearn(
        clf, "pipeline_titanic", initial_inputs, target_opset=12
    )
except Exception as e:
    print(e)

scikit-learn 會在可以的情況下進行隱式轉換。sklearn-onnx 不會。OneHotEncoder 的 ONNX 版本必須套用在相同類型的欄位上。

# Rebuild the input declaration without the dropped columns.
initial_inputs = convert_dataframe_schema(X_train, to_drop)

# This conversion is not wrapped in try/except, so a failure stops the script.
model_onnx = convert_sklearn(clf, "pipeline_titanic", initial_inputs, target_opset=12)


# Serialize the converted model to pipeline_titanic.onnx.
with open("pipeline_titanic.onnx", "wb") as f:
    f.write(model_onnx.SerializeToString())

比較預測

最後一步,我們需要確保轉換後的模型產生相同的預測、標籤和機率。讓我們從 scikit-learn 開始。

# scikit-learn reference: labels for the first five test rows and
# class probabilities for the first two.
print("predict", clf.predict(X_test[:5]))
print("predict_proba", clf.predict_proba(X_test[:2]))
predict [1 1 0 1 0]
predict_proba [[0.40266136 0.59733864]
 [0.15699767 0.84300233]]

使用 onnxruntime 進行預測。我們需要移除已捨棄的欄位,並將 double 向量轉換為 float 向量,因為 onnxruntime 不支援雙精度浮點數(double)。onnxruntime 不接受 dataframe,輸入必須以字典的形式給定(鍵為輸入名稱,值為陣列)。最後一個細節:每個欄位都被描述為單欄矩陣,而不是真正的向量,這就是最後一行使用 reshape 的原因。

# Keep only the columns that were declared as graph inputs.
X_test2 = X_test.drop(to_drop, axis=1)

# One array per remaining column, keyed by column name.
inputs = {col: X_test2[col].values for col in X_test2.columns}
# Numeric features are cast from double to float32.
inputs.update({col: inputs[col].astype(np.float32) for col in numeric_features})
# Every input is passed as a single-column matrix rather than a flat vector.
inputs = {col: arr.reshape((arr.shape[0], 1)) for col, arr in inputs.items()}

我們準備好執行 onnxruntime 了。

# Load the saved model on the CPU provider and run it on the prepared inputs.
# Passing None as the first argument of run() requests every model output;
# pred_onx[0] is printed as the labels and pred_onx[1] as the probabilities.
sess = rt.InferenceSession("pipeline_titanic.onnx", providers=["CPUExecutionProvider"])
pred_onx = sess.run(None, inputs)
print("predict", pred_onx[0][:5])
print("predict_proba", pred_onx[1][:2])
predict [1 1 0 1 0]
predict_proba [{0: 0.40266138315200806, 1: 0.5973386168479919}, {0: 0.15699762105941772, 1: 0.8430023789405823}]

onnxruntime 的輸出是字典列表。讓我們改為輸出陣列,但這需要加上額外的選項 zipmap 重新轉換一次。

# Convert again with the "zipmap" option switched off for this pipeline;
# the options dictionary is keyed by id(clf).
model_onnx = convert_sklearn(
    clf,
    "pipeline_titanic",
    initial_inputs,
    target_opset=12,
    options={id(clf): {"zipmap": False}},
)
# Save this variant under its own file name so the first model is kept.
with open("pipeline_titanic_nozipmap.onnx", "wb") as f:
    f.write(model_onnx.SerializeToString())

# Run the new model on the same inputs as before.
sess = rt.InferenceSession(
    "pipeline_titanic_nozipmap.onnx", providers=["CPUExecutionProvider"]
)
pred_onx = sess.run(None, inputs)
print("predict", pred_onx[0][:5])
print("predict_proba", pred_onx[1][:2])
predict [1 1 0 1 0]
predict_proba [[0.40266138 0.5973386 ]
 [0.15699762 0.8430024 ]]

讓我們檢查它們是否相同。

# Compare scikit-learn and onnxruntime probabilities on the whole test set
# (assert_almost_equal checks to 7 decimals unless told otherwise).
assert_almost_equal(clf.predict_proba(X_test), pred_onx[1])

顯示 ONNX 圖形

最後,讓我們看看使用 sklearn-onnx 轉換的圖形。

# Build a pydot graph of the converted model and write it out as a .dot file.
pydot_graph = GetPydotGraph(
    model_onnx.graph,
    name=model_onnx.graph.name,
    rankdir="TB",
    node_producer=GetOpNodeProducer(
        "docstring", color="yellow", fillcolor="yellow", style="filled"
    ),
)
pydot_graph.write_dot("pipeline_titanic.dot")

# Render the .dot file to PNG with the `dot` command line tool. The exit
# status is not checked; the image read below is expected at
# pipeline_titanic.dot.png.
os.system("dot -O -Gdpi=300 -Tpng pipeline_titanic.dot")

# Show the rendered image in a large figure without axes.
image = plt.imread("pipeline_titanic.dot.png")
fig, ax = plt.subplots(figsize=(40, 20))
ax.imshow(image)
ax.axis("off")
plot complex pipeline
(-0.5, 6901.5, 5287.5, -0.5)

此範例中使用的版本

# Report the library versions this example ran with.
print("numpy:", np.__version__)
print("scikit-learn:", sklearn.__version__)
print("onnx: ", onnx.__version__)
print("onnxruntime: ", rt.__version__)
print("skl2onnx: ", skl2onnx.__version__)
numpy: 1.26.4
scikit-learn: 1.6.dev0
onnx:  1.17.0
onnxruntime:  1.18.0+cu118
skl2onnx:  1.17.0

腳本總執行時間:(0 分鐘 19.991 秒)

由 Sphinx-Gallery 產生的展示