以資料框作為輸入

管線通常以矩陣的形式接收資料。如果所有資料的類型都相同,就可以將資料轉換為矩陣。但是,資料框中的資料通常包含多種類型,例如浮點數、整數或字串(用於表示類別)。ONNX 也支援這種情況。

具有類別的資料集

import numpy
import pprint
from onnxruntime import InferenceSession
from pandas import DataFrame
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from skl2onnx import to_onnx
from skl2onnx.algebra.type_helper import guess_initial_types


# Toy dataset: two categorical columns (CAT1, CAT2), two numeric columns
# (num1, num2) and a binary target column y.
data = DataFrame(
    [
        {"CAT1": "a", "CAT2": "c", "num1": 0.5, "num2": 0.6, "y": 0},
        {"CAT1": "b", "CAT2": "d", "num1": 0.4, "num2": 0.8, "y": 1},
        {"CAT1": "a", "CAT2": "d", "num1": 0.5, "num2": 0.56, "y": 0},
        {"CAT1": "a", "CAT2": "d", "num1": 0.55, "num2": 0.56, "y": 1},
        {"CAT1": "a", "CAT2": "c", "num1": 0.35, "num2": 0.86, "y": 0},
        {"CAT1": "a", "CAT2": "c", "num1": 0.5, "num2": 0.68, "y": 1},
    ]
)

# Names of the categorical columns; the features are every column except y.
cat_cols = ["CAT1", "CAT2"]
train_data = data.drop(columns="y")


# Preprocessing: the categorical columns go through a one-hot encoder, the
# remaining columns are passed through unchanged; a random forest is then
# trained on the result.
categorical_transformer = Pipeline(
    steps=[
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)
preprocessor = ColumnTransformer(
    [("cat", categorical_transformer, cat_cols)],
    remainder="passthrough",
)
pipe = Pipeline(steps=[("preprocess", preprocessor), ("rf", RandomForestClassifier())])
# Left as the final bare expression so the fitted pipeline is displayed.
pipe.fit(train_data, data["y"])
Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse_output=False))]),
                                                  ['CAT1', 'CAT2'])])),
                ('rf', RandomForestClassifier())])
在 Jupyter 環境中,請重新執行此儲存格以顯示 HTML 表示法或信任筆記本。
在 GitHub 上,HTML 表示法無法呈現,請嘗試使用 nbviewer.org 載入此頁面。


轉換為 ONNX

函式 to_onnx 不處理資料框。

# Convert the fitted pipeline, handing to_onnx the first row of the dataframe
# as sample data. zipmap is disabled for the random forest (presumably so the
# probabilities come back as a plain array -- see the printed results below).
onx = to_onnx(
    pipe,
    train_data[:1],
    options={RandomForestClassifier: {"zipmap": False}},
)

使用 ONNX 進行預測

onnxruntime 不支援資料框。

# Load the converted model on CPU and try to feed it the dataframe directly.
sess = InferenceSession(
    onx.SerializeToString(),
    providers=["CPUExecutionProvider"],
)
try:
    sess.run(None, train_data)
except Exception as exc:
    # Demonstration only: print the failure and let the script continue.
    print(exc)

# Unhide conversion logic with a dataframe
# ++++++++++++++++++++++++++++++++++++++++
#
# A dataframe can be seen as a set of columns with
# different types. That's what ONNX should see:
# a list of inputs, the input name is the column name,
# the input type is the column type.


def guess_schema_from_data(X):
    """Return the initial types (input schema) guessed for ``X``.

    The schema reported by ``guess_initial_types`` is used as the starting
    point. When every entry is two-dimensional, has an unspecified (``None``)
    first dimension and all entries share the same tensor type class, the
    entries are collapsed into a single input named ``"X"`` whose second
    dimension is the sum of the individual second dimensions. In every other
    case the guessed schema is returned unchanged.

    Parameters
    ----------
    X
        Data handed to ``guess_initial_types`` (a pandas dataframe in this
        example).

    Returns
    -------
    list
        A list of ``(name, type)`` tuples.
    """
    init = guess_initial_types(X)
    if not init:
        # Nothing to merge: an empty schema used to fall through to an
        # IndexError when picking the common type below.
        return init

    # All columns must share the class of the first one to be merged.
    common_type = init[0][1].__class__
    for _, col in init:
        if len(col.shape) != 2 or col.shape[0] is not None:
            return init
        if col.__class__ is not common_type:
            return init

    total_width = sum(col.shape[1] for _, col in init)
    return [("X", common_type([None, total_width]))]


# Guess the input schema from the dataframe as it currently is and display it.
init = guess_schema_from_data(train_data)

pprint.pprint(init)
run(): incompatible function arguments. The following argument types are supported:
    1. (self: onnxruntime.capi.onnxruntime_pybind11_state.InferenceSession, arg0: List[str], arg1: Dict[str, object], arg2: onnxruntime.capi.onnxruntime_pybind11_state.RunOptions) -> List[object]

Invoked with: <onnxruntime.capi.onnxruntime_pybind11_state.InferenceSession object at 0x7f029b60e570>, ['label', 'probabilities'],   CAT1 CAT2  num1  num2
0    a    c  0.50  0.60
1    b    d  0.40  0.80
2    a    d  0.50  0.56
3    a    d  0.55  0.56
4    a    c  0.35  0.86
5    a    c  0.50  0.68, None
[('CAT1', StringTensorType(shape=[None, 1])),
 ('CAT2', StringTensorType(shape=[None, 1])),
 ('num1', DoubleTensorType(shape=[None, 1])),
 ('num2', DoubleTensorType(shape=[None, 1]))]

讓我們將數值欄改為單精度浮點數(float32)。

# Cast every non-categorical column to float32, then guess the schema again.
for c in train_data.columns:
    if c in cat_cols:
        continue
    train_data[c] = train_data[c].astype(numpy.float32)


init = guess_schema_from_data(train_data)
pprint.pprint(init)
[('CAT1', StringTensorType(shape=[None, 1])),
 ('CAT2', StringTensorType(shape=[None, 1])),
 ('num1', FloatTensorType(shape=[None, 1])),
 ('num2', FloatTensorType(shape=[None, 1]))]

讓我們僅使用 skl2onnx 進行轉換。

# Convert again, this time supplying the guessed per-column schema explicitly
# instead of sample data.
onx2 = to_onnx(
    pipe,
    options={RandomForestClassifier: {"zipmap": False}},
    initial_types=init,
)

讓我們使用 onnxruntime 執行它。我們需要將資料框轉換為字典,其中欄名稱成為鍵,而欄值成為值。

# One entry per dataframe column: key is the column name, value is that
# column's data reshaped into a single-column 2-D array.
inputs = {
    name: column.values.reshape(-1, 1)
    for name, column in train_data.items()
}
pprint.pprint(inputs)
{'CAT1': array([['a'],
       ['b'],
       ['a'],
       ['a'],
       ['a'],
       ['a']], dtype=object),
 'CAT2': array([['c'],
       ['d'],
       ['d'],
       ['d'],
       ['c'],
       ['c']], dtype=object),
 'num1': array([[0.5 ],
       [0.4 ],
       [0.5 ],
       [0.55],
       [0.35],
       [0.5 ]], dtype=float32),
 'num2': array([[0.6 ],
       [0.8 ],
       [0.56],
       [0.56],
       [0.86],
       [0.68]], dtype=float32)}

推論。

# Load the second converted model on CPU.
sess2 = InferenceSession(onx2.SerializeToString(), providers=["CPUExecutionProvider"])

# Run it on the per-column input dictionary, requesting all outputs.
got2 = sess2.run(None, inputs)

# Compare the labels predicted by scikit-learn with the first ONNX output.
print(pipe.predict(train_data))
print(got2[0])
[0 1 0 1 0 1]
[0 1 0 1 0 1]

接著比較預測機率。

# Compare scikit-learn's class probabilities with the second ONNX output.
print(pipe.predict_proba(train_data))
print(got2[1])
[[0.82 0.18]
 [0.26 0.74]
 [0.76 0.24]
 [0.37 0.63]
 [0.75 0.25]
 [0.29 0.71]]
[[0.82       0.18      ]
 [0.2600004  0.7399996 ]
 [0.76       0.24000004]
 [0.3700003  0.6299997 ]
 [0.75       0.25000003]
 [0.29000038 0.7099996 ]]

腳本總執行時間: (0 分鐘 0.412 秒)

本範例集由 Sphinx-Gallery 產生