注意
跳至結尾以下載完整的範例程式碼
以資料框作為輸入¶
管線通常以矩陣的形式接收資料。如果所有資料的類型都相同,就可以將其轉換為矩陣。但是,資料框中的資料通常具有多種類型,例如浮點數、整數或字串(用於類別)。ONNX 也支援這種情況。
具有類別的資料集¶
import numpy
import pprint
from onnxruntime import InferenceSession
from pandas import DataFrame
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from skl2onnx import to_onnx
from skl2onnx.algebra.type_helper import guess_initial_types
# Toy training set: two categorical columns (CAT1, CAT2), two numeric
# columns (num1, num2) and a binary target y.
records = [
    {"CAT1": "a", "CAT2": "c", "num1": 0.5, "num2": 0.6, "y": 0},
    {"CAT1": "b", "CAT2": "d", "num1": 0.4, "num2": 0.8, "y": 1},
    {"CAT1": "a", "CAT2": "d", "num1": 0.5, "num2": 0.56, "y": 0},
    {"CAT1": "a", "CAT2": "d", "num1": 0.55, "num2": 0.56, "y": 1},
    {"CAT1": "a", "CAT2": "c", "num1": 0.35, "num2": 0.86, "y": 0},
    {"CAT1": "a", "CAT2": "c", "num1": 0.5, "num2": 0.68, "y": 1},
]
data = DataFrame(records)

cat_cols = ["CAT1", "CAT2"]
# Features only: everything except the target column.
train_data = data.drop(columns="y")

# One-hot encode the categorical columns; the remaining columns are
# passed through unchanged by the ColumnTransformer.
onehot = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
categorical_transformer = Pipeline([("onehot", onehot)])
preprocessor = ColumnTransformer(
    transformers=[("cat", categorical_transformer, cat_cols)],
    remainder="passthrough",
)

# Full model: preprocessing followed by a random forest classifier.
pipe = Pipeline(
    [
        ("preprocess", preprocessor),
        ("rf", RandomForestClassifier()),
    ]
)
pipe.fit(train_data, data["y"])
轉換為 ONNX¶
函式 to_onnx 不處理資料框。
# Converter options keyed by estimator class; zipmap is switched off for the
# random forest (presumably so probabilities come back as a plain array --
# confirm against the skl2onnx converter options documentation).
rf_options = {RandomForestClassifier: {"zipmap": False}}
# Only the first row of the training frame is handed to the converter.
onx = to_onnx(pipe, train_data[:1], options=rf_options)
使用 ONNX 進行預測¶
onnxruntime 不支援資料框。
# Load the converted model into an onnxruntime session running on CPU.
onx_bytes = onx.SerializeToString()
sess = InferenceSession(onx_bytes, providers=["CPUExecutionProvider"])

# Passing the dataframe itself as the feed is expected to fail here;
# the error message is printed instead of aborting the script.
try:
    sess.run(None, train_data)
except Exception as err:
    print(err)
# Unhide conversion logic with a dataframe
# ++++++++++++++++++++++++++++++++++++++++
#
# A dataframe can be seen as a set of columns with
# different types. That's what ONNX should see:
# a list of inputs, the input name is the column name,
# the input type is the column type.
def guess_schema_from_data(X):
    """Guess the ONNX input schema for *X*.

    The schema is obtained from ``guess_initial_types(X)`` as a list of
    ``(name, tensor_type)`` pairs. It is returned unchanged as soon as one
    entry is not two-dimensional, has a first dimension other than
    ``None``, or has a different class from the entries before it.

    When every entry passes those checks, the columns are merged into a
    single input named ``"X"`` of that common class, with shape
    ``[None, total]`` where ``total`` is the sum of the second dimensions.
    """
    schema = guess_initial_types(X)

    # Holds the class of the first entry; later entries must match it.
    kinds = []
    for _name, tensor in schema:
        dims = tensor.shape
        if len(dims) != 2 or dims[0] is not None:
            return schema
        kind = type(tensor)
        if not kinds:
            kinds.append(kind)
        elif kind is not kinds[0]:
            return schema

    # Every entry shares one tensor class: collapse them into one input.
    total_width = sum(tensor.shape[1] for _name, tensor in schema)
    return [("X", kinds[0]([None, total_width]))]
# Schema guessed from the dataframe: one (name, type) entry per input.
init = guess_schema_from_data(train_data)
print(pprint.pformat(init))
run(): incompatible function arguments. The following argument types are supported:
1. (self: onnxruntime.capi.onnxruntime_pybind11_state.InferenceSession, arg0: List[str], arg1: Dict[str, object], arg2: onnxruntime.capi.onnxruntime_pybind11_state.RunOptions) -> List[object]
Invoked with: <onnxruntime.capi.onnxruntime_pybind11_state.InferenceSession object at 0x7f029b60e570>, ['label', 'probabilities'], CAT1 CAT2 num1 num2
0 a c 0.50 0.60
1 b d 0.40 0.80
2 a d 0.50 0.56
3 a d 0.55 0.56
4 a c 0.35 0.86
5 a c 0.50 0.68, None
[('CAT1', StringTensorType(shape=[None, 1])),
('CAT2', StringTensorType(shape=[None, 1])),
('num1', DoubleTensorType(shape=[None, 1])),
('num2', DoubleTensorType(shape=[None, 1]))]
讓我們改用浮點數。
# Cast every non-categorical column to float32, then guess the schema again.
numeric_cols = [name for name in train_data.columns if name not in cat_cols]
for name in numeric_cols:
    train_data[name] = train_data[name].astype(numpy.float32)

init = guess_schema_from_data(train_data)
pprint.pprint(init)
[('CAT1', StringTensorType(shape=[None, 1])),
('CAT2', StringTensorType(shape=[None, 1])),
('num1', FloatTensorType(shape=[None, 1])),
('num2', FloatTensorType(shape=[None, 1]))]
讓我們僅使用 skl2onnx 進行轉換。
# Convert again, this time driving the converter with the explicit schema
# in `init` instead of sample data. zipmap stays disabled for the forest.
conversion_options = {RandomForestClassifier: {"zipmap": False}}
onx2 = to_onnx(pipe, initial_types=init, options=conversion_options)
讓我們使用 onnxruntime 執行它。我們需要將資料框轉換為字典,其中欄名稱成為鍵,而欄值成為值。
# Build the onnxruntime feed: one entry per dataframe column, keyed by the
# column name, with the values reshaped into a single-column 2D array.
inputs = {
    name: series.values.reshape(-1, 1)
    for name, series in train_data.items()
}
pprint.pprint(inputs)
{'CAT1': array([['a'],
['b'],
['a'],
['a'],
['a'],
['a']], dtype=object),
'CAT2': array([['c'],
['d'],
['d'],
['d'],
['c'],
['c']], dtype=object),
'num1': array([[0.5 ],
[0.4 ],
[0.5 ],
[0.55],
[0.35],
[0.5 ]], dtype=float32),
'num2': array([[0.6 ],
[0.8 ],
[0.56],
[0.56],
[0.86],
[0.68]], dtype=float32)}
推論。
# Run the second ONNX model on the per-column feed and print its first
# output next to the labels predicted by the scikit-learn pipeline.
onx2_bytes = onx2.SerializeToString()
sess2 = InferenceSession(onx2_bytes, providers=["CPUExecutionProvider"])
got2 = sess2.run(None, inputs)

sklearn_labels = pipe.predict(train_data)
print(sklearn_labels)
print(got2[0])
[0 1 0 1 0 1]
[0 1 0 1 0 1]
以及機率。
# Print the scikit-learn probabilities, then the second ONNX output.
sklearn_proba = pipe.predict_proba(train_data)
onnx_proba = got2[1]
print(sklearn_proba)
print(onnx_proba)
[[0.82 0.18]
[0.26 0.74]
[0.76 0.24]
[0.37 0.63]
[0.75 0.25]
[0.29 0.71]]
[[0.82 0.18 ]
[0.2600004 0.7399996 ]
[0.76 0.24000004]
[0.3700003 0.6299997 ]
[0.75 0.25000003]
[0.29000038 0.7099996 ]]
腳本總執行時間: (0 分鐘 0.412 秒)