注意
前往末尾以下載完整的範例程式碼
TfIdf 和稀疏矩陣¶
TfidfVectorizer 通常會建立稀疏資料。如果資料夠稀疏,矩陣通常會沿著管線保持稀疏,直到訓練預測器為止。稀疏矩陣無法區分空值(零)和遺失值,因為兩者都不會儲存在矩陣中。由於某些預測器會區分這兩者,因此當轉換為 ONNX 時,這種模糊性可能會導致差異。此範例探討了幾種組態。
匯入、設定¶
所有匯入。它也註冊了 xgboost 和 lightgbm 的 onnx 轉換器。
import warnings
import numpy
import pandas
import onnxruntime as rt
from tqdm import tqdm
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
try:
from sklearn.ensemble import HistGradientBoostingClassifier
except ImportError:
HistGradientBoostingClassifier = None
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from skl2onnx.common.data_types import FloatTensorType, StringTensorType
from skl2onnx import to_onnx, update_registered_converter
from skl2onnx.sklapi import CastTransformer, ReplaceTransformer
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import convert_lightgbm
# Register the onnxmltools XGBoost converter with skl2onnx under the alias
# "XGBoostXGBClassifier", reusing the linear-classifier shape calculator.
# NOTE(review): the `options` dict presumably lists the values accepted for
# each converter option ("nocl", "zipmap") -- confirm against skl2onnx docs.
update_registered_converter(
XGBClassifier,
"XGBoostXGBClassifier",
calculate_linear_classifier_output_shapes,
convert_xgboost,
options={"nocl": [True, False], "zipmap": [True, False, "columns"]},
)
# Same registration for LightGBM's LGBMClassifier; note that its "zipmap"
# option list does not include "columns".
update_registered_converter(
LGBMClassifier,
"LightGbmLGBMClassifier",
calculate_linear_classifier_output_shapes,
convert_lightgbm,
options={"nocl": [True, False], "zipmap": [True, False]},
)
人工資料集¶
鳶尾花 + 文字欄位。
# Text label associated with each iris class index (0, 1, 2).
cst = ["class zero", "class one", "class two"]

# Keep only the first two numerical features of the iris dataset.
data = load_iris()
X = data.data[:, :2]
y = data.target

# Shuffle the rows *before* building the dataframe so that df, X and y all
# share the same row order. Building df first and shuffling only X and y
# afterwards leaves the targets misaligned with the dataframe rows.
ind = numpy.arange(X.shape[0])
numpy.random.shuffle(ind)
X = X[ind, :].copy()
y = y[ind].copy()

# Columns are named c0, c1; the artificial "text" column is derived from the
# (already shuffled) target.
df = pandas.DataFrame(X)
df.columns = [f"c{c}" for c in df.columns]
df["text"] = [cst[i] for i in y]
稀疏後訓練集成¶
此範例使用鳶尾花資料集,其中包含以 tf-idf 預處理的人工文字資料集。sparse_threshold=1. 避免將稀疏矩陣轉換為密集矩陣。
def make_pipelines(
    df_train,
    y_train,
    models=None,
    sparse_threshold=1.0,
    replace_nan=False,
    insert_replace=False,
):
    """Train one pipeline per model, convert it to ONNX and measure discrepancies.

    Each pipeline scales the two numerical columns, applies
    CountVectorizer + TfidfTransformer to the ``"text"`` column, casts the
    result and fits a classifier on top of it.

    :param df_train: dataframe with columns ``c0``, ``c1`` and ``text``; it is
        used for training and for comparing scikit-learn with onnxruntime
    :param y_train: training labels
    :param models: list of classifier classes, defaults to
        RandomForestClassifier, HistGradientBoostingClassifier,
        XGBClassifier and LGBMClassifier; ``None`` entries are skipped
    :param sparse_threshold: forwarded to ``ColumnTransformer``
    :param replace_nan: if True, passes the option ``{"nan": True}`` to the
        TfidfTransformer converter
    :param insert_replace: if True, appends a ``ReplaceTransformer`` after
        the tf-idf step of the text sub-pipeline
    :return: list of dictionaries, one per model, with keys ``model``,
        ``pipe``, ``model_onnx`` and either ``discrepencies`` (sum of absolute
        differences between the second onnxruntime output and
        ``predict_proba``) or ``error`` (the ``TypeError`` raised by ``fit``)
    """
    if models is None:
        models = [
            RandomForestClassifier,
            HistGradientBoostingClassifier,
            XGBClassifier,
            LGBMClassifier,
        ]
    # HistGradientBoostingClassifier is None when its import failed.
    models = [m for m in models if m is not None]

    pipes = []
    for model in tqdm(models):
        # Keep every ensemble tiny: 5 iterations / estimators.
        if model == HistGradientBoostingClassifier:
            kwargs = dict(max_iter=5)
        elif model == XGBClassifier:
            kwargs = dict(n_estimators=5, use_label_encoder=False)
        else:
            kwargs = dict(n_estimators=5)

        # The two configurations only differ by the optional "repl" step,
        # so the text sub-pipeline is built once and extended if needed.
        text_steps = [
            ("count", CountVectorizer()),
            ("tfidf", TfidfTransformer()),
        ]
        if insert_replace:
            text_steps.append(("repl", ReplaceTransformer()))

        pipe = Pipeline(
            [
                (
                    "union",
                    ColumnTransformer(
                        [
                            ("scale1", StandardScaler(), [0, 1]),
                            ("subject", Pipeline(text_steps), "text"),
                        ],
                        sparse_threshold=sparse_threshold,
                    ),
                ),
                ("cast", CastTransformer()),
                ("cls", model(max_depth=3, **kwargs)),
            ]
        )

        try:
            pipe.fit(df_train, y_train)
        except TypeError as e:
            # The failure is recorded instead of raised so that the other
            # models are still trained and converted.
            obs = dict(model=model.__name__, pipe=pipe, error=e, model_onnx=None)
            pipes.append(obs)
            continue

        options = {model: {"zipmap": False}}
        if replace_nan:
            options[TfidfTransformer] = {"nan": True}

        # convert
        with warnings.catch_warnings(record=False):
            warnings.simplefilter("ignore", (FutureWarning, UserWarning))
            model_onnx = to_onnx(
                pipe,
                initial_types=[
                    ("input", FloatTensorType([None, 2])),
                    ("text", StringTensorType([None, 1])),
                ],
                target_opset={"": 12, "ai.onnx.ml": 2},
                options=options,
            )

        # Serialize once; the same bytes are saved and loaded by onnxruntime.
        # "model.onnx" is overwritten at every iteration.
        serialized = model_onnx.SerializeToString()
        with open("model.onnx", "wb") as f:
            f.write(serialized)

        sess = rt.InferenceSession(serialized, providers=["CPUExecutionProvider"])

        # Evaluate on the data received as argument, not on a module-level
        # dataframe, so the function works with any caller-supplied data.
        inputs = {
            "input": df_train[["c0", "c1"]].values.astype(numpy.float32),
            "text": df_train[["text"]].values,
        }
        pred_onx = sess.run(None, inputs)

        expected = pipe.predict_proba(df_train)
        diff = numpy.abs(pred_onx[1].ravel() - expected.ravel()).sum()

        obs = dict(
            model=model.__name__, discrepencies=diff, model_onnx=model_onnx, pipe=pipe
        )
        pipes.append(obs)

    return pipes
# Train and convert every model with the default settings
# (sparse_threshold=1.0), then summarize the results in a dataframe.
data_sparse = make_pipelines(df, y)
stat = pandas.DataFrame(data_sparse).drop(columns=["model_onnx", "pipe"])
if "error" in stat.columns:
    # The exception objects are hidden in the printed version.
    print(stat.drop(columns=["error"]))
stat
0%| | 0/4 [00:00<?, ?it/s]
75%|███████▌ | 3/4 [00:00<00:00, 13.14it/s][LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002602 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53
[LightGBM] [Info] Number of data points in the train set: 150, number of used features: 5
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
100%|██████████| 4/4 [00:00<00:00, 11.75it/s]
model discrepencies
0 RandomForestClassifier 0.700004
1 HistGradientBoostingClassifier NaN
2 XGBClassifier 28.331459
3 LGBMClassifier 0.000009
稀疏資料會造成損害。
密集資料¶
讓我們使用 sparse_threshold=0. 將稀疏資料替換為密集資料。
# Same experiment with sparse_threshold=0.0 passed to the ColumnTransformer.
data_dense = make_pipelines(df, y, sparse_threshold=0.0)
stat = pandas.DataFrame(data_dense).drop(columns=["model_onnx", "pipe"])
if "error" in stat.columns:
    # The exception objects are hidden in the printed version.
    print(stat.drop(columns=["error"]))
stat
0%| | 0/4 [00:00<?, ?it/s]
50%|█████ | 2/4 [00:00<00:00, 17.40it/s][LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000029 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53
[LightGBM] [Info] Number of data points in the train set: 150, number of used features: 5
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
100%|██████████| 4/4 [00:00<00:00, 20.51it/s]
這樣好多了。讓我們比較一下預處理如何套用在資料上。
# Show the first two preprocessed rows produced by the "union" step (the
# first step of the pipeline) of the last trained pipeline of each run.
print("sparse")
print(data_sparse[-1]["pipe"].named_steps["union"].transform(df)[:2])
print()
print("dense")
print(data_dense[-1]["pipe"].named_steps["union"].transform(df)[:2])
sparse
(0, 0) -0.9006811702978088
(0, 1) 1.019004351971607
(0, 2) 0.4323732931220851
(0, 5) 0.9016947018779491
(1, 0) -1.1430169111851105
(1, 1) -0.13197947932162468
(1, 2) 0.4323732931220851
(1, 5) 0.9016947018779491
dense
[[-0.90068117 1.01900435 0.43237329 0. 0. 0.9016947 ]
[-1.14301691 -0.13197948 0.43237329 0. 0. 0.9016947 ]]
這顯示 RandomForestClassifier、XGBClassifier 不會像 LGBMClassifier 那樣以相同的方式處理稀疏和密集矩陣。而且 HistGradientBoostingClassifier 會失敗。
具有 nan 的密集資料¶
讓我們在 scikit-learn 管線中保留稀疏資料,但在 onnx 圖中將空值替換為 nan。
# Keep sparse_threshold=1.0 in scikit-learn but enable the "nan" option of
# the TfidfTransformer converter (replace_nan=True).
data_dense = make_pipelines(df, y, sparse_threshold=1.0, replace_nan=True)
stat = pandas.DataFrame(data_dense).drop(columns=["model_onnx", "pipe"])
if "error" in stat.columns:
    # The exception objects are hidden in the printed version.
    print(stat.drop(columns=["error"]))
stat
0%| | 0/4 [00:00<?, ?it/s][LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000032 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53
[LightGBM] [Info] Number of data points in the train set: 150, number of used features: 5
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
100%|██████████| 4/4 [00:00<00:00, 29.40it/s]
100%|██████████| 4/4 [00:00<00:00, 29.34it/s]
model discrepencies
0 RandomForestClassifier 40.908293
1 HistGradientBoostingClassifier NaN
2 XGBClassifier 2.967510
3 LGBMClassifier 0.000009
密集,0 已替換為 nan¶
與其使用特定選項將空值替換為 nan 值,不如將名為 ReplaceTransformer 的自訂轉換器明確插入管線。新的轉換器已新增至支援的模型清單中。它與先前的選項等效,只是更明確。
# Same settings, but a ReplaceTransformer is inserted in the pipeline
# (insert_replace=True) instead of using the converter option.
data_dense = make_pipelines(
    df,
    y,
    sparse_threshold=1.0,
    replace_nan=False,
    insert_replace=True,
)
stat = pandas.DataFrame(data_dense).drop(columns=["model_onnx", "pipe"])
if "error" in stat.columns:
    # The exception objects are hidden in the printed version.
    print(stat.drop(columns=["error"]))
stat
0%| | 0/4 [00:00<?, ?it/s][LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000024 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53
[LightGBM] [Info] Number of data points in the train set: 150, number of used features: 5
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
100%|██████████| 4/4 [00:00<00:00, 36.10it/s]
100%|██████████| 4/4 [00:00<00:00, 36.01it/s]
model discrepencies
0 RandomForestClassifier 41.729285
1 HistGradientBoostingClassifier NaN
2 XGBClassifier 2.967510
3 LGBMClassifier 0.000009
結論¶
除非使用密集陣列,否則由於 onnxruntime ONNX 尚不支援稀疏資料,因此需要根據接在 TfIdf 預處理之後的模型來調整轉換方式。
腳本的總執行時間: (0 分鐘 0.992 秒)