
Commit abc3ee7

Autocomplete now makes use of the default-max-batch-size (#120)
* autocomplete now makes use of the default-max-batch-size
* check to determine dynamic batch scheduler after we set the new batch size
* added default-max-batch-size section
* when default_max_batch_size=0, then max_batch_size=0
1 parent e32a32c commit abc3ee7

3 files changed: +83 −6 lines changed

README.md

Lines changed: 21 additions & 0 deletions
@@ -185,8 +185,29 @@ parameters { key: "inter_op_thread_count" value: { string_value: "0" } }
 * `memory.enable_memory_arena_shrinkage`: See [this](https://github.com/microsoft/onnxruntime/blob/master/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h) for more information.

 ### Command line options
+
+#### Thread Pools
+
 When intra and inter op threads are set to 0 or to a value higher than 1, ORT by default creates a threadpool per session. This may not be ideal in every scenario, so ORT also supports global threadpools. When global threadpools are enabled, ORT creates one global threadpool that is shared by every session. Use the backend config to enable the global threadpool. When the global threadpool is enabled, the intra and inter op thread counts should also be provided via backend config; values provided in the model config will be ignored.

 ```
 --backend-config=onnxruntime,enable-global-threadpool=<0,1>, --backend-config=onnxruntime,intra_op_thread_count=<int> , --backend-config=onnxruntime,inter_op_thread_count=<int>
 ```
+
+#### Default Max Batch Size
+
+The default-max-batch-size value is used for max_batch_size during [Autocomplete](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#auto-generated-model-configuration) when no
+other value is found. If the `--strict-model-config=false` command-line
+option is used, the onnxruntime backend will set the max_batch_size
+of the model to this default value under the following conditions:
+
+1. Autocomplete has determined the model is capable of batching requests.
+2. max_batch_size is 0 in the model configuration or max_batch_size
+is omitted from the model configuration.
+
+If max_batch_size > 1 and no [scheduler](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#scheduling-and-batching) is provided, the dynamic batch scheduler will be used.
+
+```
+--backend-config=onnxruntime,default-max-batch-size=<int>
+```
+
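As a usage illustration of the new README section (the model repository path and the value `8` below are made-up examples, not taken from the commit), the server could be started with:

```
tritonserver --model-repository=/models --strict-model-config=false \
    --backend-config=onnxruntime,default-max-batch-size=8
```

For an ONNX model that autocomplete determines can batch and whose config.pbtxt omits max_batch_size, the backend would then autofill `max_batch_size: 8` and, because the value is greater than 1 and no scheduler is specified, add an empty `dynamic_batching {}` block; with `default-max-batch-size=0` (or the flag absent), max_batch_size stays 0.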

src/onnxruntime.cc

Lines changed: 59 additions & 3 deletions
@@ -53,6 +53,13 @@ struct SessionDeleter {
   void operator()(OrtSession* f) { OnnxLoader::UnloadSession(f); }
 };

+// BackendConfiguration
+struct BackendConfiguration {
+  BackendConfiguration() : default_max_batch_size_(0) {}
+
+  int default_max_batch_size_;
+};
+
 //
 // ModelState
 //
@@ -767,19 +774,52 @@ ModelState::AutoCompleteMaxBatch(
   // initialized in the model state.
   if (can_support_batching) {
     if (MaxBatchSize() == 0) {
+      int default_max_batch_size = 0;
+      {
+        TRITONBACKEND_Backend* backend;
+        THROW_IF_BACKEND_INSTANCE_ERROR(
+            TRITONBACKEND_ModelBackend(TritonModel(), &backend));
+        void* state;
+        THROW_IF_BACKEND_INSTANCE_ERROR(
+            TRITONBACKEND_BackendState(backend, &state));
+        default_max_batch_size = reinterpret_cast<BackendConfiguration*>(state)
+                                     ->default_max_batch_size_;
+      }
+      int max_batch_size = std::max(default_max_batch_size, 0);
+
       triton::common::TritonJson::Value mbs_value;
       ModelConfig().Find("max_batch_size", &mbs_value);
-      mbs_value.SetInt(1);
-      SetMaxBatchSize(1);
+      mbs_value.SetInt(max_batch_size);
+      SetMaxBatchSize(max_batch_size);
+
       LOG_MESSAGE(
           TRITONSERVER_LOG_WARN,
-          (std::string("autofilled max_batch_size to 1 for model '") + Name() +
+          (std::string(
+               "autofilled max_batch_size to " +
+               std::to_string(max_batch_size) + " for model '") +
+           Name() +
           "' since batching is supported but no max_batch_size is "
          "specified "
          "in model configuration. Must specify max_batch_size to utilize "
          "autofill with a larger max batch size")
              .c_str());
     }
+
+    // Check to see if we need to turn on dynamic batching
+    // since model supports batching
+    if (MaxBatchSize() > 1) {
+      triton::common::TritonJson::Value value;
+      bool found_sequence_batching =
+          ModelConfig().Find("sequence_batching", &value);
+      bool found_dynamic_batching =
+          ModelConfig().Find("dynamic_batching", &value);
+      if (!found_sequence_batching && !found_dynamic_batching) {
+        triton::common::TritonJson::Value dynamic_batching(
+            ModelConfig(), triton::common::TritonJson::ValueType::OBJECT);
+        ModelConfig().Add("dynamic_batching", std::move(dynamic_batching));
+      }
+    }
+
   } else if (MaxBatchSize() != 0) {
     return TRITONSERVER_ErrorNew(
         TRITONSERVER_ERROR_INVALID_ARG,
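Condensed into a standalone form, the decision this hunk implements for a model that supports batching looks roughly like the sketch below. This is a hedged illustration, not the backend's literal code; the function name `ResolveMaxBatchSize` and its parameters are invented for clarity, and `default_max_batch_size` stands for the value read from the backend state above.

```cpp
#include <algorithm>

// Sketch only: the value autocomplete settles on for a model that supports
// batching, given the model-config value (0 if absent) and the backend-wide
// default-max-batch-size.
int ResolveMaxBatchSize(int configured_mbs, int default_max_batch_size) {
  if (configured_mbs != 0) {
    return configured_mbs;  // an explicit model-config value is left alone
  }
  // No value in the model config: fall back to default-max-batch-size,
  // clamped so that a default of 0 (or a negative value) leaves
  // max_batch_size at 0, i.e. batching stays disabled.
  return std::max(default_max_batch_size, 0);
}
```

The hunk then adds an empty `dynamic_batching` object only when the resolved max_batch_size is greater than 1 and the model configuration names neither `sequence_batching` nor `dynamic_batching`.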
@@ -2484,6 +2524,22 @@ TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend)
   // Onetime initialization for the onnxruntime loader.
   RETURN_IF_ERROR(OnnxLoader::Init(backend_config));

+  std::unique_ptr<BackendConfiguration> lconfig(new BackendConfiguration());
+  triton::common::TritonJson::Value cmdline;
+  if (backend_config.Find("cmdline", &cmdline)) {
+    triton::common::TritonJson::Value value;
+    std::string value_str;
+    if (cmdline.Find("default-max-batch-size", &value)) {
+      RETURN_IF_ERROR(value.AsString(&value_str));
+      int lvalue;
+      RETURN_IF_ERROR(ParseIntValue(value_str, &lvalue));
+      lconfig->default_max_batch_size_ = lvalue;
+    }
+  }
+  RETURN_IF_ERROR(TRITONBACKEND_BackendSetState(
+      backend, reinterpret_cast<void*>(lconfig.get())));
+
+  lconfig.release();
   return nullptr; // success
 }

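The `lconfig.release()` call hands ownership of the `BackendConfiguration` to the backend object once `TRITONBACKEND_BackendSetState` has stored the raw pointer, so the `unique_ptr` must not delete it on scope exit. A matching cleanup would live in `TRITONBACKEND_Finalize`; that hook is not shown in this diff, so the following is only a hedged sketch of what such a counterpart could look like:

```cpp
// Hypothetical counterpart (not part of this diff): reclaim the state that
// TRITONBACKEND_Initialize handed over via TRITONBACKEND_BackendSetState.
TRITONSERVER_Error*
TRITONBACKEND_Finalize(TRITONBACKEND_Backend* backend)
{
  void* state = nullptr;
  RETURN_IF_ERROR(TRITONBACKEND_BackendState(backend, &state));
  delete reinterpret_cast<BackendConfiguration*>(state);
  return nullptr;  // success
}
```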

src/onnxruntime_loader.cc

Lines changed: 3 additions & 3 deletions
@@ -56,9 +56,9 @@ OnnxLoader::Init(common::TritonJson::Value& backend_config)
   OrtLoggingLevel logging_level =
       TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)
           ? ORT_LOGGING_LEVEL_VERBOSE
-          : TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_WARN)
-                ? ORT_LOGGING_LEVEL_WARNING
-                : ORT_LOGGING_LEVEL_ERROR;
+      : TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_WARN)
+          ? ORT_LOGGING_LEVEL_WARNING
+          : ORT_LOGGING_LEVEL_ERROR;

   // Controls whether to enable global threadpool which will be shared across
   // sessions. Use this in conjunction with DisablePerSessionThreads API or
