
Commit abc3ee7

Autocomplete now makes use of the default-max-batch-size (#120)
* autocomplete now makes use of the default-max-batch-size
* check to determine dynamic batch scheduler after we set the new batch size
* added default-max-batch-size section
* when default_max_batch_size=0, then max_batch_size=0
1 parent e32a32c commit abc3ee7

3 files changed: +83 −6 lines changed

README.md

Lines changed: 21 additions & 0 deletions
@@ -185,8 +185,29 @@ parameters { key: "inter_op_thread_count" value: { string_value: "0" } }
 * `memory.enable_memory_arena_shrinkage`: See [this](https://github.com/microsoft/onnxruntime/blob/master/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h) for more information.

 ### Command line options
+
+#### Thread Pools
+
 When intra and inter op threads are set to 0 or to a value higher than 1, ORT by default creates a threadpool per session. This may not be ideal in every scenario, so ORT also supports global threadpools. When global threadpools are enabled, ORT creates one global threadpool that is shared by every session. Use the backend config to enable the global threadpool. When the global threadpool is enabled, the intra and inter op thread counts should also be provided via backend config; values provided in the model config will be ignored.

 ```
 --backend-config=onnxruntime,enable-global-threadpool=<0,1>, --backend-config=onnxruntime,intra_op_thread_count=<int> , --backend-config=onnxruntime,inter_op_thread_count=<int>
 ```
+
+#### Default Max Batch Size
+
+The default-max-batch-size value is used for max_batch_size during [Autocomplete](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#auto-generated-model-configuration) when no
+other value is found. If the `--strict-model-config=false` command-line
+option is used, the onnxruntime backend will set the max_batch_size
+of the model to this default value under the following conditions:
+
+1. Autocomplete has determined the model is capable of batching requests.
+2. max_batch_size is 0 in the model configuration or max_batch_size
+is omitted from the model configuration.
+
+If max_batch_size > 1 and no [scheduler](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#scheduling-and-batching) is provided, the dynamic batch scheduler will be used.
+
+```
+--backend-config=onnxruntime,default-max-batch-size=<int>
+```
+
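As a usage illustration of the new README section (the model repository path and the value `8` below are made-up examples, not taken from the commit), the server could be started with:

```
tritonserver --model-repository=/models --strict-model-config=false \
    --backend-config=onnxruntime,default-max-batch-size=8
```

For an ONNX model that autocomplete determines can batch and whose config.pbtxt omits max_batch_size, the backend would then autofill `max_batch_size: 8` and, because the value is greater than 1 and no scheduler is specified, add an empty `dynamic_batching {}` block; with `default-max-batch-size=0` (or the flag absent), max_batch_size stays 0.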

src/onnxruntime.cc

Lines changed: 59 additions & 3 deletions
@@ -53,6 +53,13 @@ struct SessionDeleter {
   void operator()(OrtSession* f) { OnnxLoader::UnloadSession(f); }
 };

+// BackendConfiguration
+struct BackendConfiguration {
+  BackendConfiguration() : default_max_batch_size_(0) {}
+
+  int default_max_batch_size_;
+};
+
 //
 // ModelState
 //
@@ -767,19 +774,52 @@ ModelState::AutoCompleteMaxBatch(
   // initialized in the model state.
   if (can_support_batching) {
     if (MaxBatchSize() == 0) {
+      int default_max_batch_size = 0;
+      {
+        TRITONBACKEND_Backend* backend;
+        THROW_IF_BACKEND_INSTANCE_ERROR(
+            TRITONBACKEND_ModelBackend(TritonModel(), &backend));
+        void* state;
+        THROW_IF_BACKEND_INSTANCE_ERROR(
+            TRITONBACKEND_BackendState(backend, &state));
+        default_max_batch_size = reinterpret_cast<BackendConfiguration*>(state)
+                                     ->default_max_batch_size_;
+      }
+      int max_batch_size = std::max(default_max_batch_size, 0);
+
       triton::common::TritonJson::Value mbs_value;
       ModelConfig().Find("max_batch_size", &mbs_value);
-      mbs_value.SetInt(1);
-      SetMaxBatchSize(1);
+      mbs_value.SetInt(max_batch_size);
+      SetMaxBatchSize(max_batch_size);
+
       LOG_MESSAGE(
           TRITONSERVER_LOG_WARN,
-          (std::string("autofilled max_batch_size to 1 for model '") + Name() +
+          (std::string(
+               "autofilled max_batch_size to " +
+               std::to_string(max_batch_size) + " for model '") +
+           Name() +
           "' since batching is supported but no max_batch_size is "
          "specified "
          "in model configuration. Must specify max_batch_size to utilize "
          "autofill with a larger max batch size")
              .c_str());
     }
+
+    // Check to see if we need to turn on dynamic batching
+    // since model supports batching
+    if (MaxBatchSize() > 1) {
+      triton::common::TritonJson::Value value;
+      bool found_sequence_batching =
+          ModelConfig().Find("sequence_batching", &value);
+      bool found_dynamic_batching =
+          ModelConfig().Find("dynamic_batching", &value);
+      if (!found_sequence_batching && !found_dynamic_batching) {
+        triton::common::TritonJson::Value dynamic_batching(
+            ModelConfig(), triton::common::TritonJson::ValueType::OBJECT);
+        ModelConfig().Add("dynamic_batching", std::move(dynamic_batching));
+      }
+    }
+
   } else if (MaxBatchSize() != 0) {
     return TRITONSERVER_ErrorNew(
         TRITONSERVER_ERROR_INVALID_ARG,
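Condensed into a standalone form, the decision this hunk implements for a model that supports batching looks roughly like the sketch below. This is a hedged illustration, not the backend's literal code; the function name `ResolveMaxBatchSize` and its parameters are invented for clarity, and `default_max_batch_size` stands for the value read from the backend state above.

```cpp
#include <algorithm>

// Sketch only: the value autocomplete settles on for a model that supports
// batching, given the model-config value (0 if absent) and the backend-wide
// default-max-batch-size.
int ResolveMaxBatchSize(int configured_mbs, int default_max_batch_size) {
  if (configured_mbs != 0) {
    return configured_mbs;  // an explicit model-config value is left alone
  }
  // No value in the model config: fall back to default-max-batch-size,
  // clamped so that a default of 0 (or a negative value) leaves
  // max_batch_size at 0, i.e. batching stays disabled.
  return std::max(default_max_batch_size, 0);
}
```

The hunk then adds an empty `dynamic_batching` object only when the resolved max_batch_size is greater than 1 and the model configuration names neither `sequence_batching` nor `dynamic_batching`.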
@@ -2484,6 +2524,22 @@ TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend)
   // Onetime initialization for the onnxruntime loader.
   RETURN_IF_ERROR(OnnxLoader::Init(backend_config));

+  std::unique_ptr<BackendConfiguration> lconfig(new BackendConfiguration());
+  triton::common::TritonJson::Value cmdline;
+  if (backend_config.Find("cmdline", &cmdline)) {
+    triton::common::TritonJson::Value value;
+    std::string value_str;
+    if (cmdline.Find("default-max-batch-size", &value)) {
+      RETURN_IF_ERROR(value.AsString(&value_str));
+      int lvalue;
+      RETURN_IF_ERROR(ParseIntValue(value_str, &lvalue));
+      lconfig->default_max_batch_size_ = lvalue;
+    }
+  }
+  RETURN_IF_ERROR(TRITONBACKEND_BackendSetState(
+      backend, reinterpret_cast<void*>(lconfig.get())));
+
+  lconfig.release();
   return nullptr; // success
 }

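The `lconfig.release()` call hands ownership of the `BackendConfiguration` to the backend object once `TRITONBACKEND_BackendSetState` has stored the raw pointer, so the `unique_ptr` must not delete it on scope exit. A matching cleanup would live in `TRITONBACKEND_Finalize`; that hook is not shown in this diff, so the following is only a hedged sketch of what such a counterpart could look like:

```cpp
// Hypothetical counterpart (not part of this diff): reclaim the state that
// TRITONBACKEND_Initialize handed over via TRITONBACKEND_BackendSetState.
TRITONSERVER_Error*
TRITONBACKEND_Finalize(TRITONBACKEND_Backend* backend)
{
  void* state = nullptr;
  RETURN_IF_ERROR(TRITONBACKEND_BackendState(backend, &state));
  delete reinterpret_cast<BackendConfiguration*>(state);
  return nullptr;  // success
}
```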

src/onnxruntime_loader.cc

Lines changed: 3 additions & 3 deletions
@@ -56,9 +56,9 @@ OnnxLoader::Init(common::TritonJson::Value& backend_config)
   OrtLoggingLevel logging_level =
       TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)
           ? ORT_LOGGING_LEVEL_VERBOSE
-          : TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_WARN)
-                ? ORT_LOGGING_LEVEL_WARNING
-                : ORT_LOGGING_LEVEL_ERROR;
+      : TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_WARN)
+          ? ORT_LOGGING_LEVEL_WARNING
+          : ORT_LOGGING_LEVEL_ERROR;

   // Controls whether to enable global threadpool which will be shared across
   // sessions. Use this in conjunction with DisablePerSessionThreads API or
