Fix L0_infer_cudashm failure (#108)

Tabrizian · web-flow · commit 90c09650d190 · 2022-03-22T19:11:27.000-04:00
* Fix L0_infer_cudashm failure

* Review edit
diff --git a/src/onnxruntime.cc b/src/onnxruntime.cc
@@ -1576,8 +1576,7 @@ ModelInstanceState::ProcessRequests(
                    .c_str()));
         } else if (iit->second.type_ != ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING) {
           // Query the memory type of destination output buffer. Bind the
-          // output
-          // to this destination memory type. The destination memory type
+          // output to this destination memory type. The destination memory type
           // for an output for all requests should be same. So use any request
           // for this query.
           memory_type = preferred_memory_type;
@@ -1598,6 +1597,13 @@ ModelInstanceState::ProcessRequests(
             memory_type_id = 0;
           }
         }
+
+        // If the cuda allocator is not set, bind the output to CPU.
+        if (cuda_allocator_info_ == nullptr) {
+          memory_type = TRITONSERVER_MEMORY_CPU;
+          memory_type_id = 0;
+        }
+
         // finally save the derived mem type and device id as we need it for
         // reading the outputs.
         output_device_info_.insert(
@@ -1775,13 +1781,12 @@ ModelInstanceState::SetInputTensors(
       std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>>
           allowed_input_types;
       if (Kind() == TRITONSERVER_INSTANCEGROUPKIND_GPU) {
-        allowed_input_types = {
-            {TRITONSERVER_MEMORY_GPU, DeviceId()},
-            {TRITONSERVER_MEMORY_CPU_PINNED, 0},
-            {TRITONSERVER_MEMORY_CPU, 0}};
+        allowed_input_types = {{TRITONSERVER_MEMORY_GPU, DeviceId()},
+                               {TRITONSERVER_MEMORY_CPU_PINNED, 0},
+                               {TRITONSERVER_MEMORY_CPU, 0}};
       } else {
-        allowed_input_types = {
-            {TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}};
+        allowed_input_types = {{TRITONSERVER_MEMORY_CPU_PINNED, 0},
+                               {TRITONSERVER_MEMORY_CPU, 0}};
       }
 
       RETURN_IF_ERROR(collector->ProcessTensor(