@@ -123,12 +123,12 @@ GenerationInput::TensorPtr TensorrtllmEngine::GetTensorChatMLStopWordList() {
 
 GenerationInput TensorrtllmEngine::CreateGenerationInput(std::vector<int32_t> input_ids_host) {
   int input_len = input_ids_host.size();
-  std::vector<int32_t> input_lengths_host(batchSize, input_len);
+  std::vector<int32_t> input_lengths_host(batch_size_, input_len);
   GenerationInput::TensorPtr input_lengths
-      = gpt_session->getBufferManager().copyFrom(input_lengths_host, ITensor::makeShape({batchSize}), MemoryType::kGPU);
+      = gpt_session->getBufferManager().copyFrom(input_lengths_host, ITensor::makeShape({batch_size_}), MemoryType::kGPU);
   GenerationInput::TensorPtr input_ids = gpt_session->getBufferManager().copyFrom(
-      input_ids_host, ITensor::makeShape({batchSize, input_len}), MemoryType::kGPU);
-  GenerationInput generation_input{0, 0, input_ids, input_lengths, model_config->usePackedInput()};
+      input_ids_host, ITensor::makeShape({batch_size_, input_len}), MemoryType::kGPU);
+  GenerationInput generation_input{0, 0, input_ids, input_lengths, model_config_->usePackedInput()};
   generation_input.stopWordsList = GetTensorChatMLStopWordList();
 
   LOG_INFO << "Create generation input successfully";
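For context, the rename assumes matching member declarations in the engine's header. A minimal sketch of how those members might be declared after this change; the types of gpt_session, model_config_, and logger_ follow their use in this diff, while everything else (defaults, initializer values) is an assumption, not part of this commit:

// Hypothetical header excerpt; trailing underscores mark private members
// per the renaming applied in this diff.
class TensorrtllmEngine {
 private:
  std::unique_ptr<Tokenizer> cortex_tokenizer;        // unchanged in this commit
  std::unique_ptr<GptSession> gpt_session;            // unchanged in this commit
  std::unique_ptr<GptModelConfig> model_config_;      // was model_config
  std::shared_ptr<TllmLogger> logger_;                // was logger
  GptSession::Config session_config_{1, 1, 2048};     // was session_config; defaults assumed
  std::string pre_prompt_, system_prompt_, user_prompt_, ai_prompt_;  // were *_prompt
  std::string model_id_;
  int batch_size_ = 1;                                // was batchSize; value assumed
  bool is_openhermes_ = true;
  bool model_loaded_ = false;
};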
@@ -249,7 +249,7 @@ bool TensorrtllmEngine::CheckModelLoaded(std::function<void(Json::Value&&, Json:
 
 void TensorrtllmEngine::HandleChatCompletion(std::shared_ptr<Json::Value> json_body, std::function<void(Json::Value&&, Json::Value&&)>&& callback) {
   inferences::ChatCompletionRequest request = inferences::fromJson(json_body);
-  std::string formatted_input = pre_prompt;
+  std::string formatted_input = pre_prompt_;
   nlohmann::json data;
   // data["stream"] = completion.stream;
   // data["n_predict"] = completion.max_tokens;
@@ -261,17 +261,17 @@ void TensorrtllmEngine::HandleChatCompletion(std::shared_ptr<Json::Value> json_b
     std::string input_role = message["role"].asString();
     std::string role;
     if (input_role == "user") {
-      role = user_prompt;
+      role = user_prompt_;
       std::string content = message["content"].asString();
       formatted_input += role + content;
     }
     else if (input_role == "assistant") {
-      role = ai_prompt;
+      role = ai_prompt_;
       std::string content = message["content"].asString();
       formatted_input += role + content;
     }
     else if (input_role == "system") {
-      role = system_prompt;
+      role = system_prompt_;
       std::string content = message["content"].asString();
       formatted_input = role + content + formatted_input;
     }
@@ -281,7 +281,7 @@ void TensorrtllmEngine::HandleChatCompletion(std::shared_ptr<Json::Value> json_b
       formatted_input += role + content;
     }
   }
-  formatted_input += ai_prompt;
+  formatted_input += ai_prompt_;
   // LOG_INFO << formatted_input;
   // Format the input from user
 
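For readers unfamiliar with the flattening above: each message becomes its role marker followed by its content, system turns are prepended, and ai_prompt_ is appended last so the model answers the open assistant turn. A sketch with assumed ChatML-style markers; the real values come from GetSystemPrompt/GetUserPrompt/GetAiPrompt or the request overrides, neither of which appears in this diff:

// Illustrative only; marker strings are assumptions, not taken from this commit.
const std::string system_prompt_ = "<|im_start|>system\n";
const std::string user_prompt_   = "<|im_end|>\n<|im_start|>user\n";
const std::string ai_prompt_     = "<|im_end|>\n<|im_start|>assistant\n";

// Result of the loop for a {system, user} request:
std::string formatted_input =
    system_prompt_ + "You are a helpful assistant."  // system content is prepended
    + user_prompt_ + "Hello!"                        // user turn: marker + content
    + ai_prompt_;                                    // open assistant turn for generation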
@@ -366,14 +366,14 @@ void TensorrtllmEngine::LoadModel(std::shared_ptr<Json::Value> json_body, std::f
   is_openhermes_ = IsOpenhermes(request.model_path);
 
   int ctx_len = request.ctx_len;
-  this->user_prompt = request.user_prompt.empty() ? GetUserPrompt(is_openhermes_) : request.user_prompt;
-  this->ai_prompt = request.ai_prompt.empty() ? GetAiPrompt(is_openhermes_) : request.ai_prompt;
-  this->system_prompt = request.system_prompt.empty() ? GetSystemPrompt(is_openhermes_) : request.system_prompt;
-  this->model_id_ = GetModelId(*json_body);
+  user_prompt_ = request.user_prompt.empty() ? GetUserPrompt(is_openhermes_) : request.user_prompt;
+  ai_prompt_ = request.ai_prompt.empty() ? GetAiPrompt(is_openhermes_) : request.ai_prompt;
+  system_prompt_ = request.system_prompt.empty() ? GetSystemPrompt(is_openhermes_) : request.system_prompt;
+  model_id_ = GetModelId(*json_body);
 
-  logger = std::make_shared<TllmLogger>();
-  logger->setLevel(nvinfer1::ILogger::Severity::kINFO);
-  initTrtLlmPlugins(logger.get());
+  logger_ = std::make_shared<TllmLogger>();
+  logger_->setLevel(nvinfer1::ILogger::Severity::kINFO);
+  initTrtLlmPlugins(logger_.get());
 
   std::filesystem::path tokenizer_model_name = model_dir / "tokenizer.model";
   cortex_tokenizer = std::make_unique<Tokenizer>(tokenizer_model_name.string());
@@ -382,20 +382,20 @@ void TensorrtllmEngine::LoadModel(std::shared_ptr<Json::Value> json_body, std::f
   std::filesystem::path json_file_name = model_dir / "config.json";
   auto json = GptJsonConfig::parse(json_file_name);
   auto config = json.getModelConfig();
-  model_config = std::make_unique<GptModelConfig>(config);
+  model_config_ = std::make_unique<GptModelConfig>(config);
   auto world_config = WorldConfig::mpi(1, json.getTensorParallelism(), json.getPipelineParallelism());
   LOG_INFO << "Loaded config from " << json_file_name.string();
   // auto dtype = model_config->getDataType();
 
   // Currently doing fixed session config
-  session_config.maxBatchSize = batchSize;
-  session_config.maxBeamWidth = 1;  // Fixed for simplicity
-  session_config.maxSequenceLength = ctx_len;
-  session_config.cudaGraphMode = true;  // Fixed for simplicity
+  session_config_.maxBatchSize = batch_size_;
+  session_config_.maxBeamWidth = 1;  // Fixed for simplicity
+  session_config_.maxSequenceLength = ctx_len;
+  session_config_.cudaGraphMode = true;  // Fixed for simplicity
 
   // Init gpt_session
   auto model_path = model_dir / json.engineFilename(world_config, model_id_);
-  gpt_session = std::make_unique<GptSession>(session_config, *model_config, world_config, model_path.string(), logger);
+  gpt_session = std::make_unique<GptSession>(session_config_, *model_config_, world_config, model_path.string(), logger_);
 
   model_loaded_ = true;
   if (q_ == nullptr) {
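A hedged caller-side sketch of exercising LoadModel once the renamed members are in place; the JSON keys mirror the request fields read above (model_path, ctx_len, and optional prompt overrides), but the exact key names and the default-constructibility of the engine are assumptions:

#include <json/value.h>
#include <memory>

// Hypothetical usage sketch, not a documented API.
TensorrtllmEngine engine;
auto body = std::make_shared<Json::Value>();
(*body)["model_path"] = "/models/openhermes-2.5-7b-trt-llm";  // dir with tokenizer.model, config.json, engine file
(*body)["ctx_len"] = 2048;                                    // becomes session_config_.maxSequenceLength
engine.LoadModel(body, [](Json::Value&& status, Json::Value&& result) {
  // callback signature matches the std::function<void(Json::Value&&, Json::Value&&)> used above
});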
@@ -427,8 +427,8 @@ void TensorrtllmEngine::UnloadModel(std::shared_ptr<Json::Value> json_body, std:
   gpt_session.reset();
   cortex_tokenizer.reset();
   q_.reset();
-  model_config.reset();
-  logger.reset();
+  model_config_.reset();
+  logger_.reset();
   model_loaded_ = false;
 
   Json::Value json_resp;