@@ -471,7 +471,34 @@ void TensorrtllmEngine::LoadModel(std::shared_ptr<Json::Value> json_body, std::f
471471
472472 // Init gpt_session
473473 auto model_path = model_dir / json.engineFilename(world_config, model_id_);
474- gpt_session = std::make_unique<GptSession>(session_config_, *model_config_, world_config, model_path.string(), logger_);
474+ try {
475+ gpt_session = std::make_unique<GptSession>(session_config_, *model_config_, world_config, model_path.string(), logger_);
476+ } catch (const std::exception& e) {
477+ LOG_ERROR << "Failed to load model: " << e.what();
478+ LOG_INFO << "Retry once with smaller maxSequenceLength";
479+ gpt_session.reset();
480+ // Retry again with smaller maxSequenceLength once
481+ session_config_.maxSequenceLength /= 2;
482+ try {
483+ gpt_session = std::make_unique<GptSession>(session_config_, *model_config_, world_config, model_path.string(), logger_);
484+ } catch (const std::exception& e) {
485+ LOG_ERROR << "Failed to load model: " << e.what();
486+ gpt_session.reset();
487+ cortex_tokenizer.reset();
488+ q_.reset();
489+ model_config_.reset();
490+ logger_.reset();
491+ Json::Value json_resp;
492+ json_resp["message"] = "Failed to load model";
493+ Json::Value status;
494+ status["is_done"] = false;
495+ status["has_error"] = true;
496+ status["is_stream"] = false;
497+ status["status_code"] = k500InternalServerError;
498+ callback(std::move(status), std::move(json_resp));
499+ return;
500+ }
501+ }
475502
476503 model_loaded_ = true;
477504 if (q_ == nullptr) {
0 commit comments