
Commit e459388

ewilliams-cloudera, jkwatson, and baasitsharief authored
misc. enhancements (#294)
* better startup script, maybe
* Kill the java process if the startup script is killed.
* 3000 tries for python
* don't make everything need to work
* wip for providing nonstreaming - still no tool calling non-streaming
* fix non-streaming tool calling
* wip on adding loading state for non-streaming
* mypy
* ui linting
* add gpt-5-chat to azure
* remove gpt-5-chat for azure
* remove prints
* Update dependencies and enhance tool calling functionality with non-streaming support
* ruff
* mypy
* switch to disabling streaming, rather than enabling
* bump bedrock converse versions
* mypy
* bits of cleanup when reviewing the code
* bump versions of docling, remove llama-index base dependency, and revert llama index bedrock version
* bump version of llama-index, refactor streaming event, fix bug with default use streaming value

Co-authored-by: jwatson <jkwatson@gmail.com>
Co-authored-by: Baasit Sharief <baasitsharief@gmail.com>
1 parent ebf1f4b commit e459388

File tree

22 files changed: +3086 -2718 lines changed


backend/src/main/java/com/cloudera/cai/rag/Types.java

Lines changed: 2 additions & 0 deletions
@@ -103,10 +103,12 @@ public record RagDataSource(
       Long associatedSessionId) {}

   @With
+  @Builder
   public record QueryConfiguration(
       boolean enableHyde,
       boolean enableSummaryFilter,
       boolean enableToolCalling,
+      Boolean disableStreaming,
       List<String> selectedTools) {}

   @With

backend/src/main/java/com/cloudera/cai/rag/sessions/SessionRepository.java

Lines changed: 13 additions & 2 deletions
@@ -43,6 +43,7 @@
 import com.cloudera.cai.rag.configuration.JdbiConfiguration;
 import com.cloudera.cai.util.exceptions.NotFound;
 import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.DeserializationFeature;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import java.time.Instant;
 import java.util.List;
@@ -57,9 +58,16 @@
 @Component
 public class SessionRepository {
   public static final Types.QueryConfiguration DEFAULT_QUERY_CONFIGURATION =
-      new Types.QueryConfiguration(false, true, false, List.of());
+      Types.QueryConfiguration.builder()
+          .enableHyde(false)
+          .enableSummaryFilter(true)
+          .enableToolCalling(false)
+          .disableStreaming(true)
+          .selectedTools(List.of())
+          .build();
   private final DatabaseOperations databaseOperations;
-  private final ObjectMapper objectMapper = new ObjectMapper();
+  private final ObjectMapper objectMapper =
+      new ObjectMapper().disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES);

   public SessionRepository(DatabaseOperations databaseOperations) {
     this.databaseOperations = databaseOperations;
@@ -177,6 +185,9 @@ private Types.QueryConfiguration extractQueryConfiguration(RowView rowView)
     if (queryConfiguration.selectedTools() == null) {
       queryConfiguration = queryConfiguration.withSelectedTools(List.of());
     }
+    if (queryConfiguration.disableStreaming() == null) {
+      queryConfiguration = queryConfiguration.withDisableStreaming(false);
+    }
     return queryConfiguration;
   }
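The two changes above make stored query-configuration JSON tolerant in both directions: unknown keys are ignored on read (FAIL_ON_UNKNOWN_PROPERTIES disabled), and rows written before the new field existed are backfilled with a default. A minimal Python sketch of the same tolerant-read pattern, assuming a hypothetical QueryConfiguration dataclass (not the project's actual model):

import json
from dataclasses import dataclass, field, fields


@dataclass
class QueryConfiguration:
    enable_hyde: bool = False
    enable_summary_filter: bool = True
    enable_tool_calling: bool = False
    disable_streaming: bool = False
    selected_tools: list[str] = field(default_factory=list)


def from_json(raw: str) -> QueryConfiguration:
    data = json.loads(raw)
    known = {f.name for f in fields(QueryConfiguration)}
    # Drop unknown keys (the FAIL_ON_UNKNOWN_PROPERTIES analogue);
    # missing keys fall back to the dataclass defaults (the backfill analogue).
    return QueryConfiguration(**{k: v for k, v in data.items() if k in known})


# A row written by an older or newer version of the app still parses:
print(from_json('{"enable_hyde": true, "some_future_flag": 1}').disable_streaming)  # False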

backend/src/test/java/com/cloudera/cai/rag/TestData.java

Lines changed: 13 additions & 2 deletions
@@ -83,7 +83,12 @@ public static Types.Session createTestSessionInstance(
         null,
         "test-rerank-model",
         3,
-        new Types.QueryConfiguration(false, true, true, List.of()));
+        Types.QueryConfiguration.builder()
+            .enableSummaryFilter(true)
+            .enableToolCalling(true)
+            .disableStreaming(true)
+            .selectedTools(List.of())
+            .build());
   }

   public static Types.CreateSession createSessionInstance(String sessionName) {
@@ -99,7 +104,13 @@ public static Types.CreateSession createSessionInstance(
         null,
         "test-rerank-model",
         3,
-        new Types.QueryConfiguration(false, true, true, List.of()),
+        Types.QueryConfiguration.builder()
+            .enableHyde(false)
+            .enableSummaryFilter(true)
+            .enableToolCalling(true)
+            .disableStreaming(true)
+            .selectedTools(List.of())
+            .build(),
         projectId);
   }

backend/src/test/java/com/cloudera/cai/rag/sessions/SessionControllerTest.java

Lines changed: 10 additions & 4 deletions
@@ -137,6 +137,14 @@ void update() {
     var updatedRerankModel = "new-rerank-model";
     var updatedProjectId = project.id();

+    var queryConfiguration =
+        Types.QueryConfiguration.builder()
+            .enableHyde(true)
+            .enableSummaryFilter(false)
+            .enableToolCalling(true)
+            .disableStreaming(true)
+            .selectedTools(List.of("foo"))
+            .build();
     var updatedSession =
         sessionController.update(
             insertedSession
@@ -145,8 +153,7 @@ void update() {
                 .withRerankModel(updatedRerankModel)
                 .withName(updatedName)
                 .withProjectId(updatedProjectId)
-                .withQueryConfiguration(
-                    new Types.QueryConfiguration(true, false, true, List.of("foo"))),
+                .withQueryConfiguration(queryConfiguration),
             request);

     assertThat(updatedSession.id()).isNotNull();
@@ -160,8 +167,7 @@ void update() {
     assertThat(updatedSession.timeUpdated()).isAfter(insertedSession.timeUpdated());
     assertThat(updatedSession.createdById()).isEqualTo("test-user");
     assertThat(updatedSession.lastInteractionTime()).isNull();
-    assertThat(updatedSession.queryConfiguration())
-        .isEqualTo(new Types.QueryConfiguration(true, false, true, List.of("foo")));
+    assertThat(updatedSession.queryConfiguration()).isEqualTo(queryConfiguration);
   }

   @Test

llm-service/app/routers/index/sessions/__init__.py

Lines changed: 12 additions & 0 deletions
@@ -301,6 +301,18 @@ def generate_stream() -> Generator[str, None, None]:

     first_message = True
     stream = future.result()
+
+    # If streaming is disabled, immediately send a loading event to show StreamedEvents
+    if session.query_configuration.disable_streaming:
+        loading = ChatEvent(
+            type="thinking",
+            name="thinking",
+            timestamp=time.time(),
+            data="Preparing full response...",
+        )
+        event_json = json.dumps({"event": loading.model_dump()})
+        yield f"data: {event_json}\n\n"
+        first_message = False
     for item in stream:
         response: ChatResponse = item
         # Check for cancellation between each response
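The synthetic "thinking" event above goes out as a standard server-sent-events frame (a "data: ..." line with a JSON payload, terminated by a blank line), so the UI gets immediate feedback even though the real answer arrives later in one piece. A minimal sketch of consuming such frames, assuming an iterable of raw response lines; the payloads shown are illustrative:

import json
from typing import Any, Iterable, Iterator


def read_sse_events(lines: Iterable[str]) -> Iterator[dict[str, Any]]:
    # Yield the JSON payload of each "data: ..." SSE frame.
    for line in lines:
        line = line.strip()
        if line.startswith("data: "):
            yield json.loads(line[len("data: "):])


frames = [
    'data: {"event": {"type": "thinking", "name": "thinking", "data": "Preparing full response..."}}',
    "",
    'data: {"text": "...the complete answer..."}',
    "",
]
for payload in read_sse_events(frames):
    print(payload)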

llm-service/app/services/chat/streaming_chat.py

Lines changed: 13 additions & 6 deletions
@@ -83,6 +83,7 @@ def stream_chat(
         use_hyde=session.query_configuration.enable_hyde,
         use_summary_filter=session.query_configuration.enable_summary_filter,
         use_tool_calling=session.query_configuration.enable_tool_calling,
+        use_streaming=not session.query_configuration.disable_streaming,
     )

     response_id = str(uuid.uuid4())
@@ -188,14 +189,20 @@ def _stream_direct_llm_chat(
     user_name: Optional[str],
 ) -> Generator[ChatResponse, None, None]:
     record_direct_llm_mlflow_run(response_id, session, user_name)
-
-    chat_response = llm_completion.stream_completion(
-        session.id, query, session.inference_model
-    )
-    response: ChatResponse = ChatResponse(message=ChatMessage(content=query))
-    for response in chat_response:
+    response: ChatResponse
+    if session.query_configuration.disable_streaming:
+        # Use non-streaming completion when streaming is disabled
+        response = llm_completion.completion(session.id, query, session.inference_model)
         response.additional_kwargs["response_id"] = response_id
         yield response
+    else:
+        chat_response = llm_completion.stream_completion(
+            session.id, query, session.inference_model
+        )
+        response = ChatResponse(message=ChatMessage(content=query))
+        for response in chat_response:
+            response.additional_kwargs["response_id"] = response_id
+            yield response

     new_chat_message = RagStudioChatMessage(
         id=response_id,
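Both branches of _stream_direct_llm_chat now honor the same Generator[ChatResponse, None, None] contract: the non-streaming branch yields exactly one fully formed response, so callers iterate identically in either mode. A stripped-down sketch of that shape; complete and stream below are stand-ins for llm_completion.completion and llm_completion.stream_completion, not their real signatures:

from typing import Callable, Generator, Iterable


def chat(
    query: str,
    disable_streaming: bool,
    complete: Callable[[str], str],
    stream: Callable[[str], Iterable[str]],
) -> Generator[str, None, None]:
    # One yield when streaming is off; many small yields otherwise.
    if disable_streaming:
        yield complete(query)
    else:
        yield from stream(query)


# Callers don't branch on the mode:
print("".join(chat("hi", True, str.upper, iter)))   # "HI"
print("".join(chat("hi", False, str.upper, iter)))  # "hi"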

llm-service/app/services/metadata_apis/session_metadata_api.py

Lines changed: 5 additions & 0 deletions
@@ -52,6 +52,7 @@ class SessionQueryConfiguration:
     enable_summary_filter: bool
     enable_tool_calling: bool = False
     selected_tools: list[str] = field(default_factory=list)
+    disable_streaming: bool = False


 @dataclass
@@ -126,6 +127,9 @@ def session_from_java_response(data: dict[str, Any]) -> Session:
             enable_tool_calling=data["queryConfiguration"].get(
                 "enableToolCalling", False
             ),
+            disable_streaming=data["queryConfiguration"].get(
+                "disableStreaming", False
+            ),
             selected_tools=data["queryConfiguration"]["selectedTools"] or [],
         ),
         associated_data_source_id=data.get("associatedDataSourceId", None),
@@ -146,6 +150,7 @@ def update_session(session: Session, user_name: Optional[str]) -> Session:
             "enableSummaryFilter": session.query_configuration.enable_summary_filter,
             "enableToolCalling": session.query_configuration.enable_tool_calling,
             "selectedTools": session.query_configuration.selected_tools,
+            "disableStreaming": session.query_configuration.disable_streaming,
         },
         associatedDataSourceId=session.associated_data_source_id,
     )
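On the Python side, dict.get with a default does the backward-compatible work: session payloads produced by a backend that predates disableStreaming simply fall back to False (streaming on). A condensed, self-contained version of that parsing step, exercised with a hypothetical old-style payload:

from dataclasses import dataclass, field
from typing import Any


@dataclass
class SessionQueryConfiguration:
    enable_summary_filter: bool
    enable_tool_calling: bool = False
    selected_tools: list[str] = field(default_factory=list)
    disable_streaming: bool = False


def config_from_java(qc: dict[str, Any]) -> SessionQueryConfiguration:
    # .get(key, default) keeps pre-upgrade payloads (no "disableStreaming") valid.
    return SessionQueryConfiguration(
        enable_summary_filter=qc["enableSummaryFilter"],
        enable_tool_calling=qc.get("enableToolCalling", False),
        disable_streaming=qc.get("disableStreaming", False),
        selected_tools=qc["selectedTools"] or [],
    )


old_payload = {"enableSummaryFilter": True, "selectedTools": None}
print(config_from_java(old_payload))  # disable_streaming=False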

llm-service/app/services/query/agents/tool_calling_querier.py

Lines changed: 61 additions & 25 deletions
@@ -74,6 +74,7 @@
     FlexibleContextChatEngine,
 )
 from app.services.query.chat_events import ChatEvent
+from app.services.query.query_configuration import QueryConfiguration

 if os.environ.get("ENABLE_OPIK") == "True":
     opik.configure(
@@ -199,6 +200,7 @@ def stream_chat(
     chat_messages: list[ChatMessage],
     session: Session,
     data_source_summaries: dict[int, str],
+    configuration: QueryConfiguration,
 ) -> StreamingAgentChatResponse:
     mcp_tools: list[BaseTool] = []
     if session.query_configuration and session.query_configuration.selected_tools:
@@ -221,7 +223,7 @@ def stream_chat(
         tools.insert(0, retrieval_tool)

     gen, source_nodes = _run_streamer(
-        chat_engine, chat_messages, enhanced_query, llm, tools
+        chat_engine, chat_messages, enhanced_query, llm, tools, configuration
     )

     return StreamingAgentChatResponse(chat_stream=gen, source_nodes=source_nodes)
@@ -233,9 +235,12 @@ def _run_streamer(
     enhanced_query: str,
     llm: FunctionCallingLLM,
     tools: list[BaseTool],
+    configuration: QueryConfiguration,
     verbose: bool = True,
 ) -> tuple[Generator[ChatResponse, None, None], list[NodeWithScore]]:
-    agent, enhanced_query = build_function_agent(enhanced_query, llm, tools)
+    agent, enhanced_query = build_function_agent(
+        enhanced_query, llm, tools, configuration.use_streaming or False
+    )

     source_nodes: list[NodeWithScore] = []
@@ -251,11 +256,22 @@ def _run_streamer(
         return chat_gen.chat_stream, chat_gen.source_nodes

     # If no chat engine is provided, we can use the LLM directly
-    direct_chat_gen = llm.stream_chat(
-        messages=chat_messages
-        + [ChatMessage(role=MessageRole.USER, content=enhanced_query)]
-    )
-    return direct_chat_gen, source_nodes
+    if configuration.use_streaming:
+        direct_chat_gen = llm.stream_chat(
+            messages=chat_messages
+            + [ChatMessage(role=MessageRole.USER, content=enhanced_query)]
+        )
+        return direct_chat_gen, source_nodes
+
+    # Use non-streaming LLM for direct chat when streaming is disabled
+    def _fake_direct_stream() -> Generator[ChatResponse, None, None]:
+        response = llm.chat(
+            messages=chat_messages
+            + [ChatMessage(role=MessageRole.USER, content=enhanced_query)]
+        )
+        yield response
+
+    return _fake_direct_stream(), source_nodes

     async def agen() -> AsyncGenerator[ChatResponse, None]:
         handler = agent.run(user_msg=enhanced_query, chat_history=chat_messages)
@@ -358,23 +374,33 @@
                     f"{str(event.response) if event.response else 'No content'}"
                 )
                 logger.info("========================")
-                yield ChatResponse(
-                    message=ChatMessage(
-                        role=MessageRole.TOOL,
-                        content=(
-                            event.response.content if event.response.content else ""
+                if configuration.use_streaming:
+                    yield ChatResponse(
+                        message=ChatMessage(
+                            role=(MessageRole.TOOL),
+                            content=event.response.content,
                         ),
-                    ),
-                    delta="",
-                    raw=event.raw,
-                    additional_kwargs={
-                        "chat_event": ChatEvent(
-                            type="agent_response",
-                            name=event.current_agent_name,
-                            data=data,
+                        delta="",
+                        raw=event.raw,
+                        additional_kwargs=(
+                            {
+                                "chat_event": ChatEvent(
+                                    type="agent_response",
+                                    name=event.current_agent_name,
+                                    data=data,
+                                ),
+                            }
                         ),
-                    },
-                )
+                    )
+                else:
+                    yield ChatResponse(
+                        message=ChatMessage(
+                            role=(MessageRole.ASSISTANT),
+                            content=event.response.content,
+                        ),
+                        delta=(event.response.content),
+                        raw=event.raw,
+                    )
             elif isinstance(event, AgentStream):
                 if len(event.tool_calls) > 0:
                     continue
@@ -436,15 +462,22 @@ def gen() -> Generator[ChatResponse, None, None]:


 def build_function_agent(
-    enhanced_query: str, llm: FunctionCallingLLM, tools: list[BaseTool]
+    enhanced_query: str,
+    llm: FunctionCallingLLM,
+    tools: list[BaseTool],
+    streaming_enabled: bool,
 ) -> tuple[FunctionAgent, str]:
     formatted_prompt = DEFAULT_AGENT_PROMPT.format(
         date=datetime.datetime.now().strftime("%A, %B %d, %Y"),
         time=datetime.datetime.now().strftime("%H:%M:%S %p"),
     )
     callable_tools = cast(list[BaseTool | Callable[[], Any]], tools)
     if llm.metadata.model_name in NON_SYSTEM_MESSAGE_MODELS:
-        agent = FunctionAgent(tools=callable_tools, llm=llm)
+        agent = FunctionAgent(
+            tools=callable_tools,
+            llm=llm,
+            streaming=streaming_enabled,
+        )
         enhanced_query = (
             "ROLE DESCRIPTION =========================================\n"
             + formatted_prompt
@@ -460,7 +493,10 @@ def build_function_agent(
     ):
         llm = FakeStreamBedrockConverse.from_bedrock_converse(llm)
         agent = FunctionAgent(
-            tools=callable_tools, llm=llm, system_prompt=formatted_prompt
+            tools=callable_tools,
+            llm=llm,
+            system_prompt=formatted_prompt,
+            streaming=streaming_enabled,
         )

     return agent, enhanced_query
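The _fake_direct_stream helper is the core trick of the non-streaming path: wrap a blocking llm.chat call in a generator that yields exactly once, so every downstream consumer can keep treating the result as a stream. The pattern in isolation, with illustrative names and types:

from typing import Callable, Generator, TypeVar

T = TypeVar("T")


def as_single_item_stream(blocking_call: Callable[[], T]) -> Generator[T, None, None]:
    # Adapt a blocking call to the streaming interface: one yield, then done.
    # Generators are lazy, so the call only runs on first iteration.
    yield blocking_call()


# Code written against a stream works unchanged:
for chunk in as_single_item_stream(lambda: "the complete response"):
    print(chunk)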

0 commit comments
