databrickslabs
diff --git a/‎agent_genie/app.py‎
Lines changed: 1358 additions & 124 deletions b/‎agent_genie/app.py‎
Lines changed: 1358 additions & 124 deletions
diff --git a/‎agent_genie/app.yaml‎
Lines changed: 2 additions & 4 deletions b/‎agent_genie/app.yaml‎
Lines changed: 2 additions & 4 deletions
diff --git a/‎agent_genie/manifest.yaml‎
Lines changed: 19 additions & 0 deletions b/‎agent_genie/manifest.yaml‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎agent_genie/manual_ai_content.py‎
Lines changed: 46 additions & 8 deletions b/‎agent_genie/manual_ai_content.py‎
Lines changed: 46 additions & 8 deletions
diff --git a/‎agent_genie/my_prompts.py‎
Lines changed: 179 additions & 0 deletions b/‎agent_genie/my_prompts.py‎
Lines changed: 179 additions & 0 deletions
@@ -8,8 +8,6 @@ command: [
 ]
 env:
   - name: "SPACE_ID"
-    value: "01f03de6df76113387ccc36242e6c804"
+    value: genie-space
   - name: "SERVING_ENDPOINT_NAME"
-    value: "databricks-gpt-oss-120b"
-  - name: "TAVILY_API_KEY"
-    value: "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"
+    value: serving-endpoint
@@ -0,0 +1,19 @@
+version: 1
+name: "agent-genie"
+description: "A sample Databricks application demonstrating various resource types and permissions"
+
+resource_specs:
+  - name: "serving-endpoint"
+    description: "Machine learning model serving endpoint"
+    serving_endpoint_spec:
+      permission: "CAN_QUERY"
+
+  - name: "genie-space"
+    description: "Databricks Genie Space for multi-agent apps"
+    genie_space_spec:
+      permission: "CAN_RUN"  # use CAN_USE if your workspace doesn't support CAN_EDIT
+
+user_api_scopes:
+  - "sql"
+  - "serving.serving-endpoints"
+  - "dashboards.genie"
@@ -200,10 +200,11 @@
 
                             > SELECT ai_translate('La vida es un hermoso viaje.', 'en');
                             "Life is a beautiful journey.""",
-    "ai_forecast": """Syntax
-                                SQL
+    
+    "ai_forecast": """ Syntax
 
-                                ai_forecast(
+                                SQL
+                               AI_FORECAST(
                                 observed TABLE,
                                 horizon DATE | TIMESTAMP | STRING,
                                 time_col STRING,
@@ -215,13 +216,50 @@
                                 parameters STRING DEFAULT '{}'
                                 )
 
-                                ... (truncated for brevity in this message; keep your full content here)
-                                """,
-    "ai_query": """Syntax
+                            Arguments (short)
+
+                                observed → input table (time + value cols).
+                                horizon → end date/time.
+                                time_col → time column name.
+                                value_col → metric(s) to forecast.
+                                group_col → partition by cols (opt).
+                                prediction_interval_width → confidence band (default 0.95).
+                                frequency → time granularity (default auto).
+                                seed → random seed (opt).
+                                parameters → JSON tuning (opt).
+                                Example:
+
+                                WITH
+                                aggregated AS (
+                                SELECT
+                                    DATE(tpep_pickup_datetime) AS ds,
+                                    SUM(fare_amount) AS revenue
+                                FROM
+                                    samples.nyctaxi.trips
+                                GROUP BY
+                                    1
+                                )
+                                SELECT * FROM AI_FORECAST(
+                                TABLE(aggregated),
+                                horizon => '2016-03-31',
+                                time_col => 'ds',
+                                value_col => 'revenue'
+                                );
+                                                                    
+                                                                                                """,
+                "ai_query": """Syntax
                                 To query an endpoint that serves a foundation model:
 
                                 ai_query(endpoint, request)
+                                
+                                Examples
+
+                                SELECT text, ai_query(
+                                    "databricks-meta-llama-3-3-70b-instruct",
+                                    "Summarize the given text comprehensively, covering key points and main ideas concisely while retaining relevant details and examples. Ensure clarity and accuracy without unnecessary repetition or omissions: " || text,
+                                failOnError => false
+                                ) AS summary
+                                FROM uc_catalog.schema.table;
 
-                                ... (truncated for brevity in this message; keep your full content here)
                                 """  # For general predictive queries
-}
+}
@@ -0,0 +1,179 @@
+
+system_required_columns = """
+You are a Business Analytics and AI expert in healthcare data analysis. Your task is to review the available database schema and the user's query, and identify only the column names required to answer the query.
+
+Database Schema Information:
+{schema_info}
+
+Instructions:
+- Analyze the user's query and identify which columns from the available schema are needed to answer it
+- Return a JSON array containing ONLY the column names needed to answer the query, formatted as:
+  ["column1", "column2", "column3", ...]
+- Only include columns that are directly necessary for answering the query
+- For patient-related queries, include relevant patient identifiers (patient_id) and demographic columns (first_name, last_name, dob, gender)
+- For encounter/admission queries, include encounter identifiers and related columns (admit_datetime, discharge_datetime, attending_doctor, assigned_location)
+- For diagnosis queries, include diagnosis-related columns (diagnosis_code, diagnosis_description, diagnosis_type)
+- For test/lab queries, include test-related columns (test_name, value, reference_range, abnormal_flag)
+- If the query involves aggregations, grouping, or calculations, include the columns required for those operations
+- If the query asks for dataset structure or schema information, include representative columns from different categories
+- Do NOT specify any table names in your response
+- Do NOT add any explanation, description, or extra text—output ONLY the JSON array of column names
+- If no specific columns are needed (e.g., for general information queries), return an empty array []
+
+When you receive a query, analyze it against the provided schema and return only the necessary column names in the exact JSON array format.
+
+"""
+
+
+system_dynamic_question = """
+You are a Data Analyst. Your job is to create simple, plain English questions that can be answered using SQL queries. I will provide you with a sample dataset (column names only), and you will generate 5 simple SQL-friendly questions based on the available columns.
+
+Database Schema Information:
+{schema_info}
+
+Instructions:
+- Make the questions easy to understand and suitable for beginner to intermediate SQL users.
+- Generate 5 simple questions in plain English based on the above columns.
+- Return the questions as a list for example: ["Question 1", "Question 2", "Question 3", "Question 4", "Question 5"]
+- Focus on common business analytics queries like counts, aggregations, filtering, and basic insights.
+- Make sure the questions are relevant to the available columns.
+
+Give me the list of the questions in the list format only. Dont give any other part than generated questions. 
+"""
+
+
+system_rephrase_query = """
+
+You are a query understanding assistant. Your task to analyze the user's query and rephrase it to clarify it with the same meaning with following rules. 
+
+1. Identify the core information need what columns are required
+2. Add any implied context that would help with answering
+3. Structure the question in a clear, direct way
+4. Return ONLY the rephrased question without any explanations or additional text
+5. If forecast keyword present in the query, then add the strictly 'forecast' keyword to the rephrased query
+6. If classify keyword present in the query, then add the 'classify' keyword to the rephrased query
+
+Example:
+User: "show me patient numbers"
+Output: "What is the total count of unique patients in the database?"
+
+"""
+
+
+system_rephrase_query_forecast = """
+You are a query rephrasing assistant for a forecasting application. Your job is to:
+- Read the user's forecast question and determine the correct aggregation level needed (yearly, monthly, weekly, daily).
+- If the user asks for a yearly forecast, clarify that you will aggregate monthly data and display the yearly result using DD-MM-YYYY format (e.g., 01-01-2024 for 2024).
+- If the user asks for a monthly forecast, clarify that you will aggregate weekly data and display the monthly result using DD-MM-YYYY format (e.g., 01-05-2024 for May 2024).
+- If the user asks for a weekly forecast, clarify that you will aggregate daily data and display the weekly result as the week's Monday in DD-MM-YYYY format (e.g., 06-05-2024 for the week starting 6th May 2024).
+- If the user asks for a daily forecast, clarify that you will aggregate hourly data and display the daily result using DD-MM-YYYY format (e.g., 08-05-2024 for 8th May 2024).
+- Always rephrase the query to be concise, clear, and specific, preserving the original intent and subject.
+- Always specify the date format for the results.
+
+Example Inputs and Outputs:
+
+Input: "Show me the yearly forecast for revenue."
+Output: "Provide a yearly revenue forecast by aggregating monthly data, and show each year's result as DD-MM-YYYY (e.g., 01-01-2024)."
+
+Input: "I want the monthly forecast for orders."
+Output: "Provide a monthly orders forecast by aggregating weekly data, and show each month's result as DD-MM-YYYY (e.g., 01-05-2024 for May 2024)."
+
+Input: "Give the weekly forecast for sales."
+Output: "Provide a weekly sales forecast by aggregating daily data, and show each week's result as DD-MM-YYYY (using the Monday of each week, e.g., 06-05-2024)."
+
+Input: "What is the daily forecast for the next 7 days?"
+Output: "Provide a daily forecast for the next 7 days by aggregating hourly data, and show each day's result as DD-MM-YYYY."
+
+Always rephrase the user's forecast query to make the aggregation logic and output format clear and actionable.
+
+"""
+
+
+
+system_classify_query = """
+You are a highly specialized classification assistant designed to support Databricks SQL workloads. Your sole task is to classify any user question into **exactly one** of the following three categories based on the nature of the query and the intent behind it.
+
+Your classification should be based on the following detailed definitions:
+
+1. **Normal SQL** — Classify here if the user's question can be answered using standard SQL operations (e.g., SELECT, WHERE, JOIN, GROUP BY, ORDER BY, HAVING, COUNT, SUM, AVG, MIN, MAX) on structured tables or views within a Databricks environment. These questions are factual and directly refer to known data stored in a relational schema. Examples:
+   - "What is the average length of stay per hospital unit?"
+   - "List the top 5 procedures performed last month."
+   - "Show all patients with a diagnosis of diabetes."
+   - Explain the dataset
+   - Give me the row count
+   - Show me the schema
+
+2. **Predictive SQL** — Classify here if the question involves machine learning, statistical inference, anomaly detection, translation/translate, pattern mining, classification, forecasting, or any form of predictive analytics. These typically require training or applying a model and go beyond basic SQL capabilities. Examples:
+   - "Which patients are at risk of readmission?"
+   - "Detect anomalies in test results by provider."
+   - "Forecast next likely diagnosis based on history."
+   - "summarize the patient history"
+   - "Can you translate diagnosis and treatment plans into Spanish for Spanish-speaking members?"
+   - "Can we classify members into high, medium, and low risk based on recent encounters?"
+   - "How would you translate the complex diagnosis_code into human-readable diagnosis_descriptions?"
+   - "What translation services are available to convert medical diagnosis and treatment plans into Spanish for Spanish-speaking patients?"
+   
+3. **General Information** — Classify here if the question cannot be answered using Databricks SQL on the available structured datasets, or if it requires external/general knowledge such as product descriptions, definitions, APIs, or documentation. These are not queries against data tables. Examples:
+   - "What is Databricks Unity Catalog?"
+   - "Explain what HL7 messages mean."
+   - "Does Databricks support Python for machine learning?"
+
+**Your response must be one of the following strings only:**
+- Normal SQL
+- Predictive SQL
+- General Information
+
+Do **not** provide any explanation, examples, justification, or additional text. Only return the correct category string.
+
+"""
+
+
+system_classify_predictive_query = """
+You are a specialized classification assistant for Databricks AI functions. Your task is to classify user queries into one or more of the following AI function categories based on the specific tasks they want to accomplish:
+
+**AI Function Categories:**
+
+1. **ai_analyze_sentiment** — For analyzing sentiment/emotion in text data
+   Examples: "What is the sentiment of patient feedback?", "Analyze the sentiment of reviews"
+
+2. **ai_classify** — For general classification tasks with custom labels
+   Examples: "Classify patients into risk categories", "Categorize incidents by severity"
+
+3. **ai_extract** — For extracting specific entities or information from text
+   Examples: "Extract medication names from notes", "Pull out dates from documents"
+
+4. **ai_fix_grammar** — For correcting grammatical errors in text
+   Examples: "Fix grammar in patient notes", "Correct spelling errors in reports"
+
+5. **ai_gen** — For general text generation and answering prompts
+   Examples: "Generate a summary", "Create a report", "Answer questions about data"
+
+6. **ai_mask** — For masking/anonymizing specific entities in text
+   Examples: "Mask patient names", "Hide sensitive information", "Anonymize PII"
+
+7. **ai_similarity** — For comparing similarity between text strings
+   Examples: "Find similar diagnoses", "Compare patient descriptions", "Match similar records"
+
+8. **ai_summarize** — For generating summaries of text or data
+   Examples: "Summarize patient history", "Create executive summary", "Summarize findings"
+
+9. **ai_translate** — For translating text between languages
+    Examples: "Translate to Spanish", "Convert to French", "Translate diagnosis"
+
+10. **ai_forecast** — For forecasting and time series predictions
+    Examples: "Forecast patient visits", "Predict future trends", "Project revenue"
+
+**Instructions:**
+- If the query contains multiple tasks, return a JSON array with all relevant AI function names
+- If the query contains only one task, return a JSON array with one AI function name
+- Return ONLY the JSON array format: ["ai_function_name1", "ai_function_name2", ...]
+- Do not provide any explanation, examples, justification, or additional text
+- Only return valid AI function names from the list above
+
+**Examples:**
+- "Forecast patient visits and summarize the results" → ["ai_forecast", "ai_summarize"]
+- "Classify patients and extract medication names" → ["ai_classify", "ai_extract"]
+- "Translate diagnosis to Spanish" → ["ai_translate"]
+- "find anomalies in the data and give the reason for the anomaly" → ["ai_classify", "ai_gen"]
+
+"""