
Commit acd8a7d

feat(openai): instrument openai responses prompts (#15159)
# PR Description

## Description

Adds prompt tracking for OpenAI reusable prompts.

**The problem:** OpenAI returns rendered prompts (with variables filled in), but prompt tracking needs templates with placeholders like `{{variable_name}}`.

**The solution:** Reverse templating - reconstruct the template by replacing variable values with placeholders.

**How it works:**

```python
# Input from OpenAI:
variables: {"question": "What is ML?"}
instructions: [{role: "user", content: "Answer: What is ML?"}]

# We do:
# 1. Build map: {"What is ML?": "{{question}}"}
# 2. Extract: "Answer: What is ML?"
# 3. Replace: "Answer: What is ML?" -> "Answer: {{question}}"

# Output:
chat_template: [{role: "user", content: "Answer: {{question}}"}]
```

**Why longest values first?** Overlapping values need careful handling:

```python
# Problem: overlapping values
variables = {"short": "AI", "long": "AI is cool"}
text = "AI is cool"

# Wrong order breaks it:
text.replace("AI", "{{short}}")  # -> "{{short}} is cool"
# Now "AI is cool" can no longer be found!

# Solution: sort by length (longest first), then replace
sorted_values = ["AI is cool", "AI"]  # Longest first
for value in sorted_values:
    text = text.replace(value, placeholder)
# Result: "{{long}}"
```

The implementation uses a simple `.replace()` loop with longest-first sorting. Benchmarks show this is faster than regex for typical prompts with fewer than 50 variables.

## Testing

- Added `test_response_with_prompt_tracking()` verifying prompt metadata, chat_template extraction, and placeholder replacement.
- Added comprehensive unit tests for `_extract_chat_template_from_instructions()` covering edge cases (overlaps, special characters, large patterns, etc.).
- Tested in a personal sandbox with real templates. They can be found on staging here: [link](https://dd.datad0g.com/llm/applications?query=%40ml_app%3Allmobs-sandbox&compareLens=inputs&fromUser=false&start=1762765198999&end=1762766040247&paused=true#promptTemplates)

## Risks

Making this perfect is likely impossible since we're reverse-engineering the template from rendered output. The approach works well for typical real-world usage where:

- Variable values are reasonably unique
- Users follow sensible naming patterns
- Variables don't create ambiguous overlaps

For instance, when two variables have the same value, only one placeholder will be used:

```
variables = {"var1": "hello", "var2": "hello"}
text = "Say hello"
# Result: "Say {{var2}}" or "Say {{var1}}"
```

## Additional Notes

OpenAI doesn't expose templates via the API, so we reconstruct them. If they add template retrieval later, or the backend supports template-less prompts, we can remove this logic.
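For illustration, the core idea can be sketched in isolation. The `reverse_template` function below is hypothetical (the actual helper added by this PR is `_extract_chat_template_from_instructions`, shown in the diff further down); this is just a self-contained, runnable version of the longest-first replacement:

```python
def reverse_template(text: str, variables: dict) -> str:
    """Replace rendered variable values in text with {{name}} placeholders."""
    # Map each rendered value to its placeholder, skipping empty values.
    value_to_placeholder = {str(v): "{{" + name + "}}" for name, v in variables.items() if str(v)}
    # Longest values first, so a value that is a substring of another
    # (e.g. "AI" inside "AI is cool") is not replaced prematurely.
    for value in sorted(value_to_placeholder, key=len, reverse=True):
        text = text.replace(value, value_to_placeholder[value])
    return text


assert reverse_template("Answer: What is ML?", {"question": "What is ML?"}) == "Answer: {{question}}"
assert reverse_template("AI is cool", {"short": "AI", "long": "AI is cool"}) == "{{long}}"
```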
1 parent 08d3021 commit acd8a7d

5 files changed (+394, -0)

ddtrace/llmobs/_integrations/utils.py

Lines changed: 90 additions & 0 deletions
```diff
@@ -16,6 +16,7 @@
 from ddtrace.llmobs._constants import DISPATCH_ON_LLM_TOOL_CHOICE
 from ddtrace.llmobs._constants import DISPATCH_ON_TOOL_CALL_OUTPUT_USED
 from ddtrace.llmobs._constants import INPUT_MESSAGES
+from ddtrace.llmobs._constants import INPUT_PROMPT
 from ddtrace.llmobs._constants import INPUT_TOKENS_METRIC_KEY
 from ddtrace.llmobs._constants import INPUT_VALUE
 from ddtrace.llmobs._constants import METADATA
@@ -26,6 +27,7 @@
 from ddtrace.llmobs._constants import TOOL_DEFINITIONS
 from ddtrace.llmobs._constants import TOTAL_TOKENS_METRIC_KEY
 from ddtrace.llmobs._utils import _get_attr
+from ddtrace.llmobs._utils import _validate_prompt
 from ddtrace.llmobs._utils import load_data_value
 from ddtrace.llmobs._utils import safe_json
 from ddtrace.llmobs._utils import safe_load_json
@@ -738,9 +740,78 @@ def openai_get_metadata_from_response(
     return metadata


+def _extract_chat_template_from_instructions(
+    instructions: List[Any], variables: Dict[str, Any]
+) -> List[Dict[str, str]]:
+    """
+    Extract a chat template from OpenAI response instructions by replacing variable values with placeholders.
+
+    Args:
+        instructions: List of instruction messages from the OpenAI response
+        variables: Dictionary of variables used in the prompt
+
+    Returns:
+        List of chat template messages with placeholders (e.g., {{variable_name}})
+    """
+    chat_template = []
+
+    # Create a mapping of variable values to placeholder names
+    value_to_placeholder = {}
+    for var_name, var_value in variables.items():
+        if hasattr(var_value, "text"):  # ResponseInputText
+            value_str = str(var_value.text)
+        else:
+            value_str = str(var_value)
+
+        # Skip empty values
+        if not value_str:
+            continue
+
+        value_to_placeholder[value_str] = f"{{{{{var_name}}}}}"
+
+    # Sort by length (longest first) to handle overlapping values correctly
+    sorted_values = sorted(value_to_placeholder.keys(), key=len, reverse=True)
+
+    for instruction in instructions:
+        role = _get_attr(instruction, "role", "")
+        if not role:
+            continue
+
+        content_items = _get_attr(instruction, "content", [])
+        if not content_items:
+            continue
+
+        text_parts = []
+        for content_item in content_items:
+            text = _get_attr(content_item, "text", "")
+            if text:
+                text_parts.append(str(text))
+
+        if not text_parts:
+            continue
+
+        full_text = "".join(text_parts)
+
+        # Replace variable values with placeholders (longest first)
+        for value_str in sorted_values:
+            placeholder = value_to_placeholder[value_str]
+            full_text = full_text.replace(value_str, placeholder)
+
+        chat_template.append({"role": role, "content": full_text})
+
+    return chat_template
+
+
 def openai_set_meta_tags_from_response(span: Span, kwargs: Dict[str, Any], response: Optional[Any]) -> None:
     """Extract input/output tags from response and set them as temporary "_ml_obs.meta.*" tags."""
     input_data = kwargs.get("input", [])
+
+    # For reusable prompts, input may not be in kwargs, extract from response.instructions
+    if not input_data and response and "prompt" in kwargs:
+        instructions = _get_attr(response, "instructions", [])
+        if instructions:
+            input_data = load_data_value(instructions)
+
     input_messages = openai_get_input_messages_from_response_input(input_data)

     if "instructions" in kwargs:
@@ -753,6 +824,25 @@ def openai_set_meta_tags_from_response(span: Span, kwargs: Dict[str, Any], respo
         }
     )

+    if "prompt" in kwargs:
+        prompt_data = kwargs.get("prompt")
+        if prompt_data:
+            try:
+                # Extract chat_template from response instructions if available
+                if response and not prompt_data.get("chat_template") and not prompt_data.get("template"):
+                    instructions = _get_attr(response, "instructions", None)
+                    variables = prompt_data.get("variables", {})
+                    if instructions and variables:
+                        chat_template = _extract_chat_template_from_instructions(instructions, variables)
+                        if chat_template:
+                            prompt_data = dict(prompt_data)  # Make a copy to avoid modifying the original
+                            prompt_data["chat_template"] = chat_template
+
+                validated_prompt = _validate_prompt(prompt_data, strict_validation=False)
+                span._set_ctx_item(INPUT_PROMPT, validated_prompt)
+            except (TypeError, ValueError, AttributeError) as e:
+                logger.debug("Failed to validate prompt for OpenAI response: %s", e)
+
     if span.error or not response:
         span._set_ctx_item(OUTPUT_MESSAGES, [Message(content="")])
         return
```
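For reference, a minimal usage sketch of the new helper. The dict-shaped instructions below are a stand-in for the SDK's `response.instructions` objects, assuming `_get_attr` resolves keys on plain dicts as well as attributes on model objects:

```python
from ddtrace.llmobs._integrations.utils import _extract_chat_template_from_instructions

instructions = [
    {"role": "developer", "content": [{"text": "Direct & Conversational tone"}]},
    {"role": "user", "content": [{"text": "Answer this question: What is ML?"}]},
]
variables = {"question": "What is ML?"}

chat_template = _extract_chat_template_from_instructions(instructions, variables)
# Expected:
# [{"role": "developer", "content": "Direct & Conversational tone"},
#  {"role": "user", "content": "Answer this question: {{question}}"}]
```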
Lines changed: 5 additions & 0 deletions
```diff
@@ -0,0 +1,5 @@
+---
+features:
+  - |
+    LLM Observability: The OpenAI integration now captures prompt metadata (id, version, variables, and chat template)
+    for reusable prompts when using the ``responses`` endpoint (available in OpenAI SDK >= 1.87.0).
```
Lines changed: 134 additions & 0 deletions
```diff
@@ -0,0 +1,134 @@
+interactions:
+- request:
+    body: '{"prompt":{"id":"pmpt_690b24669d8c81948acc0e98da10e6490190feb3a62eee0b","version":"4","variables":{"question":"What
+      is machine learning?"}}}'
+    headers:
+      accept:
+      - application/json
+      accept-encoding:
+      - gzip, deflate
+      connection:
+      - keep-alive
+      content-length:
+      - '140'
+      content-type:
+      - application/json
+      host:
+      - api.openai.com
+      user-agent:
+      - OpenAI/Python 2.3.0
+      x-stainless-arch:
+      - arm64
+      x-stainless-async:
+      - 'false'
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - Linux
+      x-stainless-package-version:
+      - 2.3.0
+      x-stainless-read-timeout:
+      - '600'
+      x-stainless-retry-count:
+      - '0'
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.10.18
+    method: POST
+    uri: https://api.openai.com/v1/responses
+  response:
+    body:
+      string: !!binary |
+        H4sIAAAAAAAAA7xYWW8byRF+968o8CGQAYoYHqJFvQTGbhZYJM4GmyyMIDaImu6amVr1MemDFL3w
+        fw+qhxySkh1vXvROdU/X8dVXF397BTBhPXmASaDYb6u7iqo7Pd9sar1EWlbz9aZSuFrS6k7f3883
+        JlEiPTyqUT22wWcndjVoIg3HbAy7dvIAv70CAJj0eKAg7zXtyPiewuQVwOfyMYXg5c5lY8oBu5OW
+        raaEbOL1bUwhq8Teyfm/ioJBDcAkHXoSPZZixHZwrVwo7xK5NL64fHX1kl2f0zbRUxofD/dy8gCT
+        7zmQSvAH+M67HYWIYgkaSN7RZHzw+fjXx9GA4A29QOCIwf/VgX/6DBgIEDoyfZMNYIwcE7o0g78Z
+        wkiALu4pQOo4wr8zRfHxAd53mIAjWFQdOwJDGBy79o+/x+8cR5dfHe8nFp+2PqdirR/JXQVaLpP3
+        ZqvQXFPAek1GZPrVrWXHt4tqcXdbrW7n6yPHi8yX7DimT/x68jRVtVGSPPdqcaeXpKr1XCl1vz4H
+        4oRzIIxe3D9fxWwthoMo/vjF6A4G2Nh+1YLVfLXcVGJBrec11c0btb5brjar6qUFLzjytTz93wh0
+        isgLBqFzPuGYeR+vLo1v++DrL9ycivfuGW2ESgh1QKc68A1gSNywYjTALpEx3JJTBElo1/KOIohb
+        OVGIkDoCrNlwOkDyg0zoMSUKLgI6DRYfCTQpjmIwNMFb0JhwCgFTV9iNDmoSS+ipN6w4mQP0wbcB
+        rSUNjQ9AOwoHiIocBvYz+NGBL6/3Pug4BSlHhFrsP/gM+8BJBH7Ii2quuLmVFLpNHblbcaMcawjZ
+        UJyWBw2RLs7EQ0xkgZ5QIje4wAkabnOgCD6n8ll2moI5iI4iBbwDThH83s0+uA/uz3QA1oTxQf4T
+        fYsFfI8Ji8ARIPjgYDByCVIQlDdGypoABDcu25pCnIKEbgpssRV7I7noAwRCza6NU6CkZq+LYIM1
+        GbF335EDR6RJz660vD3HHU3rA6fOAjo0h08UhxgX5cmDpiS2jMYWnWYgXsd9nF249k5qAdSZjdh0
+        pfEfQpGzKv1rjikWagUp2D0GtFTIdCMIDiErtWUI02uxpaaYQGGfchAm+kijXdf+/VSoGpAd6WkJ
+        VREFCt1AxT6Q5qFviUdnZnoBbD+F7CKRKygUD98hO5CkjMKu50W3xHc+g7/nnsKOI+kzwMWuk2G/
+        RIpDfEif6XVDs3Y2hb7zyUdI2LakjxCokac+HI+0b4+YzK6F/0VUShpb7HtRXdKstCC4GXhTYBxq
+        CtwUO+JzKd95a72DhPExPoAy0pAaViXicBN7tLCLM3A+3co/r6cQqA0UY7k/Aeta6HyWAAVWFF8X
+        EBcz+MXFb2H03ofHCHtOHWR3wmrIhQEnhJ4NSRxUjslbCtDnoDpplh3H5APTC7d+YKcjDOOK0McH
+        kGFJcLrK2/8ORo6Jglh8c3oMkS0bDKMt8fUUNFtyscwiUhMD6WFIgpvItjfclJohPg3ALGfwM7Fr
+        fFBkyaWvIHMMcH2AFKQyS7KXiU1ioIh3pRLRHoNutO7JoUksDEsdu0eBrGRFSXxo0dJtb7AYgy25
+        9By0krVyAT2rxwh4TBl2gA7I7Th4VwxOHiw+seVPBCrbLBViR0djwO+kxLOl4u3PhOZ274M5p8Bl
+        gfyTRTZQiNawOTYYPGIiimLvE2S3R5dIw7HxDhVgLEQ7LiHugxfoBR5vLTk90Jhcy05g+SulxvDT
+        FN5a/OTd60shP0rGlJet4/KMnbSJIInqqEC0iUPWAvY93GBO3mJiBQ2qon8Y8EtKs2uvxL/NyTtv
+        fY6wo46Voed+OtxxiylUsqZhdfn6h4BZH4uzmDZ0rBKZGt0ju7ZA/b47SCOwpUoOKL/o/L3fC8YW
+        3QESqc5541shzZ5AMnjouhoPU6mdhfK+SXsZW6PFkChMwXoZYjX2EvVpIWZNohMwQYdOywoCwyD0
+        JGwdGVAqLLyNQ4bLw2GuGC0DmZXYZRJI2uD30xfl9xQKGW8LAjlKeKVLKi5ji8jVyOYAhhuCR6I+
+        yqCBTjrV7PfMzONg/nxwlt5lDJnrATmFPOxffaAd+xy3pxVvWwbPcYDug7d9Oq9nw1ja2z5t15uq
+        XqzW642+lzl4dY9KVbS51zivaL3aVPNN1VC9xPWCiKr6OCFOdhgYa0NxlAowOa0OF2ffWlTGafGb
+        y8bnywl7IqvYoGmyOu+Wg6Nbhaqj7SMdXoJwvAsks/EgYPziPOGPSFHT+JCG4VtztifB55O/vB71
+        R2woHbasRXjDdLXnRulJiraJT7txg9kMYEykpdDVUp3I9hRQOolsPLPqePp0EcjGB4vpMgQnsMt3
+        zxGrfeR0uPBmNHzgVedZDUTMyU/Gi/OMP0m+315M/tV42F/aGLIbmnlxk6MQ5fgLQi4bzJmJ7moZ
+        XM6nL88vfhUY3SxR1OeH1ZWrz3fM9WrxpZsvCR45cH4934vGFsuFN6PhA686z2ogYk5+Ml6cZ/xJ
+        8v32YvKvxsP+0saQ3dDMi5schSjHXxBy2WDOTHRXy+ByPn15fvGrwOhmiaI+P6yuXH2+Y65Xiy/d
+        fEnwyIHz6/lmcSU9+YTmQvib5QhjWXovN1dKKEVHNHx+9fk/AAAA//8DAHhWLkvUEQAA
+    headers:
+      CF-RAY:
+      - 99a4fa22dbb601cc-CDG
+      Connection:
+      - keep-alive
+      Content-Encoding:
+      - gzip
+      Content-Type:
+      - application/json
+      Date:
+      - Thu, 06 Nov 2025 13:36:05 GMT
+      Server:
+      - cloudflare
+      Set-Cookie:
+      - __cf_bm=oAc59HaJwUjrUv2uHgTgDkTP1sVynTMJVzliRX11b7o-1762436165-1.0.1.1-STkKgI9BlQHAvGzS.Rqi6UQVssVb5_M5J9QpUZICssvaO35gDy6yDFJo.tYdjVGKAGufaBJ9rwowcVi0u.xMc6oV0zOSTM2nqB6IjkP9W.4;
+        path=/; expires=Thu, 06-Nov-25 14:06:05 GMT; domain=.api.openai.com; HttpOnly;
+        Secure; SameSite=None
+      - _cfuvid=bDZxnxovYk7l9OeXSX6u2DbwKyUR5GDTvi_l5SLAkiY-1762436165819-0.0.1.1-604800000;
+        path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
+      Strict-Transport-Security:
+      - max-age=31536000; includeSubDomains; preload
+      Transfer-Encoding:
+      - chunked
+      X-Content-Type-Options:
+      - nosniff
+      alt-svc:
+      - h3=":443"; ma=86400
+      cf-cache-status:
+      - DYNAMIC
+      openai-organization:
+      - datadog-staging
+      openai-processing-ms:
+      - '7512'
+      openai-project:
+      - proj_gt6TQZPRbZfoY2J9AQlEJMpd
+      openai-version:
+      - '2020-10-01'
+      x-envoy-upstream-service-time:
+      - '7514'
+      x-ratelimit-limit-requests:
+      - '30000'
+      x-ratelimit-limit-tokens:
+      - '150000000'
+      x-ratelimit-remaining-requests:
+      - '29999'
+      x-ratelimit-remaining-tokens:
+      - '149999762'
+      x-ratelimit-reset-requests:
+      - 2ms
+      x-ratelimit-reset-tokens:
+      - 0s
+      x-request-id:
+      - req_2409b397395c43bcaa8b763bb736ebf5
+    status:
+      code: 200
+      message: OK
+version: 1
```

tests/contrib/openai/test_openai_llmobs.py

Lines changed: 51 additions & 0 deletions
```diff
@@ -2150,6 +2150,57 @@ class MathResponse(BaseModel):
             )
         )

+    @pytest.mark.skipif(
+        parse_version(openai_module.version.VERSION) < (1, 87),
+        reason="Reusable prompts only available in openai >= 1.87",
+    )
+    def test_response_with_prompt_tracking(self, openai, mock_llmobs_writer, mock_tracer):
+        """Test that prompt metadata (id, version, variables) is captured for reusable prompts."""
+        with get_openai_vcr(subdirectory_name="v1").use_cassette("response_with_prompt.yaml"):
+            client = openai.OpenAI()
+            client.responses.create(
+                prompt={
+                    "id": "pmpt_690b24669d8c81948acc0e98da10e6490190feb3a62eee0b",
+                    "version": "4",
+                    "variables": {"question": "What is machine learning?"},
+                }
+            )
+        mock_tracer.pop_traces()
+        assert mock_llmobs_writer.enqueue.call_count == 1
+
+        call_args = mock_llmobs_writer.enqueue.call_args[0][0]
+
+        # Verify prompt metadata is captured
+        assert "prompt" in call_args["meta"]["input"]
+        actual_prompt = call_args["meta"]["input"]["prompt"]
+        assert actual_prompt["id"] == "pmpt_690b24669d8c81948acc0e98da10e6490190feb3a62eee0b"
+        assert actual_prompt["version"] == "4"
+        assert actual_prompt["variables"] == {"question": "What is machine learning?"}
+
+        # Verify chat_template is extracted with variable placeholders
+        assert "chat_template" in actual_prompt
+        chat_template = actual_prompt["chat_template"]
+        assert len(chat_template) == 2
+        # First message: developer role
+        assert chat_template[0]["role"] == "developer"
+        assert chat_template[0]["content"] == "Direct & Conversational tone"
+        # Second message: user role with variable placeholder
+        assert chat_template[1]["role"] == "user"
+        assert chat_template[1]["content"] == "You are a helpful assistant. Please answer this question: {{question}}"
+
+        # Verify the actual prompt content is captured in input messages
+        input_messages = call_args["meta"]["input"]["messages"]
+        assert len(input_messages) == 2
+        # Developer message
+        assert input_messages[0]["role"] == "developer"
+        assert input_messages[0]["content"] == "Direct & Conversational tone"
+        # User message with rendered variables
+        assert input_messages[1]["role"] == "user"
+        assert (
+            input_messages[1]["content"]
+            == "You are a helpful assistant. Please answer this question: What is machine learning?"
+        )
+

 @pytest.mark.parametrize(
     "ddtrace_global_config",
```

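The edge-case unit tests for `_extract_chat_template_from_instructions()` mentioned in the PR description are not shown in this view; a hypothetical sketch of the overlap case they are described as covering (again using dict-shaped instructions as a stand-in for response objects) might look like:

```python
def test_extract_chat_template_handles_overlapping_values():
    # "AI" is a substring of "AI is cool", so the longer value must be
    # replaced first for the right placeholder to survive.
    instructions = [{"role": "user", "content": [{"text": "AI is cool"}]}]
    variables = {"short": "AI", "long": "AI is cool"}
    result = _extract_chat_template_from_instructions(instructions, variables)
    assert result == [{"role": "user", "content": "{{long}}"}]
```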