# NOTE: earlier revisions imported PandasLLM from a separate module; it is now defined inline below.
1+ import pandas as pd
2+ import datetime
3+ import numpy as np
4+ import openai
5+ import os
6+ import re
7+ import json
8+
9+ # sandbox.py
10+ from RestrictedPython import compile_restricted
11+ from RestrictedPython .Guards import safe_builtins ,guarded_iter_unpack_sequence
12+ from RestrictedPython .Eval import default_guarded_getattr , default_guarded_getitem , default_guarded_getiter
13+ import pandas as pd
14+
class Sandbox:
    """Execute untrusted (LLM-generated) Python under RestrictedPython.

    Only modules explicitly whitelisted via :meth:`allow_import` are made
    available to the executed code, and attribute/item/iteration access is
    mediated by RestrictedPython's guard functions.
    """

    def __init__(self):
        # Maps module name -> imported module object for whitelisted imports.
        self._allowed_imports = {}

    def allow_import(self, module_name):
        """Whitelist *module_name* for use inside the sandbox.

        Best-effort: if the module is not installed it is silently skipped
        rather than raising, so callers can whitelist optional dependencies.
        """
        try:
            module = __import__(module_name)
            self._allowed_imports[module_name] = module
        except ImportError:
            pass

    def execute(self, code, local_vars=None):
        """Compile *code* with RestrictedPython and exec it.

        Args:
            code (str): Python source to run.
            local_vars (dict, optional): local namespace for the execution;
                mutated in place and returned. Defaults to a fresh dict
                (a mutable default argument here would leak state between
                calls).

        Returns:
            dict: the local namespace after execution (e.g. containing the
            ``result`` variable assigned by the executed code).
        """
        if local_vars is None:
            local_vars = {}

        # Copy safe_builtins: updating it in place (as the aliasing version
        # of this code did) would permanently mutate RestrictedPython's
        # shared module-level dict for every other user in the process.
        allowed_builtins = dict(safe_builtins)

        # Expose the guarded builtins plus the whitelisted imports as globals.
        restricted_globals = {"__builtins__": allowed_builtins}
        restricted_globals.update(self._allowed_imports)

        builtin_mappings = {
            "__import__": __import__,
            "_getattr_": default_guarded_getattr,
            "_getitem_": default_guarded_getitem,
            "_getiter_": default_guarded_getiter,
            "_iter_unpack_sequence_": guarded_iter_unpack_sequence,
            "list": list,
            "set": set,
            "pd": pd,
        }

        # Curated subset of pandas.Series methods exposed as plain names so
        # generated snippets can call them without attribute access.
        series_methods = [
            "sum", "mean", "any", "argmax", "argmin", "count", "cumsum",
            "cumprod", "diff", "dropna", "fillna", "head", "idxmax", "idxmin",
            "last", "max", "min", "notna", "prod", "quantile", "rename",
            "round", "tail", "to_frame", "to_list", "to_numpy", "to_string",
            "unique", "sort_index", "sort_values", "aggregate",
        ]
        builtin_mappings.update(
            {method: getattr(pd.Series, method) for method in series_methods}
        )

        allowed_builtins.update(builtin_mappings)

        byte_code = compile_restricted(source=code, filename='<inline>', mode='exec')

        # Execute the restricted bytecode in the guarded namespaces.
        exec(byte_code, restricted_globals, local_vars)

        return local_vars
61+
62+
class PandasLLM(pd.DataFrame):
    """
    PandasLLM is a subclass of the Pandas DataFrame class. It provides a
    natural-language wrapper around the OpenAI API: :meth:`prompt` sends a
    description of the frame plus the user's request to the LLM, extracts the
    returned Python snippet, and executes it (sandboxed) against the frame.
    """

    # Regexes tried in order to pull a code snippet out of the LLM reply.
    code_blocks = [r'```python(.*?)```', r'```(.*?)```']

    llm_default_model = "gpt-3.5-turbo"
    llm_default_temperature = 0.2
    llm_engine = "openai"
    # NOTE: shared class-level dict; treated as read-only (never mutated).
    llm_default_params = {"model": llm_default_model,
                          "temperature": llm_default_temperature}
    llm_api_key = None

    prompt_override = False
    custom_prompt = ""
    data_privacy = True
    path = None
    verbose = False
    code_block = ""
    force_sandbox = False

    def __init__(self,
                 data,
                 llm_engine: str = "openai", llm_params=llm_default_params,
                 prompt_override: bool = False,
                 custom_prompt: str = "",
                 path: str = None,
                 verbose: bool = False,
                 data_privacy: bool = True,
                 llm_api_key: str = None,
                 force_sandbox: bool = False,
                 *args, **kwargs):
        """
        Constructor for PandasLLM. Calls the DataFrame constructor with
        *data* and stores the LLM configuration on the instance.

        Args:
            data (pandas dataframe, mandatory): dataset to query.
            llm_engine (str, optional): LLM engine, currently only OpenAI is
                supported. Defaults to "openai".
            llm_params (dict, optional): LLM engine parameters. Defaults to
                model=gpt-3.5-turbo and temperature=0.2. Treated as
                read-only (the shared default dict is never mutated).
            prompt_override (bool, optional): if True, custom_prompt becomes
                the entire problem prompt. Defaults to False.
            custom_prompt (str, optional): if prompt_override is False, this
                is appended to the default prompt. Defaults to "".
            path (str, optional): directory where debug files are saved.
                Defaults to None (no files written).
            verbose (bool, optional): if True, debugging info is printed.
                Defaults to False.
            data_privacy (bool, optional): if True, data content is not sent
                to OpenAI. Defaults to True.
            llm_api_key (str, optional): the OpenAI API key; falls back to
                the OPENAI_API_KEY environment variable. Defaults to None.
            force_sandbox (bool, optional): if False and the sandbox fails,
                execution is retried with eval (less safe). Defaults to
                False.
        """
        super().__init__(data, *args, **kwargs)

        # API key from the argument or, failing that, the environment.
        self.llm_api_key = llm_api_key or os.environ.get("OPENAI_API_KEY")

        self.llm_engine = llm_engine
        # Single assignment (the original assigned this twice).
        self.llm_params = llm_params or {}
        self.model = self.llm_params.get("model", self.llm_default_model)
        self.temperature = self.llm_params.get("temperature", self.llm_default_temperature)

        self.prompt_override = prompt_override
        self.custom_prompt = custom_prompt

        self.data_privacy = data_privacy
        self.path = path
        self.verbose = verbose
        self.force_sandbox = force_sandbox

    def _buildPromptForRole(self):
        """Build the system prompt describing the frame's shape and dtypes."""
        prompt_role = f"""
I want you to act as a data scientist and Python coder. I want you code for me.
I have a dataset of {len(self)} rows and {len(self.columns)} columns.
Columns and their type are the following:
"""
        for col in self.columns:
            col_type = self.dtypes[col]
            prompt_role += f"{col} ({col_type})\n"

        return prompt_role

    def _buildPromptForProblemSolving(self, request):
        """Build the user prompt for *request*.

        If prompt_override is set, custom_prompt replaces the whole prompt;
        otherwise a non-empty custom_prompt is appended to the default one.
        """
        if self.prompt_override:
            return self.custom_prompt

        columns = ""
        for col in self.columns:
            col_type = self.dtypes[col]
            columns += f"{col} ({col_type})\n"

        prompt_problem = f"""
Given a DataFrame named 'df' of {len(self)} rows and {len(self.columns)} columns,
Its columns are the following:

{columns}

I want you to solve the following problem:
write a Python code snippet that addresses the following request:
{request}

While crafting the code, please follow these guidelines:
1. When comparing or searching for strings, use lower case letters, ignore case sensitivity, and apply a "contains" search.
2. Ensure that the answer is a single line of code without explanations, comments, or additional details.
3. If a single line solution is not possible, multiline solutions or functions are acceptable, but the code must end with an assignment to the variable 'result'.
4. Assign the resulting code to the variable 'result'.
5. Avoid importing any additional libraries than pandas and numpy.

"""
        # `not x is None` anti-idiom replaced; empty strings are also skipped.
        if self.custom_prompt is not None and len(self.custom_prompt) > 0:
            prompt_problem += f"""
Also:
{self.custom_prompt}
"""

        return prompt_problem

    def _extractPythonCode(self, text: str, regexp: str) -> str:
        """Return the first code block in *text* matching *regexp*, or ""."""
        # DOTALL so the fenced block may span multiple lines.
        match = re.search(regexp, text, re.DOTALL)
        if match:
            return match.group(1).strip()
        return ""

    def _print(self, *args, **kwargs):
        """print() only when verbose mode is on."""
        if self.verbose:
            print(*args, **kwargs)

    def _save(self, name, value):
        """Best-effort write of *value* to <path>/<name>; no-op without path."""
        if self.path is None or self.path == "":
            return
        try:
            with open(f"{self.path}/{name}", 'w') as file:
                file.write(value)
        except Exception as e:
            # Debug artifacts are optional; never let them break a query.
            self._print(f"error {e}")
            return

    def _execInSandbox(self, df, generated_code: str):
        """Run *generated_code* in a RestrictedPython sandbox with df bound.

        Returns whatever the snippet assigned to the 'result' variable
        (None if it assigned nothing).
        """
        # Create a Sandbox instance and allow pandas/numpy to be imported.
        sandbox = Sandbox()
        sandbox.allow_import("pandas")
        sandbox.allow_import("numpy")

        # Preamble so the generated snippet can use the usual aliases.
        initial_code = """
import pandas as pd
import datetime
from pandas import Timestamp
import numpy as np

"""
        full_code = initial_code + "\n" + generated_code

        self._save("temp/prompt_code.py", full_code)
        # Execute the combined code in the Sandbox with the frame as 'df'.
        sandbox_result = sandbox.execute(full_code, {"df": df})

        return sandbox_result.get("result")

    def prompt(self, request: str):
        """Answer *request* by generating and executing pandas code.

        Args:
            request (str): the question or problem to solve, in plain
                language.

        Returns:
            Any: the result of the generated code — typically a DataFrame, a
            Series or a scalar; None if no usable code was produced; the
            string "Please try later" if the API could not be reached.
        """
        openai.api_key = self.llm_api_key

        messages = [
            {"role": "system",
             "content": self._buildPromptForRole()},
            {"role": "user",
             "content": self._buildPromptForProblemSolving(request)},
        ]

        # Retry transient API failures up to 3 times.
        response = None
        for _ in range(3):
            try:
                response = openai.ChatCompletion.create(
                    model=self.model,
                    temperature=self.temperature,
                    messages=messages,
                )
                break
            except Exception as e:
                self._print(f"error {e}")
                continue

        if response is None:
            return "Please try later"

        self._save("temp/prompt_cmd.json", json.dumps(messages, indent=4))

        generated_code = response.choices[0].message.content
        if not generated_code:
            self.code_block = ""
            return None

        self.code_block = generated_code

        # Candidate snippets: each fenced block that matched, then the raw
        # reply as a last resort (the model may omit code fences).
        results = []
        for regexp in self.code_blocks:
            cleaned_code = self._extractPythonCode(generated_code, regexp)
            if not cleaned_code:
                continue
            results.append(cleaned_code)
        results.append(generated_code)

        if len(results) == 0:
            return None

        result = None
        for cleaned_code in results:
            try:
                # BUG FIX: this previously called self.execInSandbox (no such
                # attribute), so the sandbox always raised AttributeError and
                # every snippet fell through to the eval fallback below.
                result = self._execInSandbox(self, cleaned_code)
            except Exception as e:
                self._print(f"error {e}")
                if not self.force_sandbox:
                    # SECURITY: eval of LLM-generated code outside the
                    # sandbox; only reached when force_sandbox is False.
                    try:
                        expression = re.sub(r"^\s*result\s*=", "", cleaned_code).strip()
                        result = eval(expression, {'df': self, 'pd': pd, 'np': np,
                                                   'datetime': datetime, 'result': result})
                    except Exception as e:
                        self._print(f"error {e}")

            # First non-empty result wins.
            if result is not None and str(result) != "":
                break

        if self.data_privacy:
            # Non-formatted result.
            return result

        # Currently the privacy option changes nothing; in the future data
        # could be sent to the LLM for formatting when privacy is False.
        return result
358+
359+
# (end of file)