11import pandas as pd
22import datetime
33import numpy as np
4- import time
54import openai
65import os
76from sandbox import Sandbox
87import re
98import json
109
1110class PandasLLM (pd .DataFrame ):
11+ """
12+ PandasLLM is a subclass of the Pandas DataFrame class. It is designed to provide a
13+ wrapper around the OpenAI API.
14+ """
1215
1316 code_blocks = [r'```python(.*?)```' ,r'```(.*?)```' ]
1417
@@ -17,31 +20,63 @@ class PandasLLM(pd.DataFrame):
1720 llm_engine = "openai"
1821 llm_default_params = { "model" : llm_default_model ,
1922 "temperature" : llm_default_temperature }
20- openai_api_key = None
23+ llm_api_key = None
2124
2225 prompt_override = False
2326 custom_prompt = ""
2427 data_privacy = True
2528 path = None
2629 verbose = False
2730 code_block = ""
31+ force_sandbox = False
2832 def __init__ (self ,
29- data = None ,
30- llm_engine = "openai" , llm_params = llm_default_params ,
31- prompt_override = False ,
32- custom_prompt = "" ,
33- path = None ,
34- verbose = False ,
35- data_privacy = True ,
36- openai_api_key = None ,
33+ data ,
34+ llm_engine :str = "openai" , llm_params = llm_default_params ,
35+ prompt_override :bool = False ,
36+ custom_prompt :str = "" ,
37+ path :str = None ,
38+ verbose :bool = False ,
39+ data_privacy :bool = True ,
40+ llm_api_key :str = None ,
41+ force_sandbox :bool = False ,
3742 * args , ** kwargs ):
43+ """
44+ This is the constructor for the PandasLLM class. It takes in the following arguments:
45+ data: The data to be used. It can be a Pandas DataFrame, a list of lists, a list of tuples,
46+ a list of dictionaries, a dictionary, a string, or a list.
47+ llm_engine: The name of the OpenAI engine to use.
48+ llm_params: A dictionary of parameters to be used with the OpenAI API.
49+ prompt_override: A boolean that determines whether or not the prompt is overridden.
50+ custom_prompt: A string that overrides the prompt.
51+ path: The path to the file to be used.
52+ verbose: A boolean that determines whether or not the output is verbose.
53+ data_privacy: A boolean that determines whether or not the data is private.
54+ llm_api_key: The OpenAI API key to be used.
55+ force_sandbox: if False and the sandbox fails, it will retry using eval (less safe)
56+
57+ The constructor also calls the parent class's constructor.
58+
3859
60+ Args:
61+ data (pandas dataframe, mandatory): dataset to query. Required; it has no default.
62+ llm_engine (str, optional): LLM engine, currently only OpenAI is supported. Defaults to "openai".
63+ llm_params (dict, optional): LLM engine parameters. Defaults to model=gpt-3.5-turbo and temperature=0.2.
64+ prompt_override (bool, optional): if True, the custom prompt is mandatory and it will become the main prompt. Defaults to False.
65+ custom_prompt (str, optional): if prompt_override is False, the custom prompt will be added to the default pandas_llm prompt. Defaults to "".
66+ path (str, optional): the path where the files containing debug data will be saved. Defaults to None.
67+ verbose (bool, optional): if True debugging info will be printed. Defaults to False.
68+ data_privacy (bool, optional): if True, the function will not send the data content to OpenAI. Defaults to True.
69+ llm_api_key (str, optional): the OpenAI API key. Defaults to None.
70+ force_sandbox (bool, optional): if False and the sandbox fails, it will retry using eval (less safe). Defaults to False.
71+ """
72+
73+
3974 super ().__init__ (data , * args , ** kwargs )
4075
4176 self .llm_params = llm_params or {}
4277
4378 # Set up OpenAI API key from the environment or the config
44- self .openai_api_key = openai_api_key or os .environ .get ("OPENAI_API_KEY" )
79+ self .llm_api_key = llm_api_key or os .environ .get ("OPENAI_API_KEY" )
4580
4681 self .llm_engine = llm_engine
4782 self .llm_params = llm_params or {}
@@ -54,8 +89,9 @@ def __init__(self,
5489 self .data_privacy = data_privacy
5590 self .path = path
5691 self .verbose = verbose
92+ self .force_sandbox = force_sandbox
5793
58- def buildPromptForRole (self ):
94+ def _buildPromptForRole (self ):
5995 prompt_role = f"""
6096I want you to act as a data scientist and Python coder. I want you code for me.
6197I have a dataset of { len (self )} rows and { len (self .columns )} columns.
@@ -68,7 +104,7 @@ def buildPromptForRole(self):
68104
69105 return prompt_role
70106
71- def buildPromptForProblemSolving (self , request ):
107+ def _buildPromptForProblemSolving (self , request ):
72108
73109 if self .prompt_override :
74110 return self .custom_prompt
@@ -105,7 +141,7 @@ def buildPromptForProblemSolving(self, request):
105141
106142 return prompt_problem
107143
108- def extractPythonCode (self , text : str , regexp : str ) -> str :
144+ def _extractPythonCode (self , text : str , regexp : str ) -> str :
109145 # Define the regular expression pattern for the Python code block
110146 pattern = regexp
111147
@@ -119,44 +155,44 @@ def extractPythonCode(self, text: str, regexp: str) -> str:
119155 # If no match is found, return an empty string
120156 return ""
121157
def _print(self, *args, **kwargs):
    """Forward the arguments to the built-in ``print`` when verbose mode is on.

    Args:
        *args: Positional arguments passed through to ``print`` unchanged.
        **kwargs: Keyword arguments passed through to ``print`` unchanged.

    Returns:
        None. Nothing is printed when ``self.verbose`` is falsy.
    """
    if not self.verbose:
        return
    print(*args, **kwargs)
125161
126- def variable_to_string (self , variable ):
127- if variable is None : return None
128- try :
129-
130- if isinstance (variable , pd .Series ):
131- # convert to dataframe
132- variable = variable .to_frame ()
133-
134- if isinstance (variable , pd .DataFrame ):
135- variable = variable .drop_duplicates ()
136- if len (variable ) == 0 : return None
137- return str (variable )
138-
139- elif isinstance (variable , np .ndarray ):
140- if len (variable ) == 0 : return None
141- return np .array2string (variable )
142- else :
143- # Convert the variable to a string
144- return str (variable )
145- except Exception as e :
146- return str (variable )
162+ # def _variable_to_string (self, variable):
163+ # if variable is None: return None
164+ # try:
165+
166+ # if isinstance(variable, pd.Series):
167+ # # convert to dataframe
168+ # variable = variable.to_frame()
169+
170+ # if isinstance(variable, pd.DataFrame):
171+ # variable = variable.drop_duplicates()
172+ # if len(variable) == 0: return None
173+ # return str(variable)
174+
175+ # elif isinstance(variable, np.ndarray):
176+ # if len(variable) == 0: return None
177+ # return np.array2string(variable)
178+ # else:
179+ # # Convert the variable to a string
180+ # return str(variable)
181+ # except Exception as e:
182+ # return str(variable)
147183
148184
def _save(self, name, value):
    """Write ``value`` to ``<self.path>/<name>`` as a best-effort debug dump.

    Args:
        name: File name relative to ``self.path``. It may contain
            sub-directories; callers in this class pass names such as
            ``"temp/prompt_code.py"``.
        value: Text to write to the file.

    Returns:
        None. Nothing is written when ``self.path`` is ``None`` or empty.
        Any failure is reported through ``self._print`` instead of being
        raised, so a failed dump never interrupts the caller.
    """
    if not self.path:
        return

    target = f"{self.path}/{name}"
    try:
        # ``name`` can include a sub-directory (e.g. "temp/..."); create it
        # so the dump is not silently lost when the folder does not exist.
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)
        # Explicit UTF-8 so generated code containing non-ASCII characters
        # is written regardless of the platform's default encoding.
        with open(target, 'w', encoding="utf-8") as file:
            file.write(value)
    except Exception as e:
        # Deliberately broad: saving debug artefacts is optional.
        self._print(f"error {e}")
    return
158194
159- def execInSandbox (self , df , generated_code :str ):
195+ def _execInSandbox (self , df , generated_code :str ):
160196
161197 # Create a Sandbox instance and allow pandas to be imported
162198 sandbox = Sandbox ()
@@ -175,25 +211,32 @@ def execInSandbox(self, df, generated_code:str):
175211 # Combine the initial code and the generated code
176212 full_code = initial_code + "\n " + generated_code
177213
178- self .save ("temp/prompt_code.py" ,full_code )
214+ self ._save ("temp/prompt_code.py" ,full_code )
179215 # Execute the combined code in the Sandbox
180216 sandbox_result = sandbox .execute (full_code , {"df" :df })
181217
182218 # Get the result from the local_vars dictionary
183219 result = sandbox_result .get ("result" )
184220 return result
185221
222+ def prompt (self , request : str ):
223+ """
186224
225+ Args:
226+ request (str): prompt containing the request. it must be expressed as a question or a problem to solve
187227
188- def prompt (self , request : str ):
228+ Returns:
229+ Any: contains the result or solution of the problem. Typically the result data type is a dataframe, a Series or a float
230+ """
231+
189232 # Set up OpenAI API key
190- openai .api_key = self .openai_api_key
233+ openai .api_key = self .llm_api_key
191234
192235 messages = [
193236 {"role" : "system" ,
194- "content" : self .buildPromptForRole ()},
237+ "content" : self ._buildPromptForRole ()},
195238 {"role" : "user" ,
196- "content" : self .buildPromptForProblemSolving (request )
239+ "content" : self ._buildPromptForProblemSolving (request )
197240 }
198241 ]
199242
@@ -207,13 +250,13 @@ def prompt(self, request: str):
207250 )
208251 break ;
209252 except Exception as e :
210- self .print (f"error { e } " )
253+ self ._print (f"error { e } " )
211254 continue
212255
213256 if response is None :
214257 return "Please try later"
215258
216- self .save ("temp/prompt_cmd.json" ,json .dumps (messages , indent = 4 ))
259+ self ._save ("temp/prompt_cmd.json" ,json .dumps (messages , indent = 4 ))
217260
218261 generated_code = response .choices [0 ].message .content
219262 if generated_code == "" or generated_code is None :
@@ -224,7 +267,7 @@ def prompt(self, request: str):
224267
225268 results = []
226269 for regexp in self .code_blocks :
227- cleaned_code = self .extractPythonCode (generated_code ,regexp )
270+ cleaned_code = self ._extractPythonCode (generated_code ,regexp )
228271 if cleaned_code == "" or cleaned_code is None :
229272 continue
230273 results .append (cleaned_code )
@@ -233,19 +276,20 @@ def prompt(self, request: str):
233276 if len (results ) == 0 :
234277 return None
235278
279+ result = None
236280 for cleaned_code in results :
237-
238- result = None
281+
239282 try :
# The sandbox runner is named `_execInSandbox`; the old public name no longer
# exists, so calling it raised AttributeError and the sandbox was never used.
result = self._execInSandbox(self, cleaned_code)
241284 except Exception as e :
242- self .print (f"error { e } " )
243- try :
244- expression = re .sub (r"^\s*result\s*=" , "" , cleaned_code ).strip ()
245- result = eval (expression , {'df' : self , 'pd' : pd , 'np' : np , 'datetime' : datetime , 'result' : result })
246- except Exception as e :
247- self .print (f"error { e } " )
248- pass
285+ self ._print (f"error { e } " )
286+ if not self .force_sandbox :
287+ try :
288+ expression = re .sub (r"^\s*result\s*=" , "" , cleaned_code ).strip ()
289+ result = eval (expression , {'df' : self , 'pd' : pd , 'np' : np , 'datetime' : datetime , 'result' : result })
290+ except Exception as e :
291+ self ._print (f"error { e } " )
292+ pass
249293
250294 if result is not None and str (result ) != "" :
251295 break
@@ -257,6 +301,6 @@ def prompt(self, request: str):
257301 # currently the privacy option is not needed.
258302 # in the future, we can choose to send data to LLM if privacy is set to false
259303
260- return None
304+ return result
261305
262306
0 commit comments