 
 This code is experimental and both the APIs and the generated code are liable to change in future versions.
 """
-from pyspark.sql.types import LongType, FloatType, IntegerType, StringType, DoubleType, BooleanType, ShortType, \
-    TimestampType, DateType, DecimalType, ByteType, BinaryType, StructType, ArrayType, DataType
+import logging
 
 import pyspark.sql as ssql
-import pyspark.sql.functions as F
+from pyspark.sql.types import LongType, FloatType, IntegerType, StringType, DoubleType, BooleanType, ShortType, \
+    TimestampType, DateType, DecimalType, ByteType, BinaryType, StructType, ArrayType, DataType
 
-from .utils import strip_margins
 from .spark_singleton import SparkSingleton
+from .utils import strip_margins
+
+SUMMARY_FIELD_NAME = "summary"
+SUMMARY_FIELD_NAME_RENAMED = "__summary__"
+DATA_SUMMARY_FIELD_NAME = "__data_summary__"
 
 
 class DataAnalyzer:
@@ -23,6 +27,8 @@ class DataAnalyzer:
 
     :param df: Spark dataframe to analyze
     :param sparkSession: Spark session instance to use when performing spark operations
+    :param debug: If True, additional debug information is logged
+    :param verbose: If True, additional information is logged
 
     .. warning::
        Experimental
@@ -43,11 +49,19 @@ class DataAnalyzer:
     |# Column definitions are stubs only - modify to generate correct data
     |#""", '|')
 
-    def __init__(self, df=None, sparkSession=None):
+    def __init__(self, df=None, sparkSession=None, debug=False, verbose=False):
         """ Constructor:
             :param df: Dataframe to analyze
             :param sparkSession: Spark session to use
+            :param debug: If True, additional debug information is logged
+            :param verbose: If True, additional information is logged
         """
+        # set up logging
+        self.verbose = verbose
+        self.debug = debug
+
+        self._setupLogger()
+
         assert df is not None, "dataframe must be supplied"
 
         self._df = df
@@ -58,6 +70,19 @@ def __init__(self, df=None, sparkSession=None):
         self._sparkSession = sparkSession
         self._dataSummary = None
 
+    def _setupLogger(self):
+        """Set up logging
+
+        This sets the logger to the warning, info or debug level depending on the instance construction parameters
+        """
+        self.logger = logging.getLogger("DataAnalyzer")
+        if self.debug:
+            self.logger.setLevel(logging.DEBUG)
+        elif self.verbose:
+            self.logger.setLevel(logging.INFO)
+        else:
+            self.logger.setLevel(logging.WARNING)
+
     def _displayRow(self, row):
         """Display details for row"""
         results = []
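Reviewer's note: the mapping from constructor flags to log levels can be exercised directly. A minimal sketch, where `some_df` and `spark` are placeholders for an existing dataframe and session, not part of this change:

```python
import logging

logging.basicConfig()  # attach a handler so log records are actually printed

# debug takes precedence over verbose; with neither flag, only warnings are emitted
DataAnalyzer(df=some_df, sparkSession=spark, debug=True)    # logger at DEBUG
DataAnalyzer(df=some_df, sparkSession=spark, verbose=True)  # logger at INFO
DataAnalyzer(df=some_df, sparkSession=spark)                # logger at WARNING
```

Since `logging.getLogger("DataAnalyzer")` returns one shared logger, the most recently constructed instance determines the effective level for all instances.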
@@ -95,6 +120,31 @@ def _addMeasureToSummary(self, measureName, summaryExpr="''", fieldExprs=None, d
 
         return dfResult
 
+    def _get_dataframe_describe_stats(self, df):
+        """ Get summary statistics for the dataframe, handling renaming of the `summary` field if necessary"""
+        self.logger.debug("schema: %s", df.schema)
+
+        src_fields = [fld.name for fld in df.schema.fields]
+        self.logger.debug("src_fields: %s", src_fields)
+        renamed_summary = False
+
+        # get summary statistics handling the case where a field named 'summary' exists
+        # if the `summary` field name exists, we'll rename it to avoid a conflict
+        if SUMMARY_FIELD_NAME in src_fields:
+            renamed_summary = True
+            df = df.withColumnRenamed(SUMMARY_FIELD_NAME, SUMMARY_FIELD_NAME_RENAMED)
+
+        # The dataframe describe method produces a field named `summary`. We'll rename this to avoid conflict with
+        # any natural fields using the same name.
+        summary_df = df.describe().withColumnRenamed(SUMMARY_FIELD_NAME, DATA_SUMMARY_FIELD_NAME)
+
+        # if we renamed a field called `summary` in the data, we'll rename it back.
+        # The data summary field produced by the describe method has already been renamed, so there will be no conflict.
+        if renamed_summary:
+            summary_df = summary_df.withColumnRenamed(SUMMARY_FIELD_NAME_RENAMED, SUMMARY_FIELD_NAME)
+
+        return summary_df
+
     def summarizeToDF(self):
         """ Generate summary analysis of data set as dataframe
 
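To see the conflict this helper guards against, consider a source column literally named `summary`: `DataFrame.describe()` emits its own stat column also called `summary`, producing duplicate column names. A minimal sketch, assuming a local SparkSession (`spark` is a placeholder):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "summary"])

# the result has two columns with the same name, which makes selecting
# or collecting values by name ambiguous
print(df.describe().columns)  # ['summary', 'id', 'summary']
```

Renaming the source column away before calling `describe()`, then renaming the stat column to `__data_summary__`, keeps every column name unique.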
@@ -154,11 +204,12 @@ def summarizeToDF(self):
                                                  dfData=self._df,
                                                  dfSummary=dfDataSummary)
 
-        descriptionDf = self._df.describe().where("summary in ('mean', 'stddev')")
+        descriptionDf = (self._get_dataframe_describe_stats(self._df)
+                         .where(f"{DATA_SUMMARY_FIELD_NAME} in ('mean', 'stddev')"))
         describeData = descriptionDf.collect()
 
         for row in describeData:
-            measure = row['summary']
+            measure = row[DATA_SUMMARY_FIELD_NAME]
 
             values = {k[0]: '' for k in dtypes}
@@ -401,7 +452,12 @@ def scriptDataGeneratorFromData(self, suppressOutput=False, name=None):
 
         """
         assert self._df is not None
-        assert type(self._df) is ssql.DataFrame, "sourceDf must be a valid Pyspark dataframe"
+
+        if not isinstance(self._df, ssql.DataFrame):
+            self.logger.warning(strip_margins(
+                """The parameter `sourceDf` should be a valid Pyspark dataframe.
+                   |Note this warning may be a false alarm when using a remote connection to a Spark cluster""",
+                '|'))
 
         if self._dataSummary is None:
             df_summary = self.summarizeToDF()
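The downgrade from a hard assert to a warning matters for Spark Connect: on Spark 3.4/3.5, a remote session yields `pyspark.sql.connect.dataframe.DataFrame` objects, which are not instances of the classic `pyspark.sql.DataFrame`. A hedged sketch (the `sc://localhost` URL is a placeholder for a real Spark Connect endpoint):

```python
from pyspark.sql import SparkSession, DataFrame

spark = SparkSession.builder.remote("sc://localhost").getOrCreate()
df = spark.range(10)

# False on Spark 3.4/3.5: df is a pyspark.sql.connect.dataframe.DataFrame,
# so a strict type check would wrongly reject a perfectly valid dataframe
print(isinstance(df, DataFrame))
```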