Source code for handyspark.sql.pandas
from handyspark.sql.datetime import HandyDatetime
from handyspark.sql.string import HandyString
from handyspark.sql.transform import HandyTransform
from handyspark.util import check_columns
import pandas as pd
class HandyPandas(object):
    """Column-level accessor exposing selected pandas ``Series`` methods.

    A column is chosen with item access (``obj['col']``); afterwards any
    method listed in the supported table can be requested as an attribute.
    The attribute resolves to a function that, when called, hands a
    per-column callable to ``HandyTransform.gen_pandas_udf`` together with
    the selected column name and a return type.

    Parameters
    ----------
    df
        The dataframe wrapper this accessor operates on. It must provide
        ``notHandy()`` (used to look up the column's dtype).
    """

    # Supported Series methods, grouped by the return type of the generated
    # UDF: 'boolean' methods return booleans, 'same' methods keep the dtype
    # of the selected column.
    __supported = {
        'boolean': ['between', 'between_time', 'isin', 'isna', 'isnull', 'notna', 'notnull'],
        'same': ['abs', 'clip', 'clip_lower', 'clip_upper', 'replace', 'round', 'truncate',
                 'tz_convert', 'tz_localize'],
    }
    # NOTE(review): not referenced anywhere in this class; presumably a list
    # of methods that need the whole series at once -- confirm before removing.
    __as_series = ['rank', 'interpolate', 'pct_change', 'bfill', 'cummax', 'cummin', 'cumprod', 'cumsum', 'diff',
                   'ffill', 'fillna', 'shift']
    # Alphabetical list of every dynamically available method name.
    __available = sorted(__supported['boolean'] + __supported['same'])
    # Reverse lookup: method name -> return-type group ('boolean' or 'same').
    __types = {method: kind for kind, methods in __supported.items() for method in methods}

    def __init__(self, df):
        self._df = df
        # No column is selected until __getitem__ is used.
        self._colname = None

    def __getitem__(self, *args):
        """Select the column subsequent method calls will operate on.

        If a tuple is given, only its first element is used. The name is
        passed to ``check_columns`` (presumably validating it against the
        dataframe -- confirm in ``handyspark.util``) before being stored.

        Returns
        -------
        HandyPandas
            ``self``, so calls can be chained.
        """
        key = args[0]
        if isinstance(key, tuple):
            key = key[0]
        check_columns(self._df, key)
        self._colname = key
        return self

    @property
    def str(self):
        """Returns a class to access pandas-like string column based methods through pandas UDFs

        Available methods:
        - contains
        - startswith / endswith
        - match
        - isalpha / isnumeric / isalnum / isdigit / isdecimal / isspace
        - islower / isupper / istitle
        - replace
        - repeat
        - join
        - pad
        - slice / slice_replace
        - strip / lstrip / rstrip
        - wrap / center / ljust / rjust
        - translate
        - get
        - normalize
        - lower / upper / capitalize / swapcase / title
        - zfill
        - count
        - find / rfind
        - len
        """
        return HandyString(self._df, self._colname)

    @property
    def dt(self):
        """Returns a class to access pandas-like datetime column based methods through pandas UDFs

        Available methods:
        - is_leap_year / is_month_end / is_month_start / is_quarter_end / is_quarter_start / is_year_end / is_year_start
        - strftime
        - tz / time / tz_convert / tz_localize
        - day / dayofweek / dayofyear / days_in_month / daysinmonth
        - hour / microsecond / minute / nanosecond / second
        - week / weekday / weekday_name
        - month / quarter / year / weekofyear
        - date
        - ceil / floor / round
        - normalize
        """
        return HandyDatetime(self._df, self._colname)

    @staticmethod
    def _series_method(name, args, kwargs):
        """Build the one-argument callable that applies method `name` to a column.

        The returned function takes the column object and calls
        ``col.<name>(*args, **kwargs)`` on it.
        """
        return lambda col: getattr(col, name)(*args, **kwargs)

    def __getattribute__(self, name):
        """Resolve regular attributes first, then the supported pandas methods.

        Returns
        -------
        The regular attribute if it exists; otherwise, for a supported
        method name, a function accepting the pandas method's positional
        and keyword arguments and returning the result of
        ``HandyTransform.gen_pandas_udf``.

        Raises
        ------
        AttributeError
            If `name` is neither a regular attribute nor a supported method.
        """
        try:
            return object.__getattribute__(self, name)
        except AttributeError:
            if name not in self.__available:
                # Unknown name: propagate the original error untouched.
                raise

        def wrapper(*args, **kwargs):
            return_type = self.__types.get(name, 'string')
            if return_type == 'same':
                # Reuse the dtype string of the selected column.
                return_type = self._df.notHandy().select(self._colname).dtypes[0][1]
            # Positional arguments are forwarded as well as keyword ones, so
            # calls such as ``.between(1, 5)`` reach the pandas method intact.
            return HandyTransform.gen_pandas_udf(f=self._series_method(name, args, kwargs),
                                                 args=(self._colname,),
                                                 returnType=return_type)

        # Borrow the pandas docstring when the installed pandas still has the
        # method; some supported names (e.g. clip_lower / clip_upper) were
        # removed from newer pandas releases and must not break the lookup.
        series_method = getattr(pd.Series, name, None)
        if series_method is not None:
            wrapper.__doc__ = series_method.__doc__
        return wrapper