Source code for handyspark.sql.string

from handyspark.sql.transform import HandyTransform
import unicodedata
import pandas as pd

[docs]class HandyString(object): __supported = {'boolean': ['contains', 'startswith', 'endswith', 'match', 'isalpha', 'isnumeric', 'isalnum', 'isdigit', 'isdecimal', 'isspace', 'islower', 'isupper', 'istitle'], 'string': ['replace', 'repeat', 'join', 'pad', 'slice', 'slice_replace', 'strip', 'wrap', 'translate', 'get', 'center', 'ljust', 'rjust', 'zfill', 'lstrip', 'rstrip', 'normalize', 'lower', 'upper', 'title', 'capitalize', 'swapcase'], 'integer': ['count', 'find', 'len', 'rfind']} __unsupported = ['cat', 'extract', 'extractall', 'get_dummies', 'findall', 'index', 'split', 'rsplit', 'partition', 'rpartition', 'rindex', 'decode', 'encode'] __available = sorted(__supported['boolean'] + __supported['string'] + __supported['integer']) __types = {n: t for t, v in __supported.items() for n in v} _colname = None def __init__(self, df, colname): self._df = df self._colname = colname @staticmethod def _remove_accents(input): return unicodedata.normalize('NFKD', input).encode('ASCII', 'ignore').decode('unicode_escape')
[docs] def remove_accents(self): return HandyTransform.gen_pandas_udf(f=lambda col: col.apply(HandyString._remove_accents), args=(self._colname,), returnType='string')
def __getattribute__(self, name): try: attr = object.__getattribute__(self, name) return attr except AttributeError as e: if name in self.__available: def wrapper(*args, **kwargs): return HandyTransform.gen_pandas_udf(f=lambda col: col.str.__getattribute__(name)(**kwargs), args=(self._colname,), returnType=self.__types.get(name, 'string')) wrapper.__doc__ = getattr(pd.Series.str, name).__doc__ return wrapper else: raise e