Source code for handyspark.sql.schema

import numpy as np
import datetime
from pyspark.sql.types import StructType

_mapping = {str: 'string',
            bool: 'boolean',
            int: 'integer',
            float: 'float',
            datetime.date: 'date',
            datetime.datetime: 'timestamp',
            np.bool_: 'boolean',  # np.bool was an alias of bool, removed in NumPy 1.24
            np.int8: 'byte',
            np.int16: 'short',
            np.int32: 'integer',
            np.int64: 'long',
            np.float32: 'float',
            np.float64: 'double',
            np.ndarray: 'array',
            object: 'string',
            list: 'array',
            tuple: 'array',
            dict: 'map'}
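
For illustration only (this snippet is not part of the original module, and it imports the private _mapping name purely for demonstration), the mapping can be queried directly with either a Python or a numpy type. Note that a numpy dtype compares equal to its scalar type, which generate_schema below relies on when matching dtypes against these keys:

import numpy as np
from handyspark.sql.schema import _mapping

print(_mapping[np.float64])           # 'double'
print(_mapping[bool])                 # 'boolean'
print(np.dtype('int64') == np.int64)  # True: a dtype equals its scalar type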

def generate_schema(colnames, coltypes, nullables=None):
    """Generates a Spark DataFrame schema from Python/numpy types.

    Parameters
    ----------
    colnames: list of string
        Column names.
    coltypes: list of type
        Python or numpy type of each column.
    nullables: list of boolean, optional
        Nullability of each column; defaults to True for every column.

    Returns
    -------
    schema: StructType
        Spark DataFrame schema corresponding to the Python/numpy types.
    """
    assert len(colnames) == len(coltypes), "You must specify types for all columns."
    keys = list(_mapping.keys())
    invalid_types = []
    new_types = []
    for coltype in coltypes:
        if coltype not in keys:
            invalid_types.append(coltype)
        elif coltype == np.dtype('O'):
            # Object dtypes carry no element type information, so fall back to string.
            new_types.append(str)
        else:
            # A numpy dtype compares equal to its scalar type (e.g. np.dtype('int64') == np.int64),
            # so take the matching key itself, which is hashable and present in _mapping.
            new_types.append(keys[keys.index(coltype)])
    assert len(invalid_types) == 0, "Invalid type(s) specified: {}".format(invalid_types)
    if nullables is None:
        nullables = [True] * len(colnames)
    fields = [{"metadata": {}, "name": name, "nullable": nullable, "type": _mapping[typ]}
              for name, typ, nullable in zip(colnames, new_types, nullables)]
    return StructType.fromJson({"type": "struct", "fields": fields})
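
A minimal usage sketch (the column names and types below are made up for illustration, not taken from the original docs):

import numpy as np
from handyspark.sql.schema import generate_schema

# Build a schema for three hypothetical columns.
schema = generate_schema(colnames=['name', 'age', 'score'],
                         coltypes=[str, np.int64, np.float64],
                         nullables=[True, False, True])
# Yields a StructType with a StringType, a non-nullable LongType,
# and a DoubleType field, ready to pass to spark.createDataFrame.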