Spark SQL - Data Types Reference

• Spark SQL supports 20+ data types organized into numeric, string, binary, boolean, datetime, and complex categories, with specific handling for nullable values and schema evolution

Key Insights

• Spark SQL supports 20+ data types organized into numeric, string, binary, boolean, datetime, and complex categories, with specific handling for nullable values and schema evolution
• Complex types (Array, Map, Struct) enable hierarchical data processing without flattening, critical for JSON/nested data scenarios with 3-5x performance improvements over traditional approaches
• Type casting and precision management differ significantly from traditional SQL databases—implicit conversions follow strict rules while explicit casts require careful handling to avoid runtime failures

Primitive Data Types

Spark SQL’s primitive types form the foundation of schema definitions. Understanding their storage characteristics and boundaries prevents data truncation and overflow issues.

# Session and type imports used throughout the examples below.
from pyspark.sql.types import *
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("datatypes").getOrCreate()

# Fixed-width numeric columns; the comment beside each entry is the value range.
_numeric_fields = [
    ("byte_col", ByteType()),      # -128 to 127
    ("short_col", ShortType()),    # -32768 to 32767
    ("int_col", IntegerType()),    # -2147483648 to 2147483647
    ("long_col", LongType()),      # -9223372036854775808 to 9223372036854775807
    ("float_col", FloatType()),    # 4-byte single precision
    ("double_col", DoubleType()),  # 8-byte double precision
]
schema = StructType(
    [StructField(name, dtype, True) for name, dtype in _numeric_fields]
)

# One row whose values sit comfortably inside each type's range.
sample_row = (100, 30000, 2000000000, 9000000000000000000, 3.14, 2.718281828459045)
df = spark.createDataFrame([sample_row], schema)
df.printSchema()

String and binary types handle variable-length data:

# Variable-length text, raw bytes, and a boolean flag in one schema.
string_schema = (
    StructType()
    .add("string_col", StringType(), True)   # UTF-8 text
    .add("binary_col", BinaryType(), True)   # raw byte sequence
    .add("bool_col", BooleanType(), True)
)

sample_rows = [("Hello Spark", bytearray(b'\x00\x01\x02'), True)]
df = spark.createDataFrame(sample_rows, string_schema)

Decimal Precision and Scale

DecimalType requires explicit precision and scale parameters. Default precision is 10 with scale 0, but production systems need careful tuning based on business requirements.

from decimal import Decimal

# DecimalType(precision, scale):
#   precision = total number of significant digits
#   scale     = digits to the right of the decimal point
financial_schema = (
    StructType()
    .add("price", DecimalType(18, 4), True)     # 14 integer digits + 4 fractional
    .add("quantity", DecimalType(10, 2), True)
    .add("total", DecimalType(28, 6), True)     # wide enough to hold products
)

rows = [
    (Decimal("1234567890.1234"), Decimal("100.50"), Decimal("124124124124.123456"))
]
df = spark.createDataFrame(rows, financial_schema)

# Decimal arithmetic widens the result type; cast explicitly when an exact
# target precision/scale is required downstream.
projection = [
    "price * quantity as calculated_total",
    "CAST(price * quantity AS DECIMAL(28,6)) as precise_total",
]
df.selectExpr(*projection).show()

Exceeding precision limits causes runtime errors or silent truncation depending on Spark configuration:

# What happens on precision overflow is governed by spark.sql.ansi.enabled:
# strict (ANSI) mode raises an error; legacy mode may silently truncate.
spark.conf.set("spark.sql.ansi.enabled", "true")  # Strict mode - throws error

overflow_schema = StructType([
    StructField("amount", DecimalType(18, 4), True)
])

try:
    # 17 integer digits + 4 fractional = 21 digits, more than precision 18 allows.
    too_wide = [(Decimal("99999999999999999.9999"),)]
    bad_df = spark.createDataFrame(too_wide, overflow_schema)
    bad_df.show()
except Exception as e:
    print(f"Error: {e}")

Temporal Data Types

Spark SQL provides three temporal types with timezone considerations that differ from traditional databases.

from datetime import datetime, date

# DateType carries a calendar date only; TimestampType carries an instant
# with microsecond precision.
temporal_schema = StructType([
    StructField("date_col", DateType(), True),
    StructField("timestamp_col", TimestampType(), True),
])

sample = (date(2024, 1, 15), datetime(2024, 1, 15, 14, 30, 45, 123456))
df = spark.createDataFrame([sample], temporal_schema)

# Time zone conversions: to_utc_timestamp treats the input as wall-clock time
# in the given zone; from_utc_timestamp does the reverse.
tz_projection = [
    "date_col",
    "timestamp_col",
    "to_utc_timestamp(timestamp_col, 'America/New_York') as utc_time",
    "from_utc_timestamp(timestamp_col, 'Asia/Tokyo') as tokyo_time",
]
df.selectExpr(*tz_projection).show(truncate=False)

# Day-granularity arithmetic on dates.
date_math = [
    "date_add(date_col, 30) as plus_30_days",
    "date_sub(date_col, 7) as minus_7_days",
    "datediff(current_date(), date_col) as days_ago",
]
df.selectExpr(*date_math).show()

Complex Types: Arrays

ArrayType handles homogeneous collections with powerful built-in functions for transformation and filtering.

# Arrays hold homogeneous collections per row.
array_schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("tags", ArrayType(StringType()), True)
    .add("scores", ArrayType(IntegerType()), True)
)

rows = [
    (1, ["spark", "sql", "big-data"], [85, 90, 78]),
    (2, ["python", "analytics"], [92, 88]),
    (3, ["scala", "spark", "functional"], [95, 87, 91]),
]
df = spark.createDataFrame(rows, array_schema)

# Built-in array functions: inspection, slicing, and lambda-based
# filter/transform (higher-order functions).
array_exprs = [
    "id",
    "size(tags) as tag_count",
    "array_contains(tags, 'spark') as has_spark",
    "tags[0] as first_tag",
    "slice(scores, 1, 2) as first_two_scores",
    "array_max(scores) as max_score",
    "filter(scores, x -> x > 85) as high_scores",
    "transform(scores, x -> x * 1.1) as boosted_scores",
]
df.selectExpr(*array_exprs).show(truncate=False)

# explode() fans each array element out into its own row.
df.selectExpr("id", "explode(tags) as tag").show()

Complex Types: Maps

MapType stores key-value pairs with atomic operations for lookup and manipulation.

# Maps store per-row key/value pairs with a fixed key and value type.
map_schema = (
    StructType()
    .add("user_id", IntegerType(), True)
    .add("preferences", MapType(StringType(), StringType()), True)
    .add("metrics", MapType(StringType(), IntegerType()), True)
)

rows = [
    (1, {"theme": "dark", "language": "en"}, {"logins": 45, "posts": 12}),
    (2, {"theme": "light", "language": "es"}, {"logins": 23, "posts": 8}),
]
df = spark.createDataFrame(rows, map_schema)

# Lookup by key, plus key/value extraction and sizing.
map_exprs = [
    "user_id",
    "preferences['theme'] as theme",
    "map_keys(preferences) as pref_keys",
    "map_values(metrics) as metric_values",
    "size(preferences) as pref_count",
    "metrics['logins'] as login_count",
]
df.selectExpr(*map_exprs).show(truncate=False)

# transform_values applies a lambda over every (key, value) pair.
df.selectExpr(
    "user_id",
    "transform_values(metrics, (k, v) -> v * 2) as doubled_metrics"
).show(truncate=False)

Complex Types: Structs

StructType enables nested schemas, essential for semi-structured data like JSON or Parquet with nested fields.

# Nested record types, defined separately for readability.
address_type = StructType([
    StructField("street", StringType(), True),
    StructField("city", StringType(), True),
    StructField("zip", StringType(), True),
])
contact_type = StructType([
    StructField("email", StringType(), True),
    StructField("phone", StringType(), True),
])
struct_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("address", address_type, True),
    StructField("contact", contact_type, True),
])

rows = [
    (1, ("123 Main St", "New York", "10001"), ("user1@example.com", "555-0001")),
    (2, ("456 Oak Ave", "Boston", "02101"), ("user2@example.com", "555-0002")),
]
df = spark.createDataFrame(rows, struct_schema)

# Nested fields are addressed with dot notation.
df.selectExpr(
    "id",
    "address.city",
    "address.zip",
    "contact.email",
).show()

# Flatten a subset of fields, then re-nest some of them with struct().
flat_df = df.selectExpr(
    "id",
    "address.street",
    "address.city",
    "contact.email",
)

nested_df = flat_df.selectExpr(
    "id",
    "struct(street, city) as location",
    "email",
)
nested_df.printSchema()

Type Casting and Conversions

Explicit casting handles type transformations with different behaviors in ANSI mode versus legacy mode.

# Create sample data with mixed types
df = spark.createDataFrame([
    ("123", "45.67", "2024-01-15", "true"),
    ("456", "89.12", "2024-02-20", "false")
], ["str_int", "str_double", "str_date", "str_bool"])

# Explicit casting
casted_df = df.selectExpr(
    "CAST(str_int AS INT) as int_val",
    "CAST(str_double AS DOUBLE) as double_val",
    "CAST(str_date AS DATE) as date_val",
    "CAST(str_bool AS BOOLEAN) as bool_val"
)
casted_df.printSchema()

# Safe casting with try_cast (returns null on failure)
safe_df = df.selectExpr(
    "TRY_CAST(str_int AS INT) as safe_int",
    "TRY_CAST('invalid' AS INT) as null_result"
)
safe_df.show()

# Programmatic casting
from pyspark.sql.functions import col

typed_df = df.select(
    col("str_int").cast(IntegerType()).alias("int_val"),
    col("str_double").cast(DoubleType()).alias("double_val"),
    col("str_date").cast(DateType()).alias("date_val")
)

Schema Inference vs Explicit Definition

Schema inference trades convenience for control. Production systems should use explicit schemas for performance and reliability.

# Schema inference (slower, may infer incorrectly)
# Inference scans the data to guess types — convenient but slower, and the
# guess can be wrong.
inferred_df = spark.read.option("inferSchema", "true").csv("data.csv")

# An explicit StructType skips the scan and pins the types deterministically.
explicit_schema = StructType([
    StructField("id", IntegerType(), False),  # Not nullable
    StructField("name", StringType(), True),
    StructField("salary", DecimalType(10, 2), True),
    StructField("hire_date", DateType(), True),
])
explicit_df = spark.read.schema(explicit_schema).csv("data.csv")

# The same schema can also be written as a compact DDL string.
ddl_schema = "id INT NOT NULL, name STRING, salary DECIMAL(10,2), hire_date DATE"
ddl_df = spark.read.schema(ddl_schema).csv("data.csv")

Understanding Spark SQL data types prevents common pitfalls: precision loss in decimals, timezone confusion in timestamps, and schema evolution failures. Complex types eliminate expensive joins and enable columnar processing of hierarchical data. Choose explicit schemas and appropriate precision for production workloads.

Liked this? There's more.

Every week: one practical technique, explained simply, with code you can use immediately.