Spark SQL - Data Types Reference
Key Insights
• Spark SQL supports 20+ data types organized into numeric, string, binary, boolean, datetime, and complex categories, with specific handling for nullable values and schema evolution
• Complex types (Array, Map, Struct) enable hierarchical data processing without flattening, critical for JSON/nested data scenarios with 3-5x performance improvements over traditional approaches
• Type casting and precision management differ significantly from traditional SQL databases—implicit conversions follow strict rules while explicit casts require careful handling to avoid runtime failures
Primitive Data Types
Spark SQL’s primitive types form the foundation of schema definitions. Understanding their storage characteristics and boundaries prevents data truncation and overflow issues.
from pyspark.sql.types import *
from pyspark.sql import SparkSession

# Shared session used by every example in this guide.
spark = SparkSession.builder.appName("datatypes").getOrCreate()

# Numeric types, with their value ranges noted inline.
numeric_fields = [
    ("byte_col", ByteType()),      # -128 to 127
    ("short_col", ShortType()),    # -32768 to 32767
    ("int_col", IntegerType()),    # -2147483648 to 2147483647
    ("long_col", LongType()),      # -9223372036854775808 to 9223372036854775807
    ("float_col", FloatType()),    # 4-byte single precision
    ("double_col", DoubleType()),  # 8-byte double precision
]
schema = StructType([StructField(name, dtype, True) for name, dtype in numeric_fields])

rows = [(100, 30000, 2000000000, 9000000000000000000, 3.14, 2.718281828459045)]
df = spark.createDataFrame(rows, schema)
df.printSchema()
String and binary types handle variable-length data:
# Variable-length text, raw bytes, and a boolean flag.
string_schema = StructType([
    StructField(name, dtype, True)
    for name, dtype in [
        ("string_col", StringType()),
        ("binary_col", BinaryType()),
        ("bool_col", BooleanType()),
    ]
])
row = ("Hello Spark", bytearray(b'\x00\x01\x02'), True)
df = spark.createDataFrame([row], string_schema)
Decimal Precision and Scale
DecimalType requires explicit precision and scale parameters. Default precision is 10 with scale 0, but production systems need careful tuning based on business requirements.
from decimal import Decimal

# DecimalType(precision, scale):
#   precision - total number of digits
#   scale     - digits after the decimal point
financial_schema = StructType([
    StructField("price", DecimalType(18, 4), True),   # 14 digits before, 4 after
    StructField("quantity", DecimalType(10, 2), True),
    StructField("total", DecimalType(28, 6), True),   # wide enough for multiplied results
])

sample = (
    Decimal("1234567890.1234"),
    Decimal("100.50"),
    Decimal("124124124124.123456"),
)
df = spark.createDataFrame([sample], financial_schema)

# Decimal arithmetic widens precision/scale automatically; cast explicitly
# when a specific target type is required downstream.
df.selectExpr(
    "price * quantity as calculated_total",
    "CAST(price * quantity AS DECIMAL(28,6)) as precise_total",
).show()
Exceeding precision limits causes runtime errors or silent truncation depending on Spark configuration:
# Overflow behavior depends on spark.sql.ansi.enabled:
# ANSI (strict) mode raises, legacy mode may silently truncate.
spark.conf.set("spark.sql.ansi.enabled", "true")  # Strict mode - throws error
try:
    oversized = [(Decimal("99999999999999999.9999"),)]  # 21 digits exceed precision 18
    bad_df = spark.createDataFrame(
        oversized,
        StructType([StructField("amount", DecimalType(18, 4), True)]),
    )
    bad_df.show()  # evaluation happens here, so the failure surfaces inside try
except Exception as e:
    print(f"Error: {e}")
Temporal Data Types
Spark SQL provides temporal types—DateType, TimestampType, and (since Spark 3.4) TimestampNTZType—with timezone considerations that differ from traditional databases. The example below covers the two most common: DateType and TimestampType.
from datetime import datetime, date

temporal_schema = StructType([
    StructField("date_col", DateType(), True),            # calendar date, no time-of-day
    StructField("timestamp_col", TimestampType(), True),  # microsecond-precision timestamp
])
row = (date(2024, 1, 15), datetime(2024, 1, 15, 14, 30, 45, 123456))
df = spark.createDataFrame([row], temporal_schema)

# Timezone conversions: to_utc_timestamp treats the input as wall-clock time
# in the named zone; from_utc_timestamp does the reverse mapping.
df.selectExpr(
    "date_col",
    "timestamp_col",
    "to_utc_timestamp(timestamp_col, 'America/New_York') as utc_time",
    "from_utc_timestamp(timestamp_col, 'Asia/Tokyo') as tokyo_time",
).show(truncate=False)

# Date arithmetic helpers.
df.selectExpr(
    "date_add(date_col, 30) as plus_30_days",
    "date_sub(date_col, 7) as minus_7_days",
    "datediff(current_date(), date_col) as days_ago",
).show()
Complex Types: Arrays
ArrayType handles homogeneous collections with powerful built-in functions for transformation and filtering.
# An id column plus two homogeneous collections.
array_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("tags", ArrayType(StringType()), True),
    StructField("scores", ArrayType(IntegerType()), True),
])
rows = [
    (1, ["spark", "sql", "big-data"], [85, 90, 78]),
    (2, ["python", "analytics"], [92, 88]),
    (3, ["scala", "spark", "functional"], [95, 87, 91]),
]
df = spark.createDataFrame(rows, array_schema)

# Element access, predicates, and higher-order functions on arrays.
df.selectExpr(
    "id",
    "size(tags) as tag_count",
    "array_contains(tags, 'spark') as has_spark",
    "tags[0] as first_tag",
    "slice(scores, 1, 2) as first_two_scores",  # slice() is 1-based
    "array_max(scores) as max_score",
    "filter(scores, x -> x > 85) as high_scores",
    "transform(scores, x -> x * 1.1) as boosted_scores",
).show(truncate=False)

# One output row per array element.
df.selectExpr("id", "explode(tags) as tag").show()
Complex Types: Maps
MapType stores key-value pairs with built-in functions for lookup, extraction, and value transformation.
# String-keyed maps holding string and integer values respectively.
map_schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("preferences", MapType(StringType(), StringType()), True),
    StructField("metrics", MapType(StringType(), IntegerType()), True),
])
rows = [
    (1, {"theme": "dark", "language": "en"}, {"logins": 45, "posts": 12}),
    (2, {"theme": "light", "language": "es"}, {"logins": 23, "posts": 8}),
]
df = spark.createDataFrame(rows, map_schema)

# Key lookup, key/value extraction, and sizing.
df.selectExpr(
    "user_id",
    "preferences['theme'] as theme",
    "map_keys(preferences) as pref_keys",
    "map_values(metrics) as metric_values",
    "size(preferences) as pref_count",
    "metrics['logins'] as login_count",
).show(truncate=False)

# Rewrite every value while keeping the keys untouched.
df.selectExpr(
    "user_id",
    "transform_values(metrics, (k, v) -> v * 2) as doubled_metrics",
).show(truncate=False)
Complex Types: Structs
StructType enables nested schemas, essential for semi-structured data like JSON or Parquet with nested fields.
# Two nested structs: a mailing address and contact details.
address_type = StructType([
    StructField("street", StringType(), True),
    StructField("city", StringType(), True),
    StructField("zip", StringType(), True),
])
contact_type = StructType([
    StructField("email", StringType(), True),
    StructField("phone", StringType(), True),
])
struct_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("address", address_type, True),
    StructField("contact", contact_type, True),
])
rows = [
    (1, ("123 Main St", "New York", "10001"), ("user1@example.com", "555-0001")),
    (2, ("456 Oak Ave", "Boston", "02101"), ("user2@example.com", "555-0002")),
]
df = spark.createDataFrame(rows, struct_schema)

# Dotted paths drill into nested fields.
df.selectExpr(
    "id",
    "address.city",
    "address.zip",
    "contact.email",
).show()

# Flatten selected fields, then regroup some of them into a fresh struct.
flat_df = df.selectExpr(
    "id",
    "address.street",
    "address.city",
    "contact.email",
)
nested_df = flat_df.selectExpr(
    "id",
    "struct(street, city) as location",
    "email",
)
nested_df.printSchema()
Type Casting and Conversions
Explicit casting handles type transformations with different behaviors in ANSI mode versus legacy mode.
# Create sample data with mixed types
df = spark.createDataFrame([
("123", "45.67", "2024-01-15", "true"),
("456", "89.12", "2024-02-20", "false")
], ["str_int", "str_double", "str_date", "str_bool"])
# Explicit casting
casted_df = df.selectExpr(
"CAST(str_int AS INT) as int_val",
"CAST(str_double AS DOUBLE) as double_val",
"CAST(str_date AS DATE) as date_val",
"CAST(str_bool AS BOOLEAN) as bool_val"
)
casted_df.printSchema()
# Safe casting with try_cast (returns null on failure)
safe_df = df.selectExpr(
"TRY_CAST(str_int AS INT) as safe_int",
"TRY_CAST('invalid' AS INT) as null_result"
)
safe_df.show()
# Programmatic casting
from pyspark.sql.functions import col
typed_df = df.select(
col("str_int").cast(IntegerType()).alias("int_val"),
col("str_double").cast(DoubleType()).alias("double_val"),
col("str_date").cast(DateType()).alias("date_val")
)
Schema Inference vs Explicit Definition
Schema inference trades convenience for control. Production systems should use explicit schemas for performance and reliability.
# Inference scans the data an extra time and can guess types incorrectly.
inferred_df = spark.read.option("inferSchema", "true").csv("data.csv")

# Explicit schema: single pass, deterministic types, nullability declared.
explicit_schema = StructType([
    StructField("id", IntegerType(), False),  # not nullable
    StructField("name", StringType(), True),
    StructField("salary", DecimalType(10, 2), True),
    StructField("hire_date", DateType(), True),
])
explicit_df = spark.read.schema(explicit_schema).csv("data.csv")

# The same schema expressed as a DDL string.
ddl_schema = "id INT NOT NULL, name STRING, salary DECIMAL(10,2), hire_date DATE"
ddl_df = spark.read.schema(ddl_schema).csv("data.csv")
Understanding Spark SQL data types prevents common pitfalls: precision loss in decimals, timezone confusion in timestamps, and schema evolution failures. Complex types eliminate expensive joins and enable columnar processing of hierarchical data. Choose explicit schemas and appropriate precision for production workloads.