Pandas - to_datetime() Convert String to Datetime

• `pd.to_datetime()` handles multiple string formats automatically, including ISO 8601, common date patterns, and custom formats via the `format` parameter using strftime codes

Key Insights

pd.to_datetime() handles multiple string formats automatically, including ISO 8601, common date patterns, and custom formats via the format parameter using strftime codes • The errors parameter (‘raise’, ‘coerce’, ‘ignore’) controls behavior for invalid dates, with ‘coerce’ converting failures to NaT for robust data pipelines • Performance improves dramatically when specifying exact formats with format parameter, especially critical for large datasets with millions of rows

Basic String to Datetime Conversion

The pd.to_datetime() function converts string representations of dates into pandas datetime objects. The simplest usage relies on pandas’ automatic format detection:

import pandas as pd

# Single string conversion
date_string = '2024-03-15'
datetime_obj = pd.to_datetime(date_string)
print(datetime_obj)
# Output: 2024-03-15 00:00:00

# List of strings
date_strings = ['2024-03-15', '2024-06-20', '2024-12-01']
datetime_series = pd.to_datetime(date_strings)
print(datetime_series)
# Output: DatetimeIndex(['2024-03-15', '2024-06-20', '2024-12-01'], dtype='datetime64[ns]', freq=None)

# DataFrame column conversion
df = pd.DataFrame({
    'date': ['2024-01-15', '2024-02-20', '2024-03-25'],
    'value': [100, 200, 300]
})
df['date'] = pd.to_datetime(df['date'])
print(df.dtypes)
# date     datetime64[ns]
# value             int64

Handling Different Date Formats

Pandas recognizes common date formats automatically, but you can specify exact formats for better control and performance:

# Various automatic format recognition
dates = pd.to_datetime([
    '2024-03-15',           # ISO format
    '03/15/2024',           # US format
    '15-Mar-2024',          # Month abbreviation
    'March 15, 2024',       # Full month name
    '2024.03.15'            # Dot separator
])
print(dates)

# Explicit format specification using strftime codes
us_dates = ['03/15/2024', '06/20/2024', '12/01/2024']
datetime_series = pd.to_datetime(us_dates, format='%m/%d/%Y')
print(datetime_series)

# European date format (day/month/year)
eu_dates = ['15/03/2024', '20/06/2024', '01/12/2024']
datetime_series = pd.to_datetime(eu_dates, format='%d/%m/%Y')
print(datetime_series)

# Custom format with time
custom_dates = ['2024-03-15 14:30:00', '2024-06-20 09:15:30']
datetime_series = pd.to_datetime(custom_dates, format='%Y-%m-%d %H:%M:%S')
print(datetime_series)

Common Format Codes Reference

# Most frequently used strftime codes
examples = {
    '%Y': '2024',      # 4-digit year
    '%y': '24',        # 2-digit year
    '%m': '03',        # Month as zero-padded number
    '%d': '15',        # Day as zero-padded number
    '%H': '14',        # Hour (24-hour)
    '%I': '02',        # Hour (12-hour)
    '%M': '30',        # Minute
    '%S': '45',        # Second
    '%p': 'PM',        # AM/PM
    '%b': 'Mar',       # Abbreviated month
    '%B': 'March',     # Full month name
}

# Practical examples
date_formats = {
    'YYYYMMDD': ('20240315', '%Y%m%d'),
    'DD-MM-YYYY': ('15-03-2024', '%d-%m-%Y'),
    'MM/DD/YY': ('03/15/24', '%m/%d/%y'),
    'YYYY-MM-DD HH:MM:SS': ('2024-03-15 14:30:45', '%Y-%m-%d %H:%M:%S'),
}

for name, (date_str, fmt) in date_formats.items():
    result = pd.to_datetime(date_str, format=fmt)
    print(f"{name}: {result}")

Error Handling Strategies

The errors parameter controls how pandas handles unparseable dates:

# Default behavior: raises exception on invalid dates
try:
    invalid_dates = ['2024-03-15', 'not-a-date', '2024-06-20']
    pd.to_datetime(invalid_dates)
except Exception as e:
    print(f"Error: {e}")

# errors='coerce': converts invalid dates to NaT (Not a Time)
dates_with_invalid = ['2024-03-15', 'invalid', '2024-06-20', '2024-13-45']
result = pd.to_datetime(dates_with_invalid, errors='coerce')
print(result)
# DatetimeIndex(['2024-03-15', 'NaT', '2024-06-20', 'NaT'], dtype='datetime64[ns]', freq=None)

# errors='ignore': returns original input if parsing fails
result = pd.to_datetime(dates_with_invalid, errors='ignore')
print(result)
# Index(['2024-03-15', 'invalid', '2024-06-20', '2024-13-45'], dtype='object')

# Practical use case: cleaning data
df = pd.DataFrame({
    'date': ['2024-01-15', 'TBD', '2024-03-20', '2024-99-99', '2024-04-10']
})
df['date_parsed'] = pd.to_datetime(df['date'], errors='coerce')
df['is_valid'] = df['date_parsed'].notna()
print(df)

Performance Optimization

Specifying the exact format significantly improves performance for large datasets:

import numpy as np
import time

# Generate large dataset
n = 1_000_000
dates = [f'2024-{i%12+1:02d}-{i%28+1:02d}' for i in range(n)]

# Without format specification (slower)
start = time.time()
result1 = pd.to_datetime(dates)
time_auto = time.time() - start

# With format specification (faster)
start = time.time()
result2 = pd.to_datetime(dates, format='%Y-%m-%d')
time_format = time.time() - start

print(f"Auto detection: {time_auto:.3f}s")
print(f"With format: {time_format:.3f}s")
print(f"Speedup: {time_auto/time_format:.1f}x")

# Using infer_datetime_format (deprecated in newer versions)
# Consider using format parameter instead

Working with Unix Timestamps

Convert Unix timestamps (seconds since epoch) to datetime:

# Unix timestamp conversion
timestamps = [1710504000, 1718841600, 1733011200]
datetime_series = pd.to_datetime(timestamps, unit='s')
print(datetime_series)

# Milliseconds timestamp
timestamps_ms = [1710504000000, 1718841600000, 1733011200000]
datetime_series = pd.to_datetime(timestamps_ms, unit='ms')
print(datetime_series)

# DataFrame with timestamps
df = pd.DataFrame({
    'timestamp': [1710504000, 1718841600, 1733011200],
    'event': ['login', 'purchase', 'logout']
})
df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')
print(df)

# Setting timezone
df['datetime_utc'] = pd.to_datetime(df['timestamp'], unit='s', utc=True)
print(df['datetime_utc'])

Timezone Handling

Manage timezone-aware datetime conversions:

# UTC timezone
dates = ['2024-03-15 10:30:00', '2024-06-20 14:45:00']
datetime_utc = pd.to_datetime(dates, utc=True)
print(datetime_utc)

# Convert to specific timezone
datetime_ny = datetime_utc.tz_convert('America/New_York')
print(datetime_ny)

# Parse with timezone information in string
dates_with_tz = ['2024-03-15 10:30:00+00:00', '2024-06-20 14:45:00+00:00']
datetime_series = pd.to_datetime(dates_with_tz)
print(datetime_series)

# Localize naive datetime to timezone
naive_dates = pd.to_datetime(['2024-03-15 10:30:00'])
localized = naive_dates.tz_localize('US/Eastern')
print(localized)

Practical Data Pipeline Example

Complete example showing real-world data cleaning:

# Messy real-world data
data = {
    'transaction_date': [
        '2024-01-15',
        '01/20/2024',
        '2024.02.10',
        'invalid',
        '2024-03-15 14:30:00',
        None,
        '1710504000'  # Unix timestamp
    ],
    'amount': [100, 200, 150, 300, 250, 180, 220]
}

df = pd.DataFrame(data)

# Strategy 1: Attempt multiple formats
def parse_flexible_dates(series):
    # Try ISO format first
    result = pd.to_datetime(series, format='%Y-%m-%d', errors='coerce')
    
    # Fill NaT with US format attempt
    mask = result.isna()
    result[mask] = pd.to_datetime(series[mask], format='%m/%d/%Y', errors='coerce')
    
    # Fill remaining NaT with dot format
    mask = result.isna()
    result[mask] = pd.to_datetime(series[mask], format='%Y.%m.%d', errors='coerce')
    
    # Try unix timestamp for numeric strings
    mask = result.isna()
    result[mask] = pd.to_datetime(pd.to_numeric(series[mask], errors='coerce'), unit='s', errors='coerce')
    
    return result

df['parsed_date'] = parse_flexible_dates(df['transaction_date'])
df['is_valid'] = df['parsed_date'].notna()

# Filter valid dates
valid_df = df[df['is_valid']].copy()
print(f"Valid records: {len(valid_df)}/{len(df)}")
print(valid_df)

# Extract date components
valid_df['year'] = valid_df['parsed_date'].dt.year
valid_df['month'] = valid_df['parsed_date'].dt.month
valid_df['day'] = valid_df['parsed_date'].dt.day
valid_df['quarter'] = valid_df['parsed_date'].dt.quarter
print(valid_df[['parsed_date', 'year', 'month', 'quarter']])

Liked this? There's more.

Every week: one practical technique, explained simply, with code you can use immediately.