NumPy - Array to Bytes and Back (tobytes, frombuffer)

NumPy's `tobytes()` method serializes array data into a raw byte string, stripping away all metadata like shape, dtype, and strides. This produces the smallest possible representation of your array...

Key Insights

  • tobytes() creates a compact binary copy of a NumPy array’s data with no per-array metadata, enabling efficient serialization for network transmission or file storage
  • frombuffer() reconstructs arrays from byte sequences without copying data when possible, but requires explicit dtype and shape information since metadata isn’t preserved
  • Understanding memory layout (C-contiguous vs Fortran-contiguous) is critical for correct deserialization and interoperability with other systems expecting a specific element ordering

Converting Arrays to Bytes with tobytes()

NumPy’s tobytes() method serializes array data into a raw byte string, stripping away all metadata like shape, dtype, and strides. This produces the most compact uncompressed representation of your array data: exactly size × itemsize bytes.

import numpy as np

# Five int32 values occupy 5 × 4 = 20 raw bytes
arr = np.array(range(1, 6), dtype=np.int32)
byte_data = arr.tobytes()

# Show the array, its raw buffer contents, and the buffer length
print(f"Array: {arr}")
print(f"Bytes: {byte_data}")
print(f"Length: {len(byte_data)} bytes")
# Output: Length: 20 bytes (arr.size × arr.itemsize = 5 × 4)

The byte representation depends entirely on the dtype. Floating-point numbers, integers of different sizes, and complex numbers all produce different byte sequences:

# The byte length is element count × itemsize, so the dtype decides the size
int8_arr = np.array([1, 2, 3, 4, 5], dtype=np.int8)
int64_arr = int8_arr.astype(np.int64)  # same values, 8 bytes per element
float64_arr = np.array([1.0, 2.0, 3.0], dtype=np.float64)

print(f"int8: {len(int8_arr.tobytes())} bytes")    # 5 × 1 = 5 bytes
print(f"int64: {len(int64_arr.tobytes())} bytes")  # 5 × 8 = 40 bytes
print(f"float64: {len(float64_arr.tobytes())} bytes")  # 3 × 8 = 24 bytes

Reconstructing Arrays with frombuffer()

frombuffer() interprets a byte sequence as array data. Since tobytes() doesn’t preserve metadata, you must specify the dtype explicitly. The shape must be inferred or tracked separately:

# Serialize: the bytes carry no dtype, so it has to be remembered by the caller
original = np.array(range(10, 60, 10), dtype=np.int32)
byte_data = original.tobytes()

# Deserialize - frombuffer() would assume float64 if no dtype were given
restored = np.frombuffer(byte_data, dtype=original.dtype)
print(f"Restored: {restored}")
print(f"Arrays equal: {np.array_equal(original, restored)}")

For multi-dimensional arrays, you must reshape after deserialization:

# 2D array serialization: the shape is recorded separately from the bytes
matrix = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
original_shape = matrix.shape

byte_data = matrix.tobytes()

# frombuffer() always yields a 1-D array; reshape restores rows and columns
flat = np.frombuffer(byte_data, dtype=matrix.dtype)
restored = flat.reshape(original_shape)
print(f"Original:\n{matrix}")
print(f"Restored:\n{restored}")

Memory Layout: C vs Fortran Order

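NumPy arrays can be stored in row-major (C) or column-major (Fortran) order. By default, tobytes() always emits the elements in C order regardless of the array’s memory layout; pass order='F' (or order='A' to follow the array’s own layout) to serialize column by column: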

# C-contiguous (row-major, default)
# dtype is pinned to int32 so the byte dumps below are 4 bytes per element
c_array = np.array([[1, 2, 3],
                    [4, 5, 6]], dtype=np.int32, order='C')

# Fortran-contiguous (column-major)
f_array = np.array([[1, 2, 3],
                    [4, 5, 6]], dtype=np.int32, order='F')

# tobytes() defaults to order='C': it walks the array row by row no matter
# how the array is laid out in memory.
print(f"C-order bytes: {c_array.tobytes()}")
# b'\x01\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00...' (little-endian machine)
# Reads: 1, 2, 3, 4, 5, 6 (row by row)

# Column-major output must be requested explicitly; f_array.tobytes() alone
# would produce the same row-by-row bytes as above.
print(f"F-order bytes: {f_array.tobytes(order='F')}")
# b'\x01\x00\x00\x00\x04\x00\x00\x00\x02\x00\x00\x00...' (little-endian machine)
# Reads: 1, 4, 2, 5, 3, 6 (column by column)

When deserializing, pass reshape() the same order that was used to produce the bytes:

c_bytes = c_array.tobytes()            # default order='C': row by row
f_bytes = f_array.tobytes(order='F')   # explicit order='F': column by column

# Restore with the source array's dtype and the same order used above.
# Reusing the dtype avoids a mismatch with the platform's default integer
# size, and a matching order puts every element back in its original position.
c_restored = np.frombuffer(c_bytes, dtype=c_array.dtype).reshape((2, 3), order='C')
f_restored = np.frombuffer(f_bytes, dtype=f_array.dtype).reshape((2, 3), order='F')

print(f"C-restored:\n{c_restored}")
print(f"F-restored:\n{f_restored}")

Practical Use Case: Network Transmission

When sending NumPy arrays over a network, serialize to bytes and transmit the metadata along with them, for example as a small header in front of the raw data:

import socket
import struct
import numpy as np

def serialize_array(arr):
    """Serialize an array into a self-describing byte string.

    Layout (every header integer is a little-endian unsigned 32-bit value):

        [dtype length][dtype string][ndim][dim 0]...[dim ndim-1][raw data]

    The dtype string is ``arr.dtype.str`` (for example ``'<f4'``), which
    records the byte order of the element data. The raw data is
    ``arr.tobytes()``, i.e. the elements in C order.

    NOTE: for structured dtypes ``dtype.str`` only records the record size,
    so field names are not preserved by this format.

    Args:
        arr: NumPy array to serialize.

    Returns:
        bytes: The header followed by the raw element data.

    Raises:
        TypeError: If the array holds Python objects; their raw bytes are
            memory addresses and cannot be rebuilt by a receiver.
        struct.error: If a dimension does not fit in 32 bits.
    """
    if arr.dtype.hasobject:
        raise TypeError("cannot serialize arrays that contain Python objects")

    dtype_bytes = arr.dtype.str.encode('utf-8')

    # '<' pins byte order and field size. A bare 'I' uses the sender's native
    # layout, which the machine on the other end may not share.
    parts = [
        struct.pack('<I', len(dtype_bytes)),
        dtype_bytes,
        struct.pack('<I', len(arr.shape)),
        struct.pack(f'<{len(arr.shape)}I', *arr.shape),
        arr.tobytes(),
    ]
    return b''.join(parts)

def deserialize_array(data):
    """Rebuild an array from bytes produced by ``serialize_array``.

    Args:
        data: ``bytes`` or ``bytearray`` holding the header and raw data.

    Returns:
        numpy.ndarray: Array with the recorded dtype and shape. It is a view
        onto ``data`` (the payload is not copied), so it is read-only when
        ``data`` is immutable ``bytes``; call ``.copy()`` for a writable,
        independent array.

    Raises:
        struct.error: If the header is truncated.
        ValueError: If the payload size does not match the dtype and shape
            recorded in the header.
    """
    offset = 0

    # Unpack dtype
    (dtype_len,) = struct.unpack_from('<I', data, offset)
    offset += 4
    dtype_str = bytes(data[offset:offset + dtype_len]).decode('utf-8')
    offset += dtype_len

    # Unpack shape
    (shape_len,) = struct.unpack_from('<I', data, offset)
    offset += 4
    shape = struct.unpack_from(f'<{shape_len}I', data, offset)
    offset += 4 * shape_len

    dtype = np.dtype(dtype_str)
    count = 1
    for dim in shape:
        count *= dim

    # Check the payload up front so a truncated or over-long message fails
    # with a clear error instead of an obscure reshape failure.
    payload_size = len(data) - offset
    expected_size = count * dtype.itemsize
    if payload_size != expected_size:
        raise ValueError(
            f"payload is {payload_size} bytes, expected {expected_size} "
            f"for dtype {dtype_str!r} and shape {shape}"
        )

    # offset/count read the payload in place; slicing data[offset:] would
    # copy the whole payload first.
    arr = np.frombuffer(data, dtype=dtype, count=count, offset=offset)
    return arr.reshape(shape)

# Example usage: round-trip a small float32 matrix through both helpers
original = np.array([[1.5, 2.5],
                     [3.5, 4.5]], dtype=np.float32)
serialized = serialize_array(original)
restored = deserialize_array(serialized)

# The serialized size covers the metadata header plus the raw element data
print(f"Serialized size: {len(serialized)} bytes")
print(f"Arrays equal: {np.array_equal(original, restored)}")

File I/O with Byte Serialization

For file storage, combine tobytes() with binary file operations. Always store metadata:

def save_array_binary(filename, arr):
    """Save an array to a binary file together with its dtype and shape.

    File layout (every header integer is a little-endian unsigned 32-bit
    value, so the file can be read on any machine):

        [dtype length][dtype string][ndim][dim 0]...[dim ndim-1][raw data]

    The dtype string is ``arr.dtype.str``; the raw data is ``arr.tobytes()``
    (elements in C order).

    Args:
        filename: Path of the file to create or overwrite.
        arr: NumPy array to store.

    Raises:
        TypeError: If the array holds Python objects; their raw bytes are
            memory addresses and cannot be loaded back.
        struct.error: If a dimension does not fit in 32 bits.
        OSError: If the file cannot be written.
    """
    if arr.dtype.hasobject:
        raise TypeError("cannot save arrays that contain Python objects")

    # Build the header before opening the file, so a failure here does not
    # leave a truncated file behind.
    dtype_bytes = arr.dtype.str.encode('utf-8')
    header = b''.join([
        struct.pack('<I', len(dtype_bytes)),
        dtype_bytes,
        struct.pack('<I', len(arr.shape)),
        struct.pack(f'<{len(arr.shape)}I', *arr.shape),
    ])

    with open(filename, 'wb') as f:
        f.write(header)
        # Write array data
        f.write(arr.tobytes())

def load_array_binary(filename):
    """Load an array written by ``save_array_binary``.

    Args:
        filename: Path of the file to read.

    Returns:
        numpy.ndarray: Array with the stored dtype and shape. It is a
        read-only view over the bytes read from the file; call ``.copy()``
        for a writable array.

    Raises:
        struct.error: If the header is truncated.
        ValueError: If the amount of data in the file does not match the
            dtype and shape recorded in the header.
        OSError: If the file cannot be read.
    """
    with open(filename, 'rb') as f:
        # Read dtype
        (dtype_len,) = struct.unpack('<I', f.read(4))
        dtype_str = f.read(dtype_len).decode('utf-8')

        # Read shape
        (shape_len,) = struct.unpack('<I', f.read(4))
        shape = struct.unpack(f'<{shape_len}I', f.read(4 * shape_len))

        # Read array data
        byte_data = f.read()

    dtype = np.dtype(dtype_str)
    count = 1
    for dim in shape:
        count *= dim

    # A truncated or corrupted file should fail with a clear message
    expected_size = count * dtype.itemsize
    if len(byte_data) != expected_size:
        raise ValueError(
            f"file holds {len(byte_data)} data bytes, expected {expected_size} "
            f"for dtype {dtype_str!r} and shape {shape}"
        )

    return np.frombuffer(byte_data, dtype=dtype).reshape(shape)

# Test file operations
import os

test_array = np.random.rand(100, 50).astype(np.float64)
save_array_binary('array.bin', test_array)
loaded_array = load_array_binary('array.bin')

# Report the real on-disk size (metadata header + data); the length of
# tobytes() alone would leave the header out.
print(f"File size: {os.path.getsize('array.bin')} bytes")
print(f"Match: {np.allclose(test_array, loaded_array)}")

Performance Considerations

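frombuffer() creates a view when possible, avoiding memory copies. NumPy keeps the buffer alive for as long as the view exists, but the view shares the buffer’s memory: it is read-only when the buffer is immutable (such as bytes), and it reflects later changes when the buffer is mutable (such as bytearray). Copy the result when you need an independent, writable array: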

# frombuffer creates a view that shares (and keeps alive) the source buffer
def dangerous_pattern():
    """Return a zero-copy, read-only view over a temporary bytes object.

    The returned array holds a reference to the buffer (see its ``base``
    attribute), so the bytes are not freed when this function returns.
    The pitfall is a different one: ``bytes`` is immutable, so the array is
    read-only and assigning to its elements raises ``ValueError``.

    Returns:
        numpy.ndarray: Read-only int32 array ``[1, 2, 3]`` backed by the bytes.
    """
    byte_data = np.array([1, 2, 3], dtype=np.int32).tobytes()
    # byte_data goes out of scope here, but the array keeps it alive
    return np.frombuffer(byte_data, dtype=np.int32)

# Safe pattern - copy the data
def safe_pattern():
    """Return an independent array decoded from a temporary bytes object.

    The ``copy()`` call gives the result its own memory, so it does not
    depend on the source bytes.

    Returns:
        numpy.ndarray: int32 array ``[1, 2, 3]`` that owns its data.
    """
    raw = np.array([1, 2, 3], dtype=np.int32).tobytes()
    borrowed = np.frombuffer(raw, dtype=np.int32)
    return borrowed.copy()

# Performance comparison
import time

large_array = np.random.rand(1000000)
byte_data = large_array.tobytes()

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() is wall-clock time and can be too coarse for
# an operation that takes microseconds.
start = time.perf_counter()
view = np.frombuffer(byte_data, dtype=np.float64)
print(f"View creation: {time.perf_counter() - start:.6f}s")

start = time.perf_counter()
copy = np.frombuffer(byte_data, dtype=np.float64).copy()
print(f"With copy: {time.perf_counter() - start:.6f}s")

Working with Structured Arrays

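Structured arrays serialize record by record, with each record’s fields stored one after another in declaration order (without padding unless the dtype was created with align=True):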

# Define structured dtype: 4-byte int + 8-byte float + 1-byte bool per record
dt = np.dtype([('id', 'i4'), ('value', 'f8'), ('flag', '?')])

# Create structured array from one tuple per record
records = [(1, 3.14, True), (2, 2.71, False), (3, 1.41, True)]
data = np.array(records, dtype=dt)

# Serialize: records are written back to back
byte_data = data.tobytes()
print(f"Structured array bytes: {len(byte_data)}")

# Deserialize - the same structured dtype is needed to split the bytes into fields
restored = np.frombuffer(byte_data, dtype=dt)
print(f"Restored:\n{restored}")
print(f"Access field: {restored['value']}")

The tobytes() and frombuffer() pair provides low-level control over array serialization, essential for performance-critical applications, custom protocols, and interoperability with non-Python systems. Always track metadata separately and understand memory layout to avoid deserialization errors.

Liked this? There's more.

Every week: one practical technique, explained simply, with code you can use immediately.