NumPy - Save Array to File (np.save, np.savez)
NumPy arrays can be saved as text using `np.savetxt()`, but binary formats offer significant advantages. Binary files preserve exact data types, handle multidimensional arrays naturally, and provide dramatically faster I/O operations.
Key Insights
- NumPy provides binary formats (`.npy`, `.npz`) that preserve array data types and shapes perfectly while offering 10-50x faster I/O than text formats
- Use `np.save()` for single arrays, `np.savez()` for multiple arrays in one file, and `np.savez_compressed()` when disk space matters more than write speed
- Binary formats are not human-readable but essential for production workflows involving large datasets, checkpoints, or cross-process data sharing
Binary vs Text Formats
NumPy arrays can be saved as text using np.savetxt(), but binary formats offer significant advantages. Binary files preserve exact data types, handle multidimensional arrays naturally, and provide dramatically faster I/O operations.
import numpy as np
import time

# Build a large test array to make the timing difference visible
arr = np.random.rand(1000, 1000)

# Time the binary save
t0 = time.time()
np.save('data.npy', arr)
binary_time = time.time() - t0

# Time the text save of the same data
t0 = time.time()
np.savetxt('data.txt', arr)
text_time = time.time() - t0

print(f"Binary: {binary_time:.4f}s, Text: {text_time:.4f}s")
print(f"Speedup: {text_time/binary_time:.1f}x")
On typical hardware, binary saves run 20-50x faster than text saves for large arrays.
Saving Single Arrays with np.save()
The np.save() function writes a single array to a .npy file. It automatically preserves dtype, shape, and memory layout.
import numpy as np

# Arrays covering several distinct dtypes
float_arr = np.array([1.5, 2.7, 3.9], dtype=np.float32)
int_arr = np.array([[1, 2], [3, 4]], dtype=np.int16)
complex_arr = np.array([1+2j, 3+4j], dtype=np.complex128)

# Persist each array to its own .npy file
for fname, a in (('floats.npy', float_arr),
                 ('integers.npy', int_arr),
                 ('complex.npy', complex_arr)):
    np.save(fname, a)

# Round-trip the float array and confirm nothing was lost
loaded_float = np.load('floats.npy')
print(f"Original dtype: {float_arr.dtype}, Loaded dtype: {loaded_float.dtype}")
print(f"Arrays equal: {np.array_equal(float_arr, loaded_float)}")
The .npy extension is automatically added if omitted:
import numpy as np

# A small array to demonstrate with (the original snippet used an
# undefined `arr`; define one so the example runs on its own)
arr = np.arange(3)

# These two calls are equivalent
np.save('data.npy', arr)
np.save('data', arr)  # the .npy extension is appended automatically
Loading .npy Files
Use np.load() to read .npy files back into memory:
import numpy as np

# Persist a 2x3 matrix, then read it back and inspect it
original = np.arange(1, 7).reshape(2, 3)
np.save('matrix.npy', original)

loaded = np.load('matrix.npy')
print(f"Shape: {loaded.shape}")
print(f"Dtype: {loaded.dtype}")
print(f"Content:\n{loaded}")
Memory mapping allows working with arrays larger than RAM:
# Save a large array to demonstrate memory mapping
large_arr = np.random.rand(10000, 10000)
np.save('large.npy', large_arr)

# Open the file as a read-only memory map instead of loading it whole
mmap_arr = np.load('large.npy', mmap_mode='r')

# Indexing touches only the bytes backing the requested elements
print(mmap_arr[0, 0])
print(mmap_arr[100:110, 200:210])
Saving Multiple Arrays with np.savez()
When working with related arrays, np.savez() bundles them into a single .npz archive:
import numpy as np

# Related arrays that belong together
features = np.random.rand(100, 10)
labels = np.random.randint(0, 2, 100)
metadata = np.array(['train', 'test', 'validation'])

# Bundle everything into a single .npz archive
np.savez('dataset.npz', features=features, labels=labels, meta=metadata)

# Reopen the archive and pull arrays out by name
data = np.load('dataset.npz')
print(f"Features shape: {data['features'].shape}")
print(f"Labels shape: {data['labels'].shape}")
print(f"Metadata: {data['meta']}")

# The .files attribute lists every stored array name
print(f"Available arrays: {data.files}")

# Release the underlying file handle
data.close()
Without keyword arguments, arrays are named arr_0, arr_1, etc.:
import numpy as np

# Define the arrays being stored (the original snippet referenced
# arr1/arr2/arr3 without ever defining them)
arr1 = np.arange(3)
arr2 = np.arange(4)
arr3 = np.arange(5)

# Positional saving: names default to arr_0, arr_1, ...
np.savez('arrays.npz', arr1, arr2, arr3)

# Load with the default names
data = np.load('arrays.npz')
print(data['arr_0'])  # First array
print(data['arr_1'])  # Second array
print(data['arr_2'])  # Third array
Compressed Archives with np.savez_compressed()
For large datasets where disk space is constrained, use compression:
import numpy as np
import os

# A mostly-zero (sparse) array: highly compressible
sparse_data = np.zeros((1000, 1000))
sparse_data[::10, ::10] = np.random.rand(100, 100)

# Write the same data both with and without compression
np.savez('uncompressed.npz', data=sparse_data)
np.savez_compressed('compressed.npz', data=sparse_data)

# Measure the difference on disk
uncompressed_size = os.path.getsize('uncompressed.npz')
compressed_size = os.path.getsize('compressed.npz')
print(f"Uncompressed: {uncompressed_size / 1024:.1f} KB")
print(f"Compressed: {compressed_size / 1024:.1f} KB")
print(f"Compression ratio: {uncompressed_size / compressed_size:.1f}x")
Compression works best with arrays containing repeated values or patterns. Random data compresses poorly:
import numpy as np
import os

# Low-entropy data (constant values) compresses extremely well
repeated = np.ones((1000, 1000))
np.savez_compressed('repeated.npz', data=repeated)

# High-entropy data compresses poorly. Renamed from `random` to
# `random_data` so the stdlib `random` module name is not shadowed.
random_data = np.random.rand(1000, 1000)
np.savez_compressed('random.npz', data=random_data)

print(f"Repeated: {os.path.getsize('repeated.npz') / 1024:.1f} KB")
print(f"Random: {os.path.getsize('random.npz') / 1024:.1f} KB")
Context Managers and Resource Management
.npz files should be closed after use. Context managers handle this automatically:
import numpy as np

# Create the archive this example reads, so the snippet is
# self-contained (the original relied on a file from another snippet)
np.savez('dataset.npz',
         features=np.random.rand(100, 10),
         labels=np.random.randint(0, 2, 100))

# Manual close (error-prone: easy to forget, skipped on exceptions)
data = np.load('dataset.npz')
features = data['features']
data.close()

# Context manager (recommended): closes even if an exception is raised
with np.load('dataset.npz') as data:
    features = data['features'].copy()  # Copy data before context exits
    labels = data['labels'].copy()
# data is automatically closed here
print(features.shape)
Note that arrays already retrieved from an `.npz` archive are read fully into memory on access, so they remain valid after the file closes; `.copy()` is a defensive habit that matters chiefly for memory-mapped `.npy` loads. The archive object itself, however, cannot be indexed once closed:
with np.load('dataset.npz') as data:
features = data['features'] # Reference, not copy
# This may fail or return garbage
print(features[0]) # Undefined behavior
Practical Example: Model Checkpoint System
Here’s a complete checkpoint system for machine learning workflows:
import numpy as np
from pathlib import Path
class CheckpointManager:
    """Persist and restore training state as compressed .npz checkpoints.

    One file per epoch is written under ``checkpoint_dir`` using the
    naming scheme ``checkpoint_epoch_<N>.npz``.
    """

    def __init__(self, checkpoint_dir='checkpoints'):
        self.checkpoint_dir = Path(checkpoint_dir)
        self.checkpoint_dir.mkdir(exist_ok=True)

    def _path_for(self, epoch):
        """Return the checkpoint file path for the given epoch."""
        return self.checkpoint_dir / f'checkpoint_epoch_{epoch}.npz'

    def save_checkpoint(self, epoch, weights, optimizer_state, metrics):
        """Save training checkpoint with all state."""
        checkpoint_path = self._path_for(epoch)
        # epoch is wrapped in a 1-element array so savez can store it
        np.savez_compressed(
            checkpoint_path,
            weights=weights,
            optimizer_state=optimizer_state,
            metrics=metrics,
            epoch=np.array([epoch]),
        )
        print(f"Saved checkpoint: {checkpoint_path}")
        return checkpoint_path

    def load_checkpoint(self, epoch):
        """Load checkpoint from specific epoch."""
        checkpoint_path = self._path_for(epoch)
        if not checkpoint_path.exists():
            raise FileNotFoundError(f"No checkpoint for epoch {epoch}")
        # Copy each array so the results outlive the closed archive
        with np.load(checkpoint_path) as data:
            return {
                'weights': data['weights'].copy(),
                'optimizer_state': data['optimizer_state'].copy(),
                'metrics': data['metrics'].copy(),
                'epoch': int(data['epoch']),
            }

    def get_latest_checkpoint(self):
        """Find and load the most recent checkpoint."""
        # Epoch number is the last underscore-separated token of the stem
        epochs = [int(p.stem.split('_')[-1])
                  for p in self.checkpoint_dir.glob('checkpoint_epoch_*.npz')]
        if not epochs:
            return None
        return self.load_checkpoint(max(epochs))
# Usage: write five checkpoints, then resume from the newest one
manager = CheckpointManager()

for epoch in range(5):
    manager.save_checkpoint(
        epoch,
        np.random.rand(100, 50),       # weights
        np.random.rand(10),            # optimizer state
        np.array([0.95, 0.87, 0.02]),  # accuracy, precision, loss
    )

checkpoint = manager.get_latest_checkpoint()
print(f"Resumed from epoch {checkpoint['epoch']}")
print(f"Metrics: {checkpoint['metrics']}")
This pattern handles checkpoint creation, loading, and recovery efficiently while preserving all training state with exact precision.