Python - Read/Write Binary Files
Binary files contain raw bytes without text encoding interpretation. Unlike text files, binary mode preserves exact byte sequences, making it critical for non-text data.
Key Insights
- Binary file operations in Python use mode flags 'rb' and 'wb', bypassing text encoding to work directly with raw bytes, essential for images, audio, executables, and custom binary formats
- The `struct` module provides precise control over binary data serialization with C-style format strings, enabling interoperability with low-level systems and network protocols
- Memory-mapped files via `mmap` offer significant performance gains for large binary files by mapping file contents directly into memory, avoiding expensive I/O operations
Basic Binary File Operations
Binary files contain raw bytes without text encoding interpretation. Unlike text files, binary mode preserves exact byte sequences, making it critical for non-text data.
# Writing binary data
data = b'Hello'  # same bytes as bytes([0x48, 0x65, 0x6C, 0x6C, 0x6F])
with open('output.bin', 'wb') as f:
    f.write(data)

# Reading binary data back: 'rb' returns raw bytes, no decoding
with open('output.bin', 'rb') as f:
    content = f.read()

print(content)        # b'Hello'
print(list(content))  # [72, 101, 108, 108, 111]
The bytes type is immutable. For mutable byte sequences, use bytearray:
buffer = bytearray(1024)  # pre-allocated, reusable 1 KB buffer
with open('data.bin', 'rb') as f:
    # readinto() fills the existing buffer in place instead of
    # allocating a fresh bytes object per read
    bytes_read = f.readinto(buffer)
    print(f"Read {bytes_read} bytes")
    # Process buffer without additional allocation
Working with Structured Binary Data
The struct module translates between Python values and C structs represented as bytes. This is fundamental for binary protocols and file formats.
import struct

# Pack values into a fixed binary layout:
#   'I' unsigned int, 'f' float, '10s' ten-byte (NUL-padded) string
header = struct.pack('I f 10s', 42, 3.14159, b'HEADER')
with open('structured.bin', 'wb') as f:
    f.write(header)

# Unpack the same layout back into Python values
with open('structured.bin', 'rb') as f:
    data = f.read(18)  # 4 + 4 + 10 bytes
unpacked = struct.unpack('I f 10s', data)
print(unpacked)  # (42, 3.1415901184082031, b'HEADER\x00\x00\x00\x00')
Common format characters:
# Integer types
signed_byte = struct.pack('b', -128)          # signed char, 1 byte
unsigned_byte = struct.pack('B', 255)         # unsigned char, 1 byte
short_val = struct.pack('h', -32768)          # short, 2 bytes
int_val = struct.pack('i', 2147483647)        # int, 4 bytes
long_val = struct.pack('q', 2**63 - 1)        # long long, 8 bytes

# Floating point
float_val = struct.pack('f', 3.14)            # 4-byte IEEE 754
double_val = struct.pack('d', 3.14159)        # 8-byte IEEE 754

# Byte order prefixes
little = struct.pack('<I', 42)                # little-endian
big = struct.pack('>I', 42)                   # big-endian
network = struct.pack('!I', 42)               # network order (big-endian)
Reading Binary Files in Chunks
For large files, reading in chunks prevents memory exhaustion:
def process_large_binary(filename, chunk_size=8192):
    """Process a binary file in fixed-size chunks to bound memory use.

    The original version computed ``process_bytes(chunk)`` and silently
    discarded the result; this version accumulates the per-chunk results
    and returns their sum (callers that ignored the old ``None`` return
    are unaffected).

    :param filename: path of the binary file to process
    :param chunk_size: bytes read per iteration (default 8192)
    :return: sum of ``process_bytes`` results over all chunks
    """
    total = 0
    with open(filename, 'rb') as f:
        # iter() with a b'' sentinel stops cleanly at end-of-file
        for chunk in iter(lambda: f.read(chunk_size), b''):
            total += process_bytes(chunk)
    return total


def process_bytes(data):
    """Example chunk processor: count NUL (0x00) bytes in *data*."""
    return data.count(0x00)
Using iteration for line-like processing:
def read_fixed_records(filename, record_size):
    """Yield a binary file as consecutive records of exactly *record_size* bytes.

    A trailing partial record (shorter than *record_size*) is dropped.
    """
    with open(filename, 'rb') as f:
        for block in iter(lambda: f.read(record_size), b''):
            if len(block) < record_size:
                return  # incomplete tail — stop without yielding it
            yield block
# Usage: stream 64-byte records and decode the leading fields of each
for rec in read_fixed_records('records.bin', 64):
    # First 58 bytes hold: unsigned int id, 50-byte name, float value
    id_num, name, value = struct.unpack('I 50s f', rec[:58])
Seeking and Random Access
Binary files support efficient random access using seek() and tell():
with open('database.bin', 'rb+') as f:  # 'rb+' opens for both reading and writing
    f.seek(1000)        # absolute: jump to byte offset 1000
    data = f.read(100)  # reading advances the position by 100
    pos = f.tell()      # now 1100
    f.seek(-10, 2)      # whence=2: offset relative to end of file
    f.seek(5, 1)        # whence=1: offset relative to current position
Practical example - updating specific records:
class BinaryDatabase:
    """Random-access store of fixed-size records in a single binary file."""

    RECORD_SIZE = 64  # bytes per record

    def __init__(self, filename):
        self.filename = filename

    def write_record(self, index, data):
        """Write *data* at record *index* (NUL-padded/truncated to RECORD_SIZE).

        Creates the file if it does not exist — opening with 'rb+' alone
        raises FileNotFoundError on a fresh database.
        """
        try:
            f = open(self.filename, 'rb+')
        except FileNotFoundError:
            f = open(self.filename, 'wb+')
        with f:
            f.seek(index * self.RECORD_SIZE)
            # Derive the format from RECORD_SIZE so the two stay in sync
            f.write(struct.pack(f'{self.RECORD_SIZE}s', data))

    def read_record(self, index):
        """Return the RECORD_SIZE bytes stored at record *index*."""
        with open(self.filename, 'rb') as f:
            f.seek(index * self.RECORD_SIZE)
            data = f.read(self.RECORD_SIZE)
        return struct.unpack(f'{self.RECORD_SIZE}s', data)[0]
# Usage: store and fetch the record in slot 5
database = BinaryDatabase('records.bin')
database.write_record(5, b'Record data at index 5')
print(database.read_record(5))
Memory-Mapped Files
Memory-mapped files provide array-like access to file contents without explicit read/write calls:
import mmap

# Read-only memory mapping: the OS pages file data in on demand
with open('large_file.bin', 'rb') as f:
    with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mmapped:
        first_byte = mmapped[0]               # direct indexed byte access
        header = mmapped[0:1024]              # slicing copies that range out
        position = mmapped.find(b'\x00\xFF')  # substring search over the file
        # The mapping also supports the file API (seek/read)
        mmapped.seek(100)
        value = struct.unpack('I', mmapped.read(4))[0]
Write operations with memory mapping:
# mmap for writing requires a file opened for update ('r+b')
with open('output.bin', 'r+b') as f:
    with mmap.mmap(f.fileno(), 0) as mmapped:
        mmapped[0:4] = struct.pack('I', 12345)  # in-place edit via slice assignment
        mmapped.seek(1000)                      # or position-then-write
        mmapped.write(b'Updated data')
        mmapped.flush()  # push dirty pages back to disk now
Performance comparison for large file processing:
import time


def traditional_read(filename):
    """Count b'pattern' by slurping the whole file into memory first."""
    with open(filename, 'rb') as f:
        blob = f.read()
    return blob.count(b'pattern')


def mmap_read(filename):
    """Count b'pattern' through a read-only memory mapping."""
    with open(filename, 'rb') as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m:
            return m[:].count(b'pattern')

# For 1GB file, mmap is typically 2-3x faster
Working with Image Files
Reading and manipulating image headers without external libraries:
def read_bmp_header(filename):
    """Parse the fixed-size BMP file header and return its key fields.

    :raises ValueError: if the file does not start with the 'BM' signature
    :return: dict with file_size, data_offset, width, height
    """
    with open(filename, 'rb') as f:
        if f.read(2) != b'BM':
            raise ValueError("Not a BMP file")
        # Little-endian: file size (4), two reserved shorts, pixel-data offset (4)
        file_size, _res1, _res2, data_offset = struct.unpack('<I H H I', f.read(12))
        # DIB header opens with its own size, then signed width/height
        dib_size = struct.unpack('<I', f.read(4))[0]
        width, height = struct.unpack('<i i', f.read(8))
    return {
        'file_size': file_size,
        'data_offset': data_offset,
        'width': width,
        'height': height,
    }
# Read PNG signature
def is_png(filename):
    """Return True if *filename* starts with the 8-byte PNG signature."""
    with open(filename, 'rb') as f:
        return f.read(8) == b'\x89PNG\r\n\x1a\n'
Writing Custom Binary Formats
Creating a simple binary file format with header and data sections:
class BinaryFileWriter:
    """Write a simple 'MYFT' container: a header followed by typed records.

    Layout (all little-endian, no padding):
        header: 4s magic, H version, I record count  -> 10 bytes
        record: B type, I payload length, payload    -> 5 bytes + data

    BUG FIX: the original packed '4s H I' / 'B I' with NATIVE alignment
    (struct.calcsize gives 12 and 8 bytes because I is padded to a 4-byte
    boundary), while the reader consumed only 10 and 5 bytes — every round
    trip raised struct.error. The explicit '<' prefix disables padding so
    writer and reader agree on 10- and 5-byte headers.
    """

    MAGIC = b'MYFT'
    VERSION = 1
    HEADER_FMT = '<4sHI'  # 10 bytes
    RECORD_FMT = '<BI'    # 5 bytes

    def __init__(self, filename):
        self.filename = filename
        self.records = []

    def add_record(self, record_type, data):
        """Queue one (type, payload) record for the next write()."""
        self.records.append((record_type, data))

    def write(self):
        """Serialize the header and all queued records to the file."""
        with open(self.filename, 'wb') as f:
            f.write(struct.pack(self.HEADER_FMT,
                                self.MAGIC, self.VERSION, len(self.records)))
            for rec_type, data in self.records:
                f.write(struct.pack(self.RECORD_FMT, rec_type, len(data)))
                f.write(data)


class BinaryFileReader:
    """Read containers produced by BinaryFileWriter."""

    HEADER_FMT = '<4sHI'  # must match BinaryFileWriter.HEADER_FMT
    RECORD_FMT = '<BI'

    def __init__(self, filename):
        self.filename = filename

    def read(self):
        """Return the list of (type, payload) records.

        :raises ValueError: if the magic signature is not b'MYFT'
        """
        with open(self.filename, 'rb') as f:
            # Sizes come from calcsize so format and read length cannot drift
            header_size = struct.calcsize(self.HEADER_FMT)
            magic, version, count = struct.unpack(self.HEADER_FMT,
                                                  f.read(header_size))
            if magic != b'MYFT':
                raise ValueError("Invalid file format")
            record_size = struct.calcsize(self.RECORD_FMT)
            records = []
            for _ in range(count):
                rec_type, length = struct.unpack(self.RECORD_FMT,
                                                 f.read(record_size))
                records.append((rec_type, f.read(length)))
            return records
# Usage: round-trip two records through the custom format
out = BinaryFileWriter('data.myft')
out.add_record(1, b'First record')
out.add_record(2, b'Second record')
out.write()

src = BinaryFileReader('data.myft')
records = src.read()
Binary file operations are essential for performance-critical applications, protocol implementations, and working with non-text data formats. Understanding byte-level manipulation, structured data packing, and memory-mapped I/O provides the foundation for efficient binary data processing in Python.