Python - Read/Write Binary Files

Binary files contain raw bytes without text encoding interpretation. Unlike text files, binary mode preserves exact byte sequences, making it critical for non-text data.

Key Insights

  • Binary file operations in Python use the mode flags 'rb' and 'wb', which bypass text encoding and work directly with raw bytes. This is essential for images, audio, executables, and custom binary formats
  • The struct module provides precise control over binary data serialization with C-style format strings, enabling interoperability with low-level systems and network protocols
  • Memory-mapped files via mmap can offer significant performance gains for large binary files by mapping file contents directly into the process's address space, so data is accessed like a byte array without explicit read/write calls or extra buffer copies

Basic Binary File Operations

Binary files contain raw bytes that Python does not decode as text. Unlike text mode, binary mode performs no encoding or newline translation, so byte sequences are preserved exactly, which is essential for non-text data.

# Build the payload from its hexadecimal form: the ASCII bytes of "Hello"
data = bytes.fromhex('48 65 6C 6C 6F')

# 'wb' = write, binary: the bytes reach the file exactly as given
with open('output.bin', 'wb') as f:
    f.write(data)

# 'rb' = read, binary: read() returns a bytes object, not a str
with open('output.bin', 'rb') as f:
    content = f.read()

print(content)  # b'Hello'
print(list(content))  # [72, 101, 108, 108, 111]

The bytes type is immutable. For mutable byte sequences, use bytearray:

buffer = bytearray(1024)  # Pre-allocate 1KB buffer; readinto() fills it in place

with open('data.bin', 'rb') as f:
    # readinto() returns the number of bytes it stored in the buffer, which
    # can be fewer than len(buffer) (for example near the end of the file)
    bytes_read = f.readinto(buffer)
    print(f"Read {bytes_read} bytes")
    # Process buffer without additional allocation
    # (only buffer[:bytes_read] holds data from this read)

Working with Structured Binary Data

The struct module translates between Python values and C structs represented as bytes. This is fundamental for binary protocols and file formats.

import struct

# Pack data into binary format
# Format: 'I' = unsigned int, 'f' = float, '10s' = 10-byte string
# The '<' prefix selects little-endian byte order with standard sizes and no
# alignment padding, so the layout is identical on every platform. Without a
# prefix, struct uses the native C layout, whose size and padding can vary.
HEADER_FORMAT = '<I f 10s'
HEADER_SIZE = struct.calcsize(HEADER_FORMAT)  # 4 + 4 + 10 = 18 bytes

header = struct.pack(HEADER_FORMAT, 42, 3.14159, b'HEADER')

with open('structured.bin', 'wb') as f:
    f.write(header)

# Unpack binary data
with open('structured.bin', 'rb') as f:
    # Read exactly as many bytes as the format describes instead of a
    # hand-computed constant
    data = f.read(HEADER_SIZE)
    unpacked = struct.unpack(HEADER_FORMAT, data)
    # The float comes back with single precision; '10s' is null-padded
    print(unpacked)  # (42, 3.141590118408203, b'HEADER\x00\x00\x00\x00')

Common format characters:

# Integer types (each call returns a bytes object; the values are range limits)
struct.pack('b', -128)      # signed char (1 byte), minimum value
struct.pack('B', 255)       # unsigned char (1 byte), maximum value
struct.pack('h', -32768)    # short (2 bytes), minimum value
struct.pack('i', 2147483647) # int (4 bytes), maximum value
struct.pack('q', 2**63-1)   # long long (8 bytes), maximum value

# Floating point
struct.pack('f', 3.14)      # float (4 bytes, single precision)
struct.pack('d', 3.14159)   # double (8 bytes, double precision)

# Byte order prefixes (first character of the format string)
struct.pack('<I', 42)       # Little-endian: b'*\x00\x00\x00'
struct.pack('>I', 42)       # Big-endian: b'\x00\x00\x00*'
struct.pack('!I', 42)       # Network (big-endian): same bytes as '>'

Reading Binary Files in Chunks

For large files, reading in chunks prevents memory exhaustion:

def process_large_binary(filename, chunk_size=8192):
    """Count the zero bytes in a file while holding only one chunk in memory.

    Args:
        filename: Path of the file to scan.
        chunk_size: Maximum number of bytes requested per read.

    Returns:
        The sum of process_bytes() over every chunk, i.e. the number of
        0x00 bytes in the whole file.
    """
    total = 0
    with open(filename, 'rb') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:  # read() returns b'' at end of file
                break
            # Keep the per-chunk result instead of discarding it
            total += process_bytes(chunk)
    return total


def process_bytes(data):
    """Return how many 0x00 bytes `data` contains."""
    # Example: count specific byte value
    return data.count(0x00)

Using a generator to iterate over fixed-size records:

def read_fixed_records(filename, record_size):
    """Read binary file as fixed-size records.

    Args:
        filename: Path of the file to read.
        record_size: Size of every record in bytes; must be positive.

    Yields:
        bytes objects of exactly `record_size` bytes, in file order. A
        trailing fragment shorter than `record_size` is not yielded.

    Raises:
        ValueError: If `record_size` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # read(0) always returns b'' and read(-1) reads the whole file, so with a
    # non-positive size the short-read check below never fires and the loop
    # would yield forever.
    if record_size <= 0:
        raise ValueError("record_size must be a positive integer")
    with open(filename, 'rb') as f:
        while True:
            record = f.read(record_size)
            if len(record) < record_size:
                break
            yield record

# Usage
# '<' selects standard sizes with no alignment padding, so this layout is
# exactly 4 + 50 + 4 = 58 bytes. The native-layout 'I 50s f' aligns the float
# to a 4-byte boundary (60 bytes on common platforms), and unpacking a
# 58-byte slice with it raises struct.error.
RECORD_LAYOUT = struct.Struct('<I 50s f')

for record in read_fixed_records('records.bin', 64):
    # Each record is exactly 64 bytes; the fields occupy the first 58
    id_num, name, value = RECORD_LAYOUT.unpack_from(record)

Seeking and Random Access

Binary files support efficient random access using seek() and tell():

import os  # named whence constants for seek()

with open('database.bin', 'rb+') as f:  # r+ allows read/write
    # Jump to byte 1000 (offsets are measured from the start by default)
    f.seek(1000)

    # Read 100 bytes
    data = f.read(100)

    # Current position
    pos = f.tell()  # 1100, provided the file holds at least 1100 bytes

    # Seek from end: -10 bytes from EOF
    f.seek(-10, os.SEEK_END)

    # Seek from current position
    f.seek(5, os.SEEK_CUR)

Practical example - updating specific records:

class BinaryDatabase:
    """Fixed-size record store kept in a single binary file.

    Record `index` starts at byte offset `index * RECORD_SIZE`. The file is
    opened and closed on every call; nothing is kept open in between.
    """

    # Size of every record in bytes; the pack/unpack format is derived from
    # this value so that overriding it in a subclass stays consistent.
    RECORD_SIZE = 64

    def __init__(self, filename):
        """Remember the path of the database file; nothing is opened yet."""
        self.filename = filename

    def write_record(self, index, data):
        """Write record at specific index.

        Args:
            index: Zero-based record number.
            data: bytes of at most RECORD_SIZE bytes; shorter values are
                padded with null bytes up to RECORD_SIZE.

        Raises:
            ValueError: If `data` is longer than RECORD_SIZE (the 's' format
                would otherwise silently drop the excess bytes).
            FileNotFoundError: If the file does not exist; mode 'rb+' never
                creates it.
        """
        if len(data) > self.RECORD_SIZE:
            raise ValueError(
                f"record is {len(data)} bytes; maximum is {self.RECORD_SIZE}"
            )
        with open(self.filename, 'rb+') as f:
            f.seek(index * self.RECORD_SIZE)
            f.write(struct.pack(f'{self.RECORD_SIZE}s', data))

    def read_record(self, index):
        """Read record at specific index.

        Args:
            index: Zero-based record number.

        Returns:
            The RECORD_SIZE-byte record, including any null padding.

        Raises:
            struct.error: If fewer than RECORD_SIZE bytes are available at
                that offset (for example when reading past the end).
        """
        with open(self.filename, 'rb') as f:
            f.seek(index * self.RECORD_SIZE)
            data = f.read(self.RECORD_SIZE)
            return struct.unpack(f'{self.RECORD_SIZE}s', data)[0]

# Usage
# write_record() opens the file with 'rb+', which does not create it, so
# records.bin must already exist before this runs.
db = BinaryDatabase('records.bin')
db.write_record(5, b'Record data at index 5')
# Prints the full 64-byte record: the payload followed by null-byte padding
print(db.read_record(5))

Memory-Mapped Files

Memory-mapped files provide array-like access to file contents without explicit read/write calls:

import mmap

# Read-only memory mapping (a length of 0 maps the entire file)
with open('large_file.bin', 'rb') as f, \
        mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mmapped:
    # Indexing with an int gives a single byte as an integer
    first_byte = mmapped[0]

    # Slicing gives a bytes object for that range
    header = mmapped[0:1024]

    # find() gives the offset of the first match, or -1 if there is none
    position = mmapped.find(b'\x00\xFF')

    # The map also has a file-like cursor: move to offset 100, then read the
    # 4 bytes of one unsigned int
    mmapped.seek(100)
    (value,) = struct.unpack('I', mmapped.read(4))

Write operations with memory mapping:

with open('output.bin', 'r+b') as f, mmap.mmap(f.fileno(), 0) as mmapped:
    # Slice assignment overwrites bytes in place; the replacement must be
    # exactly as long as the slice (4 bytes here)
    mmapped[0:4] = struct.pack('I', 12345)

    # Move the map's cursor, then write starting there
    # NOTE(review): a length of 0 maps only the file's current size, so
    # output.bin must already be at least 1012 bytes — confirm the input file
    mmapped.seek(1000)
    mmapped.write(b'Updated data')

    # Push the modified pages back to the file
    mmapped.flush()

Performance comparison for large file processing:

import time

def traditional_read(filename):
    """Count b'pattern' occurrences after loading the whole file with read()."""
    with open(filename, 'rb') as source:
        return source.read().count(b'pattern')

def mmap_read(filename):
    """Count non-overlapping occurrences of b'pattern' through a memory map.

    The map is searched in place with find(); slicing it with m[:] would
    copy the entire file into a bytes object first, which is exactly the
    cost the mapping is meant to avoid.

    Args:
        filename: Path of the file to scan.

    Returns:
        The number of non-overlapping matches (the same result bytes.count
        gives for the file's contents); 0 for an empty file.
    """
    needle = b'pattern'
    with open(filename, 'rb') as f:
        # mmap cannot map a zero-length file (it raises ValueError), so an
        # empty file is answered directly. seek() returns the new offset,
        # which at the end of the file equals its size.
        if f.seek(0, 2) == 0:
            return 0
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m:
            matches = 0
            pos = m.find(needle)
            while pos != -1:
                matches += 1
                # Resume after the match so occurrences never overlap
                pos = m.find(needle, pos + len(needle))
            return matches

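# For a ~1 GB file, mmap is often quoted as 2-3x faster, but the gain depends
# on OS caching and the access pattern; measure both functions on your own
# data (e.g. with time.perf_counter()) before relying on it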

Working with Image Files

Reading and manipulating image headers without external libraries:

def read_bmp_header(filename):
    """Read BMP file header.

    Args:
        filename: Path of the BMP file.

    Returns:
        A dict with 'file_size', 'data_offset', 'width' and 'height' as
        stored in the file. With the common 40-byte (or larger) DIB header
        the dimensions are signed, so 'height' can be negative.

    Raises:
        ValueError: If the file does not start with the b'BM' signature.
        struct.error: If the file ends before the header fields are complete.
    """
    with open(filename, 'rb') as f:
        # BMP signature
        signature = f.read(2)
        if signature != b'BM':
            raise ValueError("Not a BMP file")

        # File size, 4 reserved bytes (skipped with pad bytes), data offset
        file_size, data_offset = struct.unpack('<I 4x I', f.read(12))

        # DIB header: its first field is its own size, which identifies the
        # header variant
        dib_size = struct.unpack('<I', f.read(4))[0]
        if dib_size == 12:
            # The 12-byte BITMAPCOREHEADER stores width and height as
            # unsigned 16-bit values, not 32-bit
            width, height = struct.unpack('<H H', f.read(4))
        else:
            width, height = struct.unpack('<i i', f.read(8))

        return {
            'file_size': file_size,
            'data_offset': data_offset,
            'width': width,
            'height': height
        }

# Check for the PNG signature
def is_png(filename):
    """Return True if the file begins with the 8-byte PNG signature."""
    png_magic = b'\x89PNG\r\n\x1a\n'
    with open(filename, 'rb') as stream:
        return stream.read(len(png_magic)) == png_magic

Writing Custom Binary Formats

Creating a simple binary file format with header and data sections:

class BinaryFileWriter:
    """Collect records in memory and write them as a MYFT file.

    File layout (little-endian, no padding):
        header: 4-byte magic, 2-byte version, 4-byte record count (10 bytes)
        record: 1-byte type, 4-byte payload length (5 bytes), then payload
    """

    MAGIC = b'MYFT'
    VERSION = 1

    # '<' selects little-endian byte order with standard sizes and no
    # alignment padding. Without it struct uses the native C layout, which
    # pads these headers to 12 and 8 bytes on common platforms and no longer
    # matches the 10- and 5-byte sizes of the format.
    HEADER = struct.Struct('<4s H I')
    RECORD_HEADER = struct.Struct('<B I')

    def __init__(self, filename):
        """Remember the output path and start with no records."""
        self.filename = filename
        self.records = []

    def add_record(self, record_type, data):
        """Queue one record in memory; nothing is written until write().

        Args:
            record_type: Integer stored in one unsigned byte (0-255).
            data: Payload bytes.
        """
        self.records.append((record_type, data))

    def write(self):
        """Create or overwrite the file with the header and queued records."""
        with open(self.filename, 'wb') as f:
            # Write header
            f.write(self.HEADER.pack(self.MAGIC, self.VERSION, len(self.records)))

            # Write records: fixed-size record header, then the payload
            for rec_type, data in self.records:
                f.write(self.RECORD_HEADER.pack(rec_type, len(data)))
                f.write(data)


class BinaryFileReader:
    """Read the records of a MYFT file produced by BinaryFileWriter."""

    def __init__(self, filename):
        """Remember the input path; the file is opened by read()."""
        self.filename = filename

    def read(self):
        """Return the file's records as a list of (rec_type, data) tuples.

        Raises:
            ValueError: If the magic bytes are wrong or the file ends before
                a header or payload is complete.
        """
        # Share the writer's layouts so both sides always agree on sizes
        header = BinaryFileWriter.HEADER
        record_header = BinaryFileWriter.RECORD_HEADER

        with open(self.filename, 'rb') as f:
            # Read header
            raw = f.read(header.size)
            if len(raw) < header.size:
                raise ValueError("Invalid file format")
            # The version is parsed but not checked
            magic, _version, count = header.unpack(raw)

            if magic != BinaryFileWriter.MAGIC:
                raise ValueError("Invalid file format")

            # Read records
            records = []
            for _ in range(count):
                raw = f.read(record_header.size)
                if len(raw) < record_header.size:
                    raise ValueError("Truncated record header")
                rec_type, length = record_header.unpack(raw)
                data = f.read(length)
                if len(data) < length:
                    raise ValueError("Truncated record data")
                records.append((rec_type, data))

            return records


# Usage
writer = BinaryFileWriter('data.myft')
writer.add_record(1, b'First record')
writer.add_record(2, b'Second record')
writer.write()

reader = BinaryFileReader('data.myft')
records = reader.read()  # [(1, b'First record'), (2, b'Second record')]

Binary file operations are essential for performance-critical applications, protocol implementations, and working with non-text data formats. Understanding byte-level manipulation, structured data packing, and memory-mapped I/O provides the foundation for efficient binary data processing in Python.

Liked this? There's more.

Every week: one practical technique, explained simply, with code you can use immediately.