Python - Read/Write Binary Files
Binary files contain raw bytes without text encoding interpretation. Unlike text files, binary mode preserves exact byte sequences, making it critical for non-text data.
Key Insights
- Binary file operations in Python use mode flags 'rb' and 'wb', bypassing text encoding to work directly with raw bytes, essential for images, audio, executables, and custom binary formats
- The `struct` module provides precise control over binary data serialization with C-style format strings, enabling interoperability with low-level systems and network protocols
- Memory-mapped files via `mmap` offer significant performance gains for large binary files by mapping file contents directly into memory, avoiding expensive I/O operations
Basic Binary File Operations
Binary files contain raw bytes without text encoding interpretation. Unlike text files, binary mode preserves exact byte sequences, making it critical for non-text data.
# Writing binary data
data = b'Hello'  # same bytes as bytes([0x48, 0x65, 0x6C, 0x6C, 0x6F])
with open('output.bin', 'wb') as f:
    f.write(data)

# Reading binary data back: 'rb' returns raw bytes, no decoding
with open('output.bin', 'rb') as f:
    content = f.read()

print(content)        # b'Hello'
print(list(content))  # [72, 101, 108, 108, 111]
The bytes type is immutable. For mutable byte sequences, use bytearray:
buffer = bytearray(1024)  # pre-allocated, reusable 1 KB buffer
with open('data.bin', 'rb') as f:
    # readinto() fills the existing buffer in place instead of
    # allocating a fresh bytes object per read
    bytes_read = f.readinto(buffer)
    print(f"Read {bytes_read} bytes")
    # Process buffer without additional allocation
Working with Structured Binary Data
The struct module translates between Python values and C structs represented as bytes. This is fundamental for binary protocols and file formats.
import struct

# Pack values into a fixed binary layout:
#   'I' unsigned int, 'f' float, '10s' ten-byte (NUL-padded) string
header = struct.pack('I f 10s', 42, 3.14159, b'HEADER')
with open('structured.bin', 'wb') as f:
    f.write(header)

# Unpack the same layout back into Python values
with open('structured.bin', 'rb') as f:
    data = f.read(18)  # 4 + 4 + 10 bytes
unpacked = struct.unpack('I f 10s', data)
print(unpacked)  # (42, 3.1415901184082031, b'HEADER\x00\x00\x00\x00')
Common format characters:
# Integer types
signed_byte = struct.pack('b', -128)          # signed char, 1 byte
unsigned_byte = struct.pack('B', 255)         # unsigned char, 1 byte
short_val = struct.pack('h', -32768)          # short, 2 bytes
int_val = struct.pack('i', 2147483647)        # int, 4 bytes
long_val = struct.pack('q', 2**63 - 1)        # long long, 8 bytes

# Floating point
float_val = struct.pack('f', 3.14)            # 4-byte IEEE 754
double_val = struct.pack('d', 3.14159)        # 8-byte IEEE 754

# Byte order prefixes
little = struct.pack('<I', 42)                # little-endian
big = struct.pack('>I', 42)                   # big-endian
network = struct.pack('!I', 42)               # network order (big-endian)
Reading Binary Files in Chunks
For large files, reading in chunks prevents memory exhaustion:
def process_large_binary(filename, chunk_size=8192):
    """Process a binary file in fixed-size chunks to bound memory use.

    The original version computed ``process_bytes(chunk)`` and silently
    discarded the result; this version accumulates the per-chunk results
    and returns their sum (callers that ignored the old ``None`` return
    are unaffected).

    :param filename: path of the binary file to process
    :param chunk_size: bytes read per iteration (default 8192)
    :return: sum of ``process_bytes`` results over all chunks
    """
    total = 0
    with open(filename, 'rb') as f:
        # iter() with a b'' sentinel stops cleanly at end-of-file
        for chunk in iter(lambda: f.read(chunk_size), b''):
            total += process_bytes(chunk)
    return total


def process_bytes(data):
    """Example chunk processor: count NUL (0x00) bytes in *data*."""
    return data.count(0x00)
Using iteration for line-like processing:
def read_fixed_records(filename, record_size):
    """Yield a binary file as consecutive records of exactly *record_size* bytes.

    A trailing partial record (shorter than *record_size*) is dropped.
    """
    with open(filename, 'rb') as f:
        for block in iter(lambda: f.read(record_size), b''):
            if len(block) < record_size:
                return  # incomplete tail — stop without yielding it
            yield block
# Usage: stream 64-byte records and decode the leading fields of each
for rec in read_fixed_records('records.bin', 64):
    # First 58 bytes hold: unsigned int id, 50-byte name, float value
    id_num, name, value = struct.unpack('I 50s f', rec[:58])
Seeking and Random Access
Binary files support efficient random access using seek() and tell():
with open('database.bin', 'rb+') as f:  # 'rb+' opens for both reading and writing
    f.seek(1000)        # absolute: jump to byte offset 1000
    data = f.read(100)  # reading advances the position by 100
    pos = f.tell()      # now 1100
    f.seek(-10, 2)      # whence=2: offset relative to end of file
    f.seek(5, 1)        # whence=1: offset relative to current position
Practical example - updating specific records:
class BinaryDatabase:
    """Random-access store of fixed-size records in a single binary file."""

    RECORD_SIZE = 64  # bytes per record

    def __init__(self, filename):
        self.filename = filename

    def write_record(self, index, data):
        """Write *data* at record *index* (NUL-padded/truncated to RECORD_SIZE).

        Creates the file if it does not exist — opening with 'rb+' alone
        raises FileNotFoundError on a fresh database.
        """
        try:
            f = open(self.filename, 'rb+')
        except FileNotFoundError:
            f = open(self.filename, 'wb+')
        with f:
            f.seek(index * self.RECORD_SIZE)
            # Derive the format from RECORD_SIZE so the two stay in sync
            f.write(struct.pack(f'{self.RECORD_SIZE}s', data))

    def read_record(self, index):
        """Return the RECORD_SIZE bytes stored at record *index*."""
        with open(self.filename, 'rb') as f:
            f.seek(index * self.RECORD_SIZE)
            data = f.read(self.RECORD_SIZE)
        return struct.unpack(f'{self.RECORD_SIZE}s', data)[0]
# Usage: store and fetch the record in slot 5
database = BinaryDatabase('records.bin')
database.write_record(5, b'Record data at index 5')
print(database.read_record(5))
Memory-Mapped Files
Memory-mapped files provide array-like access to file contents without explicit read/write calls:
import mmap

# Read-only memory mapping: the OS pages file data in on demand
with open('large_file.bin', 'rb') as f:
    with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mmapped:
        first_byte = mmapped[0]               # direct indexed byte access
        header = mmapped[0:1024]              # slicing copies that range out
        position = mmapped.find(b'\x00\xFF')  # substring search over the file
        # The mapping also supports the file API (seek/read)
        mmapped.seek(100)
        value = struct.unpack('I', mmapped.read(4))[0]
Write operations with memory mapping:
# mmap for writing requires a file opened for update ('r+b')
with open('output.bin', 'r+b') as f:
    with mmap.mmap(f.fileno(), 0) as mmapped:
        mmapped[0:4] = struct.pack('I', 12345)  # in-place edit via slice assignment
        mmapped.seek(1000)                      # or position-then-write
        mmapped.write(b'Updated data')
        mmapped.flush()  # push dirty pages back to disk now
Performance comparison for large file processing:
import time


def traditional_read(filename):
    """Count b'pattern' by slurping the whole file into memory first."""
    with open(filename, 'rb') as f:
        blob = f.read()
    return blob.count(b'pattern')


def mmap_read(filename):
    """Count b'pattern' through a read-only memory mapping."""
    with open(filename, 'rb') as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m:
            return m[:].count(b'pattern')

# For 1GB file, mmap is typically 2-3x faster
Working with Image Files
Reading and manipulating image headers without external libraries:
def read_bmp_header(filename):
    """Parse the fixed-size BMP file header and return its key fields.

    :raises ValueError: if the file does not start with the 'BM' signature
    :return: dict with file_size, data_offset, width, height
    """
    with open(filename, 'rb') as f:
        if f.read(2) != b'BM':
            raise ValueError("Not a BMP file")
        # Little-endian: file size (4), two reserved shorts, pixel-data offset (4)
        file_size, _res1, _res2, data_offset = struct.unpack('<I H H I', f.read(12))
        # DIB header opens with its own size, then signed width/height
        dib_size = struct.unpack('<I', f.read(4))[0]
        width, height = struct.unpack('<i i', f.read(8))
    return {
        'file_size': file_size,
        'data_offset': data_offset,
        'width': width,
        'height': height,
    }
# Read PNG signature
def is_png(filename):
    """Return True if *filename* starts with the 8-byte PNG signature."""
    with open(filename, 'rb') as f:
        return f.read(8) == b'\x89PNG\r\n\x1a\n'
Writing Custom Binary Formats
Creating a simple binary file format with header and data sections:
class BinaryFileWriter:
    """Write a simple 'MYFT' container: a header followed by typed records.

    Layout (all little-endian, no padding):
        header: 4s magic, H version, I record count  -> 10 bytes
        record: B type, I payload length, payload    -> 5 bytes + data

    BUG FIX: the original packed '4s H I' / 'B I' with NATIVE alignment
    (struct.calcsize gives 12 and 8 bytes because I is padded to a 4-byte
    boundary), while the reader consumed only 10 and 5 bytes — every round
    trip raised struct.error. The explicit '<' prefix disables padding so
    writer and reader agree on 10- and 5-byte headers.
    """

    MAGIC = b'MYFT'
    VERSION = 1
    HEADER_FMT = '<4sHI'  # 10 bytes
    RECORD_FMT = '<BI'    # 5 bytes

    def __init__(self, filename):
        self.filename = filename
        self.records = []

    def add_record(self, record_type, data):
        """Queue one (type, payload) record for the next write()."""
        self.records.append((record_type, data))

    def write(self):
        """Serialize the header and all queued records to the file."""
        with open(self.filename, 'wb') as f:
            f.write(struct.pack(self.HEADER_FMT,
                                self.MAGIC, self.VERSION, len(self.records)))
            for rec_type, data in self.records:
                f.write(struct.pack(self.RECORD_FMT, rec_type, len(data)))
                f.write(data)


class BinaryFileReader:
    """Read containers produced by BinaryFileWriter."""

    HEADER_FMT = '<4sHI'  # must match BinaryFileWriter.HEADER_FMT
    RECORD_FMT = '<BI'

    def __init__(self, filename):
        self.filename = filename

    def read(self):
        """Return the list of (type, payload) records.

        :raises ValueError: if the magic signature is not b'MYFT'
        """
        with open(self.filename, 'rb') as f:
            # Sizes come from calcsize so format and read length cannot drift
            header_size = struct.calcsize(self.HEADER_FMT)
            magic, version, count = struct.unpack(self.HEADER_FMT,
                                                  f.read(header_size))
            if magic != b'MYFT':
                raise ValueError("Invalid file format")
            record_size = struct.calcsize(self.RECORD_FMT)
            records = []
            for _ in range(count):
                rec_type, length = struct.unpack(self.RECORD_FMT,
                                                 f.read(record_size))
                records.append((rec_type, f.read(length)))
            return records
# Usage: round-trip two records through the custom format
out = BinaryFileWriter('data.myft')
out.add_record(1, b'First record')
out.add_record(2, b'Second record')
out.write()

src = BinaryFileReader('data.myft')
records = src.read()
Binary file operations are essential for performance-critical applications, protocol implementations, and working with non-text data formats. Understanding byte-level manipulation, structured data packing, and memory-mapped I/O provides the foundation for efficient binary data processing in Python.