NumPy - Unique Values in Array (np.unique)
import numpy as np
Key Insights
- np.unique() returns sorted unique elements with optional indices, inverse indices, and counts — essential for data deduplication and frequency analysis
- Understanding the return_index, return_inverse, and return_counts parameters unlocks advanced array manipulation patterns for data pipelines
- Multi-dimensional array handling requires axis-aware operations or flattening strategies, depending on whether you need row/column-level or element-level uniqueness
Basic Usage and Return Values
np.unique() identifies and returns unique values from an array. By default, it flattens the input and returns a sorted 1D array of unique elements.
import numpy as np

# A small array with duplicates, deliberately out of order
arr = np.array([3, 1, 2, 1, 3, 3, 4, 2])

# np.unique flattens its input, drops duplicates, and sorts the result
unique_values = np.unique(arr)
print(unique_values)
# Output: [1 2 3 4]
The function accepts several parameters that return additional information about the unique elements:
arr = np.array([5, 2, 8, 2, 5, 1, 8])

# return_index=True: position of the FIRST occurrence of each unique value
values, indices = np.unique(arr, return_index=True)
print(f"Values: {values}")   # [1 2 5 8]
print(f"Indices: {indices}") # [5 1 0 2]

# return_inverse=True: for every original element, its position in `values`;
# fancy-indexing values[inverse] therefore rebuilds the original array
values, inverse = np.unique(arr, return_inverse=True)
print(f"Values: {values}")   # [1 2 5 8]
print(f"Inverse: {inverse}") # [2 1 3 1 2 0 3]
print(f"Reconstructed: {values[inverse]}") # [5 2 8 2 5 1 8]

# return_counts=True: how many times each unique value occurs
values, counts = np.unique(arr, return_counts=True)
print(f"Values: {values}") # [1 2 5 8]
print(f"Counts: {counts}") # [1 2 2 2]
Practical Pattern: Frequency Analysis
Combining return_counts=True with array indexing enables efficient frequency analysis without loops.
# Find most common elements
data = np.array([10, 20, 10, 30, 20, 10, 40, 20, 10])
values, counts = np.unique(data, return_counts=True)

# Get top 3 most frequent values.
# Sort on the NEGATED counts with a stable sort so that ties (30 and 40 both
# occur once) are deterministically broken by order of appearance in `values`.
# A plain np.argsort(counts)[::-1] leaves the tie order unspecified and can
# return 40 instead of 30 in third place.
top_indices = np.argsort(-counts, kind="stable")[:3]
top_values = values[top_indices]
top_counts = counts[top_indices]

for val, cnt in zip(top_values, top_counts):
    print(f"Value {val}: {cnt} occurrences")
# Output:
# Value 10: 4 occurrences
# Value 20: 3 occurrences
# Value 30: 1 occurrences
This pattern is particularly useful for categorical data analysis:
# Simulate user activity data: 1000 visits spread over user ids 1000-1009
user_ids = np.random.randint(1000, 1010, size=1000)
unique_users, visit_counts = np.unique(user_ids, return_counts=True)

# Engagement metrics derived from the distinct users and their visit counts
total_visits = len(user_ids)
active_users = len(unique_users)
avg_visits_per_user = total_visits / active_users

print(f"Active users: {active_users}")
print(f"Average visits per user: {avg_visits_per_user:.2f}")
print(f"Max visits by single user: {visit_counts.max()}")
Multi-Dimensional Arrays and Axis Parameter
For multi-dimensional arrays, np.unique() flattens by default. Use the axis parameter to find unique rows or columns.
# 2D array - default behavior (flattens)
arr_2d = np.array([[1, 2, 3],
                   [4, 5, 6],
                   [1, 2, 3]])
unique_elements = np.unique(arr_2d)
print(unique_elements) # [1 2 3 4 5 6]

# axis=0 compares entire rows, so the duplicated [1 2 3] row collapses
unique_rows = np.unique(arr_2d, axis=0)
print(unique_rows)
# [[1 2 3]
#  [4 5 6]]

# axis=1 compares entire columns; the first and last columns are duplicates
arr_cols = np.array([[1, 2, 1],
                     [3, 4, 3],
                     [5, 6, 5]])
unique_cols = np.unique(arr_cols, axis=1)
print(unique_cols)
# [[1 2]
#  [3 4]
#  [5 6]]
Axis-based uniqueness with indices enables advanced filtering:
# Remove duplicate rows and track which of the original rows were kept
data = np.array([[1, 2],
                 [3, 4],
                 [1, 2],
                 [5, 6],
                 [3, 4]])
unique_rows, indices = np.unique(data, axis=0, return_index=True)
print(f"Unique rows:\n{unique_rows}")
print(f"Original indices: {indices}") # [0 1 3]

# The same indices filter any array that is row-aligned with the original
metadata = np.array(['A', 'B', 'C', 'D', 'E'])
unique_metadata = metadata[indices]
print(f"Corresponding metadata: {unique_metadata}") # ['A' 'B' 'D']
Inverse Indices for Grouping Operations
The return_inverse parameter provides a mapping from original positions to unique value positions, enabling efficient group-by operations.
# Group data by categories
categories = np.array(['apple', 'banana', 'apple', 'cherry', 'banana', 'apple'])
values = np.array([10, 20, 15, 30, 25, 12])

unique_cats, inverse_indices = np.unique(categories, return_inverse=True)

# Sum values per category: each element is accumulated into the bin named by
# its inverse index (equivalent to np.add.at on a zeros array, in one call)
sums = np.bincount(inverse_indices, weights=values, minlength=len(unique_cats))

for cat, total in zip(unique_cats, sums):
    print(f"{cat}: {total}")
# Output:
# apple: 37.0
# banana: 45.0
# cherry: 30.0
This pattern extends to complex aggregations:
# Multi-metric aggregation
product_ids = np.array([101, 102, 101, 103, 102, 101, 103])
quantities = np.array([5, 3, 7, 2, 4, 3, 6])
revenues = np.array([50, 30, 70, 20, 40, 30, 60])

unique_products, inverse = np.unique(product_ids, return_inverse=True)

# Accumulate every metric into per-product bins keyed by the inverse indices;
# cast the order counter to float to mirror the zeros()/add.at accumulators
n_products = len(unique_products)
total_qty = np.bincount(inverse, weights=quantities, minlength=n_products)
total_rev = np.bincount(inverse, weights=revenues, minlength=n_products)
order_count = np.bincount(inverse, minlength=n_products).astype(float)

# Create summary report
for pid, qty, rev, orders in zip(unique_products, total_qty, total_rev, order_count):
    avg_order_value = rev / orders
    print(f"Product {pid}: {int(qty)} units, ${rev:.0f} revenue, "
          f"{int(orders)} orders, ${avg_order_value:.2f} AOV")
Performance Considerations
np.unique() uses sorting algorithms with O(n log n) complexity. For large datasets where you only need to check existence, consider alternative approaches.
import time

# Large array performance test
large_array = np.random.randint(0, 1000, size=10_000_000)

# np.unique approach: sorts in native code, O(n log n)
start = time.time()
unique_vals = np.unique(large_array)
np_time = time.time() - start

# Set-based approach for existence check only
# NOTE(review): iterating the array directly boxes every element as a NumPy
# scalar; set(large_array.tolist()) would be considerably faster in practice.
start = time.time()
unique_set = set(large_array)
set_time = time.time() - start

print(f"np.unique time: {np_time:.4f}s")
print(f"set() time: {set_time:.4f}s")
print(f"Number of unique values: {len(unique_vals)}")
For boolean masks indicating first occurrences without sorting:
def first_unique_mask(arr):
    """Return a boolean mask that is True at the first occurrence of each unique value."""
    _, first_positions = np.unique(arr, return_index=True)
    keep = np.zeros(len(arr), dtype=bool)
    keep[first_positions] = True
    return keep

data = np.array([5, 2, 8, 2, 5, 1, 8])
mask = first_unique_mask(data)
first_occurrences = data[mask]
print(first_occurrences) # [5 2 8 1]
Working with Structured Arrays
np.unique() handles structured arrays by comparing entire records:
# Create structured array: each record bundles a name, an age, and a score
dtype = [('name', 'U10'), ('age', 'i4'), ('score', 'f4')]
data = np.array([
    ('Alice', 25, 85.5),
    ('Bob', 30, 92.0),
    ('Alice', 25, 85.5),
    ('Charlie', 25, 88.0)
], dtype=dtype)

# Whole records are compared field by field, so only exact duplicates collapse
unique_records = np.unique(data)
print(unique_records)
# [('Alice', 25, 85.5) ('Bob', 30, 92.0) ('Charlie', 25, 88.0)]

# A single field behaves like an ordinary 1D array
unique_ages = np.unique(data['age'])
print(unique_ages) # [25 30]
Edge Cases and Data Types
np.unique() handles various data types including strings, floating-point numbers with NaN, and complex numbers:
# String arrays: ordering is lexicographic, so uppercase sorts before lowercase
strings = np.array(['apple', 'banana', 'apple', 'Cherry', 'banana'])
print(np.unique(strings)) # ['Cherry' 'apple' 'banana']

# Floating point with NaN: NaNs sort to the end (collapsed to a single NaN
# on NumPy >= 1.21)
floats = np.array([1.5, 2.3, np.nan, 1.5, np.nan, 2.3])
print(np.unique(floats)) # [1.5 2.3 nan]

# Complex numbers compare by real part first, then imaginary part
complex_arr = np.array([1+2j, 3+4j, 1+2j, 5+6j])
print(np.unique(complex_arr)) # [1.+2.j 3.+4.j 5.+6.j]

# Boolean arrays: False sorts before True
bools = np.array([True, False, True, True, False])
print(np.unique(bools)) # [False True]
For floating-point comparisons with tolerance, pre-round the data:
# Handle floating point precision issues
measurements = np.array([1.0, 1.0000001, 1.0000002, 2.0, 2.0000001])

# Rounding first maps near-identical readings onto one representative value,
# so exact comparison inside np.unique does the right thing
rounded = np.round(measurements, decimals=5)
unique_measurements = np.unique(rounded)
print(unique_measurements) # [1. 2.]