NumPy - Unique Values in Array (np.unique)
import numpy as np
Key Insights
- np.unique() returns sorted unique elements with optional indices, inverse indices, and counts — essential for data deduplication and frequency analysis
- Understanding the return_index, return_inverse, and return_counts parameters unlocks advanced array manipulation patterns for data pipelines
- Multi-dimensional array handling requires axis-aware operations or flattening strategies, depending on whether you need row/column-level or element-level uniqueness
Basic Usage and Return Values
np.unique() identifies and returns unique values from an array. By default, it flattens the input and returns a sorted 1D array of unique elements.
import numpy as np

# A small array with duplicates, deliberately out of order
arr = np.array([3, 1, 2, 1, 3, 3, 4, 2])

# np.unique flattens its input, drops duplicates, and sorts the result
unique_values = np.unique(arr)
print(unique_values)
# Output: [1 2 3 4]
The function accepts several parameters that return additional information about the unique elements:
arr = np.array([5, 2, 8, 2, 5, 1, 8])

# return_index=True: position of the FIRST occurrence of each unique value
values, indices = np.unique(arr, return_index=True)
print(f"Values: {values}")   # [1 2 5 8]
print(f"Indices: {indices}") # [5 1 0 2]

# return_inverse=True: for every original element, its position in `values`;
# fancy-indexing values[inverse] therefore rebuilds the original array
values, inverse = np.unique(arr, return_inverse=True)
print(f"Values: {values}")   # [1 2 5 8]
print(f"Inverse: {inverse}") # [2 1 3 1 2 0 3]
print(f"Reconstructed: {values[inverse]}") # [5 2 8 2 5 1 8]

# return_counts=True: how many times each unique value occurs
values, counts = np.unique(arr, return_counts=True)
print(f"Values: {values}") # [1 2 5 8]
print(f"Counts: {counts}") # [1 2 2 2]
Practical Pattern: Frequency Analysis
Combining return_counts=True with array indexing enables efficient frequency analysis without loops.
# Find most common elements
data = np.array([10, 20, 10, 30, 20, 10, 40, 20, 10])
values, counts = np.unique(data, return_counts=True)

# Get top 3 most frequent values.
# Sort on the NEGATED counts with a stable sort so that ties (30 and 40 both
# occur once) are deterministically broken by order of appearance in `values`.
# A plain np.argsort(counts)[::-1] leaves the tie order unspecified and can
# return 40 instead of 30 in third place.
top_indices = np.argsort(-counts, kind="stable")[:3]
top_values = values[top_indices]
top_counts = counts[top_indices]

for val, cnt in zip(top_values, top_counts):
    print(f"Value {val}: {cnt} occurrences")
# Output:
# Value 10: 4 occurrences
# Value 20: 3 occurrences
# Value 30: 1 occurrences
This pattern is particularly useful for categorical data analysis:
# Simulate user activity data: 1000 visits spread over user ids 1000-1009
user_ids = np.random.randint(1000, 1010, size=1000)
unique_users, visit_counts = np.unique(user_ids, return_counts=True)

# Engagement metrics derived from the distinct users and their visit counts
total_visits = len(user_ids)
active_users = len(unique_users)
avg_visits_per_user = total_visits / active_users

print(f"Active users: {active_users}")
print(f"Average visits per user: {avg_visits_per_user:.2f}")
print(f"Max visits by single user: {visit_counts.max()}")
Multi-Dimensional Arrays and Axis Parameter
For multi-dimensional arrays, np.unique() flattens by default. Use the axis parameter to find unique rows or columns.
# 2D array - default behavior (flattens)
arr_2d = np.array([[1, 2, 3],
                   [4, 5, 6],
                   [1, 2, 3]])
unique_elements = np.unique(arr_2d)
print(unique_elements) # [1 2 3 4 5 6]

# axis=0 compares entire rows, so the duplicated [1 2 3] row collapses
unique_rows = np.unique(arr_2d, axis=0)
print(unique_rows)
# [[1 2 3]
#  [4 5 6]]

# axis=1 compares entire columns; the first and last columns are duplicates
arr_cols = np.array([[1, 2, 1],
                     [3, 4, 3],
                     [5, 6, 5]])
unique_cols = np.unique(arr_cols, axis=1)
print(unique_cols)
# [[1 2]
#  [3 4]
#  [5 6]]
Axis-based uniqueness with indices enables advanced filtering:
# Remove duplicate rows and track which of the original rows were kept
data = np.array([[1, 2],
                 [3, 4],
                 [1, 2],
                 [5, 6],
                 [3, 4]])
unique_rows, indices = np.unique(data, axis=0, return_index=True)
print(f"Unique rows:\n{unique_rows}")
print(f"Original indices: {indices}") # [0 1 3]

# The same indices filter any array that is row-aligned with the original
metadata = np.array(['A', 'B', 'C', 'D', 'E'])
unique_metadata = metadata[indices]
print(f"Corresponding metadata: {unique_metadata}") # ['A' 'B' 'D']
Inverse Indices for Grouping Operations
The return_inverse parameter provides a mapping from original positions to unique value positions, enabling efficient group-by operations.
# Group data by categories
categories = np.array(['apple', 'banana', 'apple', 'cherry', 'banana', 'apple'])
values = np.array([10, 20, 15, 30, 25, 12])

unique_cats, inverse_indices = np.unique(categories, return_inverse=True)

# Sum values per category: each element is accumulated into the bin named by
# its inverse index (equivalent to np.add.at on a zeros array, in one call)
sums = np.bincount(inverse_indices, weights=values, minlength=len(unique_cats))

for cat, total in zip(unique_cats, sums):
    print(f"{cat}: {total}")
# Output:
# apple: 37.0
# banana: 45.0
# cherry: 30.0
This pattern extends to complex aggregations:
# Multi-metric aggregation
product_ids = np.array([101, 102, 101, 103, 102, 101, 103])
quantities = np.array([5, 3, 7, 2, 4, 3, 6])
revenues = np.array([50, 30, 70, 20, 40, 30, 60])

unique_products, inverse = np.unique(product_ids, return_inverse=True)

# Accumulate every metric into per-product bins keyed by the inverse indices;
# cast the order counter to float to mirror the zeros()/add.at accumulators
n_products = len(unique_products)
total_qty = np.bincount(inverse, weights=quantities, minlength=n_products)
total_rev = np.bincount(inverse, weights=revenues, minlength=n_products)
order_count = np.bincount(inverse, minlength=n_products).astype(float)

# Create summary report
for pid, qty, rev, orders in zip(unique_products, total_qty, total_rev, order_count):
    avg_order_value = rev / orders
    print(f"Product {pid}: {int(qty)} units, ${rev:.0f} revenue, "
          f"{int(orders)} orders, ${avg_order_value:.2f} AOV")
Performance Considerations
np.unique() uses sorting algorithms with O(n log n) complexity. For large datasets where you only need to check existence, consider alternative approaches.
import time

# Large array performance test
large_array = np.random.randint(0, 1000, size=10_000_000)

# np.unique approach: sorts in native code, O(n log n)
start = time.time()
unique_vals = np.unique(large_array)
np_time = time.time() - start

# Set-based approach for existence check only
# NOTE(review): iterating the array directly boxes every element as a NumPy
# scalar; set(large_array.tolist()) would be considerably faster in practice.
start = time.time()
unique_set = set(large_array)
set_time = time.time() - start

print(f"np.unique time: {np_time:.4f}s")
print(f"set() time: {set_time:.4f}s")
print(f"Number of unique values: {len(unique_vals)}")
For boolean masks indicating first occurrences without sorting:
def first_unique_mask(arr):
    """Return a boolean mask that is True at the first occurrence of each unique value."""
    _, first_positions = np.unique(arr, return_index=True)
    keep = np.zeros(len(arr), dtype=bool)
    keep[first_positions] = True
    return keep

data = np.array([5, 2, 8, 2, 5, 1, 8])
mask = first_unique_mask(data)
first_occurrences = data[mask]
print(first_occurrences) # [5 2 8 1]
Working with Structured Arrays
np.unique() handles structured arrays by comparing entire records:
# Create structured array: each record bundles a name, an age, and a score
dtype = [('name', 'U10'), ('age', 'i4'), ('score', 'f4')]
data = np.array([
    ('Alice', 25, 85.5),
    ('Bob', 30, 92.0),
    ('Alice', 25, 85.5),
    ('Charlie', 25, 88.0)
], dtype=dtype)

# Whole records are compared field by field, so only exact duplicates collapse
unique_records = np.unique(data)
print(unique_records)
# [('Alice', 25, 85.5) ('Bob', 30, 92.0) ('Charlie', 25, 88.0)]

# A single field behaves like an ordinary 1D array
unique_ages = np.unique(data['age'])
print(unique_ages) # [25 30]
Edge Cases and Data Types
np.unique() handles various data types including strings, floating-point numbers with NaN, and complex numbers:
# String arrays: ordering is lexicographic, so uppercase sorts before lowercase
strings = np.array(['apple', 'banana', 'apple', 'Cherry', 'banana'])
print(np.unique(strings)) # ['Cherry' 'apple' 'banana']

# Floating point with NaN: NaNs sort to the end (collapsed to a single NaN
# on NumPy >= 1.21)
floats = np.array([1.5, 2.3, np.nan, 1.5, np.nan, 2.3])
print(np.unique(floats)) # [1.5 2.3 nan]

# Complex numbers compare by real part first, then imaginary part
complex_arr = np.array([1+2j, 3+4j, 1+2j, 5+6j])
print(np.unique(complex_arr)) # [1.+2.j 3.+4.j 5.+6.j]

# Boolean arrays: False sorts before True
bools = np.array([True, False, True, True, False])
print(np.unique(bools)) # [False True]
For floating-point comparisons with tolerance, pre-round the data:
# Handle floating point precision issues
measurements = np.array([1.0, 1.0000001, 1.0000002, 2.0, 2.0000001])

# Rounding first maps near-identical readings onto one representative value,
# so exact comparison inside np.unique does the right thing
rounded = np.round(measurements, decimals=5)
unique_measurements = np.unique(rounded)
print(unique_measurements) # [1. 2.]