API Basics

Summary of the most important Hub API commands

Please note that the code examples below may not run standalone.

Creating and Loading Hub Datasets

from hub import Dataset
# Load a Hub Dataset if it already exists, or initialize a new
# Hub Dataset if it does not already exist.
ds = Dataset('./local_path') # Local path
ds = Dataset('hub://username/dataset_name') # Activeloop Platform Storage
ds = Dataset('s3://bucket_name/dataset_name', creds = {}) # AWS S3
# Automatically create a Hub Dataset - Coming Soon
ds = Dataset.from_path('source_path', 'hub_dataset_path')
ds = Dataset.from_kaggle('kaggle_path', 'hub_dataset_path')
# Delete a Hub Dataset

Creating Tensors and Adding Data

# Create a tensor
# Specifying htype and dtype is recommended for maximizing performance
ds.create_tensor('my_tensor', htype = 'bbox', dtype = 'int32')
ds.create_tensor('localization/my_tensor', htype = 'bbox', dtype = 'float32')
# Specifying the correct compression is critical for images, videos, and
# other rich data types.
ds.create_tensor('images', htype = 'image', sample_compression = 'jpeg')
# Append a single sample array at the end of a tensor
ds.my_tensor.append(np.ones((1,4))) # Appends an array at the end of a tensor
# Append multiple samples at the end of a tensor. The first axis in the
# numpy array is assumed to be the sample axis for the tensor
# Append multiple samples, passed as a list of numpy arrays, at the end of a tensor.
ds.my_tensor.extend([np.ones((1,4)), np.ones((3,4)), np.ones((2,4))])

Maximizing performance

# Data gets written to long-term storage at the end of the 'with'
# block or whenever the cache is full. This minimizes the number of
# write operations during dataset creation.
with Dataset('dataset_path') as ds:
for i in range(10):

Accessing Tensor Data

# Read tensor sample into numpy array
np_array = ds.my_tensor[0].numpy()
# Read multiple tensor samples into numpy array
# Raises an error if the tensor samples do not all have the same shape
np_array = ds.my_tensor[0:10].numpy()
# Read multiple tensor samples into a list of numpy arrays
np_array_list = ds.my_tensor[0:10].numpy(aslist=True)

Connecting Hub Datasets to ML Frameworks

# PyTorch Dataloader
dataloader = ds.pytorch()
# TensorFlow Dataset
ds_tensorflow = ds.tensorflow()