Deep Lake datasets can be stored in a variety of storage locations by passing the appropriate dataset_path parameter below. We support S3, GCS, Activeloop storage, and are constantly adding to the list.
# Load a Deep Lake Dataset
ds = deeplake.load('dataset_path', creds = {'optional'}, token = 'optional')
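The dataset_path format depends on the storage location; the snippets below are illustrative sketches (the bucket, organization, and dataset names are hypothetical).

# Illustrative dataset_path formats (names are hypothetical)
ds = deeplake.load('hub://my_org/my_dataset')                    # Activeloop storage
ds = deeplake.load('s3://my_bucket/my_dataset', creds = {...})   # S3
ds = deeplake.load('gcs://my_bucket/my_dataset', creds = {...})  # GCS
ds = deeplake.load('./my_local_dataset')                         # Local filesystem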
Creating Deep Lake Datasets
# Create an empty Deep Lake dataset
ds = deeplake.empty('dataset_path', creds = {'optional'}, token = 'optional')

# Create a Deep Lake Dataset with the same tensors as another dataset
ds = deeplake.like(ds_object or 'dataset_path', creds = {'optional'}, token = 'optional')

# Automatically create a Deep Lake Dataset from another data source
ds = deeplake.ingest('source_path', 'deeplake_dataset_path', creds = {'optional'}, token = 'optional')
ds = deeplake.ingest_kaggle('kaggle_path', 'deeplake_dataset_path', creds = {'optional'}, token = 'optional')
# Specifying htype is recommended for maximizing performance.
ds.create_tensor('my_tensor', htype = 'bbox')

# Specifying the correct compression is critical for images, videos, audio and
# other rich data types.
ds.create_tensor('songs', htype = 'audio', sample_compression = 'mp3')
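As a quick usage sketch, samples can then be appended to the tensors created above (the bounding box values and the local file 'song.mp3' are illustrative).

ds.my_tensor.append(np.array([[0, 0, 100, 100]], dtype = np.float32))  # One sample containing a single box
ds.songs.append(deeplake.read('song.mp3'))                             # Audio read from a hypothetical local file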
Creating Tensor Hierarchies
ds.create_group('my_group')
ds.my_group.create_tensor('my_tensor')
ds.create_tensor('my_group/my_tensor') # Automatically creates the group 'my_group'
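A brief sketch of writing to and reading from the nested tensor, using either attribute-style or path-style access (the sample values are illustrative).

ds.my_group.my_tensor.append(np.ones((1,4)))     # Attribute-style access
ds['my_group/my_tensor'].append(np.ones((1,4)))  # Path-style access
np_array = ds.my_group.my_tensor[0].numpy()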
# Append a single sample
ds.my_tensor.append(np.ones((1,4)))
ds.my_tensor.append(deeplake.read('image.jpg'))

# Append multiple samples. The first axis in the
# numpy array is assumed to be the sample axis for the tensor
ds.my_tensor.extend(np.ones((5,1,4)))

# Editing or adding data at a specific index
ds.my_tensor[i] = deeplake.read('image.jpg')
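When appending many samples in a loop, a common pattern is to wrap the writes in a with ds: block so the data is cached and flushed efficiently; a minimal sketch (the file names are hypothetical).

with ds:
    for file_name in ['image_1.jpg', 'image_2.jpg']:   # Hypothetical local files
        ds.my_tensor.append(deeplake.read(file_name))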
Appending Empty Samples or Skipping Samples
# Data appended as None will be returned as an empty array
ds.append({'tensor_1': deeplake.read(...), 'tensor_2': None})
ds.my_tensor.append(None)

# Empty arrays can be explicitly appended if the length of the shape
# of the empty array matches that of the other samples
ds.boxes.append(np.zeros((0,4)))
Accessing Tensor Data
# Read the i-th tensor sample
np_array = ds.my_tensor[i].numpy()
text = ds.my_text_tensor[i].data() # More comprehensive view of the data
bytes = ds.my_tensor[i].tobytes()  # Bytes representation of the data

# Read the i-th dataset sample as a numpy array
image = ds[i].images.numpy()

# Read the i-th labels as a numpy array or list of strings
labels_array = ds.labels[i].numpy()
labels_array = ds.labels[i].data()['value'] # same as .numpy()
labels_string_list = ds.labels[i].data()['text']

# Read a tensor sample from a hierarchical group
np_array = ds.my_group.my_tensor_1[i].numpy()
np_array = ds.my_group.my_tensor_2[i].numpy()

# Read multiple tensor samples into numpy array
np_array = ds.my_tensor[0:10].numpy()

# Read multiple tensor samples into a list of numpy arrays
np_array_list = ds.my_tensor[0:10].numpy(aslist=True)
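A short sketch of reading every sample in a tensor by index, assuming the tensors above are populated.

for i in range(len(ds.my_tensor)):
    np_array = ds.my_tensor[i].numpy()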
# Commit data
commit_id = ds.commit('Added 100 images of trucks')

# Print the commit log
log = ds.log()

# Checkout a branch or commit
ds.checkout('branch_name' or commit_id)

# Create a new branch
ds.checkout('new_branch', create = True)

# Examine differences between commits
ds.diff()
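A minimal end-to-end sketch of the versioning workflow above (the tensor and label value are illustrative).

first_commit = ds.commit('First commit')
ds.labels.append(np.array([1]))
second_commit = ds.commit('Added a label')
ds.checkout(first_commit)   # The dataset now reflects the first commit
ds.checkout('main')         # Return to the head of the default branch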
Adding Tensor and Dataset-Level Metadata
# Add or update dataset metadata
ds.info.update(key1 = 'text', key2 = number)
# Also can run ds.info.update({'key1': 'value1', 'key2': num_value})

# Add or update tensor metadata
ds.my_tensor.info.update(key1 = 'text', key2 = number)

# Delete metadata
ds.info.delete()                  # Delete all metadata
ds.info.delete('key1')            # Delete 1 key in metadata
ds.info.delete(['key1', 'key2'])  # Delete multiple keys in metadata
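Metadata can also be read back with dictionary-style access; a brief sketch assuming the keys above were set.

value = ds.info['key1']
value = ds.my_tensor.info['key1']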
Copying Datasets
# Fastest option - copies everything including version history
ds = deeplake.deepcopy('src_dataset_path', 'dest_dataset_path', src_creds, dest_creds, src_token, dest_token)

# Slower option - copies only data on the last commit
ds = deeplake.copy('src_dataset_path', 'dest_dataset_path', src_creds, dest_creds, src_token, dest_token)
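As a quick follow-up sketch, the copied dataset can be loaded from its destination path to confirm the copy.

ds_copy = deeplake.load('dest_dataset_path', creds = dest_creds, token = dest_token)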
Advanced
# Load a Deep Lake Dataset if it already exists (same as deeplake.load), or initialize
# a new Deep Lake Dataset if it does not already exist (same as deeplake.empty)
ds = deeplake.dataset('dataset_path', creds = {'optional'}, token = 'optional')

# Append multiple samples using a list
ds.my_tensor.extend([np.ones((1,4)), np.ones((3,4)), np.ones((2,4))])

# Fetch adjacent data in the chunk -> Increases speed when loading
# sequentially or if a tensor's data fits in the cache.
numeric_label = ds.labels[i].numpy(fetch_chunks = True)