redis/modules/vector-sets/examples/glove-100/insert.py
YaacovHazan 41b1b5df18 Add vector-sets module
The vector-sets module is a part of Redis Core and is available by default,
just like any other data type in Redis.

As a result, when building Redis from the source, the vector-sets module
is also compiled as part of the Redis binary and loaded at server start-up.

This new data type, added as a preview, does not yet support
all Redis capabilities, such as:
32-bit OS
C99
Short reads, which might end with a memory leak
AOF rewrite
defrag
2025-04-02 15:06:24 +00:00

55 lines
1.7 KiB
Python

#
# Copyright (c) 2009-Present, Redis Ltd.
# All rights reserved.
#
# Licensed under your choice of the Redis Source Available License 2.0
# (RSALv2) or the Server Side Public License v1 (SSPLv1).
#
import h5py
import redis
from tqdm import tqdm
# Module-level Redis client shared by the insert helper below.
# Targets localhost:6379 with decode_responses enabled and UTF-8 encoding.
redis_client = redis.Redis(
    host='localhost',
    port=6379,
    encoding='utf-8',
    decode_responses=True,
)
def add_to_redis(index, embedding):
    """Add one embedding to the ``glove_embeddings`` key using VADD.

    Args:
        index: Identifier for the vector. It is formatted into a string and
            sent as the element name, since the dataset provides no words.
        embedding: Iterable of numeric vector components.

    The count sent after ``VALUES`` is taken from the embedding itself
    instead of being hard-coded to 100, so it always matches the number of
    components that follow it in the command.
    """
    # Materialize once so the length and the payload come from the same data.
    components = [str(value) for value in embedding]

    args = ["VADD", "glove_embeddings", "VALUES", str(len(components))]
    args.extend(components)
    args.append(f"{index}")  # Using index as identifier since we don't have words
    args.extend(("EF", "200"))
    # Optional switches, left disabled as in the original script:
    # args.append("NOQUANT")
    # args.append("BIN")
    redis_client.execute_command(*args)
def main():
    """Read the training vectors from the HDF5 file and insert them into Redis.

    Opens ``glove-100-angular.hdf5``, walks the ``train`` dataset in slices of
    1000 rows, and calls ``add_to_redis`` for every row, using the row's
    position in the dataset as its identifier. A failure on a single vector
    is reported and the import continues with the next vector.
    """
    batch_size = 1000

    with h5py.File('glove-100-angular.hdf5', 'r') as f:
        # Get the train dataset
        train_vectors = f['train']
        total_vectors = train_vectors.shape[0]

        print(f"Starting to process {total_vectors} vectors...")

        # Process in batches to avoid memory issues
        for i in tqdm(range(0, total_vectors, batch_size)):
            batch_end = min(i + batch_size, total_vectors)
            batch = train_vectors[i:batch_end]

            # start=i makes the loop variable the dataset-wide index, and it
            # is always bound before the except handler formats it.
            for current_index, vector in enumerate(batch, start=i):
                try:
                    add_to_redis(current_index, vector)
                except Exception as e:
                    # Best-effort import: report and move on to the next row.
                    # tqdm.write keeps the message from garbling the bar.
                    tqdm.write(f"Error processing vector {current_index}: {str(e)}")

            # Report the number of rows actually covered so far; batch_end is
            # clamped to total_vectors, unlike i + batch_size.
            if batch_end % 10000 == 0:
                tqdm.write(f"Processed {batch_end} vectors")
# Start the import only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()