A simple in-memory data cache designed for local, non-distributed ML applications. Built using Redis and Apache Arrow's Plasma in-memory object store.
Install using pip:
pip install git+https://github.com/jchacks/data_cache.git
A few Python packages are required:
- PyArrow
- Redis
A running Redis server is also required for the message queue.
from data_cache import PlasmaServer

# Start a Plasma store sized at 100MB.
store_size = 100000000
server = PlasmaServer(store_size)
server.start()
server.wait()
# Once running, the location of the plasma store is printed,
# e.g. '/tmp/plasma-qd3yeugu/plasma.sock'.
# The same location is also added to the Redis store
# so that clients can find it automatically.
import numpy as np

from data_cache import Client

# Ensure the `namespace` is the same everywhere the data needs to be accessed.
# NOTE(review): no namespace is passed here, so this presumably relies on
# Client's default -- confirm against the Client signature.
c = Client()
q = c.make_queue('plasma', None)

# Put some dummy data into the queue: ten float32 arrays filled with 0..9.
for i in range(10):
    r = q.put(np.ones((100000,), dtype='float32') * i)
import numpy as np

from data_cache import Client

c = Client()
q = c.make_queue('plasma', None)  # Use the same name as above

# Fetch data off the queue using q.get()
d = np.stack([q.get() for _ in range(10)])
print(d)
# This will print the queued arrays stacked into a single numpy array,
# one row per item, in the order they were put (values 0 -> 9).
import numpy as np

from data_cache import Client

client = Client()
store = client.get_or_create_store('generic')

# Write an array of float32 ones into the store under the key 'abc'.
store['abc'] = np.ones((100000,), dtype='float32')

# Reading the key back accesses the data without removing it from plasma.
print(store['abc'])