I have multiple clients that send dicts of Numpy arrays to a ZMQ server. I managed to pack the dicts of Numpy arrays into a multi part message to avoid memcpy's during deserialization, which doubled the throughput.
However, the vast majority of the time is now spent in ZMQ's recv_multipart()
function, which presumably also copies the data from the network interface to RAM. I'm wondering if there are any ways to further remove this second bottleneck?
For example, is the time spent in malloc'ing a new buffer into which the incoming message is then copied? In that case, is there a way to reuse buffers for receiving messages in ZMQ? Or is this just a fundamental limitation of going through TCP that cannot be optimized much further?
Total Samples 30400
GIL: 73.00%, Active: 73.00%, Threads: 1

  %Own   %Total   OwnTime  TotalTime  Function (filename:line)
 70.00%  70.00%   203.7s    203.7s    recv_multipart (zmq/sugar/socket.py:808)
  1.00%   1.00%    3.01s     4.13s    recv_multipart (zmq/sugar/socket.py:807)
  0.00%   0.00%    2.62s     2.62s    <listcomp> (zmq_gbs_dict_seq.py:37)
  0.00%   0.00%    2.49s     2.49s    send (zmq/sugar/socket.py:696)
  0.00%   0.00%    1.32s     1.32s    unpack (zmq_gbs_dict_seq.py:35)
  0.00%   0.00%    0.690s    1.22s    __call__ (enum.py:717)
  0.00%  72.00%    0.520s  209.9s     server (zmq_gbs_dict_seq.py:82)
  1.00%   1.00%    0.500s    0.840s   inner (typing.py:341)
  0.00%   0.00%    0.500s    5.32s    server (zmq_gbs_dict_seq.py:83)
  0.00%   1.00%    0.400s    1.33s    recv_multipart (zmq/sugar/socket.py:812)
  1.00%   1.00%    0.360s    3.07s    send_multipart (zmq/sugar/socket.py:751)
  0.00%   0.00%    0.350s    0.350s   __new__ (enum.py:1106)
  0.00%   0.00%    0.300s    0.300s   __hash__ (typing.py:1352)
  0.00%   0.00%    0.270s    0.270s   <genexpr> (zmq_gbs_dict_seq.py:93)
  0.00%   0.00%    0.260s    0.260s   server (zmq_gbs_dict_seq.py:101)
  0.00%   0.00%    0.250s    0.660s   server (zmq_gbs_dict_seq.py:92)
  0.00%   0.00%    0.250s    3.04s    unpack (zmq_gbs_dict_seq.py:36)
  0.00%   0.00%    0.210s    0.210s   unpack (zmq_gbs_dict_seq.py:38)
  0.00%   0.00%    0.210s    0.210s   server (zmq_gbs_dict_seq.py:91)
  0.00%   0.00%    0.200s    0.200s   unpack (zmq_gbs_dict_seq.py:39)
  0.00%   1.00%    0.200s    4.04s    server (zmq_gbs_dict_seq.py:99)
import multiprocessing
import pickle
import time

import numpy as np
import zmq


def client(port):
    """Connect a DEALER socket and repeatedly send the same packed dict of arrays.

    The dict is packed once and the same frames are re-sent every iteration, so
    the client side does no per-message serialization work.  Runs forever.
    """
    socket = zmq.Context.instance().socket(zmq.DEALER)
    socket.set_hwm(0)  # no high-water mark, so the benchmark is not throttled by queue limits
    socket.connect(f'tcp://localhost:{port}')
    data = {
        'foo': np.zeros((1024, 64, 64, 3), np.uint8),
        'bar': np.zeros((1024, 1024), np.float32),
        'baz': np.zeros((1024,), np.float32),
    }
    parts = pack(data)  # pack once; frames are immutable bytes and can be reused
    while True:
        socket.send_multipart(parts)
        msg = socket.recv()
        assert msg == b'done'
    socket.close()  # NOTE: unreachable — the loop above never exits


def server(port):
    """Bind a ROUTER socket, receive packed dicts, and print throughput about once per second.

    Throughput is reported as thousands of "frames per second" (rows of 'foo')
    and GB/s of raw array payload.
    """
    socket = zmq.Context.instance().socket(zmq.ROUTER)
    socket.set_hwm(0)
    socket.bind(f'tcp://*:{port}')
    time.sleep(3)  # give the spawned clients time to connect before measuring
    print('Start')
    start = time.time()
    steps = 0
    nbytes = 0
    poller = zmq.Poller()
    poller.register(socket, zmq.POLLIN)
    while True:
        if poller.poll():
            # ROUTER prepends the sender identity frame; strip it off.
            addr, *parts = socket.recv_multipart(zmq.NOBLOCK)
            data = unpack(parts)
            steps += data['foo'].shape[0]
            nbytes += sum(v.nbytes for v in data.values())
            socket.send_multipart([addr, b'done'])
        duration = time.time() - start
        if duration > 1:
            fps = steps / duration
            gbs = (nbytes / 1024 / 1024 / 1024) / duration
            print(f'{fps/1e3:.2f}k fps {gbs:.2f} gb/s')
            start = time.time()
            steps = 0
            nbytes = 0
    socket.close()  # NOTE: unreachable — the loop above never exits


def pack(data):
    """Serialize a dict of numpy arrays into a list of multipart frames.

    Frame 0 is pickled metadata (keys, dtype names, shapes); each remaining
    frame is one array's raw bytes, so the receiver can wrap them with
    np.frombuffer without a deserialization copy.  Keys are sorted so the
    frame order is deterministic.

    Returns an empty-metadata message for an empty dict instead of raising
    (zip(*[]) would otherwise blow up with ValueError).
    """
    items = sorted(data.items(), key=lambda x: x[0])
    keys, vals = zip(*items) if items else ((), ())
    dtypes = [v.dtype.name for v in vals]
    shapes = [v.shape for v in vals]
    # tobytes() copies each array once at pack time.  Passing the arrays'
    # buffers directly (zero-copy) is possible with pyzmq, but ties the
    # outgoing frames to the arrays' lifetime.
    buffers = [v.tobytes() for v in vals]
    meta = (keys, dtypes, shapes)
    parts = [pickle.dumps(meta), *buffers]
    return parts


def unpack(parts):
    """Inverse of pack(): rebuild the dict of arrays from multipart frames.

    np.frombuffer creates read-only views over the received frames — no copy.
    NOTE(security): pickle.loads on data received from the network executes
    arbitrary code if a peer is untrusted; acceptable only for this local
    benchmark.
    """
    meta, *buffers = parts
    keys, dtypes, shapes = pickle.loads(meta)
    vals = [
        np.frombuffer(b, d).reshape(s)
        for d, s, b in zip(dtypes, shapes, buffers)
    ]
    data = dict(zip(keys, vals))
    return data


def main():
    """Spawn 32 client processes and one server process, all on port 5555."""
    mp = multiprocessing.get_context('spawn')
    workers = []
    for _ in range(32):
        workers.append(mp.Process(target=client, args=(5555,)))
    workers.append(mp.Process(target=server, args=(5555,)))
    # Plain loops, not list comprehensions: these calls are pure side effects.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()


if __name__ == '__main__':
    main()