Read Benchmarks for HDF5, Parquet, Feather, CSV

Preamble

In [1]:
import pyarrow as pa
import pandas as pd
import sys
import os
In [2]:
# Pin this kernel process to a single CPU (presumably to reduce timing noise
# from the scheduler moving the process between cores during the benchmarks).
# os.sched_getaffinity / os.sched_setaffinity are only available on some Unix
# platforms (e.g. Linux).
# https://unix.stackexchange.com/q/23106
# https://stackoverflow.com/q/15639779
# https://docs.python.org/dev/library/os.html#os.sched_setaffinity
pid = os.getpid()
allowed_cpus = sorted(os.sched_getaffinity(pid))
print(allowed_cpus)
print(pid)
# Use the highest-numbered CPU this process is currently allowed to run on
# instead of a hard-coded id: requesting {23} raises OSError on any host (or
# cpuset) that does not expose CPU 23. With CPUs 0-23 available this still
# selects CPU 23, and re-running the cell keeps the same CPU.
os.sched_setaffinity(pid, {allowed_cpus[-1]})
print(list(os.sched_getaffinity(pid)))
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]
30856
[23]

Benchmarks

In [3]:
# Force pending filesystem writes out to disk before the first benchmark runs
# (presumably so leftover write-back does not overlap with the timed reads).
os.sync()
In [4]:
%%timeit
# Time reading the Parquet file into a DataFrame.
# Presumably the uncompressed variant, going by the file name — TODO confirm.
df = pd.read_parquet('/data/volkerh/bench/out.parquet')
# NOTE: this sync is part of the timed cell body, so its cost is included in
# the reported time.
os.sync()
2.68 s ± 17.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [5]:
%%timeit
# Time reading the Parquet file into a DataFrame.
# Presumably the Snappy-compressed variant, going by the file name — TODO confirm.
df = pd.read_parquet('/data/volkerh/bench/out.snappy.parquet')
# NOTE: this sync is part of the timed cell body, so its cost is included in
# the reported time.
os.sync()
2.97 s ± 21.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [6]:
%%timeit
# Time reading the Feather file into a DataFrame.
df = pd.read_feather('/data/volkerh/bench/out.feather')
# NOTE: this sync is part of the timed cell body, so its cost is included in
# the reported time.
os.sync()
931 ms ± 11 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [7]:
%%timeit
# Time reading the pandas-written HDF5 file into a DataFrame.
# No `key` is passed; pandas accepts that only when the file holds a single
# pandas object. Presumably stored in "table" format, in contrast to the
# "fixed" file benchmarked separately — TODO confirm.
df = pd.read_hdf('/data/volkerh/bench/out.pddf.hdf5')
# NOTE: this sync is part of the timed cell body, so its cost is included in
# the reported time.
os.sync()
4.67 s ± 91.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [8]:
%%timeit
# Time reading the pandas-written HDF5 file into a DataFrame.
# No `key` is passed; pandas accepts that only when the file holds a single
# pandas object. Presumably stored in "fixed" format, going by the file
# name — TODO confirm.
df = pd.read_hdf('/data/volkerh/bench/out.fixed.pddf.hdf5')
# NOTE: this sync is part of the timed cell body, so its cost is included in
# the reported time.
os.sync()
813 ms ± 1.39 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [9]:
%%timeit
# Time reading the CSV file into a DataFrame using read_csv's default
# settings (no dtype, engine or other parser arguments are given).
df = pd.read_csv('/data/volkerh/bench/out.csv')
# NOTE: this sync is part of the timed cell body, so its cost is included in
# the reported time.
os.sync()
17 s ± 42.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Versions

In [10]:
import pyarrow as pa
import pandas as pd
import tables
import h5py
In [11]:
# Report the version of each library involved in the benchmarks, one per
# line, in the order: pyarrow, pandas, PyTables (tables), h5py.
for lib in (pa, pd, tables, h5py):
    print(lib.__version__)
0.13.0
0.24.2
3.5.2
2.9.0
In [ ]: