Write Benchmarks for HDF5, Parquet, Feather, CSV

Preamble

In [1]:
import sys
import os
In [2]:
# Restrict this kernel process to a single CPU and print the allowed-CPU set
# before and after the change (presumably to reduce scheduling noise in the
# timings further down -- confirm intent).
# Background:
# https://unix.stackexchange.com/q/23106
# https://stackoverflow.com/q/15639779
# https://docs.python.org/dev/library/os.html#os.sched_setaffinity
pid = os.getpid()
print(sorted(os.sched_getaffinity(pid)))
print(pid)
# Pin to the highest-numbered CPU this process is currently allowed to use
# instead of a hard-coded id: a fixed id makes sched_setaffinity raise OSError
# on hosts where that CPU does not exist or is not permitted.
os.sched_setaffinity(pid, {max(os.sched_getaffinity(pid))})
print(sorted(os.sched_getaffinity(pid)))
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]
30107
[23]
In [3]:
# Add the directory containing the project-local `harmonics` package to the
# import search path. The membership check keeps the cell idempotent, so
# re-running it does not pile up duplicate sys.path entries.
HARMONICS_SRC = '/home/volkerh/Source/EarlyWarn/harmonics'
if HARMONICS_SRC not in sys.path:
    sys.path.append(HARMONICS_SRC)
In [4]:
import harmonics.io
In [5]:
# Location of the CSV input; adjacent literals are joined into a single path.
basedir = (
    '/data/volkerh/earlywarn/harmonics_OLD/snasa_22kV'
    '/rms_2017_nofaults_100samples'
)

# Load everything under `basedir` with the project loader. It returns the
# DataFrame that is benchmarked below plus a second value bound to `fnames`.
# NOTE(review): compute_diffs=True presumably produces the *_DIFF columns --
# confirm against harmonics.io.
df, fnames = harmonics.io.load_from_csv(
    basedir=basedir,
    kind='nofaults',
    nchopoff=0,
    compute_diffs=True,
)
// Loading from /data/volkerh/earlywarn/harmonics_OLD/snasa_22kV/rms_2017_nofaults_100samples
In [6]:
# Summarize the loaded frame (index range, per-column dtypes, memory usage) so
# the benchmark results can be related to the size of the data being written.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17820000 entries, 0 to 17819999
Data columns (total 11 columns):
Time [s]           float64
RMS_V12_AVG        float64
RMS_V1_AVG         float64
RMS_V23_AVG        float64
RMS_V2_AVG         float64
RMS_V31_AVG        float64
RMS_V3_AVG         float64
file_id            uint16
RMS_V1_AVG_DIFF    float64
RMS_V2_AVG_DIFF    float64
RMS_V3_AVG_DIFF    float64
dtypes: float64(10), uint16(1)
memory usage: 1.4 GB

Benchmarks

In [7]:
# Force pending filesystem writes to disk before the timed cells run;
# presumably so that previously buffered data is not charged to the first
# benchmark -- TODO confirm intent.
os.sync()
In [8]:
%%timeit
# Parquet write with compression disabled. os.sync() is part of the timed body,
# so every measurement includes forcing the written data out to disk.
df.to_parquet('/data/volkerh/bench/out.parquet', compression=None)
os.sync()
5.59 s ± 105 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [9]:
%%timeit
# Parquet write with snappy compression. os.sync() is part of the timed body,
# so every measurement includes forcing the written data out to disk.
df.to_parquet('/data/volkerh/bench/out.snappy.parquet', compression='snappy')
os.sync()
5.67 s ± 19.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [10]:
%%timeit
# Feather write. os.sync() is part of the timed body, so every measurement
# includes forcing the written data out to disk.
df.to_feather('/data/volkerh/bench/out.feather')
os.sync()
5.78 s ± 134 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [11]:
%%timeit
df.to_hdf('/data/volkerh/bench/out.pddf.hdf5', 'df', format='table')
os.sync()
12.9 s ± 319 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [12]:
%%timeit
df.to_hdf('/data/volkerh/bench/out.fixed.pddf.hdf5', 'df', format='fixed')
os.sync()
5.61 s ± 21.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [13]:
%%timeit
# CSV write without the index column. os.sync() is part of the timed body, so
# every measurement includes forcing the written data out to disk.
df.to_csv('/data/volkerh/bench/out.csv', index=False)
os.sync()
2min 35s ± 272 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

File Sizes

In [14]:
# List the benchmark output files ordered by size (smallest first). Note that
# `ls -s` reports allocated disk space (human-readable via -h), not the
# apparent file length.
!ls -sh /data/volkerh/bench/out.* | sort -h
200M /data/volkerh/bench/out.snappy.parquet
317M /data/volkerh/bench/out.parquet
1.4G /data/volkerh/bench/out.csv
1.4G /data/volkerh/bench/out.feather
1.5G /data/volkerh/bench/out.fixed.pddf.hdf5
1.5G /data/volkerh/bench/out.pddf.hdf5

Versions

In [25]:
import pyarrow as pa
import pandas as pd
import tables
import h5py
In [26]:
# Record the library versions the timings above were obtained with, one per
# line, in the order: pyarrow, pandas, PyTables, h5py.
for module in (pa, pd, tables, h5py):
    print(module.__version__)
0.13.0
0.24.2
3.5.2
2.9.0
In [ ]: