Benchmarks¶
Speed comparisons against Python's built-in set and dict and against pandas. Single-value results are ratios of spector's elapsed time to the baseline's, so values below 1.0 mean spector is faster.
In [1]:
import time
import numpy as np
from spector import indices, matrix, vector


def timed(func, *args):
    """Return elapsed time of function call."""
    start = time.time()
    _ = func(*args)  # noqa
    return time.time() - start


def dok(size):
    """Return a dense dict-of-keys matrix of ones."""
    return {(i, j): 1.0 for i in range(size) for j in range(size)}


def vecs(size):
    """Return a spector matrix with `size` rows, each a vector over keys 0..size-1."""
    arr = np.array(range(size))
    return matrix((i, vector(arr)) for i in range(size))


keys = np.array(range(2**18))
values = np.ones(len(keys))
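As a quick orientation, timed returns elapsed seconds for a single call. A minimal sketch; the np.sort workload is illustrative only and not part of the original benchmarks:

# hypothetical check of the timed helper: elapsed seconds for one call
elapsed = timed(np.sort, keys)  # time a single np.sort over the setup array
print(f'np.sort took {elapsed:.6f}s')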
indices vs. set¶
In [2]:
# from array
timed(indices, keys) / timed(set, keys)
Out[2]:
0.6119605002287631
In [3]:
# to array
timed(np.array, indices(keys)) / timed(np.fromiter, keys, keys.dtype, len(keys))
Out[3]:
0.12495496469231877
In [4]:
# set op
timed(indices(keys).__sub__, indices(keys)) / timed(set(keys).__sub__, set(keys))
Out[4]:
0.10616412643591862
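Outside the timings, the same indices operations can be run directly. A minimal sketch using only the calls exercised above (construction from a NumPy array, conversion back, and set difference); the slice size is arbitrary:

# sketch of the indices operations timed above (assumes the setup cell has run)
idx = indices(keys)                # construct an index set from a NumPy array
arr = np.array(idx)                # convert back to a NumPy array
diff = idx - indices(keys[:1000])  # set difference, the __sub__ call timed above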
vector vs. dict¶
In [5]:
# from arrays
timed(vector, keys, values) / timed(dict, zip(keys, values))
Out[5]:
0.3500064561947188
In [6]:
vec, d = vector(keys, values), dict(zip(keys, values))
# keys
timed(vec.keys) / timed(np.fromiter, d.keys(), keys.dtype, len(d))
Out[6]:
0.04848073707127757
In [7]:
# values
timed(vec.values) / timed(np.fromiter, d.values(), values.dtype, len(d))
Out[7]:
0.11978443012925771
In [8]:
# sum
timed(np.sum, vec) / timed(sum, d.values())
Out[8]:
0.056443819731178076
In [9]:
# matmul
timed(vec.__matmul__, vec) / timed(sum, (d[k] * d[k] for k in d))
Out[9]:
0.022048193702504
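The same vector operations can be run outside the timings. A minimal sketch restricted to the calls benchmarked above; the array return types of keys() and values() are inferred from the NumPy baselines they are compared against:

# sketch of the vector operations timed above (assumes the setup cell has run)
vec = vector(keys, values)         # construct from parallel key and value arrays
ks, vs = vec.keys(), vec.values()  # extract keys and values, as timed above
total = np.sum(vec)                # NumPy reduction over the vector
dot = vec @ vec                    # dot product via __matmul__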
In [10]:
import collections
import math
import random
import pandas as pd
from spector import groupby


def measure(size, base=10):
    """Time spector and pandas groupby over an increasing number of buckets."""
    buckets = [base**exp for exp in range(round(math.log(size, base)) + 1)]
    data = np.array([random.randint(0, size) for _ in range(size)])
    rows = []
    values = np.arange(len(data))
    for num in buckets:
        keys = data % num
        df = pd.DataFrame({'keys': keys, 'values': values})
        # deque(..., maxlen=0) discards items, so each timing measures exhausting the iterator
        rows.append(
            {
                'hashed': timed(collections.deque, groupby(keys, values), 0),
                'sorted': timed(collections.deque, groupby(keys.astype('u8'), values), 0),
                'pandas': timed(collections.deque, df.groupby('keys', sort=False)['values'], 0),
            }
        )
    return pd.DataFrame(rows, index=buckets)


df = measure(10**5, 10)[['hashed', 'sorted', 'pandas']]
df.index.name = 'buckets'
df
Out[10]:
buckets | hashed | sorted | pandas
---|---|---|---
1 | 0.002118 | 0.000906 | 0.002442
10 | 0.001679 | 0.001257 | 0.002088
100 | 0.001990 | 0.002352 | 0.003052
1000 | 0.004758 | 0.006618 | 0.014139
10000 | 0.029814 | 0.028990 | 0.125765
100000 | 0.165212 | 0.160292 | 0.784008
In [11]:
# normalize each row by its fastest timing
for i in df.index:
    df.loc[i] = df.loc[i] / df.loc[i].min()
df
Out[11]:
buckets | hashed | sorted | pandas
---|---|---|---
1 | 2.338599 | 1.000000 | 2.696682
10 | 1.335293 | 1.000000 | 1.660914
100 | 1.000000 | 1.182123 | 1.533908
1000 | 1.000000 | 1.390921 | 2.971340
10000 | 1.028440 | 1.000000 | 4.338298
100000 | 1.030694 | 1.000000 | 4.891119
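For completeness, a minimal sketch of how the groupby iterators are consumed in measure above; the 100-bucket choice is arbitrary, and the u8 cast mirrors the benchmark's 'sorted' column:

# sketch of the groupby calls timed above (assumes the earlier cells have run)
bucketed = keys % 100                                          # 100 buckets over the setup keys
collections.deque(groupby(bucketed, values), 0)                # 'hashed': raw signed keys
collections.deque(groupby(bucketed.astype('u8'), values), 0)   # 'sorted': unsigned keys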