Benchmarks¶
Speed comparisons of spector's sparse data structures (indices, vector, matrix, groupby) against Python built-ins and pandas.
In [1]:
Copied!
import time
import numpy as np
from spector import indices, matrix, vector
def timed(func, *args):
    """Return the elapsed wall-clock time, in seconds, of calling ``func(*args)``.

    Uses ``time.perf_counter`` rather than ``time.time``: perf_counter is
    monotonic and has the highest available resolution, so short benchmark
    runs are not distorted by system clock adjustments or coarse clock ticks.
    The function's return value is intentionally discarded — only the
    duration matters.
    """
    start = time.perf_counter()
    _ = func(*args)  # noqa
    return time.perf_counter() - start
def dok(size):
    """Build a dense ``size`` x ``size`` dictionary-of-keys matrix of 1.0 entries."""
    coords = ((row, col) for row in range(size) for col in range(size))
    return dict.fromkeys(coords, 1.0)
def vecs(size):
    """Build a ``size`` x ``size`` spector matrix whose rows are identical dense vectors."""
    row_data = np.array(range(size))
    return matrix([(row, vector(row_data)) for row in range(size)])
# Benchmark fixtures: 2**18 consecutive integer keys with unit values.
keys = np.arange(2 ** 18)
values = np.ones(keys.size)
import time
import numpy as np
from spector import indices, matrix, vector
def timed(func, *args):
    """Return the elapsed wall-clock time, in seconds, of calling ``func(*args)``.

    Uses ``time.perf_counter`` rather than ``time.time``: perf_counter is
    monotonic and has the highest available resolution, so short benchmark
    runs are not distorted by system clock adjustments or coarse clock ticks.
    The function's return value is intentionally discarded — only the
    duration matters.
    """
    start = time.perf_counter()
    _ = func(*args)  # noqa
    return time.perf_counter() - start
def dok(size):
    """Build a dense ``size`` x ``size`` dictionary-of-keys matrix of 1.0 entries."""
    coords = ((row, col) for row in range(size) for col in range(size))
    return dict.fromkeys(coords, 1.0)
def vecs(size):
    """Build a ``size`` x ``size`` spector matrix whose rows are identical dense vectors."""
    row_data = np.array(range(size))
    return matrix([(row, vector(row_data)) for row in range(size)])
# Benchmark fixtures: 2**18 consecutive integer keys with unit values.
keys = np.arange(2 ** 18)
values = np.ones(keys.size)
indices vs. set
In [2]:
Copied!
# from array: construction-time ratio, spector indices vs. built-in set (< 1 means indices was faster this run)
timed(indices, keys) / timed(set, keys)
# from array: construction-time ratio, spector indices vs. built-in set (< 1 means indices was faster this run)
timed(indices, keys) / timed(set, keys)
Out[2]:
0.8525398613004564
In [3]:
Copied!
# to array: conversion back to a numpy array, np.array on indices vs. np.fromiter over the raw keys
timed(np.array, indices(keys)) / timed(np.fromiter, keys, keys.dtype, len(keys))
# to array: conversion back to a numpy array, np.array on indices vs. np.fromiter over the raw keys
timed(np.array, indices(keys)) / timed(np.fromiter, keys, keys.dtype, len(keys))
Out[3]:
0.07840436623888167
In [4]:
Copied!
# set op: set difference (__sub__) of two equal collections, spector indices vs. built-in set
timed(indices(keys).__sub__, indices(keys)) / timed(set(keys).__sub__, set(keys))
# set op: set difference (__sub__) of two equal collections, spector indices vs. built-in set
timed(indices(keys).__sub__, indices(keys)) / timed(set(keys).__sub__, set(keys))
Out[4]:
0.16599170994330364
vector vs. dict
In [5]:
Copied!
# from arrays: construction from parallel key/value arrays, spector vector vs. dict(zip(...))
timed(vector, keys, values) / timed(dict, zip(keys, values))
# from arrays: construction from parallel key/value arrays, spector vector vs. dict(zip(...))
timed(vector, keys, values) / timed(dict, zip(keys, values))
Out[5]:
0.2647781825610296
In [6]:
Copied!
# Fixtures reused by the next few cells: equivalent spector vector and plain dict.
vec, d = vector(keys, values), dict(zip(keys, values))
# keys: extracting the keys as a numpy array, vector.keys() vs. np.fromiter over dict keys
timed(vec.keys) / timed(np.fromiter, d.keys(), keys.dtype, len(d))
# Fixtures reused by the next few cells: equivalent spector vector and plain dict.
vec, d = vector(keys, values), dict(zip(keys, values))
# keys: extracting the keys as a numpy array, vector.keys() vs. np.fromiter over dict keys
timed(vec.keys) / timed(np.fromiter, d.keys(), keys.dtype, len(d))
Out[6]:
0.07311205429616445
In [7]:
Copied!
# values: extracting the values as a numpy array, vector.values() vs. np.fromiter over dict values
timed(vec.values) / timed(np.fromiter, d.values(), values.dtype, len(d))
# values: extracting the values as a numpy array, vector.values() vs. np.fromiter over dict values
timed(vec.values) / timed(np.fromiter, d.values(), values.dtype, len(d))
Out[7]:
0.16635607472387617
In [8]:
Copied!
# sum: reducing all values, np.sum over the vector vs. built-in sum over dict values
timed(np.sum, vec) / timed(sum, d.values())
# sum: reducing all values, np.sum over the vector vs. built-in sum over dict values
timed(np.sum, vec) / timed(sum, d.values())
Out[8]:
0.059441563487103256
In [9]:
Copied!
# matmul: dot product of the vector with itself vs. a Python generator summing d[k] * d[k]
timed(vec.__matmul__, vec) / timed(sum, (d[k] * d[k] for k in d))
# matmul: dot product of the vector with itself vs. a Python generator summing d[k] * d[k]
timed(vec.__matmul__, vec) / timed(sum, (d[k] * d[k] for k in d))
Out[9]:
0.02234832679629475
In [10]:
Copied!
import collections
import math
import random
import pandas as pd
from spector import groupby
def measure(size, base=10):
    """Time spector.groupby (hashed and sorted-u8 key dtypes) against pandas
    groupby for a geometrically increasing number of buckets.

    Returns a DataFrame of elapsed seconds, one row per bucket count.
    Note that only consuming each group iterator (via collections.deque with
    maxlen=0) is timed — building the groupby object happens before the clock
    starts.
    """
    bucket_counts = [base ** exp for exp in range(round(math.log(size, base)) + 1)]
    samples = np.array([random.randint(0, size) for _ in range(size)])
    offsets = np.arange(len(samples))
    records = []
    for count in bucket_counts:
        bucket_keys = samples % count
        frame = pd.DataFrame({'keys': bucket_keys, 'values': offsets})
        records.append(
            {
                'hashed': timed(collections.deque, groupby(bucket_keys, offsets), 0),
                'sorted': timed(collections.deque, groupby(bucket_keys.astype('u8'), offsets), 0),
                'pandas': timed(collections.deque, frame.groupby('keys', sort=False)['values'], 0),
            }
        )
    return pd.DataFrame(records, index=bucket_counts)
# Run the benchmark on 10**5 random samples and display raw timings per bucket count.
df = measure(10**5, 10)[['hashed', 'sorted', 'pandas']]
df.index.name = 'buckets'
df
import collections
import math
import random
import pandas as pd
from spector import groupby
def measure(size, base=10):
    """Time spector.groupby (hashed and sorted-u8 key dtypes) against pandas
    groupby for a geometrically increasing number of buckets.

    Returns a DataFrame of elapsed seconds, one row per bucket count.
    Note that only consuming each group iterator (via collections.deque with
    maxlen=0) is timed — building the groupby object happens before the clock
    starts.
    """
    bucket_counts = [base ** exp for exp in range(round(math.log(size, base)) + 1)]
    samples = np.array([random.randint(0, size) for _ in range(size)])
    offsets = np.arange(len(samples))
    records = []
    for count in bucket_counts:
        bucket_keys = samples % count
        frame = pd.DataFrame({'keys': bucket_keys, 'values': offsets})
        records.append(
            {
                'hashed': timed(collections.deque, groupby(bucket_keys, offsets), 0),
                'sorted': timed(collections.deque, groupby(bucket_keys.astype('u8'), offsets), 0),
                'pandas': timed(collections.deque, frame.groupby('keys', sort=False)['values'], 0),
            }
        )
    return pd.DataFrame(records, index=bucket_counts)
# Run the benchmark on 10**5 random samples and display raw timings per bucket count.
df = measure(10**5, 10)[['hashed', 'sorted', 'pandas']]
df.index.name = 'buckets'
df
Out[10]:
hashed | sorted | pandas | |
---|---|---|---|
buckets | |||
1 | 0.002021 | 0.000962 | 0.002286 |
10 | 0.001561 | 0.001216 | 0.002084 |
100 | 0.001829 | 0.002200 | 0.002702 |
1000 | 0.004171 | 0.005865 | 0.011016 |
10000 | 0.023951 | 0.025723 | 0.110423 |
100000 | 0.154255 | 0.130689 | 0.604733 |
In [11]:
Copied!
# Normalize each row by its fastest timing so the best implementation per
# bucket count reads 1.0. DataFrame.div with axis=0 replaces the row-by-row
# Python loop — same result, one vectorized pandas call instead of N.
df = df.div(df.min(axis=1), axis=0)
df
# Normalize each row by its fastest timing so the best implementation per
# bucket count reads 1.0. DataFrame.div with axis=0 replaces the row-by-row
# Python loop — same result, one vectorized pandas call instead of N.
df = df.div(df.min(axis=1), axis=0)
df
Out[11]:
hashed | sorted | pandas | |
---|---|---|---|
buckets | |||
1 | 2.101636 | 1.000000 | 2.376549 |
10 | 1.283026 | 1.000000 | 1.713054 |
100 | 1.000000 | 1.202868 | 1.477705 |
1000 | 1.000000 | 1.405979 | 2.640947 |
10000 | 1.000000 | 1.073982 | 4.610410 |
100000 | 1.180327 | 1.000000 | 4.627278 |