The following assembles the data and produces the charts presented in:
Beyond Macrobenchmarks: Microbenchmark-based Graph Database Evaluation, by Lissandrini, Matteo; Brugnara, Martin; and Velegrakis, Yannis. In PVLDB, 12(4):390-403, 2018.
This notebook can process experiments incrementally, so you can use it while experiments are still running; replaying the notebook will produce updated charts.
Note: in the following we assume that a missing experiment means the system reached the time-out.
# Starting fresh each time.
import os
if os.path.exists('results.db'):
os.remove('results.db')
%matplotlib inline
import os
import re
import csv
import sqlite3
import matplotlib.pyplot as plt
from os import path
from collections import OrderedDict
from IPython.core.display import display, HTML
def get_default_db():
return DbSession('results.db')
class DbSession(object):
def __init__(self, conn_str):
self.conn = sqlite3.connect(conn_str)
self.cursor = self.conn.cursor()
def __enter__(self):
return self.cursor
def __exit__(self, t, value, traceback):
if t is not None:
log(traceback)
self.cursor.close()
self.conn.commit()
self.conn.close()
def log(msg):
import datetime
print('%s - %s' % (datetime.datetime.now().isoformat(), msg))
def gby(lst, key=0):
""" Group by that respect keys order.
"""
from collections import OrderedDict
res = OrderedDict()
for item in lst:
k = item[key]
if k in res:
res[k].append(item)
else:
res[k] = [item]
return res
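As a quick illustration, gby() groups tuples by the value at the given index while preserving the order in which keys first appear; the tuples below are made up for the example:
# Minimal usage sketch for gby(); values are illustrative only.
rows = [('neo4j', 'yeast', 12), ('arangodb', 'yeast', 7), ('neo4j', 'ldbc', 30)]
grouped = gby(rows)  # group by the first field (key=0)
# grouped == OrderedDict([
#     ('neo4j', [('neo4j', 'yeast', 12), ('neo4j', 'ldbc', 30)]),
#     ('arangodb', [('arangodb', 'yeast', 7)]),
# ])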
Let's first create and initialize the database, then load the data from the *.csv files.
tables_list = ['results', 'queries', 'datasets', 'images', 'dborder']
with get_default_db() as c:
# Reset database
stmt = 'DROP TABLE IF EXISTS '
for tbl in tables_list:
c.execute(stmt + tbl)
# Setup database
create_tables = [
'''
-- Only common fields
CREATE TABLE IF NOT EXISTS results (
-- _rowid_ (https://www.sqlite.org/lang_createtable.html#rowid)
date INTEGER NOT NULL,
dbengine TEXT NOT NULL,
dataset TEXT NOT NULL, -- please store just name, not path
query TEXT NOT NULL,
sid INTEGER,
int_order INTEGER,
exec_time INTEGER NOT NULL
)
''',
'''
CREATE TABLE IF NOT EXISTS queries (
-- _rowid_ (https://www.sqlite.org/lang_createtable.html#rowid)
query TEXT,
cnt INTEGER, -- expected number of queries
cls TEXT,
qord int,
paper_id TEXT
);
''',
'''
CREATE TABLE IF NOT EXISTS datasets (
-- _rowid_ (https://www.sqlite.org/lang_createtable.html#rowid)
dataset TEXT,
json2 REAL,
dord INT
);
''',
'''
-- Images size on disk
CREATE TABLE IF NOT EXISTS images (
-- _rowid_ (https://www.sqlite.org/lang_createtable.html#rowid)
dbengine TEXT,
dataset TEXT, -- Special value 'empty'
space REAL
);
''',
'''
-- Defined ordering
CREATE TABLE IF NOT EXISTS dborder (
-- _rowid_ (https://www.sqlite.org/lang_createtable.html#rowid)
dbengine TEXT,
dbalias TEXT,
ord int
);
''',
]
assert(len(tables_list) == len(create_tables))
with get_default_db() as c:
for tbl in create_tables:
c.execute(tbl)
log('Database initialized')
While the following metadata could be derived from the queries themselves, hardcoding it simplifies the presentation.
See, for example, how we deal with the BFS length parameters in the insert_result() function.
Comment out any of the following tuples to exclude the corresponding queries from the analysis. Please remember the implicit assumption that a missing result entry is treated as a timeout.
"""
Meta array format:
(QueryClass, [
('query file name wo .rb', <expected # of results for run>, '<id_on_paper>')
])
"""
meta = [
('L', [('loader', 1, '1')]),
('C', [
('insert-node', 10, '2'),
('insert-edge', 10, '3'),
('insert-edge-with-property', 10, '4'),
('insert-node-property', 10, '5'),
('insert-edge-property', 10, '6'),
('insert-node-with-edges', 10, '7'),
]),
('R', [
('count-nodes', 1, '8'),
('count-edges', 1, '9'),
('find-unique-labels', 1, '10'),
('node-property-search', 10, '11'),
('edge-specific-property-search', 10, '12'),
('label-search', 10, '13'),
('id-search-node', 10, '14'),
('id-search-edge', 10, '15'),
]),
('U', [
('update-node-property', 10, '16'),
('update-edge-property', 10, '17'),
]),
('D', [
('delete-nodes', 10, '18'),
('delete-edges', 10, '19'),
('delete-node-property', 10, '20'),
('delete-edge-property', 10, '21'),
]),
('T', [
('NN-incoming', 10, '22'),
('NN-outgoing', 10, '23'),
('NN-both-filtered', 10, '24'),
('NN-incoming-unique-label', 10, '25'),
('NN-outgoing-unique-label', 10, '26'),
('NN-both-unique-label', 10, '27'),
('k-degree-in', 1, '28'),
('k-degree-out', 1, '29'),
('k-degree-both', 1, '30'),
('find-non-root-nodes', 1, '31'),
# BFS, and BFS-labelled would go here
('shortest-path', 10, '34'),
('shortest-path-labelled', 10, '35'),
]),
('B', [
# 32
('BFS2', 10, ''),
('BFS3', 10, ''),
('BFS4', 10, ''),
('BFS5', 10, ''),
# 33
('BFS-labelled2', 10, ''),
('BFS-labelled3', 10, ''),
('BFS-labelled4', 10, ''),
('BFS-labelled5', 10, ''),
]),
('I', [
# ("X-create-index", 1, ''),
("X-insert-node", 10, '2'),
("X-insert-node-property", 10, '5'),
("X-node-property-search", 10, '11'),
("X-update-node-property", 10, '16'),
("X-delete-nodes", 10, '18'),
("X-delete-node-property", 10, '20'),
]),
('S', [
# ("ldbc-create-index", 1, ''),
("ldbc-macrobench-max-iid", 10, ''),
("ldbc-macrobench-max-oid", 10, ''),
("ldbc-macrobench-create", 10, ''),
("ldbc-macrobench-city", 10, ''),
("ldbc-macrobench-company", 10, ''),
("ldbc-macrobench-university", 10, ''),
("ldbc-macrobench-friend1", 10, ''),
("ldbc-macrobench-friend2", 10, ''),
("ldbc-macrobench-friend-tags", 10, ''),
("ldbc-macrobench-add-tags", 10, ''),
("ldbc-macrobench-friend-of-friend", 10, ''),
("ldbc-macrobench-triangle-closure", 10, ''),
("ldbc-macrobench-places", 10, ''),
]),
]
# Add queries in order
insert = 'INSERT INTO queries VALUES (?, ?, ?, ?, ?);'
with get_default_db() as c:
qord = 0
for (cls, queries) in meta:
for (q, cnt, paper_id) in queries:
c.execute(insert, [q, cnt, cls, qord, paper_id])
qord += 1
log('Queries meta have been loaded')
# Databases
# NOTE: index-capable systems first, then the others.
dborder = [
("neo4j", "Neo4j 1.9"),
("neo4j-tp3", "Neo4j 3.0"),
("orientdb", "OrientDB"),
("titan", "Titan 0.5"),
("titan-tp3", "Titan 1.0"),
("sparksee", "Sparksee"),
("arangodb", "ArangoDB"),
("pg", "Sqlg"),
("blazegraph", "Blazegraph"),
]
with get_default_db() as c:
for (i,d) in enumerate(dborder):
c.execute('INSERT INTO dborder VALUES (?,?,?)', (*d, i))
log('Fixture loadings completed')
q = 'INSERT INTO datasets VALUES (?,?,?)'
with get_default_db() as c:
with open('datasets.tsv') as f:
i = 0
for l in f:
size, ds = l.split()
c.execute(q, [ds, float(size)/1024, i])
i += 1
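For reference, the loader above assumes each line of datasets.tsv has the form '<size> <dataset name>', with the size presumably in KB and converted to MB (matching the MB axis of the disk-space chart below); a hypothetical line and its conversion:
# Hypothetical datasets.tsv line; the values are illustrative only.
line = '1468 yeast'
size, ds = line.split()
print(ds, float(size) / 1024)  # -> yeast 1.43359375 (stored in the json2 column)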
q = 'INSERT INTO images VALUES (?,?,?)'
with get_default_db() as c:
with open('images.csv') as f:
for l in f:
name, size = l.split(',')
if '_' in name:
db, ds = name.strip()[8:-6].split('_',1)
else:
db, ds = name.strip()[8:], 'empty'
size = size.strip()
size = float(size[:-2]) * (1024 if size[-2] == 'G' else 1)
c.execute(q, [db, ds, size])
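To make the string slicing above less cryptic, here is a sanity check on a hypothetical images.csv line; the actual naming convention comes from collect.sh, so the 'gremlin-' prefix and the 6-character extension assumed here are only illustrative:
# Hypothetical images.csv entry: 'gremlin-<db>_<dataset>.image,<size><unit>'
name, size = 'gremlin-neo4j_yeast.image', '1.25GB'
db, ds = name.strip()[8:-6].split('_', 1)                 # -> ('neo4j', 'yeast')
size = size.strip()
mb = float(size[:-2]) * (1024 if size[-2] == 'G' else 1)  # -> 1280.0 (MB)
print(db, ds, mb)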
The following import supports results of experiments carried out at different times or on different machines. It is up to the researcher to guarantee the consistency of the external environment: e.g., when using multiple machines, they must be identical in hardware, software, and configuration.
See collect.sh for more details about results harvesting.
# Path to results directory
RES_DIR='/results/'
# Get the list of the experiment results available on disk.
exps = set(fn.split('_',1)[0]
for fn in os.listdir(RES_DIR)
if os.path.isfile(path.join(RES_DIR, fn))
and fn[0] != '.')
log('Importing: {}'.format(exps))
# Get list of experiments executed with indexes.
indexed_exps = []
if os.path.exists('indexed.csv'):
with open('indexed.csv') as f:
indexed_exps = set(t.strip() for t in f)
log('Experiments with indexes: {}'.format(indexed_exps))
# Support function
def insert_result(c, timestamp, row, indexed_exps):
insert_res = 'INSERT INTO results VALUES (?, ?, ?, ?, ?, ?, ?);'
# NOTE: we use replace instead of substr so it works also for LDBC.
query = row[2].split('/')[-1].replace('.groovy', '').replace('bulk', '')
if query.startswith('BFS'):
query += row[-1]
exec_time = int(row[6])
if timestamp in indexed_exps:
query = 'X-' + query
if query.startswith('loader'):
exec_time *= 1000000.0
param = [
timestamp,
row[0][len('gremlin-'):],
row[1].split('/')[-1].replace('_hashed', '').replace('_noslash.json', '').rstrip('2')[:-len('.json')],
query,
row[3] or '-1', # sid
row[5], # int_order
exec_time,
]
c.execute(insert_res, param)
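To make the expected field layout explicit, here is a small illustration on a hypothetical raw row; the real rows are produced by collect.sh, and the positions are those read by insert_result() above:
# Hypothetical raw CSV row: engine, dataset path, query file, sid, ?, int_order, exec_time.
demo_row = ['gremlin-neo4j', '/data/yeast.json', '/queries/insert-node.groovy',
            '3', '', '0', '42000']
print(demo_row[0][len('gremlin-'):])                      # dbengine -> neo4j
print(demo_row[1].split('/')[-1][:-len('.json')])         # dataset  -> yeast
print(demo_row[2].split('/')[-1].replace('.groovy', ''))  # query    -> insert-node
# For BFS queries the traversal depth (last field) is appended to the query name;
# for experiments listed in indexed.csv the query gets the 'X-' prefix.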
with get_default_db() as c:
for res in [fn for fn in os.listdir(RES_DIR)
if os.path.isfile(path.join(RES_DIR, fn))
and fn[0] != '.' and fn.split('_',2)[0] in exps]:
with open(path.join(RES_DIR, res), 'r') as csvfile:
if not res[-len('results.csv'):] == 'results.csv':
continue
reader = (row for row in csv.reader(csvfile) if row[0].startswith('gremlin-'))
for row in reader:
insert_result(c, res.split('_',2)[0], row, indexed_exps)
log('Results import process completed')
Define several views used to make the analytics SQL behind the charts less obscure.
views_names = [
'one_per_row', 'single', 'bulk', 'dbs',
'full_queries_set', 'single_means_wtimeout',
'bulk_sum_wtimeout', 'health', 'health_bulk',
]
views = [
# Results normalization:
# sum the queries that report their exec_time split across multiple records.
# NOTE, here we make the assumption (even if not 100% valid)
# that all "pieces" are in the same result file ('date'),
# and that a query is not run more than once
# in the same experiment ('date').
# NOTE, "-tp3" results may have been collected both in `$DB` and `${DB}-tp3`
# results. Thus use DISTINCT.
'''
CREATE VIEW one_per_row AS
SELECT dbengine,dataset,query,sid,int_order,
CASE
WHEN COUNT(*) >= 5 THEN
(SUM(exec_time) - MIN(exec_time) - MAX(exec_time)) / (COUNT(*)-2)
ELSE
AVG(exec_time)
END AS exec_time
FROM (
-- already ok
SELECT DISTINCT * FROM results
WHERE
(query <> 'insert-node-with-edges') AND
(dbengine <> "arangodb" OR query <> 'loader')
UNION
-- insert-node-with-edges.groovy & arango loader
SELECT date,dbengine,dataset,query,sid,int_order,SUM(exec_time)
FROM results
WHERE
query = 'insert-node-with-edges' OR
(dbengine = "arangodb" AND query = 'loader')
GROUP BY date,dbengine,dataset,query,sid,int_order
UNION
SELECT date,dbengine,dataset,'ldbc-create-user',sid,int_order,SUM(exec_time)
FROM results
WHERE query LIKE 'ldbc-create-user%'
GROUP BY date,dbengine,dataset,sid,int_order
)
GROUP BY dbengine,dataset,query,sid,int_order
''',
# Runs that are NOT in bulk mode
'''
CREATE VIEW single AS
SELECT DISTINCT * FROM one_per_row WHERE int_order = 0 OR int_order = ''
''',
# ONLY runs that are in bulk mode
'''
CREATE VIEW bulk AS
SELECT DISTINCT * FROM one_per_row WHERE NOT (int_order = 0 OR int_order = '')
''',
# dbengines view (based on images size)
'''
CREATE VIEW dbs AS
SELECT DISTINCT dbengine
FROM images
''',
# Expected query set (without int_order)
'''
CREATE VIEW full_queries_set AS
SELECT dbengine,dataset,query,cnt
FROM dbs, queries, datasets
''',
# single_means_wtimeout
# Assumption: if COUNT() < expected --> timedout
# SUM(query) + timeout (:= 2h) * (Expected query count - COUNT(query))
'''
CREATE VIEW single_means_wtimeout AS
SELECT dbengine,dataset,q.query,
(SUM(exec_time/1000000) + (7200000 * max(0,q.cnt - SUM(done))))/q.cnt AS exec_time,
q.cnt, SUM(done) AS done
FROM (
SELECT dbengine,dataset,query,exec_time,1 AS done
FROM single
UNION
SELECT dbengine,dataset,query,0,0
FROM full_queries_set
) AS r
NATURAL JOIN queries AS q
GROUP BY dbengine,dataset,query
''',
# bulk_sum_wtimeout
# Assumption: if COUNT() < expected --> timedout
'''
CREATE VIEW bulk_sum_wtimeout AS
SELECT dbengine,dataset,r.query,
CASE
WHEN SUM(done) <> q.cnt THEN 7200000
ELSE SUM(exec_time) / 1000000
END AS exec_time,
q.cnt, SUM(done) AS done
FROM (
SELECT dbengine,dataset,query,exec_time,1 AS done
FROM bulk
UNION
SELECT dbengine,dataset,query,0,0
FROM full_queries_set
WHERE cnt > 1
) AS r
NATURAL JOIN queries AS q
GROUP BY dbengine,dataset,query
''',
# health
# Checks how many queries have been completed and how many timed out.
'''
CREATE VIEW health AS
SELECT dbengine, dataset, query,
-- NOTE: count on any "single" field, but not on *.
-- Remember that we are using a left natural join.
-- Assumption: NOT NULL
COUNT(single.query) AS cnt,
-- NOTE: assumption, SID is always present (NOT NULL, DEFAULT '-1')
COUNT(DISTINCT sid) AS sid_cnt,
cnt AS expected_cnt,
CASE
WHEN COUNT(DISTINCT sid) < cnt
THEN "missing"
WHEN COUNT(DISTINCT sid) > cnt
THEN "wat!"
WHEN COUNT(DISTINCT sid) <> COUNT(*)
THEN "sid inconsistence"
ELSE "ok"
END AS status
FROM full_queries_set
LEFT NATURAL JOIN single
GROUP BY dbengine, dataset, query
''',
# health_bulk
# Checks how many queries have been completed and how many timed out.
'''
CREATE VIEW health_bulk AS
SELECT dbengine, dataset, query,
COUNT(bulk.query) AS cnt,
COUNT(DISTINCT int_order) AS sid_cnt,
cnt AS expected_cnt,
CASE
WHEN COUNT(DISTINCT int_order) < cnt
THEN "missing"
WHEN COUNT(DISTINCT int_order) > cnt
THEN "wat!"
WHEN COUNT(DISTINCT int_order) <> COUNT(*)
THEN "sid inconsistence"
ELSE "ok"
END AS status
FROM full_queries_set
LEFT NATURAL JOIN bulk
-- queries with cnt == 1 do not have a bulk version
WHERE cnt > 1
GROUP BY dbengine, dataset, query
''',
]
assert len(views_names) == len(views)
with get_default_db() as c:
stmt = 'DROP VIEW IF EXISTS %s'
for v in views_names:
c.execute(stmt % v)
log("All views have been dropped")
for v in views:
c.execute(v)
log("All views have been recreated")
Below are all the plots of the experiments comparing running times. For all the following charts, lower values are better.
import numpy as np
# Style from paper.
styles = [
{'color':'blue', 'linestyle':':', 'marker': 'x'},
{'color':'cyan', 'linestyle':':', 'marker': 'o'},
{'color':'gold', 'linestyle':'-', 'marker': 'D'},
{'color':'olive', 'linestyle':'-', 'marker': 'x'},
{'color':'darkgreen', 'linestyle':':', 'marker': '+'},
{'color':'black', 'linestyle':'-.', 'marker': '*'},
{'color':'red', 'linestyle':'--', 'marker': '^'},
{'color':'grey', 'linestyle':':', 'marker': 'v'},
{'color':'green', 'linestyle':'-', 'marker': 's'},
]
markers = {'markersize': 8, 'fillstyle': 'none'}
# Make it readable
plt.rcParams.update({'figure.figsize': (20,10), 'font.size': 20})
def ds_short(ds_name):
if ds_name[:len('freebase')] == 'freebase':
return 'FBR-' + ds_name.split('_')[1][0].upper()
return ' '.join(ds_name.upper().split())
re_qpff = re.compile('[^a-zA-Z0-9]')
def q_prettify(q):
return re_qpff.sub(' ', q)
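A quick illustration of the two label helpers (the freebase dataset name below is hypothetical):
assert ds_short('freebase_small') == 'FBR-S'       # freebase variants are shortened to FBR-<initial>
assert ds_short('ldbc') == 'LDBC'                  # other datasets are simply upper-cased
assert q_prettify('insert-node') == 'insert node'  # non-alphanumeric characters become spaces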
def get_queries(not_cls=list()):
""" Queries with at least one experiment,
excluding the classes listed in not_cls.
"""
with get_default_db() as c:
q = '''
SELECT DISTINCT cls, s.query, paper_id || ' ' || s.query
FROM single s
JOIN queries as q ON q.query = s.query AND cls NOT IN ({})
ORDER BY qord
'''
qf = q.format(','.join(map(lambda x: "'" + x + "'", not_cls)))
queries = [(r[0], r[1]) for r in c.execute(qf)]
queries_labels = [q_prettify(r[2].strip()) for r in c.execute(qf)]
in_queries = ','.join(map(lambda x: "'" + x[1] + "'", queries))
return queries, queries_labels, in_queries
# Datasets with at least one experiment
with get_default_db() as c:
q= '''
SELECT DISTINCT dataset
FROM single
NATURAL JOIN datasets
ORDER BY dord
'''
datasets = [r[0] for r in c.execute(q)]
in_datasets = ','.join(map(lambda x: "'" + x + "'", datasets))
The plots show the mean execution time for each query; timeouts are counted as the timeout itself, i.e., 2h.
# Plot all queries except those with specialized plots
q = '''
SELECT dbalias, dataset, ROUND(exec_time, 2)
FROM single_means_wtimeout as t
JOIN queries as q ON q.query = t.query
NATURAL JOIN dborder
NATURAL JOIN datasets AS ds
WHERE q.query='{}' AND dataset IN ({})
-- Exclude classes with specialized plots
AND q.cls NOT IN ('B', 'S', 'I')
ORDER BY ord, dord
'''
queries, queries_labels, in_queries = get_queries(['B', 'S', 'I'])
xlabels = list(map(ds_short, datasets))
current_cls = ''
with get_default_db() as c:
for i, x in enumerate(queries):
cls, query = x
if cls != current_cls:
current_cls = cls
display(HTML('<h1 id="current_cls">{}</h1>'.format(current_cls)))
plt.title('#'+queries_labels[i].upper())
plt.xlabel('Dataset')
plt.ylabel('Time (ms)')
plt.yscale('log')
plt.grid(True)
rs = [r for r in c.execute(q.format(query, in_datasets))]
for i, r in enumerate(gby(rs).items()):
db, items = r
ts = [x[2] for x in items]
plt.plot(ts, label=db, **styles[i%len(styles)], **markers)
plt.xticks(np.arange(len(xlabels)), xlabels, rotation=90)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
q = '''
SELECT dbalias, t.query, ROUND(exec_time, 2)
FROM single_means_wtimeout as t
JOIN queries as q ON q.query = t.query
NATURAL JOIN dborder
WHERE
-- select class and proper datasets
q.query LIKE 'BFS-labelled%' AND dataset = 'ldbc'
ORDER BY ord, qord
'''
plt.title('#33 BFS-labelled on LDBC')
plt.xlabel('Depth')
plt.ylabel('Time (ms)')
plt.yscale('log')
plt.grid(True)
with get_default_db() as c:
rs = list(c.execute(q))
llabels = None
for i, r in enumerate(gby(rs).items()):
db, items = r
ts = [x[2] for x in items]
if not llabels:
llabels = ['DEPTH ' + x[1][len('BFS-labelled'):] for x in items]
plt.plot(ts, label=db, **styles[i%len(styles)], **markers)
plt.xticks(np.arange(len(llabels)), llabels, rotation=90)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
The queries for the Macrobenchmark are executed all together by a single query file on top of an image with ad-hoc indexes. The experiment prints the execution time of the different sub-queries, which are represented as checkpoints.
q = '''
SELECT dbalias, t.query, ROUND(exec_time, 2)
FROM single_means_wtimeout as t
JOIN queries as q ON q.query = t.query
NATURAL JOIN dborder
WHERE
-- select class and proper datasets
q.cls = 'S' AND dataset = 'ldbc' AND
-- filter out non index-capable systems
dbengine <> 'blazegraph'
ORDER BY ord, qord
'''
plt.title('LDBC Macrobench')
plt.xlabel('Checkpoints')
plt.ylabel('Time (ms)')
plt.yscale('log')
plt.grid(True)
with get_default_db() as c:
rs = list(c.execute(q))
llabels = None
for i, r in enumerate(gby(rs).items()):
db, items = r
ts = [x[2] for x in items]
if not llabels:
llabels = [x[1][len('ldbc-macrobench-'):] for x in items]
plt.plot(ts, label=db, **styles[i%len(styles)], **markers)
plt.xticks(np.arange(len(llabels)), llabels, rotation=90)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
For each system and dataset we sum the running times over all queries. We separate the running times of queries executed in isolation from those executed in batch mode.
Note: while all queries can be run in isolation, not all are run in batch mode, only those based on different seeds.
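The queries that have a batch version are exactly those with an expected count greater than one, the same condition used by the bulk_sum_wtimeout view; a quick way to list them:
# Queries that also run in batch mode (cnt > 1).
with get_default_db() as c:
    batch_queries = [r[0] for r in c.execute('SELECT query FROM queries WHERE cnt > 1 ORDER BY qord')]
log('{} queries have a batch version'.format(len(batch_queries)))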
# SINGLE
# In the paper they include: C, R, U, D, T, B
# --> no L, I, S
q = '''
SELECT dbalias, dataset, ROUND(SUM(exec_time) / 1000,2)
FROM (
SELECT dbengine, dataset, query, (exec_time*cnt) as exec_time
FROM single_means_wtimeout
NATURAL JOIN queries as q
WHERE dataset IN ({}) AND
cls NOT IN ('L', 'I', 'S') AND
query IN ({})
) as v
NATURAL JOIN dborder
NATURAL JOIN queries as q
NATURAL JOIN datasets as dds
GROUP BY v.dataset, dbengine
ORDER BY ord, dord
'''
# Avoid the chart being dominated by timeouts on partial runs.
queries, queries_labels, in_queries = get_queries(['L', 'I', 'S'])
plt.title('All queries'.upper())
plt.xlabel('Dataset')
plt.ylabel('Time (s)')
plt.yscale('log')
plt.grid(True)
with get_default_db() as c:
rs = [r for r in c.execute(q.format(in_datasets, in_queries))]
for i, r in enumerate(gby(rs).items()):
db, items = r
ts = [x[2] for x in items]
plt.plot(ts, label=db, **styles[i%len(styles)], **markers)
plt.xticks(np.arange(len(xlabels)), xlabels, rotation=90)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
# BATCH
# In the paper they include: C, R, U, D, T
# --> no B, I, L
q = '''
SELECT dbalias, dataset, ROUND(SUM(exec_time) / 1000,2)
FROM (
SELECT dbengine,dataset,query, exec_time
FROM bulk_sum_wtimeout
NATURAL JOIN queries as q
WHERE dataset IN ({}) AND
cls NOT IN ('B', 'I', 'L') AND
query IN ({})
) as v
NATURAL JOIN dborder
NATURAL JOIN queries as q
NATURAL JOIN datasets as dds
GROUP BY v.dataset, dbengine
ORDER BY ord, dord
'''
plt.title('Only Batch Queries'.upper())
plt.xlabel('Dataset')
plt.ylabel('Time (s)')
plt.yscale('log')
plt.grid(True)
with get_default_db() as c:
rs = [r for r in c.execute(q.format(in_datasets, in_queries))]
for i, r in enumerate(gby(rs).items()):
db, items = r
ts = [x[2] for x in items]
plt.plot(ts, label=db, **styles[i%len(styles)], **markers)
plt.xticks(np.arange(len(xlabels)), xlabels, rotation=90)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
q = '''
SELECT dbalias, dataset, ROUND(exec_time, 2)
FROM single_means_wtimeout as t
JOIN queries as q ON q.query = t.query
NATURAL JOIN dborder
NATURAL JOIN datasets AS ds
WHERE q.query='{}' AND dataset IN ({})
AND q.cls IN ('I') AND dbengine <> 'blazegraph'
ORDER BY ord, dord
'''
queries, queries_labels, in_queries = get_queries(['L', 'C', 'R', 'U', 'D', 'T', 'B', 'S'])
xlabels = list(map(ds_short, datasets))
current_cls = ''
with get_default_db() as c:
for i, x in enumerate(queries):
cls, query = x
if cls != current_cls:
current_cls = cls
display(HTML('<h1 id="current_cls">{}</h1>'.format(current_cls)))
n, _, qname = queries_labels[i].upper().split(' ', 2)
plt.title('#' + n + ' ' + qname + ' [indexed]')
plt.xlabel('Dataset')
plt.ylabel('Time (ms)')
plt.yscale('log')
plt.grid(True)
rs = [r for r in c.execute(q.format(query, in_datasets))]
for i, r in enumerate(gby(rs).items()):
db, items = r
ts = [x[2] for x in items]
plt.plot(ts, label=db, **styles[i%len(styles)], **markers)
plt.xticks(np.arange(len(xlabels)), xlabels, rotation=90)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
Here we compare the increase in image size for each system, before and after loading the dataset. Raw datasets reside in a separate volume, so this accounts only for the system's actual internal storage.
# NOTE 1: mind the image size resolution compared to the dataset size.
# A practical example: the Neo4j-tp3 image size resolution is 10MB (0.01GB),
# while Yeast is 1.468 KB. The effect of the loading might thus be hidden in the measurement error,
# which would imply image_with_data - image_clean = 0.
# Since we use a log scale for the y axis, we assume a variation of at least 1MB
# from image_clean to image_with_data.
# NOTE 2: In this version of the chart, the grey area represents the size of the dataset
# encoded in JSON on disk, not the number of items in it.
q = '''
SELECT dbalias, dataset, space
FROM (
SELECT dbalias, dataset, MAX(1, im.space - empty.space) AS space, ord, dord
FROM images im
NATURAL JOIN dborder
NATURAL JOIN datasets
JOIN (
SELECT dbengine, space
FROM images
WHERE dataset == 'empty'
) AS empty ON im.dbengine = empty.dbengine
WHERE dataset IN ({in_datasets})
UNION
SELECT 'json', dataset, json2, (SELECT COUNT(*) FROM images) + 1, dord
FROM datasets
WHERE dataset IN ({in_datasets})
) AS t
ORDER BY ord, dord
'''
with get_default_db() as c:
rs = list(c.execute(q.format(in_datasets=in_datasets)))
plt.title('DISK SPACE USAGE')
plt.xlabel('Dataset')
plt.ylabel('Space (MB)')
plt.yscale('log', basey=2)
plt.grid(True)
lst = list(enumerate(gby(rs).items()))
for i, r in lst:
db, items = r
ts = [x[2] for x in items]
if i < (len(lst) -1):
plt.plot(ts, label=db, **styles[i%len(styles)], **markers)
else:
plt.fill_between(np.arange(len(ts)), ts, color='grey', alpha=0.3, label=db)
plt.xticks(np.arange(len(xlabels)), xlabels, rotation=90)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
# NOTE: As in the paper, indexed and macro-benchmark queries are excluded.
# Modify the WHERE in the inner query to change that.
q = '''
SELECT dataset, db, cnt
FROM (
SELECT dbengine || '_0' AS db, dataset, SUM(t) - SUM(c) AS cnt, ord, dord
FROM (
SELECT h.dataset, h.dbengine, SUM(h.sid_cnt) c ,SUM(h.expected_cnt) t
FROM health h
LEFT JOIN queries q ON h.query = q.query
WHERE q.cls <> 'I' AND q.cls <> 'S' AND
h.query IN ({in_queries}) AND
h.dataset IN ({in_datasets})
GROUP BY h.dataset, h.dbengine
)
NATURAL JOIN dborder
NATURAL JOIN datasets
GROUP BY dbengine, dataset
UNION
SELECT dbengine || '_1', dataset, SUM(t) - SUM(c), ord, dord
FROM (
SELECT dataset, dbengine, SUM(ok) as c , COUNT(ok) as t
FROM (
SELECT h.dataset,h.dbengine,h.query, SUM(h.sid_cnt / h.expected_cnt) AS ok
FROM health_bulk h
JOIN queries q ON h.query = q.query
WHERE q.cls <> 'I' AND q.cls <> 'S' AND
h.query IN ({in_queries}) AND
h.dataset IN ({in_datasets})
GROUP BY h.dataset,h.dbengine,h.query
)
GROUP BY dataset,dbengine
)
NATURAL JOIN dborder
NATURAL JOIN datasets
GROUP BY dbengine, dataset
)
ORDER BY dord, ord, db
-- ORDER BY DESC dord, ord, db
'''
# NOTE: suffix mapping
# 0 -> I
# 1 -> B
queries, queries_labels, in_queries = get_queries(['I', 'S'])
with get_default_db() as c:
rs = list(c.execute(q.format(in_datasets=in_datasets, in_queries=in_queries)))
plt.title('TIME-OUTS')
plt.xlabel('DB Engine and Execution Method')
plt.ylabel('# Timeouts')
plt.grid(True)
mod_xlabels = ['I\n' + ('\n' if i%4 else '') + k[:-2]
if not i%2 else 'B' for (i, k) in enumerate(OrderedDict((r[1], None) for r in rs).keys())]
loc = np.arange(len(mod_xlabels))
top = np.zeros(len(mod_xlabels))
for i, r in enumerate(gby(rs).items()):
ds, items = r
timeouts = np.array([x[2] for x in items])
plt.bar(loc, timeouts, label=ds_short(ds), bottom=top)
top += timeouts
plt.xticks(loc, mod_xlabels, rotation=0)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()