Result Visualization Notebook

The following assembles the data and produces the charts presented in:

Beyond Macrobenchmarks: Microbenchmark-based Graph Database Evaluation, by Matteo Lissandrini, Martin Brugnara, and Yannis Velegrakis. In PVLDB, 12(4):390-403, 2018.

This notebook can process experiments incrementally, so it can be used while experiments are still running; replaying the notebook will produce updated charts.

Note: in the following we assume that missing experiments imply the system has reached the time-out.

Data Preprocessing

In [1]:
# Starting fresh each time.
import os
if os.path.exists('results.db'):
    os.remove('results.db')
In [2]:
%matplotlib inline
import os
import re
import csv
import sqlite3
import matplotlib.pyplot as plt
from os import path
from collections import OrderedDict
from IPython.core.display import display, HTML


def get_default_db():
    return DbSession('results.db')


class DbSession(object):
    def __init__(self, conn_str):
        self.conn = sqlite3.connect(conn_str)
        self.cursor = self.conn.cursor()
        
    def __enter__(self):
        return self.cursor
    
    def __exit__(self, t, value, traceback):
        if t is not None:
            log(traceback)
        self.cursor.close()
        self.conn.commit()
        self.conn.close()


def log(msg):
    import datetime
    print('%s - %s' % (datetime.datetime.now().isoformat(), msg))

    
def gby(lst, key=0):
    """ Group by that respect keys order.
    """
    from collections import OrderedDict
    res = OrderedDict()
    for item in lst:
        k = item[key]
        if k in res: 
            res[k].append(item)
        else: 
            res[k] = [item]
    return res

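For reference, a minimal usage sketch of the helpers above (the sample tuples are hypothetical): gby groups rows by a chosen field while preserving insertion order, which is how query results are later grouped per system in the chart cells.

In [ ]:
# Group rows by their first element while preserving order of appearance,
# mirroring how chart data is grouped per dbengine further below.
sample = [('neo4j', 'yeast', 1.2), ('neo4j', 'ldbc', 3.4), ('sparksee', 'yeast', 0.9)]
for db, rows in gby(sample).items():
    log('{} -> {} rows'.format(db, len(rows)))
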
Bootstrap

Let's first create and initialize the schema, and then load the data from the *.csv files.

Schema

In [3]:
tables_list = ['results', 'queries', 'datasets', 'images', 'dborder']

with get_default_db() as c:
    # Reset database
    stmt = 'DROP TABLE IF EXISTS '
    for tbl in tables_list:
        c.execute(stmt + tbl)

# Setup database
create_tables = [
    '''
    -- Only common fields
    CREATE TABLE IF NOT EXISTS results (
            -- _rowid_ (https://www.sqlite.org/lang_createtable.html#rowid)
            date       INTEGER NOT NULL, 
            dbengine   TEXT NOT NULL, 
            dataset    TEXT NOT NULL,     -- please store just name, not path
            query      TEXT NOT NULL, 
            sid        INTEGER,
            int_order  INTEGER,
            exec_time  INTEGER NOT NULL
    )

    ''',
    '''
    CREATE TABLE IF NOT EXISTS queries (
            -- _rowid_ (https://www.sqlite.org/lang_createtable.html#rowid)
            query      TEXT, 
            cnt        INTEGER, -- expected number of runs per query
            cls        TEXT,
            qord       int,
            paper_id   TEXT
    );
    ''',
    '''
    CREATE TABLE IF NOT EXISTS datasets (
            -- _rowid_ (https://www.sqlite.org/lang_createtable.html#rowid)
            dataset TEXT, 
            json2   REAL,
            dord    INT
    );
    ''',
    '''
    -- Images size on disk
    CREATE TABLE IF NOT EXISTS images (
            -- _rowid_ (https://www.sqlite.org/lang_createtable.html#rowid)
            dbengine      TEXT, 
            dataset       TEXT, -- Special value 'empty'
            space         REAL
    );
    ''',
    '''
    -- Defined ordering
    CREATE TABLE IF NOT EXISTS dborder (
            -- _rowid_ (https://www.sqlite.org/lang_createtable.html#rowid)
            dbengine      TEXT, 
            dbalias       TEXT,
            ord           int 
    );
    ''',
]

assert(len(tables_list) == len(create_tables))
    
with get_default_db() as c:
    for tbl in create_tables:
        c.execute(tbl)
    

log('Database initialized')
2019-09-12T13:26:52.122231 - Database initialized

Fixtures

While the following metadata could be derived from the queries themselves, hardcoding it simplifies the presentation.
See, for example, how the BFS length parameters are handled in the insert_result() function, as sketched right below.
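
A minimal sketch of that handling (the file name and depth below are hypothetical): the depth recorded with each BFS run is appended to the base query name, so the results line up with the hardcoded BFS2..BFS5 and BFS-labelled2..BFS-labelled5 entries of class 'B' in the fixtures.

In [ ]:
# Hypothetical BFS result row fields, mirroring insert_result() below:
# the base query name comes from the file name, the traversal depth from
# the last field of the row, and together they match the class 'B' fixtures.
query_file = 'BFS-labelled.groovy'      # assumed file name
depth = '3'                             # last field of the result row
query = query_file.replace('.groovy', '') + depth
print(query)                            # 'BFS-labelled3'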

Queries

Comment out any of the following tuples to exclude the corresponding queries from the analysis. Please remember the implicit assumption that a missing result entry is treated as a time-out.

In [4]:
"""
Meta array format:
(QueryClass, [
  ('query file name wo .rb', <expected # of results for run>, '<id_on_paper>')
])
"""
meta = [
    ('L', [('loader', 1, '1')]),
    ('C', [
        ('insert-node', 10, '2'),
        ('insert-edge', 10, '3'),
        ('insert-edge-with-property', 10, '4'),
        ('insert-node-property', 10, '5'),
        ('insert-edge-property', 10, '6'),
        ('insert-node-with-edges', 10, '7'),
    ]),
    ('R', [
        ('count-nodes', 1, '8'),
        ('count-edges', 1, '9'),
        ('find-unique-labels', 1, '10'),
        ('node-property-search', 10, '11'),
        ('edge-specific-property-search', 10, '12'),
        ('label-search', 10, '13'),
        ('id-search-node', 10, '14'),
        ('id-search-edge', 10, '15'),
    ]),
    ('U', [
        ('update-node-property', 10, '16'),
        ('update-edge-property', 10, '17'),
    ]),
    ('D', [
        ('delete-nodes', 10, '18'),
        ('delete-edges', 10, '19'),
        ('delete-node-property', 10, '20'),
        ('delete-edge-property', 10, '21'),        
    ]),
    ('T', [
        ('NN-incoming', 10, '22'),
        ('NN-outgoing', 10, '23'),        
        ('NN-both-filtered', 10, '24'),   
        ('NN-incoming-unique-label', 10, '25'),
        ('NN-outgoing-unique-label', 10, '26'),
        ('NN-both-unique-label', 10, '27'),
        ('k-degree-in', 1, '28'),
        ('k-degree-out', 1, '29'),
        ('k-degree-both', 1, '30'),
        ('find-non-root-nodes', 1, '31'),
        # BFS, and BFS-labelled would go here
        ('shortest-path', 10, '34'),
        ('shortest-path-labelled', 10, '35'),
    ]),  
    ('B', [
        # 32
        ('BFS2', 10, ''),
        ('BFS3', 10, ''),
        ('BFS4', 10, ''),
        ('BFS5', 10, ''),
        # 33
        ('BFS-labelled2', 10, ''),
        ('BFS-labelled3', 10, ''),
        ('BFS-labelled4', 10, ''),
        ('BFS-labelled5', 10, ''),
    ]),
    
    ('I', [
        # ("X-create-index", 1, ''),
        ("X-insert-node", 10,          '2'),
        ("X-insert-node-property", 10, '5'),
        ("X-node-property-search", 10, '11'),
        ("X-update-node-property", 10, '16'),
        ("X-delete-nodes", 10,         '18'),
        ("X-delete-node-property", 10, '20'),
    ]),
    
    ('S', [
        # ("ldbc-create-index", 1, ''),
        ("ldbc-macrobench-max-iid", 10, ''),
        ("ldbc-macrobench-max-oid", 10, ''),
        ("ldbc-macrobench-create", 10, ''),
        ("ldbc-macrobench-city", 10, ''),
        ("ldbc-macrobench-company", 10, ''),
        ("ldbc-macrobench-university", 10, ''),
        ("ldbc-macrobench-friend1", 10, ''),
        ("ldbc-macrobench-friend2", 10, ''),
        ("ldbc-macrobench-friend-tags", 10, ''),
        ("ldbc-macrobench-add-tags", 10, ''),
        ("ldbc-macrobench-friend-of-friend", 10, ''),
        ("ldbc-macrobench-triangle-closure", 10, ''),
        ("ldbc-macrobench-places", 10, ''),
    ]),
]

# Add queries in order 
insert = 'INSERT INTO queries VALUES (?, ?, ?, ?, ?);'
with get_default_db() as c:
    qord = 0
    for (cls, queries) in  meta:
        for (q, cnt, paper_id) in  queries:
            c.execute(insert, [q, cnt, cls, qord, paper_id])
            qord += 1
            
log('Queries meta have been loaded')
2019-09-12T13:26:52.256371 - Queries meta have been loaded
In [5]:
# Databases
# NOTE: index-capable systems first, then the others.
dborder = [
    ("neo4j",      "Neo4j 1.9"),
    ("neo4j-tp3",  "Neo4j 3.0"),
    ("orientdb",   "OrientDB"),
    ("titan",      "Titan 0.5"),
    ("titan-tp3",  "Titan 1.0"),
    ("sparksee",   "Sparksee"),
    ("arangodb",   "ArangoDB"), 
    ("pg",         "Sqlg"),
    ("blazegraph", "Blazegraph"),
]

with get_default_db() as c:
    for (i,d) in enumerate(dborder):
        c.execute('INSERT INTO dborder VALUES (?,?,?)', (*d, i))

log('Fixture loadings completed')
2019-09-12T13:26:52.283086 - Fixture loadings completed

Data

Datasets

In [6]:
q = 'INSERT INTO datasets VALUES (?,?,?)'
with get_default_db() as c:
    with open('datasets.tsv') as f:
        i = 0
        for l in f:
            size, ds = l.split()
            c.execute(q, [ds, float(size)/1024, i])
            i += 1

Images

In [7]:
q = 'INSERT INTO images VALUES (?,?,?)'
with get_default_db() as c:
    with open('images.csv') as f:
        for l in f:
            name, size = l.split(',')
            if '_' in name:
                db, ds = name.strip()[8:-6].split('_',1)
            else:
                db, ds = name.strip()[8:], 'empty'
            size = size.strip()
            size = float(size[:-2]) * (1024 if size[-2] == 'G' else 1)
            c.execute(q, [db, ds, size]) 

Results

The following import supports results of experiments carried out at different times or on different machines. It is up to the researcher to guarantee the consistency of the external environment: e.g., when using multiple machines, they must make sure the machines are identical in hardware, software, and configuration.

See collect.sh for more detail about results harvesting.
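
For orientation, this is the row layout that insert_result() below relies on, shown on a hypothetical record. The field positions are taken from the parsing code; the concrete values and the time unit are assumptions, and the authoritative format is whatever collect.sh emits.

In [ ]:
# A hypothetical *_results.csv row, annotated with how insert_result() uses it.
row = [
    'gremlin-neo4j',                 # 0: engine; the 'gremlin-' prefix is stripped
    '/datasets/yeast.json',          # 1: dataset path, reduced to its bare name
    '/queries/insert-node.groovy',   # 2: query path; '.groovy' and 'bulk' removed
    '4',                             # 3: sid (seed id); stored as '-1' when empty
    '',                              # 4: not used by the import
    '0',                             # 5: int_order: 0/'' isolation, >0 batch position
    '12345678',                      # 6: exec_time (divided by 10^6 later to get ms)
]
# BFS rows carry one extra trailing field: the traversal depth (see the sketch above).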

In [8]:
# Path to results directory
RES_DIR='/results/'

# Get the list of the experiment results available on disk.
exps = set(fn.split('_',1)[0]
           for fn in os.listdir(RES_DIR)
           if os.path.isfile(path.join(RES_DIR, fn)) 
               and fn[0] != '.')
log('Importing: {}'.format(exps))

# Get list of experiments executed with indexes.
indexed_exps = []
if os.path.exists('indexed.csv'):
    with open('indexed.csv') as f:
        indexed_exps = set(t.strip() for t in f)
log('Experiments with indexes: {}'.format(indexed_exps))


# Support function
def insert_result(c, timestamp, row, indexed_exps):    
    insert_res = 'INSERT INTO results VALUES (?, ?, ?, ?, ?, ?, ?);'
    
    # NOTE: we use replace instead of substr so it works also for LDBC.
    query = row[2].split('/')[-1].replace('.groovy', '').replace('bulk', '')
    if query.startswith('BFS'):
        query += row[-1]
    exec_time = int(row[6])
    
    if timestamp in indexed_exps:
        query = 'X-' + query

    if query.startswith('loader'):
        exec_time *= 1000000.0
    
    param = [
        timestamp,
        row[0][len('gremlin-'):],
        row[1].split('/')[-1].replace('_hashed', '').replace('_noslash.json', '').rstrip('2')[:-len('.json')],
        query,
        row[3] or '-1', # sid
        row[5], # int_order
        exec_time,
    ]
    c.execute(insert_res, param)
    

with get_default_db() as c:
    for res in [fn for fn in os.listdir(RES_DIR)
                if os.path.isfile(path.join(RES_DIR, fn))
                and fn[0] != '.' and fn.split('_',2)[0] in exps]:
        with open(path.join(RES_DIR, res), 'r') as csvfile:
            if not res.endswith('results.csv'):
                continue
            reader = (row for row in csv.reader(csvfile) if row[0].startswith('gremlin-'))
            for row in reader:
                insert_result(c, res.split('_',2)[0], row, indexed_exps)
log('Results import process completed')
2019-09-12T13:26:52.376506 - Importing: {'1567548898', '1551761122', '1565805219'}
2019-09-12T13:26:52.377275 - Experiments with indexes: {'1567548898'}
2019-09-12T13:26:55.210225 - Results import process completed

Support Views

Define several views to make the analytics SQL for the charts less obscure.

In [9]:
views_names = [
    'one_per_row', 'single', 'bulk', 'dbs',
    'full_queries_set', 'single_means_wtimeout', 
    'bulk_sum_wtimeout', 'health', 'health_bulk',
]

views = [
    # Results normalization:
    # sum the queries that report their exec_time split over multiple records.
    # NOTE: here we make the assumption (even if not 100% valid)
    #       that all "pieces" are in the same result file ('date'),
    #       and that a query is not run more than once
    #       in the same experiment ('date').
    # NOTE: "-tp3" results may have been collected both under `$DB` and `${DB}-tp3`,
    #       hence the DISTINCT.
    '''
      CREATE VIEW one_per_row AS 
        SELECT dbengine,dataset,query,sid,int_order,
            CASE 
                WHEN COUNT(*) >= 5 THEN
                    (SUM(exec_time) - MIN(exec_time) - MAX(exec_time)) / (COUNT(*)-2)
                ELSE
                    AVG(exec_time)
            END AS exec_time
        FROM (
            -- already ok
            SELECT DISTINCT * FROM results
            WHERE 
                (query <> 'insert-node-with-edges') AND
                (dbengine <> "arangodb" OR query <> 'loader')

            UNION

            -- insert-node-with-edges.groovy & arango loader        
            SELECT date,dbengine,dataset,query,sid,int_order,SUM(exec_time)
            FROM results
            WHERE 
                query = 'insert-node-with-edges' OR
                (dbengine = "arangodb" AND query = 'loader')
            GROUP BY date,dbengine,dataset,query,sid,int_order  
            
            UNION
            
            SELECT date,dbengine,dataset,'ldbc-create-user',sid,int_order,SUM(exec_time)
            FROM results
            WHERE query LIKE 'ldbc-create-user%'
            GROUP BY date,dbengine,dataset,sid,int_order
                
        )
        GROUP BY dbengine,dataset,query,sid,int_order  
    ''',
    
    # Runs that are NOT in bulk mode
    '''
        CREATE VIEW single AS
            SELECT DISTINCT * FROM one_per_row WHERE int_order = 0 OR int_order = ''
    ''',
    
    # ONLY runs that are in bulk mode
    '''
        CREATE VIEW bulk AS
            SELECT DISTINCT * FROM one_per_row WHERE NOT (int_order = 0 OR int_order = '')
    ''',
    
    # dbengines view (based on images size)
    '''
        CREATE VIEW dbs AS
            SELECT DISTINCT dbengine 
            FROM images
    ''',
    
    # Expected query set (w/o int_order)
    '''
        CREATE VIEW full_queries_set AS
            SELECT dbengine,dataset,query,cnt
            FROM dbs, queries, datasets
    ''',
       
    # single_means_wtimeout
    # Assumption: if COUNT() < expected --> timed out
    # (SUM(exec_time) + timeout (:= 2h) * (expected count - COUNT(query))) / expected count
    '''
        CREATE VIEW single_means_wtimeout AS
            SELECT dbengine,dataset,q.query,
                (SUM(exec_time/1000000) + (7200000 * max(0,q.cnt - SUM(done))))/q.cnt AS exec_time,
                q.cnt, SUM(done) AS done
            FROM (
                SELECT dbengine,dataset,query,exec_time,1 AS done
                FROM single
                
                UNION
                
                SELECT dbengine,dataset,query,0,0
                FROM full_queries_set
            ) AS r
            NATURAL JOIN queries AS q
            GROUP BY dbengine,dataset,query
    ''',
    
    # bulk_sum_wtimeout
    # Assumption: if COUNT() < expected --> timed out
    '''
        CREATE VIEW bulk_sum_wtimeout AS
            SELECT dbengine,dataset,r.query,
                CASE
                    WHEN SUM(done) <> q.cnt THEN 7200000
                    ELSE SUM(exec_time) / 1000000
                END AS exec_time,
                q.cnt, SUM(done) AS done
            FROM (
                SELECT dbengine,dataset,query,exec_time,1 AS done
                FROM bulk
                
                UNION
                
                SELECT dbengine,dataset,query,0,0
                FROM full_queries_set
                WHERE cnt > 1
            ) AS r
            NATURAL JOIN queries AS q
            GROUP BY dbengine,dataset,query
    ''',
    
    # health
    # Checks how many queries have been completed and how many timed out.
    '''
    CREATE VIEW health AS
        SELECT dbengine, dataset, query,
            -- NOTE: count on any 'single' column, not on *.
            -- Remember that we are using a left natural join.
            -- Assumption: the column is NOT NULL.
            COUNT(single.query) AS cnt, 
            
            -- NOTE: assumption: sid is always present (NOT NULL, DEFAULT '-1')
            COUNT(DISTINCT sid) AS sid_cnt,
            cnt AS expected_cnt,
            CASE      
                WHEN COUNT(DISTINCT sid) < cnt
                    THEN "missing"
                WHEN COUNT(DISTINCT sid) > cnt
                    THEN "wat!"
                WHEN COUNT(DISTINCT sid) <>  COUNT(*)
                    THEN "sid inconsistence"
                ELSE "ok"
            END AS status                   
        FROM full_queries_set
        LEFT NATURAL JOIN single
        GROUP BY dbengine, dataset, query
    ''',
    
    # health_bulk
    # Checks how many queries have been completed and how many timed out.
    '''
    CREATE VIEW health_bulk AS
        SELECT dbengine, dataset, query,
            COUNT(bulk.query) AS cnt,
            COUNT(DISTINCT int_order) AS sid_cnt,
            cnt AS expected_cnt,
            CASE      
                WHEN COUNT(DISTINCT int_order) < cnt
                    THEN "missing"
                WHEN COUNT(DISTINCT int_order) > cnt
                    THEN "wat!"
                WHEN COUNT(DISTINCT int_order) <>  COUNT(*)
                    THEN "sid inconsistence"
                ELSE "ok"
            END AS status                   
        FROM full_queries_set
        LEFT NATURAL JOIN bulk
        -- queries with cnt == 1 do not have a bulk version
        WHERE cnt > 1 
        GROUP BY dbengine, dataset, query        
    ''',
]

assert len(views_names) == len(views)

with get_default_db() as c:
    stmt = 'DROP VIEW IF EXISTS %s'
    for v in views_names:
        c.execute(stmt % v)
    log("All views have been dropped")

    for v in views:
        c.execute(v)   
log("All views have been recreated")
2019-09-12T13:26:55.231511 - All views have been dropped
2019-09-12T13:26:55.351274 - All views have been recreated
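
As a quick sanity check on the freshly created views, the trimmed mean computed by one_per_row (drop the minimum and maximum when at least 5 repetitions are available) can be mirrored in plain Python, and the health view summarizes how many query/dataset combinations are complete.

In [ ]:
def trimmed_mean(times):
    # Mirrors the CASE expression in one_per_row: with 5+ repetitions the
    # minimum and maximum are discarded before averaging.
    if len(times) >= 5:
        return (sum(times) - min(times) - max(times)) / (len(times) - 2)
    return sum(times) / len(times)

with get_default_db() as c:
    rows = list(c.execute('SELECT status, COUNT(*) FROM health GROUP BY status'))
log('Health summary: {}'.format(rows))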

Charts

Below are all the plots of the experiments comparing running times. For all the following charts, lower values are better.

In [10]:
import numpy as np

# Style from paper.
styles = [
    {'color':'blue',      'linestyle':':',  'marker': 'x'}, 
    {'color':'cyan',      'linestyle':':',  'marker': 'o'},
    {'color':'gold',      'linestyle':'-',  'marker': 'D'},
    {'color':'olive',     'linestyle':'-',  'marker': 'x'},
    {'color':'darkgreen', 'linestyle':':',  'marker': '+'},
    {'color':'black',     'linestyle':'-.', 'marker': '*'},
    {'color':'red',       'linestyle':'--', 'marker': '^'},
    {'color':'grey',      'linestyle':':',  'marker': 'v'},
    {'color':'green',     'linestyle':'-',  'marker': 's'},
]
markers = {'markersize': 8, 'fillstyle': 'none'}

# Make it readable
plt.rcParams.update({'figure.figsize': (20,10), 'font.size': 20})


def ds_short(ds_name):
    if ds_name[:len('freebase')] == 'freebase':
        return 'FBR-' + ds_name.split('_')[1][0].upper()
    return ' '.join(ds_name.upper().split())    


re_qpff = re.compile('[^a-zA-Z0-9]')
def q_prettify(q):
    return re_qpff.sub(' ', q)


def get_queries(not_cls=list()):
    """ Queries with at least one experiment,
        in selcted classes (not_cls).
    """
    with get_default_db() as c:
        q = '''
        SELECT DISTINCT cls, s.query, paper_id || ' ' || s.query
        FROM single s
        JOIN queries as q ON q.query = s.query AND cls NOT IN ({})
        ORDER BY qord
        '''
        qf = q.format(','.join(map(lambda x: "'" + x + "'", not_cls)))
        queries = [(r[0], r[1]) for r in c.execute(qf)]
        queries_labels = [q_prettify(r[2].strip()) for r in c.execute(qf)]
        in_queries = ','.join(map(lambda x: "'" + x[1] + "'", queries))
        return queries, queries_labels, in_queries

        
# Datasets with at least one experiment
with get_default_db() as c:
    q= '''
    SELECT DISTINCT dataset
    FROM single
    NATURAL JOIN datasets
    ORDER BY dord
    '''
    datasets = [r[0] for r in c.execute(q)]
in_datasets = ','.join(map(lambda x: "'" + x + "'", datasets))

Microbenchmark Queries

The plots show the mean execution time for each query; missing runs are counted as the full time-out, 2h.
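
A worked sketch of that rule, mirroring the single_means_wtimeout view: with an expected count of 10 runs and only 7 completed, each missing run is charged the full 2h time-out before averaging. The measured values below are hypothetical.

In [ ]:
TIMEOUT_MS = 2 * 60 * 60 * 1000                     # 2h time-out, as in the views above
expected = 10                                       # queries.cnt for this query
completed_ms = [850, 920, 880, 910, 905, 890, 915]  # hypothetical measured runs (ms)
missing = expected - len(completed_ms)
mean_ms = (sum(completed_ms) + TIMEOUT_MS * missing) / expected
print(mean_ms)                                      # 2160627.0, dominated by the time-outs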

In [11]:
# Plot all queries except those with specialized plots
q = '''
    SELECT dbalias, dataset, ROUND(exec_time, 2)        
    FROM single_means_wtimeout as t
    JOIN queries as q ON q.query = t.query
    NATURAL JOIN dborder
    NATURAL JOIN datasets AS ds
    WHERE q.query='{}' AND dataset IN ({}) 
        -- Exclude classes with specialized plots
        AND q.cls NOT IN ('B', 'S', 'I')
    ORDER BY ord, dord
'''

queries, queries_labels, in_queries = get_queries(['B', 'S', 'I'])
    
xlabels = list(map(ds_short, datasets))
current_cls = ''
with get_default_db() as c:
    for i, x in enumerate(queries):
        cls, query = x
        if cls != current_cls:
            current_cls = cls
            display(HTML('<h1 id="current_cls">{}</h1>'.format(current_cls)))
            
        plt.title('#'+queries_labels[i].upper())
        plt.xlabel('Dataset')
        plt.ylabel('Time (ms)')
        plt.yscale('log')
        plt.grid(True)
    
        rs = [r for r in c.execute(q.format(query, in_datasets))]
        for i, r in enumerate(gby(rs).items()):
            db, items = r
            ts  = [x[2] for x in items]
            plt.plot(ts, label=db, **styles[i%len(styles)], **markers)

        plt.xticks(np.arange(len(xlabels)), xlabels, rotation=90)
        plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        plt.show()

L

C

R

U

D

T

In [12]:
q = '''
SELECT dbalias, t.query, ROUND(exec_time, 2)        
FROM single_means_wtimeout as t
JOIN queries as q ON q.query = t.query
NATURAL JOIN dborder
WHERE 
  -- select class and proper dataset
  q.query LIKE 'BFS-labelled%' AND dataset = 'ldbc' 
ORDER BY ord, qord
'''

plt.title('#33 BFS-labelled on LDBC')
plt.xlabel('Depth')
plt.ylabel('Time (ms)')
plt.yscale('log')
plt.grid(True)

with get_default_db() as c:
    rs = list(c.execute(q)) 

llabels = None
for i, r in enumerate(gby(rs).items()):
    db, items = r
    ts  = [x[2] for x in items]
    if not llabels:
        llabels = ['DEPTH ' + x[1][len('BFS-labelled'):] for x in items]
    plt.plot(ts, label=db, **styles[i%len(styles)], **markers)
     
plt.xticks(np.arange(len(llabels)), llabels, rotation=90)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()

LDBC Macrobenchmark

The queries for the Macrobenchmark are executed all together by a single query file on top of an image with ad-hoc indexes. The experiment prints the execution time of the different sub-queries, which are represented as checkpoints.
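
The checkpoint labels on the x-axis of the chart below are simply the class 'S' fixture names with the common prefix stripped, as in this small sketch.

In [ ]:
checkpoints = ['ldbc-macrobench-max-iid', 'ldbc-macrobench-create', 'ldbc-macrobench-places']
print([c[len('ldbc-macrobench-'):] for c in checkpoints])   # ['max-iid', 'create', 'places']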

In [13]:
q = '''
SELECT dbalias, t.query, ROUND(exec_time, 2)        
FROM single_means_wtimeout as t
JOIN queries as q ON q.query = t.query
NATURAL JOIN dborder
WHERE 
  -- select class and proper dataset
  q.cls = 'S' AND dataset = 'ldbc' AND
  -- filter out non index-capable systems
  dbengine <> 'blazegraph'
ORDER BY ord, qord
'''

plt.title('LDBC Macrobench')
plt.xlabel('Checkpoints')
plt.ylabel('Time (ms)')
plt.yscale('log')
plt.grid(True)

with get_default_db() as c:
    rs = list(c.execute(q)) 

llabels = None
for i, r in enumerate(gby(rs).items()):
    db, items = r
    ts  = [x[2] for x in items]
    if not llabels:
        llabels = [x[1][len('ldbc-macrobench-'):] for x in items]
    plt.plot(ts, label=db, **styles[i%len(styles)], **markers)
     
plt.xticks(np.arange(len(llabels)), llabels, rotation=90)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()

Cumulative Time

For each system and dataset we sum the running times of all queries. We keep the running times of queries executed in isolation separate from those of queries executed in batch mode.

Note: while all queries can be run in isolation, not all of them are run in batch mode; only those based on different seeds are.
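
A sketch of the aggregation in the isolation chart below: each per-query mean (which already includes the time-out substitution) is weighted by its expected repetition count, and the total is reported in seconds; for batch mode the per-query bulk sums are added up directly. The numbers are hypothetical.

In [ ]:
# (mean exec_time in ms, expected repetitions) per query, for one system/dataset
per_query = [(850.0, 10), (12.0, 10), (3200.0, 1)]   # hypothetical values
cumulative_s = sum(mean * cnt for mean, cnt in per_query) / 1000
print(cumulative_s)    # 11.82 -> mirrors SUM(exec_time*cnt)/1000 in the query below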

In [14]:
# SINGLE
# On paper they include: CRUDTB
# --> no LIS
q = '''
    SELECT dbalias, dataset, ROUND(SUM(exec_time) / 1000,2)
    FROM (
        SELECT dbengine, dataset, query,  (exec_time*cnt) as  exec_time
        FROM single_means_wtimeout  
        NATURAL JOIN queries as q
        WHERE dataset IN ({}) AND 
            cls NOT IN ('L', 'I', 'S') AND
            query IN ({})   
   ) as v
    NATURAL JOIN dborder
    NATURAL JOIN queries as q
    NATURAL JOIN datasets as dds  
    GROUP BY v.dataset, dbengine
    ORDER BY ord, dord
'''

# Avoid the chart being dominated by time-outs from partial runs.
queries, queries_labels, in_queries = get_queries(['L', 'I', 'S'])


plt.title('All queries'.upper())
plt.xlabel('Dataset')
plt.ylabel('Time (s)')
plt.yscale('log')
plt.grid(True)

with get_default_db() as c:
    rs = [r for r in c.execute(q.format(in_datasets, in_queries))]
for i, r in enumerate(gby(rs).items()):
    db, items = r
    ts  = [x[2] for x in items]
    plt.plot(ts, label=db, **styles[i%len(styles)], **markers)

plt.xticks(np.arange(len(xlabels)), xlabels, rotation=90)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
In [15]:
# BATCH
# On paper they include: CRUDTB
# --> no BIL
q = '''
    SELECT dbalias, dataset, ROUND(SUM(exec_time) / 1000,2)
    FROM (
        SELECT dbengine,dataset,query, exec_time 
        FROM bulk_sum_wtimeout
        NATURAL JOIN queries as q
        WHERE dataset IN ({}) AND 
            cls NOT IN ('B', 'I', 'L') AND
            query IN ({})        
   ) as v
    NATURAL JOIN dborder
    NATURAL JOIN queries as q
    NATURAL JOIN datasets as dds  
    GROUP BY  v.dataset, dbengine
    ORDER BY  ord, dord
'''

plt.title('Only Batch Queries'.upper())
plt.xlabel('Dataset')
plt.ylabel('Time (s)')
plt.yscale('log')
plt.grid(True)

with get_default_db() as c:
    rs = [r for r in c.execute(q.format(in_datasets, in_queries))]
for i, r in enumerate(gby(rs).items()):
    db, items = r
    ts  = [x[2] for x in items]
    plt.plot(ts, label=db, **styles[i%len(styles)], **markers)

plt.xticks(np.arange(len(xlabels)), xlabels, rotation=90)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()

Effect of indexes

The following plots are for the queries executed with indexes in place.
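
These are the class 'I' fixtures; recall from the import step that runs whose timestamp appears in indexed.csv are renamed with an 'X-' prefix so that they fall into this class. A minimal sketch, using the timestamp shown in the import log above:

In [ ]:
example_indexed = {'1567548898'}       # timestamp read from indexed.csv (see import log)
timestamp, query = '1567548898', 'insert-node'
if timestamp in example_indexed:
    query = 'X-' + query               # 'X-insert-node' -> class 'I' fixture
print(query)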

In [19]:
q = '''
    SELECT dbalias, dataset, ROUND(exec_time, 2)        
    FROM single_means_wtimeout as t
    JOIN queries as q ON q.query = t.query
    NATURAL JOIN dborder
    NATURAL JOIN datasets AS ds
    WHERE q.query='{}' AND dataset IN ({}) 
        AND q.cls IN ('I') AND dbengine <> 'blazegraph'
    ORDER BY ord, dord
'''

queries, queries_labels, in_queries = get_queries(['L', 'C', 'R', 'U', 'D', 'T', 'B', 'S'])
    
xlabels = list(map(ds_short, datasets))
current_cls = ''
with get_default_db() as c:
    for i, x in enumerate(queries):
        cls, query = x
        if cls != current_cls:
            current_cls = cls
            display(HTML('<h1 id="current_cls">{}</h1>'.format(current_cls)))
            
        n, _, qname = queries_labels[i].upper().split(' ', 2)
        plt.title('#' + n + ' ' + qname + ' [indexed]')
        plt.xlabel('Dataset')
        plt.ylabel('Time (ms)')
        plt.yscale('log')
        plt.grid(True)
    
        rs = [r for r in c.execute(q.format(query, in_datasets))]
        for i, r in enumerate(gby(rs).items()):
            db, items = r
            ts  = [x[2] for x in items]
            plt.plot(ts, label=db, **styles[i%len(styles)], **markers)

        plt.xticks(np.arange(len(xlabels)), xlabels, rotation=90)
        plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        plt.show()

I

Disk space usage

We compare, for each system, the increase in image size before and after loading each dataset. Raw datasets are loaded into a separate volume, so this takes into account only the system's actual internal storage.
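
A sketch of the differencing performed in the query below: the clean-image size is subtracted from the loaded-image size and the result is floored at 1 MB, so datasets smaller than the image-size resolution remain visible on the log scale. The sizes are hypothetical.

In [ ]:
empty_mb  = 1330.0     # hypothetical size of the clean image (MB)
loaded_mb = 1330.0     # loading a tiny dataset may not change the reported size
delta_mb  = max(1, loaded_mb - empty_mb)    # mirrors MAX(1, im.space - empty.space)
print(delta_mb)        # 1 -> the 1 MB floor keeps the point on the log-scale chart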

In [16]:
# NOTE 1: we have to take care of image size resolution vs dataset size.
# A practical example: the Neo4j-tp3 image size resolution is 10MB (0.01GB),
# while Yeast is 1.468 KB. The effect of the loading might then be hidden in the
# measurement error, which implies image_with_data - image_clean = 0.
# Since we are using a log scale for the y axis we assume a variation of at least 1MB
# from an image_clean to an image_with_data.

# NOTE 2: In this version of the chart, the grey area represents the size of the dataset
# encoded in JSON on disk; not the number of items in it.

q = '''
SELECT dbalias, dataset, space
FROM (
    SELECT dbalias, dataset, MAX(1, im.space - empty.space) AS space, ord, dord
    FROM images im
    NATURAL JOIN dborder
    NATURAL JOIN datasets
    JOIN (
        SELECT dbengine, space
        FROM images 
        WHERE dataset == 'empty'
    ) AS empty ON im.dbengine = empty.dbengine
    WHERE dataset IN ({in_datasets})

    UNION
    
    SELECT 'json', dataset, json2, (SELECT COUNT(*) FROM images) + 1, dord
    FROM datasets 
    WHERE dataset IN ({in_datasets})
) AS t
ORDER BY ord, dord
'''


with get_default_db() as c:
    rs = list(c.execute(q.format(in_datasets=in_datasets)))
        
plt.title('DISK SPACE USAGE')
plt.xlabel('Dataset')
plt.ylabel('Space (MB)')
plt.yscale('log', basey=2)
plt.grid(True)

lst = list(enumerate(gby(rs).items()))
for i, r in lst:
    db, items = r
    ts  = [x[2] for x in items]

    if i < (len(lst) -1):
        plt.plot(ts, label=db, **styles[i%len(styles)], **markers)
    else:
        plt.fill_between(np.arange(len(ts)), ts, color='grey', alpha=0.3, label=db)

plt.xticks(np.arange(len(xlabels)), xlabels, rotation=90)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()

Timeouts

We count, for each system, dataset, and type of query, the number of time-outs.
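
A sketch of the counting done below, based on the health views: for each (system, dataset, query) the number of time-outs is the expected run count minus the runs actually observed, and the chart stacks these differences per dataset, separately for isolation and batch mode. The numbers are hypothetical.

In [ ]:
# (expected_cnt, sid_cnt) per query, for one system/dataset in isolation mode
health_rows = [(10, 10), (10, 7), (1, 0)]   # hypothetical view rows
timeouts = sum(expected - done for expected, done in health_rows)
print(timeouts)   # 4 missing runs counted as time-outs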

In [17]:
# NOTE: as in the paper, indexed and macro-benchmark queries are excluded.
# Modify the WHERE clauses in the inner queries to change that.

q = '''
SELECT dataset, db, cnt 
FROM (
    SELECT dbengine || '_0' AS db, dataset, SUM(t) - SUM(c) AS cnt, ord, dord 
    FROM (
        SELECT h.dataset, h.dbengine, SUM(h.sid_cnt) c ,SUM(h.expected_cnt) t
        FROM health h
        LEFT JOIN queries q ON  h.query = q.query
        WHERE q.cls <> 'I' AND q.cls <> 'S' AND
            h.query IN ({in_queries}) AND
            h.dataset IN ({in_datasets})
        GROUP BY h.dataset, h.dbengine
    ) 
    NATURAL JOIN dborder
    NATURAL JOIN datasets    
    GROUP BY dbengine, dataset

    UNION

    SELECT dbengine || '_1', dataset, SUM(t) - SUM(c), ord, dord 
    FROM (     
        SELECT dataset, dbengine, SUM(ok) as c , COUNT(ok) as t
        FROM (
            SELECT h.dataset,h.dbengine,h.query, SUM(h.sid_cnt / h.expected_cnt)   AS ok
            FROM health_bulk h
            JOIN queries q ON  h.query = q.query
            WHERE q.cls <> 'I' AND q.cls <> 'S' AND
                h.query IN ({in_queries}) AND
                h.dataset IN ({in_datasets})
            GROUP BY h.dataset,h.dbengine,h.query
        )
        GROUP BY dataset,dbengine
    ) 
    NATURAL JOIN dborder        
    NATURAL JOIN datasets
    GROUP BY dbengine, dataset

)
ORDER BY dord, ord, db
-- ORDER BY DESC dord, ord, db
'''

# NOTE: suffix mapping
# _0 -> I (isolation)
# _1 -> B (batch)

queries, queries_labels, in_queries = get_queries(['I', 'S'])

with get_default_db() as c:
    rs = list(c.execute(q.format(in_datasets=in_datasets, in_queries=in_queries)))
    
plt.title('TIME-OUTS')
plt.xlabel('DB Engine and Execution Method')
plt.ylabel('# Timeouts')
plt.grid(True)

mod_xlabels = ['I\n' + ('\n' if i%4 else '') + k[:-2] 
               if not i%2 else 'B' for (i, k) in enumerate(OrderedDict((r[1], None) for r in rs).keys())]
loc = np.arange(len(mod_xlabels))
top = np.zeros(len(mod_xlabels))
for i, r in enumerate(gby(rs).items()):
    ds, items = r
    timeouts = np.array([x[2] for x in items])
    plt.bar(loc, timeouts, label=ds_short(ds), bottom=top)
    top += timeouts
    
plt.xticks(loc, mod_xlabels, rotation=0)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()