# Source code for openquake.commonlib.datastore

# -*- coding: utf-8 -*-
# vim: tabstop=4 shiftwidth=4 softtabstop=4
#
# Copyright (C) 2015-2017 GEM Foundation
#
# OpenQuake is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OpenQuake is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with OpenQuake. If not, see <http://www.gnu.org/licenses/>.

import collections
import getpass
import os
import re
try:  # Python 3.3+; the ABC aliases in `collections` were removed in 3.10
    from collections.abc import MutableMapping
except ImportError:  # Python 2
    from collections import MutableMapping

import h5py
import numpy

from openquake.baselib import hdf5
from openquake.baselib.python3compat import pickle
from openquake.commonlib import config
from openquake.commonlib.writers import write_csv

DATADIR = os.environ.get('OQ_DATADIR')
if not DATADIR:
    shared_dir = config.get('directory', 'shared_dir')
    if shared_dir:
        DATADIR = os.path.join(shared_dir, getpass.getuser(), 'oqdata')
    else:  # use the home of the user
        DATADIR = os.path.join(os.path.expanduser('~'), 'oqdata')


[docs]def get_calc_ids(datadir=DATADIR):
    """
    Extract the available calculation IDs from the datadir, in order.
    """
    if not os.path.exists(datadir):
        return []
    calc_ids = []
    for f in os.listdir(datadir):
        mo = re.match(r'calc_(\d+)\.hdf5', f)
        if mo:
            calc_ids.append(int(mo.group(1)))
    return sorted(calc_ids)


[docs]def get_last_calc_id(datadir=DATADIR):
    """
    Extract the latest calculation ID from the given directory.
    If none is found, return 0.
    """
    calcs = get_calc_ids(datadir)
    if not calcs:
        return 0
    return calcs[-1]


[docs]def hdf5new(datadir=DATADIR):
    """
    Return a new `hdf5.File by` instance with name determined by the last
    calculation in the datadir (plus one). Set the .path attribute to the
    generated filename.
    """
    calc_id = get_last_calc_id(datadir) + 1
    fname = os.path.join(datadir, 'calc_%d.hdf5' % calc_id)
    new = hdf5.File(fname, 'w')
    new.path = fname
    return new


[docs]def extract_calc_id_datadir(hdf5path, datadir=DATADIR):
    """
    Extract the calculation ID from the given hdf5path or integer:

    >>> extract_calc_id_datadir('/mnt/ssd/oqdata/calc_25.hdf5')
    (25, '/mnt/ssd/oqdata')
    >>> extract_calc_id_datadir('/mnt/ssd/oqdata/wrong_name.hdf5')
    Traceback (most recent call last):
       ...
    ValueError: Cannot extract calc_id from /mnt/ssd/oqdata/wrong_name.hdf5
    """
    if hdf5path is None:  # use a new datastore
        return get_last_calc_id(datadir) + 1, datadir
    try:
        calc_id = int(hdf5path)
    except:
        datadir = os.path.dirname(hdf5path)
        mo = re.match('calc_(\d+)\.hdf5', os.path.basename(hdf5path))
        if mo is None:
            raise ValueError('Cannot extract calc_id from %s' % hdf5path)
        calc_id = int(mo.group(1))
    return calc_id, datadir


[docs]def read(calc_id, mode='r', datadir=DATADIR):
    """
    :param calc_id: calculation ID or hdf5path
    :param mode: 'r' or 'w'
    :param datadir: the directory where to look
    :returns: the corresponding DataStore instance

    Read the datastore, if it exists and it is accessible.
    """
    dstore = DataStore(calc_id, datadir, mode=mode)
    try:
        hc_id = dstore['oqparam'].hazard_calculation_id
    except KeyError:  # no oqparam
        hc_id = None
    if hc_id:
        # TODO: we will need to store the parent directory to be able
        # to use hazard calculations generated by another user;
        # for the moment we assume that the datadir is the same for
        # parent and child calculations
        dstore.parent = read(hc_id, datadir=datadir)
    return dstore


[docs]class DataStore(collections.MutableMapping):
    """
    DataStore class to store the inputs/outputs of a calculation on the
    filesystem.

    Here is a minimal example of usage:

    >>> ds = DataStore()
    >>> ds['example'] = 'hello world'
    >>> print(ds['example'])
    hello world
    >>> ds.clear()

    When reading the items, the DataStore will return a generator. The
    items will be ordered lexicographically according to their name.

    There is a serialization protocol to store objects in the datastore.
    An object is serializable if it has a method `__toh5__` returning
    an array and a dictionary, and a method `__fromh5__` taking an array
    and a dictionary and populating the object.
    For an example of use see :class:`openquake.hazardlib.site.SiteCollection`.
    """
    def __init__(self, calc_id=None, datadir=DATADIR,
                 params=(), mode=None):
        calc_id, datadir = extract_calc_id_datadir(calc_id, datadir)
        if not os.path.exists(datadir):
            os.makedirs(datadir)
        if calc_id < 0:  # use an old datastore
            calc_ids = get_calc_ids(datadir)
            try:
                self.calc_id = calc_ids[calc_id]
            except IndexError:
                raise IndexError('There are %d old calculations, cannot '
                                 'retrieve the %s' % (len(calc_ids), calc_id))
        else:  # use the given datastore
            self.calc_id = calc_id
        self.params = params
        self.mode = mode
        self.parent = ()  # can be set later
        self.datadir = datadir
        self.calc_dir = os.path.join(datadir, 'calc_%s' % self.calc_id)
        self.hdf5path = self.calc_dir + '.hdf5'
        if mode == 'r' and not os.path.exists(self.hdf5path):
            raise IOError('File not found: %s' % self.hdf5path)
        self.hdf5 = None
        self.open()

[docs]    def open(self):
        """
        Open the underlying .hdf5 file and the parent, if any
        """
        if self.hdf5 is None:  # not already open
            mode = self.mode or 'r+' if os.path.exists(self.hdf5path) else 'w'
            self.hdf5 = hdf5.File(self.hdf5path, mode, libver='latest')
            if self.parent != () and self.parent.hdf5 is None:
                self.parent.open()

    @property
    def export_dir(self):
        """
        Return the underlying export directory
        """
        try:
            return self._export_dir
        except AttributeError:
            self._export_dir = self['oqparam'].export_dir
            return self._export_dir

    @export_dir.setter
    def export_dir(self, value):
        """
        Set the export directory
        """
        self._export_dir = value

    @property
    def export_dir(self):
        """
        Return the underlying export directory
        """
        edir = getattr(self, '_export_dir', None) or self['oqparam'].export_dir
        return edir

    @export_dir.setter
    def export_dir(self, value):
        """
        Set the export directory
        """
        self._export_dir = value

[docs]    def getitem(self, name):
        """
        Return a dataset by using h5py.File.__getitem__
        """
        return h5py.File.__getitem__(self.hdf5, name)

[docs]    def set_nbytes(self, key, nbytes=None):
        """
        Set the `nbytes` attribute on the HDF5 object identified by `key`.
        """
        return self.hdf5.set_nbytes(key, nbytes)

[docs]    def set_attrs(self, key, **kw):
        """
        Set the HDF5 attributes of the given key
        """
        attrs = h5py.File.__getitem__(self.hdf5, key).attrs
        for k, v in kw.items():
            attrs[k] = v

[docs]    def get_attr(self, key, name, default=None):
        """
        :param key: dataset path
        :param name: name of the attribute
        :param default: value to return if the attribute is missing
        """
        try:
            obj = h5py.File.__getitem__(self.hdf5, key)
        except KeyError:
            if self.parent != ():
                return self.parent.get_attr(key, name, default)
            else:
                raise
        try:
            return obj.attrs[name]
        except KeyError:
            if default is None:
                raise
            return default

[docs]    def create_dset(self, key, dtype, shape=(None,), compression=None,
                    fillvalue=0, attrs=None):
        """
        Create a one-dimensional HDF5 dataset.

        :param key: name of the dataset
        :param dtype: dtype of the dataset (usually composite)
        :param shape: shape of the dataset, possibly extendable
        :param compression: the kind of HDF5 compression to use
        :param attrs: dictionary of attributes of the dataset
        :returns: a HDF5 dataset
        """
        return hdf5.create(
            self.hdf5, key, dtype, shape, compression, fillvalue, attrs)

[docs]    def extend(self, key, array, **attrs):
        """
        Extend the dataset associated to the given key; create it if needed

        :param key: name of the dataset
        :param array: array to store
        :param attrs: a dictionary of attributes
        """
        try:
            dset = self.hdf5[key]
        except KeyError:
            dset = hdf5.create(self.hdf5, key, array.dtype,
                               shape=(None,) + array.shape[1:])
        hdf5.extend(dset, array)
        for k, v in attrs.items():
            dset.attrs[k] = v
        return dset

[docs]    def save(self, key, kw):
        """
        Update the object associated to `key` with the `kw` dictionary;
        works for LiteralAttrs objects and automatically flushes.
        """
        if key not in self:
            obj = hdf5.LiteralAttrs()
        else:
            obj = self[key]
        vars(obj).update(kw)
        self[key] = obj
        self.flush()

[docs]    def export_path(self, relname, export_dir=None):
        """
        Return the path of the exported file by adding the export_dir in
        front, the calculation ID at the end.

        :param relname: relative file name
        :param export_dir: export directory (if None use .export_dir)
        """
        assert not os.path.dirname(relname), relname
        name, ext = relname.rsplit('.', 1)
        newname = '%s_%s.%s' % (name, self.calc_id, ext)
        if export_dir is None:
            export_dir = self.export_dir
        return os.path.join(export_dir, newname)

[docs]    def build_fname(self, prefix, postfix, fmt, export_dir=None):
        """
        Build a file name from a realization, by using prefix and extension.

        :param prefix: the prefix to use
        :param postfix: the postfix to use (can be a realization object)
        :param fmt: the extension ('csv', 'xml', etc)
        :param export_dir: export directory (if None use .export_dir)
        :returns: relative pathname including the extension
        """
        if hasattr(postfix, 'sm_lt_path'):  # is a realization
            fname = '%s-rlz-%03d.%s' % (prefix, postfix.ordinal, fmt)
        else:
            fname = '%s-%s.%s' % (prefix, postfix, fmt)
        return self.export_path(fname, export_dir)

[docs]    def export_csv(self, key):
        """
        Generic csv exporter
        """
        return write_csv(self.export_path(key, 'csv'), self[key])

[docs]    def flush(self):
        """Flush the underlying hdf5 file"""
        if self.parent != ():
            self.parent.flush()
        if self.hdf5:  # is open
            self.hdf5.flush()

[docs]    def close(self):
        """Close the underlying hdf5 file"""
        if self.parent != ():
            self.parent.flush()
            self.parent.close()
        if self.hdf5:  # is open
            self.hdf5.flush()
            self.hdf5.close()
            self.hdf5 = None

[docs]    def clear(self):
        """Remove the datastore from the file system"""
        self.close()
        os.remove(self.hdf5path)

[docs]    def getsize(self, key=None):
        """
        Return the size in byte of the output associated to the given key.
        If no key is given, returns the total size of all files.
        """
        if key is None:
            return os.path.getsize(self.hdf5path)
        return hdf5.ByteCounter.get_nbytes(
            h5py.File.__getitem__(self.hdf5, key))

[docs]    def get(self, key, default):
        """
        :returns: the value associated to the datastore key, or the default
        """
        try:
            return self[key]
        except KeyError:
            return default

    def __getitem__(self, key):
        try:
            val = self.hdf5[key]
        except KeyError:
            if self.parent != ():
                try:
                    val = self.parent[key]
                except KeyError:
                    raise KeyError(
                        'No %r found in %s and ancestors' % (key, self))
            else:
                raise KeyError('No %r found in %s' % (key, self))
        try:
            shape = val.shape
        except AttributeError:  # val is a group
            return val
        if not shape:
            val = pickle.loads(val.value)
        return val

    def __setitem__(self, key, value):
        if isinstance(value, dict) or hasattr(value, '__toh5__'):
            val = value
        elif (not isinstance(value, numpy.ndarray) or
                value.dtype is numpy.dtype(object)):
            val = numpy.array(pickle.dumps(value, pickle.HIGHEST_PROTOCOL))
        else:
            val = value
        if key in self.hdf5:
            # there is a bug in the current version of HDF5 for composite
            # arrays: is impossible to save twice the same key; so we remove
            # the key first, then it is possible to save it again
            del self[key]
        try:
            self.hdf5[key] = val
        except RuntimeError as exc:
            raise RuntimeError('Could not save %s: %s in %s' %
                               (key, exc, self.hdf5path))

    def __delitem__(self, key):
        del self.hdf5[key]

    def __enter__(self):
        self.was_close = self.hdf5 is None
        if self.was_close:
            self.open()
        return self

    def __exit__(self, etype, exc, tb):
        if self.was_close:  # and has been opened in __enter__, close it
            self.close()
        del self.was_close

    def __getstate__(self):
        # make the datastore pickleable
        return dict(mode='r',
                    parent=self.parent,
                    calc_id=self.calc_id,
                    hdf5=None,
                    hdf5path=self.hdf5path)

    def __iter__(self):
        if not self.hdf5:
            raise RuntimeError('%s is closed' % self)
        for path in sorted(self.hdf5):
            yield path

    def __contains__(self, key):
        return key in self.hdf5 or self.parent and key in self.parent.hdf5

    def __len__(self):
        return sum(1 for f in self)

    def __repr__(self):
        return '<%s %d>' % (self.__class__.__name__, self.calc_id)


[docs]def persistent_attribute(key):
    """
    Persistent attributes are persisted to the datastore and cached.
    Modifications to mutable objects are not automagically persisted.
    If you have a huge object that does not fit in memory use the datastore
    directory (for instance, open a HDF5 file to create an empty array, then
    populate it). Notice that you can use any dict-like data structure in
    place of the datastore, provided you can set attributes on it.
    Here is an example:

    >>> class Datastore(dict):
    ...     "A fake datastore"

    >>> class Store(object):
    ...     a = persistent_attribute('a')
    ...     def __init__(self, a):
    ...         self.datastore = Datastore()
    ...         self.a = a  # this assegnation will store the attribute

    >>> store = Store([1])
    >>> store.a  # this retrieves the attribute
    [1]
    >>> store.a.append(2)
    >>> store.a = store.a  # remember to store the modified attribute!

    :param key: the name of the attribute to be made persistent
    :returns: a property to be added to a class with a .datastore attribute
    """
    privatekey = '_' + key

    def getter(self):
        # Try to get the value from the privatekey attribute (i.e. from
        # the cache of the datastore); if not possible, get the value
        # from the datastore and set the cache; if not possible, get the
        # value from the parent and set the cache. If the value cannot
        # be retrieved, raise an AttributeError.
        try:
            return getattr(self.datastore, privatekey)
        except AttributeError:
            value = self.datastore[key]
            setattr(self.datastore, privatekey, value)
            return value

    def setter(self, value):
        # Update the datastore and the private key
        self.datastore[key] = value
        setattr(self.datastore, privatekey, value)

    return property(getter, setter)