Source code for openquake.baselib.datastore

# -*- coding: utf-8 -*-
# vim: tabstop=4 shiftwidth=4 softtabstop=4
#
# Copyright (C) 2015-2017 GEM Foundation
#
# OpenQuake is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OpenQuake is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with OpenQuake. If not, see <http://www.gnu.org/licenses/>.

import os
import re
import getpass
import collections
try:
    from collections.abc import MutableMapping
except ImportError:  # Python 2
    from collections import MutableMapping
import h5py

from openquake.baselib import hdf5, config


def get_datadir():
    """
    Return the path of the directory where the openquake data are stored.

    The lookup order is:

    1. the environment variable $OQ_DATADIR, if set and not empty;
    2. `<shared_dir>/<user name>/oqdata`, if `shared_dir` is set in the
       configuration file;
    3. `~/oqdata`, i.e. a directory in the home of the current user.
    """
    from_env = os.environ.get('OQ_DATADIR')
    if from_env:
        return from_env
    shared = config.directory.shared_dir
    if shared:
        return os.path.join(shared, getpass.getuser(), 'oqdata')
    # no shared directory configured: use the home of the user
    return os.path.join(os.path.expanduser('~'), 'oqdata')
def get_calc_ids(datadir=None):
    """
    Return the sorted list of calculation IDs found in the datadir.

    :param datadir: directory to scan (default: the one from get_datadir())
    :returns: a list of integers, empty if the directory does not exist

    A file contributes an ID if its name starts with `calc_<digits>.hdf5`.
    """
    datadir = datadir or get_datadir()
    if not os.path.exists(datadir):
        return []
    matches = (re.match(r'calc_(\d+)\.hdf5', fname)
               for fname in os.listdir(datadir))
    return sorted(int(match.group(1)) for match in matches if match)
def get_last_calc_id(datadir=None):
    """
    Return the highest calculation ID found in the given directory.

    :param datadir: directory to scan (default: the one from get_datadir())
    :returns: the latest calculation ID, or 0 if there are no calculations
    """
    datadir = datadir or get_datadir()
    calc_ids = get_calc_ids(datadir)  # sorted in ascending order
    return calc_ids[-1] if calc_ids else 0
def hdf5new(datadir=None):
    """
    Return a new `hdf5.File` instance opened in write mode.

    The file is named `calc_<N>.hdf5`, where N is the ID of the last
    calculation in the datadir plus one. The generated filename is also
    stored in the `.path` attribute of the returned object.

    :param datadir: directory where to create the file
                    (default: the one from get_datadir())
    """
    datadir = datadir or get_datadir()
    next_id = get_last_calc_id(datadir) + 1
    path = os.path.join(datadir, 'calc_%d.hdf5' % next_id)
    h5file = hdf5.File(path, 'w')
    h5file.path = path
    return h5file
def extract_calc_id_datadir(hdf5path, datadir=None):
    """
    Extract the calculation ID from the given hdf5path or integer:

    >>> extract_calc_id_datadir('/mnt/ssd/oqdata/calc_25.hdf5')
    (25, '/mnt/ssd/oqdata')
    >>> extract_calc_id_datadir('/mnt/ssd/oqdata/wrong_name.hdf5')
    Traceback (most recent call last):
       ...
    ValueError: Cannot extract calc_id from /mnt/ssd/oqdata/wrong_name.hdf5

    :param hdf5path:
        None (meaning "a new datastore"), an integer-like calculation ID,
        or the path to a file named `calc_<ID>.hdf5`
    :param datadir:
        directory to use when `hdf5path` is not a path
        (default: the one from get_datadir())
    :returns:
        a pair (calc_id, datadir); when a path is given the datadir is
        the directory containing the file
    :raises ValueError:
        if `hdf5path` is a path whose basename does not look like
        `calc_<ID>.hdf5`
    """
    datadir = datadir or get_datadir()
    if hdf5path is None:  # use a new datastore
        return get_last_calc_id(datadir) + 1, datadir
    try:
        calc_id = int(hdf5path)
    except (TypeError, ValueError):
        # not an integer-like ID: treat it as a path. Only conversion
        # errors are caught so that KeyboardInterrupt/SystemExit propagate
        datadir = os.path.dirname(hdf5path)
        mo = re.match(r'calc_(\d+)\.hdf5', os.path.basename(hdf5path))
        if mo is None:
            raise ValueError('Cannot extract calc_id from %s' % hdf5path)
        calc_id = int(mo.group(1))
    return calc_id, datadir
def read(calc_id, mode='r', datadir=None):
    """
    Read the datastore, if it exists and it is accessible.

    If the stored `oqparam` has a nonzero `hazard_calculation_id`, the
    corresponding datastore is read too (recursively) and attached as
    `.parent`.

    :param calc_id: calculation ID or hdf5path
    :param mode: 'r' or 'w'
    :param datadir: the directory where to look
    :returns: the corresponding DataStore instance
    """
    datadir = datadir or get_datadir()
    dstore = DataStore(calc_id, datadir, mode=mode)
    try:
        parent_id = dstore['oqparam'].hazard_calculation_id
    except KeyError:  # no oqparam, hence no parent
        return dstore
    if parent_id:
        # TODO: we will need to store the parent directory to be able
        # to use hazard calculations generated by another user;
        # for the moment we assume that the datadir is the same for
        # parent and child calculations
        dstore.parent = read(parent_id, datadir=datadir)
    return dstore
class DataStore(MutableMapping):
    """
    DataStore class to store the inputs/outputs of a calculation on the
    filesystem.

    Here is a minimal example of usage:

    >>> ds = DataStore()
    >>> ds['example'] = 42
    >>> print(ds['example'].value)
    42
    >>> ds.clear()

    When reading the items, the DataStore will return a generator. The
    items will be ordered lexicographically according to their name.

    There is a serialization protocol to store objects in the datastore.
    An object is serializable if it has a method `__toh5__` returning
    an array and a dictionary, and a method `__fromh5__` taking an array
    and a dictionary and populating the object.
    For an example of use see :class:`openquake.hazardlib.site.SiteCollection`.

    A datastore can have a `.parent` datastore; the empty tuple `()` is
    the sentinel meaning "no parent". Lookups that miss in this datastore
    fall back to the parent (and recursively to its ancestors).
    """
    def __init__(self, calc_id=None, datadir=None, params=(), mode=None):
        """
        :param calc_id:
            None (new datastore), a calculation ID, a negative index into
            the existing calculations (-1 is the last one) or an hdf5 path
        :param datadir: directory containing the .hdf5 files
        :param params: stored as-is in the `.params` attribute
        :param mode: mode used to open the file; None means automatic
        :raises IndexError: if a negative `calc_id` is out of range
        :raises IOError: if mode is 'r' and the file does not exist
        """
        datadir = datadir or get_datadir()
        calc_id, datadir = extract_calc_id_datadir(calc_id, datadir)
        if not os.path.exists(datadir):
            os.makedirs(datadir)
        if calc_id < 0:  # use an old datastore
            calc_ids = get_calc_ids(datadir)
            try:
                self.calc_id = calc_ids[calc_id]
            except IndexError:
                raise IndexError('There are %d old calculations, cannot '
                                 'retrieve the %s' % (len(calc_ids), calc_id))
        else:  # use the given datastore
            self.calc_id = calc_id
        self.params = params
        self.mode = mode
        self.parent = ()  # can be set later
        self.datadir = datadir
        self.calc_dir = os.path.join(datadir, 'calc_%s' % self.calc_id)
        self.hdf5path = self.calc_dir + '.hdf5'
        if mode == 'r' and not os.path.exists(self.hdf5path):
            raise IOError('File not found: %s' % self.hdf5path)
        self.hdf5 = None
        self.open()

    def open(self):
        """
        Open the underlying .hdf5 file and the parent, if any
        """
        if self.hdf5 is None:  # not already open
            # an existing file is opened with the requested mode ('r+' if
            # none was given); a missing file is always created with 'w'
            if os.path.exists(self.hdf5path):
                mode = self.mode or 'r+'
            else:
                mode = 'w'
            self.hdf5 = hdf5.File(self.hdf5path, mode, libver='latest')
            if self.parent != () and self.parent.hdf5 is None:
                self.parent.open()

    @property
    def export_dir(self):
        """
        Return the underlying export directory
        """
        # an explicitly set directory wins over the one in oqparam
        edir = getattr(self, '_export_dir', None) or self['oqparam'].export_dir
        return edir

    @export_dir.setter
    def export_dir(self, value):
        """
        Set the export directory
        """
        self._export_dir = value

    def getitem(self, name):
        """
        Return a dataset by using h5py.File.__getitem__
        """
        return h5py.File.__getitem__(self.hdf5, name)

    def set_nbytes(self, key, nbytes=None):
        """
        Set the `nbytes` attribute on the HDF5 object identified by `key`.
        """
        return self.hdf5.set_nbytes(key, nbytes)

    def set_attrs(self, key, **kw):
        """
        Set the HDF5 attributes of the given key
        """
        attrs = h5py.File.__getitem__(self.hdf5, key).attrs
        for k, v in kw.items():
            attrs[k] = v

    def get_attr(self, key, name, default=None):
        """
        :param key: dataset path
        :param name: name of the attribute
        :param default: value to return if the attribute is missing
        :raises KeyError:
            if the dataset is missing here and in the ancestors, or if
            the attribute is missing and `default` is None
        """
        try:
            obj = h5py.File.__getitem__(self.hdf5, key)
        except KeyError:
            if self.parent != ():
                return self.parent.get_attr(key, name, default)
            else:
                raise
        try:
            return obj.attrs[name]
        except KeyError:
            if default is None:  # None means "no default provided"
                raise
            return default

    def get_attrs(self, key):
        """
        :param key: dataset path
        :returns: dictionary of attributes for that path
        """
        return dict(h5py.File.__getitem__(self.hdf5, key).attrs)

    def create_dset(self, key, dtype, shape=(None,), compression=None,
                    fillvalue=0, attrs=None):
        """
        Create a one-dimensional HDF5 dataset.

        :param key: name of the dataset
        :param dtype: dtype of the dataset (usually composite)
        :param shape: shape of the dataset, possibly extendable
        :param compression: the kind of HDF5 compression to use
        :param fillvalue: passed through to `hdf5.create`
        :param attrs: dictionary of attributes of the dataset
        :returns: a HDF5 dataset
        """
        return hdf5.create(
            self.hdf5, key, dtype, shape, compression, fillvalue, attrs)

    def save_vlen(self, key, data):
        """
        Save a sequence of variable-length arrays

        :param key: name of the dataset
        :param data: data to store as vlen arrays (must not be empty,
                     since the dtype is taken from the first element)
        """
        dt = data[0].dtype
        dset = self.create_dset(
            key, h5py.special_dtype(vlen=dt), (len(data),), fillvalue=None)
        nbytes = 0
        totlen = 0
        for i, val in enumerate(data):
            dset[i] = val
            nbytes += val.nbytes
            totlen += len(val)
        self.set_attrs(key, nbytes=nbytes, avg_len=totlen / len(data))
        self.flush()

    def extend(self, key, array, **attrs):
        """
        Extend the dataset associated to the given key; create it if needed

        :param key: name of the dataset
        :param array: array to store
        :param attrs: a dictionary of attributes
        :returns: the extended dataset
        """
        try:
            dset = self.hdf5[key]
        except KeyError:
            # first dimension extendable, the others taken from the array
            dset = hdf5.create(self.hdf5, key, array.dtype,
                               shape=(None,) + array.shape[1:])
        hdf5.extend(dset, array)
        for k, v in attrs.items():
            dset.attrs[k] = v
        return dset

    def save(self, key, kw):
        """
        Update the object associated to `key` with the `kw` dictionary;
        works for LiteralAttrs objects and automatically flushes.
        """
        if key not in self:
            obj = hdf5.LiteralAttrs()
        else:
            obj = self[key]
        vars(obj).update(kw)
        self[key] = obj
        self.flush()

    def export_path(self, relname, export_dir=None):
        """
        Return the path of the exported file by adding the export_dir in
        front, the calculation ID at the end.

        :param relname: relative file name (must contain an extension)
        :param export_dir: export directory (if None use .export_dir)
        """
        # removing inner slashes to avoid creating intermediate directories
        name, ext = relname.replace('/', '-').rsplit('.', 1)
        newname = '%s_%s.%s' % (name, self.calc_id, ext)
        if export_dir is None:
            export_dir = self.export_dir
        return os.path.join(export_dir, newname)

    def build_fname(self, prefix, postfix, fmt, export_dir=None):
        """
        Build a file name from a realization, by using prefix and extension.

        :param prefix: the prefix to use
        :param postfix: the postfix to use (can be a realization object)
        :param fmt: the extension ('csv', 'xml', etc)
        :param export_dir: export directory (if None use .export_dir)
        :returns: relative pathname including the extension
        """
        if hasattr(postfix, 'sm_lt_path'):  # is a realization
            fname = '%s-rlz-%03d.%s' % (prefix, postfix.ordinal, fmt)
        else:
            fname = '%s-%s.%s' % (prefix, postfix, fmt)
        return self.export_path(fname, export_dir)

    def flush(self):
        """Flush the underlying hdf5 file"""
        if self.parent != ():
            self.parent.flush()
        if self.hdf5:  # is open
            self.hdf5.flush()

    def close(self):
        """Close the underlying hdf5 file"""
        if self.parent != ():
            self.parent.flush()
            self.parent.close()
        if self.hdf5:  # is open
            self.hdf5.flush()
            self.hdf5.close()
            self.hdf5 = None

    def clear(self):
        """Remove the datastore from the file system"""
        self.close()
        os.remove(self.hdf5path)

    def getsize(self, key=None):
        """
        Return the size in byte of the output associated to the given key.
        If no key is given, returns the total size of all files.
        """
        if key is None:
            return os.path.getsize(self.hdf5path)
        return hdf5.ByteCounter.get_nbytes(
            h5py.File.__getitem__(self.hdf5, key))

    def get(self, key, default=None):
        """
        :param key: a datastore key
        :param default: value returned when the key is missing (None if
                        not given, as for `dict.get`)
        :returns: the value associated to the datastore key, or the default
        """
        try:
            return self[key]
        except KeyError:
            return default

    def __getitem__(self, key):
        try:
            val = self.hdf5[key]
        except KeyError:
            if self.parent != ():
                try:
                    val = self.parent[key]
                except KeyError:
                    raise KeyError(
                        'No %r found in %s and ancestors' % (key, self))
            else:
                raise KeyError('No %r found in %s' % (key, self))
        return val

    def __setitem__(self, key, val):
        if key in self.hdf5:
            # there is a bug in the current version of HDF5 for composite
            # arrays: is impossible to save twice the same key; so we remove
            # the key first, then it is possible to save it again
            del self[key]
        try:
            self.hdf5[key] = val
        except RuntimeError as exc:
            raise RuntimeError('Could not save %s: %s in %s' %
                               (key, exc, self.hdf5path))

    def __delitem__(self, key):
        del self.hdf5[key]

    def __enter__(self):
        # remember the initial state, so that __exit__ can restore it
        self.was_close = self.hdf5 is None
        if self.was_close:
            self.open()
        return self

    def __exit__(self, etype, exc, tb):
        if self.was_close:  # and has been opened in __enter__, close it
            self.close()
        del self.was_close

    def __getstate__(self):
        # make the datastore pickleable: the open file handle is dropped
        return dict(mode='r',
                    parent=self.parent,
                    calc_id=self.calc_id,
                    hdf5=None,
                    hdf5path=self.hdf5path)

    def __iter__(self):
        if not self.hdf5:
            raise RuntimeError('%s is closed' % self)
        for path in sorted(self.hdf5):
            yield path

    def __contains__(self, key):
        # compare the parent with the `()` sentinel instead of truth-testing
        # it (which would call __len__ and iterate over the whole parent);
        # delegating to the parent's __contains__ searches all the ancestors,
        # consistently with __getitem__
        return key in self.hdf5 or (
            self.parent != () and key in self.parent)

    def __len__(self):
        return sum(1 for f in self)

    def __hash__(self):
        return self.calc_id

    def __repr__(self):
        status = 'open' if self.hdf5 else 'close'
        return '<%s %d, %s>' % (self.__class__.__name__, self.calc_id, status)
def persistent_attribute(key):
    """
    Persistent attributes are persisted to the datastore and cached.
    Modifications to mutable objects are not automagically persisted.
    If you have a huge object that does not fit in memory use the datastore
    directory (for instance, open a HDF5 file to create an empty array, then
    populate it). Notice that you can use any dict-like data structure in
    place of the datastore, provided you can set attributes on it.
    Here is an example:

    >>> class Datastore(dict):
    ...     "A fake datastore"

    >>> class Store(object):
    ...     a = persistent_attribute('a')
    ...     def __init__(self, a):
    ...         self.datastore = Datastore()
    ...         self.a = a  # this assignment will store the attribute

    >>> store = Store([1])
    >>> store.a  # this retrieves the attribute
    [1]
    >>> store.a.append(2)
    >>> store.a = store.a  # remember to store the modified attribute!

    :param key: the name of the attribute to be made persistent
    :returns: a property to be added to a class with a .datastore attribute
    """
    cache_name = '_' + key  # attribute of the datastore used as a cache

    def load(self):
        # Return the cached value if there is one; otherwise look the key
        # up in the datastore, cache the result and return it. Whatever
        # error the datastore lookup raises is propagated to the caller.
        dstore = self.datastore
        try:
            return getattr(dstore, cache_name)
        except AttributeError:
            loaded = dstore[key]
            setattr(dstore, cache_name, loaded)
            return loaded

    def store(self, value):
        # Write through: update the datastore first, then the cache
        dstore = self.datastore
        dstore[key] = value
        setattr(dstore, cache_name, value)

    return property(fget=load, fset=store)