Source code for openquake.calculators.event_based

# -*- coding: utf-8 -*-
# vim: tabstop=4 shiftwidth=4 softtabstop=4
#
# Copyright (C) 2015-2017 GEM Foundation
#
# OpenQuake is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OpenQuake is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with OpenQuake. If not, see <http://www.gnu.org/licenses/>.
from __future__ import division
import time
import math
import os.path
import operator
import itertools
import logging
import collections
import numpy

from openquake.baselib import hdf5
from openquake.baselib.python3compat import zip
from openquake.baselib.general import (
    AccumDict, block_splitter, humansize, split_in_slices)
from openquake.hazardlib.calc.filters import FarAwayRupture, SourceFilter
from openquake.hazardlib.gsim.base import ContextMaker
from openquake.hazardlib.probability_map import ProbabilityMap
from openquake.hazardlib.stats import compute_pmap_stats
from openquake.risklib.riskinput import str2rsi, rsi2str, indices_dt
from openquake.baselib import parallel
from openquake.commonlib import calc, util, readinput
from openquake.calculators import base
from openquake.calculators.getters import GmfGetter, EBRupture, RuptureGetter
from openquake.calculators.classical import (
    ClassicalCalculator, saving_sources_by_task)

U8 = numpy.uint8
U16 = numpy.uint16
U32 = numpy.uint32
U64 = numpy.uint64
F32 = numpy.float32
F64 = numpy.float64
TWO16 = 2 ** 16  # 65,536
TWO32 = 2 ** 32  # 4,294,967,296
TWO48 = 2 ** 48  # 281,474,976,710,656


# ######################## rupture calculator ############################ #


[docs]def set_eids(ebruptures):
    """
    Set event IDs on the given list of ebruptures.

    :param ebruptures: a non-empty list of ruptures with the same grp_id
    :returns: the total number of events set
    """
    if not ebruptures:
        return 0
    num_events = sum(ebr.multiplicity for ebr in ebruptures)
    for ebr in ebruptures:
        assert ebr.multiplicity < TWO32, ebr.multiplicity
        eids = U64(TWO32 * ebr.serial) + numpy.arange(
            ebr.multiplicity, dtype=U64)
        ebr.events['eid'] = eids
    return num_events


[docs]def compute_ruptures(sources, src_filter, gsims, param, monitor):
    """
    :param sources:
        List of commonlib.source.Source tuples
    :param src_filter:
        a source site filter
    :param gsims:
        a list of GSIMs for the current tectonic region model
    :param param:
        a dictionary of additional parameters
    :param monitor:
        monitor instance
    :returns:
        a dictionary src_group_id -> [Rupture instances]
    """
    # NB: by construction each block is a non-empty list with
    # sources of the same src_group_id
    grp_id = sources[0].src_group_id
    eb_ruptures = []
    calc_times = []
    rup_mon = monitor('making contexts', measuremem=False)
    # Compute and save stochastic event sets
    num_ruptures = 0
    cmaker = ContextMaker(gsims, src_filter.integration_distance)
    for src, s_sites in src_filter(sources):
        t0 = time.time()
        if s_sites is None:
            continue
        num_ruptures += src.num_ruptures
        num_occ_by_rup = sample_ruptures(
            src, param['ses_per_logic_tree_path'], sources.samples,
            param['seed'])
        # NB: the number of occurrences is very low, << 1, so it is
        # more efficient to filter only the ruptures that occur, i.e.
        # to call sample_ruptures *before* the filtering
        for ebr in _build_eb_ruptures(
                src, num_occ_by_rup, cmaker, s_sites, param['seed'], rup_mon):
            eb_ruptures.append(ebr)
        dt = time.time() - t0
        calc_times.append((src.id, dt))
    res = AccumDict({grp_id: eb_ruptures})
    res.num_events = set_eids(eb_ruptures)
    res.calc_times = calc_times
    res.eff_ruptures = {grp_id: num_ruptures}
    return res


[docs]def sample_ruptures(src, num_ses, num_samples, seed):
    """
    Sample the ruptures contained in the given source.

    :param src: a hazardlib source object
    :param num_ses: the number of Stochastic Event Sets to generate
    :param num_samples: how many samples for the given source
    :param seed: master seed from the job.ini file
    :returns: a dictionary of dictionaries rupture -> {ses_id: num_occurrences}
    """
    # the dictionary `num_occ_by_rup` contains a dictionary
    # ses_id -> num_occurrences for each occurring rupture
    num_occ_by_rup = collections.defaultdict(AccumDict)
    # generating ruptures for the given source
    for rup_no, rup in enumerate(src.iter_ruptures()):
        rup.seed = src.serial[rup_no] + seed
        numpy.random.seed(rup.seed)
        for sampleid in range(num_samples):
            for ses_idx in range(1, num_ses + 1):
                num_occurrences = rup.sample_number_of_occurrences()
                if num_occurrences:
                    num_occ_by_rup[rup] += {
                        (sampleid, ses_idx): num_occurrences}
        rup.rup_no = rup_no + 1
    return num_occ_by_rup


def _build_eb_ruptures(
        src, num_occ_by_rup, cmaker, s_sites, random_seed, rup_mon):
    """
    Filter the ruptures stored in the dictionary num_occ_by_rup and
    yield pairs (rupture, <list of associated EBRuptures>)
    """
    for rup in sorted(num_occ_by_rup, key=operator.attrgetter('rup_no')):
        with rup_mon:
            try:
                rup.ctx = cmaker.make_contexts(s_sites, rup)
                indices = rup.ctx[0].sids
            except FarAwayRupture:
                # ignore ruptures which are far away
                del num_occ_by_rup[rup]  # save memory
                continue

        # creating EBRuptures
        serial = rup.seed - random_seed + 1
        events = []
        for (sampleid, ses_idx), num_occ in sorted(
                num_occ_by_rup[rup].items()):
            for _ in range(num_occ):
                # NB: the 0 below is a placeholder; the right eid will be
                # set a bit later, in set_eids
                events.append((0, src.src_group_id, ses_idx, sampleid))
        if events:
            yield EBRupture(rup, indices, numpy.array(events, calc.event_dt),
                            serial)


[docs]def get_events(ebruptures):
    """
    Extract an array of dtype stored_event_dt from a list of EBRuptures
    """
    events = []
    year = 0  # to be set later
    for ebr in ebruptures:
        for event in ebr.events:
            rec = (event['eid'], ebr.serial, ebr.grp_id, year, event['ses'],
                   event['sample'])
            events.append(rec)
    return numpy.array(events, readinput.stored_event_dt)


[docs]@base.calculators.add('event_based_rupture')
class EventBasedRuptureCalculator(base.HazardCalculator):
    """
    Event based PSHA calculator generating the ruptures only
    """
    core_task = compute_ruptures
    is_stochastic = True

[docs]    def init(self):
        """
        Set the random seed passed to the SourceManager and the
        minimum_intensity dictionary.
        """
        oq = self.oqparam
        self.min_iml = self.get_min_iml(oq)
        self.rupser = calc.RuptureSerializer(self.datastore)

[docs]    def zerodict(self):
        """
        Initial accumulator, a dictionary (grp_id, gsim) -> curves
        """
        zd = AccumDict()
        zd.calc_times = []
        zd.eff_ruptures = AccumDict()
        self.grp_trt = self.csm.info.grp_by("trt")
        return zd

[docs]    def agg_dicts(self, acc, ruptures_by_grp_id):
        """
        Accumulate dictionaries of ruptures and populate the `events`
        dataset in the datastore.

        :param acc: accumulator dictionary
        :param ruptures_by_grp_id: a nested dictionary grp_id -> ruptures
        """
        if hasattr(ruptures_by_grp_id, 'calc_times'):
            acc.calc_times.extend(ruptures_by_grp_id.calc_times)
        if hasattr(ruptures_by_grp_id, 'eff_ruptures'):
            acc.eff_ruptures += ruptures_by_grp_id.eff_ruptures
        acc += ruptures_by_grp_id
        self.save_ruptures(ruptures_by_grp_id)
        return acc

[docs]    def save_ruptures(self, ruptures_by_grp_id):
        """
        Extend the 'events' dataset with the events from the given ruptures;
        also, save the ruptures if the flag `save_ruptures` is on.

        :param ruptures_by_grp_id: a dictionary grp_id -> list of EBRuptures
        """
        with self.monitor('saving ruptures', autoflush=True):
            for grp_id, ebrs in ruptures_by_grp_id.items():
                if len(ebrs):
                    events = get_events(ebrs)
                    dset = self.datastore.extend('events', events)
                    if self.oqparam.save_ruptures:
                        self.rupser.save(ebrs, eidx=len(dset)-len(events))

[docs]    def gen_args(self, csm, monitor):
        """
        Used in the case of large source model logic trees.

        :param monitor: a :class:`openquake.baselib.performance.Monitor`
        :param csm: a reduced CompositeSourceModel
        :yields: (sources, sites, gsims, monitor) tuples
        """
        oq = self.oqparam
        src_filter = SourceFilter(self.sitecol, oq.maximum_distance)
        csm = self.csm.filter(src_filter)

        def weight(src):
            return src.num_ruptures * src.RUPTURE_WEIGHT
        maxweight = math.ceil(
            sum(weight(src) for src in self.csm.get_sources()) /
            (oq.concurrent_tasks or 1))
        logging.info('Using maxweight=%d', maxweight)
        param = dict(
            truncation_level=oq.truncation_level,
            imtls=oq.imtls, seed=oq.ses_seed,
            maximum_distance=oq.maximum_distance,
            ses_per_logic_tree_path=oq.ses_per_logic_tree_path)

        num_tasks = 0
        num_sources = 0
        for sm in csm.source_models:
            for sg in sm.src_groups:
                gsims = csm.info.gsim_lt.get_gsims(sg.trt)
                csm.add_infos(sg.sources)
                for block in csm.split_in_blocks(
                        maxweight, sg.sources, weight):
                    block.samples = sm.samples
                    yield block, src_filter, gsims, param, monitor
                    num_tasks += 1
                    num_sources += len(block)
        logging.info('Sent %d sources in %d tasks', num_sources, num_tasks)

[docs]    def execute(self):
        mutex_groups = list(self.csm.gen_mutex_groups())
        assert not mutex_groups, 'Mutex sources are not implemented!'
        with self.monitor('managing sources', autoflush=True):
            allargs = self.gen_args(self.csm, self.monitor('classical'))
            iterargs = saving_sources_by_task(allargs, self.datastore)
            if isinstance(allargs, list):
                # there is a trick here: if the arguments are known
                # (a list, not an iterator), keep them as a list
                # then the Starmap will understand the case of a single
                # argument tuple and it will run in core the task
                iterargs = list(iterargs)
            acc = parallel.Starmap(self.core_task.__func__, iterargs).reduce(
                self.agg_dicts, self.zerodict())
        with self.monitor('store source_info', autoflush=True):
            self.store_source_info(self.csm.infos, acc)
        return acc

[docs]    def post_execute(self, result):
        """
        Save the SES collection
        """
        self.rupser.close()
        num_events = sum(set_counts(self.datastore, 'events').values())
        if num_events == 0:
            raise RuntimeError(
                'No seismic events! Perhaps the investigation time is too '
                'small or the maximum_distance is too small')
        num_ruptures = sum(len(ruptures) for ruptures in result.values())
        logging.info('Setting %d event years on %d ruptures',
                     num_events, num_ruptures)
        with self.monitor('setting event years', measuremem=True,
                          autoflush=True):
            numpy.random.seed(self.oqparam.ses_seed)
            set_random_years(self.datastore, 'events',
                             int(self.oqparam.investigation_time))


[docs]def set_counts(dstore, dsetname):
    """
    :param dstore: a DataStore instance
    :dsetname: name of dataset with a field `grp_id`
    :returns: a dictionary grp_id > counts
    """
    groups = dstore[dsetname]['grp_id']
    unique, counts = numpy.unique(groups, return_counts=True)
    dic = dict(zip(unique, counts))
    dstore.set_attrs(dsetname, by_grp=sorted(dic.items()))
    return dic


[docs]def set_random_years(dstore, name, investigation_time):
    """
    Set on the `events` dataset year labels sensitive to the
    SES ordinal and the investigation time.
    """
    events = dstore[name].value
    years = numpy.random.choice(investigation_time, len(events)) + 1
    year_of = dict(zip(numpy.sort(events['eid']), years))  # eid -> year
    for event in events:
        idx = event['ses'] - 1  # starts from 0
        event['year'] = idx * investigation_time + year_of[event['eid']]
    dstore[name] = events


# ######################## GMF calculator ############################ #


[docs]def compute_gmfs_and_curves(getters, oq, monitor):
    """
    :param getters:
        a list of GmfGetter instances
    :param oq:
        an OqParam instance
    :param monitor:
        a Monitor instance
    :returns:
        a list of dictionaries with keys gmfcoll and hcurves
    """
    results = []
    for getter in getters:
        with monitor('GmfGetter.init', measuremem=True):
            getter.init()
        hcurves = {}  # key -> poes
        if oq.hazard_curves_from_gmfs:
            hc_mon = monitor('building hazard curves', measuremem=False)
            duration = oq.investigation_time * oq.ses_per_logic_tree_path
            with monitor('building hazard', measuremem=True):
                gmfdata = numpy.fromiter(getter.gen_gmv(), getter.gmf_data_dt)
                hazard = getter.get_hazard(data=gmfdata)
            for sid, hazardr in zip(getter.sids, hazard):
                for rlzi, array in hazardr.items():
                    if len(array) == 0:  # no data
                        continue
                    with hc_mon:
                        gmvs = array['gmv']
                        for imti, imt in enumerate(getter.imtls):
                            poes = calc._gmvs_to_haz_curve(
                                gmvs[:, imti], oq.imtls[imt],
                                oq.investigation_time, duration)
                            hcurves[rsi2str(rlzi, sid, imt)] = poes
        else:  # fast lane
            with monitor('building hazard', measuremem=True):
                gmfdata = numpy.fromiter(getter.gen_gmv(), getter.gmf_data_dt)
        indices = []
        if oq.ground_motion_fields:
            gmfdata.sort(order=('sid', 'rlzi', 'eid'))
            start = stop = 0
            for sid, rows in itertools.groupby(gmfdata['sid']):
                for row in rows:
                    stop += 1
                indices.append((sid, start, stop))
                start = stop
        else:
            gmfdata = None
        res = dict(gmfdata=gmfdata, hcurves=hcurves, gmdata=getter.gmdata,
                   taskno=monitor.task_no,
                   indices=numpy.array(indices, (U32, 3)))
        if len(getter.gmdata):
            results.append(res)
    return results


[docs]def save_gmdata(calc, n_rlzs):
    """
    Save a composite array `gmdata` in the datastore.

    :param calc: a calculator with a dictionary .gmdata {rlz: data}
    :param n_rlzs: the total number of realizations
    """
    n_sites = len(calc.sitecol)
    dtlist = ([(imt, F32) for imt in calc.oqparam.imtls] +
              [('events', U32), ('nbytes', U32)])
    array = numpy.zeros(n_rlzs, dtlist)
    for rlzi in sorted(calc.gmdata):
        data = calc.gmdata[rlzi]  # (imts, events, nbytes)
        events = data[-2]
        nbytes = data[-1]
        gmv = data[:-2] / events / n_sites
        array[rlzi] = tuple(gmv) + (events, nbytes)
    calc.datastore['gmdata'] = array
    logging.info('Generated %s of GMFs', humansize(array['nbytes'].sum()))


[docs]def update_nbytes(dstore, key, array):
    nbytes = dstore.get_attr(key, 'nbytes', 0)
    dstore.set_attrs(key, nbytes=nbytes + array.nbytes)


[docs]@base.calculators.add('event_based')
class EventBasedCalculator(base.HazardCalculator):
    """
    Event based PSHA calculator generating the ground motion fields and
    the hazard curves from the ruptures, depending on the configuration
    parameters.
    """
    pre_calculator = 'event_based_rupture'
    core_task = compute_gmfs_and_curves
    is_stochastic = True

[docs]    def combine_pmaps_and_save_gmfs(self, acc, results):
        """
        Combine the hazard curves (if any) and save the gmfs (if any)
        sequentially; notice that the gmfs may come from
        different tasks in any order.

        :param acc: an accumulator for the hazard curves
        :param results: dictionaries rlzi, imt -> [gmf_array, curves_by_imt]
        :returns: a new accumulator
        """
        sav_mon = self.monitor('saving gmfs')
        agg_mon = self.monitor('aggregating hcurves')
        hdf5path = self.datastore.hdf5path
        for res in results:
            self.gmdata += res['gmdata']
            data = res['gmfdata']
            if data is not None:
                with sav_mon:
                    hdf5.extend3(hdf5path, 'gmf_data/data', data)
                    # it is important to save the number of bytes while the
                    # computation is going, to see the progress
                    update_nbytes(self.datastore, 'gmf_data/data', data)
                    for sid, start, stop in res['indices']:
                        self.indices[sid].append(
                            (start + self.offset, stop + self.offset))
                    self.offset += len(data)
            slicedic = self.oqparam.imtls.slicedic
            with agg_mon:
                for key, poes in res['hcurves'].items():
                    r, sid, imt = str2rsi(key)
                    array = acc[r].setdefault(sid, 0).array[slicedic[imt], 0]
                    array[:] = 1. - (1. - array) * (1. - poes)
            sav_mon.flush()
            agg_mon.flush()
            self.datastore.flush()
            if 'ruptures' in res:
                vars(EventBasedRuptureCalculator)['save_ruptures'](
                    self, res['ruptures'])
        return acc

[docs]    def gen_args(self):
        """
        :yields: the arguments for compute_gmfs_and_curves
        """
        oq = self.oqparam
        monitor = self.monitor(self.core_task.__name__)
        imts = list(oq.imtls)
        min_iml = self.get_min_iml(oq)
        correl_model = oq.get_correl_model()
        try:
            csm_info = self.csm.info
        except AttributeError:  # no csm
            csm_info = self.datastore['csm_info']
        samples_by_grp = csm_info.get_samples_by_grp()
        rlzs_by_gsim = {grp_id: self.rlzs_assoc.get_rlzs_by_gsim(grp_id)
                        for grp_id in samples_by_grp}
        if self.precalc:
            num_ruptures = sum(len(rs) for rs in self.precalc.result.values())
            block_size = math.ceil(num_ruptures / (oq.concurrent_tasks or 1))
            for grp_id, ruptures in self.precalc.result.items():
                if not ruptures:
                    continue
                for block in block_splitter(ruptures, block_size):
                    getter = GmfGetter(
                        rlzs_by_gsim[grp_id], block, self.sitecol,
                        imts, min_iml, oq.maximum_distance,
                        oq.truncation_level, correl_model,
                        samples_by_grp[grp_id])
                    yield [getter], oq, monitor
            return
        U = len(self.datastore['ruptures'])
        logging.info('Found %d ruptures', U)
        parent = self.can_read_parent() or self.datastore
        for slc in split_in_slices(U, oq.concurrent_tasks or 1):
            getters = []
            for grp_id in rlzs_by_gsim:
                ruptures = RuptureGetter(parent, slc, grp_id)
                if parent is self.datastore:  # not accessible parent
                    ruptures = list(ruptures)
                    if not ruptures:
                        continue
                getters.append(GmfGetter(
                    rlzs_by_gsim[grp_id], ruptures, self.sitecol,
                    imts, min_iml, oq.maximum_distance, oq.truncation_level,
                    correl_model, samples_by_grp[grp_id]))
            yield getters, oq, monitor

[docs]    def execute(self):
        """
        Run in parallel `core_task(sources, sitecol, monitor)`, by
        parallelizing on the ruptures according to their weight and
        tectonic region type.
        """
        oq = self.oqparam
        if not oq.hazard_curves_from_gmfs and not oq.ground_motion_fields:
            return
        if self.oqparam.ground_motion_fields:
            calc.check_overflow(self)

        self.csm_info = self.datastore['csm_info']
        self.sm_id = {tuple(sm.path): sm.ordinal
                      for sm in self.csm_info.source_models}
        L = len(oq.imtls.array)
        R = self.datastore['csm_info'].get_num_rlzs()
        self.gmdata = {}
        self.offset = 0
        self.indices = collections.defaultdict(list)  # sid -> indices
        acc = parallel.Starmap(
            self.core_task.__func__, self.gen_args()
        ).reduce(self.combine_pmaps_and_save_gmfs, {
            r: ProbabilityMap(L) for r in range(R)})
        save_gmdata(self, R)
        if self.indices:
            logging.info('Saving gmf_data/indices')
            with self.monitor('saving gmf_data/indices', measuremem=True,
                              autoflush=True):
                self.datastore.save_vlen(
                    'gmf_data/indices',
                    [numpy.array(self.indices[sid], indices_dt)
                     for sid in self.sitecol.complete.sids])
        return acc

[docs]    def save_gmf_bytes(self):
        """Save the attribute nbytes in the gmf_data datasets"""
        ds = self.datastore
        for sm_id in ds['gmf_data']:
            ds.set_nbytes('gmf_data/' + sm_id)
        ds.set_nbytes('gmf_data')

[docs]    def post_execute(self, result):
        """
        :param result:
            a dictionary (src_group_id, gsim) -> haz_curves or an empty
            dictionary if hazard_curves_from_gmfs is false
        """
        oq = self.oqparam
        if not oq.hazard_curves_from_gmfs and not oq.ground_motion_fields:
            return
        elif oq.hazard_curves_from_gmfs:
            rlzs = self.rlzs_assoc.realizations
            # save individual curves
            for i in sorted(result):
                key = 'hcurves/rlz-%03d' % i
                if result[i]:
                    self.datastore[key] = result[i]
                else:
                    self.datastore[key] = ProbabilityMap(oq.imtls.array.size)
                    logging.info('Zero curves for %s', key)
            # compute and save statistics; this is done in process
            # we don't need to parallelize, since event based calculations
            # involves a "small" number of sites (<= 65,536)
            weights = [rlz.weight for rlz in rlzs]
            hstats = self.oqparam.hazard_stats()
            if len(hstats) and len(rlzs) > 1:
                for kind, stat in hstats:
                    pmap = compute_pmap_stats(result.values(), [stat], weights)
                    self.datastore['hcurves/' + kind] = pmap
        if self.datastore.parent:
            self.datastore.parent.open()
        if 'gmf_data' in self.datastore:
            self.save_gmf_bytes()
        if oq.compare_with_classical:  # compute classical curves
            export_dir = os.path.join(oq.export_dir, 'cl')
            if not os.path.exists(export_dir):
                os.makedirs(export_dir)
            oq.export_dir = export_dir
            # one could also set oq.number_of_logic_tree_samples = 0
            self.cl = ClassicalCalculator(oq, self.monitor('classical'))
            # TODO: perhaps it is possible to avoid reprocessing the source
            # model, however usually this is quite fast and do not dominate
            # the computation
            self.cl.run(close=False)
            cl_mean_curves = get_mean_curves(self.cl.datastore)
            eb_mean_curves = get_mean_curves(self.datastore)
            for imt in eb_mean_curves.dtype.names:
                rdiff, index = util.max_rel_diff_index(
                    cl_mean_curves[imt], eb_mean_curves[imt])
                logging.warn('Relative difference with the classical '
                             'mean curves for IMT=%s: %d%% at site index %d',
                             imt, rdiff * 100, index)


[docs]def get_mean_curves(dstore):
    """
    Extract the mean hazard curves from the datastore, as a composite
    array of length nsites.
    """
    imtls = dstore['oqparam'].imtls
    nsites = len(dstore['sitecol'])
    hcurves = dstore['hcurves']
    if 'mean' in hcurves:
        mean = dstore['hcurves/mean']
    elif len(hcurves) == 1:  # there is a single realization
        mean = dstore['hcurves/rlz-0000']
    return mean.convert(imtls, nsites)