# Source code for smdc_perftests.performance_tests.test_cases

# Copyright (c) 2013,Vienna University of Technology,
# Department of Geodesy and Geoinformation
# All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in the
#      documentation and/or other materials provided with the distribution.
#    * Neither the name of the Vienna University of Technology,
#      Department of Geodesy and Geoinformation nor the
#      names of its contributors may be used to endorse or promote products
#      derived from this software without specific prior written permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL VIENNA UNIVERSITY OF TECHNOLOGY,
# DEPARTMENT OF GEODESY AND GEOINFORMATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

'''
This module contains functions that
run tests according to specifications from SMDC Performance comparison
document.

Interfaces to data should be interchangeable as long as they adhere
to interface specifications from rsdata module

Created on Tue Oct 21 13:37:58 2014

@author: christoph.paulik@geo.tuwien.ac.at
'''
import time
import random
import numpy as np
from scipy.stats import t
import math

import netCDF4


class TestResults(object):

    """
    Simple object that contains the test results
    and can be used to compare the test results
    to other test results.

    Objects of this type can also be plotted by
    the plotting routines.

    Parameters
    ----------
    init_obj: string or iterable of float
        either the path of a netCDF4 file produced by to_nc of
        another TestResults object, or the measured times
        (list, tuple, numpy array, ...)
    name: string, optional
        name of the results. Must be given when init_obj contains
        measured times. When init_obj is a filename the name stored
        in the file is used and this argument is ignored.
    ddof: int
        difference degrees of freedom. This is used to calculate
        standard deviation and variance. It is the number that is
        subtracted from the sample number n when estimating
        the population standard deviation and variance.
        see bessel's correction on e.g. wikipedia for explanation

    Attributes
    ----------
    median: float
        median of the measurements
    n: int
        sample size
    stdev: float
        standard deviation
    var: float
        variance
    total: float
        total time expired
    mean: float
        mean time per test run

    Raises
    ------
    ValueError
        if measured times are given without a name
    TypeError
        if init_obj is neither a filename nor an iterable
    """

    # The class name starts with "Test"; this flag keeps pytest/nose
    # from trying to collect it as a test case.
    __test__ = False

    def __init__(self, init_obj, name=None,
                 ddof=1):

        # type(u"") is unicode on Python 2 and str on Python 3, so both
        # kinds of filename strings are recognised.
        if isinstance(init_obj, (str, type(u""))):
            self._from_nc(init_obj)
        else:
            try:
                self._measurements = list(init_obj)
            except TypeError:
                raise TypeError("init_obj must be a filename or an "
                                "iterable of measured times.")
            if name is None:
                raise ValueError("Name must be given for new results.")
            self.name = name

        self.ddof = ddof
        self._init_metrics()

    def _init_metrics(self):
        """
        Calculate the summary statistics of the stored measurements
        """
        values = self._measurements
        self.n = len(values)
        self.median = np.median(values)
        self.mean = np.mean(values)
        self.var = np.var(values, ddof=self.ddof)
        self.stdev = np.sqrt(self.var)
        self.total = sum(values)

    def __str__(self):
        lower, mean, upper = self.confidence_int()
        lines = [
            "",
            "Results %s" % self.name,
            "%d runs" % self.n,
            "median %.4f mean %.4f stdev %.4f" % (self.median, self.mean,
                                                  self.stdev),
            "sum %.4f" % self.total,
            # plain literal, no %-formatting is applied to it, so the
            # percent sign must not be doubled
            "95% confidence interval of the mean",
            "upper %.4f" % upper,
            "       |",
            "mean  %.4f" % mean,
            "       |",
            "lower %.4f" % lower,
        ]
        return '\n'.join(lines)

    def confidence_int(self, conf_level=95):
        """
        Calculate confidence interval of the mean
        time measured

        Parameters
        ----------
        conf_level: float
            confidence level desired for the confidence interval in percent.
            this will be transformed into the quantile needed to get the
            t value of the t distribution.
            default is 95% confidence interval

        Returns
        -------
        lower_mean : float
            lower confidence interval boundary
        mean : float
            mean value
        upper_mean : float
            upper confidence interval boundary

        """
        # two sided interval: half of the error probability on each side
        t_quantile = 1 - (1 - conf_level / 100.0) / 2.0
        # t value for n - ddof degrees of freedom
        t_val = t.ppf(t_quantile, self.n - self.ddof)
        # standard error of the mean
        std_err = self.stdev / np.sqrt(self.n)
        half_width = t_val * std_err
        return self.mean - half_width, self.mean, self.mean + half_width

    def to_nc(self, filename):
        """
        store results on disk as a netCDF4 file

        The measurements are written to the variable 'measurements'
        and the name to the global attribute 'dataset_name'.

        Parameters
        ----------
        filename: string
            path and filename
        """
        with netCDF4.Dataset(filename, mode='w') as ncdata:
            ncdata.createDimension('measurements', len(self._measurements))
            msmts = ncdata.createVariable(
                'measurements', 'f8', ('measurements',))
            msmts[:] = self._measurements

            ncdata.setncatts({'dataset_name': self.name})

    def _from_nc(self, filename):
        """
        initializes measurements and name from a netCDF4 file
        written by to_nc

        Parameters
        ----------
        filename: string
            path and filename
        """
        with netCDF4.Dataset(filename) as ncdata:
            self._measurements = ncdata.variables['measurements'][:].tolist()
            self.name = ncdata.dataset_name

    def __lt__(self, other):
        """
        Check for overlap of confidence intervals
        only True if upper confidence interval boundary
        is less than lower confidence interval boundary
        of other object
        """
        upper_self = self.confidence_int()[2]
        lower_other = other.confidence_int()[0]
        # bool() so that a plain Python bool is returned, not a numpy one
        return bool(upper_self < lower_other)

    def __gt__(self, other):
        """
        Check for overlap of confidence intervals
        only True if lower confidence interval boundary
        is greater than upper confidence interval boundary
        of other object
        """
        lower_self = self.confidence_int()[0]
        upper_other = other.confidence_int()[2]
        # bool() so that a plain Python bool is returned, not a numpy one
        return bool(lower_self > upper_other)


class SelfTimingDataset(object):

    """
    Dataset class that times the functions of
    a dataset instance it gets in its constructor

    Stores the measured durations (seconds, from time.time()) as lists
    in the dictionary ``measurements`` with the timed function
    names as keys.

    Attributes that are not found on this object are looked up
    on the wrapped dataset.

    Parameters
    ----------
    ds: instance
        dataset instance whose functions should be timed
    timefuncs: list of string, optional
        names of the functions of ds that should be timed.
        Defaults to get_timeseries, get_avg_image and get_data.
    """

    # functions that are timed if no explicit list is given
    _DEFAULT_TIMEFUNCS = ("get_timeseries",
                          "get_avg_image",
                          "get_data")

    def __init__(self, ds, timefuncs=None):
        self.ds = ds
        if timefuncs is None:
            # fresh list per instance so that instances never share
            # (and accidentally modify) the same default object
            timefuncs = list(self._DEFAULT_TIMEFUNCS)
        self.timefuncs = timefuncs
        self.measurements = {}
        # shadow the functions of the dataset with timed versions
        for func in timefuncs:
            self.gentimedfunc(func)

    def gentimedfunc(self, funcname):
        """
        generate a timed function that calls
        the function of the given dataset
        but returns the execution time

        The timed function is set as an attribute of this object
        and every call appends the duration to
        ``self.measurements[funcname]``.

        Parameters
        ----------
        funcname: string
            function to create/call of the timed dataset
        """
        # make sure the result list exists, but keep measurements that
        # were already taken if the function is generated a second time
        self.measurements.setdefault(funcname, [])

        def f(*args, **kwargs):
            start = time.time()
            getattr(self.ds, funcname)(*args, **kwargs)
            duration = time.time() - start
            self.measurements[funcname].append(duration)
            return duration

        setattr(self, funcname, f)

    def __getattr__(self, name):
        # __getattr__ is only called after the normal lookup (including
        # the instance __dict__) has failed, so everything that arrives
        # here is delegated to the wrapped dataset.
        if name == "ds":
            # ds itself is missing (e.g. instance created without
            # __init__); delegating would recurse endlessly.
            raise AttributeError(name)
        return getattr(self.ds, name)


def measure(exper_name, runs=5, ddof=1):
    """
    Decorator that measures the running time of a function
    and calculates statistics.

    The decorated function is called ``runs`` times with the given
    arguments. Its own return value is discarded, instead the
    statistics of the measured times are returned.

    Parameters
    ----------
    exper_name: string
        experiment name, used for plotting and saving
    runs: int
        number of test runs to perform
    ddof: int
        difference degrees of freedom. This is used to calculate
        standard deviation and variance. It is the number that is
        subtracted from the sample number n when estimating
        the population standard deviation and variance.
        see bessel's correction on e.g. wikipedia for explanation

    Returns
    -------
    decorator: function
        decorator whose decorated functions return a
        TestResults instance with the measured times
    """
    # imported locally so that the function does not depend on
    # changes to the import block of the module
    import functools

    def decorator(func):
        # keep name and docstring of the measured function
        @functools.wraps(func)
        def inner(*args, **kwargs):
            measured_times = []
            # range instead of xrange: works on Python 2 and 3
            for _ in range(runs):
                start = time.time()
                func(*args, **kwargs)
                measured_times.append(time.time() - start)
            return TestResults(measured_times, exper_name, ddof=ddof)

        return inner
    return decorator


def read_rand_ts_by_gpi_list(dataset, gpi_list, read_perc=1.0,
                             max_runtime=None, **kwargs):
    """
    reads time series data for random grid point indices in a list
    additional kwargs are given to get_timeseries method of dataset

    Parameters
    ----------
    dataset: instance
        instance of a class that implements a get_timeseries(gpi)
        method
    gpi_list: iterable
        list or numpy array of grid point indices
    read_perc: float
        percentage (0-100) of points from gpi_list to read,
        the number of points is rounded up
    max_runtime: int, optional
        maximum runtime of test in second. It is checked after
        each read, so at least one time series is read.
    **kwargs:
        other keywords are passed to the get_timeseries method
        dataset
    """
    # random.sample needs a real sequence, numpy arrays are rejected
    # on Python 3
    gpis = list(gpi_list)
    n_read = int(math.ceil(len(gpis) * read_perc / 100.0))
    gpi_read = random.sample(gpis, n_read)
    # single parenthesized argument: prints the same on Python 2 and 3
    print("reading {} out of {} time series".format(len(gpi_read), len(gpis)))

    start = time.time()
    for gpi in gpi_read:
        # int() converts numpy integers to plain Python integers
        dataset.get_timeseries(int(gpi), **kwargs)
        if max_runtime is not None and time.time() - start > max_runtime:
            break


def read_rand_img_by_date_list(dataset, date_list, read_perc=1.0,
                               max_runtime=None, **kwargs):
    """
    reads image data for random dates on a list
    additional kwargs are given to get_avg_image method
    of dataset

    Parameters
    ----------
    dataset: instance
        instance of a class that implements a get_avg_image(datetime)
        method
    date_list: iterable
        list of datetime objects
    read_perc: float
        percentage (0-100) of datetimes out of date_list to read,
        the number of dates is rounded up
    max_runtime: int, optional
        maximum runtime of test in second. It is checked after
        each read, so at least one image is read.
    **kwargs:
        other keywords are passed to the get_avg_image method
        dataset
    """
    # random.sample needs a real sequence, other iterables are
    # rejected on Python 3
    dates = list(date_list)
    n_read = int(math.ceil(len(dates) * read_perc / 100.0))
    date_read = random.sample(dates, n_read)
    # single parenthesized argument: prints the same on Python 2 and 3
    print("reading {} out of {} dates".format(len(date_read), len(dates)))

    start = time.time()
    for d in date_read:
        dataset.get_avg_image(d, **kwargs)
        if max_runtime is not None and time.time() - start > max_runtime:
            break


def read_rand_img_by_date_range(dataset, date_list, read_perc=1.0,
                                max_runtime=None, **kwargs):
    """
    reads image data between random dates on a list
    additional kwargs are given to get_avg_image method
    of dataset

    Parameters
    ----------
    dataset: instance
        instance of a class that implements a
        get_avg_image(date_start, date_end) method
    date_list: iterable
        list of datetime objects
        The format is a list of lists e.g.
        [[datetime(2007,1,1), datetime(2007,1,1)], #reads one day
         [datetime(2007,1,1), datetime(2007,12,31)]] # reads one year
    read_perc: float
        percentage (0-100) of date ranges out of date_list to read,
        the number of ranges is rounded up
    max_runtime: int, optional
        maximum runtime of test in second. It is checked after
        each read, so at least one date range is read.
    **kwargs:
        other keywords are passed to the get_avg_image method
        dataset
    """
    # random.sample needs a real sequence, other iterables are
    # rejected on Python 3
    date_ranges = list(date_list)
    n_read = int(math.ceil(len(date_ranges) * read_perc / 100.0))
    date_read = random.sample(date_ranges, n_read)
    # single parenthesized argument: prints the same on Python 2 and 3
    print("reading {} out of {} dates".format(len(date_read),
                                              len(date_ranges)))

    start = time.time()
    for d1, d2 in date_read:
        dataset.get_avg_image(d1, d2, **kwargs)
        if max_runtime is not None and time.time() - start > max_runtime:
            break


def read_rand_cells_by_cell_list(dataset, cell_date_list, cell_id,
                                 read_perc=1.0, max_runtime=None):
    """
    reads data from the dataset using the get_data method.
    Cell ids and time intervals are sampled independently of each
    other and then read pairwise, one get_data call per pair.

    Parameters
    ----------
    dataset: instance
        instance of a class that implements a
        get_data(date_start, date_end, cell_id) method
    cell_date_list: list of tuples
        time intervals (date_start, date_end) to read
    cell_id: int or iterable
        cell ids which should be read. can also be a list of integers
    read_perc : float
        percentage (0-100) of cell ids and of time intervals to read,
        the numbers are rounded up. If the two samples differ in
        length only as many reads as the shorter one has are done.
    max_runtime: int, optional
        maximum runtime of test in second. It is checked after
        each read, so at least one cell is read.
    """
    # make sure cell_id is iterable
    try:
        iter(cell_id)
    except TypeError:
        cell_id = [cell_id]

    # random.sample needs real sequences, numpy arrays and other
    # iterables are rejected on Python 3
    cells = list(cell_id)
    date_intervals = list(cell_date_list)

    cell_read = random.sample(
        cells, int(math.ceil(len(cells) * read_perc / 100.0)))

    dates_read = random.sample(
        date_intervals,
        int(math.ceil(len(date_intervals) * read_perc / 100.0)))

    # single parenthesized argument: prints the same on Python 2 and 3
    print("reading {} out of {} cells".format(len(cell_read), len(cells)))
    start = time.time()
    for c, dates in zip(cell_read, dates_read):
        dataset.get_data(dates[0], dates[1], c)
        if max_runtime is not None and time.time() - start > max_runtime:
            break