# Source code for smdc_perftests.performance_tests.test_cases

# Copyright (c) 2013,Vienna University of Technology,
# Department of Geodesy and Geoinformation
# All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#   * Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in the
#      documentation and/or other materials provided with the distribution.
#    * Neither the name of the Vienna University of Technology,
#      Department of Geodesy and Geoinformation nor the
#      names of its contributors may be used to endorse or promote products
#      derived from this software without specific prior written permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL VIENNA UNIVERSITY OF TECHNOLOGY,
# DEPARTMENT OF GEODESY AND GEOINFORMATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

'''
This module contains functions that
run tests according to specifications from SMDC Performance comparison
document.

Interfaces to data should be interchangeable as long as they adhere
to interface specifications from rsdata module

Created on Tue Oct 21 13:37:58 2014

@author: christoph.paulik@geo.tuwien.ac.at
'''
import time
import random
import numpy as np
from scipy.stats import t
import math

import netCDF4


class TestResults(object):
    """
    Container for the measured run times of one experiment.

    Stores the raw measurements together with summary statistics, can be
    written to and restored from a netCDF4 file, and can be compared to
    other results through the confidence intervals of the mean.

    Parameters
    ----------
    init_obj: string or iterable of float
        either the path of a netCDF4 file produced by to_nc of another
        TestResults object, or the measured times (list, tuple,
        numpy array, ...). The measurements are copied into a list.
    name: string, optional
        name of the results. Required when measured times are given;
        when reading from a file the name stored in the file is used.
    ddof: int, optional
        delta degrees of freedom. This is used to calculate
        standard deviation and variance. It is the number that is
        subtracted from the sample number n when estimating the
        population standard deviation and variance
        (see Bessel's correction).

    Attributes
    ----------
    median: float
        median of the measurements
    n: int
        sample size
    stdev: float
        standard deviation
    var: float
        variance
    total: float
        total time expired
    mean: float
        mean time per test run

    Raises
    ------
    ValueError
        if measurements are given without a name
    """

    def __init__(self, init_obj, name=None, ddof=1):
        if isinstance(init_obj, str):
            # a string is interpreted as the path of a stored result
            self._from_nc(init_obj)
        else:
            if name is None:
                raise ValueError("Name must be given for new results.")
            # accept any iterable of numbers, not only plain lists
            self._measurements = list(init_obj)
            self.name = name
        self.ddof = ddof
        self._init_metrics()

    def _init_metrics(self):
        """
        Compute the summary statistics from the stored measurements.
        """
        values = self._measurements
        self.n = len(values)
        self.median = np.median(values)
        self.mean = np.mean(values)
        self.var = np.var(values, ddof=self.ddof)
        self.stdev = np.sqrt(self.var)
        self.total = sum(values)

    def __str__(self):
        lower, mean, upper = self.confidence_int()
        lines = [
            "",
            "Results %s" % self.name,
            "%d runs" % self.n,
            "median %.4f mean %.4f stdev %.4f" % (self.median,
                                                  self.mean,
                                                  self.stdev),
            "sum %.4f" % self.total,
            # plain literal, no %-formatting is applied to it, so a
            # single percent sign is the correct spelling here
            "95% confidence interval of the mean",
            "upper %.4f" % upper,
            " |",
            "mean %.4f" % mean,
            " |",
            "lower %.4f" % lower,
        ]
        return '\n'.join(lines)

    def confidence_int(self, conf_level=95):
        """
        Calculate confidence interval of the mean time measured

        Parameters
        ----------
        conf_level: float
            confidence level desired for the confidence interval
            in percent. It is transformed into the two-sided quantile
            that is looked up in the t distribution.
            default is 95% confidence interval

        Returns
        -------
        lower_mean : float
            lower confidence interval boundary
        mean : float
            mean value
        upper_mean : float
            upper confidence interval boundary
        """
        # two-sided interval: split the remaining probability in half
        t_quantile = 1 - (1 - conf_level / 100.0) / 2.0
        # t value for n - ddof degrees of freedom
        t_val = t.ppf(t_quantile, self.n - self.ddof)
        # standard error of the mean
        std_err = self.stdev / np.sqrt(self.n)
        half_width = t_val * std_err

        return self.mean - half_width, self.mean, self.mean + half_width

    def to_nc(self, filename):
        """
        store results on disk as a netCDF4 file

        The measurements are written to a variable called 'measurements'
        and the name is stored as global attribute 'dataset_name'.

        Parameters
        ----------
        filename: string
            path and filename
        """
        with netCDF4.Dataset(filename, mode='w') as ncdata:
            ncdata.createDimension('measurements', len(self._measurements))
            msmts = ncdata.createVariable(
                'measurements', 'f8', ('measurements',))
            msmts[:] = self._measurements
            ncdata.setncatts({'dataset_name': self.name})

    def _from_nc(self, filename):
        """
        initializes measurements and name from a netCDF4 file
        with the layout written by to_nc
        """
        with netCDF4.Dataset(filename) as ncdata:
            self._measurements = ncdata.variables['measurements'][:].tolist()
            self.name = ncdata.dataset_name

    def __lt__(self, other):
        """
        Check for overlap of confidence intervals.

        Only True if the upper confidence interval boundary of this
        object is less than the lower confidence interval boundary
        of the other object.
        """
        upper_self = self.confidence_int()[2]
        lower_other = other.confidence_int()[0]
        return bool(upper_self < lower_other)

    def __gt__(self, other):
        """
        Check for overlap of confidence intervals.

        Only True if the lower confidence interval boundary of this
        object is greater than the upper confidence interval boundary
        of the other object.
        """
        lower_self = self.confidence_int()[0]
        upper_other = other.confidence_int()[2]
        return bool(lower_self > upper_other)
class SelfTimingDataset(object):
    """
    Dataset wrapper that times selected functions of the dataset
    instance it gets in its constructor.

    For every timed function name an attribute of the same name is set
    on this object. Calling it calls the function of the wrapped dataset
    and appends the elapsed time to ``measurements[name]``. All other
    attributes are looked up on the wrapped dataset.

    Parameters
    ----------
    ds: instance
        dataset whose functions should be timed
    timefuncs: iterable of string, optional
        names of the functions of ds that should be timed

    Attributes
    ----------
    ds: instance
        the wrapped dataset
    timefuncs: list of string
        names of the timed functions
    measurements: dict
        lists of measured durations in seconds with the timed
        function names as keys
    """

    def __init__(self, ds, timefuncs=("get_timeseries",
                                      "get_avg_image",
                                      "get_data")):
        self.ds = ds
        # copy so that instances never share (or mutate) the default
        self.timefuncs = list(timefuncs)
        self.measurements = {}
        for func in self.timefuncs:
            self.gentimedfunc(func)

    def gentimedfunc(self, funcname):
        """
        generate a timed function that calls the function of the
        given dataset but returns the execution time

        The result of the dataset function is discarded. The duration
        is also appended to ``self.measurements[funcname]``.

        Parameters
        ----------
        funcname: string
            function to create/call of the timed dataset
        """
        # make sure the result list exists also for functions that are
        # registered after construction
        self.measurements.setdefault(funcname, [])

        def f(*args, **kwargs):
            start = time.time()
            getattr(self.ds, funcname)(*args, **kwargs)
            duration = time.time() - start
            self.measurements[funcname].append(duration)
            return duration

        setattr(self, funcname, f)

    def __getattr__(self, name):
        # Only called when normal attribute lookup failed, so the name is
        # not in the instance dict. Delegate to the wrapped dataset, but
        # never for 'ds' itself: if it is missing (e.g. while the object
        # is being copied or unpickled) delegation would recurse forever.
        if name == "ds":
            raise AttributeError(name)
        return getattr(self.ds, name)
def measure(exper_name, runs=5, ddof=1):
    """
    Decorator that measures the running time of a function and
    calculates statistics.

    The decorated function is called ``runs`` times with the given
    arguments. Its return value is discarded; instead the timings are
    returned as a TestResults instance.

    Parameters
    ----------
    exper_name: string
        experiment name, used for plotting and saving
    runs: int
        number of test runs to perform
    ddof: int
        delta degrees of freedom. This is used to calculate
        standard deviation and variance. It is the number that is
        subtracted from the sample number n when estimating the
        population standard deviation and variance
        (see Bessel's correction).

    Returns
    -------
    decorator: function
        decorator whose wrapped function returns a TestResults
        instance named exper_name
    """
    from functools import wraps

    def decorator(func):
        @wraps(func)
        def inner(*args, **kwargs):
            measured_times = []
            # range works on Python 2 and 3; runs is small
            for _ in range(runs):
                start = time.time()
                func(*args, **kwargs)
                measured_times.append(time.time() - start)

            return TestResults(measured_times, exper_name, ddof=ddof)

        return inner

    return decorator
def read_rand_ts_by_gpi_list(dataset, gpi_list, read_perc=1.0,
                             max_runtime=None, **kwargs):
    """
    reads time series data for random grid point indices in a list

    additional kwargs are given to the get_timeseries method of dataset

    Parameters
    ----------
    dataset: instance
        instance of a class that implements a
        get_timeseries(gpi, **kwargs) method
    gpi_list: iterable
        list or numpy array of grid point indices
    read_perc: float
        percentage (0-100) of points from gpi_list to read,
        the resulting number of points is rounded up
    max_runtime: int, optional
        maximum runtime of test in seconds. It is checked after each
        read, so at least one time series is read.
    **kwargs:
        other keywords are passed to the get_timeseries method of dataset
    """
    # random.sample needs a real sequence, which e.g. a numpy array
    # is not on Python 3
    candidates = list(gpi_list)
    n_read = int(math.ceil(len(candidates) * read_perc / 100.0))
    gpi_read = random.sample(candidates, n_read)
    print("reading {} out of {} time series".format(len(gpi_read),
                                                    len(candidates)))
    start = time.time()
    for gpi in gpi_read:
        dataset.get_timeseries(int(gpi), **kwargs)
        if max_runtime is not None and time.time() - start > max_runtime:
            break
def read_rand_img_by_date_list(dataset, date_list, read_perc=1.0,
                               max_runtime=None, **kwargs):
    """
    reads image data for random dates on a list

    additional kwargs are given to the get_avg_image method of dataset

    Parameters
    ----------
    dataset: instance
        instance of a class that implements a
        get_avg_image(datetime, **kwargs) method
    date_list: iterable
        list of datetime objects
    read_perc: float
        percentage (0-100) of datetimes out of date_list to read,
        the resulting number of dates is rounded up
    max_runtime: int, optional
        maximum runtime of test in seconds. It is checked after each
        read, so at least one image is read.
    **kwargs:
        other keywords are passed to the get_avg_image method of dataset
    """
    # random.sample needs a real sequence on Python 3
    candidates = list(date_list)
    n_read = int(math.ceil(len(candidates) * read_perc / 100.0))
    date_read = random.sample(candidates, n_read)
    print("reading {} out of {} dates".format(len(date_read),
                                              len(candidates)))
    start = time.time()
    for d in date_read:
        dataset.get_avg_image(d, **kwargs)
        if max_runtime is not None and time.time() - start > max_runtime:
            break
def read_rand_img_by_date_range(dataset, date_list, read_perc=1.0,
                                max_runtime=None, **kwargs):
    """
    reads image data between random dates on a list

    additional kwargs are given to the get_avg_image method of dataset

    Parameters
    ----------
    dataset: instance
        instance of a class that implements a
        get_avg_image(date_start, date_end, **kwargs) method
    date_list: iterable
        list of datetime pairs
        The format is a list of lists
        e.g. [[datetime(2007,1,1), datetime(2007,1,1)], # reads one day
              [datetime(2007,1,1), datetime(2007,12,31)]] # reads one year
    read_perc: float
        percentage (0-100) of date pairs out of date_list to read,
        the resulting number of pairs is rounded up
    max_runtime: int, optional
        maximum runtime of test in seconds. It is checked after each
        read, so at least one date range is read.
    **kwargs:
        other keywords are passed to the get_avg_image method of dataset
    """
    # random.sample needs a real sequence on Python 3
    candidates = list(date_list)
    n_read = int(math.ceil(len(candidates) * read_perc / 100.0))
    date_read = random.sample(candidates, n_read)
    print("reading {} out of {} dates".format(len(date_read),
                                              len(candidates)))
    start = time.time()
    for d1, d2 in date_read:
        dataset.get_avg_image(d1, d2, **kwargs)
        if max_runtime is not None and time.time() - start > max_runtime:
            break
def read_rand_cells_by_cell_list(dataset, cell_date_list, cell_id,
                                 read_perc=1.0, max_runtime=None):
    """
    reads data from the dataset using the get_data method.

    Cells and time intervals are sampled independently with the same
    percentage and then paired up in order; reading stops when either
    sample is exhausted.

    Parameters
    ----------
    dataset: instance
        instance of a class that implements a
        get_data(date_start, date_end, cell_id) method
    cell_date_list: list of tuples
        time intervals (date_start, date_end) to read for the cells
    cell_id: int or iterable
        cell ids which should be read. can also be a list of integers
    read_perc : float
        percentage (0-100) of cell ids and of time intervals to read,
        the resulting numbers are rounded up
    max_runtime: int, optional
        maximum runtime of test in seconds. It is checked after each
        read, so at least one cell is read.
    """
    # make sure cell_id is a sequence; a single id becomes a one
    # element list
    try:
        cells = list(cell_id)
    except TypeError:
        cells = [cell_id]
    # random.sample needs a real sequence on Python 3
    intervals = list(cell_date_list)

    fraction = read_perc / 100.0
    cell_read = random.sample(
        cells, int(math.ceil(len(cells) * fraction)))
    dates_read = random.sample(
        intervals, int(math.ceil(len(intervals) * fraction)))
    print("reading {} out of {} cells".format(len(cell_read), len(cells)))

    start = time.time()
    for c, dates in zip(cell_read, dates_read):
        dataset.get_data(dates[0], dates[1], c)
        if max_runtime is not None and time.time() - start > max_runtime:
            break