Source code for petastorm.predicates

#  Copyright (c) 2017-2018 Uber Technologies, Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.

Predicates for petastorm
import abc
import collections
import hashlib
import numpy as np
import six
import sys

[docs]@six.add_metaclass(abc.ABCMeta) class PredicateBase(object): """ Base class for row predicates """
[docs] @abc.abstractmethod def get_fields(self): pass
[docs] @abc.abstractmethod def do_include(self, values): pass
def _string_to_bucket(string, bucket_num): hash_str = hashlib.md5(string.encode('utf-8')).hexdigest() return int(hash_str, 16) % bucket_num
[docs]class in_set(PredicateBase): """ Test if predicate_field value is in inclusion_values set """ def __init__(self, inclusion_values, predicate_field): self._inclusion_values = set(inclusion_values) self._predicate_field = predicate_field
[docs] def get_fields(self): return {self._predicate_field}
[docs] def do_include(self, values): return values[self._predicate_field] in self._inclusion_values
[docs]class in_intersection(PredicateBase): """ Test if predicate_field list contain at least one value from inclusion_values set """ def __init__(self, inclusion_values, _predicate_field): self._inclusion_values = list(inclusion_values) self._predicate_field = _predicate_field
[docs] def get_fields(self): return {self._predicate_field}
[docs] def do_include(self, values): if not isinstance(values[self._predicate_field], collections.Iterable): raise ValueError('Predicate field should have iterable type') return any(np.in1d(values[self._predicate_field], self._inclusion_values))
[docs]class in_lambda(PredicateBase): """ Wrap up custom function to be used as a predicate example: in_lambda(['labels_object_roles'], lambda labels_object_roles : len(labels_object_roles) > 3) """ def __init__(self, predicate_fields, predicate_func, state_arg=None): """ :param predicate_fields: list of fields to be used in predicate :param predicate_func: predicate function example: lambda labels_object_roles : len(labels_object_roles) > 3 :param state_arg: additional object to keep function state. it will be passed to predicate_func after fields arguments ONLY if it is not None """ if not isinstance(predicate_fields, list): raise ValueError('Predicate fields should be a list') self._predicate_fields = predicate_fields self._predicate_func = predicate_func self._state_arg = state_arg
[docs] def get_fields(self): return set(self._predicate_fields)
[docs] def do_include(self, values): args = [values[field] for field in self._predicate_fields] if self._state_arg is not None: args.append(self._state_arg) return self._predicate_func(*args)
[docs]class in_negate(PredicateBase): """ A predicate used to negate another predicate. """ def __init__(self, predicate): if not isinstance(predicate, PredicateBase): raise ValueError('Predicate is nor derived from PredicateBase') self._predicate = predicate
[docs] def get_fields(self): return self._predicate.get_fields()
[docs] def do_include(self, values): return not self._predicate.do_include(values)
[docs]class in_reduce(PredicateBase): """ A predicate used to aggregate other predicates using any reduce logical operation.""" def __init__(self, predicate_list, reduce_func): """ predicate_list: list of predicates reduce_func: function to aggregate result of all predicates in the list e.g. all() will implements logical 'And', any() implements logical 'Or' """ check_list = [isinstance(p, PredicateBase) for p in predicate_list] if not all(check_list): raise ValueError('Predicate is nor derived from PredicateBase') self._predicate_list = predicate_list self._reduce_func = reduce_func
[docs] def get_fields(self): fields = set() for p in self._predicate_list: fields |= p.get_fields() return fields
[docs] def do_include(self, values): include_list = [p.do_include(values) for p in self._predicate_list] return self._reduce_func(include_list)
[docs]class in_pseudorandom_split(PredicateBase): """ Split dataset according to a split list based on volume_guid. The split is pseudorandom (can not supply the seed yet), i.e. the split outcome is always the same. Split is performed by hashing volume_guid uniformly to 0:1 range and returning part of full dataset which was hashed in given sub-range Example: 'split_list = [0.5, 0.2, 0.3]' - dataset will be split on three subsets in proportion subset 1: 0.5 of log data subset 2: 0.2 of log data subset 3: 0.3 of log data Note, split is not exact, so avoid small fraction (e.g. 0.001) to avoid empty sets """ def __init__(self, fraction_list, subset_index, predicate_field): """ split_list: a list of log fractions (real numbers in range [0:1]) subset_index: define which subset will be used by the Reader """ if subset_index >= len(fraction_list): raise ValueError('subset_index is out of range') self._predicate_field = predicate_field # build CDF subsets_high_borders = [sum(fraction_list[:i + 1]) for i in range(len(fraction_list))] if subset_index: fraction_low = subsets_high_borders[subset_index - 1] else: fraction_low = 0 fraction_high = subsets_high_borders[subset_index] self._bucket_low = fraction_low * (sys.maxsize - 1) self._bucket_high = fraction_high * (sys.maxsize - 1)
[docs] def get_fields(self): return {self._predicate_field}
[docs] def do_include(self, values): if self._predicate_field not in values.keys(): raise ValueError('Tested values does not have split key: %s' % self._predicate_field) bucket_idx = _string_to_bucket(values[self._predicate_field], sys.maxsize) return self._bucket_low <= bucket_idx < self._bucket_high