-
Notifications
You must be signed in to change notification settings - Fork 29
Expand file tree
/
Copy pathfeature_map.py
More file actions
289 lines (253 loc) · 12.7 KB
/
feature_map.py
File metadata and controls
289 lines (253 loc) · 12.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
"""
Module for the feature map class
:References:
- Francesco Romor, Marco Tezzele, Andrea Lario, Gianluigi Rozza.
Kernel-based active subspaces with application to computational fluid
dynamics parametric problems using discontinuous Galerkin method.
International Journal for Numerical Methods in Engineering,
123(23):6000–6027, 2022. doi:10.1002/nme.7099
"""
from functools import partial
import numpy as np
from scipy.optimize import brute, dual_annealing
from bayes_opt import BayesianOptimization
from .projection_factory import ProjectionFactory
class FeatureMap():
    """Feature map class.

    Projects input samples into a Reproducing Kernel Hilbert Space (RKHS)
    via Random Fourier Features, and can tune the hyperparameters of the
    spectral distribution the projection matrix is sampled from.

    :param str distr: name of the spectral distribution to sample the
        projection matrix from, or a callable implementing the sampling.
    :param numpy.ndarray bias: n_features dimensional bias. It is sampled from
        a Uniform distribution on the interval [0, 2*PI].
    :param int input_dim: dimension of the input space.
    :param int n_features: dimension of the Reproducing Kernel Hilbert Space.
    :param list params: hyperparameters of the spectral distribution.
    :param int sigma_f: multiplicative constant of the feature map. Usually it
        is set as the empirical variance of the outputs.
    :cvar callable fmap: feature map used to project the input samples to the
        RKHS. Default value is rff_map.
    :cvar callable fmap_jac: Jacobian matrix of fmap. Default value is
        rff_jac, the analytical Jacobian of fmap.
    :cvar numpy.ndarray _pr_matrix: n_features-by-input_dim projection
        matrix, whose rows are sampled from the spectral distribution distr.
    :raises TypeError: if `distr` is neither a callable nor a string.
    """
    def __init__(self, distr, bias, input_dim, n_features, params, sigma_f):
        # `distr` may be a ready-made sampler (callable) or the name of a
        # distribution, resolved through the ProjectionFactory.
        if callable(distr):
            self.distr = distr
        elif isinstance(distr, str):
            self.distr = ProjectionFactory(distr)
        else:
            raise TypeError('`distr` is not valid.')
        self.bias = bias
        self.input_dim = input_dim
        self.n_features = n_features
        self.params = params
        self.sigma_f = sigma_f
        # Defaults: the module-level analytic Random Fourier Features map and
        # its Jacobian.
        self.fmap = rff_map
        self.fmap_jac = rff_jac
        # Sampled lazily on first call to compute_fmap/compute_fmap_jac.
        self._pr_matrix = None

    @property
    def pr_matrix(self):
        """
        Get the projection matrix.

        :return: the projection matrix (None until it is first sampled).
        :rtype: numpy.ndarray
        """
        return self._pr_matrix

    def _compute_pr_matrix(self):
        """
        Sample the projection matrix from the spectral distribution.

        :return: the n_features-by-input_dim projection matrix.
        :rtype: numpy.ndarray
        """
        return self.distr(self.input_dim, self.n_features, self.params)

    def compute_fmap(self, inputs):
        """
        Evaluate the feature map at inputs.

        :param numpy.ndarray inputs: the inputs to project on the RKHS.
        :return: the n_features dimensional projection of the inputs.
        :rtype: numpy.ndarray
        """
        # The projection matrix is sampled only once and then cached.
        if self._pr_matrix is None:
            self._pr_matrix = self._compute_pr_matrix()
        return self.fmap(inputs, self._pr_matrix, self.bias, self.n_features,
                         self.sigma_f)

    def compute_fmap_jac(self, inputs):
        """
        Evaluate the Jacobian matrix of the feature map at inputs.

        :param numpy.ndarray inputs: the inputs at which compute the Jacobian
            matrix of the feature map.
        :return: the n_features-by-input_dim dimensional Jacobian of the
            feature map at the inputs.
        :rtype: numpy.ndarray
        """
        # Shares the cached projection matrix with compute_fmap.
        if self._pr_matrix is None:
            self._pr_matrix = self._compute_pr_matrix()
        return self.fmap_jac(inputs, self._pr_matrix, self.bias,
                             self.n_features, self.sigma_f)

    def tune_pr_matrix(self,
                       func,
                       bounds,
                       method=None,
                       maxiter=50,
                       save_file=False,
                       fn_args=None):
        """
        Tune the parameters of the spectral distribution. Three methods are
        available: log-grid-search (brute), annealing (dual_annealing) and
        Bayesian stochastic optimization (bso) from the bayesian-optimization
        package. The default objective function to optimize is
        athena.utils.average_rrmse, which uses a cross-validation procedure
        from athena.utils; see the Example and tutorial 06_kernel-based_AS.

        :Example:

        >>> from athena.kas import KernelActiveSubspaces
        >>> from athena.feature_map import FeatureMap
        >>> from athena.projection_factory import ProjectionFactory
        >>> from athena.utils import CrossValidation, average_rrmse
        >>> input_dim, output_dim, n_samples = 2, 1, 30
        >>> n_features, n_params = 10, 1
        >>> xx = np.ones((n_samples, input_dim))
        >>> f = np.ones((n_samples, output_dim))
        >>> df = np.ones((n_samples, output_dim, input_dim))
        >>> fm = FeatureMap(distr='laplace',
        ...                 bias=np.random.uniform(0, 2 * np.pi, n_features),
        ...                 input_dim=input_dim,
        ...                 n_features=n_features,
        ...                 params=np.zeros(n_params),
        ...                 sigma_f=f.var())
        >>> kss = KernelActiveSubspaces(feature_map=fm, dim=1,
        ...                             n_features=n_features)
        >>> csv = CrossValidation(inputs=xx,
        ...                       outputs=f.reshape(-1, 1),
        ...                       gradients=df.reshape(n_samples, output_dim,
        ...                                            input_dim),
        ...                       folds=3,
        ...                       subspace=kss)
        >>> best = fm.tune_pr_matrix(func=average_rrmse,
        ...                          bounds=[slice(-2, 1, 0.2)
        ...                                  for i in range(n_params)],
        ...                          fn_args={'csv': csv},
        ...                          method='bso',
        ...                          maxiter=20,
        ...                          save_file=False)

        :param callable func: the objective function to be minimized.
            Must be in the form f(x, *args), where x is the argument in the
            form of a 1-D array and args is a tuple of any additional fixed
            parameters needed to completely specify the function.
        :param tuple bounds: each component of the bounds tuple must be a
            slice tuple of the form (low, high, step). It defines bounds for
            the objective function parameter in a logarithmic scale. Step will
            be ignored for 'dual_annealing' and 'bso' methods.
        :param str method: method used to optimize the objective function.
            Possible values are 'brute', 'dual_annealing' or 'bso'.
            Default value is None, and the choice is made automatically wrt
            the dimension of `self.params`. If the dimension is less than 4
            brute force is used, otherwise a traditional Generalized
            Simulated Annealing will be performed with no local search
            strategy applied.
        :param int maxiter: the maximum number of global search iterations.
            Default value is 50.
        :param bool save_file: True to save the optimal projection matrix.
        :param dict fn_args: dictionary of additional arguments passed to
            func (after the in-place `best` accumulator).
        :raises: ValueError
        :return: list that records the best score and the best projection
            matrix. The initial values are 0.8 and a n_features-by-input_dim
            numpy.ndarray of zeros.
        :rtype: list
        """
        if fn_args is None:
            fn_args = {}
        # `best` is mutated in place by `func` (e.g. athena.utils.average_rrmse):
        # best[0] tracks the best score found, best[1] the projection matrix
        # that achieved it.
        best = [0.8, np.zeros((self.n_features, self.input_dim))]
        if method is None:
            method = 'brute' if len(self.params) < 4 else 'dual_annealing'
        if method == 'brute':
            # NOTE(review): unlike the other two branches, the brute result is
            # not back-transformed with 10** even though `bounds` is in log
            # scale — confirm whether this asymmetry is intended.
            self.params = brute(func=func,
                                ranges=bounds,
                                args=(
                                    best,
                                    *tuple(fn_args.values()),
                                ),
                                finish=None)
        elif method == 'dual_annealing':
            # dual_annealing takes (low, high) pairs; the step of each slice
            # is dropped. Bounds are in log10 scale, hence the 10** below.
            bounds_list = [[bound.start, bound.stop] for bound in bounds]
            self.params = 10**dual_annealing(func=func,
                                             bounds=bounds_list,
                                             args=(
                                                 best,
                                                 *tuple(fn_args.values()),
                                             ),
                                             maxiter=maxiter,
                                             no_local_search=False).x
        elif method == 'bso':
            # Reformat bounds for BayesianOptimization package format.
            # BayesianOptimization uses a dictionary of parameter names and
            # their range tuples, unlike GPyOpt which used a list of
            # dictionaries with 'name', 'type', and 'domain' keys.
            bounds_dict = {
                f'var_{i}': (bound.start, bound.stop)
                for i, bound in enumerate(bounds)
            }

            # Create wrapper for the objective function to handle the format
            # difference: BayesianOptimization passes parameters as keyword
            # arguments, not as an array.
            def bayes_wrapper(**kwargs):
                # Convert from the dict of parameters to the array format
                # expected by the original function.
                x = np.array([kwargs[f'var_{i}'] for i in range(len(bounds))])
                # BayesianOptimization maximizes by default, but we want to
                # minimize, so negate the score (lower is better for `func`).
                return -func(x, best, **fn_args)

            # Initialize optimizer with our wrapper function and bounds.
            optimizer = BayesianOptimization(
                f=bayes_wrapper,
                pbounds=bounds_dict,
                random_state=42  # For reproducible results
            )
            # Run optimization.
            # init_points: how many steps of random exploration to perform.
            # n_iter: how many steps of Bayesian optimization to perform.
            optimizer.maximize(init_points=2, n_iter=maxiter)
            # Extract the best parameters found and transform back.
            # optimizer.max contains the best score and parameters found.
            best_params = [
                optimizer.max['params'][f'var_{i}'] for i in range(len(bounds))
            ]
            # Apply the 10** back-transform from log10 scale.
            self.params = 10**np.array(best_params)
        else:
            raise ValueError(
                "Method argument can only be 'brute' or 'dual_annealing' or 'bso'."
            )
        # NOTE(review): best[0] is documented above as the best *score*, yet
        # it overwrites the optimizer result just stored in self.params —
        # confirm against average_rrmse whether this is intended.
        self._pr_matrix = best[1]
        self.params = best[0]
        if save_file:
            np.save("opt_pr_matrix", best[1])
        return best
def rff_map(inputs, pr_matrix, bias, n_features, sigma_f):
    """
    Random Fourier Features map. Vectorized over inputs of shape
    n_samples-by-input_dim.

    :param numpy.ndarray inputs: input_dim dimensional inputs to project to
        the RKHS.
    :param numpy.ndarray pr_matrix: n_features-by-input_dim projection matrix,
        whose rows are sampled from the spectral distribution.
    :param numpy.ndarray bias: n_features dimensional bias. It is sampled from
        a Uniform distribution on the interval [0, 2*PI].
    :param int n_features: dimension of the RKHS.
    :param int sigma_f: multiplicative term representing the empirical
        variance of the outputs.
    :return: n_features dimensional projection of the inputs to the RKHS.
    :rtype: numpy.ndarray
    """
    # Phase of each random feature: <w_i, x> + b_i, broadcast over samples.
    phases = inputs @ pr_matrix.T + bias.reshape(1, -1)
    amplitude = np.sqrt(2 / n_features) * sigma_f
    return amplitude * np.cos(phases)
def rff_jac(inputs, pr_matrix, bias, n_features, sigma_f):
    """
    Random Fourier Features map's Jacobian. Vectorized over inputs of shape
    n_samples-by-input_dim.

    :param numpy.ndarray inputs: input_dim dimensional inputs to project to
        the RKHS.
    :param numpy.ndarray pr_matrix: n_features-by-input_dim projection matrix,
        whose rows are sampled from the spectral distribution.
    :param numpy.ndarray bias: n_features dimensional bias. It is sampled from
        a Uniform distribution on the interval [0, 2*PI].
    :param int n_features: dimension of the RKHS.
    :param int sigma_f: multiplicative term representing the empirical
        variance of the outputs.
    :return: n_samples-by-n_features-by-input_dim Jacobian of the feature map
        at the inputs.
    :rtype: numpy.ndarray
    """
    # d/dx cos(<w, x> + b) = -sin(<w, x> + b) * w, scaled by the RFF amplitude.
    phases = inputs @ pr_matrix.T + bias
    factor = np.sqrt(2 / n_features) * sigma_f * (-1) * np.sin(phases)
    # Broadcast the per-feature factor against each row of the projection
    # matrix to obtain one input_dim-sized gradient per feature.
    return factor.reshape(inputs.shape[0], n_features, 1) * pr_matrix