Source code for upoqa.utils.sub_solver

# This file contains modified code from COBYQA (https://github.com/cobyqa/cobyqa)
# which is licensed under the BSD 3-Clause License:
#
# Copyright (c) 2021-2025, Tom M. Ragonneau and Zaikun Zhang
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
#    contributors may be used to endorse or promote products derived from
#    this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The modifications to this file and the overall project are licensed under the
# GNU General Public License v3.0 (GPLv3). See the project's LICENSE file for
# the full GPLv3 terms.

"""
Trust Region Subproblem Solver
==============================

Implement several sub-routines for approximately solving trust-region sub-problems 
that arise in derivative-free optimization with partially separable structures:

* ``cauchy_geometry`` / ``spider_geometry`` : geometry-improving steps along
  Cauchy / straight-line directions.
* ``tangential_byrd_omojokun`` : truncated conjugate-gradient step on the
  tangential space.
* ``conjugate_gradient_proj_steinmetz`` : modified projected CG for structured
  trust-regions using steinmetz projections.

The ``cauchy_geometry``, ``spider_geometry``, and ``tangential_byrd_omojokun`` solvers 
are drawn from the `COBYQA <https://github.com/cobyqa/cobyqa/>`_ library; see the 
original implementation at 
`here <https://github.com/cobyqa/cobyqa/blob/main/cobyqa/subsolvers/>`_.
"""

import numpy as np
import numpy.linalg as LA
from typing import Optional, Callable
from .projection import *

__all__ = [
    "cauchy_geometry",
    "spider_geometry",
    "tangential_byrd_omojokun",
    "conjugate_gradient_proj_steinmetz",
]

float_tiny = np.finfo(float).tiny
float_eps = np.finfo(float).eps


[docs] def cauchy_geometry(const, grad, curv, delta): r""" Maximize approximately the absolute value of a quadratic function subject to bound constraints in a trust region. This function solves approximately .. math:: \max_{s \in \mathbb{R}^n} \quad \bigg\lvert c + g^{\mathsf{T}} s + \frac{1}{2} s^{\mathsf{T}} H s \bigg\rvert \quad \text{s.t.} \quad \left\{ \begin{array}{l} l \le s \le u,\\ \lVert s \rVert \le \Delta, \end{array} \right. by maximizing the objective function along the constrained Cauchy direction. Parameters ---------- const : float Constant :math:`c` as shown above. grad : ndarray, shape (n,) Gradient :math:`g` as shown above. curv : callable Curvature of :math:`H` along any vector. ``curv(s) -> float`` returns :math:`s^{\mathsf{T}} H s`. delta : float Trust-region radius :math:`\Delta` as shown above. Returns ------- ndarray, shape (n,) Approximate solution :math:`s`. Notes ----- This function is described as the first alternative in Section 6.5 of [1]_. It is assumed that the origin is feasible with respect to the bound constraints and that ``delta`` is finite and positive. References ---------- .. [1] Tom M. Ragonneau. *Model-Based Derivative-Free Optimization Methods and Software*. PhD thesis, Department of Applied Mathematics, The Hong Kong Polytechnic University, Hong Kong, China, 2022. URL: https://theses.lib.polyu.edu.hk/handle/200/12294. """ # To maximize the absolute value of a quadratic function, we maximize the # function itself or its negative, and we choose the solution that provides # the largest function value. step1, q_val1 = _cauchy_geom(const, grad, curv, delta) step2, q_val2 = _cauchy_geom(-const, -grad, lambda x: -curv(x), delta) return step1 if abs(q_val1) >= abs(q_val2) else step2
[docs] def spider_geometry(const, grad, curv, xpt, delta): r""" Maximize approximately the absolute value of a quadratic function subject to bound constraints in a trust region. This function solves approximately .. math:: \max_{s \in \mathbb{R}^n} \quad \bigg\lvert c + g^{\mathsf{T}} s + \frac{1}{2} s^{\mathsf{T}} H s \bigg\rvert \quad \text{s.t.} \quad \left\{ \begin{array}{l} l \le s \le u,\\ \lVert s \rVert \le \Delta, \end{array} \right. by maximizing the objective function along given straight lines. Parameters ---------- const : float Constant :math:`c` as shown above. grad : ndarray, shape (n,) Gradient :math:`g` as shown above. curv : callable Curvature of :math:`H` along any vector. ``curv(s) -> float`` returns :math:`s^{\mathsf{T}} H s`. xpt : ndarray, shape (npt, n) Points defining the straight lines. The straight lines considered are the ones passing through the origin and the points in ``xpt``. delta : float Trust-region radius :math:`\Delta` as shown above. Returns ------- ndarray, shape (n,) Approximate solution :math:`s`. Notes ----- This function is described as the second alternative in Section 6.5 of [1]_. It is assumed that the origin is feasible with respect to the bound constraints and that ``delta`` is finite and positive. References ---------- .. [1] Tom M. Ragonneau. *Model-Based Derivative-Free Optimization Methods and Software*. PhD thesis, Department of Applied Mathematics, The Hong Kong Polytechnic University, Hong Kong, China, 2022. URL: https://theses.lib.polyu.edu.hk/handle/200/12294. """ # Iterate through the straight lines. step = np.zeros_like(grad) q_val = const xpt_norms = LA.norm(xpt, axis=1) npt = xpt.shape[0] for k in range(npt): # Set alpha_tr to the step size for the trust-region constraint. if xpt_norms[k] > float_tiny * delta: alpha_tr = max(delta / xpt_norms[k], 0.0) else: # The current straight line is basically zero. continue # Set alpha_quad_pos and alpha_quad_neg to the step size to the extrema # of the quadratic function along the positive and negative directions. grad_step = grad @ xpt[k] curv_step = curv(xpt[k]) tiny_grad = float_tiny * grad_step if ( grad_step >= 0.0 and curv_step < -tiny_grad or grad_step <= 0.0 and curv_step > -tiny_grad ): alpha_quad_pos = max(-grad_step / curv_step, 0.0) else: alpha_quad_pos = np.inf if ( grad_step >= 0.0 and curv_step > tiny_grad or grad_step <= 0.0 and curv_step < tiny_grad ): alpha_quad_neg = min(-grad_step / curv_step, 0.0) else: alpha_quad_neg = -np.inf # Select the step that provides the largest value of the objective # function if it improves the current best. The best positive step is # either the one that reaches the constraints or the one that reaches # the extremum of the objective function along the current direction # (only possible if the resulting step is feasible). We test both, and # we perform similar calculations along the negative step. # N.B.: we select the largest possible step among all the ones that # maximize the objective function. This is to avoid returning the zero # step in some extreme cases. alpha_pos, alpha_neg = alpha_tr, -alpha_tr alpha_tr_pow_curv = 0.5 * alpha_tr**2.0 * curv_step q_val_pos = const + alpha_pos * grad_step + alpha_tr_pow_curv q_val_neg = const + alpha_neg * grad_step + alpha_tr_pow_curv if alpha_quad_pos < alpha_pos: q_val_quad_pos = ( const - alpha_quad_pos * grad_step - 0.5 * alpha_quad_pos**2.0 * curv_step ) if abs(q_val_quad_pos) > abs(q_val_pos): alpha_pos = alpha_quad_pos q_val_pos = q_val_quad_pos if alpha_quad_neg > alpha_neg: q_val_quad_neg = ( const - alpha_quad_neg * grad_step - 0.5 * alpha_quad_neg**2.0 * curv_step ) if abs(q_val_quad_neg) > abs(q_val_neg): alpha_neg = alpha_quad_neg q_val_neg = q_val_quad_neg abs_q_val_pos, abs_q_val_neg = abs(q_val_pos), abs(q_val_neg) if abs_q_val_pos >= abs_q_val_neg and abs_q_val_pos > abs(q_val): step = alpha_pos * xpt[k] q_val = q_val_pos elif abs_q_val_neg > abs_q_val_pos and abs_q_val_neg > abs(q_val): step = alpha_neg * xpt[k] q_val = q_val_neg return step
def _cauchy_geom(const, grad, curv, delta): """ Same as ``bound_constrained_cauchy_step`` without the absolute value. """ # Calculate the initial active set. fixed_xl = grad > 0.0 fixed_xu = grad < 0.0 # Calculate the Cauchy step. cauchy_step = np.zeros_like(grad) cauchy_step[fixed_xl] = -np.inf cauchy_step[fixed_xu] = np.inf if LA.norm(cauchy_step) > delta: working = fixed_xl | fixed_xu # Calculate the Cauchy step for the directions in the working set. g_norm = LA.norm(grad[working]) delta_reduced = np.sqrt( delta**2.0 - np.inner(cauchy_step[~working], cauchy_step[~working]) ) if g_norm > np.finfo(float).tiny * abs(delta_reduced): mu = max(delta_reduced / g_norm, 0.0) cauchy_step[working] = mu * grad[working] # Calculate the step that maximizes the quadratic along the Cauchy step. grad_step = grad @ cauchy_step if grad_step >= 0.0: # Set alpha_tr to the step size for the trust-region constraint. s_norm = LA.norm(cauchy_step) if s_norm > float_tiny * delta: alpha_tr = max(delta / s_norm, 0.0) else: # The Cauchy step is basically zero. alpha_tr = 0.0 # Set alpha_quad to the step size for the maximization problem. curv_step = curv(cauchy_step) if curv_step < -float_tiny * grad_step: alpha_quad = max(-grad_step / curv_step, 0.0) else: alpha_quad = np.inf # Calculate the solution and the corresponding function value. alpha = min(alpha_tr, alpha_quad) step = alpha * cauchy_step q_val = const + alpha * grad_step + 0.5 * alpha**2.0 * curv_step else: # This case is never reached in exact arithmetic. It prevents this # function to return a step that decreases the objective function. step = np.zeros_like(grad) q_val = const return step, q_val
[docs] def tangential_byrd_omojokun(fun, grad, hess_prod, delta, n, **kwargs): r""" Minimize approximately a quadratic function subject to bound constraints in a trust region. This function solves approximately .. math:: \min_{s\in\mathbb{R}^n}\quad f(s) \quad \text{s.t.} \quad \lVert s \rVert \le \Delta. using a variation of the truncated conjugate gradient method. Parameters ---------- fun : callable Overall surrogate model function :math:`f` as shown above. grad : callable Gradient :math:`\nabla f` of the model function: ``grad(x) -> ndarray, shape (n,)`` returns the gradient at ``x``. hess_prod : callable Product of the Hessian matrix :math:`\nabla^2 f(x)` with any vector :math:`v`: ``hess_prod(x, v) -> ndarray, shape (n,)`` returns the product :math:`\nabla^2 f(x) v`. delta : float Trust-region radius :math:`\Delta` as shown above. n : int Dimension of the function. Returns ------- ndarray, shape (n,) Approximate solution :math:`s`. Other Parameters ---------------- improve_tcg : bool, optional If True, a solution generated by the truncated conjugate gradient method that is on the boundary of the trust region is improved by moving around the trust-region boundary on the two-dimensional space spanned by the solution and the gradient of the quadratic function at the solution (default is True). Notes ----- This function originally implements Algorithm 6.2 of [1]_ and is modified to adapt to the UPOQA solver. It is assumed that the origin is feasible with respect to the bound constraints and that ``delta`` is finite and positive. References ---------- .. [1] Tom M. Ragonneau. *Model-Based Derivative-Free Optimization Methods and Software*. PhD thesis, Department of Applied Mathematics, The Hong Kong Polytechnic University, Hong Kong, China, 2022. URL: https://theses.lib.polyu.edu.hk/handle/200/12294. """ # Copy the arrays that may be modified by the code below.- g = grad(np.zeros((n,))) # Set the initial iterate and the initial search direction. step, sd = np.zeros_like(g), -g # objective value may not be monotonic if xform is enabled in upoqa. best_step, best_decrease = step, 0.0 k, reduct = 0, 0.0 boundary_reached = False while k < n: # Stop the computations if sd is not a descent direction. g_sd = g @ sd if g_sd >= -10.0 * float_eps * n * max(1.0, LA.norm(g)): break # Set alpha_tr to the step size for the trust-region constraint. try: alpha_tr = _alpha_tr(step, sd, delta) except ZeroDivisionError: break # Stop the computations if a step along sd is expected to give a # relatively small reduction in the objective function. if -alpha_tr * g_sd <= 1e-8 * reduct: break # Set alpha_quad to the step size for the minimization problem. hess_sd = hess_prod(step, sd) curv_sd = sd @ hess_sd if curv_sd > float_tiny * abs(g_sd): alpha_quad = max(-g_sd / curv_sd, 0.0) else: alpha_quad = np.inf # Stop the computations if the reduction in the objective function # provided by an unconstrained step is small. alpha = min(alpha_tr, alpha_quad) if -alpha * (g_sd + 0.5 * alpha * curv_sd) <= 1e-8 * reduct: break # Update the iterate. if alpha > 0.0: step = step + alpha * sd g = grad(step) reduct = -fun(step) if reduct > best_decrease: best_step, best_decrease = step, reduct if alpha < alpha_tr: # The current iteration is a conjugate gradient iteration. Update # the search direction so that it is conjugate (with respect to H) # to all the previous search directions. beta = (g @ hess_sd) / curv_sd sd = beta * sd - g k += 1 else: # The current iterate is on the trust-region boundary. Add all the # active bounds to the working set to prepare for the improvement # of the solution, and stop the iterations. boundary_reached = True break # Attempt to improve the solution on the trust-region boundary. if kwargs.get("improve_tcg", True) and boundary_reached: # Check whether a substantial reduction in the objective function # is possible, and set the search direction. step_sq = step.dot(step) g_sq = g.dot(g) g_step = g.dot(step) g_sd = -np.sqrt(max(step_sq * g_sq - g_step**2, 0.0)) sd = g_step * step - step_sq * g if (g_sd < -1e-8 * reduct) and np.all(g_sd < -float_tiny * np.abs(sd)): sd /= -g_sd # Calculate some curvature information. hess_sd = hess_prod(step, sd) curv_sd = sd @ hess_sd # For a range of equally spaced values of tan(0.5 * theta), # calculate the reduction in the objective function that would be # obtained by accepting the corresponding angle. n_samples = 20 t_samples = np.linspace(1 / n_samples, 1, n_samples) sin_values = 2.0 * t_samples / (1.0 + t_samples**2.0) cos_values = (1.0 - t_samples**2.0) / (1.0 + t_samples**2.0) candidate_steps = np.outer(cos_values, step) + np.outer(sin_values, sd) all_reduct = np.empty((n_samples,)) for i in range(n_samples): all_reduct[i] = -fun(candidate_steps[i]) if np.any(all_reduct > 0.0): # No reduction in the objective function is obtained. # Accept the angle that provides the largest reduction in the # objective function, and update the iterate. i_max = np.argmax(all_reduct) step = cos_values[i_max] * step + sin_values[i_max] * sd g = grad(step) reduct = all_reduct[i_max] if reduct > best_decrease: best_step, best_decrease = step, reduct return best_step
[docs] def conjugate_gradient_proj_steinmetz( fun: Callable[[np.ndarray], float], grad: Callable[[np.ndarray], np.ndarray], hess_prod: Callable[[np.ndarray, np.ndarray], np.ndarray], coords_mask: np.ndarray, n: int, deltas: np.ndarray, envelope_delta: float, maxiter: Optional[int] = None, **kwargs, ) -> np.ndarray: r""" Modified projected conjugate gradient method for a quadratic trust-region sub-problem with a structured trust-region. The feasible region is the intersection of :math:`q` cylinders .. math:: \mathcal{S} = \bigcap_{i=1}^{q} \left\{ s\in\mathbb{R}^{n} : \lVert s^{\mathcal{I}_i} \rVert_{2} \le \Delta_i \right\}. This function solves approximately .. math:: \min_{s\in\mathcal{S}}\quad f(s). Parameters ---------- fun : callable Overall surrogate model function :math:`f` as shown above. grad : callable Gradient :math:`\nabla f` of the model function: ``grad(x) -> ndarray, shape (n,)`` returns the gradient at ``x``. hess_prod : callable Product of the Hessian matrix :math:`\nabla^2 f(x)` with any vector :math:`v`: ``hess_prod(x, v) -> ndarray, shape (n,)`` returns the product :math:`\nabla^2 f(x) v`. coords_mask : ndarray, shape (q, n) Bool mask indicating variable indices :math:`\mathcal{I}_i` for each element. n : int Problem dimension deltas : ndarray Elemental trust-region radii :math:`\{\Delta_i\}_{i=1}^q` as shown above. envelope_delta : float A sufficiently large radius ensuring that the structured trust-region lies entirely within the ball centered at the origin with this radius. maxiter : int, optional Maximum number of iterations. Default is None, which sets it to :math:`\max(20, 3n)` in which :math:`2n` steps are reserved for projection steps. Returns ------- ndarray Approximate solution to the trust-region subproblem. Notes ----- This method uses Steinmetz projection (:func:`~upoqa.utils.projection.steinmetz_proj`) and combined projection (:func:`~upoqa.utils.projection.steinmetz_comb_proj`) to maintain feasibility with respect to the cylindrical constraints. """ s0 = np.zeros(n) g = grad(s0) # Set the initial iterate and the initial search direction. step, sd = np.zeros(n), -g fval0 = fun(s0) reduct = 0.0 proj_step, max_proj_step = 0, max(10, 2 * n) maxiter = max(10, n) if maxiter is None else int(maxiter) maxiter += max_proj_step best_step, best_decrease = ( step, 0.0, ) # objective value may not be monotonic if xform is enabled. for it in range(maxiter): # Stop the computations if sd is not a descent direction. g_sd = g @ sd if g_sd >= -10.0 * float_eps * n * max(1.0, LA.norm(g)): break hess_sd = hess_prod(step, sd) curv_sd = sd @ hess_sd try: # Set alpha_quad to the st+ep size for the minimization problem. if curv_sd > float_tiny * abs(g_sd): alpha_quad = max(-g_sd / curv_sd, 0.0) alpha_tr = _alpha_tr(step, sd, envelope_delta) else: alpha_quad = np.inf alpha_tr = _alpha_tr(step, sd, 1e3 * envelope_delta) except ZeroDivisionError: break alpha = min(alpha_quad, alpha_tr) # Stop the computations if the reduction in the objective function # provided by an unconstrained step is small. if -alpha * (g_sd + 0.5 * alpha * curv_sd) <= 1e-8 * reduct: break # Update the iterate. if alpha > 0.0: # Though Dykstra's projection method is an exact projection, # it shows limited experimental improvements and is slower. # skp1, did_project = dykstra_proj(step + alpha * sd, coords_mask, deltas) if it % 2 == 0: skp1, did_project = steinmetz_proj( step + alpha * sd, coords_mask, deltas ) else: skp1, did_project = steinmetz_comb_proj( step + alpha * sd, coords_mask, deltas ) if did_project: # line search along skp1 - step proj_step += 1 if proj_step > max_proj_step: break pj_sd = skp1 - step g_pj_sd = g @ pj_sd hess_pj_sd = hess_prod(step, pj_sd) curv_pj_sd = pj_sd @ hess_pj_sd if curv_pj_sd > float_tiny * abs(g_pj_sd): alpha_ls = min(1.0, max(-g_pj_sd / curv_pj_sd, 0.0)) else: alpha_ls = 1.0 if -alpha_ls * (g_pj_sd + 0.5 * alpha_ls * curv_pj_sd) <= 1e-8 * reduct: break skp1 = step + alpha_ls * pj_sd reduct = fval0 - fun(skp1) step = skp1 g = grad(step) if reduct > best_decrease: best_step, best_decrease = step, reduct else: break if did_project: sd = -g else: # The current iteration is a conjugate gradient iteration. Update # the search direction so that it is conjugate (with respect to H) # to all the previous search directions. beta = (g @ hess_sd) / curv_sd sd = beta * sd - g return best_step
def get_arrays_tol(*arrays): """ Get a relative tolerance for a set of arrays. Parameters ---------- *arrays: tuple Set of ndarray to get the tolerance for. Returns ------- float Relative tolerance for the set of arrays. Raises ------ ValueError If no array is provided. """ if len(arrays) == 0: raise ValueError("At least one array must be provided.") size = max(array.size for array in arrays) weight = max( np.max(np.abs(array[np.isfinite(array)]), initial=1.0) for array in arrays ) return 10.0 * float_eps * max(size, 1.0) * weight def _alpha_tr(step, sd, delta): step_sd = step.dot(sd) sd_sq = sd.dot(sd) dist_tr_sq = delta**2.0 - step.dot(step) temp = np.sqrt(max(step_sd**2.0 + sd_sq * dist_tr_sq, 0.0)) if step_sd <= 0.0 and sd_sq > float_tiny * abs(temp - step_sd): alpha_tr = max((temp - step_sd) / sd_sq, 0.0) elif abs(temp + step_sd) > float_tiny * dist_tr_sq: alpha_tr = max(dist_tr_sq / (temp + step_sd), 0.0) else: raise ZeroDivisionError return alpha_tr