Thu, 23 Jan 2025 23:34:05 +0100
Merging adjustments, parameter tuning, etc.
/*! Iterative algorithms for solving the finite-dimensional subproblem with constraint. */

use nalgebra::DVector;
use numeric_literals::replace_float_literals;
use itertools::izip;
//use std::iter::zip;
use std::cmp::Ordering::*;

use alg_tools::iterate::{
    AlgIteratorFactory,
    AlgIteratorState,
};
use alg_tools::nalgebra_support::ToNalgebraRealField;
use alg_tools::norms::{Dist, L1};
use alg_tools::nanleast::NaNLeast;

use crate::types::*;
use super::{InnerMethod, InnerSettings};
use super::nonneg::nonneg_soft_thresholding;
use super::l1squared_unconstrained::l1squared_prox;

/// Return the maximum of `dist` and the distance of the interval `[lb, ub]` to zero.
#[replace_float_literals(F::cast_from(literal))]
pub(super) fn max_interval_dist_to_zero<F : Float>(dist : F, lb : F, ub : F) -> F {
    if lb < 0.0 {
        if ub > 0.0 {
            dist
        } else {
            dist.max(-ub)
        }
    } else /* lb ≥ 0.0 */ {
        dist.max(lb)
    }
}

/// Returns the ∞-norm minimal subdifferential of
/// $x ↦ (β/2)|x-y|_1^2 - g^⊤ x + λ\|x\|₁ + δ_{≥ 0}(x)$ at $x$.
#[replace_float_literals(F::cast_from(literal))]
fn min_subdifferential<F : Float + nalgebra::RealField>(
    y : &DVector<F>,
    x : &DVector<F>,
    g : &DVector<F>,
    λ : F,
    β : F
) -> F {
    let mut val = 0.0;
    let tmp = β*y.dist(x, L1);
    for (&g_i, &x_i, y_i) in izip!(g.iter(), x.iter(), y.iter()) {
        let (mut lb, mut ub) = (-g_i, -g_i);
        match x_i.partial_cmp(y_i) {
            Some(Greater) => { lb += tmp; ub += tmp },
            Some(Less) => { lb -= tmp; ub -= tmp },
            Some(Equal) => { lb -= tmp; ub += tmp },
            None => {},
        }
        match x_i.partial_cmp(&0.0) {
            Some(Greater) => { lb += λ; ub += λ },
            // Less should not happen
            Some(Less|Equal) => { lb = F::NEG_INFINITY; ub += λ },
            None => {},
        };
        val = max_interval_dist_to_zero(val, lb, ub);
    }
    val
}

#[replace_float_literals(F::cast_from(literal))]
fn lbd_soft_thresholding<F : Float>(v : F, λ : F, b : F) -> F {
    match (b >= 0.0, v >= b) {
        (true, false)  => b,
        (true, true)   => b.max(v - λ),   // soft-to-b from above
        (false, true)  => super::unconstrained::soft_thresholding(v, λ),
        (false, false) => b.max(0.0.min(v + λ)), // soft-to-0 from below, with lower bound b
    }
}

/// Calculate $prox_f(x)$ for $f(x)=\frac{β}{2}\norm{x-y}_1^2 + δ_{≥0}(x)$.
///
/// To derive an algorithm for this, we can use
/// $prox_f(x) = y + prox_{f_0}(x - y)$ for
/// $f_0(z)=\frac{β}{2}\norm{z}_1^2 + δ_{≥-y}(z)$.
/// Now, the optimality conditions for $w = prox_{f_0}(x)$ are
/// $$\tag{*}
///     x ∈ w + β\norm{w}_1\sign w + N_{≥ -y}(w).
/// $$
/// If we know $\norm{w}_1$, then this is easily solved by lower-bounded soft-thresholding.
/// We find this by sorting the elements by the distance to the 'locked' lower-bounded
/// soft-thresholding target ($0$ or $-y_i$).
/// Then we loop over this sorted vector, increasing our estimate of $\norm{w}_1$ as we decide
/// that the soft-thresholding parameter $β\norm{w}_1$ has to be such that the passed elements
/// will reach their locked value (after which they cannot change any more for a larger
/// soft-thresholding parameter). This has to be slightly more fine-grained to account
/// for the case that $-y_i<0$ and $x_i < -y_i$.
///
/// Indeed, denoting by x' and w' the subsets of elements such that w_i ≠ 0 and w_i > -y_i,
/// we can calculate, by applying $⟨\cdot, \sign w'⟩$ to the corresponding lines of (*), that
/// $$
///     \norm{x'} = \norm{w'} + β \norm{w}_1 m,
/// $$
/// where $m$ is the number of such elements. Having a value for $t = \norm{w}-\norm{w'}$,
/// we can then calculate
/// $$
///     \norm{x'} + t = (1+β m)\norm{w}_1,
/// $$
/// from which we can calculate the soft-thresholding parameter $λ=β\norm{w}_1$.
/// Since we do not actually know the unlocked elements, but just loop over all the possibilities
/// for them, we have to check that $λ$ is above the current lower bound for this parameter
/// (`shift` in the code), and below a value that would cause changes in the locked set
/// (`max_shift` in the code).
#[replace_float_literals(F::cast_from(literal))]
pub(super) fn l1squared_nonneg_prox<F : Float + nalgebra::RealField>(
    sorted : &mut Vec<(F, F, F, Option<(F, F)>)>,
    x : &mut DVector<F>,
    y : &DVector<F>,
    β : F
) {
    // nalgebra double-definition bullshit workaround
    //let max = alg_tools::NumTraitsFloat::max;
    let abs = alg_tools::NumTraitsFloat::abs;

    *x -= y;

    for (az_x_i, &x_i, &y_i) in izip!(sorted.iter_mut(), x.iter(), y.iter()) {
        // The first component of each az_x_i contains the distance of x_i to the
        // soft-thresholding limit. If it is negative, it is always reached.
        // The second component contains the absolute value of the result for that component
        // w_i of the solution, if the soft-thresholding limit is reached.
        // This is stored here due to the sorting, although it could otherwise be computed directly.
        // Likewise, the third component contains the absolute value of x_i.
        // The fourth component contains an initial lower bound.
        let a_i = abs(x_i);
        let b = -y_i;
        *az_x_i = match (b >= 0.0, x_i >= b) {
            (true, false)  => (x_i-b, b, a_i, None),   // w_i=b, so sorting element negative!
            (true, true)   => (x_i-b, b, a_i, None),   // soft-to-b from above
            (false, true)  => (a_i, 0.0, a_i, None),   // soft-to-0
            (false, false) => (a_i, 0.0, a_i, Some((b, b-x_i))), // soft-to-0 with initial limit
        };
    }
    sorted.as_mut_slice()
          .sort_unstable_by(|(a, _, _, _), (b, _, _, _)| NaNLeast(*a).cmp(&NaNLeast(*b)));

    let mut nwlow = 0.0;
    let mut shift = 0.0;
    // This main loop is over different combinations of elements of the solution locked
    // to the soft-thresholding lower bound (`0` or `-y_i`), in the sorted order of locking.
    for (i, az_x_i) in izip!(0.., sorted.iter()) {
        // This 'attempt loop is over different combinations of elements locked to the
        // lower bound (`-y_i ≤ 0`). It calculates `max_shift` as the maximum shift that
        // can be done until the locking would change (or become non-strictly-complementary).
        // If the main rule (*) gives an estimate of `λ` that stays below `max_shift`, it is
        // accepted. Otherwise `shift` is updated to `max_shift`, and we attempt again,
        // with the non-locking set that participates in the calculation of `λ` then including
        // the elements that are no longer locked to the lower bound.
        'attempt: loop {
            let mut nwthis = 0.0; // contribution to ‖w‖ from elements whose locking
                                  // soft-thresholding parameter is at most `shift`
            let mut nxmore = 0.0; // ‖x'‖ for those elements thought not to be locked to
                                  // either the soft-thresholding limit or the lower bound
            let mut nwlbd = 0.0;  // contribution to ‖w‖ from those elements locked to their
                                  // lower bound
            let mut m = 0;
            let mut max_shift = F::INFINITY; // maximal shift for which our estimate of the set
                                             // of unlocked elements is valid
            let mut max_shift_from_lbd = false; // whether max_shift comes from the next element
                                                // or from a lower bound being reached
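            // The scan below classifies the remaining (sorted) elements for the current
            // estimate `shift` of the soft-thresholding parameter: elements whose limit
            // distance is at most `shift` are taken as locked to their limit value
            // (contributing to `nwthis`), elements still held at a negative lower bound
            // `-y_i` contribute to `nwlbd`, and the `m` elements considered unlocked
            // accumulate |x_i| into `nxmore`. `max_shift` records the smallest parameter
            // value at which this classification would change.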
            for az_x_j in sorted[i as usize..].iter() {
                if az_x_j.0 <= shift {
                    nwthis += az_x_j.1;
                } else {
                    match az_x_j.3 {
                        Some((l, s)) if shift < s => {
                            if max_shift > s {
                                max_shift_from_lbd = true;
                                max_shift = s;
                            }
                            nwlbd += -l;
                        },
                        _ => {
                            nxmore += az_x_j.2;
                            if m == 0 && max_shift > az_x_j.0 {
                                max_shift = az_x_j.0;
                                max_shift_from_lbd = false;
                            }
                            m += 1;
                        }
                    }
                }
            }

            // We need ‖x'‖ = ‖w'‖ + β m ‖w‖, i.e. ‖x'‖ + (‖w‖-‖w'‖) = (1 + β m)‖w‖.
            let tmp = β*(nxmore + (nwlow + nwthis + nwlbd))/(1.0 + β*F::cast_from(m));
            if tmp > max_shift {
                if max_shift_from_lbd {
                    shift = max_shift;
                    continue 'attempt;
                } else {
                    break 'attempt
                }
            } else if tmp < shift {
                // TODO: this should probably be an assert!(false)
                break 'attempt;
            } else {
                // success
                x.zip_apply(y, |x_i, y_i| *x_i = y_i + lbd_soft_thresholding(*x_i, tmp, -y_i));
                return
            }
        }
        shift = az_x_i.0;
        nwlow += az_x_i.1;
    }
    // TODO: this fallback has not been verified to be correct
    x.zip_apply(y, |x_i, y_i| *x_i = y_i + lbd_soft_thresholding(*x_i, shift, -y_i));
}

/// Proximal point method implementation of [`l1squared_nonneg`].
/// For detailed documentation of the inputs and outputs, refer to there.
///
/// The `λ` component of the model is handled in the forward (gradient) step, while the
/// `β` term and the nonnegativity constraint are handled in the proximal step.
#[replace_float_literals(F::cast_from(literal).to_nalgebra_mixed())]
pub fn l1squared_nonneg_pp<F, I>(
    y : &DVector<F::MixedType>,
    g : &DVector<F::MixedType>,
    λ_ : F,
    β_ : F,
    x : &mut DVector<F::MixedType>,
    τ_ : F,
    θ_ : F,
    iterator : I
) -> usize
where F : Float + ToNalgebraRealField,
      I : AlgIteratorFactory<F>
{
    let λ = λ_.to_nalgebra_mixed();
    let β = β_.to_nalgebra_mixed();
    let mut τ = τ_.to_nalgebra_mixed();
    let θ = θ_.to_nalgebra_mixed();
    let mut tmp = std::iter::repeat((0.0, 0.0, 0.0, None)).take(x.len()).collect();
    let mut iters = 0;

    iterator.iterate(|state| {
        // Primal step: x^{k+1} = prox_{(τβ/2)|.-y|_1^2 + δ_{≥0}}(x^k - τ(λ𝟙 - g))
        x.apply(|x_i| *x_i -= τ*λ);
        x.axpy(τ, g, 1.0);
        l1squared_nonneg_prox(&mut tmp, x, y, τ*β);

        iters += 1;

        // This gives O(1/N^2) rates due to monotonicity of function values.
        // Higher acceleration does not seem to be numerically stable.
        τ += θ;

        // This would give O(1/N^3) rates due to monotonicity of function values,
        // but higher acceleration does not seem to be numerically stable.
        //τ += F::cast_from(iters).to_nalgebra_mixed()*θ;

        state.if_verbose(|| {
            F::from_nalgebra_mixed(min_subdifferential(y, x, g, λ, β))
        })
    });

    iters
}

/// PDPS implementation of [`l1squared_nonneg`].
/// For detailed documentation of the inputs and outputs, refer to there.
///
/// The `λ` component of the model is handled in the proximal step instead of the gradient step
/// for potential performance improvements.
/// The parameter `θ` is used to rescale the (identity) operator of the PDPS model.
#[replace_float_literals(F::cast_from(literal).to_nalgebra_mixed())]
pub fn l1squared_nonneg_pdps<F, I>(
    y : &DVector<F::MixedType>,
    g : &DVector<F::MixedType>,
    λ_ : F,
    β_ : F,
    x : &mut DVector<F::MixedType>,
    τ_ : F,
    σ_ : F,
    θ_ : F,
    iterator : I
) -> usize
where F : Float + ToNalgebraRealField,
      I : AlgIteratorFactory<F>
{
    let λ = λ_.to_nalgebra_mixed();
    let β = β_.to_nalgebra_mixed();
    let τ = τ_.to_nalgebra_mixed();
    let σ = σ_.to_nalgebra_mixed();
    let θ = θ_.to_nalgebra_mixed();
    let mut w = DVector::zeros(x.len());
    let mut tmp = DVector::zeros(x.len());
    let mut xprev = x.clone();
    let mut iters = 0;

    iterator.iterate(|state| {
        // Primal step: x^{k+1} = prox_{(τβ/2)|.-y|_1^2}(x^k - τ(θ w^k - g))
        x.axpy(-τ*θ, &w, 1.0);
        x.axpy(τ, g, 1.0);
        l1squared_prox(&mut tmp, x, y, τ*β);

        // Dual step: w^{k+1} = proj_{(-∞,λ]}(w^k + σθ(2x^{k+1}-x^k))
        w.axpy(2.0*σ*θ, x, 1.0);
        w.axpy(-σ*θ, &xprev, 1.0);
        w.apply(|w_i| *w_i = w_i.min(λ));
        xprev.copy_from(x);

        iters += 1;

        state.if_verbose(|| {
            F::from_nalgebra_mixed(min_subdifferential(y, x, g, λ, β))
        })
    });

    iters
}

/// Alternative PDPS implementation of [`l1squared_nonneg`].
/// For detailed documentation of the inputs and outputs, refer to there.
///
/// The `λ` component of the model is handled in the proximal step instead of the gradient step
/// for potential performance improvements.
/// The parameter `θ` is used to rescale the (identity) operator of the PDPS model.
#[replace_float_literals(F::cast_from(literal).to_nalgebra_mixed())]
pub fn l1squared_nonneg_pdps_alt<F, I>(
    y : &DVector<F::MixedType>,
    g : &DVector<F::MixedType>,
    λ_ : F,
    β_ : F,
    x : &mut DVector<F::MixedType>,
    τ_ : F,
    σ_ : F,
    θ_ : F,
    iterator : I
) -> usize
where F : Float + ToNalgebraRealField,
      I : AlgIteratorFactory<F>
{
    let λ = λ_.to_nalgebra_mixed();
    let τ = τ_.to_nalgebra_mixed();
    let σ = σ_.to_nalgebra_mixed();
    let θ = θ_.to_nalgebra_mixed();
    let β = β_.to_nalgebra_mixed();
    let βdivθ = β / θ;
    let σθ = σ*θ;
    let τθ = τ*θ;
    let mut w = DVector::zeros(x.len());
    let mut tmp = DVector::zeros(x.len());
    let mut xprev = x.clone();
    let mut iters = 0;

    iterator.iterate(|state| {
        // Primal step: x^{k+1} = nonnegsoft_τλ(x^k - τ(θ w^k - g))
        x.axpy(-τθ, &w, 1.0);
        x.axpy(τ, g, 1.0);
        x.apply(|x_i| *x_i = nonneg_soft_thresholding(*x_i, τ*λ));

        // Dual step: w^{k+1} = prox_{σ[(β/2)‖./θ-y‖₁²]^*}(q) for q = w^k + σθ(2x^{k+1}-x^k)
        //                    = q - σ prox_{(β/(2σ))‖./θ-y‖₁²}(q/σ)
        //                    = q - (σθ) prox_{(β/(2σθ^2))‖.-y‖₁²}(q/(σθ))
        //                    = σθ(q/(σθ) - prox_{(β/(2σθ^2))‖.-y‖₁²}(q/(σθ)))
        w /= σθ;
        w.axpy(2.0, x, 1.0);
        w.axpy(-1.0, &xprev, 1.0);
        xprev.copy_from(&w); // use xprev as a temporary variable
        l1squared_prox(&mut tmp, &mut xprev, y, βdivθ/σθ);
        w -= &xprev;
        w *= σθ;
        xprev.copy_from(x);

        iters += 1;

        state.if_verbose(|| {
            F::from_nalgebra_mixed(min_subdifferential(y, x, g, λ, β))
        })
    });

    iters
}

/// This function applies an iterative method for the solution of the problem
/// <div>$$
///     \min_{x ∈ ℝ^n} \frac{β}{2} |x-y|_1^2 - g^⊤ x + λ\|x\|₁ + δ_{≥ 0}(x).
/// $$</div>
///
/// This function returns the number of iterations taken.
#[replace_float_literals(F::cast_from(literal))]
pub fn l1squared_nonneg<F, I>(
    y : &DVector<F::MixedType>,
    g : &DVector<F::MixedType>,
    λ : F,
    β : F,
    x : &mut DVector<F::MixedType>,
    inner : &InnerSettings<F>,
    iterator : I
) -> usize
where F : Float + ToNalgebraRealField,
      I : AlgIteratorFactory<F>
{
    match inner.method {
        InnerMethod::PDPS => {
            let inner_θ = 0.001;
            // Estimate of ‖K‖ for K = θ Id.
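            // Dividing both base step lengths by this estimate keeps the product
            // τσ‖K‖² = pdps_τσ0.0 · pdps_τσ0.1 independent of the value of θ,
            // which is what the standard PDPS step length condition constrains.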
            let normest = inner_θ;
            let (inner_τ, inner_σ) = (inner.pdps_τσ0.0 / normest, inner.pdps_τσ0.1 / normest);
            l1squared_nonneg_pdps_alt(y, g, λ, β, x, inner_τ, inner_σ, inner_θ, iterator)
        },
        InnerMethod::FB => {
            // The Lipschitz factor of the constant gradient ∇[x ↦ λ∑x - g^⊤ x] = λ𝟙 - g is
            // zero, so FB is just a proximal point method with no constraints on τ.
            // We “accelerate” it by adding to τ the constant θ on each iteration.
            // Exponential growth does not seem stable.
            let inner_τ = inner.fb_τ0;
            let inner_θ = inner_τ;
            l1squared_nonneg_pp(y, g, λ, β, x, inner_τ, inner_θ, iterator)
        },
        other => unimplemented!("{other:?} is unimplemented"),
    }
}
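
// Minimal sanity checks (a sketch): these assume that `f64` implements this crate's
// `Float` trait and that `super::unconstrained::soft_thresholding` is the usual
// shrinkage operator. Only the scalar helpers defined above are exercised.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn lbd_soft_thresholding_cases() {
        // b ≥ 0: the result is soft-thresholded from above, but never drops below b.
        assert_eq!(lbd_soft_thresholding(3.0_f64, 1.0, 0.5), 2.0);
        assert_eq!(lbd_soft_thresholding(0.2_f64, 1.0, 0.5), 0.5);
        // b < 0, v ≥ b: plain soft thresholding towards zero.
        assert_eq!(lbd_soft_thresholding(2.0_f64, 1.0, -1.0), 1.0);
        // b < 0, v < b: soft thresholding towards zero, clamped below by b.
        assert_eq!(lbd_soft_thresholding(-3.0_f64, 1.0, -1.0), -1.0);
    }

    #[test]
    fn interval_dist_to_zero() {
        // An interval containing zero contributes nothing beyond `dist`.
        assert_eq!(max_interval_dist_to_zero(0.1_f64, -1.0, 1.0), 0.1);
        // Otherwise the distance of the interval to zero may dominate.
        assert_eq!(max_interval_dist_to_zero(0.1_f64, 0.5, 1.0), 0.5);
        assert_eq!(max_interval_dist_to_zero(0.1_f64, -1.0, -0.5), 0.5);
    }
}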