diff -r aec67cdd6b14 -r efa60bc4f743 src/subproblem/l1squared_nonneg.rs
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/subproblem/l1squared_nonneg.rs	Thu Aug 29 00:00:00 2024 -0500
@@ -0,0 +1,425 @@
+/*!
+Iterative algorithms for solving the finite-dimensional subproblem with a nonnegativity constraint.
+*/
+
+use nalgebra::DVector;
+use numeric_literals::replace_float_literals;
+use itertools::izip;
+//use std::iter::zip;
+use std::cmp::Ordering::*;
+
+use alg_tools::iterate::{
+    AlgIteratorFactory,
+    AlgIteratorState,
+};
+use alg_tools::nalgebra_support::ToNalgebraRealField;
+use alg_tools::norms::{Dist, L1};
+use alg_tools::nanleast::NaNLeast;
+
+use crate::types::*;
+use super::{
+    InnerMethod,
+    InnerSettings
+};
+use super::nonneg::nonneg_soft_thresholding;
+use super::l1squared_unconstrained::l1squared_prox;
+
+/// Return the maximum of `dist` and the distance of the interval `[lb, ub]` to zero.
+#[replace_float_literals(F::cast_from(literal))]
+pub(super) fn max_interval_dist_to_zero<F : Float>(dist : F, lb : F, ub : F) -> F {
+    if lb < 0.0 {
+        if ub > 0.0 {
+            dist
+        } else {
+            dist.max(-ub)
+        }
+    } else /* lb ≥ 0.0 */ {
+        dist.max(lb)
+    }
+}
+
+/// Returns the ∞-norm minimal subdifferential of $x ↦ (β/2)|x-y|_1^2 - g^⊤ x + λ\|x\|₁ + δ_{≥ 0}(x)$
+/// at $x$.
+#[replace_float_literals(F::cast_from(literal))]
+fn min_subdifferential<F : Float + nalgebra::RealField>(
+    y : &DVector<F>,
+    x : &DVector<F>,
+    g : &DVector<F>,
+    λ : F,
+    β : F
+) -> F {
+    let mut val = 0.0;
+    let tmp = β*y.dist(x, L1);
+    for (&g_i, &x_i, y_i) in izip!(g.iter(), x.iter(), y.iter()) {
+        let (mut lb, mut ub) = (-g_i, -g_i);
+        match x_i.partial_cmp(y_i) {
+            Some(Greater) => { lb += tmp; ub += tmp },
+            Some(Less) => { lb -= tmp; ub -= tmp },
+            Some(Equal) => { lb -= tmp; ub += tmp },
+            None => {},
+        }
+        match x_i.partial_cmp(&0.0) {
+            Some(Greater) => { lb += λ; ub += λ },
+            // x_i < 0 should not happen due to the nonnegativity constraint
+            Some(Less|Equal) => { lb = F::NEG_INFINITY; ub += λ },
+            None => {},
+        };
+        val = max_interval_dist_to_zero(val, lb, ub);
+    }
+    val
+}
+
+/// Lower-bounded soft-thresholding: compute $prox_{λ|\,⋅\,| + δ_{≥ b}}(v)$.
+#[replace_float_literals(F::cast_from(literal))]
+fn lbd_soft_thresholding<F : Float>(v : F, λ : F, b : F) -> F
+{
+    match (b >= 0.0, v >= b) {
+        (true, false) => b,                      // v below the lower bound b ≥ 0
+        (true, true) => b.max(v - λ),            // soft-to-b from above
+        (false, true) => super::unconstrained::soft_thresholding(v, λ),
+        (false, false) => 0.0.min(b.max(v + λ)), // soft-to-0 with lower bound
+    }
+}
+
+/// Calculate $prox_f(x)$ for $f(x)=\frac{β}{2}\norm{x-y}_1^2 + δ_{≥ 0}(x)$.
+///
+/// To derive an algorithm for this, we can use
+/// $prox_f(x) = prox_{f_0}(x - y) + y$ for
+/// $f_0(z)=\frac{β}{2}\norm{z}_1^2 + δ_{≥ -y}(z)$.
+/// Now, the optimality conditions for $w = prox_{f_0}(x)$ are
+/// $$\tag{*}
+/// x ∈ w + β\norm{w}_1\sign w + N_{≥ -y}(w).
+/// $$
+/// If we know $\norm{w}_1$, then this is easily solved by lower-bounded soft-thresholding.
+/// We find this norm by sorting the elements by their distance to the “locked” lower-bounded
+/// soft-thresholding target ($0$ or $-y_i$).
+/// Then we loop over this sorted vector, increasing our estimate of $\norm{w}_1$ as we decide
+/// that the soft-thresholding parameter $β\norm{w}_1$ has to be large enough for the passed
+/// elements to reach their locked value (after which they cannot change anymore for a larger
+/// soft-thresholding parameter). This has to be done slightly more carefully to account
+/// for the case that $-y_i < 0$ and $x_i < -y_i$.
+///
+/// Indeed, denoting by $x'$ and $w'$ the subvectors of elements such that $w_i ≠ 0$ and
+/// $w_i > -y_i$, we can calculate, by applying $⟨\,⋅\,, \sign w'⟩$ to the corresponding lines
+/// of (*), that
+/// $$
+/// \norm{x'}_1 = \norm{w'}_1 + β m \norm{w}_1,
+/// $$
+/// where $m$ is the number of such elements.
+/// Having a value for $t = \norm{w}_1-\norm{w'}_1$, we can then calculate
+/// $$
+/// \norm{x'}_1 + t = (1+β m)\norm{w}_1,
+/// $$
+/// from which we can calculate the soft-thresholding parameter $λ=β\norm{w}_1$.
+/// Since we do not actually know the unlocked elements, but just loop over all the possibilities
+/// for them, we have to check that $λ$ is above the current lower bound for this parameter
+/// (`shift` in the code), and below a value that would cause changes in the locked set
+/// (`max_shift` in the code).
+#[replace_float_literals(F::cast_from(literal))]
+pub(super) fn l1squared_nonneg_prox<F : Float + nalgebra::RealField>(
+    sorted : &mut Vec<(F, F, F, Option<(F, F)>)>,
+    x : &mut DVector<F>,
+    y : &DVector<F>,
+    β : F
+) {
+    // Workaround for nalgebra's conflicting definitions of these methods.
+    //let max = alg_tools::NumTraitsFloat::max;
+    let abs = alg_tools::NumTraitsFloat::abs;
+
+    *x -= y;
+
+    for (az_x_i, &x_i, &y_i) in izip!(sorted.iter_mut(), x.iter(), y.iter()) {
+        // The first component of each az_x_i contains the distance of x_i to the
+        // soft-thresholding limit. If it is negative, the limit is always reached.
+        // The second component contains the absolute value of the result for that component
+        // w_i of the solution, if the soft-thresholding limit is reached.
+        // This is stored here due to the sorting, although it could otherwise be computed directly.
+        // Likewise, the third component contains the absolute value of x_i.
+        // The fourth component contains an initial lower bound.
+        let a_i = abs(x_i);
+        let b = -y_i;
+        *az_x_i = match (b >= 0.0, x_i >= b) {
+            (true, false) => (x_i-b, b, a_i, None),    // w_i=b, so sorting element negative!
+            (true, true) => (x_i-b, b, a_i, None),     // soft-to-b from above
+            (false, true) => (a_i, 0.0, a_i, None),    // soft-to-0
+            (false, false) => (a_i, 0.0, a_i, Some((b, b-x_i))), // soft-to-0 with initial limit
+        };
+    }
+    sorted.as_mut_slice()
+          .sort_unstable_by(|(a, _, _, _), (b, _, _, _)| NaNLeast(*a).cmp(&NaNLeast(*b)));
+
+    let mut nwlow = 0.0;
+    let mut shift = 0.0;
+    // This main loop is over different combinations of elements of the solution locked
+    // to the soft-thresholding target (`0` or `-y_i`), in the sorted order of locking.
+    for (i, az_x_i) in izip!(0.., sorted.iter()) {
+        // This 'attempt loop is over different combinations of elements locked to the
+        // lower bound (`-y_i ≤ 0`). It calculates `max_shift` as the maximum shift that
+        // can be done until the locking would change (or become non-strictly-complementary).
+        // If the main rule (*) gives an estimate of `λ` that stays below `max_shift`, it is
+        // accepted. Otherwise `shift` is updated to `max_shift`, and we attempt again,
+        // with the non-locking set that participates in the calculation of `λ` then including
+        // the elements that are no longer locked to the lower bound.
+        'attempt: loop {
+            let mut nwthis = 0.0;  // contribution to ‖w‖ from elements whose locking
+                                   // soft-thresholding parameter equals `shift`
+            let mut nxmore = 0.0;  // ‖x'‖ for the elements presumed not to be locked to
+                                   // either the soft-thresholding limit or the lower bound
+            let mut nwlbd = 0.0;   // contribution to ‖w‖ from the elements locked to their
+                                   // lower bound
+            let mut m = 0;
+            let mut max_shift = F::INFINITY; // maximal shift for which our estimate of the set of
+                                             // unlocked elements is valid
+            let mut max_shift_from_lbd = false; // whether max_shift comes from the next element
+                                                // or from a lower bound being reached
+            for az_x_j in sorted[i as usize..].iter() {
+                if az_x_j.0 <= shift {
+                    nwthis += az_x_j.1;
+                } else {
+                    match az_x_j.3 {
+                        Some((l, s)) if shift < s => {
+                            if max_shift > s {
+                                max_shift_from_lbd = true;
+                                max_shift = s;
+                            }
+                            nwlbd += -l;
+                        },
+                        _ => {
+                            nxmore += az_x_j.2;
+                            if m == 0 && max_shift > az_x_j.0 {
+                                max_shift = az_x_j.0;
+                                max_shift_from_lbd = false;
+                            }
+                            m += 1;
+                        }
+                    }
+                }
+            }
+
+            // We need ‖x'‖ = ‖w'‖ + β m ‖w‖, i.e., ‖x'‖ + (‖w‖-‖w'‖) = (1 + β m)‖w‖.
+            let tmp = β*(nxmore + (nwlow + nwthis + nwlbd))/(1.0 + β*F::cast_from(m));
+            if tmp > max_shift {
+                if max_shift_from_lbd {
+                    shift = max_shift;
+                    continue 'attempt;
+                } else {
+                    break 'attempt
+                }
+            } else if tmp < shift {
+                // TODO: this should probably be an assert!(false)
+                break 'attempt;
+            } else {
+                // success
+                x.zip_apply(y, |x_i, y_i| *x_i = y_i + lbd_soft_thresholding(*x_i, tmp, -y_i));
+                return
+            }
+        }
+        shift = az_x_i.0;
+        nwlow += az_x_i.1;
+    }
+    // TODO: this fallback has not been verified to be correct
+    x.zip_apply(y, |x_i, y_i| *x_i = y_i + lbd_soft_thresholding(*x_i, shift, -y_i));
+}
+
+/// Proximal point method implementation of [`l1squared_nonneg`].
+/// For detailed documentation of the inputs and outputs, refer to that function.
+///
+/// The `λ` component of the model is handled in the gradient step instead of the proximal step
+/// for potential performance improvements.
+#[replace_float_literals(F::cast_from(literal).to_nalgebra_mixed())]
+pub fn l1squared_nonneg_pp<F, I>(
+    y : &DVector<F::MixedType>,
+    g : &DVector<F::MixedType>,
+    λ_ : F,
+    β_ : F,
+    x : &mut DVector<F::MixedType>,
+    τ_ : F,
+    θ_ : F,
+    iterator : I
+) -> usize
+where F : Float + ToNalgebraRealField,
+      I : AlgIteratorFactory<F>
+{
+    let λ = λ_.to_nalgebra_mixed();
+    let β = β_.to_nalgebra_mixed();
+    let mut τ = τ_.to_nalgebra_mixed();
+    let θ = θ_.to_nalgebra_mixed();
+    let mut tmp = std::iter::repeat((0.0, 0.0, 0.0, None)).take(x.len()).collect();
+    let mut iters = 0;
+
+    iterator.iterate(|state| {
+        // Primal step: x^{k+1} = prox_{(τβ/2)|.-y|_1^2+δ_{≥0}}(x^k - τ(λ𝟙 - g))
+        x.apply(|x_i| *x_i -= τ*λ);
+        x.axpy(τ, g, 1.0);
+        l1squared_nonneg_prox(&mut tmp, x, y, τ*β);
+
+        iters += 1;
+        // This gives O(1/N^2) rates due to monotonicity of function values.
+        // Higher acceleration does not seem to be numerically stable.
+        τ += θ;
+
+        // This would give O(1/N^3) rates due to monotonicity of function values,
+        // but higher acceleration does not seem to be numerically stable.
+        //τ += F::cast_from(iters).to_nalgebra_mixed()*θ;
+
+        state.if_verbose(|| {
+            F::from_nalgebra_mixed(min_subdifferential(y, x, g, λ, β))
+        })
+    });
+
+    iters
+}
+
+/// PDPS implementation of [`l1squared_nonneg`].
+/// For detailed documentation of the inputs and outputs, refer to that function.
+///
+/// The `λ` component of the model is handled via the dual variable instead of the gradient step,
+/// for potential performance improvements.
+/// The parameter `θ` is used to rescale the operator (identity) of the PDPS model.
+#[replace_float_literals(F::cast_from(literal).to_nalgebra_mixed())]
+pub fn l1squared_nonneg_pdps<F, I>(
+    y : &DVector<F::MixedType>,
+    g : &DVector<F::MixedType>,
+    λ_ : F,
+    β_ : F,
+    x : &mut DVector<F::MixedType>,
+    τ_ : F,
+    σ_ : F,
+    θ_ : F,
+    iterator : I
+) -> usize
+where F : Float + ToNalgebraRealField,
+      I : AlgIteratorFactory<F>
+{
+    let λ = λ_.to_nalgebra_mixed();
+    let β = β_.to_nalgebra_mixed();
+    let τ = τ_.to_nalgebra_mixed();
+    let σ = σ_.to_nalgebra_mixed();
+    let θ = θ_.to_nalgebra_mixed();
+    let mut w = DVector::zeros(x.len());
+    let mut tmp = DVector::zeros(x.len());
+    let mut xprev = x.clone();
+    let mut iters = 0;
+
+    iterator.iterate(|state| {
+        // Primal step: x^{k+1} = prox_{(τβ/2)|.-y|_1^2}(x^k - τ(θ w^k - g))
+        x.axpy(-τ*θ, &w, 1.0);
+        x.axpy(τ, g, 1.0);
+        l1squared_prox(&mut tmp, x, y, τ*β);
+
+        // Dual step: w^{k+1} = proj_{(-∞,λ]}(w^k + σθ(2x^{k+1}-x^k))
+        w.axpy(2.0*σ*θ, x, 1.0);
+        w.axpy(-σ*θ, &xprev, 1.0);
+        w.apply(|w_i| *w_i = w_i.min(λ));
+        xprev.copy_from(x);
+
+        iters += 1;
+
+        state.if_verbose(|| {
+            F::from_nalgebra_mixed(min_subdifferential(y, x, g, λ, β))
+        })
+    });
+
+    iters
+}
+
+/// Alternative PDPS implementation of [`l1squared_nonneg`].
+/// For detailed documentation of the inputs and outputs, refer to that function.
+///
+/// The `λ` component of the model is handled in the proximal step instead of the gradient step
+/// for potential performance improvements.
+/// The parameter `θ` is used to rescale the operator (identity) of the PDPS model.
+#[replace_float_literals(F::cast_from(literal).to_nalgebra_mixed())]
+pub fn l1squared_nonneg_pdps_alt<F, I>(
+    y : &DVector<F::MixedType>,
+    g : &DVector<F::MixedType>,
+    λ_ : F,
+    β_ : F,
+    x : &mut DVector<F::MixedType>,
+    τ_ : F,
+    σ_ : F,
+    θ_ : F,
+    iterator : I
+) -> usize
+where F : Float + ToNalgebraRealField,
+      I : AlgIteratorFactory<F>
+{
+    let λ = λ_.to_nalgebra_mixed();
+    let τ = τ_.to_nalgebra_mixed();
+    let σ = σ_.to_nalgebra_mixed();
+    let θ = θ_.to_nalgebra_mixed();
+    let β = β_.to_nalgebra_mixed();
+    let βdivθ = β / θ;
+    let σθ = σ*θ;
+    let τθ = τ*θ;
+    let mut w = DVector::zeros(x.len());
+    let mut tmp = DVector::zeros(x.len());
+    let mut xprev = x.clone();
+    let mut iters = 0;
+
+    iterator.iterate(|state| {
+        // Primal step: x^{k+1} = nonnegsoft_τλ(x^k - τ(θ w^k - g))
+        x.axpy(-τθ, &w, 1.0);
+        x.axpy(τ, g, 1.0);
+        x.apply(|x_i| *x_i = nonneg_soft_thresholding(*x_i, τ*λ));
+
+        // Dual step: w^{k+1} = prox_{σ[(β/2)‖./θ-y‖₁²]^*}(q) for q = w^k + σθ(2x^{k+1}-x^k)
+        //                    = q - σ prox_{(β/(2σ))‖./θ-y‖₁²}(q/σ)
+        //                    = q - (σθ) prox_{(β/(2σθ^2))‖.-y‖₁²}(q/(σθ))
+        //                    = σθ(q/(σθ) - prox_{(β/(2σθ^2))‖.-y‖₁²}(q/(σθ)))
+        w /= σθ;
+        w.axpy(2.0, x, 1.0);
+        w.axpy(-1.0, &xprev, 1.0);
+        xprev.copy_from(&w); // use xprev as a temporary variable
+        l1squared_prox(&mut tmp, &mut xprev, y, βdivθ/σθ);
+        w -= &xprev;
+        w *= σθ;
+        xprev.copy_from(x);
+
+        iters += 1;
+
+        state.if_verbose(|| {
+            F::from_nalgebra_mixed(min_subdifferential(y, x, g, λ, β))
+        })
+    });
+
+    iters
+}
+
+
+/// This function applies an iterative method for the solution of the problem
+/// $$
+/// \min_{x ∈ ℝ^n} \frac{β}{2} |x-y|_1^2 - g^⊤ x + λ\|x\|₁ + δ_{≥ 0}(x).
+/// $$
+///
+/// This function returns the number of iterations taken.
+#[replace_float_literals(F::cast_from(literal))]
+pub fn l1squared_nonneg<F, I>(
+    y : &DVector<F::MixedType>,
+    g : &DVector<F::MixedType>,
+    λ : F,
+    β : F,
+    x : &mut DVector<F::MixedType>,
+    inner : &InnerSettings<F>,
+    iterator : I
+) -> usize
+where F : Float + ToNalgebraRealField,
+      I : AlgIteratorFactory<F>
+{
+    match inner.method {
+        InnerMethod::PDPS | InnerMethod::Default => {
+            let inner_θ = 0.001;
+            // Estimate of ‖K‖ for K = θ Id.
+            let normest = inner_θ;
+            let (inner_τ, inner_σ) = (inner.pdps_τσ0.0 / normest, inner.pdps_τσ0.1 / normest);
+            l1squared_nonneg_pdps_alt(y, g, λ, β, x, inner_τ, inner_σ, inner_θ, iterator)
+        },
+        InnerMethod::FB /*| InnerMethod::Default*/ => {
+            // The gradient ∇[x ↦ λ∑x - g^⊤ x] = λ𝟙 - g is constant, so its Lipschitz factor is
+            // zero and FB is just a proximal point method with no constraints on τ. We “accelerate”
+            // it by adding the constant θ to τ on each iteration. Exponential growth does not
+            // seem stable.
+            let inner_τ = inner.fb_τ0;
+            let inner_θ = inner_τ;
+            l1squared_nonneg_pp(y, g, λ, β, x, inner_τ, inner_θ, iterator)
+        },
+        _ => unimplemented!("{:?}", inner.method),
+    }
+}
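A minimal sketch of how the new prox routine could be exercised from the parent `subproblem` module (not part of the changeset; the scalar type `f64` and the sample values are illustrative assumptions only):

    use nalgebra::DVector;

    // Hypothetical call site in the parent module; `l1squared_nonneg_prox` is `pub(super)`.
    // It overwrites `x` with prox_{(β/2)‖·-y‖₁² + δ_{≥0}}(x); `sorted` is scratch space of
    // the same length as `x`, overwritten inside the call.
    let y = DVector::from_vec(vec![-1.0_f64, 0.0]);
    let mut x = DVector::from_vec(vec![1.0_f64, 3.0]);
    let mut sorted = vec![(0.0, 0.0, 0.0, None); x.len()];
    l1squared_nonneg::l1squared_nonneg_prox(&mut sorted, &mut x, &y, 1.0);
    // A short hand computation gives the minimiser (0, 1) for these values with β = 1,
    // so `x` should now hold that nonnegative point.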