src/subproblem/l1squared_nonneg.rs

changeset 52:f0e8704d3f0e
parent 42:6a7365d73e4c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/subproblem/l1squared_nonneg.rs	Mon Feb 17 13:54:53 2025 -0500
@@ -0,0 +1,437 @@
+/*!
+Iterative algorithms for solving the finite-dimensional subproblem with constraint.
+*/
+
+use nalgebra::DVector;
+use numeric_literals::replace_float_literals;
+use itertools::izip;
+//use std::iter::zip;
+use std::cmp::Ordering::*;
+
+use alg_tools::iterate::{
+    AlgIteratorFactory,
+    AlgIteratorState,
+};
+use alg_tools::nalgebra_support::ToNalgebraRealField;
+use alg_tools::norms::{Dist, L1};
+use alg_tools::nanleast::NaNLeast;
+
+use crate::types::*;
+use super::{
+    InnerMethod,
+    InnerSettings
+};
+use super::nonneg::nonneg_soft_thresholding;
+use super::l1squared_unconstrained::l1squared_prox;
+
+/// Return maximum of `dist` and the distance of the interval `[lb, ub]` to zero.
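+///
+/// A quick illustration (not compiled as a doctest, since this item is `pub(super)`):
+/// ```ignore
+/// // the interval [1.0, 2.0] lies at distance 1.0 from zero, exceeding dist = 0.5
+/// assert_eq!(max_interval_dist_to_zero(0.5, 1.0, 2.0), 1.0);
+/// // the interval [-1.0, 2.0] contains zero, so dist = 0.5 is returned unchanged
+/// assert_eq!(max_interval_dist_to_zero(0.5, -1.0, 2.0), 0.5);
+/// ```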
+#[replace_float_literals(F::cast_from(literal))]
+pub(super) fn max_interval_dist_to_zero<F : Float>(dist  : F, lb : F, ub : F) -> F {
+    if lb < 0.0 {
+        if ub > 0.0 {
+            dist
+        } else {
+            dist.max(-ub)
+        }
+    } else /* lb ≥ 0.0 */ {
+        dist.max(lb)
+    }
+}
+
+/// Returns the ∞-norm of the minimal-norm element of the subdifferential of
+/// $x ↦ (β/2)|x-y|_1^2 - g^⊤ x + λ\|x\|₁ + δ_{≥ 0}(x)$ at $x$.
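+///
+/// For intuition (an illustration, not used by the computation): for a coordinate with
+/// $x_i > y_i$ and $x_i > 0$ the per-coordinate subdifferential interval degenerates to the
+/// single point $-g_i + β\|x-y\|_1 + λ$, whereas for $x_i = 0$ the interval is unbounded
+/// below, so only its upper end can contribute to the distance from zero.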
+#[replace_float_literals(F::cast_from(literal))]
+fn min_subdifferential<F : Float + nalgebra::RealField>(
+    y : &DVector<F>,
+    x : &DVector<F>,
+    g : &DVector<F>,
+    λ : F,
+    β : F
+) -> F {
+    let mut val = 0.0;
+    let tmp = β*y.dist(x, L1);
+    for (&g_i, &x_i, y_i) in izip!(g.iter(), x.iter(), y.iter()) {
+        let (mut lb, mut ub) = (-g_i, -g_i);
+        match x_i.partial_cmp(y_i) {
+            Some(Greater) => { lb += tmp; ub += tmp },
+            Some(Less) => { lb -= tmp; ub -= tmp },
+            Some(Equal) => { lb -= tmp; ub += tmp },
+            None => {},
+        }
+        match x_i.partial_cmp(&0.0) {
+            Some(Greater) => { lb += λ; ub += λ },
+            // Less should not happen
+            Some(Less|Equal) => { lb = F::NEG_INFINITY; ub += λ },
+            None => {},
+        };
+        val = max_interval_dist_to_zero(val, lb, ub);
+    }
+    val
+}
+
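+/// Lower-bounded soft-thresholding helper used by [`l1squared_nonneg_prox`].
+///
+/// A couple of concrete values, read directly off the case analysis below (illustrative
+/// only, not a doctest):
+/// ```ignore
+/// assert_eq!(lbd_soft_thresholding(2.0, 0.5, 1.0), 1.5); // b ≥ 0, v ≥ b: shrink by λ, stop at b
+/// assert_eq!(lbd_soft_thresholding(0.3, 0.5, 1.0), 1.0); // b ≥ 0, v < b: clamp up to b
+/// ```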
+#[replace_float_literals(F::cast_from(literal))]
+fn lbd_soft_thresholding<F : Float>(v : F, λ : F, b : F) -> F
+{
+    match (b >= 0.0, v >= b) {
+        (true, false)  => b,
+        (true, true)   => b.max(v - λ),         // soft-to-b from above
+        (false, true)  => super::unconstrained::soft_thresholding(v, λ),
+        (false, false)  => 0.0.min(b.max(v - λ)), // soft-to-0 with lower bound
+    }
+}
+
+/// Calculate $prox_f(x)$ for $f(x)=\frac{β}{2}\norm{x-y}_1^2 + δ_{≥0}(x)$.
+///
+/// To derive an algorithm for this, we can use
+/// $prox_f(x) = y + prox_{f_0}(x - y)$ for
+/// $f_0(z)=\frac{β}{2}\norm{z}_1^2 + δ_{≥-y}(z)$.
+/// Now, the optimality conditions for $w = prox_{f_0}(x)$ are
+/// $$\tag{*}
+///     x ∈ w + β\norm{w}_1\sign w + N_{≥ -y}(w).
+/// $$
+/// If we know $\norm{w}_1$, then this is easily solved by lower-bounded soft-thresholding.
+/// We find this by sorting the elements by the distance to the 'locked' lower-bounded
+/// soft-thresholding target ($0$ or $-y_i$).
+/// Then we loop over this sorted vector, increasing our estimate of $\norm{w}_1$ as we decide
+/// that the soft-thresholding parameter $β\norm{w}_1$ has to be large enough for the passed
+/// elements to reach their locked value (after which they cannot change anymore, even for a
+/// larger soft-thresholding parameter). This has to be slightly more fine-grained to account
+/// for the case that $-y_i<0$ and $x_i < -y_i$.
+///
+/// Indeed, denoting by $x'$ and $w'$ the subvectors of elements such that $w_i ≠ 0$ and
+/// $w_i > -y_i$, we can calculate by applying $⟨\cdot, \sign w'⟩$ to the corresponding lines
+/// of (*) that
+/// $$
+///     \norm{x'} = \norm{w'} + β \norm{w}_1 m.
+/// $$
+/// where $m$ is the number of such elements. Having a value for $t = \norm{w}_1-\norm{w'}_1$,
+/// we can then calculate
+/// $$
+///     \norm{x'} - t = (1+β m)\norm{w}_1,
+/// $$
+/// from where we can calculate the soft-thresholding parameter $λ=β\norm{w}_1$.
+/// Since we do not actually know the unlocked elements, but just loop over all the possibilities
+/// for them, we have to check that $λ$ is above the current lower bound for this parameter
+/// (`shift` in the code), and below a value that would cause changes in the locked set
+/// (`max_shift` in the code).
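+///
+/// A minimal usage sketch (illustrative only, not a doctest, since this item is `pub(super)`);
+/// the `sorted` buffer only needs to have the correct length, as its contents are overwritten:
+/// ```ignore
+/// let y = DVector::from_vec(vec![1.0, -0.5, 2.0]);
+/// let mut x = DVector::from_vec(vec![0.3, 1.5, -1.0]);
+/// let mut sorted = vec![(0.0, 0.0, 0.0, None); x.len()];
+/// l1squared_nonneg_prox(&mut sorted, &mut x, &y, 0.1);
+/// // x now holds prox_f(x) for f = (0.1/2)‖·-y‖₁² + δ_{≥0}; in particular all entries are ≥ 0.
+/// ```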
+#[replace_float_literals(F::cast_from(literal))]
+pub(super) fn l1squared_nonneg_prox<F : Float + nalgebra::RealField>(
+    sorted : &mut Vec<(F, F, F, Option<(F, F)>)>,
+    x : &mut DVector<F>,
+    y : &DVector<F>,
+    β : F
+) {
+    // nalgebra double-definition bullshit workaround
+    //let max = alg_tools::NumTraitsFloat::max;
+    let abs = alg_tools::NumTraitsFloat::abs;
+
+    *x -= y;
+
+    for (az_x_i, &x_i, &y_i) in izip!(sorted.iter_mut(), x.iter(), y.iter()) {
+        // The first component of each az_x_i contains the distance of x_i to the
+        // soft-thresholding limit. If it is negative, it is always reached.
+        // The second component contains the absolute value of the result for that component
+        // w_i of the solution, if the soft-thresholding limit is reached.
+        // This is stored here due to the sorting, although it could otherwise be computed directly.
+        // Likewise the third component contains the absolute value of x_i.
+        // The fourth component contains an initial lower bound.
+        let a_i = abs(x_i);
+        let b = -y_i;
+        *az_x_i = match (b >= 0.0, x_i >= b) {
+            (true, false)  => (x_i-b, b, a_i, None),  // w_i=b, so sorting element negative!
+            (true, true)   => (x_i-b, b, a_i, None),  // soft-to-b from above
+            (false, true)  => (a_i, 0.0, a_i, None),  // soft-to-0
+            (false, false) => (a_i, 0.0, a_i, Some((b, b-x_i))),   // soft-to-0 with initial limit
+        };
+    }
+    sorted.as_mut_slice()
+          .sort_unstable_by(|(a, _, _, _), (b, _, _, _)| NaNLeast(*a).cmp(&NaNLeast(*b)));
+
+    let mut nwlow = 0.0;
+    let mut shift = 0.0;
+    // This main loop is over different combinations of elements of the solution locked
+    // to the soft-thresholding target (`0` or `-y_i`), in the sorted order of locking.
+    for (i, az_x_i) in izip!(0.., sorted.iter()) {
+        // This `'attempt` loop iterates over different combinations of elements locked to the
+        // lower bound (`-y_i ≤ 0`). It calculates `max_shift` as the maximum shift that
+        // can be done until the locking would change (or become non-strictly-complementary).
+        // If the main rule (*) gives an estimate of `λ` that stays below `max_shift`, it is
+        // accepted. Otherwise `shift` is updated to `max_shift`, and we attempt again,
+        // with the non-locking set that participates in the calculation of `λ` then including
+        // the elements that are no longer locked to the lower bound.
+        'attempt: loop {
+            let mut nwthis = 0.0; // contribution to ‖w‖ from elements whose locking
+                                  // soft-thresholding parameter equals `shift`
+            let mut nxmore = 0.0; // ‖x'‖ for those elements thought to be locked neither to
+                                  // the soft-thresholding limit nor to the lower bound
+            let mut nwlbd = 0.0;  // contribution to ‖w‖ from those elements locked to their
+                                  // lower bound
+            let mut m = 0;
+            let mut max_shift = F::INFINITY; // maximal shift for which our estimate of the set of
+                                             // unlocked elements is valid.
+            let mut max_shift_from_lbd = false; // Whether max_shift comes from the next element
+                                                // or from a lower bound being reached.
+            for az_x_j in sorted[i as usize..].iter() {
+                if az_x_j.0 <= shift {
+                    nwthis += az_x_j.1;
+                } else {
+                    match az_x_j.3 {
+                        Some((l, s)) if shift < s => {
+                            if max_shift > s {
+                                max_shift_from_lbd = true;
+                                max_shift = s;
+                            }
+                            nwlbd += -l;
+                        },
+                        _ => {
+                            nxmore += az_x_j.2;
+                            if m == 0 && max_shift > az_x_j.0 {
+                                max_shift = az_x_j.0;
+                                max_shift_from_lbd = false;
+                            }
+                            m += 1;
+                        }
+                    }
+                }
+            }
+
+            // We need ‖x'‖ = ‖w'‖ + β m ‖w‖, i.e. ‖x'‖ - (‖w‖-‖w'‖)= (1 + β m)‖w‖.
+            let tmp = β*(nxmore - (nwlow + nwthis + nwlbd))/(1.0 + β*F::cast_from(m));
+            if tmp > max_shift {
+                if max_shift_from_lbd {
+                    shift = max_shift;
+                    continue 'attempt;
+                } else {
+                    break 'attempt
+                }
+            } else if tmp < shift {
+                // TODO: this should probably be an assert!(false)
+                break 'attempt;
+            } else {
+                // success
+                x.zip_apply(y, |x_i, y_i| *x_i = y_i + lbd_soft_thresholding(*x_i, tmp, -y_i));
+                return
+            }
+        }
+        shift = az_x_i.0;
+        nwlow += az_x_i.1;
+    }
+    // TODO: this fallback has not been verified to be correct
+    x.zip_apply(y, |x_i, y_i| *x_i = y_i + lbd_soft_thresholding(*x_i, shift, -y_i));
+}
+
+/// Proximal point method implementation of [`l1squared_nonneg`].
+/// For detailed documentation of the inputs and outputs, see that function.
+///
+/// The `λ` component of the model is handled in the gradient step together with `g`, since
+/// on the nonnegativity constraint set $λ\|x\|₁ = λ𝟙^⊤x$ is linear; the proximal step handles
+/// the remaining $\frac{β}{2}|x-y|_1^2 + δ_{≥0}$ term.
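+///
+/// Concretely, one iteration performs (matching the inline comments below)
+/// <div>$$
+///     x^{k+1} = prox_{(τ_kβ/2)|\,\cdot\,-y|_1^2 + δ_{≥0}}\bigl(x^k - τ_k(λ𝟙 - g)\bigr),
+///     \qquad τ_{k+1} = τ_k + θ.
+/// $$</div>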
+#[replace_float_literals(F::cast_from(literal).to_nalgebra_mixed())]
+pub fn l1squared_nonneg_pp<F, I>(
+    y : &DVector<F::MixedType>,
+    g : &DVector<F::MixedType>,
+    λ_ : F,
+    β_ : F,
+    x : &mut DVector<F::MixedType>,
+    τ_ : F,
+    θ_ : F,
+    iterator : I
+) -> usize
+where F : Float + ToNalgebraRealField,
+      I : AlgIteratorFactory<F>
+{
+    let λ = λ_.to_nalgebra_mixed();
+    let β = β_.to_nalgebra_mixed();
+    let mut τ = τ_.to_nalgebra_mixed();
+    let θ = θ_.to_nalgebra_mixed();
+    let mut tmp = std::iter::repeat((0.0, 0.0, 0.0, None)).take(x.len()).collect();
+    let mut iters = 0;
+
+    iterator.iterate(|state| {
+        // Primal step: x^{k+1} = prox_{(τβ/2)|.-y|_1^2+δ_{≥0}}(x^k - τ(λ𝟙-g))
+        x.apply(|x_i| *x_i -= τ*λ);
+        x.axpy(τ, g, 1.0);
+        l1squared_nonneg_prox(&mut tmp, x, y, τ*β);
+        
+        iters += 1;
+        // This gives O(1/N^2) rates due to monotonicity of function values.
+        τ += θ;
+
+        // The following alternative would give O(1/N^3) rates due to monotonicity of function
+        // values, but this higher acceleration does not seem to be numerically stable:
+        //τ += F::cast_from(iters).to_nalgebra_mixed()*θ;
+
+        state.if_verbose(|| {
+            F::from_nalgebra_mixed(min_subdifferential(y, x, g, λ, β))
+        })
+    });
+
+    iters
+}
+
+/// PDPS implementation of [`l1squared_nonneg`].
+/// For detailed documentation of the inputs and outputs, see that function.
+///
+/// The `λ` component of the model is handled in the dual proximal step (a projection onto
+/// $(-∞, λ]$) instead of the gradient step, for potential performance improvements.
+/// The parameter `θ` is used to rescale the operator (identity) of the PDPS model.
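+///
+/// One iteration of the method reads (matching the inline comments below)
+/// <div>$$
+///     x^{k+1} = prox_{(τβ/2)|\,\cdot\,-y|_1^2}\bigl(x^k - τ(θ w^k - g)\bigr),
+///     \qquad
+///     w^{k+1} = proj_{(-∞,λ]}\bigl(w^k + σθ(2x^{k+1}-x^k)\bigr),
+/// $$</div>
+/// with the projection applied componentwise.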
+#[replace_float_literals(F::cast_from(literal).to_nalgebra_mixed())]
+pub fn l1squared_nonneg_pdps<F, I>(
+    y : &DVector<F::MixedType>,
+    g : &DVector<F::MixedType>,
+    λ_ : F,
+    β_ : F,
+    x : &mut DVector<F::MixedType>,
+    τ_ : F,
+    σ_ : F,
+    θ_ : F,
+    iterator : I
+) -> usize
+where F : Float + ToNalgebraRealField,
+      I : AlgIteratorFactory<F>
+{
+    let λ = λ_.to_nalgebra_mixed();
+    let β = β_.to_nalgebra_mixed();
+    let τ = τ_.to_nalgebra_mixed();
+    let σ = σ_.to_nalgebra_mixed();
+    let θ = θ_.to_nalgebra_mixed();
+    let mut w = DVector::zeros(x.len());
+    let mut tmp = DVector::zeros(x.len());
+    let mut xprev = x.clone();
+    let mut iters = 0;
+
+    iterator.iterate(|state| {
+        // Primal step: x^{k+1} = prox_{(τβ/2)|.-y|_1^2}(x^k - τ(θ w^k - g))
+        x.axpy(-τ*θ, &w, 1.0);
+        x.axpy(τ, g, 1.0);
+        l1squared_prox(&mut tmp, x, y, τ*β);
+        
+        // Dual step: w^{k+1} = proj_{(-∞,λ]}(w^k + σθ(2x^{k+1}-x^k))
+        w.axpy(2.0*σ*θ, x, 1.0);
+        w.axpy(-σ*θ, &xprev, 1.0);
+        w.apply(|w_i| *w_i = w_i.min(λ));
+        xprev.copy_from(x);
+        
+        iters +=1;
+
+        state.if_verbose(|| {
+            F::from_nalgebra_mixed(min_subdifferential(y, x, g, λ, β))
+        })
+    });
+
+    iters
+}
+
+/// Alternative PDPS implementation of [`l1squared_nonneg`].
+/// For detailed documentation of the inputs and outputs, see that function.
+///
+/// By not dualising the 1-norm, this should produce sparser solutions than
+/// [`l1squared_nonneg_pdps`].
+///
+/// The `λ` component of the model is handled in the proximal step instead of the gradient step
+/// for potential performance improvements.
+/// The parameter `θ` is used to rescale the operator (identity) of the PDPS model.
+/// We rewrite
+/// <div>$$
+///     \begin{split}
+///     & \min_{x ∈ ℝ^n} \frac{β}{2} |x-y|_1^2 - g^⊤ x + λ\|x\|₁ + δ_{≥ 0}(x) \\
+///     & = \min_{x ∈ ℝ^n} \max_{w} ⟨θ w, x⟩ - g^⊤ x + λ\|x\|₁ + δ_{≥ 0}(x)
+///      - \left(x ↦ \frac{β}{2θ} |x-y|_1^2 \right)^*(w).
+///     \end{split}
+/// $$</div>
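+///
+/// With this splitting, one iteration reads (matching the inline comments below)
+/// <div>$$
+///     x^{k+1} = prox_{τλ\|\cdot\|_1 + δ_{≥0}}\bigl(x^k - τ(θ w^k - g)\bigr),
+///     \qquad
+///     w^{k+1} = prox_{σ g^*}\bigl(w^k + σ(2x^{k+1}-x^k)\bigr),
+///     \quad g(x) = \frac{β}{2θ}\|x-y\|_1^2,
+/// $$</div>
+/// where the dual proximal step is evaluated through Moreau's identity.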
+#[replace_float_literals(F::cast_from(literal).to_nalgebra_mixed())]
+pub fn l1squared_nonneg_pdps_alt<F, I>(
+    y : &DVector<F::MixedType>,
+    g : &DVector<F::MixedType>,
+    λ_ : F,
+    β_ : F,
+    x : &mut DVector<F::MixedType>,
+    τ_ : F,
+    σ_ : F,
+    θ_ : F,
+    iterator : I
+) -> usize
+where F : Float + ToNalgebraRealField,
+      I : AlgIteratorFactory<F>
+{
+    let λ = λ_.to_nalgebra_mixed();
+    let τ = τ_.to_nalgebra_mixed();
+    let σ = σ_.to_nalgebra_mixed();
+    let θ = θ_.to_nalgebra_mixed();
+    let β = β_.to_nalgebra_mixed();
+    let σθ = σ*θ;
+    let τθ = τ*θ;
+    let mut w = DVector::zeros(x.len());
+    let mut tmp = DVector::zeros(x.len());
+    let mut xprev = x.clone();
+    let mut iters = 0;
+
+    iterator.iterate(|state| {
+        // Primal step: x^{k+1} = nonnegsoft_τλ(x^k - τ(θ w^k -g))
+        x.axpy(-τθ, &w, 1.0);
+        x.axpy(τ, g, 1.0);
+        x.apply(|x_i| *x_i = nonneg_soft_thresholding(*x_i, τ*λ));
+        
+        // Dual step: with g(x) = (β/(2θ))‖x-y‖₁² and q = w^k + σ(2x^{k+1}-x^k),
+        //                    we compute, using Moreau's identity,
+        //                    w^{k+1} = prox_{σg^*}(q) = q - σ prox_{g/σ}(q/σ)
+        //                    = q - σ prox_{(β/(2θσ))‖.-y‖₁²}(q/σ)
+        //                    = σ(q/σ - prox_{(β/(2θσ))‖.-y‖₁²}(q/σ))
+        // where q/σ = w^k/σ + (2x^{k+1}-x^k),
+        w /= σ;
+        w.axpy(2.0, x, 1.0);
+        w.axpy(-1.0, &xprev, 1.0);
+        xprev.copy_from(&w); // use xprev as temporary variable
+        l1squared_prox(&mut tmp, &mut xprev, y, β/σθ);
+        w -= &xprev;
+        w *= σ;
+        xprev.copy_from(x);
+        
+        iters += 1;
+
+        state.if_verbose(|| {
+            F::from_nalgebra_mixed(min_subdifferential(y, x, g, λ, β))
+        })
+    });
+
+    iters
+}
+
+
+/// This function applies an iterative method for the solution of the problem
+/// <div>$$
+///     \min_{x ∈ ℝ^n} \frac{β}{2} |x-y|_1^2 - g^⊤ x + λ\|x\|₁ + δ_{≥ 0}(x).
+/// $$</div>
+///
+/// This function returns the number of iterations taken.
+#[replace_float_literals(F::cast_from(literal))]
+pub fn l1squared_nonneg<F, I>(
+    y : &DVector<F::MixedType>,
+    g : &DVector<F::MixedType>,
+    λ : F,
+    β : F,
+    x : &mut DVector<F::MixedType>,
+    inner : &InnerSettings<F>,
+    iterator : I
+) -> usize
+where F : Float + ToNalgebraRealField,
+      I : AlgIteratorFactory<F>
+{
+    match inner.method {
+        InnerMethod::PDPS => {
+            let inner_θ = 1.0;
+            // Estimate of ‖K‖ for K=θ\Id.
+            let normest = inner_θ;
+            let (inner_τ, inner_σ) = (inner.pdps_τσ0.0 / normest, inner.pdps_τσ0.1 / normest);
+            l1squared_nonneg_pdps_alt(y, g, λ, β, x, inner_τ, inner_σ, inner_θ, iterator)
+        },
+        InnerMethod::FB => {
+            // The smooth part x ↦ λ∑x - g^⊤x of the objective has the constant gradient
+            // λ𝟙 - g, so its Lipschitz factor is zero and FB is just a proximal point method
+            // with no constraints on τ. We “accelerate” it by adding to τ the constant θ
+            // on each iteration. Exponential growth does not seem stable.
+            let inner_τ = inner.fb_τ0;
+            let inner_θ = inner_τ;
+            l1squared_nonneg_pp(y, g, λ, β, x, inner_τ, inner_θ, iterator)
+        },
+        other => unimplemented!("{other:?} is unimplemented"),
+    }
+}
