//! rustdf/cluster/candidates.rs
//!
//! Candidate enumeration, MS1↔MS2 assignment, and pseudo-spectrum building
//! for DIA data.

1use std::collections::HashMap;
2use std::io;
3use std::path::Path;
4use std::sync::Arc;
5use rayon::prelude::*;
6use crate::cluster::cluster::ClusterResult1D;
7use crate::cluster::feature::SimpleFeature;
8use crate::cluster::io::load_parquet;
9use crate::cluster::peak::ThresholdMode;
10use crate::cluster::pseudo::{PseudoSpecOpts, PseudoSpectrum, build_pseudo_spectra_from_pairs, cluster_mz_mu, PseudoFragment};
11use crate::cluster::scoring::{assign_ms2_to_best_ms1_by_xic, jaccard_time, ms1_to_ms2_map, query_precursor_scored, query_precursors_scored_par, MatchScoreMode, PrecursorLike, PrecursorSearchIndex, ScoredHit, XicScoreOpts};
12use crate::cluster::utility::robust_noise_mad;
13use crate::data::dia::{DiaIndex, TimsDatasetDIA};
14// ---------------------------------------------------------------------------
15// Assignment mode abstraction (geom vs XIC)
16// ---------------------------------------------------------------------------
17
/// Strategy for deciding which MS1 precursor each MS2 fragment belongs to.
pub enum AssignmentMode<'a> {
    /// Geometric scoring (width / Jaccard / apex / overlap) driven by [`ScoreOpts`].
    Geometric(&'a ScoreOpts),
    /// XIC-based scoring (RT/IM traces + intensity ratio) driven by [`XicScoreOpts`].
    Xic(&'a XicScoreOpts),
}
22
/// Geometric scoring (width / Jaccard / apex / overlap etc.).
///
/// Thin wrapper delegating to `crate::cluster::scoring::best_ms1_for_each_ms2`.
/// Returns, for each MS2 index, the winning MS1 index (or `None` if the MS2
/// cluster had no eligible candidate in `pairs`).
pub fn best_ms1_for_each_ms2_geom(
    ms1: &[ClusterResult1D],
    ms2: &[ClusterResult1D],
    pairs: &[(usize, usize)],
    opts: &ScoreOpts,
) -> Vec<Option<usize>> {
    crate::cluster::scoring::best_ms1_for_each_ms2(ms1, ms2, pairs, opts)
}
32
33/// XIC-based assignment: score via RT/IM traces + intensity ratio,
34/// choose best MS1 per MS2, return Vec<Option<ms1_idx>> indexed by ms2_idx.
35pub fn best_ms1_for_each_ms2_xic(
36    ms1: &[ClusterResult1D],
37    ms2: &[ClusterResult1D],
38    pairs: &[(usize, usize)],
39    opts: &XicScoreOpts,
40) -> Vec<Option<usize>> {
41    let triples = assign_ms2_to_best_ms1_by_xic(ms1, ms2, pairs, opts);
42    let mut best: Vec<Option<usize>> = vec![None; ms2.len()];
43
44    for (ms2_idx, ms1_idx, _s) in triples {
45        if ms2_idx < best.len() {
46            best[ms2_idx] = Some(ms1_idx);
47        }
48    }
49    best
50}
51
52/// Switch between geometric and XIC assignment.
53pub fn best_ms1_for_each_ms2_any(
54    ms1: &[ClusterResult1D],
55    ms2: &[ClusterResult1D],
56    pairs: &[(usize, usize)],
57    mode: AssignmentMode<'_>,
58) -> Vec<Option<usize>> {
59    match mode {
60        AssignmentMode::Geometric(opts) => {
61            best_ms1_for_each_ms2_geom(ms1, ms2, pairs, opts)
62        }
63        AssignmentMode::Xic(opts) => {
64            best_ms1_for_each_ms2_xic(ms1, ms2, pairs, opts)
65        }
66    }
67}
68
69// ---------------------------------------------------------------------------
70// Candidate enumeration knobs
71// ---------------------------------------------------------------------------
72
/// Options for the simple candidate enumeration.
/// Rule = RT overlap (seconds) AND group eligibility (mz ∩ isolation AND scans ∩ program).
#[derive(Clone, Debug)]
pub struct CandidateOpts {
    /// Require at least this Jaccard in RT (set 0.0 for "any overlap").
    pub min_rt_jaccard: f32,
    /// Guard pad on MS2 time bounds (applied symmetrically), in seconds.
    pub ms2_rt_guard_sec: f64,
    /// RT bucket width (seconds).
    pub rt_bucket_width: f64,
    /// Pre-filters to drop weird clusters: maximum allowed RT span in
    /// seconds for MS1 / MS2 clusters respectively (None disables).
    pub max_ms1_rt_span_sec: Option<f64>,
    pub max_ms2_rt_span_sec: Option<f64>,
    /// Minimum intensity sum for a cluster to be considered.
    /// Use ThresholdMode::AdaptiveNoise(N) for N × noise (recommended),
    /// or ThresholdMode::Fixed(val) for a hard threshold.
    pub min_raw_sum: ThresholdMode,

    // ---- tight guards ----
    /// Maximum allowed |rt_apex_MS1 - rt_apex_MS2| in seconds (None disables).
    pub max_rt_apex_delta_sec: Option<f32>,
    /// Maximum allowed |im_apex_MS1 - im_apex_MS2| in global scans (None disables).
    pub max_scan_apex_delta: Option<usize>,
    /// Require at least this many scan-overlap between MS1 and MS2 IM windows.
    pub min_im_overlap_scans: usize,

    /// If true, drop pairs where the fragment cluster's **own** selection
    /// (mz, scan) lies inside the same DIA program slice that could select
    /// the precursor. This suppresses residual, unfragmented precursor
    /// intensity being treated as a fragment.
    pub reject_frag_inside_precursor_tile: bool,
}
105
impl Default for CandidateOpts {
    /// Recommended defaults: tight apex guards plus a 3×noise adaptive
    /// intensity floor. See [`CandidateOpts::legacy_defaults`] for the old
    /// fixed-threshold behavior.
    fn default() -> Self {
        Self {
            min_rt_jaccard: 0.0,    // any RT overlap qualifies
            ms2_rt_guard_sec: 0.0,  // no extra padding on MS2 time bounds
            rt_bucket_width: 1.0,
            max_ms1_rt_span_sec: Some(60.0),
            max_ms2_rt_span_sec: Some(60.0),
            min_raw_sum: ThresholdMode::AdaptiveNoise(3.0),

            max_rt_apex_delta_sec: Some(2.0),
            max_scan_apex_delta: Some(6),
            min_im_overlap_scans: 1,

            reject_frag_inside_precursor_tile: false,
        }
    }
}
124
125impl CandidateOpts {
126    /// Create with a fixed raw_sum threshold (legacy behavior).
127    pub fn with_fixed_raw_sum(mut self, val: f32) -> Self {
128        self.min_raw_sum = ThresholdMode::Fixed(val);
129        self
130    }
131
132    /// Create with adaptive noise-based raw_sum threshold.
133    pub fn with_adaptive_raw_sum(mut self, sigma_multiplier: f32) -> Self {
134        self.min_raw_sum = ThresholdMode::AdaptiveNoise(sigma_multiplier);
135        self
136    }
137
138    /// Legacy defaults (exact old behavior with min_raw_sum: 1.0).
139    pub fn legacy_defaults() -> Self {
140        Self {
141            min_raw_sum: ThresholdMode::Fixed(1.0),
142            ..Default::default()
143        }
144    }
145}
146
147// ---------------------------------------------------------------------------
148// Scoring model for geometric assignment
149// ---------------------------------------------------------------------------
150
/// Compact feature bundle per MS1–MS2 pair, kept for traceability of the
/// geometric score. Produced by `build_features`.
#[derive(Clone, Copy, Debug)]
pub struct PairFeatures {
    pub jacc_rt: f32,            // Jaccard in RT over ±3σ intervals (0..1)
    pub rt_apex_delta_s: f32,    // |μ_rt(MS1)-μ_rt(MS2)| in seconds
    pub im_apex_delta_scans: f32,// |μ_im(MS1)-μ_im(MS2)| in scans
    pub im_overlap_scans: u32,   // intersection size of IM windows in scans
    pub im_union_scans: u32,     // union size of IM windows in scans (floored at 1)
    pub ms1_raw_sum: f32,        // intensity proxy for MS1
    pub shape_ok: bool,          // both σ present & finite, shape score usable
    pub z_rt: f32,               // pooled-σ z for RT apex delta
    pub z_im: f32,               // pooled-σ z for IM apex delta
    pub s_shape: f32,            // exp(-0.5 (w_rt z_rt^2 + w_im z_im^2)) in [0,1]
}
165
/// Scoring knobs. Defaults are conservative and width-aware but won’t punish
/// pairs that lack good fits (we use `shape_neutral` when shape data is missing).
#[derive(Clone, Debug)]
pub struct ScoreOpts {
    /// Weight for RT Jaccard.
    pub w_jacc_rt: f32,
    /// Weight for shape similarity S_shape.
    pub w_shape: f32,
    /// Weight for RT apex proximity term (smaller delta = better).
    pub w_rt_apex: f32,
    /// Weight for IM apex proximity term (smaller delta = better).
    pub w_im_apex: f32,
    /// Weight for IM overlap ratio.
    pub w_im_overlap: f32,
    /// Weight for MS1 raw_sum (log-compressed via `safe_log1p`).
    pub w_ms1_intensity: f32,

    /// Scales to normalize apex deltas into ~0..1 decays (exp(-delta/scale)),
    /// see `exp_decay`.
    pub rt_apex_scale_s: f32,
    pub im_apex_scale_scans: f32,

    /// If shape is unavailable, use this neutral value instead of 0.
    pub shape_neutral: f32,

    /// Floors for σ to avoid division by ~0 when computing z-scores.
    pub min_sigma_rt: f32,
    pub min_sigma_im: f32,

    /// Shape component internal weights (multiply z^2 inside exp).
    pub w_shape_rt_inner: f32,
    pub w_shape_im_inner: f32,
}
198
impl Default for ScoreOpts {
    /// Conservative defaults: RT Jaccard and shape dominate, apex proximity
    /// contributes moderately, MS1 intensity is a weak tie-breaker.
    fn default() -> Self {
        Self {
            w_jacc_rt: 1.0,
            w_shape: 1.0,
            w_rt_apex: 0.75,
            w_im_apex: 0.75,
            w_im_overlap: 0.5,
            w_ms1_intensity: 0.25,
            rt_apex_scale_s: 0.75,       // ~sub-second deltas favored
            im_apex_scale_scans: 3.0,    // a few scans favored
            shape_neutral: 0.6,          // don’t punish missing shape harshly
            min_sigma_rt: 0.05,
            min_sigma_im: 0.5,
            w_shape_rt_inner: 1.0,
            w_shape_im_inner: 1.0,
        }
    }
}
218
/// Build the geometric `PairFeatures` for one candidate MS1–MS2 pair.
///
/// RT/IM bounds come exclusively from the Gaussian fits (`rt_fit`, `im_fit`)
/// and the clusters' IM windows; `frame_ids_used` is not consulted here.
#[inline]
pub(crate) fn build_features(ms1: &ClusterResult1D, ms2: &ClusterResult1D, opts: &ScoreOpts) -> PairFeatures {
    // RT Jaccard over ±3σ intervals around each cluster's fitted RT apex.
    let (rt1_lo, rt1_hi) = (
        ms1.rt_fit.mu as f64 - (ms1.rt_fit.sigma as f64) * 3.0,
        ms1.rt_fit.mu as f64 + (ms1.rt_fit.sigma as f64) * 3.0,
    );
    let (rt2_lo, rt2_hi) = (
        ms2.rt_fit.mu as f64 - (ms2.rt_fit.sigma as f64) * 3.0,
        ms2.rt_fit.mu as f64 + (ms2.rt_fit.sigma as f64) * 3.0,
    );
    let jacc_rt = jaccard_time(rt1_lo, rt1_hi, rt2_lo, rt2_hi).clamp(0.0, 1.0);

    // Apex deltas (RT in seconds, IM in scan units).
    let rt_apex_delta_s = (ms1.rt_fit.mu - ms2.rt_fit.mu).abs();
    let im_apex_delta_scans = (ms1.im_fit.mu - ms2.im_fit.mu).abs();

    // IM overlap ratio inputs (intersection / union in scans).
    let (im_inter, im_union) = im_overlap_and_union(ms1.im_window, ms2.im_window);

    // Shape similarity using pooled σ in each dimension (only if both finite).
    // σ values are floored so degenerate fits cannot explode the z-scores.
    let s1_rt = ms1.rt_fit.sigma.max(opts.min_sigma_rt);
    let s2_rt = ms2.rt_fit.sigma.max(opts.min_sigma_rt);
    let s1_im = ms1.im_fit.sigma.max(opts.min_sigma_im);
    let s2_im = ms2.im_fit.sigma.max(opts.min_sigma_im);

    let (mut shape_ok, mut z_rt, mut z_im, mut s_shape) = (false, 0.0, 0.0, 0.0);
    if let (Some(sig_rt), Some(sig_im)) = (pooled_sigma(s1_rt, s2_rt), pooled_sigma(s1_im, s2_im)) {
        if sig_rt.is_finite() && sig_rt > 0.0 && sig_im.is_finite() && sig_im > 0.0 {
            z_rt = rt_apex_delta_s / sig_rt;
            z_im = im_apex_delta_scans / sig_im;
            let q = -0.5_f32 * (opts.w_shape_rt_inner * z_rt * z_rt
                + opts.w_shape_im_inner * z_im * z_im);
            s_shape = q.exp();         // Gaussian kernel, ∈ (0,1]
            shape_ok = s_shape.is_finite();
        }
    }

    PairFeatures {
        jacc_rt,
        rt_apex_delta_s,
        im_apex_delta_scans,
        im_overlap_scans: im_inter,
        im_union_scans: im_union,
        ms1_raw_sum: ms1.raw_sum,
        shape_ok,
        z_rt,
        z_im,
        s_shape,
    }
}
270
/// Intersection and union sizes (in scans) of two inclusive IM windows.
///
/// Inverted windows (hi < lo) count as empty. The union is floored at 1 so
/// callers can divide by it without a zero check.
#[inline]
fn im_overlap_and_union(a: (usize, usize), b: (usize, usize)) -> (u32, u32) {
    // Length of an inclusive [lo, hi] range; empty/inverted ranges are 0.
    fn span(lo: usize, hi: usize) -> u32 {
        if hi >= lo { (hi - lo + 1) as u32 } else { 0 }
    }
    let inter = span(a.0.max(b.0), a.1.min(b.1));
    let union = span(a.0, a.1) + span(b.0, b.1) - inter;
    (inter, union.max(1))
}
281
/// Pooled standard deviation sqrt(s1² + s2²), or `None` when the pooled
/// variance is non-finite or non-positive.
#[inline]
fn pooled_sigma(s1: f32, s2: f32) -> Option<f32> {
    let var = s1 * s1 + s2 * s2;
    (var.is_finite() && var > 0.0).then(|| var.sqrt())
}
287
/// Exponential proximity score: 1.0 at `delta == 0`, decaying smoothly
/// toward 0 as `delta` grows. Non-finite inputs or a non-positive scale
/// yield 0.0 (worst score) rather than propagating NaN/inf.
#[inline]
pub(crate) fn exp_decay(delta: f32, scale: f32) -> f32 {
    let usable = delta.is_finite() && scale.is_finite() && scale > 0.0;
    if usable { (-delta / scale).exp() } else { 0.0 }
}
294
/// Saturating log1p: `ln(1 + x)` for finite `x >= 0`, else 0.0.
///
/// Uses `f64::ln_1p` instead of the hand-rolled `(1.0 + x).ln()`: `ln_1p`
/// is the purpose-built primitive for this computation and stays accurate
/// for very small `x`, where forming `1.0 + x` first loses low-order bits.
#[inline]
pub(crate) fn safe_log1p(x: f32) -> f32 {
    if x.is_finite() && x >= 0.0 {
        f64::from(x).ln_1p() as f32
    } else {
        0.0
    }
}
299
// ---------------------------------------------------------------------------
// End-to-end pseudo-spectra builders
// ---------------------------------------------------------------------------
307
/// Outcome of the MS2 → MS1 assignment step.
#[derive(Clone, Debug)]
pub struct AssignmentResult {
    /// All enumerated `(ms2_idx, ms1_idx)` pairs after the hard guards.
    pub pairs: Vec<(usize, usize)>,
    /// For each MS2 j, the chosen MS1 index (or None if no candidate).
    ///
    /// NOTE: in non-competitive "all pairs" mode, this is intentionally
    /// left as `None` for all entries, because there is no unique best.
    pub ms2_best_ms1: Vec<Option<usize>>,
    /// For each MS1 i, the list of MS2 indices assigned to it.
    ///
    /// In "all pairs" mode this is simply the inverted candidate list, so
    /// each MS2 may appear in multiple MS1 buckets.
    pub ms1_to_ms2: Vec<Vec<usize>>,
}
323
/// Full result of the DIA → pseudo-DDA pipeline.
#[derive(Clone, Debug)]
pub struct PseudoBuildResult {
    /// Assignment information (pairs, best MS1 per MS2, inverted map).
    pub assignment: AssignmentResult,
    /// Final pseudo-MS/MS spectra, one per assigned precursor.
    pub pseudo_spectra: Vec<PseudoSpectrum>,
}
332
333/// NON-COMPETITIVE builder: mainly for debugging / exploration.
334///
335/// Links *all* MS1–MS2 pairs that
336///   - are program-legal (same window group, isolation & scans),
337///   - satisfy candidate guards (RT/IM overlap etc.),
338/// and then groups them into pseudo-spectra without any competition.
339///
340/// CONSEQUENCES:
341///   - An MS2 cluster may contribute to **multiple** precursors.
342///   - `assignment.ms2_best_ms1` is left as `None` for all MS2 indices.
343///   - `assignment.ms1_to_ms2[i]` contains **all** MS2 indices linked to MS1 i.
344///   - Use `build_pseudo_spectra_end_to_end{,_xic}` for competitive, 1:1 assignment.
345pub fn build_pseudo_spectra_all_pairs(
346    ds: &TimsDatasetDIA,
347    ms1: &[ClusterResult1D],
348    ms2: &[ClusterResult1D],
349    features: Option<&[SimpleFeature]>,
350    pseudo_opts: &PseudoSpecOpts,
351) -> PseudoBuildResult {
352    // Hard-coded "wide" candidate guards: any reasonable RT/IM overlap is allowed.
353    let cand_opts = CandidateOpts {
354        min_rt_jaccard: 0.0,
355        ms2_rt_guard_sec: 0.0,
356        rt_bucket_width: 1.0,
357        max_ms1_rt_span_sec: None,
358        max_ms2_rt_span_sec: None,
359        min_raw_sum: ThresholdMode::Fixed(1.0), // Fixed threshold for debugging/exploration
360
361        max_rt_apex_delta_sec: None,
362        max_scan_apex_delta:   None,
363        min_im_overlap_scans:  1,
364        reject_frag_inside_precursor_tile: true,
365    };
366
367    // 1) Enumerate all program-legal candidates with hard guards.
368    let idx = PrecursorSearchIndex::build(ds, ms1, &cand_opts);
369    let pairs = idx.enumerate_pairs(ms1, ms2, &cand_opts); // Vec<(ms2_idx, ms1_idx)>
370
371    // trivial fast-path: nothing links to anything
372    if pairs.is_empty() {
373        return PseudoBuildResult {
374            assignment: AssignmentResult {
375                pairs,
376                ms2_best_ms1: vec![None; ms2.len()],
377                ms1_to_ms2: vec![Vec::new(); ms1.len()],
378            },
379            pseudo_spectra: Vec::new(),
380        };
381    }
382
383    // 2) Build pseudo spectra from *all* pairs (no competition).
384    let empty_feats: &[SimpleFeature] = &[];
385    let feat_slice = features.unwrap_or(empty_feats);
386
387    let pseudo_spectra = build_pseudo_spectra_from_pairs(
388        ms1,
389        ms2,
390        feat_slice,
391        &pairs,
392        pseudo_opts,
393    );
394
395    // 3) Build a non-competitive assignment view.
396    //
397    // - ms2_best_ms1: no unique best in all-pairs mode → all None.
398    // - ms1_to_ms2: for each MS1 i, list all MS2 indices linked to it.
399    let mut ms1_to_ms2: Vec<Vec<usize>> = vec![Vec::new(); ms1.len()];
400    for (ms2_idx, ms1_idx) in &pairs {
401        if *ms1_idx < ms1_to_ms2.len() {
402            ms1_to_ms2[*ms1_idx].push(*ms2_idx);
403        }
404    }
405
406    let ms2_best_ms1 = vec![None; ms2.len()];
407
408    let assignment = AssignmentResult {
409        pairs,
410        ms2_best_ms1,
411        ms1_to_ms2,
412    };
413
414    PseudoBuildResult {
415        assignment,
416        pseudo_spectra,
417    }
418}
419
420/// End-to-end, **geometric** competitive builder:
421///   DIA index + MS1 clusters + MS2 clusters (+ optional features)
422///   → candidates → geometric scoring/assignment → pseudo-MS/MS spectra.
423///
424/// Properties:
425///   - Each MS2 cluster participates in **at most one** precursor.
426///   - Physical program legality (group, tiles, apex-in-tile) is enforced
427///     inside `PrecursorSearchIndex::enumerate_pairs(...)`.
428pub fn build_pseudo_spectra_end_to_end(
429    ds: &TimsDatasetDIA,
430    ms1: &[ClusterResult1D],
431    ms2: &[ClusterResult1D],
432    features: Option<&[SimpleFeature]>,
433    cand_opts: &CandidateOpts,
434    score_opts: &ScoreOpts,
435    pseudo_opts: &PseudoSpecOpts,
436) -> PseudoBuildResult {
437    // 1) Enumerate all program-legal candidates with hard guards.
438    let idx = PrecursorSearchIndex::build(ds, ms1, cand_opts);
439    let pairs = idx.enumerate_pairs(ms1, ms2, cand_opts);  // Vec<(ms2_idx, ms1_idx)>
440
441    // trivial fast-path
442    if pairs.is_empty() {
443        return PseudoBuildResult {
444            assignment: AssignmentResult {
445                pairs,
446                ms2_best_ms1: vec![None; ms2.len()],
447                ms1_to_ms2: vec![Vec::new(); ms1.len()],
448            },
449            pseudo_spectra: Vec::new(),
450        };
451    }
452
453    // 2) Score & choose best MS1 per MS2 (this is the **competition** step).
454    let ms2_best_ms1 = best_ms1_for_each_ms2_geom(
455        ms1,
456        ms2,
457        &pairs,
458        score_opts,
459    );
460    let ms1_to_ms2 = ms1_to_ms2_map(
461        ms1.len(),
462        &ms2_best_ms1,
463    );
464
465    let assignment = AssignmentResult {
466        pairs,
467        ms2_best_ms1: ms2_best_ms1.clone(),
468        ms1_to_ms2: ms1_to_ms2.clone(),
469    };
470
471    // 3) Turn the winner map into "winning pairs" (one MS2 → 0 or 1 MS1).
472    let mut winning_pairs: Vec<(usize, usize)> = Vec::new();
473    for (ms1_idx, js) in ms1_to_ms2.iter().enumerate() {
474        for &ms2_idx in js {
475            winning_pairs.push((ms2_idx, ms1_idx));
476        }
477    }
478
479    // Optional sanity check in debug builds: each MS2 appears at most once.
480    debug_assert!({
481        use std::collections::HashSet;
482        let mut seen = HashSet::new();
483        for (ms2_idx, _) in &winning_pairs {
484            if !seen.insert(ms2_idx) {
485                // duplicate MS2 assignment = bug in best_ms1_for_each_ms2_geom
486                panic!("Duplicated ms2");
487            }
488        }
489        true
490    });
491
492    // 4) Build pseudo spectra from those winning links; grouping by
493    //    (precursor, window_group) happens in build_pseudo_spectra_from_pairs.
494    let empty_feats: &[SimpleFeature] = &[];
495    let feat_slice = features.unwrap_or(empty_feats);
496
497    let pseudo_spectra = build_pseudo_spectra_from_pairs(
498        ms1,
499        ms2,
500        feat_slice,
501        &winning_pairs,
502        pseudo_opts,
503    );
504
505    PseudoBuildResult {
506        assignment,
507        pseudo_spectra,
508    }
509}
510
511/// End-to-end, **XIC-based** competitive builder:
512///   DIA index + MS1 clusters + MS2 clusters (+ optional features)
513///   → candidates → XIC scoring/assignment → pseudo-MS/MS spectra.
514///
515/// Properties:
516///   - Each MS2 cluster participates in **at most one** precursor.
517///   - Physical program legality (group, tiles, apex-in-tile) is enforced
518///     inside `PrecursorSearchIndex::enumerate_pairs(...)`.
519pub fn build_pseudo_spectra_end_to_end_xic(
520    ds: &TimsDatasetDIA,
521    ms1: &[ClusterResult1D],
522    ms2: &[ClusterResult1D],
523    features: Option<&[SimpleFeature]>,
524    cand_opts: &CandidateOpts,
525    xic_opts: &XicScoreOpts,
526    pseudo_opts: &PseudoSpecOpts,
527) -> PseudoBuildResult {
528    // 1) Enumerate all program-legal candidates with hard guards.
529    let idx = PrecursorSearchIndex::build(ds, ms1, cand_opts);
530    let pairs = idx.enumerate_pairs(ms1, ms2, cand_opts);  // Vec<(ms2_idx, ms1_idx)>
531
532    // trivial fast-path
533    if pairs.is_empty() {
534        return PseudoBuildResult {
535            assignment: AssignmentResult {
536                pairs,
537                ms2_best_ms1: vec![None; ms2.len()],
538                ms1_to_ms2: vec![Vec::new(); ms1.len()],
539            },
540            pseudo_spectra: Vec::new(),
541        };
542    }
543
544    // 2) XIC scoring & choose best MS1 per MS2 (this is the **competition** step).
545    let ms2_best_ms1 = best_ms1_for_each_ms2_xic(
546        ms1,
547        ms2,
548        &pairs,
549        xic_opts,
550    );
551    let ms1_to_ms2 = ms1_to_ms2_map(
552        ms1.len(),
553        &ms2_best_ms1,
554    );
555
556    let assignment = AssignmentResult {
557        pairs,
558        ms2_best_ms1: ms2_best_ms1.clone(),
559        ms1_to_ms2: ms1_to_ms2.clone(),
560    };
561
562    // 3) Turn the winner map into "winning pairs" (one MS2 → 0 or 1 MS1).
563    let mut winning_pairs: Vec<(usize, usize)> = Vec::new();
564    for (ms1_idx, js) in ms1_to_ms2.iter().enumerate() {
565        for &ms2_idx in js {
566            winning_pairs.push((ms2_idx, ms1_idx));
567        }
568    }
569
570    // Optional sanity check in debug builds: each MS2 appears at most once.
571    debug_assert!({
572        use std::collections::HashSet;
573        let mut seen = HashSet::new();
574        for (ms2_idx, _) in &winning_pairs {
575            if !seen.insert(ms2_idx) {
576                // duplicate MS2 assignment = bug in best_ms1_for_each_ms2_xic
577                panic!("Duplicated ms2");
578            }
579        }
580        true
581    });
582
583    // 4) Build pseudo spectra from those winning links; grouping by
584    //    (precursor, window_group) happens in build_pseudo_spectra_from_pairs.
585    let empty_feats: &[SimpleFeature] = &[];
586    let feat_slice = features.unwrap_or(empty_feats);
587
588    let pseudo_spectra = build_pseudo_spectra_from_pairs(
589        ms1,
590        ms2,
591        feat_slice,
592        &winning_pairs,
593        pseudo_opts,
594    );
595
596    PseudoBuildResult {
597        assignment,
598        pseudo_spectra,
599    }
600}
601
602// ==========================================================
603// FragmentIndex sub-structs (Phase 2 refactoring)
604// ==========================================================
605
/// Pre-computed per-MS2 metadata for fast filtering and lookup.
///
/// All vectors are parallel: index `i` in each refers to the same MS2
/// cluster (see `get`, which reads every vector at one index).
#[derive(Clone, Debug, Default)]
pub struct FragmentMetadata {
    /// IM apex (scan space) - from im_fit.mu or midpoint fallback.
    pub im_mu: Vec<f32>,
    /// IM window bounds in scan space.
    pub im_windows: Vec<(usize, usize)>,
    /// Keep mask - indicates valid MS2 clusters.
    pub keep: Vec<bool>,
    /// Stable cluster IDs.
    pub cluster_ids: Vec<u64>,
    /// Representative m/z per cluster.
    pub mz_mu: Vec<f32>,
}
620
621impl FragmentMetadata {
622    /// Get the number of MS2 clusters.
623    #[inline]
624    pub fn len(&self) -> usize {
625        self.cluster_ids.len()
626    }
627
628    /// Check if empty.
629    #[inline]
630    pub fn is_empty(&self) -> bool {
631        self.cluster_ids.is_empty()
632    }
633
634    /// Count valid (kept) MS2 clusters.
635    #[inline]
636    pub fn count_valid(&self) -> usize {
637        self.keep.iter().filter(|&&k| k).count()
638    }
639
640    /// Get metadata for a single MS2 cluster by index.
641    #[inline]
642    pub fn get(&self, idx: usize) -> Option<FragmentMetadataEntry> {
643        if idx >= self.len() {
644            return None;
645        }
646        Some(FragmentMetadataEntry {
647            im_mu: self.im_mu[idx],
648            im_window: self.im_windows[idx],
649            keep: self.keep[idx],
650            cluster_id: self.cluster_ids[idx],
651            mz_mu: self.mz_mu[idx],
652        })
653    }
654}
655
/// A single entry from FragmentMetadata (one MS2 cluster's metadata).
#[derive(Clone, Copy, Debug)]
pub struct FragmentMetadataEntry {
    // IM apex in scan space.
    pub im_mu: f32,
    // Inclusive IM window bounds in scan space.
    pub im_window: (usize, usize),
    // Whether this cluster passed the keep mask.
    pub keep: bool,
    // Stable cluster ID.
    pub cluster_id: u64,
    // Representative m/z.
    pub mz_mu: f32,
}
665
/// Storage for MS2 clusters with ID-based lookup.
#[derive(Clone, Debug)]
pub struct FragmentStorage {
    /// Owned MS2 clusters (Arc-shared, so clones of the storage are cheap).
    pub clusters: Arc<[ClusterResult1D]>,
    /// Fast lookup: cluster_id -> positional index into `clusters`.
    pub id_to_idx: HashMap<u64, usize>,
}
674
675impl FragmentStorage {
676    /// Create from a Vec of clusters.
677    pub fn from_vec(clusters: Vec<ClusterResult1D>) -> Self {
678        let id_to_idx: HashMap<u64, usize> = clusters
679            .iter()
680            .enumerate()
681            .map(|(i, c)| (c.cluster_id, i))
682            .collect();
683        Self {
684            clusters: clusters.into(),
685            id_to_idx,
686        }
687    }
688
689    /// Get cluster by index.
690    #[inline]
691    pub fn get(&self, idx: usize) -> Option<&ClusterResult1D> {
692        self.clusters.get(idx)
693    }
694
695    /// Get cluster by ID.
696    #[inline]
697    pub fn get_by_id(&self, id: u64) -> Option<&ClusterResult1D> {
698        self.id_to_idx.get(&id).and_then(|&i| self.clusters.get(i))
699    }
700
701    /// Get index by ID.
702    #[inline]
703    pub fn index_of(&self, id: u64) -> Option<usize> {
704        self.id_to_idx.get(&id).copied()
705    }
706
707    /// Number of clusters.
708    #[inline]
709    pub fn len(&self) -> usize {
710        self.clusters.len()
711    }
712
713    /// Check if empty.
714    #[inline]
715    pub fn is_empty(&self) -> bool {
716        self.clusters.is_empty()
717    }
718
719    /// Iterate over clusters.
720    #[inline]
721    pub fn iter(&self) -> impl Iterator<Item = &ClusterResult1D> {
722        self.clusters.iter()
723    }
724}
725
/// Per-window-group view over the MS2 clusters: indices sorted by RT apex.
#[derive(Clone, Debug)]
pub struct FragmentGroupIndex {
    /// MS2 indices belonging to this WG, sorted by rt_apex.
    pub ms2_indices: Vec<usize>,
    /// rt_fit.mu (or RT midpoint fallback) for those indices, same order as `ms2_indices`.
    pub rt_apex: Vec<f32>,
}
733
/// Axis-aligned query knobs for fragment candidates.
#[derive(Clone, Debug)]
pub struct FragmentQueryOpts {
    /// Maximum allowed |rt_apex(prec) - rt_apex(ms2)| in seconds (None = no RT slice).
    pub max_rt_apex_delta_sec: Option<f32>,
    /// Maximum allowed |im_apex(prec) - im_apex(ms2)| in global scans.
    pub max_scan_apex_delta: Option<usize>,
    /// Minimum IM overlap in scans between precursor and fragment.
    pub min_im_overlap_scans: usize,
    /// Require at least one shared DIA tile (ProgramSlice) between precursor and fragment.
    pub require_tile_compat: bool,
    /// If true, drop fragments whose own selection (mz, scan) lies inside a
    /// tile that could select the precursor — presumably the same residual
    /// unfragmented-precursor suppression as
    /// `CandidateOpts::reject_frag_inside_precursor_tile`; confirm against
    /// the query implementation.
    pub reject_frag_inside_precursor_tile: bool,
}
748
impl Default for FragmentQueryOpts {
    /// Defaults mirror the tight guards in `CandidateOpts::default()`
    /// (2 s RT, 6-scan IM apex deltas, ≥1 scan IM overlap), plus tile
    /// compatibility enforcement.
    fn default() -> Self {
        Self {
            max_rt_apex_delta_sec: Some(2.0),
            max_scan_apex_delta: Some(6),
            min_im_overlap_scans: 1,
            require_tile_compat: true,
            reject_frag_inside_precursor_tile: false,
        }
    }
}
760
/// Index over MS2 (fragment) clusters: owned storage, per-cluster metadata
/// vectors, and an RT-sorted view per DIA window group.
#[derive(Clone, Debug)]
pub struct FragmentIndex {
    // Shared DIA index (frame times, program layout).
    dia_index: Arc<DiaIndex>,

    /// Owned MS2 clusters (e.g. from Parquet or Python).
    /// Stored as Arc<[T]> so cloning the index is cheap.
    ms2_storage: Arc<[ClusterResult1D]>,

    // per-MS2 metadata (global index over all groups; parallel to ms2_storage)
    ms2_im_mu: Vec<f32>,
    ms2_im_windows: Vec<(usize, usize)>,
    ms2_keep: Vec<bool>,
    /// Stable cluster IDs, same indexing as ms2_* vectors.
    ms2_cluster_ids: Vec<u64>,

    /// Representative m/z per MS2 cluster (μ or window midpoint).
    ms2_mz_mu: Vec<f32>,

    /// group -> RT-sorted MS2
    by_group: HashMap<u32, FragmentGroupIndex>,

    /// Fast lookup: cluster_id -> index into ms2_storage.
    id_to_idx: HashMap<u64, usize>,
}
785
786impl FragmentIndex {
    /// Borrow the full MS2 cluster slice (global indexing).
    #[inline]
    pub fn ms2_slice(&self) -> &[ClusterResult1D] {
        &self.ms2_storage
    }
791
    /// Cluster at global index `idx`, or `None` when out of range.
    #[inline]
    pub fn get_ms2_by_index(&self, idx: usize) -> Option<&ClusterResult1D> {
        self.ms2_storage.get(idx)
    }
796
797    #[inline]
798    pub fn get_ms2_by_cluster_id(&self, cid: u64) -> Option<&ClusterResult1D> {
799        self.id_to_idx.get(&cid).and_then(|&i| self.ms2_storage.get(i))
800    }
801
802    // --- Grouped accessors (Phase 2 refactoring) ---
803
    /// Get pre-computed MS2 metadata.
    ///
    /// Returns an owned snapshot: all five per-cluster vectors are cloned,
    /// so avoid calling this in a hot loop.
    #[inline]
    pub fn metadata(&self) -> FragmentMetadata {
        FragmentMetadata {
            im_mu: self.ms2_im_mu.clone(),
            im_windows: self.ms2_im_windows.clone(),
            keep: self.ms2_keep.clone(),
            cluster_ids: self.ms2_cluster_ids.clone(),
            mz_mu: self.ms2_mz_mu.clone(),
        }
    }
815
    /// Get storage wrapper for cluster access.
    ///
    /// The clusters themselves are shared via `Arc` (cheap refcount bump);
    /// the id→index map is cloned.
    #[inline]
    pub fn storage(&self) -> FragmentStorage {
        FragmentStorage {
            clusters: Arc::clone(&self.ms2_storage),
            id_to_idx: self.id_to_idx.clone(),
        }
    }
824
    /// Get the shared DIA index backing this fragment index.
    #[inline]
    pub fn dia_index(&self) -> &Arc<DiaIndex> {
        &self.dia_index
    }
830
    /// RT-sorted MS2 index for one DIA window group, if present.
    #[inline]
    pub fn group_index(&self, group: u32) -> Option<&FragmentGroupIndex> {
        self.by_group.get(&group)
    }
836
837    /// Get all window groups.
838    #[inline]
839    pub fn groups(&self) -> Vec<u32> {
840        self.by_group.keys().copied().collect()
841    }
842
843    /// Count valid (kept) MS2 clusters.
844    #[inline]
845    pub fn count_valid(&self) -> usize {
846        self.ms2_keep.iter().filter(|&&k| k).count()
847    }
848
    /// Count total MS2 clusters (kept or not).
    #[inline]
    pub fn len(&self) -> usize {
        self.ms2_storage.len()
    }
854
    /// Check if the index holds no MS2 clusters at all.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.ms2_storage.is_empty()
    }
860
    /// Core builder: consumes shared storage (`Arc<[ClusterResult1D]>`) and
    /// derives every per-cluster lookup table plus the per-group, RT-sorted
    /// candidate index used by the query methods.
    ///
    /// Steps:
    ///   1. absolute RT bounds per cluster (seconds),
    ///   2. adaptive `min_raw_sum` threshold from a robust noise estimate,
    ///   3. keep mask (MS level, window group, intensity, RT-span guards),
    ///   4. IM windows and IM apex (window-midpoint fallback),
    ///   5. RT apex in seconds (time-bounds-midpoint fallback),
    ///   6. representative m/z (m/z-window-midpoint fallback),
    ///   7. per-group candidate lists sorted by RT apex,
    ///   8. cluster-id -> storage-index map.
    fn build_with_storage(
        dia_index: Arc<DiaIndex>,
        ms2_storage: Arc<[ClusterResult1D]>,
        opts: &CandidateOpts,
    ) -> Self {
        let ms2: &[ClusterResult1D] = &ms2_storage;
        let frame_time = &dia_index.frame_time;

        // 1) Absolute MS2 time bounds (seconds) with fallback for missing frame_ids_used.
        let ms2_time_bounds: Vec<(f64, f64)> = ms2
            .par_iter()
            .map(|c| {
                let mut t_lo = f64::INFINITY;
                let mut t_hi = f64::NEG_INFINITY;

                // Preferred: use explicit frame_ids_used if present.
                if !c.frame_ids_used.is_empty() {
                    for &fid in &c.frame_ids_used {
                        if let Some(&t) = frame_time.get(&fid) {
                            if t < t_lo { t_lo = t; }
                            if t > t_hi { t_hi = t; }
                        }
                    }
                }

                // Fallback: infer from rt_window if we have no usable times yet.
                // rt_window is treated as an inclusive frame-id range here.
                if !t_lo.is_finite() || !t_hi.is_finite() {
                    let (rt_lo, rt_hi) = c.rt_window;
                    if rt_hi >= rt_lo {
                        for fid in rt_lo as u32..=rt_hi as u32 {
                            if let Some(&t) = frame_time.get(&fid) {
                                if t < t_lo { t_lo = t; }
                                if t > t_hi { t_hi = t; }
                            }
                        }
                    }
                }

                // May still be (+inf, -inf) if no frame time resolved; the keep
                // mask below rejects such clusters.
                (t_lo, t_hi)
            })
            .collect();

        // 2) Adaptive intensity threshold: estimate raw_sum noise over all MS2
        //    clusters (robust MAD), then resolve opts.min_raw_sum against it.
        let raw_sums: Vec<f32> = ms2
            .iter()
            .filter(|c| c.ms_level == 2)
            .map(|c| c.raw_sum)
            .collect();
        let raw_sum_noise = robust_noise_mad(&raw_sums);
        let effective_min_raw_sum = opts.min_raw_sum.effective(raw_sum_noise);

        // 3) Keep mask.
        let ms2_keep: Vec<bool> = ms2
            .par_iter()
            .enumerate()
            .map(|(i, c)| {
                // MS2 only, and it must belong to some DIA window group.
                if c.ms_level != 2 {
                    return false;
                }
                if c.window_group.is_none() {
                    return false;
                }
                // Intensity floor (adaptive, see step 2).
                if c.raw_sum < effective_min_raw_sum {
                    return false;
                }

                // Pad the absolute time bounds by the RT guard, then require a
                // finite, non-degenerate interval.
                let (mut t0, mut t1) = ms2_time_bounds[i];
                if t0.is_finite() {
                    t0 -= opts.ms2_rt_guard_sec;
                }
                if t1.is_finite() {
                    t1 += opts.ms2_rt_guard_sec;
                }

                if !(t0.is_finite() && t1.is_finite() && t1 > t0) {
                    return false;
                }

                // Optional cap on the (guard-padded) RT span.
                if let Some(max_rt) = opts.max_ms2_rt_span_sec {
                    if (t1 - t0) > max_rt {
                        return false;
                    }
                }

                true
            })
            .collect();

        // 4) IM window + IM apex, with fallback to the window midpoint.
        let ms2_im_windows: Vec<(usize, usize)> =
            ms2.iter().map(|c| c.im_window).collect();

        let ms2_im_mu: Vec<f32> = ms2
            .iter()
            .map(|c| {
                let mu = c.im_fit.mu;
                if mu.is_finite() && mu > 0.0 {
                    mu
                } else {
                    let (lo, hi) = c.im_window;
                    if hi > lo {
                        ((lo + hi) as f32) * 0.5
                    } else {
                        // Degenerate window: mark as unusable.
                        f32::NAN
                    }
                }
            })
            .collect();

        let ms2_cluster_ids: Vec<u64> =
            ms2.iter().map(|c| c.cluster_id).collect();

        // 5) RT apex (seconds) with fallback to the midpoint of step-1 bounds.
        let ms2_rt_apex: Vec<f32> = ms2
            .iter()
            .enumerate()
            .map(|(i, c)| {
                let mu = c.rt_fit.mu;
                if mu.is_finite() && mu > 0.0 {
                    mu
                } else {
                    let (t0, t1) = ms2_time_bounds[i];
                    if t0.is_finite() && t1.is_finite() && t1 > t0 {
                        ((t0 + t1) * 0.5) as f32
                    } else {
                        f32::NAN
                    }
                }
            })
            .collect();

        // 6) Representative m/z: fitted value when finite and positive,
        //    otherwise the midpoint of a valid mz_window, otherwise NaN.
        let ms2_mz_mu: Vec<f32> = ms2
            .iter()
            .map(|c| {
                if let Some(m) = cluster_mz_mu(c) {
                    if m.is_finite() && m > 0.0 {
                        m
                    } else {
                        match c.mz_window {
                            Some((lo, hi)) if lo.is_finite() && hi.is_finite() && hi > lo => {
                                0.5 * (lo + hi) as f32
                            }
                            _ => f32::NAN,
                        }
                    }
                } else {
                    match c.mz_window {
                        Some((lo, hi)) if lo.is_finite() && hi.is_finite() && hi > lo => {
                            0.5 * (lo + hi) as f32
                        }
                        _ => f32::NAN,
                    }
                }
            })
            .collect();

        // 7) Bucket kept clusters by window group, then sort each bucket by RT
        //    apex so queries can binary-search an RT slice (lower/upper_bound).
        let mut by_group_raw: HashMap<u32, Vec<(f32, usize)>> = HashMap::new();

        for (j, c2) in ms2.iter().enumerate() {
            if !ms2_keep[j] {
                continue;
            }
            let g = match c2.window_group {
                Some(g) => g,
                None => continue,
            };

            // Clusters without a finite RT apex cannot be placed on the RT axis.
            let rt = ms2_rt_apex[j];
            if !rt.is_finite() {
                continue;
            }

            by_group_raw.entry(g).or_default().push((rt, j));
        }

        let mut by_group: HashMap<u32, FragmentGroupIndex> = HashMap::new();
        for (g, mut v) in by_group_raw {
            // NaNs were filtered above, so partial_cmp only hits the Equal
            // fallback on exact ties.
            v.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal));
            // Split into parallel (rt_apex, index) arrays; the identity map
            // keeps the tuple shape explicit for unzip.
            let (rt_apex, ms2_indices): (Vec<f32>, Vec<usize>) =
                v.into_iter().map(|(rt, j)| (rt, j)).unzip();
            by_group.insert(g, FragmentGroupIndex { ms2_indices, rt_apex });
        }

        // 8) cluster_id -> storage index. Assumes IDs are unique; on a
        //    duplicate the later index silently wins.
        let id_to_idx: HashMap<u64, usize> = ms2_cluster_ids
            .iter()
            .enumerate()
            .map(|(i, &cid)| (cid, i))
            .collect();

        FragmentIndex {
            dia_index,
            ms2_storage,
            ms2_im_mu,
            ms2_im_windows,
            ms2_keep,
            ms2_cluster_ids,
            ms2_mz_mu,
            by_group,
            id_to_idx,
        }
    }
1075
1076    pub fn from_parquet_file(
1077        dia_index: Arc<DiaIndex>,
1078        parquet_path: impl AsRef<Path>,
1079        opts: &CandidateOpts,
1080    ) -> io::Result<Self> {
1081        let path_str = parquet_path
1082            .as_ref()
1083            .to_str()
1084            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "non-UTF8 path"))?;
1085
1086        let clusters = load_parquet(path_str)?;
1087        Ok(Self::from_owned(dia_index, clusters, opts))
1088    }
1089
1090    /// Your requested builder: read all `.parquet` files in a directory and merge.
1091    pub fn from_parquet_dir(
1092        dia_index: Arc<DiaIndex>,
1093        dir: impl AsRef<Path>,
1094        opts: &CandidateOpts,
1095    ) -> io::Result<Self> {
1096        let dir = dir.as_ref();
1097        let mut all: Vec<ClusterResult1D> = Vec::new();
1098
1099        for entry in std::fs::read_dir(dir)? {
1100            let entry = entry?;
1101            if !entry.file_type()?.is_file() {
1102                continue;
1103            }
1104            let path = entry.path();
1105            if path.extension().and_then(|s| s.to_str()) != Some("parquet") {
1106                continue;
1107            }
1108
1109            let path_str = path
1110                .to_str()
1111                .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "non-UTF8 path"))?;
1112
1113            let mut part = load_parquet(path_str)?;
1114            all.append(&mut part);
1115        }
1116
1117        Ok(Self::from_owned(dia_index, all, opts))
1118    }
1119
1120    /// Backwards-compatible: build from a borrowed slice (one copy into Arc).
1121    pub fn from_slice(
1122        dia_index: Arc<DiaIndex>,
1123        ms2: &[ClusterResult1D],
1124        opts: &CandidateOpts,
1125    ) -> Self {
1126        let storage: Arc<[ClusterResult1D]> = ms2.to_vec().into();
1127        Self::build_with_storage(dia_index, storage, opts)
1128    }
1129
1130    /// Preferred: build from an owned Vec.
1131    pub fn from_owned(
1132        dia_index: Arc<DiaIndex>,
1133        ms2: Vec<ClusterResult1D>,
1134        opts: &CandidateOpts,
1135    ) -> Self {
1136        let storage: Arc<[ClusterResult1D]> = ms2.into();
1137        Self::build_with_storage(dia_index, storage, opts)
1138    }
1139
1140    fn precursor_rt_apex_sec(&self, prec: &ClusterResult1D) -> Option<f32> {
1141        // 1) use fit if it looks sane
1142        let mu = prec.rt_fit.mu;
1143        if mu.is_finite() && mu > 0.0 {
1144            return Some(mu);
1145        }
1146
1147        let ft = &self.dia_index.frame_time;
1148
1149        // 2) fallback: derive from frame_ids_used
1150        let mut sum = 0.0f64;
1151        let mut n = 0usize;
1152        for fid in &prec.frame_ids_used {
1153            if let Some(&t) = ft.get(fid) {
1154                if t.is_finite() {
1155                    sum += t;
1156                    n += 1;
1157                }
1158            }
1159        }
1160        if n > 0 {
1161            return Some((sum / n as f64) as f32);
1162        }
1163
1164        // 3) final fallback: infer from rt_window range using frame IDs
1165        let (rt_lo, rt_hi) = prec.rt_window;
1166        if rt_hi >= rt_lo {
1167            let mut sum = 0.0f64;
1168            let mut n = 0usize;
1169            for fid in rt_lo as u32..=rt_hi as u32 {
1170                if let Some(&t) = ft.get(&fid) {
1171                    if t.is_finite() {
1172                        sum += t;
1173                        n += 1;
1174                    }
1175                }
1176            }
1177            if n > 0 {
1178                return Some((sum / n as f64) as f32);
1179            }
1180        }
1181
1182        None
1183    }
1184
1185    pub fn query_precursor(
1186        &self,
1187        prec: &ClusterResult1D,
1188        groups: Option<&[u32]>,
1189        opts: &FragmentQueryOpts,
1190    ) -> Vec<u64> {
1191        let mut out = Vec::new();
1192
1193        // --- precursor m/z (for group selection + "inside selection" test) ---
1194        let prec_mz = if let Some(m) = cluster_mz_mu(prec) {
1195            if m.is_finite() && m > 0.0 {
1196                m
1197            } else {
1198                match prec.mz_window {
1199                    Some((lo, hi)) if lo.is_finite() && hi.is_finite() && hi > lo => {
1200                        0.5 * (lo + hi)
1201                    }
1202                    _ => return out, // no usable m/z
1203                }
1204            }
1205        } else {
1206            match prec.mz_window {
1207                Some((lo, hi)) if lo.is_finite() && hi.is_finite() && hi > lo => {
1208                    0.5 * (lo + hi)
1209                }
1210                _ => return out,
1211            }
1212        };
1213
1214        // --- precursor RT apex (SECONDS!) ---
1215        let prec_rt = match self.precursor_rt_apex_sec(prec) {
1216            Some(t) => t,
1217            None => return out,
1218        };
1219
1220        // --- precursor IM apex (scan space) ---
1221        let prec_im = if prec.im_fit.mu.is_finite() {
1222            prec.im_fit.mu
1223        } else {
1224            let (lo, hi) = prec.im_window;
1225            if hi > lo {
1226                ((lo + hi) as f32) * 0.5
1227            } else {
1228                return out;
1229            }
1230        };
1231
1232        let prec_im_win = prec.im_window;
1233        let prec_scan_win = (prec_im_win.0 as u32, prec_im_win.1 as u32);
1234
1235        // Decide which groups to query
1236        let groups: Vec<u32> = match groups {
1237            Some(gs) if !gs.is_empty() => gs.to_vec(),
1238            _ => self.dia_index.groups_for_precursor(prec_mz, prec_im),
1239        };
1240        if groups.is_empty() {
1241            return Vec::new();
1242        }
1243
1244        let require_tile  = opts.require_tile_compat;
1245        let reject_inside = opts.reject_frag_inside_precursor_tile;
1246
1247        for g in groups {
1248            let fg = match self.by_group.get(&g) {
1249                Some(fg) => fg,
1250                None => continue,
1251            };
1252
1253            // RT slice via binary search (fg.rt_apex is already in seconds)
1254            let (lo_idx, hi_idx) = if let Some(dt) = opts.max_rt_apex_delta_sec {
1255                if dt > 0.0 {
1256                    let lo_t = prec_rt - dt;
1257                    let hi_t = prec_rt + dt;
1258                    (
1259                        lower_bound(&fg.rt_apex, lo_t),
1260                        upper_bound(&fg.rt_apex, hi_t),
1261                    )
1262                } else {
1263                    (0, fg.ms2_indices.len())
1264                }
1265            } else {
1266                (0, fg.ms2_indices.len())
1267            };
1268
1269            // Program slices (tiles) for this group; we'll inspect them directly.
1270            let slices = self.dia_index.program_slices_for_group(g);
1271
1272            'ms2_loop: for k in lo_idx..hi_idx {
1273                let j = fg.ms2_indices[k];
1274                if !self.ms2_keep[j] {
1275                    continue;
1276                }
1277
1278                // IM window overlap
1279                let im2 = self.ms2_im_windows[j];
1280                let frag_scan_win = (im2.0 as u32, im2.1 as u32);
1281
1282                let im_overlap = {
1283                    let lo = prec_im_win.0.max(im2.0);
1284                    let hi = prec_im_win.1.min(im2.1);
1285                    hi.saturating_sub(lo).saturating_add(1)
1286                };
1287                if im_overlap < opts.min_im_overlap_scans {
1288                    continue;
1289                }
1290
1291                // IM apex delta
1292                if let Some(max_d) = opts.max_scan_apex_delta {
1293                    let s2 = self.ms2_im_mu[j];
1294                    if s2.is_finite() {
1295                        let d = (prec_im - s2).abs() as f32;
1296                        if d > max_d as f32 {
1297                            continue 'ms2_loop;
1298                        }
1299                    } else {
1300                        continue 'ms2_loop;
1301                    }
1302                }
1303
1304                // --- Tile compatibility + "reject inside precursor-selection" guard ---
1305                if require_tile || reject_inside {
1306                    let mut tile_compatible   = false;
1307                    let mut inside_prec_tile  = false;
1308                    let frag_mz = self.ms2_mz_mu[j];
1309
1310                    for s in &slices {
1311                        let tile_scans = (s.scan_lo, s.scan_hi);
1312
1313                        // Do both precursor and fragment overlap this tile in scan space?
1314                        let prec_hits_tile = ranges_overlap_u32(prec_scan_win, tile_scans);
1315                        let frag_hits_tile = ranges_overlap_u32(frag_scan_win, tile_scans);
1316
1317                        if !(prec_hits_tile && frag_hits_tile) {
1318                            continue;
1319                        }
1320
1321                        tile_compatible = true;
1322
1323                        if reject_inside && frag_mz.is_finite() {
1324                            // Precursor m/z inside this tile's isolation window?
1325                            let prec_in_tile =
1326                                prec_mz >= s.mz_lo as f32 && prec_mz <= s.mz_hi as f32;
1327                            // Fragment m/z inside the same isolation window?
1328                            let frag_in_tile =
1329                                frag_mz >= s.mz_lo as f32 && frag_mz <= s.mz_hi as f32;
1330
1331                            if prec_in_tile && frag_in_tile {
1332                                inside_prec_tile = true;
1333                                // No need to inspect more tiles for this fragment;
1334                                // we already know it's "inside precursor selection".
1335                                break;
1336                            }
1337                        }
1338                    }
1339
1340                    // 1) If we require tile compatibility: must have *some* shared scan tile
1341                    if require_tile && !tile_compatible {
1342                        continue 'ms2_loop;
1343                    }
1344
1345                    // 2) If we reject fragments inside precursor tile: drop them.
1346                    if reject_inside && inside_prec_tile {
1347                        continue 'ms2_loop;
1348                    }
1349                }
1350
1351                // Survived all guards: keep this fragment for the precursor.
1352                out.push(self.ms2_cluster_ids[j]);
1353            }
1354        }
1355
1356        out.sort_unstable();
1357        out.dedup();
1358        out
1359    }
1360
1361    /// Parallel batch query: for each precursor, return a Vec of MS2 cluster_ids.
1362    pub fn query_precursors_par(
1363        &self,
1364        precursors: &[ClusterResult1D],
1365        opts: &FragmentQueryOpts,
1366        num_threads: usize,
1367    ) -> Vec<Vec<u64>> {
1368        let pool = rayon::ThreadPoolBuilder::new()
1369            .num_threads(num_threads)
1370            .build()
1371            .unwrap();
1372        pool.install(|| {
1373            precursors
1374                .par_iter()
1375                .map(|prec| self.query_precursor(prec, None, opts))
1376                .collect()
1377        })
1378    }
1379
    /// Enumerate candidate MS2 indices for a thin precursor.
    ///
    /// Walks the relevant window groups, narrows each group's candidate list
    /// to an RT-apex slice via binary search, then applies the per-fragment
    /// guards: keep mask, IM-window overlap, IM-apex distance, tile
    /// (program-slice) compatibility, and the optional rejection of fragments
    /// whose m/z lies inside the precursor's isolation window.
    ///
    /// Returns sorted, deduplicated indices into `self.ms2`.
    pub fn enumerate_candidates_for_precursor_thin(
        &self,
        prec: &ThinPrecursor,
        window_groups: Option<&[u32]>,
        opts: &FragmentQueryOpts,
    ) -> Vec<usize> {
        let mut out = Vec::<usize>::new();

        let prec_mz      = prec.mz_mu;
        let prec_rt      = prec.rt_mu;
        let prec_im      = prec.im_mu;
        let prec_im_win  = prec.im_window;
        let prec_scan_win = (prec_im_win.0 as u32, prec_im_win.1 as u32);

        // Decide which groups to query: caller-provided list wins; otherwise
        // derive them from the precursor's (m/z, IM) position.
        let groups: Vec<u32> = match window_groups {
            Some(gs) if !gs.is_empty() => gs.to_vec(),
            _ => self.dia_index.groups_for_precursor(prec_mz, prec_im),
        };
        if groups.is_empty() {
            return out;
        }

        let require_tile  = opts.require_tile_compat;
        let reject_inside = opts.reject_frag_inside_precursor_tile;

        for g in groups {
            let fg = match self.by_group.get(&g) {
                Some(fg) => fg,
                None => continue,
            };

            // RT slice via binary search (fg.rt_apex is already in seconds).
            // A missing or non-positive delta disables the RT constraint.
            let (lo_idx, hi_idx) = if let Some(dt) = opts.max_rt_apex_delta_sec {
                if dt > 0.0 {
                    let lo_t = prec_rt - dt;
                    let hi_t = prec_rt + dt;
                    (
                        lower_bound(&fg.rt_apex, lo_t),
                        upper_bound(&fg.rt_apex, hi_t),
                    )
                } else {
                    (0, fg.ms2_indices.len())
                }
            } else {
                (0, fg.ms2_indices.len())
            };

            // Program slices (tiles) for this group; we'll inspect them directly.
            let slices = self.dia_index.program_slices_for_group(g);

            'ms2_loop: for k in lo_idx..hi_idx {
                let j = fg.ms2_indices[k];
                if !self.ms2_keep[j] {
                    continue;
                }

                // IM window overlap, counted as inclusive scans.
                // NOTE(review): for disjoint windows (hi < lo) the saturating
                // subtraction yields 0 and the +1 makes the overlap 1, not 0 —
                // so min_im_overlap_scans <= 1 admits non-overlapping windows.
                // Confirm whether that is intended.
                let im2 = self.ms2_im_windows[j];
                let frag_scan_win = (im2.0 as u32, im2.1 as u32);

                let im_overlap = {
                    let lo = prec_im_win.0.max(im2.0);
                    let hi = prec_im_win.1.min(im2.1);
                    hi.saturating_sub(lo).saturating_add(1)
                };
                if im_overlap < opts.min_im_overlap_scans {
                    continue;
                }

                // IM apex delta: when the constraint is enabled, fragments
                // without a finite IM apex are dropped outright.
                if let Some(max_d) = opts.max_scan_apex_delta {
                    let s2 = self.ms2_im_mu[j];
                    if s2.is_finite() {
                        let d = (prec_im - s2).abs() as f32;
                        if d > max_d as f32 {
                            continue 'ms2_loop;
                        }
                    } else {
                        continue 'ms2_loop;
                    }
                }

                // --- Tile compatibility + "reject inside precursor-selection" guard ---
                if require_tile || reject_inside {
                    let mut tile_compatible   = false;
                    let mut inside_prec_tile  = false;
                    let frag_mz = self.ms2_mz_mu[j];

                    for s in &slices {
                        let tile_scans = (s.scan_lo, s.scan_hi);

                        // Do both precursor and fragment overlap this tile in scan space?
                        let prec_hits_tile = ranges_overlap_u32(prec_scan_win, tile_scans);
                        let frag_hits_tile = ranges_overlap_u32(frag_scan_win, tile_scans);

                        if !(prec_hits_tile && frag_hits_tile) {
                            continue;
                        }

                        tile_compatible = true;

                        if reject_inside && frag_mz.is_finite() {
                            // Precursor m/z inside this tile's isolation window?
                            let prec_in_tile =
                                prec_mz >= s.mz_lo as f32 && prec_mz <= s.mz_hi as f32;
                            // Fragment m/z inside the same isolation window?
                            let frag_in_tile =
                                frag_mz >= s.mz_lo as f32 && frag_mz <= s.mz_hi as f32;

                            if prec_in_tile && frag_in_tile {
                                inside_prec_tile = true;
                                // No need to inspect more tiles for this fragment.
                                break;
                            }
                        }
                    }

                    // 1) Tile compatibility required: must share at least one scan tile.
                    if require_tile && !tile_compatible {
                        continue 'ms2_loop;
                    }
                    // 2) Drop fragments sitting inside the precursor's isolation window.
                    if reject_inside && inside_prec_tile {
                        continue 'ms2_loop;
                    }
                }

                // Survived all guards: keep this fragment's storage index.
                out.push(j);
            }
        }

        // A fragment can match via several groups; return a sorted unique set.
        out.sort_unstable();
        out.dedup();
        out
    }
1516
1517    /// Enumerate candidates for many thin precursors.
1518    ///
1519    /// Keeps the same length as `precs`: `None` ⇒ empty candidate list.
1520    fn enumerate_candidates_for_precursors_thin(
1521        &self,
1522        precs: &[Option<ThinPrecursor>],
1523        opts: &FragmentQueryOpts,
1524    ) -> Vec<Vec<usize>> {
1525        precs
1526            .iter()
1527            .map(|maybe_prec| {
1528                if let Some(ref p) = maybe_prec {
1529                    self.enumerate_candidates_for_precursor_thin(p, None, opts)
1530                } else {
1531                    Vec::new()
1532                }
1533            })
1534            .collect()
1535    }
1536
1537    /// Enumerate candidate MS2 indices for a given SimpleFeature.
1538    ///
1539    /// Uses the representative member cluster (highest raw_sum) of the feature
1540    /// and reuses the standard precursor enumeration logic.
1541    ///
1542    /// Returns indices into `self.ms2`.
1543    pub fn enumerate_candidates_for_feature(
1544        &self,
1545        feat: &SimpleFeature,
1546        opts: &FragmentQueryOpts,
1547    ) -> Vec<usize> {
1548        let prec_cluster = match feature_representative_cluster(feat) {
1549            Some(c) => c,
1550            None => return Vec::new(),
1551        };
1552        let thin = match self.make_thin_precursor(prec_cluster) {
1553            Some(t) => t,
1554            None => return Vec::new(),
1555        };
1556        // NOTE: groups always decided by the index.
1557        self.enumerate_candidates_for_precursor_thin(&thin, None, opts)
1558    }
1559
1560    pub fn query_precursor_scored(
1561        &self,
1562        prec: &ClusterResult1D,
1563        window_groups: Option<&[u32]>,
1564        opts: &FragmentQueryOpts,
1565        mode: MatchScoreMode,
1566        geom_opts: &ScoreOpts,
1567        xic_opts: &XicScoreOpts,
1568        min_score: f32,
1569    ) -> Vec<ScoredHit> {
1570        // Build thin precursor for enumeration only.
1571        let thin = match self.make_thin_precursor(prec) {
1572            Some(t) => t,
1573            None => return Vec::new(),
1574        };
1575
1576        let candidate_ids =
1577            self.enumerate_candidates_for_precursor_thin(&thin, window_groups, opts);
1578
1579        // Scoring still uses the full cluster as PrecursorLike::Cluster.
1580        query_precursor_scored(
1581            PrecursorLike::Cluster(prec),
1582            self.ms2_slice(),
1583            &candidate_ids,
1584            mode,
1585            geom_opts,
1586            xic_opts,
1587            min_score,
1588        )
1589    }
1590
1591    pub fn query_precursors_scored_par(
1592        &self,
1593        precs: &[ClusterResult1D],
1594        opts: &FragmentQueryOpts,
1595        mode: MatchScoreMode,
1596        geom_opts: &ScoreOpts,
1597        xic_opts: &XicScoreOpts,
1598        min_score: f32,
1599    ) -> Vec<Vec<ScoredHit>> {
1600        // 1) Build thin precursors (Option → keep alignment with `precs`).
1601        let thin_precs: Vec<Option<ThinPrecursor>> = precs
1602            .iter()
1603            .map(|c| self.make_thin_precursor(c))
1604            .collect();
1605
1606        // 2) Enumerate candidates using only the thin data.
1607        let all_candidates = self.enumerate_candidates_for_precursors_thin(&thin_precs, opts);
1608
1609        // 3) Build PrecursorLike slice referencing the full clusters for scoring.
1610        let prec_like: Vec<PrecursorLike<'_>> =
1611            precs.iter().map(|c| PrecursorLike::Cluster(c)).collect();
1612
1613        // 4) Score in parallel (unchanged).
1614        query_precursors_scored_par(
1615            &prec_like,
1616            self.ms2_slice(),
1617            &all_candidates,
1618            mode,
1619            geom_opts,
1620            xic_opts,
1621            min_score,
1622        )
1623    }
1624
1625    fn enumerate_candidates_for_features(
1626        &self,
1627        feats: &[SimpleFeature],
1628        opts: &FragmentQueryOpts,
1629    ) -> Vec<Vec<usize>> {
1630        feats
1631            .iter()
1632            .map(|feat| {
1633                let prec_cluster = match feature_representative_cluster(feat) {
1634                    Some(c) => c,
1635                    None => return Vec::new(),
1636                };
1637                let thin = match self.make_thin_precursor(prec_cluster) {
1638                    Some(t) => t,
1639                    None => return Vec::new(),
1640                };
1641                // NOTE: groups always decided by the index.
1642                self.enumerate_candidates_for_precursor_thin(&thin, None, opts)
1643            })
1644            .collect()
1645    }
1646
1647    /// Score a single SimpleFeature precursor against all physically plausible
1648    /// fragment candidates, as determined internally by the FragmentIndex.
1649    ///
1650    /// This is the feature twin of `query_precursor_scored`, but it does *not*
1651    /// require the caller to pass window groups or candidate indices.
1652    pub fn query_feature_scored(
1653        &self,
1654        feat: &SimpleFeature,
1655        opts: &FragmentQueryOpts,
1656        mode: MatchScoreMode,
1657        geom_opts: &ScoreOpts,
1658        xic_opts: &XicScoreOpts,
1659        min_score: f32,
1660    ) -> Vec<ScoredHit> {
1661        let candidate_ids = self.enumerate_candidates_for_feature(feat, opts);
1662
1663        query_precursor_scored(
1664            PrecursorLike::Feature(feat),
1665            self.ms2_slice(),
1666            &candidate_ids,
1667            mode,
1668            geom_opts,
1669            xic_opts,
1670            min_score,
1671        )
1672    }
1673
1674    /// Parallel scoring for many SimpleFeatures.
1675    ///
1676    /// - Candidate enumeration is done *inside* the index.
1677    /// - `feats[i]` is scored against the MS2 candidates inferred for feat i.
1678    pub fn query_features_scored_par(
1679        &self,
1680        feats: &[SimpleFeature],
1681        opts: &FragmentQueryOpts,
1682        mode: MatchScoreMode,
1683        geom_opts: &ScoreOpts,
1684        xic_opts: &XicScoreOpts,
1685        min_score: f32,
1686    ) -> Vec<Vec<ScoredHit>> {
1687        let all_candidates = self.enumerate_candidates_for_features(feats, opts);
1688
1689        let prec_like: Vec<PrecursorLike<'_>> =
1690            feats.iter().map(|f| PrecursorLike::Feature(f)).collect();
1691
1692        query_precursors_scored_par(
1693            &prec_like,
1694            self.ms2_slice(),
1695            &all_candidates,
1696            mode,
1697            geom_opts,
1698            xic_opts,
1699            min_score,
1700        )
1701    }
1702
1703    /// Build a thin precursor representation used for candidate enumeration.
1704    ///
1705    /// This folds all the "how do we get RT apex / m/z apex / IM apex?" logic
1706    /// into one place and drops everything else (raw points, axes, …).
1707    pub fn make_thin_precursor(&self, prec: &ClusterResult1D) -> Option<ThinPrecursor> {
1708        // --- precursor m/z (same rules as before) ---
1709        let prec_mz = if let Some(m) = cluster_mz_mu(prec) {
1710            if m.is_finite() && m > 0.0 {
1711                m
1712            } else {
1713                match prec.mz_window {
1714                    Some((lo, hi)) if lo.is_finite() && hi.is_finite() && hi > lo => {
1715                        0.5 * (lo + hi)
1716                    }
1717                    _ => return None, // no usable m/z
1718                }
1719            }
1720        } else {
1721            match prec.mz_window {
1722                Some((lo, hi)) if lo.is_finite() && hi.is_finite() && hi > lo => {
1723                    0.5 * (lo + hi)
1724                }
1725                _ => return None,
1726            }
1727        };
1728
1729        // --- RT apex in seconds (uses existing helper for fallback) ---
1730        let rt_mu = match self.precursor_rt_apex_sec(prec) {
1731            Some(t) => t,
1732            None => return None,
1733        };
1734
1735        // --- IM apex + window (scan space) ---
1736        let im_window = prec.im_window;
1737        let im_mu = if prec.im_fit.mu.is_finite() {
1738            prec.im_fit.mu
1739        } else {
1740            let (lo, hi) = im_window;
1741            if hi > lo {
1742                ((lo + hi) as f32) * 0.5
1743            } else {
1744                return None;
1745            }
1746        };
1747
1748        Some(ThinPrecursor {
1749            mz_mu: prec_mz,
1750            rt_mu,
1751            im_mu,
1752            im_window,
1753        })
1754    }
1755}
1756
/// Thin, plain-data precursor summary used for candidate enumeration.
///
/// All fields are `Copy`, so the struct itself derives `Copy` (and
/// `PartialEq` for tests/dedup); both additions are backward-compatible.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct ThinPrecursor {
    /// Precursor m/z apex (Da), already cleaned / representative.
    pub mz_mu: f32,

    /// RT apex in *seconds*.
    pub rt_mu: f32,

    /// IM apex in scan space.
    pub im_mu: f32,

    /// IM window (scan indices) used for overlap / tile tests.
    pub im_window: (usize, usize),
}
1771
1772
1773
1774// Simple helpers (local)
1775
/// First index `i` in ascending `xs` with `xs[i] >= x` (i.e. the insertion
/// point that keeps `xs` sorted, before any equal elements).
fn lower_bound(xs: &[f32], x: f32) -> usize {
    // `partition_point` performs the same binary search: everything strictly
    // below `x` is in the left partition.
    xs.partition_point(|&v| v < x)
}
1789
/// True when the closed ranges `a` and `b` share at least one value.
#[inline]
fn ranges_overlap_u32(a: (u32, u32), b: (u32, u32)) -> bool {
    // The intersection [max(lo), min(hi)] is non-empty iff its ends don't cross.
    a.0.max(b.0) <= a.1.min(b.1)
}
1796
/// First index `i` in ascending `xs` with `xs[i] > x` (i.e. the insertion
/// point that keeps `xs` sorted, after any equal elements).
fn upper_bound(xs: &[f32], x: f32) -> usize {
    // `partition_point` performs the same binary search: everything less than
    // or equal to `x` is in the left partition.
    xs.partition_point(|&v| v <= x)
}
1810
1811/// Convert an MS2 ClusterResult1D into a PseudoFragment.
1812/// Return None if the cluster is unusable (no m/z).
1813pub fn fragment_from_cluster(c: &ClusterResult1D) -> Option<PseudoFragment> {
1814    // mz: prefer fitted μ, fallback to window mid
1815    let mz = if let Some(fit) = &c.mz_fit {
1816        if fit.mu.is_finite() && fit.mu > 0.0 {
1817            fit.mu
1818        } else if let Some((lo, hi)) = c.mz_window {
1819            0.5 * (lo + hi)
1820        } else {
1821            return None;
1822        }
1823    } else if let Some((lo, hi)) = c.mz_window {
1824        0.5 * (lo + hi)
1825    } else {
1826        return None;
1827    };
1828
1829    if !mz.is_finite() {
1830        return None;
1831    }
1832
1833    // intensity: choose raw_sum as best available proxy
1834    let intensity = c.raw_sum;
1835
1836    Some(PseudoFragment {
1837        mz,
1838        intensity,
1839        ms2_cluster_index: 0,
1840        ms2_cluster_id: c.cluster_id,
1841        window_group: c.window_group.unwrap_or(0),
1842    })
1843}
1844
1845fn feature_representative_cluster<'a>(feat: &'a SimpleFeature) -> Option<&'a ClusterResult1D> {
1846    if feat.member_clusters.is_empty() {
1847        return None;
1848    }
1849
1850    feat.member_clusters
1851        .iter()
1852        .max_by(|a, b| a.raw_sum.partial_cmp(&b.raw_sum).unwrap_or(std::cmp::Ordering::Equal))
1853}