rustdf/cluster/
pseudo.rs

1//! Build pseudo-DDA spectra from MS1/MS2 clusters and simple isotopic features.
2
3use std::cmp::Ordering;
4use std::collections::HashMap;
5use std::hash::Hash;
6
7use rayon::prelude::*;
8use serde::{Deserialize, Serialize};
9use crate::cluster::candidates::CandidateOpts;
10use crate::cluster::cluster::ClusterResult1D;
11use crate::cluster::feature::SimpleFeature;
12use crate::cluster::scoring::{assign_ms2_to_best_ms1_by_xic, enumerate_ms2_ms1_pairs_simple, XicScoreOpts};
13use crate::data::dia::TimsDatasetDIA;
14// ---------------------------------------------------------------------------
15// Basic helpers
16// ---------------------------------------------------------------------------
17
18/// Best-effort m/z "center" for a cluster.
19///
20/// Prefer `mz_fit.mu` when present and sane; otherwise fall back to window mid.
21pub fn cluster_mz_mu(c: &ClusterResult1D) -> Option<f32> {
22    if let Some(ref f) = c.mz_fit {
23        if f.mu.is_finite() && f.mu > 0.0 {
24            return Some(f.mu);
25        }
26    }
27    if let Some((lo, hi)) = c.mz_window {
28        let mu = 0.5 * (lo + hi);
29        if mu.is_finite() && mu > 0.0 {
30            return Some(mu);
31        }
32    }
33    None
34}
35
36// ---------------------------------------------------------------------------
37// Core types
38// ---------------------------------------------------------------------------
39
40/// One fragment peak in a pseudo-MS/MS spectrum.
41#[derive(Clone, Debug, Serialize, Deserialize)]
42pub struct PseudoFragment {
43    /// Fragment m/z (cluster center).
44    pub mz: f32,
45    /// Fragment intensity proxy (currently raw_sum, can be refined later).
46    pub intensity: f32,
47    /// Index into the `ms2` slice that produced this fragment.
48    pub ms2_cluster_index: usize,
49    /// Stable cluster ID copied from `ClusterResult1D::cluster_id`.
50    pub ms2_cluster_id: u64,
51    pub window_group: u32,
52}
53
54/// One pseudo-DDA spectrum: precursor + set of fragment peaks.
55#[derive(Clone, Debug, Serialize, Deserialize)]
56pub struct PseudoSpectrum {
57    pub precursor_mz: f32,
58    pub precursor_charge: u8,
59    pub rt_apex: f32,
60    pub im_apex: f32,
61
62    pub window_groups: Vec<u32>,
63
64    pub feature_id: Option<usize>,
65    pub precursor_cluster_indices: Vec<usize>,
66    pub precursor_cluster_ids: Vec<u64>,
67    pub fragments: Vec<PseudoFragment>,
68}
69
70/// Options for building pseudo spectra, diaTracer-style.
71#[derive(Clone, Debug)]
72pub struct PseudoSpecOpts {
73    /// Keep at most this many fragments per spectrum (0 = no limit).
74    pub top_n_fragments: usize,
75}
76
77impl Default for PseudoSpecOpts {
78    fn default() -> Self {
79        Self {
80            top_n_fragments: 500, // diaTracer-like RF max default
81        }
82    }
83}
84
85/// Precursor identity: either an isotopic SimpleFeature or a single orphan MS1 cluster.
86#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
87pub enum PrecursorKey {
88    Feature {
89        feature_id: usize,
90        window_group: u32,
91    },
92    OrphanCluster {
93        cluster_idx: usize,
94        window_group: u32,
95    },
96}
97
98// ---------------------------------------------------------------------------
99// Internal helpers
100// ---------------------------------------------------------------------------
101
102/// Map MS1 cluster index -> Option<feature_id>.
103///
104/// Assumes a cluster belongs to at most one SimpleFeature (true for the greedy builder).
105fn build_cluster_to_feature_map(n_ms1: usize, features: &[SimpleFeature]) -> Vec<Option<usize>> {
106    let mut map = vec![None; n_ms1];
107    for (f_idx, feat) in features.iter().enumerate() {
108        for &ci in &feat.member_cluster_indices {
109            if ci < n_ms1 {
110                map[ci] = Some(f_idx);
111            }
112        }
113    }
114    map
115}
116
117/// Derive a precursor apex (RT, IM) from either a feature or an orphan MS1 cluster.
118///
119/// Feature: weighted average of member cluster apex positions (weights = raw_sum).
120/// Orphan: use the single cluster’s rt_fit.mu / im_fit.mu.
121/// Derive a precursor apex (RT, IM) from either a feature or an orphan MS1 cluster.
122///
123/// Feature: weighted average of member cluster apex positions (weights = raw_sum).
124/// Orphan: use the single cluster’s rt_fit.mu / im_fit.mu.
125fn precursor_apex_from_feature_or_cluster(
126    key: PrecursorKey,
127    ms1: &[ClusterResult1D],
128    features: &[SimpleFeature],
129) -> (f32, f32) {
130    match key {
131        PrecursorKey::OrphanCluster { cluster_idx, .. } => {
132            let c = &ms1[cluster_idx];
133            (c.rt_fit.mu, c.im_fit.mu)
134        }
135        PrecursorKey::Feature { feature_id, .. } => {
136            let feat = &features[feature_id];
137            let mut rt_w = 0.0_f64;
138            let mut im_w = 0.0_f64;
139            let mut wsum = 0.0_f64;
140
141            for &ci in &feat.member_cluster_indices {
142                if ci >= ms1.len() {
143                    continue;
144                }
145                let c = &ms1[ci];
146                let w = c.raw_sum.max(1.0) as f64;
147
148                if c.rt_fit.mu.is_finite() {
149                    rt_w += (c.rt_fit.mu as f64) * w;
150                }
151                if c.im_fit.mu.is_finite() {
152                    im_w += (c.im_fit.mu as f64) * w;
153                }
154                wsum += w;
155            }
156
157            if wsum > 0.0 {
158                ((rt_w / wsum) as f32, (im_w / wsum) as f32)
159            } else {
160                // Fallback: just midpoints of feature bounds if everything is degenerate.
161                let (rt_lo, rt_hi) = feat.rt_bounds;
162                let (im_lo, im_hi) = feat.im_bounds;
163                (
164                    0.5 * ((rt_lo + rt_hi) as f32),
165                    0.5 * ((im_lo + im_hi) as f32),
166                )
167            }
168        }
169    }
170}
171
172// ---------------------------------------------------------------------------
173// Public: build pseudo spectra from MS1/MS2 clusters + features + pairs
174// ---------------------------------------------------------------------------
175
176/// Build pseudo-DDA spectra from:
177/// - `ms1`: MS1 clusters
178/// - `ms2`: MS2 clusters
179/// - `features`: isotopic SimpleFeatures built on MS1
180/// - `pairs`: candidate links (ms2_idx, ms1_idx) from your candidate search
181///
182/// Behaviour:
183/// - If `ms1_idx` belongs to a feature, use the **feature** as the precursor.
184/// - Otherwise, use that MS1 cluster as a degenerate “orphan” precursor.
185/// - All MS2 clusters linked to any member (for features) are grouped into one spectrum.
186/// - Fragments are sorted by intensity and capped to `top_n_fragments` if >0.
187///
188/// This is the DIA→pseudo-DDA “glue” that you can feed into an mzML writer.
189pub fn build_pseudo_spectra_from_pairs(
190    ms1: &[ClusterResult1D],
191    ms2: &[ClusterResult1D],
192    features: &[SimpleFeature],
193    pairs: &[(usize, usize)],
194    opts: &PseudoSpecOpts,
195) -> Vec<PseudoSpectrum> {
196    if ms1.is_empty() || ms2.is_empty() || pairs.is_empty() {
197        return Vec::new();
198    }
199
200    let cluster_to_feature = build_cluster_to_feature_map(ms1.len(), features);
201
202    let mut grouped: HashMap<PrecursorKey, Vec<usize>> = HashMap::new();
203
204    for &(ms2_idx, ms1_idx) in pairs {
205        if ms1_idx >= ms1.len() || ms2_idx >= ms2.len() {
206            continue;
207        }
208
209        // If an MS2 has no window_group, we should never have seen it as a candidate.
210        let g = ms2[ms2_idx]
211            .window_group
212            .unwrap_or(0); // if 0 ever appears, that’s a bug upstream
213
214        let key = match cluster_to_feature[ms1_idx] {
215            Some(fid) => PrecursorKey::Feature {
216                feature_id: fid,
217                window_group: g,
218            },
219            None => PrecursorKey::OrphanCluster {
220                cluster_idx: ms1_idx,
221                window_group: g,
222            },
223        };
224
225        grouped.entry(key).or_default().push(ms2_idx);
226    }
227
228    if grouped.is_empty() {
229        return Vec::new();
230    }
231
232    let top_n = opts.top_n_fragments;
233
234    let mut spectra: Vec<PseudoSpectrum> = grouped
235        .into_par_iter()
236        .map(|(key, mut frag_indices)| {
237            frag_indices.sort_unstable();
238            frag_indices.dedup();
239
240            let (prec_mz, charge, prec_cluster_indices, prec_cluster_ids, _wg) = match key {
241                PrecursorKey::OrphanCluster { cluster_idx, window_group } => {
242                    let c = &ms1[cluster_idx];
243                    let mz = cluster_mz_mu(c).unwrap_or(0.0);
244                    let z  = 0u8;
245                    let idxs = vec![cluster_idx];
246                    let ids  = vec![c.cluster_id];
247                    (mz, z, idxs, ids, window_group)
248                }
249                PrecursorKey::Feature { feature_id, window_group } => {
250                    let feat = &features[feature_id];
251                    let mz   = feat.mz_mono;
252                    let z    = feat.charge;
253                    let idxs = feat.member_cluster_indices.clone();
254                    let ids  = feat.member_cluster_ids.clone();
255                    (mz, z, idxs, ids, window_group)
256                }
257            };
258
259            let (rt_apex, im_apex) =
260                precursor_apex_from_feature_or_cluster(key, ms1, features);
261
262            let mut frags: Vec<PseudoFragment> = frag_indices
263                .into_iter()
264                .filter_map(|j| {
265                    let c2 = &ms2[j];
266                    let mz = match cluster_mz_mu(c2) {
267                        Some(m) if m.is_finite() && m > 0.0 => m,
268                        _ => return None,
269                    };
270                    let intensity = c2.raw_sum.max(0.0);
271                    Some(PseudoFragment {
272                        mz,
273                        intensity,
274                        ms2_cluster_index: j,
275                        ms2_cluster_id: c2.cluster_id,
276                        window_group: c2.window_group.unwrap_or(0),
277                    })
278                })
279                .collect();
280
281            frags.sort_unstable_by(|a, b| {
282                b.intensity
283                    .partial_cmp(&a.intensity)
284                    .unwrap_or(Ordering::Equal)
285            });
286            if top_n > 0 && frags.len() > top_n {
287                frags.truncate(top_n);
288            }
289
290            PseudoSpectrum {
291                precursor_mz: prec_mz,
292                precursor_charge: charge,
293                rt_apex,
294                im_apex,
295                // get unique window groups from fragments
296                window_groups: {
297                    let mut wgs: Vec<u32> = frags.iter().map(|f| f.window_group).collect();
298                    wgs.sort_unstable();
299                    wgs.dedup();
300                    wgs
301                },
302                feature_id: match key {
303                    PrecursorKey::Feature { feature_id, .. } => Some(feature_id),
304                    PrecursorKey::OrphanCluster { .. } => None,
305                },
306                precursor_cluster_indices: prec_cluster_indices,
307                precursor_cluster_ids: prec_cluster_ids,
308                fragments: frags,
309            }
310        })
311        .collect();
312
313    spectra.sort_unstable_by(|a, b| {
314        a.rt_apex
315            .partial_cmp(&b.rt_apex)
316            .unwrap_or(Ordering::Equal)
317            .then_with(|| {
318                a.precursor_mz
319                    .partial_cmp(&b.precursor_mz)
320                    .unwrap_or(Ordering::Equal)
321            })
322    });
323
324    spectra
325}
326
327/// High-level entrypoint:
328/// 1) enumerate geometric/tile-based candidates
329/// 2) refine by XIC score (best precursor per fragment + cutoff)
330/// 3) build pseudo-spectra from the surviving pairs
331pub fn build_pseudo_spectra_with_xic(
332    ds: &TimsDatasetDIA,
333    ms1: &[ClusterResult1D],
334    ms2: &[ClusterResult1D],
335    features: &[SimpleFeature],
336    cand_opts: &CandidateOpts,
337    xic_opts: &XicScoreOpts,
338    pseudo_opts: &PseudoSpecOpts,
339) -> (Vec<PseudoSpectrum>, Vec<(usize, usize, f32)>) {
340    // 1) geometric / program-based candidates
341    let pairs_geom: Vec<(usize, usize)> =
342        enumerate_ms2_ms1_pairs_simple(ds, ms1, ms2, cand_opts);
343
344    if pairs_geom.is_empty() {
345        return (Vec::new(), Vec::new());
346    }
347
348    // 2) XIC scoring + best-precursor-per-fragment + cutoff
349    let pairs_scored: Vec<(usize, usize, f32)> =
350        assign_ms2_to_best_ms1_by_xic(ms1, ms2, &pairs_geom, xic_opts);
351
352    if pairs_scored.is_empty() {
353        return (Vec::new(), Vec::new());
354    }
355
356    // Drop scores for the pseudo-spectra builder (it just wants indices)
357    let final_pairs: Vec<(usize, usize)> = pairs_scored
358        .iter()
359        .map(|(ms2_idx, ms1_idx, _s)| (*ms2_idx, *ms1_idx))
360        .collect();
361
362    // 3) existing glue
363    let spectra = build_pseudo_spectra_from_pairs(
364        ms1,
365        ms2,
366        features,
367        &final_pairs,
368        pseudo_opts,
369    );
370
371    (spectra, pairs_scored)
372}