1use std::cmp::Ordering;
2use mscore::algorithm::isotope::generate_averagine_spectra;
3use crate::cluster::cluster::ClusterResult1D;
4
5#[derive(Clone, Debug)]
6pub struct AveragineLut {
7 pub masses: Vec<f32>, pub z_min: u8,
9 pub z_max: u8,
10 pub k: usize, pub envs: Vec<[f32; 8]>, }
13
14impl AveragineLut {
15 #[inline]
16 fn clamp_resolution_decimals(resolution: i32) -> i32 {
17 resolution.clamp(0, 6)
19 }
20
21 #[inline]
22 fn clamp_threads(n: usize) -> usize {
23 n.clamp(1, 32)
24 }
25
26 pub fn build(
27 mass_min: f32,
28 mass_max: f32,
29 step: f32, z_min: u8,
31 z_max: u8,
32 k: usize, resolution: i32, num_threads: usize,
35 ) -> Self {
36 let mass_min = mass_min.max(50.0);
38 let mass_max = mass_max.max(mass_min + 1.0);
39 let step = step.max(1.0);
40 let z_min = z_min.max(1);
41 let z_max = z_max.max(z_min);
42 let k = k.clamp(1, 8);
43
44 let mut masses: Vec<f32> = Vec::new();
45 let mut m = mass_min;
46 while m <= mass_max + 1e-6 {
47 masses.push(m);
48 m += step;
49 }
50
51 const MAX_GRID_POINTS: usize = 200_000; if masses.len() > MAX_GRID_POINTS {
54 panic!("AveragineLut grid too large: {} points (> {})", masses.len(), MAX_GRID_POINTS);
55 }
56
57 let per_z = masses.len();
59 let n_env = per_z * (z_max - z_min + 1) as usize;
60 let mut envs: Vec<[f32; 8]> = Vec::with_capacity(n_env);
61
62 let res_dec = Self::clamp_resolution_decimals(resolution);
64 let threads = Self::clamp_threads(num_threads);
65
66 const CHUNK: usize = 512;
69
70 let mut masses_f64: Vec<f64> = Vec::with_capacity(CHUNK);
72 let mut charges: Vec<i32> = Vec::with_capacity(CHUNK);
73
74 for z in z_min..=z_max {
75 let zi = z as i32;
76
77 let mut start = 0;
78 while start < masses.len() {
79 let end = (start + CHUNK).min(masses.len());
80
81 masses_f64.clear();
83 masses_f64.extend(masses[start..end].iter().map(|&x| x as f64));
84
85 charges.clear();
86 charges.resize(end - start, zi);
87
88 let specs = generate_averagine_spectra(
90 masses_f64.clone(), charges.clone(),
92 1,
93 k as i32,
94 res_dec,
95 true,
96 threads,
97 None,
98 );
99
100 for sp in specs {
102 let mut v = [0f32; 8];
103 let keep = k.min(sp.intensity.len());
104 for i in 0..keep {
105 v[i] = sp.intensity[i] as f32;
106 }
107 let norm = v.iter().map(|x| (*x as f64) * (*x as f64)).sum::<f64>().sqrt() as f32;
109 if norm > 0.0 {
110 for x in &mut v { *x /= norm; }
111 }
112 envs.push(v);
113 }
114
115 start = end;
116 }
117 }
118
119 Self { masses, z_min, z_max, k, envs }
120 }
121
122 #[inline]
123 pub fn lookup(&self, neutral_mass: f32, z: u8) -> [f32; 8] {
124 if z < self.z_min || z > self.z_max || self.masses.is_empty() {
125 return [0.0; 8];
126 }
127 let zi = (z - self.z_min) as usize;
128 let per_z = self.masses.len();
129 let i = match self.masses.binary_search_by(|m| m.partial_cmp(&neutral_mass).unwrap_or(Ordering::Equal)) {
131 Ok(i) => i,
132 Err(i) => i.saturating_sub(1).min(per_z.saturating_sub(1)),
133 };
134 self.envs[zi * per_z + i]
135 }
136}
137
138#[derive(Clone, Debug)]
139pub struct GroupingParams {
140 pub rt_pad_overlap: usize, pub im_pad_overlap: usize,
142 pub mz_ppm_tol: f32, pub iso_ppm_tol: f32, pub iso_abs_da: f32, pub z_min: u8, pub z_max: u8, }
148
149#[derive(Clone, Debug)]
150pub struct FeatureBuildParams {
151 pub k_max: usize, pub min_members: usize, pub min_cosine: f32, pub w_spacing: f32, pub w_coelute: f32, pub w_monotonic: f32, pub penalty_skip_one: f32, pub steal_delta: f32, pub require_lowest_is_mono: bool, }
164
165#[derive(Clone, Debug)]
166pub struct SimpleFeature {
167 pub feature_id: usize,
168 pub charge: u8,
169 pub mz_mono: f32,
170 pub neutral_mass: f32,
171 pub rt_bounds: (usize, usize),
172 pub im_bounds: (usize, usize),
173 pub mz_center: f32,
174 pub n_members: usize,
175
176 pub member_cluster_indices: Vec<usize>,
178
179 pub member_cluster_ids: Vec<u64>,
181
182 pub raw_sum: f32,
183
184 pub cos_averagine: f32,
186
187 pub member_clusters: Vec<ClusterResult1D>,
191}
192
193#[derive(Clone, Debug)]
194pub struct SimpleFeatureParams {
195 pub z_min: u8,
196 pub z_max: u8,
197 pub iso_ppm_tol: f32, pub iso_abs_da: f32, pub min_members: usize, pub max_members: usize, pub min_raw_sum: f32, pub min_mz: f32, pub min_rt_overlap_frac: f32, pub min_im_overlap_frac: f32, pub min_cosine: f32,
207
208 pub w_spacing_penalty: f32,
214}
215
216impl Default for SimpleFeatureParams {
217 fn default() -> Self {
218 Self {
219 z_min: 1,
220 z_max: 5,
221 iso_ppm_tol: 10.0,
222 iso_abs_da: 0.003,
223 min_members: 2,
224 max_members: 5,
225 min_raw_sum: 0.0,
226 min_mz: 100.0,
227 min_rt_overlap_frac: 0.3,
228 min_im_overlap_frac: 0.3,
229 min_cosine: 0.0,
230 w_spacing_penalty: 0.0,
231 }
232 }
233}
234
235fn cluster_mz_mu(c: &ClusterResult1D) -> Option<f32> {
236 if let Some(ref f) = c.mz_fit {
237 if f.mu.is_finite() && f.mu > 0.0 {
238 return Some(f.mu);
239 }
240 }
241 if let Some((lo, hi)) = c.mz_window {
242 let mu = 0.5 * (lo + hi);
243 if mu.is_finite() && mu > 0.0 {
244 return Some(mu);
245 }
246 }
247 None
248}
249
250fn _sigma_from_fwhm(fwhm: f32) -> Option<f32> {
251 if !fwhm.is_finite() || fwhm <= 0.0 {
252 return None;
253 }
254 let two_sqrt_two_ln2 = 2.0 * 2.0_f32.ln().sqrt();
255 let sigma = fwhm / two_sqrt_two_ln2;
256 if sigma.is_finite() && sigma > 0.0 {
257 Some(sigma)
258 } else {
259 None
260 }
261}
262
263fn _cluster_mz_sigma(c: &ClusterResult1D) -> Option<f32> {
264 if let Some(ref f) = c.mz_fit {
265 if f.sigma.is_finite() && f.sigma > 0.0 {
266 return Some(f.sigma);
267 }
268 }
269 if let Some((lo, hi)) = c.mz_window {
270 let fwhm = (hi - lo).abs();
271 if let Some(s) = _sigma_from_fwhm(fwhm) {
272 return Some(s);
273 }
274 }
275 None
276}
277
278#[inline]
279fn frac_overlap(a: (usize, usize), b: (usize, usize)) -> f32 {
280 let l = a.0.max(b.0);
281 let r = a.1.min(b.1);
282 if r < l {
283 0.0
284 } else {
285 let len = (r - l + 1) as f32;
286 let len_a = (a.1 - a.0 + 1) as f32;
287 let len_b = (b.1 - b.0 + 1) as f32;
288 len / len_a.max(len_b)
289 }
290}
291
292#[inline]
293fn iso_tolerance_da(mz: f32, p: &SimpleFeatureParams) -> f32 {
294 let ppm_term = mz.abs().max(1e-6) * (p.iso_ppm_tol * 1e-6);
295 ppm_term.max(p.iso_abs_da)
296}
297
298fn cosine_iso_lut(
300 neutral_mass: f32,
301 z: u8,
302 iso_raw: &[f32],
303 lut: &AveragineLut,
304) -> f32 {
305 if iso_raw.is_empty() {
306 return 0.0;
307 }
308
309 let env = lut.lookup(neutral_mass, z); let mut v = [0.0f32; 8];
311
312 let keep = iso_raw.len().min(8);
313 let mut norm2 = 0.0f32;
314 for i in 0..keep {
315 let x = iso_raw[i].max(0.0); v[i] = x;
317 norm2 += x * x;
318 }
319
320 if norm2 <= 0.0 {
321 return 0.0;
322 }
323 let inv_norm = norm2.sqrt().recip();
324
325 let mut dot = 0.0f32;
326 for i in 0..8 {
327 let x = v[i] * inv_norm;
328 dot += x * env[i];
329 }
330 dot
331}
332
333#[derive(Clone, Debug)]
334struct GoodCluster {
335 orig_idx: usize,
337
338 pub cluster_id: u64,
340
341 mz_mu: f32,
342 _rt_mu: f32,
343 rt_win: (usize, usize),
344 _im_mu: f32,
345 im_win: (usize, usize),
346 raw_sum: f32,
347}
348
349pub fn build_simple_features_from_clusters(
353 clusters: &[ClusterResult1D],
354 params: &SimpleFeatureParams,
355 lut: Option<&AveragineLut>,
356) -> Vec<SimpleFeature> {
357 if clusters.is_empty() {
358 return Vec::new();
359 }
360
361 let mut good: Vec<GoodCluster> = clusters
363 .iter()
364 .enumerate()
365 .filter_map(|(i, c)| {
366 if c.raw_sum < params.min_raw_sum {
367 return None;
368 }
369 let mz_mu = cluster_mz_mu(c)?;
370 if mz_mu <= params.min_mz {
371 return None;
372 }
373 if !c.rt_fit.mu.is_finite() || !c.im_fit.mu.is_finite() {
374 return None;
375 }
376 Some(GoodCluster {
377 orig_idx: i,
378 cluster_id: c.cluster_id,
379 mz_mu,
380 _rt_mu: c.rt_fit.mu,
381 rt_win: c.rt_window,
382 _im_mu: c.im_fit.mu,
383 im_win: c.im_window,
384 raw_sum: c.raw_sum,
385 })
386 })
387 .collect();
388
389 if good.is_empty() {
390 return Vec::new();
391 }
392
393 good.sort_by(|a, b| a.mz_mu.partial_cmp(&b.mz_mu).unwrap_or(Ordering::Equal));
395
396 let mzs: Vec<f32> = good.iter().map(|g| g.mz_mu).collect();
398
399 let mut seed_indices: Vec<usize> = (0..good.len()).collect();
401 seed_indices.sort_unstable_by(|&a, &b| {
402 good[b]
403 .raw_sum
404 .partial_cmp(&good[a].raw_sum)
405 .unwrap_or(Ordering::Equal)
406 });
407
408 let mut assigned = vec![false; good.len()];
410 let mut features: Vec<SimpleFeature> = Vec::new();
411
412 for seed_idx in seed_indices {
414 if assigned[seed_idx] {
415 continue;
416 }
417
418 let mut best_chain: Vec<usize> = Vec::new();
419 let mut best_z: u8 = 0;
420 let mut best_score: f32 = f32::NEG_INFINITY;
421
422 for z in params.z_min..=params.z_max {
423 let delta = 1.003_355_f32 / (z as f32);
424
425 let env_opt: Option<[f32; 8]> = if let Some(lut_ref) = lut {
429 let seed_mz = good[seed_idx].mz_mu;
430 if seed_mz.is_finite() && seed_mz > params.min_mz {
431 let neutral_seed =
432 (seed_mz - 1.007_276_5_f32).max(0.0) * (z as f32);
433 Some(lut_ref.lookup(neutral_seed, z))
434 } else {
435 None
436 }
437 } else {
438 None
439 };
440
441 const DIR_EPS_ENV: f32 = 1e-3; const SLACK_UP: f32 = 1.05; const SLACK_DOWN: f32 = 0.95; let mut chain_up: Vec<usize> = Vec::new();
448 chain_up.push(seed_idx);
449
450 let mut iso_raw_up: Vec<f32> = Vec::new();
452 iso_raw_up.push(good[seed_idx].raw_sum.max(0.0));
453
454 let mut cur_up = seed_idx;
455 'upward: while chain_up.len() < params.max_members {
456 let cur = &good[cur_up];
457 let target_m = cur.mz_mu + delta;
458 let tol = iso_tolerance_da(target_m, params);
459
460 let lo = target_m - tol;
461 let hi = target_m + tol;
462
463 let left = mzs
465 .binary_search_by(|x| x.partial_cmp(&lo).unwrap_or(Ordering::Equal))
466 .unwrap_or_else(|i| i);
467 let mut right = match mzs.binary_search_by(|x| x.partial_cmp(&hi).unwrap_or(Ordering::Equal)) {
468 Ok(i) => i + 1,
469 Err(i) => i,
470 };
471 if right > mzs.len() {
472 right = mzs.len();
473 }
474
475 let mut best_candidate: Option<(usize, f32)> = None; for k in left..right {
478 if assigned[k] || chain_up.contains(&k) {
479 continue;
480 }
481 let cand = &good[k];
482
483 let rt_ov = frac_overlap(cur.rt_win, cand.rt_win);
485 if rt_ov < params.min_rt_overlap_frac {
486 continue;
487 }
488 let im_ov = frac_overlap(cur.im_win, cand.im_win);
489 if im_ov < params.min_im_overlap_frac {
490 continue;
491 }
492
493 let score = cand.raw_sum * (0.5 * (rt_ov + im_ov));
495
496 match best_candidate {
497 None => best_candidate = Some((k, score)),
498 Some((_, best_s)) => {
499 if score > best_s {
500 best_candidate = Some((k, score));
501 }
502 }
503 }
504 }
505
506 if let Some((best_k, _)) = best_candidate {
507 if let Some(env) = env_opt {
509 let obs_prev = *iso_raw_up.last().unwrap_or(&0.0);
510 let obs_new = good[best_k].raw_sum.max(0.0);
511
512 let env_idx = iso_raw_up.len().min(env.len().saturating_sub(1));
514
515 if env_idx > 0 && env_idx < env.len() {
516 let env_prev = env[env_idx - 1];
517 let env_cur = env[env_idx];
518
519 if env_cur > env_prev * (1.0 + DIR_EPS_ENV) {
521 if obs_new < obs_prev * SLACK_UP {
523 break 'upward;
525 }
526 } else if env_cur < env_prev * (1.0 - DIR_EPS_ENV) {
527 if obs_new > obs_prev * SLACK_DOWN {
529 break 'upward;
531 }
532 }
533 }
535
536 iso_raw_up.push(obs_new);
537 }
538
539 chain_up.push(best_k);
540 cur_up = best_k;
541 } else {
542 break;
543 }
544 }
545
546 let mut chain_down: Vec<usize> = Vec::new();
548 let mut iso_raw_down: Vec<f32> = Vec::new();
549 iso_raw_down.push(good[seed_idx].raw_sum.max(0.0));
550
551 let mut cur_down = seed_idx;
552
553 'downward: while (chain_down.len() + chain_up.len()) < params.max_members {
554 let cur = &good[cur_down];
555 let target_m = cur.mz_mu - delta;
556 if target_m <= 0.0 {
557 break;
558 }
559 let tol = iso_tolerance_da(target_m, params);
560 let lo = target_m - tol;
561 let hi = target_m + tol;
562
563 let left = mzs
564 .binary_search_by(|x| x.partial_cmp(&lo).unwrap_or(Ordering::Equal))
565 .unwrap_or_else(|i| i);
566 let mut right = match mzs.binary_search_by(|x| x.partial_cmp(&hi).unwrap_or(Ordering::Equal)) {
567 Ok(i) => i + 1,
568 Err(i) => i,
569 };
570 if right > mzs.len() {
571 right = mzs.len();
572 }
573
574 let mut best_candidate: Option<(usize, f32)> = None; for k in left..right {
577 if assigned[k] || chain_up.contains(&k) || chain_down.contains(&k) {
578 continue;
579 }
580 let cand = &good[k];
581
582 let rt_ov = frac_overlap(cur.rt_win, cand.rt_win);
583 if rt_ov < params.min_rt_overlap_frac {
584 continue;
585 }
586 let im_ov = frac_overlap(cur.im_win, cand.im_win);
587 if im_ov < params.min_im_overlap_frac {
588 continue;
589 }
590
591 let score = cand.raw_sum * (0.5 * (rt_ov + im_ov));
592
593 match best_candidate {
594 None => best_candidate = Some((k, score)),
595 Some((_, best_s)) => {
596 if score > best_s {
597 best_candidate = Some((k, score));
598 }
599 }
600 }
601 }
602
603 if let Some((best_k, _)) = best_candidate {
604 if let Some(env) = env_opt {
606 let obs_prev = *iso_raw_down.last().unwrap_or(&0.0);
607 let obs_new = good[best_k].raw_sum.max(0.0);
608
609 let env_idx = iso_raw_down.len().min(env.len().saturating_sub(1));
611
612 if env_idx > 0 && env_idx < env.len() {
613 let env_prev = env[env_idx - 1];
614 let env_cur = env[env_idx];
615
616 if env_cur > env_prev * (1.0 + DIR_EPS_ENV) {
617 if obs_new < obs_prev * SLACK_UP {
619 break 'downward;
620 }
621 } else if env_cur < env_prev * (1.0 - DIR_EPS_ENV) {
622 if obs_new > obs_prev * SLACK_DOWN {
624 break 'downward;
625 }
626 }
627 }
628
629 iso_raw_down.push(obs_new);
630 }
631
632 chain_down.push(best_k);
633 cur_down = best_k;
634 } else {
635 break;
636 }
637 }
638
639 chain_down.reverse(); let mut chain: Vec<usize> = chain_down;
641 chain.extend(chain_up.into_iter());
642
643 if chain.len() < params.min_members {
644 continue;
645 }
646
647 let mut spacing_penalty = 0.0f32;
649 if params.w_spacing_penalty > 0.0 && chain.len() >= 2 {
650 let mut spacing_err_sum = 0.0f32;
651 let mut spacing_links = 0usize;
652
653 for win in chain.windows(2) {
654 let i = win[0];
655 let j = win[1];
656 let gi = &good[i];
657 let gj = &good[j];
658
659 let observed = gj.mz_mu - gi.mz_mu;
660 let ideal = delta;
661 let mid_mz = 0.5 * (gi.mz_mu + gj.mz_mu);
662 let tol_da = iso_tolerance_da(mid_mz, params);
663
664 if tol_da > 0.0 {
665 let err_units = (observed - ideal).abs() / tol_da; spacing_err_sum += err_units;
667 spacing_links += 1;
668 }
669 }
670
671 if spacing_links > 0 {
672 spacing_penalty = spacing_err_sum / (spacing_links as f32);
673 }
674 }
675
676 let score_raw: f32 = chain.iter().map(|&i| good[i].raw_sum).sum();
678 let score = if params.w_spacing_penalty > 0.0 {
679 score_raw - params.w_spacing_penalty * spacing_penalty
680 } else {
681 score_raw
682 };
683
684 if score > best_score {
685 best_score = score;
686 best_chain = chain;
687 best_z = z;
688 }
689 }
690
691 if best_chain.len() < params.min_members || best_z == 0 {
692 continue;
693 }
694
695 let mut cos_averagine = 0.0f32;
697 if let Some(lut_ref) = lut {
698 let mut mz_mono_tmp = f32::INFINITY;
700 let mut iso_raw: Vec<f32> = Vec::with_capacity(best_chain.len());
701
702 for &idx in &best_chain {
703 let g = &good[idx];
704 mz_mono_tmp = mz_mono_tmp.min(g.mz_mu);
705 iso_raw.push(g.raw_sum.max(0.0));
706 }
707
708 if !mz_mono_tmp.is_finite() || mz_mono_tmp <= params.min_mz {
709 continue;
710 }
711
712 let neutral_tmp = (mz_mono_tmp - 1.007_276_5_f32).max(0.0) * (best_z as f32);
713 cos_averagine = cosine_iso_lut(neutral_tmp, best_z, &iso_raw, lut_ref);
714
715 if params.min_cosine > 0.0 && cos_averagine < params.min_cosine {
716 continue;
718 }
719 }
720
721 for &idx in &best_chain {
723 assigned[idx] = true;
724 }
725
726 let mut mz_mono = f32::INFINITY;
728 let mut mz_weighted = 0.0_f64;
729 let mut wsum = 0.0_f64;
730 let mut rt_min = usize::MAX;
731 let mut rt_max = 0usize;
732 let mut im_min = usize::MAX;
733 let mut im_max = 0usize;
734 let mut raw_sum_total = 0.0_f32;
735
736 let mut member_cluster_indices: Vec<usize> =
737 Vec::with_capacity(best_chain.len());
738 let mut member_cluster_ids: Vec<u64> =
739 Vec::with_capacity(best_chain.len());
740
741 for &idx in &best_chain {
742 let g = &good[idx];
743 mz_mono = mz_mono.min(g.mz_mu);
744 let w = g.raw_sum.max(1.0) as f64;
745 mz_weighted += (g.mz_mu as f64) * w;
746 wsum += w;
747 rt_min = rt_min.min(g.rt_win.0);
748 rt_max = rt_max.max(g.rt_win.1);
749 im_min = im_min.min(g.im_win.0);
750 im_max = im_max.max(g.im_win.1);
751 raw_sum_total += g.raw_sum;
752
753 member_cluster_indices.push(g.orig_idx);
754 member_cluster_ids.push(g.cluster_id);
755 }
756
757 if !mz_mono.is_finite() || mz_mono <= params.min_mz {
758 continue;
759 }
760 let mz_center = if wsum > 0.0 {
761 (mz_weighted / wsum) as f32
762 } else {
763 mz_mono
764 };
765
766 let neutral_mass = (mz_mono - 1.007_276_5_f32).max(0.0) * (best_z as f32);
767
768 let feat_id = features.len();
770 features.push(SimpleFeature {
771 feature_id: feat_id,
772 charge: best_z,
773 mz_mono,
774 neutral_mass,
775 rt_bounds: (rt_min, rt_max),
776 im_bounds: (im_min, im_max),
777 mz_center,
778 n_members: best_chain.len(),
779 member_cluster_indices,
780 member_cluster_ids,
781 raw_sum: raw_sum_total,
782 cos_averagine,
783 member_clusters: best_chain
784 .iter()
785 .map(|&i| clusters[good[i].orig_idx].clone())
786 .collect(),
787 });
788 }
789
790 features
791}