mscore/timstof/
spectrum_processing.rs

1use std::collections::BTreeMap;
2use rayon::prelude::*;
3use rayon::ThreadPoolBuilder;
4use serde::{Deserialize, Serialize};
5
6/// Configuration for spectrum preprocessing
7#[derive(Clone, Debug, Serialize, Deserialize)]
8pub struct SpectrumProcessingConfig {
9    /// Maximum number of peaks to keep after filtering (default: 150)
10    pub take_top_n: usize,
11    /// Whether to perform deisotoping (default: true)
12    pub deisotope: bool,
13    /// PPM tolerance for deisotoping (default: 10.0)
14    pub deisotope_tolerance_ppm: f64,
15    /// Minimum m/z difference for deisotoping (default: 5.0 ppm equivalent at low mass)
16    pub deisotope_min_mz_diff: f64,
17}
18
19impl Default for SpectrumProcessingConfig {
20    fn default() -> Self {
21        SpectrumProcessingConfig {
22            take_top_n: 150,
23            deisotope: true,
24            deisotope_tolerance_ppm: 10.0,
25            deisotope_min_mz_diff: 0.0005,
26        }
27    }
28}
29
30/// Represents a fully preprocessed spectrum ready for database search
31#[derive(Clone, Debug, Serialize, Deserialize)]
32pub struct PreprocessedSpectrum {
33    /// Unique spectrum identifier (e.g., "frame_id-precursor_id-dataset_name")
34    pub spec_id: String,
35    /// Precursor m/z value (monoisotopic if available, otherwise highest intensity)
36    pub precursor_mz: f64,
37    /// Precursor charge (optional)
38    pub precursor_charge: Option<i32>,
39    /// Precursor intensity
40    pub precursor_intensity: f64,
41    /// Inverse ion mobility (1/K0)
42    pub inverse_ion_mobility: f64,
43    /// Collision energy used for fragmentation
44    pub collision_energy: f64,
45    /// Scan start time / retention time in minutes
46    pub scan_start_time: f64,
47    /// Total ion current of the spectrum
48    pub total_ion_current: f64,
49    /// Fragment neutral mass values (as f32 for Sage compatibility)
50    /// Note: These are neutral masses (m/z - proton mass), matching Sage's Peak representation
51    pub mz: Vec<f32>,
52    /// Fragment intensity values (as f32 for Sage compatibility)
53    pub intensity: Vec<f32>,
54    /// Isolation m/z window center
55    pub isolation_mz: f64,
56    /// Isolation window width
57    pub isolation_width: f64,
58}
59
60impl PreprocessedSpectrum {
61    pub fn new(
62        spec_id: String,
63        precursor_mz: f64,
64        precursor_charge: Option<i32>,
65        precursor_intensity: f64,
66        inverse_ion_mobility: f64,
67        collision_energy: f64,
68        scan_start_time: f64,
69        total_ion_current: f64,
70        mz: Vec<f32>,
71        intensity: Vec<f32>,
72        isolation_mz: f64,
73        isolation_width: f64,
74    ) -> Self {
75        PreprocessedSpectrum {
76            spec_id,
77            precursor_mz,
78            precursor_charge,
79            precursor_intensity,
80            inverse_ion_mobility,
81            collision_energy,
82            scan_start_time,
83            total_ion_current,
84            mz,
85            intensity,
86            isolation_mz,
87            isolation_width,
88        }
89    }
90}
91
92/// Calculate the isotope mass difference (~1.003 Da for most elements)
93const ISOTOPE_MASS_DIFF: f64 = 1.00335;
94
95/// Proton mass in Daltons - used for converting m/z to neutral mass
96/// Sage stores fragment peaks as neutral mass, not m/z
97const PROTON_MASS: f64 = 1.007276466812;
98
99/// Deisotope a spectrum by removing peaks that are likely isotopes of more intense peaks.
100///
101/// # Arguments
102/// * `mz` - m/z values (must be sorted in ascending order)
103/// * `intensity` - intensity values corresponding to m/z
104/// * `tolerance_ppm` - PPM tolerance for matching isotope peaks
105///
106/// # Returns
107/// Tuple of (filtered_mz, filtered_intensity) with isotope peaks removed
108pub fn deisotope_spectrum(
109    mz: &[f64],
110    intensity: &[f64],
111    tolerance_ppm: f64,
112) -> (Vec<f64>, Vec<f64>) {
113    if mz.is_empty() {
114        return (Vec::new(), Vec::new());
115    }
116
117    let n = mz.len();
118    let mut is_isotope = vec![false; n];
119
120    // For each peak, check if there's a more intense peak that could be the monoisotopic
121    for i in 0..n {
122        if is_isotope[i] {
123            continue;
124        }
125
126        let current_mz = mz[i];
127        let current_intensity = intensity[i];
128
129        // Look for potential isotope peaks (up to +4 Da to cover up to charge 1-4)
130        for charge in 1..=4 {
131            let isotope_spacing = ISOTOPE_MASS_DIFF / charge as f64;
132
133            // Check for isotope peaks at +1, +2, +3 isotope positions
134            for isotope_num in 1..=3 {
135                let expected_mz = current_mz + isotope_spacing * isotope_num as f64;
136                let tolerance = expected_mz * tolerance_ppm / 1_000_000.0;
137
138                // Binary search for potential matches
139                for j in (i + 1)..n {
140                    if mz[j] < expected_mz - tolerance {
141                        continue;
142                    }
143                    if mz[j] > expected_mz + tolerance {
144                        break;
145                    }
146
147                    // Found a potential isotope peak
148                    // Mark it as isotope if it's less intense than the monoisotopic
149                    if intensity[j] < current_intensity {
150                        is_isotope[j] = true;
151                    }
152                }
153            }
154        }
155    }
156
157    // Collect non-isotope peaks
158    let mut filtered_mz = Vec::with_capacity(n);
159    let mut filtered_intensity = Vec::with_capacity(n);
160
161    for i in 0..n {
162        if !is_isotope[i] {
163            filtered_mz.push(mz[i]);
164            filtered_intensity.push(intensity[i]);
165        }
166    }
167
168    (filtered_mz, filtered_intensity)
169}
170
171/// Filter spectrum to keep only the top N most intense peaks.
172///
173/// # Arguments
174/// * `mz` - m/z values
175/// * `intensity` - intensity values
176/// * `top_n` - maximum number of peaks to keep
177///
178/// # Returns
179/// Tuple of (filtered_mz, filtered_intensity) sorted by m/z
180pub fn filter_top_n(
181    mz: &[f64],
182    intensity: &[f64],
183    top_n: usize,
184) -> (Vec<f64>, Vec<f64>) {
185    if mz.len() <= top_n {
186        return (mz.to_vec(), intensity.to_vec());
187    }
188
189    // Create indices sorted by intensity (descending)
190    let mut indices: Vec<usize> = (0..mz.len()).collect();
191    indices.sort_by(|&a, &b| {
192        intensity[b].partial_cmp(&intensity[a]).unwrap_or(std::cmp::Ordering::Equal)
193    });
194
195    // Take top N indices
196    indices.truncate(top_n);
197
198    // Sort by m/z for the final result
199    indices.sort_by(|&a, &b| {
200        mz[a].partial_cmp(&mz[b]).unwrap_or(std::cmp::Ordering::Equal)
201    });
202
203    let filtered_mz: Vec<f64> = indices.iter().map(|&i| mz[i]).collect();
204    let filtered_intensity: Vec<f64> = indices.iter().map(|&i| intensity[i]).collect();
205
206    (filtered_mz, filtered_intensity)
207}
208
209/// Normalize intensity values to sum to a constant (e.g., 1.0 or 10000.0)
210pub fn normalize_intensity(intensity: &[f64], target_sum: f64) -> Vec<f64> {
211    let sum: f64 = intensity.iter().sum();
212    if sum == 0.0 {
213        return intensity.to_vec();
214    }
215    intensity.iter().map(|&i| i * target_sum / sum).collect()
216}
217
218/// Flatten a TimsFrame-like structure (multiple scans) into a single spectrum.
219/// Groups peaks by TOF index, sums intensities, and averages m/z values.
220///
221/// # Arguments
222/// * `tof` - TOF indices for each data point
223/// * `mz` - m/z values for each data point
224/// * `intensity` - intensity values for each data point
225///
226/// # Returns
227/// Tuple of (flattened_mz, flattened_intensity) sorted by m/z
228pub fn flatten_frame_to_spectrum(
229    tof: &[i32],
230    mz: &[f64],
231    intensity: &[f64],
232) -> (Vec<f64>, Vec<f64>) {
233    // Group by TOF index
234    let mut grouped_data: BTreeMap<i32, Vec<(f64, f64)>> = BTreeMap::new();
235
236    for i in 0..tof.len() {
237        grouped_data
238            .entry(tof[i])
239            .or_insert_with(Vec::new)
240            .push((mz[i], intensity[i]));
241    }
242
243    let mut result_mz = Vec::with_capacity(grouped_data.len());
244    let mut result_intensity = Vec::with_capacity(grouped_data.len());
245
246    for (_, values) in grouped_data {
247        let sum_intensity: f64 = values.iter().map(|&(_, i)| i).sum();
248        let avg_mz: f64 = values.iter().map(|&(m, _)| m).sum::<f64>() / values.len() as f64;
249
250        result_mz.push(avg_mz);
251        result_intensity.push(sum_intensity);
252    }
253
254    (result_mz, result_intensity)
255}
256
257/// Calculate the inverse mobility at the scan with highest total intensity.
258/// This implements the "marginal" mobility calculation.
259///
260/// # Arguments
261/// * `scan` - scan indices for each data point
262/// * `mobility` - inverse mobility values for each data point
263/// * `intensity` - intensity values for each data point
264///
265/// # Returns
266/// The inverse mobility value at the scan with highest summed intensity
267pub fn get_inverse_mobility_along_scan_marginal(
268    scan: &[i32],
269    mobility: &[f64],
270    intensity: &[f64],
271) -> f64 {
272    let mut marginal_map: BTreeMap<i32, (f64, f64)> = BTreeMap::new();
273
274    for i in 0..scan.len() {
275        let entry = marginal_map.entry(scan[i]).or_insert((0.0, 0.0));
276        entry.0 += intensity[i];  // Sum intensity
277        entry.1 = mobility[i];     // Store mobility (overwrite is fine, same scan = same mobility)
278    }
279
280    marginal_map
281        .iter()
282        .max_by(|a, b| a.1.0.partial_cmp(&b.1.0).unwrap_or(std::cmp::Ordering::Equal))
283        .map(|(_, (_, mob))| *mob)
284        .unwrap_or(0.0)
285}
286
287/// Process a single spectrum: flatten, optionally deisotope, filter top N peaks.
288///
289/// # Arguments
290/// * `tof` - TOF indices
291/// * `mz` - m/z values
292/// * `intensity` - intensity values
293/// * `config` - processing configuration
294///
295/// # Returns
296/// Tuple of (processed_mass, processed_intensity) as f32 vectors for Sage compatibility.
297/// Note: m/z values are converted to neutral mass by subtracting the proton mass,
298/// matching Sage's Peak representation which stores neutral mass.
299pub fn process_spectrum(
300    tof: &[i32],
301    mz: &[f64],
302    intensity: &[f64],
303    config: &SpectrumProcessingConfig,
304) -> (Vec<f32>, Vec<f32>) {
305    // Step 1: Flatten frame to spectrum (group by TOF)
306    let (flat_mz, flat_intensity) = flatten_frame_to_spectrum(tof, mz, intensity);
307
308    // Step 2: Optionally deisotope
309    let (deiso_mz, deiso_intensity) = if config.deisotope {
310        deisotope_spectrum(&flat_mz, &flat_intensity, config.deisotope_tolerance_ppm)
311    } else {
312        (flat_mz, flat_intensity)
313    };
314
315    // Step 3: Filter to top N peaks
316    let (final_mz, final_intensity) = filter_top_n(&deiso_mz, &deiso_intensity, config.take_top_n);
317
318    // Convert m/z to neutral mass by subtracting proton mass (assumes singly charged fragments)
319    // This matches Sage's SpectrumProcessor behavior which stores neutral mass in Peak objects
320    let mass_f32: Vec<f32> = final_mz.iter().map(|&x| (x - PROTON_MASS) as f32).collect();
321    let intensity_f32: Vec<f32> = final_intensity.iter().map(|&x| x as f32).collect();
322
323    (mass_f32, intensity_f32)
324}
325
326/// Metadata required for processing a single PASEF fragment
327#[derive(Clone, Debug)]
328pub struct PASEFFragmentData {
329    pub frame_id: u32,
330    pub precursor_id: u32,
331    pub collision_energy: f64,
332    pub scan_start_time: f64,  // retention time in minutes
333    pub scan: Vec<i32>,
334    pub mobility: Vec<f64>,
335    pub tof: Vec<i32>,
336    pub mz: Vec<f64>,
337    pub intensity: Vec<f64>,
338    // Precursor metadata
339    pub precursor_mz: f64,
340    pub precursor_charge: Option<i32>,
341    pub precursor_intensity: f64,
342    pub isolation_mz: f64,
343    pub isolation_width: f64,
344}
345
346/// Process a batch of PASEF fragments in parallel.
347///
348/// # Arguments
349/// * `fragments` - Vector of PASEF fragment data
350/// * `dataset_name` - Name of the dataset for generating spec_ids
351/// * `config` - Processing configuration
352/// * `num_threads` - Number of threads to use for parallel processing
353///
354/// # Returns
355/// Vector of preprocessed spectra ready for database search
356pub fn process_pasef_fragments_batch(
357    fragments: Vec<PASEFFragmentData>,
358    dataset_name: &str,
359    config: &SpectrumProcessingConfig,
360    num_threads: usize,
361) -> Vec<PreprocessedSpectrum> {
362    let pool = ThreadPoolBuilder::new()
363        .num_threads(num_threads)
364        .build()
365        .unwrap();
366
367    let ds_name = dataset_name.to_string();
368
369    pool.install(|| {
370        fragments
371            .par_iter()
372            .map(|frag| {
373                // Calculate inverse mobility along scan marginal
374                let inverse_mobility = get_inverse_mobility_along_scan_marginal(
375                    &frag.scan,
376                    &frag.mobility,
377                    &frag.intensity,
378                );
379
380                // Process the spectrum (flatten, deisotope, filter)
381                let (processed_mz, processed_intensity) = process_spectrum(
382                    &frag.tof,
383                    &frag.mz,
384                    &frag.intensity,
385                    config,
386                );
387
388                // Calculate total ion current
389                let total_ion_current: f64 = frag.intensity.iter().sum();
390
391                // Generate spec_id: frame_id-precursor_id-dataset_name
392                let spec_id = format!("{}-{}-{}", frag.frame_id, frag.precursor_id, ds_name);
393
394                PreprocessedSpectrum::new(
395                    spec_id,
396                    frag.precursor_mz,
397                    frag.precursor_charge,
398                    frag.precursor_intensity,
399                    inverse_mobility,
400                    frag.collision_energy,
401                    frag.scan_start_time,
402                    total_ion_current,
403                    processed_mz,
404                    processed_intensity,
405                    frag.isolation_mz,
406                    frag.isolation_width,
407                )
408            })
409            .collect()
410    })
411}
412
413#[cfg(test)]
414mod tests {
415    use super::*;
416
417    #[test]
418    fn test_deisotope_spectrum() {
419        // Simple test case: monoisotopic peak at 500.0 with isotopes at ~501.003 and ~502.006
420        let mz = vec![500.0, 501.003, 502.006, 600.0];
421        let intensity = vec![1000.0, 500.0, 250.0, 800.0];
422
423        let (filtered_mz, _filtered_intensity) = deisotope_spectrum(&mz, &intensity, 10.0);
424
425        // Should keep monoisotopic (500.0) and remove isotopes, keep 600.0
426        assert_eq!(filtered_mz.len(), 2);
427        assert!((filtered_mz[0] - 500.0).abs() < 0.001);
428        assert!((filtered_mz[1] - 600.0).abs() < 0.001);
429    }
430
431    #[test]
432    fn test_filter_top_n() {
433        let mz = vec![100.0, 200.0, 300.0, 400.0, 500.0];
434        let intensity = vec![10.0, 50.0, 30.0, 100.0, 20.0];
435
436        let (filtered_mz, _filtered_intensity) = filter_top_n(&mz, &intensity, 3);
437
438        // Should keep top 3 by intensity: 400 (100), 200 (50), 300 (30)
439        assert_eq!(filtered_mz.len(), 3);
440        // Should be sorted by m/z
441        assert!((filtered_mz[0] - 200.0).abs() < 0.001);
442        assert!((filtered_mz[1] - 300.0).abs() < 0.001);
443        assert!((filtered_mz[2] - 400.0).abs() < 0.001);
444    }
445
446    #[test]
447    fn test_flatten_frame_to_spectrum() {
448        // Two data points with same TOF should be merged
449        let tof = vec![1000, 1000, 2000];
450        let mz = vec![500.0, 500.1, 600.0];
451        let intensity = vec![100.0, 200.0, 300.0];
452
453        let (flat_mz, flat_intensity) = flatten_frame_to_spectrum(&tof, &mz, &intensity);
454
455        assert_eq!(flat_mz.len(), 2);
456        // First group: avg mz = (500.0 + 500.1) / 2 = 500.05, sum intensity = 300
457        assert!((flat_mz[0] - 500.05).abs() < 0.001);
458        assert!((flat_intensity[0] - 300.0).abs() < 0.001);
459        // Second group
460        assert!((flat_mz[1] - 600.0).abs() < 0.001);
461        assert!((flat_intensity[1] - 300.0).abs() < 0.001);
462    }
463
464    #[test]
465    fn test_get_inverse_mobility_along_scan_marginal() {
466        let scan = vec![100, 100, 101, 101];
467        let mobility = vec![1.0, 1.0, 1.1, 1.1];
468        let intensity = vec![50.0, 100.0, 200.0, 100.0];
469
470        let result = get_inverse_mobility_along_scan_marginal(&scan, &mobility, &intensity);
471
472        // Scan 101 has total intensity 300, scan 100 has 150
473        // So should return mobility at scan 101 = 1.1
474        assert!((result - 1.1).abs() < 0.001);
475    }
476
477    #[test]
478    fn test_process_spectrum() {
479        let tof = vec![1000, 1001, 1002, 2000, 2001];
480        let mz = vec![500.0, 501.003, 502.006, 600.0, 601.003];
481        let intensity = vec![1000.0, 500.0, 250.0, 800.0, 400.0];
482
483        let config = SpectrumProcessingConfig {
484            take_top_n: 10,
485            deisotope: true,
486            deisotope_tolerance_ppm: 10.0,
487            deisotope_min_mz_diff: 0.0005,
488        };
489
490        let (processed_mz, processed_intensity) = process_spectrum(&tof, &mz, &intensity, &config);
491
492        // After deisotoping, should have fewer peaks
493        assert!(processed_mz.len() <= mz.len());
494        // Intensities should sum to less than original (isotopes removed)
495        let processed_sum: f32 = processed_intensity.iter().sum();
496        let original_sum: f64 = intensity.iter().sum();
497        assert!(processed_sum < original_sum as f32);
498    }
499}