rustms/proteomics/
peptide.rs

1use std::collections::HashMap;
2use regex::Regex;
3use itertools::Itertools;
4use serde::{Deserialize, Serialize};
5use crate::algorithm::peptide::{calculate_peptide_mono_isotopic_mass, calculate_peptide_product_ion_mono_isotopic_mass, peptide_sequence_to_atomic_composition};
6use crate::chemistry::formula::calculate_mz;
7use crate::chemistry::utility::{find_unimod_patterns, reshape_prosit_array, unimod_sequence_to_tokens};
8use crate::ms::spectrum::MzSpectrum;
9use crate::proteomics::amino_acid::amino_acid_masses;
10use bincode::{Encode, Decode};
11
12// helper types for easier reading
13type Mass = f64;
14type Abundance = f64;
15type IsotopeDistribution = Vec<(Mass, Abundance)>;
16
17#[derive(Debug, Clone, Serialize, Deserialize)]
18pub struct PeptideIon {
19    pub sequence: PeptideSequence,
20    pub charge: i32,
21    pub intensity: f64,
22    pub ordinal: u32,
23}
24
25impl PeptideIon {
26    pub fn new(sequence: String, charge: i32, intensity: f64, ordinal: u32, peptide_id: Option<i32>) -> Self {
27        PeptideIon {
28            sequence: PeptideSequence::new(sequence, peptide_id),
29            charge,
30            intensity,
31            ordinal,
32        }
33    }
34    pub fn mz(&self) -> f64 {
35        calculate_mz(self.sequence.mono_isotopic_mass(), self.charge)
36    }
37
38    pub fn calculate_isotope_distribution(
39        &self,
40        mass_tolerance: f64,
41        abundance_threshold: f64,
42        max_result: i32,
43        intensity_min: f64,
44    ) -> IsotopeDistribution {
45
46        let atomic_composition: HashMap<String, i32> = self.sequence.atomic_composition().iter().map(|(k, v)| (k.to_string(), *v)).collect();
47
48        let distribution: IsotopeDistribution = crate::algorithm::isotope::generate_isotope_distribution(&atomic_composition, mass_tolerance, abundance_threshold, max_result)
49            .into_iter().filter(|&(_, abundance)| abundance > intensity_min).collect();
50
51        let mz_distribution = distribution.iter().map(|(mass, _)| calculate_mz(*mass, self.charge))
52            .zip(distribution.iter().map(|&(_, abundance)| abundance)).collect();
53
54        mz_distribution
55    }
56
57    pub fn calculate_isotopic_spectrum(
58        &self,
59        mass_tolerance: f64,
60        abundance_threshold: f64,
61        max_result: i32,
62        intensity_min: f64,
63    ) -> MzSpectrum {
64        let isotopic_distribution = self.calculate_isotope_distribution(mass_tolerance, abundance_threshold, max_result, intensity_min);
65        MzSpectrum::new(isotopic_distribution.iter().map(|(mz, _)| *mz).collect(), isotopic_distribution.iter().map(|(_, abundance)| *abundance).collect()) * self.intensity
66    }
67}
68
69#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
70pub enum FragmentType { A, B, C, X, Y, Z, }
71
72// implement to string for fragment type
73impl std::fmt::Display for FragmentType {
74    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
75        match self {
76            FragmentType::A => write!(f, "a"),
77            FragmentType::B => write!(f, "b"),
78            FragmentType::C => write!(f, "c"),
79            FragmentType::X => write!(f, "x"),
80            FragmentType::Y => write!(f, "y"),
81            FragmentType::Z => write!(f, "z"),
82        }
83    }
84}
85
86#[derive(Debug, Clone, Serialize, Deserialize)]
87pub struct PeptideProductIon {
88    pub kind: FragmentType,
89    pub ion: PeptideIon,
90}
91
92impl PeptideProductIon {
93    pub fn new(kind: FragmentType, sequence: String, charge: i32, intensity: f64, ordinal: u32, peptide_id: Option<i32>) -> Self {
94        PeptideProductIon {
95            kind,
96            ion: PeptideIon {
97                sequence: PeptideSequence::new(sequence, peptide_id),
98                charge,
99                intensity,
100                ordinal,
101            },
102        }
103    }
104
105    pub fn mono_isotopic_mass(&self) -> f64 {
106        calculate_peptide_product_ion_mono_isotopic_mass(self.ion.sequence.sequence.as_str(), self.kind)
107    }
108
109    pub fn atomic_composition(&self) -> HashMap<&str, i32> {
110
111        let mut composition = peptide_sequence_to_atomic_composition(&self.ion.sequence);
112
113        match self.kind {
114            FragmentType::A => {
115                *composition.entry("H").or_insert(0) -= 2;
116                *composition.entry("O").or_insert(0) -= 2;
117                *composition.entry("C").or_insert(0) -= 1;
118            },
119
120            FragmentType::B => {
121                // B: peptide_mass - Water
122                *composition.entry("H").or_insert(0) -= 2;
123                *composition.entry("O").or_insert(0) -= 1;
124            },
125
126            FragmentType::C => {
127                // C: peptide_mass + NH3 - Water
128                *composition.entry("H").or_insert(0) += 1;
129                *composition.entry("N").or_insert(0) += 1;
130                *composition.entry("O").or_insert(0) -= 1;
131            },
132
133            FragmentType::X => {
134                // X: peptide_mass + CO + 2*H - Water
135                *composition.entry("C").or_insert(0) += 1;
136                *composition.entry("O").or_insert(0) += 1;
137            },
138
139            FragmentType::Y => {
140                ()
141            },
142
143            FragmentType::Z => {
144                *composition.entry("H").or_insert(0) -= 1;
145                *composition.entry("N").or_insert(0) -= 3;
146            },
147        }
148        composition
149    }
150
151    pub fn mz(&self) -> f64 {
152        calculate_mz(self.mono_isotopic_mass(), self.ion.charge)
153    }
154
155    pub fn isotope_distribution(
156        &self,
157        mass_tolerance: f64,
158        abundance_threshold: f64,
159        max_result: i32,
160        intensity_min: f64,
161    ) -> IsotopeDistribution {
162
163        let atomic_composition: HashMap<String, i32> = self.atomic_composition().iter().map(|(k, v)| (k.to_string(), *v)).collect();
164
165        let distribution: IsotopeDistribution = crate::algorithm::isotope::generate_isotope_distribution(&atomic_composition, mass_tolerance, abundance_threshold, max_result)
166            .into_iter().filter(|&(_, abundance)| abundance > intensity_min).collect();
167
168        let mz_distribution = distribution.iter().map(|(mass, _)| calculate_mz(*mass, self.ion.charge)).zip(distribution.iter().map(|&(_, abundance)| abundance)).collect();
169
170        mz_distribution
171    }
172}
173
174#[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)]
175pub struct PeptideSequence {
176    pub sequence: String,
177    pub peptide_id: Option<i32>,
178}
179
180impl PeptideSequence {
181    pub fn new(raw_sequence: String, peptide_id: Option<i32>) -> Self {
182
183        // constructor will parse the sequence and check if it is valid
184        let pattern = Regex::new(r"\[UNIMOD:(\d+|\?)]").unwrap();
185
186        // remove the modifications from the sequence
187        let sequence = pattern.replace_all(&raw_sequence, "").to_string();
188
189        // check if all remaining characters are valid amino acids
190        let valid_amino_acids = sequence.chars().all(|c| amino_acid_masses().contains_key(&c.to_string()[..]));
191        if !valid_amino_acids {
192            panic!("Invalid amino acid sequence: {}, use only valid amino acids: ARNDCQEGHILKMFPSTWYVU, and modifications in the format [UNIMOD:ID]", raw_sequence);
193        }
194
195        PeptideSequence { sequence: raw_sequence, peptide_id }
196    }
197
198    pub fn mono_isotopic_mass(&self) -> f64 {
199        calculate_peptide_mono_isotopic_mass(self)
200    }
201
202    pub fn atomic_composition(&self) -> HashMap<&str, i32> {
203        peptide_sequence_to_atomic_composition(self)
204    }
205
206    pub fn to_tokens(&self, group_modifications: bool) -> Vec<String> {
207        unimod_sequence_to_tokens(&*self.sequence, group_modifications)
208    }
209
210    pub fn to_sage_representation(&self) -> (String, Vec<f64>) {
211        find_unimod_patterns(&*self.sequence)
212    }
213
214    pub fn amino_acid_count(&self) -> usize {
215        self.to_tokens(true).len()
216    }
217
218    pub fn calculate_mono_isotopic_product_ion_spectrum(&self, charge: i32, fragment_type: FragmentType) -> MzSpectrum {
219        let product_ions = self.calculate_product_ion_series(charge, fragment_type);
220        product_ions.generate_mono_isotopic_spectrum()
221    }
222
223    pub fn calculate_isotopic_product_ion_spectrum(&self, charge: i32, fragment_type: FragmentType, mass_tolerance: f64, abundance_threshold: f64, max_result: i32, intensity_min: f64) -> MzSpectrum {
224        let product_ions = self.calculate_product_ion_series(charge, fragment_type);
225        product_ions.generate_isotopic_spectrum(mass_tolerance, abundance_threshold, max_result, intensity_min)
226    }
227
228    pub fn calculate_product_ion_series(&self, target_charge: i32, fragment_type: FragmentType) -> PeptideProductIonSeries {
229        // TODO: check for n-terminal modifications
230        let tokens = unimod_sequence_to_tokens(self.sequence.as_str(), true);
231        let mut n_terminal_ions = Vec::new();
232        let mut c_terminal_ions = Vec::new();
233
234        // Generate n ions
235        for i in 1..tokens.len() {
236            let n_ion_seq = tokens[..i].join("");
237            n_terminal_ions.push(PeptideProductIon {
238                kind: match fragment_type {
239                    FragmentType::A => FragmentType::A,
240                    FragmentType::B => FragmentType::B,
241                    FragmentType::C => FragmentType::C,
242                    FragmentType::X => FragmentType::A,
243                    FragmentType::Y => FragmentType::B,
244                    FragmentType::Z => FragmentType::C,
245                },
246                ion: PeptideIon {
247                    sequence: PeptideSequence {
248                        sequence: n_ion_seq,
249                        peptide_id: self.peptide_id,
250                    },
251                    charge: target_charge,
252                    intensity: 1.0, // Placeholder intensity
253                    ordinal: i as u32,
254                },
255            });
256        }
257
258        // Generate c ions
259        for i in 1..tokens.len() {
260            let c_ion_seq = tokens[tokens.len() - i..].join("");
261            c_terminal_ions.push(PeptideProductIon {
262                kind: match fragment_type {
263                    FragmentType::A => FragmentType::X,
264                    FragmentType::B => FragmentType::Y,
265                    FragmentType::C => FragmentType::Z,
266                    FragmentType::X => FragmentType::X,
267                    FragmentType::Y => FragmentType::Y,
268                    FragmentType::Z => FragmentType::Z,
269                },
270                ion: PeptideIon {
271                    sequence: PeptideSequence {
272                        sequence: c_ion_seq,
273                        peptide_id: self.peptide_id,
274                    },
275                    charge: target_charge,
276                    intensity: 1.0, // Placeholder intensity
277                    ordinal: i as u32,
278                },
279            });
280        }
281
282        PeptideProductIonSeries::new(target_charge, n_terminal_ions, c_terminal_ions)
283    }
284
285    pub fn associate_with_predicted_intensities(
286        &self,
287        // TODO: check docs of prosit if charge is meant as precursor charge or max charge of fragments to generate
288        charge: i32,
289        fragment_type: FragmentType,
290        flat_intensities: Vec<f64>,
291        normalize: bool,
292        half_charge_one: bool,
293    ) -> PeptideProductIonSeriesCollection {
294
295        let reshaped_intensities = reshape_prosit_array(flat_intensities);
296        let max_charge = std::cmp::min(charge, 3).max(1); // Ensure at least 1 for loop range
297        let mut sum_intensity = if normalize { 0.0 } else { 1.0 };
298        let num_tokens = self.amino_acid_count() - 1; // Full sequence length is not counted as fragment, since nothing is cleaved off, therefore -1
299
300        let mut peptide_ion_collection = Vec::new();
301
302        if normalize {
303            for z in 1..=max_charge {
304
305                let intensity_c: Vec<f64> = reshaped_intensities[..num_tokens].iter().map(|x| x[0][z as usize - 1]).filter(|&x| x > 0.0).collect();
306                let intensity_n: Vec<f64> = reshaped_intensities[..num_tokens].iter().map(|x| x[1][z as usize - 1]).filter(|&x| x > 0.0).collect();
307
308                sum_intensity += intensity_n.iter().sum::<f64>() + intensity_c.iter().sum::<f64>();
309            }
310        }
311
312        for z in 1..=max_charge {
313
314            let mut product_ions = self.calculate_product_ion_series(z, fragment_type);
315            let intensity_n: Vec<f64> = reshaped_intensities[..num_tokens].iter().map(|x| x[1][z as usize - 1]).collect();
316            let intensity_c: Vec<f64> = reshaped_intensities[..num_tokens].iter().map(|x| x[0][z as usize - 1]).collect(); // Reverse for y
317
318            let adjusted_sum_intensity = if max_charge == 1 && half_charge_one { sum_intensity * 2.0 } else { sum_intensity };
319
320            for (i, ion) in product_ions.n_ions.iter_mut().enumerate() {
321                ion.ion.intensity = intensity_n[i] / adjusted_sum_intensity;
322            }
323            for (i, ion) in product_ions.c_ions.iter_mut().enumerate() {
324                ion.ion.intensity = intensity_c[i] / adjusted_sum_intensity;
325            }
326
327            peptide_ion_collection.push(PeptideProductIonSeries::new(z, product_ions.n_ions, product_ions.c_ions));
328        }
329
330        PeptideProductIonSeriesCollection::new(peptide_ion_collection)
331    }
332}
333
334#[derive(Debug, Clone, Serialize, Deserialize)]
335pub struct PeptideProductIonSeries {
336    pub charge: i32,
337    pub n_ions: Vec<PeptideProductIon>,
338    pub c_ions: Vec<PeptideProductIon>,
339}
340
341impl PeptideProductIonSeries {
342    pub fn new(charge: i32, n_ions: Vec<PeptideProductIon>, c_ions: Vec<PeptideProductIon>) -> Self {
343        PeptideProductIonSeries {
344            charge,
345            n_ions,
346            c_ions,
347        }
348    }
349    pub fn generate_mono_isotopic_spectrum(&self) -> MzSpectrum {
350        let mz_i_n = self.n_ions.iter().map(|ion| (ion.mz(), ion.ion.intensity)).collect_vec();
351        let mz_i_c = self.c_ions.iter().map(|ion| (ion.mz(), ion.ion.intensity)).collect_vec();
352        let n_spectrum = MzSpectrum::new(mz_i_n.iter().map(|(mz, _)| *mz).collect(), mz_i_n.iter().map(|(_, abundance)| *abundance).collect());
353        let c_spectrum = MzSpectrum::new(mz_i_c.iter().map(|(mz, _)| *mz).collect(), mz_i_c.iter().map(|(_, abundance)| *abundance).collect());
354        MzSpectrum::from_collection(vec![n_spectrum, c_spectrum]).filter_ranged(0.0, 5_000.0, 1e-6, 1e6)
355    }
356
357    pub fn generate_isotopic_spectrum(&self, mass_tolerance: f64, abundance_threshold: f64, max_result: i32, intensity_min: f64) -> MzSpectrum {
358        let mut spectra: Vec<MzSpectrum> = Vec::new();
359
360        for ion in &self.n_ions {
361            let n_isotopes = ion.isotope_distribution(mass_tolerance, abundance_threshold, max_result, intensity_min);
362            let spectrum = MzSpectrum::new(n_isotopes.iter().map(|(mz, _)| *mz).collect(), n_isotopes.iter().map(|(_, abundance)| *abundance * ion.ion.intensity).collect());
363            spectra.push(spectrum);
364        }
365
366        for ion in &self.c_ions {
367            let c_isotopes = ion.isotope_distribution(mass_tolerance, abundance_threshold, max_result, intensity_min);
368            let spectrum = MzSpectrum::new(c_isotopes.iter().map(|(mz, _)| *mz).collect(), c_isotopes.iter().map(|(_, abundance)| *abundance * ion.ion.intensity).collect());
369            spectra.push(spectrum);
370        }
371
372        MzSpectrum::from_collection(spectra).filter_ranged(0.0, 5_000.0, 1e-6, 1e6)
373    }
374}
375
376#[derive(Debug, Clone, Serialize, Deserialize)]
377pub struct PeptideProductIonSeriesCollection {
378    pub peptide_ions: Vec<PeptideProductIonSeries>,
379}
380impl PeptideProductIonSeriesCollection {
381    pub fn new(peptide_ions: Vec<PeptideProductIonSeries>) -> Self {
382        PeptideProductIonSeriesCollection {
383            peptide_ions,
384        }
385    }
386
387    pub fn find_ion_series(&self, charge: i32) -> Option<&PeptideProductIonSeries> {
388        self.peptide_ions.iter().find(|ion_series| ion_series.charge == charge)
389    }
390
391    pub fn generate_isotopic_spectrum(&self, mass_tolerance: f64, abundance_threshold: f64, max_result: i32, intensity_min: f64) -> MzSpectrum {
392        let mut spectra: Vec<MzSpectrum> = Vec::new();
393
394        for ion_series in &self.peptide_ions {
395            let isotopic_spectrum = ion_series.generate_isotopic_spectrum(mass_tolerance, abundance_threshold, max_result, intensity_min);
396            spectra.push(isotopic_spectrum);
397        }
398
399        MzSpectrum::from_collection(spectra).filter_ranged(0.0, 5_000.0, 1e-6, 1e6)
400    }
401}