mscore/algorithm/
peptide.rs

1use crate::chemistry::amino_acid::{amino_acid_composition, amino_acid_masses};
2use crate::chemistry::constants::{MASS_CO, MASS_NH3, MASS_PROTON, MASS_WATER};
3use crate::chemistry::formulas::calculate_mz;
4use crate::chemistry::unimod::{
5    modification_atomic_composition, unimod_modifications_mass_numerical,
6};
7use crate::chemistry::utility::{find_unimod_patterns, unimod_sequence_to_tokens};
8use crate::data::peptide::{FragmentType, PeptideProductIon, PeptideSequence};
9use rayon::prelude::*;
10use rayon::ThreadPoolBuilder;
11use regex::Regex;
12use statrs::distribution::{Binomial, Discrete};
13use std::collections::HashMap;
14
15/// calculate the monoisotopic mass of a peptide sequence
16///
17/// Arguments:
18///
19/// * `sequence` - peptide sequence
20///
21/// Returns:
22///
23/// * `mass` - monoisotopic mass of the peptide
24///
25/// # Examples
26///
27/// ```
28/// use mscore::algorithm::peptide::calculate_peptide_mono_isotopic_mass;
29/// use mscore::data::peptide::PeptideSequence;
30///
31/// let peptide_sequence = PeptideSequence::new("PEPTIDEH".to_string(), Some(1));
32/// let mass = calculate_peptide_mono_isotopic_mass(&peptide_sequence);
33/// let mass_quantized = (mass * 1e6).round() as i32;
34/// assert_eq!(mass_quantized, 936418877);
35/// ```
36pub fn calculate_peptide_mono_isotopic_mass(peptide_sequence: &PeptideSequence) -> f64 {
37    let amino_acid_masses = amino_acid_masses();
38    let modifications_mz_numerical = unimod_modifications_mass_numerical();
39    let pattern = Regex::new(r"\[UNIMOD:(\d+)]").unwrap();
40
41    let sequence = peptide_sequence.sequence.as_str();
42
43    // Find all occurrences of the pattern
44    let modifications: Vec<u32> = pattern
45        .find_iter(sequence)
46        .filter_map(|mat| mat.as_str()[8..mat.as_str().len() - 1].parse().ok())
47        .collect();
48
49    // Remove the modifications from the sequence
50    let sequence = pattern.replace_all(sequence, "");
51
52    // Count occurrences of each amino acid
53    let mut aa_counts = HashMap::new();
54    for char in sequence.chars() {
55        *aa_counts.entry(char).or_insert(0) += 1;
56    }
57
58    // Mass of amino acids and modifications
59    let mass_sequence: f64 = aa_counts
60        .iter()
61        .map(|(aa, &count)| {
62            amino_acid_masses.get(&aa.to_string()[..]).unwrap_or(&0.0) * count as f64
63        })
64        .sum();
65    let mass_modifications: f64 = modifications
66        .iter()
67        .map(|&mod_id| modifications_mz_numerical.get(&mod_id).unwrap_or(&0.0))
68        .sum();
69
70    mass_sequence + mass_modifications + MASS_WATER
71}
72
73/// calculate the monoisotopic mass of a peptide product ion for a given fragment type
74///
75/// Arguments:
76///
77/// * `sequence` - peptide sequence
78/// * `kind` - fragment type
79///
80/// Returns:
81///
82/// * `mass` - monoisotopic mass of the peptide
83///
84/// # Examples
85/// ```
86/// use mscore::algorithm::peptide::calculate_peptide_product_ion_mono_isotopic_mass;
87/// use mscore::data::peptide::FragmentType;
88/// let sequence = "PEPTIDEH";
89/// let mass = calculate_peptide_product_ion_mono_isotopic_mass(sequence, FragmentType::Y);
90/// assert_eq!(mass, 936.4188766862999);
91/// ```
92pub fn calculate_peptide_product_ion_mono_isotopic_mass(sequence: &str, kind: FragmentType) -> f64 {
93    let (sequence, modifications) = find_unimod_patterns(sequence);
94
95    // Return mz of empty sequence
96    if sequence.is_empty() {
97        return 0.0;
98    }
99
100    let amino_acid_masses = amino_acid_masses();
101
102    // Add up raw amino acid masses and potential modifications
103    let mass_sequence: f64 = sequence
104        .chars()
105        .map(|aa| amino_acid_masses.get(&aa.to_string()[..]).unwrap_or(&0.0))
106        .sum();
107
108    let mass_modifications: f64 = modifications.iter().sum();
109
110    // Calculate total mass
111    let mass = mass_sequence + mass_modifications + MASS_WATER;
112
113    let mass = match kind {
114        FragmentType::A => mass - MASS_CO - MASS_WATER,
115        FragmentType::B => mass - MASS_WATER,
116        FragmentType::C => mass + MASS_NH3 - MASS_WATER,
117        FragmentType::X => mass + MASS_CO - 2.0 * MASS_PROTON,
118        FragmentType::Y => mass,
119        FragmentType::Z => mass - MASS_NH3,
120    };
121
122    mass
123}
124
125/// calculate the monoisotopic m/z of a peptide product ion for a given fragment type and charge
126///
127/// Arguments:
128///
129/// * `sequence` - peptide sequence
130/// * `kind` - fragment type
131/// * `charge` - charge
132///
133/// Returns:
134///
135/// * `mz` - monoisotopic mass of the peptide
136///
137/// # Examples
138/// ```
139/// use mscore::algorithm::peptide::calculate_product_ion_mz;
140/// use mscore::chemistry::constants::MASS_PROTON;
141/// use mscore::data::peptide::FragmentType;
142/// let sequence = "PEPTIDEH";
143/// let mz = calculate_product_ion_mz(sequence, FragmentType::Y, Some(1));
144/// assert_eq!(mz, 936.4188766862999 + MASS_PROTON);
145/// ```
146pub fn calculate_product_ion_mz(sequence: &str, kind: FragmentType, charge: Option<i32>) -> f64 {
147    let mass = calculate_peptide_product_ion_mono_isotopic_mass(sequence, kind);
148    calculate_mz(mass, charge.unwrap_or(1))
149}
150
151/// get a count dictionary of the amino acid composition of a peptide sequence
152///
153/// Arguments:
154///
155/// * `sequence` - peptide sequence
156///
157/// Returns:
158///
159/// * `composition` - a dictionary of amino acid composition
160///
161/// # Examples
162///
163/// ```
164/// use mscore::algorithm::peptide::calculate_amino_acid_composition;
165///
166/// let sequence = "PEPTIDEH";
167/// let composition = calculate_amino_acid_composition(sequence);
168/// assert_eq!(composition.get("P"), Some(&2));
169/// assert_eq!(composition.get("E"), Some(&2));
170/// assert_eq!(composition.get("T"), Some(&1));
171/// assert_eq!(composition.get("I"), Some(&1));
172/// assert_eq!(composition.get("D"), Some(&1));
173/// assert_eq!(composition.get("H"), Some(&1));
174/// ```
175pub fn calculate_amino_acid_composition(sequence: &str) -> HashMap<String, i32> {
176    let mut composition = HashMap::new();
177    for char in sequence.chars() {
178        *composition.entry(char.to_string()).or_insert(0) += 1;
179    }
180    composition
181}
182
183/// calculate the atomic composition of a peptide sequence
184pub fn peptide_sequence_to_atomic_composition(
185    peptide_sequence: &PeptideSequence,
186) -> HashMap<&'static str, i32> {
187    let token_sequence = unimod_sequence_to_tokens(peptide_sequence.sequence.as_str(), false);
188    let mut collection: HashMap<&'static str, i32> = HashMap::new();
189
190    // Assuming amino_acid_composition and modification_composition return appropriate mappings...
191    let aa_compositions = amino_acid_composition();
192    let mod_compositions = modification_atomic_composition();
193
194    // No need for conversion to HashMap<String, ...> as long as you're directly accessing
195    // the HashMap provided by modification_composition() if it uses String keys.
196    for token in token_sequence {
197        if token.len() == 1 {
198            let char = token.chars().next().unwrap();
199            if let Some(composition) = aa_compositions.get(&char) {
200                for (key, value) in composition.iter() {
201                    *collection.entry(key).or_insert(0) += *value;
202                }
203            }
204        } else {
205            // Directly use &token without .as_str() conversion
206            if let Some(composition) = mod_compositions.get(&token) {
207                for (key, value) in composition.iter() {
208                    *collection.entry(key).or_insert(0) += *value;
209                }
210            }
211        }
212    }
213
214    // Add water
215    *collection.entry("H").or_insert(0) += 2; //
216    *collection.entry("O").or_insert(0) += 1; //
217
218    collection
219}
220
221/// calculate the atomic composition of a product ion
222///
223/// Arguments:
224///
225/// * `product_ion` - a PeptideProductIon instance
226///
227/// Returns:
228///
229/// * `Vec<(&str, i32)>` - a vector of tuples representing the atomic composition of the product ion
230pub fn atomic_product_ion_composition(product_ion: &PeptideProductIon) -> Vec<(&str, i32)> {
231    let mut composition = peptide_sequence_to_atomic_composition(&product_ion.ion.sequence);
232
233    match product_ion.kind {
234        FragmentType::A => {
235            // A: peptide_mass - CO - Water
236            *composition.entry("H").or_insert(0) -= 2;
237            *composition.entry("O").or_insert(0) -= 2;
238            *composition.entry("C").or_insert(0) -= 1;
239        }
240        FragmentType::B => {
241            // B: peptide_mass - Water
242            *composition.entry("H").or_insert(0) -= 2;
243            *composition.entry("O").or_insert(0) -= 1;
244        }
245        FragmentType::C => {
246            // C: peptide_mass + NH3 - Water
247            *composition.entry("H").or_insert(0) += 1;
248            *composition.entry("N").or_insert(0) += 1;
249            *composition.entry("O").or_insert(0) -= 1;
250        }
251        FragmentType::X => {
252            // X: peptide_mass + CO
253            *composition.entry("C").or_insert(0) += 1; // Add 1 for CO
254            *composition.entry("O").or_insert(0) += 1; // Add 1 for CO
255            *composition.entry("H").or_insert(0) -= 2; // Subtract 2 for 2 protons
256        }
257        FragmentType::Y => (),
258        FragmentType::Z => {
259            *composition.entry("H").or_insert(0) -= 3;
260            *composition.entry("N").or_insert(0) -= 1;
261        }
262    }
263
264    composition.iter().map(|(k, v)| (*k, *v)).collect()
265}
266
267/// Calculate the atomic composition of the complementary fragment.
268///
269/// When a peptide is fragmented, the resulting fragment ion has a complementary
270/// portion (the rest of the precursor). This function calculates the atomic
271/// composition of that complementary fragment, which is needed for quad-selection
272/// dependent isotope distribution calculations.
273///
274/// # Arguments
275///
276/// * `precursor_composition` - atomic composition of the full precursor
277/// * `fragment_composition` - atomic composition of the fragment ion (from atomic_product_ion_composition)
278///
279/// # Returns
280///
281/// * `HashMap<String, i32>` - atomic composition of the complementary fragment
282///
283/// # Examples
284///
285/// ```
286/// use mscore::algorithm::peptide::{peptide_sequence_to_atomic_composition, atomic_product_ion_composition, calculate_complementary_fragment_composition};
287/// use mscore::data::peptide::{PeptideSequence, PeptideProductIon, PeptideIon, FragmentType};
288///
289/// let precursor = PeptideSequence::new("PEPTIDE".to_string(), Some(1));
290/// let precursor_comp = peptide_sequence_to_atomic_composition(&precursor);
291///
292/// // Create a b3 ion (PEP)
293/// let b3_ion = PeptideProductIon {
294///     kind: FragmentType::B,
295///     ion: PeptideIon {
296///         sequence: PeptideSequence::new("PEP".to_string(), Some(1)),
297///         charge: 1,
298///         intensity: 1.0,
299///     },
300/// };
301/// let fragment_comp: Vec<(&str, i32)> = atomic_product_ion_composition(&b3_ion);
302/// let fragment_map: std::collections::HashMap<&str, i32> = fragment_comp.into_iter().collect();
303///
304/// let complementary = calculate_complementary_fragment_composition(&precursor_comp, &fragment_map);
305/// // Complementary should be TIDE as y-ion (precursor - b-ion)
306/// ```
307pub fn calculate_complementary_fragment_composition(
308    precursor_composition: &HashMap<&str, i32>,
309    fragment_composition: &HashMap<&str, i32>,
310) -> HashMap<String, i32> {
311    let mut complementary: HashMap<String, i32> = HashMap::new();
312
313    // Start with precursor - fragment
314    for (element, &prec_count) in precursor_composition.iter() {
315        let frag_count = fragment_composition.get(element).copied().unwrap_or(0);
316        let diff = prec_count - frag_count;
317        if diff != 0 {
318            complementary.insert(element.to_string(), diff);
319        }
320    }
321
322    // Check for any elements in fragment that might not be in precursor (edge case)
323    for (element, &frag_count) in fragment_composition.iter() {
324        if !precursor_composition.contains_key(element) {
325            complementary.insert(element.to_string(), -frag_count);
326        }
327    }
328
329    complementary
330}
331
332/// calculate the atomic composition of a peptide product ion series
333/// Arguments:
334///
335/// * `product_ions` - a vector of PeptideProductIon instances
336/// * `num_threads` - an usize representing the number of threads to use
337/// Returns:
338///
339/// * `Vec<Vec<(String, i32)>>` - a vector of vectors of tuples representing the atomic composition of each product ion
340///
341pub fn fragments_to_composition(
342    product_ions: Vec<PeptideProductIon>,
343    num_threads: usize,
344) -> Vec<Vec<(String, i32)>> {
345    let thread_pool = ThreadPoolBuilder::new()
346        .num_threads(num_threads)
347        .build()
348        .unwrap();
349    let result = thread_pool.install(|| {
350        product_ions
351            .par_iter()
352            .map(|ion| atomic_product_ion_composition(ion))
353            .map(|composition| {
354                composition
355                    .iter()
356                    .map(|(k, v)| (k.to_string(), *v))
357                    .collect()
358            })
359            .collect()
360    });
361    result
362}
363
364/// count the number of protonizable sites in a peptide sequence
365///
366/// # Arguments
367///
368/// * `sequence` - a string representing the peptide sequence
369///
370/// # Returns
371///
372/// * `usize` - the number of protonizable sites in the peptide sequence
373///
374/// # Example
375///
376/// ```
377/// use mscore::algorithm::peptide::get_num_protonizable_sites;
378///
379/// let sequence = "PEPTIDEH";
380/// let num_protonizable_sites = get_num_protonizable_sites(sequence);
381/// assert_eq!(num_protonizable_sites, 2);
382/// ```
383pub fn get_num_protonizable_sites(sequence: &str) -> usize {
384    let mut sites = 1; // n-terminus
385    for s in sequence.chars() {
386        match s {
387            'H' | 'R' | 'K' => sites += 1,
388            _ => {}
389        }
390    }
391    sites
392}
393
394/// simulate the charge state distribution for a peptide sequence
395///
396/// # Arguments
397///
398/// * `sequence` - a string representing the peptide sequence
399/// * `max_charge` - an optional usize representing the maximum charge state to simulate
400/// * `charged_probability` - an optional f64 representing the probability of a site being charged
401///
402/// # Returns
403///
404/// * `Vec<f64>` - a vector of f64 representing the probability of each charge state
405///
406/// # Example
407///
408/// ```
409/// use mscore::algorithm::peptide::simulate_charge_state_for_sequence;
410///
411/// let sequence = "PEPTIDEH";
412/// let charge_state_probs = simulate_charge_state_for_sequence(sequence, None, None);
413/// assert_eq!(charge_state_probs, vec![0.03999999999999999, 0.32, 0.64, 0.0, 0.0]);
414pub fn simulate_charge_state_for_sequence(
415    sequence: &str,
416    max_charge: Option<usize>,
417    charged_probability: Option<f64>,
418) -> Vec<f64> {
419    let charged_prob = charged_probability.unwrap_or(0.8);
420    let max_charge = max_charge.unwrap_or(4)+1;
421    let num_protonizable_sites = get_num_protonizable_sites(sequence);
422    let mut charge_state_probs = vec![0.0; max_charge];
423    let binom = Binomial::new(charged_prob, num_protonizable_sites as u64).unwrap();
424    
425    for charge in 0..max_charge {
426        charge_state_probs[charge] = binom.pmf(charge as u64);
427    }
428    charge_state_probs
429}
430
431/// simulate the charge state distribution for a list of peptide sequences
432///
433/// # Arguments
434///
435/// * `sequences` - a vector of strings representing the peptide sequences
436/// * `num_threads` - an usize representing the number of threads to use
437/// * `max_charge` - an optional usize representing the maximum charge state to simulate
438/// * `charged_probability` - an optional f64 representing the probability of a site being charged
439///
440/// # Returns
441///
442/// * `Vec<Vec<f64>>` - a vector of vectors of f64 representing the probability of each charge state for each sequence
443///
444/// # Example
445///
446/// ```
447/// use mscore::algorithm::peptide::simulate_charge_states_for_sequences;
448///
449/// let sequences = vec!["PEPTIDEH", "PEPTIDEH", "PEPTIDEH"];
450/// let charge_state_probs = simulate_charge_states_for_sequences(sequences, 4, None, None);
451/// assert_eq!(charge_state_probs, vec![vec![0.03999999999999999, 0.32, 0.64, 0.0, 0.0], vec![0.03999999999999999, 0.32, 0.64, 0.0, 0.0], vec![0.03999999999999999, 0.32, 0.64, 0.0, 0.0]]);
452/// ```
453pub fn simulate_charge_states_for_sequences(
454    sequences: Vec<&str>,
455    num_threads: usize,
456    max_charge: Option<usize>,
457    charged_probability: Option<f64>,
458) -> Vec<Vec<f64>> {
459    let pool = ThreadPoolBuilder::new()
460        .num_threads(num_threads)
461        .build()
462        .unwrap();
463    pool.install(|| {
464        sequences
465            .par_iter()
466            .map(|sequence| {
467                simulate_charge_state_for_sequence(sequence, max_charge, charged_probability)
468            })
469            .collect()
470    })
471}