mscore/algorithm/
peptide.rs

1use crate::chemistry::amino_acid::{amino_acid_composition, amino_acid_masses};
2use crate::chemistry::constants::{MASS_CO, MASS_NH3, MASS_PROTON, MASS_WATER};
3use crate::chemistry::formulas::calculate_mz;
4use crate::chemistry::unimod::{
5    modification_atomic_composition, unimod_modifications_mass_numerical,
6};
7use crate::chemistry::utility::{find_unimod_patterns, unimod_sequence_to_tokens};
8use crate::data::peptide::{FragmentType, PeptideProductIon, PeptideSequence};
9use rayon::prelude::*;
10use rayon::ThreadPoolBuilder;
11use regex::Regex;
12use statrs::distribution::{Binomial, Discrete};
13use std::collections::HashMap;
14
15/// calculate the monoisotopic mass of a peptide sequence
16///
17/// Arguments:
18///
19/// * `sequence` - peptide sequence
20///
21/// Returns:
22///
23/// * `mass` - monoisotopic mass of the peptide
24///
25/// # Examples
26///
27/// ```
28/// use mscore::algorithm::peptide::calculate_peptide_mono_isotopic_mass;
29/// use mscore::data::peptide::PeptideSequence;
30///
31/// let peptide_sequence = PeptideSequence::new("PEPTIDEH".to_string(), Some(1));
32/// let mass = calculate_peptide_mono_isotopic_mass(&peptide_sequence);
33/// let mass_quantized = (mass * 1e6).round() as i32;
34/// assert_eq!(mass_quantized, 936418877);
35/// ```
36pub fn calculate_peptide_mono_isotopic_mass(peptide_sequence: &PeptideSequence) -> f64 {
37    let amino_acid_masses = amino_acid_masses();
38    let modifications_mz_numerical = unimod_modifications_mass_numerical();
39    let pattern = Regex::new(r"\[UNIMOD:(\d+)]").unwrap();
40
41    let sequence = peptide_sequence.sequence.as_str();
42
43    // Find all occurrences of the pattern
44    let modifications: Vec<u32> = pattern
45        .find_iter(sequence)
46        .filter_map(|mat| mat.as_str()[8..mat.as_str().len() - 1].parse().ok())
47        .collect();
48
49    // Remove the modifications from the sequence
50    let sequence = pattern.replace_all(sequence, "");
51
52    // Count occurrences of each amino acid
53    let mut aa_counts = HashMap::new();
54    for char in sequence.chars() {
55        *aa_counts.entry(char).or_insert(0) += 1;
56    }
57
58    // Mass of amino acids and modifications
59    let mass_sequence: f64 = aa_counts
60        .iter()
61        .map(|(aa, &count)| {
62            amino_acid_masses.get(&aa.to_string()[..]).unwrap_or(&0.0) * count as f64
63        })
64        .sum();
65    let mass_modifications: f64 = modifications
66        .iter()
67        .map(|&mod_id| modifications_mz_numerical.get(&mod_id).unwrap_or(&0.0))
68        .sum();
69
70    mass_sequence + mass_modifications + MASS_WATER
71}
72
73/// calculate the monoisotopic mass of a peptide product ion for a given fragment type
74///
75/// Arguments:
76///
77/// * `sequence` - peptide sequence
78/// * `kind` - fragment type
79///
80/// Returns:
81///
82/// * `mass` - monoisotopic mass of the peptide
83///
84/// # Examples
85/// ```
86/// use mscore::algorithm::peptide::calculate_peptide_product_ion_mono_isotopic_mass;
87/// use mscore::data::peptide::FragmentType;
88/// let sequence = "PEPTIDEH";
89/// let mass = calculate_peptide_product_ion_mono_isotopic_mass(sequence, FragmentType::Y);
90/// assert_eq!(mass, 936.4188766862999);
91/// ```
92pub fn calculate_peptide_product_ion_mono_isotopic_mass(sequence: &str, kind: FragmentType) -> f64 {
93    let (sequence, modifications) = find_unimod_patterns(sequence);
94
95    // Return mz of empty sequence
96    if sequence.is_empty() {
97        return 0.0;
98    }
99
100    let amino_acid_masses = amino_acid_masses();
101
102    // Add up raw amino acid masses and potential modifications
103    let mass_sequence: f64 = sequence
104        .chars()
105        .map(|aa| amino_acid_masses.get(&aa.to_string()[..]).unwrap_or(&0.0))
106        .sum();
107
108    let mass_modifications: f64 = modifications.iter().sum();
109
110    // Calculate total mass
111    let mass = mass_sequence + mass_modifications + MASS_WATER;
112
113    let mass = match kind {
114        FragmentType::A => mass - MASS_CO - MASS_WATER,
115        FragmentType::B => mass - MASS_WATER,
116        FragmentType::C => mass + MASS_NH3 - MASS_WATER,
117        FragmentType::X => mass + MASS_CO - 2.0 * MASS_PROTON,
118        FragmentType::Y => mass,
119        FragmentType::Z => mass - MASS_NH3,
120    };
121
122    mass
123}
124
125/// calculate the monoisotopic m/z of a peptide product ion for a given fragment type and charge
126///
127/// Arguments:
128///
129/// * `sequence` - peptide sequence
130/// * `kind` - fragment type
131/// * `charge` - charge
132///
133/// Returns:
134///
135/// * `mz` - monoisotopic mass of the peptide
136///
137/// # Examples
138/// ```
139/// use mscore::algorithm::peptide::calculate_product_ion_mz;
140/// use mscore::chemistry::constants::MASS_PROTON;
141/// use mscore::data::peptide::FragmentType;
142/// let sequence = "PEPTIDEH";
143/// let mz = calculate_product_ion_mz(sequence, FragmentType::Y, Some(1));
144/// assert_eq!(mz, 936.4188766862999 + MASS_PROTON);
145/// ```
146pub fn calculate_product_ion_mz(sequence: &str, kind: FragmentType, charge: Option<i32>) -> f64 {
147    let mass = calculate_peptide_product_ion_mono_isotopic_mass(sequence, kind);
148    calculate_mz(mass, charge.unwrap_or(1))
149}
150
151/// get a count dictionary of the amino acid composition of a peptide sequence
152///
153/// Arguments:
154///
155/// * `sequence` - peptide sequence
156///
157/// Returns:
158///
159/// * `composition` - a dictionary of amino acid composition
160///
161/// # Examples
162///
163/// ```
164/// use mscore::algorithm::peptide::calculate_amino_acid_composition;
165///
166/// let sequence = "PEPTIDEH";
167/// let composition = calculate_amino_acid_composition(sequence);
168/// assert_eq!(composition.get("P"), Some(&2));
169/// assert_eq!(composition.get("E"), Some(&2));
170/// assert_eq!(composition.get("T"), Some(&1));
171/// assert_eq!(composition.get("I"), Some(&1));
172/// assert_eq!(composition.get("D"), Some(&1));
173/// assert_eq!(composition.get("H"), Some(&1));
174/// ```
175pub fn calculate_amino_acid_composition(sequence: &str) -> HashMap<String, i32> {
176    let mut composition = HashMap::new();
177    for char in sequence.chars() {
178        *composition.entry(char.to_string()).or_insert(0) += 1;
179    }
180    composition
181}
182
183/// calculate the atomic composition of a peptide sequence
184pub fn peptide_sequence_to_atomic_composition(
185    peptide_sequence: &PeptideSequence,
186) -> HashMap<&'static str, i32> {
187    let token_sequence = unimod_sequence_to_tokens(peptide_sequence.sequence.as_str(), false);
188    let mut collection: HashMap<&'static str, i32> = HashMap::new();
189
190    // Assuming amino_acid_composition and modification_composition return appropriate mappings...
191    let aa_compositions = amino_acid_composition();
192    let mod_compositions = modification_atomic_composition();
193
194    // No need for conversion to HashMap<String, ...> as long as you're directly accessing
195    // the HashMap provided by modification_composition() if it uses String keys.
196    for token in token_sequence {
197        if token.len() == 1 {
198            let char = token.chars().next().unwrap();
199            if let Some(composition) = aa_compositions.get(&char) {
200                for (key, value) in composition.iter() {
201                    *collection.entry(key).or_insert(0) += *value;
202                }
203            }
204        } else {
205            // Directly use &token without .as_str() conversion
206            if let Some(composition) = mod_compositions.get(&token) {
207                for (key, value) in composition.iter() {
208                    *collection.entry(key).or_insert(0) += *value;
209                }
210            }
211        }
212    }
213
214    // Add water
215    *collection.entry("H").or_insert(0) += 2; //
216    *collection.entry("O").or_insert(0) += 1; //
217
218    collection
219}
220
221/// calculate the atomic composition of a product ion
222///
223/// Arguments:
224///
225/// * `product_ion` - a PeptideProductIon instance
226///
227/// Returns:
228///
229/// * `Vec<(&str, i32)>` - a vector of tuples representing the atomic composition of the product ion
230pub fn atomic_product_ion_composition(product_ion: &PeptideProductIon) -> Vec<(&str, i32)> {
231    let mut composition = peptide_sequence_to_atomic_composition(&product_ion.ion.sequence);
232
233    match product_ion.kind {
234        FragmentType::A => {
235            // A: peptide_mass - CO - Water
236            *composition.entry("H").or_insert(0) -= 2;
237            *composition.entry("O").or_insert(0) -= 2;
238            *composition.entry("C").or_insert(0) -= 1;
239        }
240        FragmentType::B => {
241            // B: peptide_mass - Water
242            *composition.entry("H").or_insert(0) -= 2;
243            *composition.entry("O").or_insert(0) -= 1;
244        }
245        FragmentType::C => {
246            // C: peptide_mass + NH3 - Water
247            *composition.entry("H").or_insert(0) += 1;
248            *composition.entry("N").or_insert(0) += 1;
249            *composition.entry("O").or_insert(0) -= 1;
250        }
251        FragmentType::X => {
252            // X: peptide_mass + CO
253            *composition.entry("C").or_insert(0) += 1; // Add 1 for CO
254            *composition.entry("O").or_insert(0) += 1; // Add 1 for CO
255            *composition.entry("H").or_insert(0) -= 2; // Subtract 2 for 2 protons
256        }
257        FragmentType::Y => (),
258        FragmentType::Z => {
259            *composition.entry("H").or_insert(0) -= 3;
260            *composition.entry("N").or_insert(0) -= 1;
261        }
262    }
263
264    composition.iter().map(|(k, v)| (*k, *v)).collect()
265}
266
267/// calculate the atomic composition of a peptide product ion series
268/// Arguments:
269///
270/// * `product_ions` - a vector of PeptideProductIon instances
271/// * `num_threads` - an usize representing the number of threads to use
272/// Returns:
273///
274/// * `Vec<Vec<(String, i32)>>` - a vector of vectors of tuples representing the atomic composition of each product ion
275///
276pub fn fragments_to_composition(
277    product_ions: Vec<PeptideProductIon>,
278    num_threads: usize,
279) -> Vec<Vec<(String, i32)>> {
280    let thread_pool = ThreadPoolBuilder::new()
281        .num_threads(num_threads)
282        .build()
283        .unwrap();
284    let result = thread_pool.install(|| {
285        product_ions
286            .par_iter()
287            .map(|ion| atomic_product_ion_composition(ion))
288            .map(|composition| {
289                composition
290                    .iter()
291                    .map(|(k, v)| (k.to_string(), *v))
292                    .collect()
293            })
294            .collect()
295    });
296    result
297}
298
299/// count the number of protonizable sites in a peptide sequence
300///
301/// # Arguments
302///
303/// * `sequence` - a string representing the peptide sequence
304///
305/// # Returns
306///
307/// * `usize` - the number of protonizable sites in the peptide sequence
308///
309/// # Example
310///
311/// ```
312/// use mscore::algorithm::peptide::get_num_protonizable_sites;
313///
314/// let sequence = "PEPTIDEH";
315/// let num_protonizable_sites = get_num_protonizable_sites(sequence);
316/// assert_eq!(num_protonizable_sites, 2);
317/// ```
318pub fn get_num_protonizable_sites(sequence: &str) -> usize {
319    let mut sites = 1; // n-terminus
320    for s in sequence.chars() {
321        match s {
322            'H' | 'R' | 'K' => sites += 1,
323            _ => {}
324        }
325    }
326    sites
327}
328
329/// simulate the charge state distribution for a peptide sequence
330///
331/// # Arguments
332///
333/// * `sequence` - a string representing the peptide sequence
334/// * `max_charge` - an optional usize representing the maximum charge state to simulate
335/// * `charged_probability` - an optional f64 representing the probability of a site being charged
336///
337/// # Returns
338///
339/// * `Vec<f64>` - a vector of f64 representing the probability of each charge state
340///
341/// # Example
342///
343/// ```
344/// use mscore::algorithm::peptide::simulate_charge_state_for_sequence;
345///
346/// let sequence = "PEPTIDEH";
347/// let charge_state_probs = simulate_charge_state_for_sequence(sequence, None, None);
348/// assert_eq!(charge_state_probs, vec![0.03999999999999999, 0.32, 0.64, 0.0, 0.0]);
349pub fn simulate_charge_state_for_sequence(
350    sequence: &str,
351    max_charge: Option<usize>,
352    charged_probability: Option<f64>,
353) -> Vec<f64> {
354    let charged_prob = charged_probability.unwrap_or(0.8);
355    let max_charge = max_charge.unwrap_or(4)+1;
356    let num_protonizable_sites = get_num_protonizable_sites(sequence);
357    let mut charge_state_probs = vec![0.0; max_charge];
358    let binom = Binomial::new(charged_prob, num_protonizable_sites as u64).unwrap();
359    
360    for charge in 0..max_charge {
361        charge_state_probs[charge] = binom.pmf(charge as u64);
362    }
363    charge_state_probs
364}
365
366/// simulate the charge state distribution for a list of peptide sequences
367///
368/// # Arguments
369///
370/// * `sequences` - a vector of strings representing the peptide sequences
371/// * `num_threads` - an usize representing the number of threads to use
372/// * `max_charge` - an optional usize representing the maximum charge state to simulate
373/// * `charged_probability` - an optional f64 representing the probability of a site being charged
374///
375/// # Returns
376///
377/// * `Vec<Vec<f64>>` - a vector of vectors of f64 representing the probability of each charge state for each sequence
378///
379/// # Example
380///
381/// ```
382/// use mscore::algorithm::peptide::simulate_charge_states_for_sequences;
383///
384/// let sequences = vec!["PEPTIDEH", "PEPTIDEH", "PEPTIDEH"];
385/// let charge_state_probs = simulate_charge_states_for_sequences(sequences, 4, None, None);
386/// assert_eq!(charge_state_probs, vec![vec![0.03999999999999999, 0.32, 0.64, 0.0, 0.0], vec![0.03999999999999999, 0.32, 0.64, 0.0, 0.0], vec![0.03999999999999999, 0.32, 0.64, 0.0, 0.0]]);
387/// ```
388pub fn simulate_charge_states_for_sequences(
389    sequences: Vec<&str>,
390    num_threads: usize,
391    max_charge: Option<usize>,
392    charged_probability: Option<f64>,
393) -> Vec<Vec<f64>> {
394    let pool = ThreadPoolBuilder::new()
395        .num_threads(num_threads)
396        .build()
397        .unwrap();
398    pool.install(|| {
399        sequences
400            .par_iter()
401            .map(|sequence| {
402                simulate_charge_state_for_sequence(sequence, max_charge, charged_probability)
403            })
404            .collect()
405    })
406}