mscore/algorithm/peptide.rs
1use crate::chemistry::amino_acid::{amino_acid_composition, amino_acid_masses};
2use crate::chemistry::constants::{MASS_CO, MASS_NH3, MASS_PROTON, MASS_WATER};
3use crate::chemistry::formulas::calculate_mz;
4use crate::chemistry::unimod::{
5 modification_atomic_composition, unimod_modifications_mass_numerical,
6};
7use crate::chemistry::utility::{find_unimod_patterns, unimod_sequence_to_tokens};
8use crate::data::peptide::{FragmentType, PeptideProductIon, PeptideSequence};
9use rayon::prelude::*;
10use rayon::ThreadPoolBuilder;
11use regex::Regex;
12use statrs::distribution::{Binomial, Discrete};
13use std::collections::HashMap;
14
15/// calculate the monoisotopic mass of a peptide sequence
16///
17/// Arguments:
18///
19/// * `sequence` - peptide sequence
20///
21/// Returns:
22///
23/// * `mass` - monoisotopic mass of the peptide
24///
25/// # Examples
26///
27/// ```
28/// use mscore::algorithm::peptide::calculate_peptide_mono_isotopic_mass;
29/// use mscore::data::peptide::PeptideSequence;
30///
31/// let peptide_sequence = PeptideSequence::new("PEPTIDEH".to_string(), Some(1));
32/// let mass = calculate_peptide_mono_isotopic_mass(&peptide_sequence);
33/// let mass_quantized = (mass * 1e6).round() as i32;
34/// assert_eq!(mass_quantized, 936418877);
35/// ```
36pub fn calculate_peptide_mono_isotopic_mass(peptide_sequence: &PeptideSequence) -> f64 {
37 let amino_acid_masses = amino_acid_masses();
38 let modifications_mz_numerical = unimod_modifications_mass_numerical();
39 let pattern = Regex::new(r"\[UNIMOD:(\d+)]").unwrap();
40
41 let sequence = peptide_sequence.sequence.as_str();
42
43 // Find all occurrences of the pattern
44 let modifications: Vec<u32> = pattern
45 .find_iter(sequence)
46 .filter_map(|mat| mat.as_str()[8..mat.as_str().len() - 1].parse().ok())
47 .collect();
48
49 // Remove the modifications from the sequence
50 let sequence = pattern.replace_all(sequence, "");
51
52 // Count occurrences of each amino acid
53 let mut aa_counts = HashMap::new();
54 for char in sequence.chars() {
55 *aa_counts.entry(char).or_insert(0) += 1;
56 }
57
58 // Mass of amino acids and modifications
59 let mass_sequence: f64 = aa_counts
60 .iter()
61 .map(|(aa, &count)| {
62 amino_acid_masses.get(&aa.to_string()[..]).unwrap_or(&0.0) * count as f64
63 })
64 .sum();
65 let mass_modifications: f64 = modifications
66 .iter()
67 .map(|&mod_id| modifications_mz_numerical.get(&mod_id).unwrap_or(&0.0))
68 .sum();
69
70 mass_sequence + mass_modifications + MASS_WATER
71}
72
73/// calculate the monoisotopic mass of a peptide product ion for a given fragment type
74///
75/// Arguments:
76///
77/// * `sequence` - peptide sequence
78/// * `kind` - fragment type
79///
80/// Returns:
81///
82/// * `mass` - monoisotopic mass of the peptide
83///
84/// # Examples
85/// ```
86/// use mscore::algorithm::peptide::calculate_peptide_product_ion_mono_isotopic_mass;
87/// use mscore::data::peptide::FragmentType;
88/// let sequence = "PEPTIDEH";
89/// let mass = calculate_peptide_product_ion_mono_isotopic_mass(sequence, FragmentType::Y);
90/// assert_eq!(mass, 936.4188766862999);
91/// ```
92pub fn calculate_peptide_product_ion_mono_isotopic_mass(sequence: &str, kind: FragmentType) -> f64 {
93 let (sequence, modifications) = find_unimod_patterns(sequence);
94
95 // Return mz of empty sequence
96 if sequence.is_empty() {
97 return 0.0;
98 }
99
100 let amino_acid_masses = amino_acid_masses();
101
102 // Add up raw amino acid masses and potential modifications
103 let mass_sequence: f64 = sequence
104 .chars()
105 .map(|aa| amino_acid_masses.get(&aa.to_string()[..]).unwrap_or(&0.0))
106 .sum();
107
108 let mass_modifications: f64 = modifications.iter().sum();
109
110 // Calculate total mass
111 let mass = mass_sequence + mass_modifications + MASS_WATER;
112
113 let mass = match kind {
114 FragmentType::A => mass - MASS_CO - MASS_WATER,
115 FragmentType::B => mass - MASS_WATER,
116 FragmentType::C => mass + MASS_NH3 - MASS_WATER,
117 FragmentType::X => mass + MASS_CO - 2.0 * MASS_PROTON,
118 FragmentType::Y => mass,
119 FragmentType::Z => mass - MASS_NH3,
120 };
121
122 mass
123}
124
125/// calculate the monoisotopic m/z of a peptide product ion for a given fragment type and charge
126///
127/// Arguments:
128///
129/// * `sequence` - peptide sequence
130/// * `kind` - fragment type
131/// * `charge` - charge
132///
133/// Returns:
134///
135/// * `mz` - monoisotopic mass of the peptide
136///
137/// # Examples
138/// ```
139/// use mscore::algorithm::peptide::calculate_product_ion_mz;
140/// use mscore::chemistry::constants::MASS_PROTON;
141/// use mscore::data::peptide::FragmentType;
142/// let sequence = "PEPTIDEH";
143/// let mz = calculate_product_ion_mz(sequence, FragmentType::Y, Some(1));
144/// assert_eq!(mz, 936.4188766862999 + MASS_PROTON);
145/// ```
146pub fn calculate_product_ion_mz(sequence: &str, kind: FragmentType, charge: Option<i32>) -> f64 {
147 let mass = calculate_peptide_product_ion_mono_isotopic_mass(sequence, kind);
148 calculate_mz(mass, charge.unwrap_or(1))
149}
150
151/// get a count dictionary of the amino acid composition of a peptide sequence
152///
153/// Arguments:
154///
155/// * `sequence` - peptide sequence
156///
157/// Returns:
158///
159/// * `composition` - a dictionary of amino acid composition
160///
161/// # Examples
162///
163/// ```
164/// use mscore::algorithm::peptide::calculate_amino_acid_composition;
165///
166/// let sequence = "PEPTIDEH";
167/// let composition = calculate_amino_acid_composition(sequence);
168/// assert_eq!(composition.get("P"), Some(&2));
169/// assert_eq!(composition.get("E"), Some(&2));
170/// assert_eq!(composition.get("T"), Some(&1));
171/// assert_eq!(composition.get("I"), Some(&1));
172/// assert_eq!(composition.get("D"), Some(&1));
173/// assert_eq!(composition.get("H"), Some(&1));
174/// ```
175pub fn calculate_amino_acid_composition(sequence: &str) -> HashMap<String, i32> {
176 let mut composition = HashMap::new();
177 for char in sequence.chars() {
178 *composition.entry(char.to_string()).or_insert(0) += 1;
179 }
180 composition
181}
182
183/// calculate the atomic composition of a peptide sequence
184pub fn peptide_sequence_to_atomic_composition(
185 peptide_sequence: &PeptideSequence,
186) -> HashMap<&'static str, i32> {
187 let token_sequence = unimod_sequence_to_tokens(peptide_sequence.sequence.as_str(), false);
188 let mut collection: HashMap<&'static str, i32> = HashMap::new();
189
190 // Assuming amino_acid_composition and modification_composition return appropriate mappings...
191 let aa_compositions = amino_acid_composition();
192 let mod_compositions = modification_atomic_composition();
193
194 // No need for conversion to HashMap<String, ...> as long as you're directly accessing
195 // the HashMap provided by modification_composition() if it uses String keys.
196 for token in token_sequence {
197 if token.len() == 1 {
198 let char = token.chars().next().unwrap();
199 if let Some(composition) = aa_compositions.get(&char) {
200 for (key, value) in composition.iter() {
201 *collection.entry(key).or_insert(0) += *value;
202 }
203 }
204 } else {
205 // Directly use &token without .as_str() conversion
206 if let Some(composition) = mod_compositions.get(&token) {
207 for (key, value) in composition.iter() {
208 *collection.entry(key).or_insert(0) += *value;
209 }
210 }
211 }
212 }
213
214 // Add water
215 *collection.entry("H").or_insert(0) += 2; //
216 *collection.entry("O").or_insert(0) += 1; //
217
218 collection
219}
220
221/// calculate the atomic composition of a product ion
222///
223/// Arguments:
224///
225/// * `product_ion` - a PeptideProductIon instance
226///
227/// Returns:
228///
229/// * `Vec<(&str, i32)>` - a vector of tuples representing the atomic composition of the product ion
230pub fn atomic_product_ion_composition(product_ion: &PeptideProductIon) -> Vec<(&str, i32)> {
231 let mut composition = peptide_sequence_to_atomic_composition(&product_ion.ion.sequence);
232
233 match product_ion.kind {
234 FragmentType::A => {
235 // A: peptide_mass - CO - Water
236 *composition.entry("H").or_insert(0) -= 2;
237 *composition.entry("O").or_insert(0) -= 2;
238 *composition.entry("C").or_insert(0) -= 1;
239 }
240 FragmentType::B => {
241 // B: peptide_mass - Water
242 *composition.entry("H").or_insert(0) -= 2;
243 *composition.entry("O").or_insert(0) -= 1;
244 }
245 FragmentType::C => {
246 // C: peptide_mass + NH3 - Water
247 *composition.entry("H").or_insert(0) += 1;
248 *composition.entry("N").or_insert(0) += 1;
249 *composition.entry("O").or_insert(0) -= 1;
250 }
251 FragmentType::X => {
252 // X: peptide_mass + CO
253 *composition.entry("C").or_insert(0) += 1; // Add 1 for CO
254 *composition.entry("O").or_insert(0) += 1; // Add 1 for CO
255 *composition.entry("H").or_insert(0) -= 2; // Subtract 2 for 2 protons
256 }
257 FragmentType::Y => (),
258 FragmentType::Z => {
259 *composition.entry("H").or_insert(0) -= 3;
260 *composition.entry("N").or_insert(0) -= 1;
261 }
262 }
263
264 composition.iter().map(|(k, v)| (*k, *v)).collect()
265}
266
267/// calculate the atomic composition of a peptide product ion series
268/// Arguments:
269///
270/// * `product_ions` - a vector of PeptideProductIon instances
271/// * `num_threads` - an usize representing the number of threads to use
272/// Returns:
273///
274/// * `Vec<Vec<(String, i32)>>` - a vector of vectors of tuples representing the atomic composition of each product ion
275///
276pub fn fragments_to_composition(
277 product_ions: Vec<PeptideProductIon>,
278 num_threads: usize,
279) -> Vec<Vec<(String, i32)>> {
280 let thread_pool = ThreadPoolBuilder::new()
281 .num_threads(num_threads)
282 .build()
283 .unwrap();
284 let result = thread_pool.install(|| {
285 product_ions
286 .par_iter()
287 .map(|ion| atomic_product_ion_composition(ion))
288 .map(|composition| {
289 composition
290 .iter()
291 .map(|(k, v)| (k.to_string(), *v))
292 .collect()
293 })
294 .collect()
295 });
296 result
297}
298
299/// count the number of protonizable sites in a peptide sequence
300///
301/// # Arguments
302///
303/// * `sequence` - a string representing the peptide sequence
304///
305/// # Returns
306///
307/// * `usize` - the number of protonizable sites in the peptide sequence
308///
309/// # Example
310///
311/// ```
312/// use mscore::algorithm::peptide::get_num_protonizable_sites;
313///
314/// let sequence = "PEPTIDEH";
315/// let num_protonizable_sites = get_num_protonizable_sites(sequence);
316/// assert_eq!(num_protonizable_sites, 2);
317/// ```
318pub fn get_num_protonizable_sites(sequence: &str) -> usize {
319 let mut sites = 1; // n-terminus
320 for s in sequence.chars() {
321 match s {
322 'H' | 'R' | 'K' => sites += 1,
323 _ => {}
324 }
325 }
326 sites
327}
328
329/// simulate the charge state distribution for a peptide sequence
330///
331/// # Arguments
332///
333/// * `sequence` - a string representing the peptide sequence
334/// * `max_charge` - an optional usize representing the maximum charge state to simulate
335/// * `charged_probability` - an optional f64 representing the probability of a site being charged
336///
337/// # Returns
338///
339/// * `Vec<f64>` - a vector of f64 representing the probability of each charge state
340///
341/// # Example
342///
343/// ```
344/// use mscore::algorithm::peptide::simulate_charge_state_for_sequence;
345///
346/// let sequence = "PEPTIDEH";
347/// let charge_state_probs = simulate_charge_state_for_sequence(sequence, None, None);
348/// assert_eq!(charge_state_probs, vec![0.03999999999999999, 0.32, 0.64, 0.0, 0.0]);
349pub fn simulate_charge_state_for_sequence(
350 sequence: &str,
351 max_charge: Option<usize>,
352 charged_probability: Option<f64>,
353) -> Vec<f64> {
354 let charged_prob = charged_probability.unwrap_or(0.8);
355 let max_charge = max_charge.unwrap_or(4)+1;
356 let num_protonizable_sites = get_num_protonizable_sites(sequence);
357 let mut charge_state_probs = vec![0.0; max_charge];
358 let binom = Binomial::new(charged_prob, num_protonizable_sites as u64).unwrap();
359
360 for charge in 0..max_charge {
361 charge_state_probs[charge] = binom.pmf(charge as u64);
362 }
363 charge_state_probs
364}
365
366/// simulate the charge state distribution for a list of peptide sequences
367///
368/// # Arguments
369///
370/// * `sequences` - a vector of strings representing the peptide sequences
371/// * `num_threads` - an usize representing the number of threads to use
372/// * `max_charge` - an optional usize representing the maximum charge state to simulate
373/// * `charged_probability` - an optional f64 representing the probability of a site being charged
374///
375/// # Returns
376///
377/// * `Vec<Vec<f64>>` - a vector of vectors of f64 representing the probability of each charge state for each sequence
378///
379/// # Example
380///
381/// ```
382/// use mscore::algorithm::peptide::simulate_charge_states_for_sequences;
383///
384/// let sequences = vec!["PEPTIDEH", "PEPTIDEH", "PEPTIDEH"];
385/// let charge_state_probs = simulate_charge_states_for_sequences(sequences, 4, None, None);
386/// assert_eq!(charge_state_probs, vec![vec![0.03999999999999999, 0.32, 0.64, 0.0, 0.0], vec![0.03999999999999999, 0.32, 0.64, 0.0, 0.0], vec![0.03999999999999999, 0.32, 0.64, 0.0, 0.0]]);
387/// ```
388pub fn simulate_charge_states_for_sequences(
389 sequences: Vec<&str>,
390 num_threads: usize,
391 max_charge: Option<usize>,
392 charged_probability: Option<f64>,
393) -> Vec<Vec<f64>> {
394 let pool = ThreadPoolBuilder::new()
395 .num_threads(num_threads)
396 .build()
397 .unwrap();
398 pool.install(|| {
399 sequences
400 .par_iter()
401 .map(|sequence| {
402 simulate_charge_state_for_sequence(sequence, max_charge, charged_probability)
403 })
404 .collect()
405 })
406}