mscore/algorithm/peptide.rs
1use crate::chemistry::amino_acid::{amino_acid_composition, amino_acid_masses};
2use crate::chemistry::constants::{MASS_CO, MASS_NH3, MASS_PROTON, MASS_WATER};
3use crate::chemistry::formulas::calculate_mz;
4use crate::chemistry::unimod::{
5 modification_atomic_composition, unimod_modifications_mass_numerical,
6};
7use crate::chemistry::utility::{find_unimod_patterns, unimod_sequence_to_tokens};
8use crate::data::peptide::{FragmentType, PeptideProductIon, PeptideSequence};
9use rayon::prelude::*;
10use rayon::ThreadPoolBuilder;
11use regex::Regex;
12use statrs::distribution::{Binomial, Discrete};
13use std::collections::HashMap;
14
15/// calculate the monoisotopic mass of a peptide sequence
16///
17/// Arguments:
18///
19/// * `sequence` - peptide sequence
20///
21/// Returns:
22///
23/// * `mass` - monoisotopic mass of the peptide
24///
25/// # Examples
26///
27/// ```
28/// use mscore::algorithm::peptide::calculate_peptide_mono_isotopic_mass;
29/// use mscore::data::peptide::PeptideSequence;
30///
31/// let peptide_sequence = PeptideSequence::new("PEPTIDEH".to_string(), Some(1));
32/// let mass = calculate_peptide_mono_isotopic_mass(&peptide_sequence);
33/// let mass_quantized = (mass * 1e6).round() as i32;
34/// assert_eq!(mass_quantized, 936418877);
35/// ```
36pub fn calculate_peptide_mono_isotopic_mass(peptide_sequence: &PeptideSequence) -> f64 {
37 let amino_acid_masses = amino_acid_masses();
38 let modifications_mz_numerical = unimod_modifications_mass_numerical();
39 let pattern = Regex::new(r"\[UNIMOD:(\d+)]").unwrap();
40
41 let sequence = peptide_sequence.sequence.as_str();
42
43 // Find all occurrences of the pattern
44 let modifications: Vec<u32> = pattern
45 .find_iter(sequence)
46 .filter_map(|mat| mat.as_str()[8..mat.as_str().len() - 1].parse().ok())
47 .collect();
48
49 // Remove the modifications from the sequence
50 let sequence = pattern.replace_all(sequence, "");
51
52 // Count occurrences of each amino acid
53 let mut aa_counts = HashMap::new();
54 for char in sequence.chars() {
55 *aa_counts.entry(char).or_insert(0) += 1;
56 }
57
58 // Mass of amino acids and modifications
59 let mass_sequence: f64 = aa_counts
60 .iter()
61 .map(|(aa, &count)| {
62 amino_acid_masses.get(&aa.to_string()[..]).unwrap_or(&0.0) * count as f64
63 })
64 .sum();
65 let mass_modifications: f64 = modifications
66 .iter()
67 .map(|&mod_id| modifications_mz_numerical.get(&mod_id).unwrap_or(&0.0))
68 .sum();
69
70 mass_sequence + mass_modifications + MASS_WATER
71}
72
73/// calculate the monoisotopic mass of a peptide product ion for a given fragment type
74///
75/// Arguments:
76///
77/// * `sequence` - peptide sequence
78/// * `kind` - fragment type
79///
80/// Returns:
81///
82/// * `mass` - monoisotopic mass of the peptide
83///
84/// # Examples
85/// ```
86/// use mscore::algorithm::peptide::calculate_peptide_product_ion_mono_isotopic_mass;
87/// use mscore::data::peptide::FragmentType;
88/// let sequence = "PEPTIDEH";
89/// let mass = calculate_peptide_product_ion_mono_isotopic_mass(sequence, FragmentType::Y);
90/// assert_eq!(mass, 936.4188766862999);
91/// ```
92pub fn calculate_peptide_product_ion_mono_isotopic_mass(sequence: &str, kind: FragmentType) -> f64 {
93 let (sequence, modifications) = find_unimod_patterns(sequence);
94
95 // Return mz of empty sequence
96 if sequence.is_empty() {
97 return 0.0;
98 }
99
100 let amino_acid_masses = amino_acid_masses();
101
102 // Add up raw amino acid masses and potential modifications
103 let mass_sequence: f64 = sequence
104 .chars()
105 .map(|aa| amino_acid_masses.get(&aa.to_string()[..]).unwrap_or(&0.0))
106 .sum();
107
108 let mass_modifications: f64 = modifications.iter().sum();
109
110 // Calculate total mass
111 let mass = mass_sequence + mass_modifications + MASS_WATER;
112
113 let mass = match kind {
114 FragmentType::A => mass - MASS_CO - MASS_WATER,
115 FragmentType::B => mass - MASS_WATER,
116 FragmentType::C => mass + MASS_NH3 - MASS_WATER,
117 FragmentType::X => mass + MASS_CO - 2.0 * MASS_PROTON,
118 FragmentType::Y => mass,
119 FragmentType::Z => mass - MASS_NH3,
120 };
121
122 mass
123}
124
125/// calculate the monoisotopic m/z of a peptide product ion for a given fragment type and charge
126///
127/// Arguments:
128///
129/// * `sequence` - peptide sequence
130/// * `kind` - fragment type
131/// * `charge` - charge
132///
133/// Returns:
134///
135/// * `mz` - monoisotopic mass of the peptide
136///
137/// # Examples
138/// ```
139/// use mscore::algorithm::peptide::calculate_product_ion_mz;
140/// use mscore::chemistry::constants::MASS_PROTON;
141/// use mscore::data::peptide::FragmentType;
142/// let sequence = "PEPTIDEH";
143/// let mz = calculate_product_ion_mz(sequence, FragmentType::Y, Some(1));
144/// assert_eq!(mz, 936.4188766862999 + MASS_PROTON);
145/// ```
146pub fn calculate_product_ion_mz(sequence: &str, kind: FragmentType, charge: Option<i32>) -> f64 {
147 let mass = calculate_peptide_product_ion_mono_isotopic_mass(sequence, kind);
148 calculate_mz(mass, charge.unwrap_or(1))
149}
150
151/// get a count dictionary of the amino acid composition of a peptide sequence
152///
153/// Arguments:
154///
155/// * `sequence` - peptide sequence
156///
157/// Returns:
158///
159/// * `composition` - a dictionary of amino acid composition
160///
161/// # Examples
162///
163/// ```
164/// use mscore::algorithm::peptide::calculate_amino_acid_composition;
165///
166/// let sequence = "PEPTIDEH";
167/// let composition = calculate_amino_acid_composition(sequence);
168/// assert_eq!(composition.get("P"), Some(&2));
169/// assert_eq!(composition.get("E"), Some(&2));
170/// assert_eq!(composition.get("T"), Some(&1));
171/// assert_eq!(composition.get("I"), Some(&1));
172/// assert_eq!(composition.get("D"), Some(&1));
173/// assert_eq!(composition.get("H"), Some(&1));
174/// ```
175pub fn calculate_amino_acid_composition(sequence: &str) -> HashMap<String, i32> {
176 let mut composition = HashMap::new();
177 for char in sequence.chars() {
178 *composition.entry(char.to_string()).or_insert(0) += 1;
179 }
180 composition
181}
182
183/// calculate the atomic composition of a peptide sequence
184pub fn peptide_sequence_to_atomic_composition(
185 peptide_sequence: &PeptideSequence,
186) -> HashMap<&'static str, i32> {
187 let token_sequence = unimod_sequence_to_tokens(peptide_sequence.sequence.as_str(), false);
188 let mut collection: HashMap<&'static str, i32> = HashMap::new();
189
190 // Assuming amino_acid_composition and modification_composition return appropriate mappings...
191 let aa_compositions = amino_acid_composition();
192 let mod_compositions = modification_atomic_composition();
193
194 // No need for conversion to HashMap<String, ...> as long as you're directly accessing
195 // the HashMap provided by modification_composition() if it uses String keys.
196 for token in token_sequence {
197 if token.len() == 1 {
198 let char = token.chars().next().unwrap();
199 if let Some(composition) = aa_compositions.get(&char) {
200 for (key, value) in composition.iter() {
201 *collection.entry(key).or_insert(0) += *value;
202 }
203 }
204 } else {
205 // Directly use &token without .as_str() conversion
206 if let Some(composition) = mod_compositions.get(&token) {
207 for (key, value) in composition.iter() {
208 *collection.entry(key).or_insert(0) += *value;
209 }
210 }
211 }
212 }
213
214 // Add water
215 *collection.entry("H").or_insert(0) += 2; //
216 *collection.entry("O").or_insert(0) += 1; //
217
218 collection
219}
220
221/// calculate the atomic composition of a product ion
222///
223/// Arguments:
224///
225/// * `product_ion` - a PeptideProductIon instance
226///
227/// Returns:
228///
229/// * `Vec<(&str, i32)>` - a vector of tuples representing the atomic composition of the product ion
230pub fn atomic_product_ion_composition(product_ion: &PeptideProductIon) -> Vec<(&str, i32)> {
231 let mut composition = peptide_sequence_to_atomic_composition(&product_ion.ion.sequence);
232
233 match product_ion.kind {
234 FragmentType::A => {
235 // A: peptide_mass - CO - Water
236 *composition.entry("H").or_insert(0) -= 2;
237 *composition.entry("O").or_insert(0) -= 2;
238 *composition.entry("C").or_insert(0) -= 1;
239 }
240 FragmentType::B => {
241 // B: peptide_mass - Water
242 *composition.entry("H").or_insert(0) -= 2;
243 *composition.entry("O").or_insert(0) -= 1;
244 }
245 FragmentType::C => {
246 // C: peptide_mass + NH3 - Water
247 *composition.entry("H").or_insert(0) += 1;
248 *composition.entry("N").or_insert(0) += 1;
249 *composition.entry("O").or_insert(0) -= 1;
250 }
251 FragmentType::X => {
252 // X: peptide_mass + CO
253 *composition.entry("C").or_insert(0) += 1; // Add 1 for CO
254 *composition.entry("O").or_insert(0) += 1; // Add 1 for CO
255 *composition.entry("H").or_insert(0) -= 2; // Subtract 2 for 2 protons
256 }
257 FragmentType::Y => (),
258 FragmentType::Z => {
259 *composition.entry("H").or_insert(0) -= 3;
260 *composition.entry("N").or_insert(0) -= 1;
261 }
262 }
263
264 composition.iter().map(|(k, v)| (*k, *v)).collect()
265}
266
267/// Calculate the atomic composition of the complementary fragment.
268///
269/// When a peptide is fragmented, the resulting fragment ion has a complementary
270/// portion (the rest of the precursor). This function calculates the atomic
271/// composition of that complementary fragment, which is needed for quad-selection
272/// dependent isotope distribution calculations.
273///
274/// # Arguments
275///
276/// * `precursor_composition` - atomic composition of the full precursor
277/// * `fragment_composition` - atomic composition of the fragment ion (from atomic_product_ion_composition)
278///
279/// # Returns
280///
281/// * `HashMap<String, i32>` - atomic composition of the complementary fragment
282///
283/// # Examples
284///
285/// ```
286/// use mscore::algorithm::peptide::{peptide_sequence_to_atomic_composition, atomic_product_ion_composition, calculate_complementary_fragment_composition};
287/// use mscore::data::peptide::{PeptideSequence, PeptideProductIon, PeptideIon, FragmentType};
288///
289/// let precursor = PeptideSequence::new("PEPTIDE".to_string(), Some(1));
290/// let precursor_comp = peptide_sequence_to_atomic_composition(&precursor);
291///
292/// // Create a b3 ion (PEP)
293/// let b3_ion = PeptideProductIon {
294/// kind: FragmentType::B,
295/// ion: PeptideIon {
296/// sequence: PeptideSequence::new("PEP".to_string(), Some(1)),
297/// charge: 1,
298/// intensity: 1.0,
299/// },
300/// };
301/// let fragment_comp: Vec<(&str, i32)> = atomic_product_ion_composition(&b3_ion);
302/// let fragment_map: std::collections::HashMap<&str, i32> = fragment_comp.into_iter().collect();
303///
304/// let complementary = calculate_complementary_fragment_composition(&precursor_comp, &fragment_map);
305/// // Complementary should be TIDE as y-ion (precursor - b-ion)
306/// ```
307pub fn calculate_complementary_fragment_composition(
308 precursor_composition: &HashMap<&str, i32>,
309 fragment_composition: &HashMap<&str, i32>,
310) -> HashMap<String, i32> {
311 let mut complementary: HashMap<String, i32> = HashMap::new();
312
313 // Start with precursor - fragment
314 for (element, &prec_count) in precursor_composition.iter() {
315 let frag_count = fragment_composition.get(element).copied().unwrap_or(0);
316 let diff = prec_count - frag_count;
317 if diff != 0 {
318 complementary.insert(element.to_string(), diff);
319 }
320 }
321
322 // Check for any elements in fragment that might not be in precursor (edge case)
323 for (element, &frag_count) in fragment_composition.iter() {
324 if !precursor_composition.contains_key(element) {
325 complementary.insert(element.to_string(), -frag_count);
326 }
327 }
328
329 complementary
330}
331
332/// calculate the atomic composition of a peptide product ion series
333/// Arguments:
334///
335/// * `product_ions` - a vector of PeptideProductIon instances
336/// * `num_threads` - an usize representing the number of threads to use
337/// Returns:
338///
339/// * `Vec<Vec<(String, i32)>>` - a vector of vectors of tuples representing the atomic composition of each product ion
340///
341pub fn fragments_to_composition(
342 product_ions: Vec<PeptideProductIon>,
343 num_threads: usize,
344) -> Vec<Vec<(String, i32)>> {
345 let thread_pool = ThreadPoolBuilder::new()
346 .num_threads(num_threads)
347 .build()
348 .unwrap();
349 let result = thread_pool.install(|| {
350 product_ions
351 .par_iter()
352 .map(|ion| atomic_product_ion_composition(ion))
353 .map(|composition| {
354 composition
355 .iter()
356 .map(|(k, v)| (k.to_string(), *v))
357 .collect()
358 })
359 .collect()
360 });
361 result
362}
363
364/// count the number of protonizable sites in a peptide sequence
365///
366/// # Arguments
367///
368/// * `sequence` - a string representing the peptide sequence
369///
370/// # Returns
371///
372/// * `usize` - the number of protonizable sites in the peptide sequence
373///
374/// # Example
375///
376/// ```
377/// use mscore::algorithm::peptide::get_num_protonizable_sites;
378///
379/// let sequence = "PEPTIDEH";
380/// let num_protonizable_sites = get_num_protonizable_sites(sequence);
381/// assert_eq!(num_protonizable_sites, 2);
382/// ```
383pub fn get_num_protonizable_sites(sequence: &str) -> usize {
384 let mut sites = 1; // n-terminus
385 for s in sequence.chars() {
386 match s {
387 'H' | 'R' | 'K' => sites += 1,
388 _ => {}
389 }
390 }
391 sites
392}
393
394/// simulate the charge state distribution for a peptide sequence
395///
396/// # Arguments
397///
398/// * `sequence` - a string representing the peptide sequence
399/// * `max_charge` - an optional usize representing the maximum charge state to simulate
400/// * `charged_probability` - an optional f64 representing the probability of a site being charged
401///
402/// # Returns
403///
404/// * `Vec<f64>` - a vector of f64 representing the probability of each charge state
405///
406/// # Example
407///
408/// ```
409/// use mscore::algorithm::peptide::simulate_charge_state_for_sequence;
410///
411/// let sequence = "PEPTIDEH";
412/// let charge_state_probs = simulate_charge_state_for_sequence(sequence, None, None);
413/// assert_eq!(charge_state_probs, vec![0.03999999999999999, 0.32, 0.64, 0.0, 0.0]);
414pub fn simulate_charge_state_for_sequence(
415 sequence: &str,
416 max_charge: Option<usize>,
417 charged_probability: Option<f64>,
418) -> Vec<f64> {
419 let charged_prob = charged_probability.unwrap_or(0.8);
420 let max_charge = max_charge.unwrap_or(4)+1;
421 let num_protonizable_sites = get_num_protonizable_sites(sequence);
422 let mut charge_state_probs = vec![0.0; max_charge];
423 let binom = Binomial::new(charged_prob, num_protonizable_sites as u64).unwrap();
424
425 for charge in 0..max_charge {
426 charge_state_probs[charge] = binom.pmf(charge as u64);
427 }
428 charge_state_probs
429}
430
431/// simulate the charge state distribution for a list of peptide sequences
432///
433/// # Arguments
434///
435/// * `sequences` - a vector of strings representing the peptide sequences
436/// * `num_threads` - an usize representing the number of threads to use
437/// * `max_charge` - an optional usize representing the maximum charge state to simulate
438/// * `charged_probability` - an optional f64 representing the probability of a site being charged
439///
440/// # Returns
441///
442/// * `Vec<Vec<f64>>` - a vector of vectors of f64 representing the probability of each charge state for each sequence
443///
444/// # Example
445///
446/// ```
447/// use mscore::algorithm::peptide::simulate_charge_states_for_sequences;
448///
449/// let sequences = vec!["PEPTIDEH", "PEPTIDEH", "PEPTIDEH"];
450/// let charge_state_probs = simulate_charge_states_for_sequences(sequences, 4, None, None);
451/// assert_eq!(charge_state_probs, vec![vec![0.03999999999999999, 0.32, 0.64, 0.0, 0.0], vec![0.03999999999999999, 0.32, 0.64, 0.0, 0.0], vec![0.03999999999999999, 0.32, 0.64, 0.0, 0.0]]);
452/// ```
453pub fn simulate_charge_states_for_sequences(
454 sequences: Vec<&str>,
455 num_threads: usize,
456 max_charge: Option<usize>,
457 charged_probability: Option<f64>,
458) -> Vec<Vec<f64>> {
459 let pool = ThreadPoolBuilder::new()
460 .num_threads(num_threads)
461 .build()
462 .unwrap();
463 pool.install(|| {
464 sequences
465 .par_iter()
466 .map(|sequence| {
467 simulate_charge_state_for_sequence(sequence, max_charge, charged_probability)
468 })
469 .collect()
470 })
471}