rustms/chemistry/utility.rs
1use regex::Regex;
2use crate::chemistry::unimod::unimod_modifications_mass;
3
4/// Convert a peptide sequence with UNIMOD annotations to a list of tokens
5///
6/// # Arguments
7///
8/// * `sequence` - a string slice of the peptide sequence
9/// * `group_modifications` - a boolean indicating whether to group the amino acid before the UNIMOD with the UNIMOD
10///
11/// # Returns
12///
13/// * `Vec<String>` - a vector of strings representing the tokens
14///
15/// # Example
16///
17/// ```
18/// use rustms::chemistry::utility::unimod_sequence_to_tokens;
19///
20/// let sequence = "PEPTIDE[UNIMOD:1]H";
21/// let tokens = unimod_sequence_to_tokens(sequence, false);
22/// assert_eq!(tokens, vec!["P", "E", "P", "T", "I", "D", "E", "[UNIMOD:1]", "H"]);
23/// let tokens = unimod_sequence_to_tokens(sequence, true);
24/// assert_eq!(tokens, vec!["P", "E", "P", "T", "I", "D", "E[UNIMOD:1]", "H"]);
25/// ```
26pub fn unimod_sequence_to_tokens(sequence: &str, group_modifications: bool) -> Vec<String> {
27 let pattern = Regex::new(r"\[UNIMOD:\d+]").unwrap();
28 let mut tokens = Vec::new();
29 let mut last_index = 0;
30
31 for mat in pattern.find_iter(sequence) {
32 if group_modifications {
33 // When grouping, include the amino acid before the UNIMOD in the token
34 let pre_mod_sequence = &sequence[last_index..mat.start()];
35 let aa_sequence = if pre_mod_sequence.is_empty() {
36 ""
37 } else {
38 &pre_mod_sequence[..pre_mod_sequence.len() - 1]
39 };
40 tokens.extend(aa_sequence.chars().map(|c| c.to_string()));
41
42 // Group the last amino acid with the UNIMOD as one token
43 let grouped_mod = format!("{}{}", pre_mod_sequence.chars().last().unwrap_or_default().to_string(), &sequence[mat.start()..mat.end()]);
44 tokens.push(grouped_mod);
45 } else {
46 // Extract the amino acids before the current UNIMOD and add them as individual tokens
47 let aa_sequence = &sequence[last_index..mat.start()];
48 tokens.extend(aa_sequence.chars().map(|c| c.to_string()));
49
50 // Add the UNIMOD as its own token
51 let unimod = &sequence[mat.start()..mat.end()];
52 tokens.push(unimod.to_string());
53 }
54
55 // Update last_index to the end of the current UNIMOD
56 last_index = mat.end();
57 }
58
59 if !group_modifications || last_index < sequence.len() {
60 // Add the remaining amino acids after the last UNIMOD as individual tokens
61 let remaining_aa_sequence = &sequence[last_index..];
62 tokens.extend(remaining_aa_sequence.chars().map(|c| c.to_string()));
63 }
64
65 tokens
66}
67
68/// Convert a peptide sequence with UNIMOD annotations to a tuple of plain sequence and for each
69/// position in the sequence, the mass of the modification at that position (0 if no modification),
70/// which is the representation of sequence nad modifications used by SAGE
71///
72/// # Arguments
73///
74/// * `input_string` - a string slice of the peptide sequence
75///
76/// # Returns
77///
78/// * `(String, Vec<f64>)` - a tuple of the plain sequence and a vector of f64 representing the mass
79/// of the modification at each position in the sequence
80///
81/// # Example
82///
83/// ```
84/// use rustms::chemistry::utility::find_unimod_patterns;
85///
86/// let sequence = "PEPTIDE[UNIMOD:1]H";
87/// let (stripped_sequence, mods) = find_unimod_patterns(sequence);
88/// assert_eq!(stripped_sequence, "PEPTIDEH");
89/// assert_eq!(mods, vec![0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 42.010565, 0.0]);
90/// ```
91pub fn find_unimod_patterns(input_string: &str) -> (String, Vec<f64>) {
92 let results = extract_unimod_patterns(input_string);
93 let stripped_sequence = remove_unimod_annotation(input_string);
94 let index_list = generate_index_list(&results, input_string);
95 let mods = calculate_modifications(&index_list, &stripped_sequence);
96 (stripped_sequence, mods)
97}
98
99fn remove_unimod_annotation(sequence: &str) -> String {
100 let pattern = Regex::new(r"\[UNIMOD:\d+]").unwrap();
101 pattern.replace_all(sequence, "").to_string()
102}
103
104fn extract_unimod_patterns(input_string: &str) -> Vec<(usize, usize, String)> {
105 let pattern = Regex::new(r"\[UNIMOD:\d+]").unwrap();
106 pattern.find_iter(input_string)
107 .map(|mat| (mat.start(), mat.end(), mat.as_str().to_string()))
108 .collect()
109}
110
/// Map each annotation span in `sequence` to the index of the residue it belongs to
/// once all annotations have been stripped from the sequence.
///
/// # Arguments
///
/// * `results` - `(start, end, text)` byte spans of the annotations inside `sequence`;
///   the running offset below expects them in order of appearance
/// * `sequence` - the annotated sequence the spans refer to
///
/// # Returns
///
/// * `Vec<(usize, String)>` - for every span, the index of the residue directly in front of
///   the annotation (in the stripped sequence) together with the annotation text. An
///   annotation that has no residue in front of it maps to index 0.
///
/// # Panics
///
/// Panics if a span does not lie on valid boundaries of `sequence`.
fn generate_index_list(results: &[(usize, usize, String)], sequence: &str) -> Vec<(usize, String)> {
    // Number of annotation bytes seen so far; subtracting it converts a position in the
    // annotated sequence into a position in the stripped sequence.
    let mut chars_removed_counter = 0usize;

    results
        .iter()
        .map(|(start, end, _)| {
            // Saturating arithmetic clamps to the first residue: a plain `start - 1 - removed`
            // underflows when an annotation directly follows a leading annotation
            // (e.g. "[UNIMOD:1][UNIMOD:2]PEP").
            let residue_index = start
                .saturating_sub(chars_removed_counter)
                .saturating_sub(1);

            chars_removed_counter += end - start;
            (residue_index, sequence[*start..*end].to_string())
        })
        .collect()
}
131
132fn calculate_modifications(index_list: &[(usize, String)], stripped_sequence: &str) -> Vec<f64> {
133 let mut mods = vec![0.0; stripped_sequence.len()];
134 for (index, mod_str) in index_list {
135 if let Some(mass) = unimod_modifications_mass().get(mod_str.as_str()) {
136 mods[*index] += mass;
137 }
138 }
139 mods
140}
141
/// Rearrange the flat prosit array into a nested array of shape (29, 2, 3)
///
/// The flat input is read as six consecutive runs of 29 values. For each index `c` of the
/// last axis (0, 1, 2) there is first a run that fills `[row][0][c]` (Y ion values) and
/// then a run that fills `[row][1][c]` (B ion values).
///
/// # Arguments
///
/// * `flat_array` - a vector of f64 representing the flat prosit array
///
/// # Returns
///
/// * `Vec<Vec<Vec<f64>>>` - a 3D array of shape (29, 2, 3)
///
/// # Panics
///
/// Panics if `flat_array` holds fewer than 174 (29 * 2 * 3) values. Values beyond the
/// first 174 are ignored.
///
/// # Example
///
/// ```
/// use rustms::chemistry::utility::reshape_prosit_array;
///
/// let flat_array = vec![0.0; 174];
/// let reshaped_array = reshape_prosit_array(flat_array);
/// assert_eq!(reshaped_array.len(), 29);
/// assert_eq!(reshaped_array[0].len(), 2);
/// assert_eq!(reshaped_array[0][0].len(), 3);
/// ```
pub fn reshape_prosit_array(flat_array: Vec<f64>) -> Vec<Vec<Vec<f64>>> {
    const ROWS: usize = 29;
    const ION_TYPES: usize = 2; // index 0: Y ions, index 1: B ions
    const CHANNELS: usize = 3;

    let mut reshaped: Vec<Vec<Vec<f64>>> = vec![vec![vec![0.0; CHANNELS]; ION_TYPES]; ROWS];

    for channel in 0..CHANNELS {
        for ion in 0..ION_TYPES {
            // Start of the 29-value run belonging to this (channel, ion type) pair
            let run_start = (channel * ION_TYPES + ion) * ROWS;
            for (row, cell) in reshaped.iter_mut().enumerate() {
                cell[ion][channel] = flat_array[run_start + row];
            }
        }
    }

    reshaped
}