rustms/chemistry/
utility.rs

1use regex::Regex;
2use crate::chemistry::unimod::unimod_modifications_mass;
3
4/// Convert a peptide sequence with UNIMOD annotations to a list of tokens
5///
6/// # Arguments
7///
8/// * `sequence` - a string slice of the peptide sequence
9/// * `group_modifications` - a boolean indicating whether to group the amino acid before the UNIMOD with the UNIMOD
10///
11/// # Returns
12///
13/// * `Vec<String>` - a vector of strings representing the tokens
14///
15/// # Example
16///
17/// ```
18/// use rustms::chemistry::utility::unimod_sequence_to_tokens;
19///
20/// let sequence = "PEPTIDE[UNIMOD:1]H";
21/// let tokens = unimod_sequence_to_tokens(sequence, false);
22/// assert_eq!(tokens, vec!["P", "E", "P", "T", "I", "D", "E", "[UNIMOD:1]", "H"]);
23/// let tokens = unimod_sequence_to_tokens(sequence, true);
24/// assert_eq!(tokens, vec!["P", "E", "P", "T", "I", "D", "E[UNIMOD:1]", "H"]);
25/// ```
26pub fn unimod_sequence_to_tokens(sequence: &str, group_modifications: bool) -> Vec<String> {
27    let pattern = Regex::new(r"\[UNIMOD:\d+]").unwrap();
28    let mut tokens = Vec::new();
29    let mut last_index = 0;
30
31    for mat in pattern.find_iter(sequence) {
32        if group_modifications {
33            // When grouping, include the amino acid before the UNIMOD in the token
34            let pre_mod_sequence = &sequence[last_index..mat.start()];
35            let aa_sequence = if pre_mod_sequence.is_empty() {
36                ""
37            } else {
38                &pre_mod_sequence[..pre_mod_sequence.len() - 1]
39            };
40            tokens.extend(aa_sequence.chars().map(|c| c.to_string()));
41
42            // Group the last amino acid with the UNIMOD as one token
43            let grouped_mod = format!("{}{}", pre_mod_sequence.chars().last().unwrap_or_default().to_string(), &sequence[mat.start()..mat.end()]);
44            tokens.push(grouped_mod);
45        } else {
46            // Extract the amino acids before the current UNIMOD and add them as individual tokens
47            let aa_sequence = &sequence[last_index..mat.start()];
48            tokens.extend(aa_sequence.chars().map(|c| c.to_string()));
49
50            // Add the UNIMOD as its own token
51            let unimod = &sequence[mat.start()..mat.end()];
52            tokens.push(unimod.to_string());
53        }
54
55        // Update last_index to the end of the current UNIMOD
56        last_index = mat.end();
57    }
58
59    if !group_modifications || last_index < sequence.len() {
60        // Add the remaining amino acids after the last UNIMOD as individual tokens
61        let remaining_aa_sequence = &sequence[last_index..];
62        tokens.extend(remaining_aa_sequence.chars().map(|c| c.to_string()));
63    }
64
65    tokens
66}
67
68/// Convert a peptide sequence with UNIMOD annotations to a tuple of plain sequence and for each
69/// position in the sequence, the mass of the modification at that position (0 if no modification),
70/// which is the representation of sequence nad modifications used by SAGE
71///
72/// # Arguments
73///
74/// * `input_string` - a string slice of the peptide sequence
75///
76/// # Returns
77///
78/// * `(String, Vec<f64>)` - a tuple of the plain sequence and a vector of f64 representing the mass
79/// of the modification at each position in the sequence
80///
81/// # Example
82///
83/// ```
84/// use rustms::chemistry::utility::find_unimod_patterns;
85///
86/// let sequence = "PEPTIDE[UNIMOD:1]H";
87/// let (stripped_sequence, mods) = find_unimod_patterns(sequence);
88/// assert_eq!(stripped_sequence, "PEPTIDEH");
89/// assert_eq!(mods, vec![0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 42.010565, 0.0]);
90/// ```
91pub fn find_unimod_patterns(input_string: &str) -> (String, Vec<f64>) {
92    let results = extract_unimod_patterns(input_string);
93    let stripped_sequence = remove_unimod_annotation(input_string);
94    let index_list = generate_index_list(&results, input_string);
95    let mods = calculate_modifications(&index_list, &stripped_sequence);
96    (stripped_sequence, mods)
97}
98
99fn remove_unimod_annotation(sequence: &str) -> String {
100    let pattern = Regex::new(r"\[UNIMOD:\d+]").unwrap();
101    pattern.replace_all(sequence, "").to_string()
102}
103
104fn extract_unimod_patterns(input_string: &str) -> Vec<(usize, usize, String)> {
105    let pattern = Regex::new(r"\[UNIMOD:\d+]").unwrap();
106    pattern.find_iter(input_string)
107        .map(|mat| (mat.start(), mat.end(), mat.as_str().to_string()))
108        .collect()
109}
110
/// Translate annotation spans in the annotated sequence into residue indices of the
/// stripped sequence.
///
/// # Arguments
///
/// * `results` - `(start, end, text)` spans of the annotations inside `sequence`, in
///   ascending order
/// * `sequence` - the annotated sequence the spans refer to
///
/// # Returns
///
/// * `Vec<(usize, String)>` - for each span, the index (in the sequence with all
///   annotations removed) of the residue directly in front of the annotation, paired
///   with the annotation text. An annotation with no residue in front of it (at the very
///   start of the sequence, possibly after other leading annotations) is assigned index 0.
fn generate_index_list(results: &[(usize, usize, String)], sequence: &str) -> Vec<(usize, String)> {
    let mut index_list = Vec::with_capacity(results.len());
    // Total length of all annotations seen so far, i.e. how far positions in the
    // annotated string are shifted relative to the stripped string.
    let mut chars_removed_counter = 0;

    for (start, end, _) in results {
        let num_chars_removed = end - start;
        let mod_str = &sequence[*start..*end];

        // Position of the annotation in the stripped sequence; the annotated residue is
        // the one just before it. Saturating arithmetic pins leading annotations to
        // index 0 instead of underflowing (e.g. "[UNIMOD:1][UNIMOD:2]A").
        let later_aa_index = start
            .saturating_sub(chars_removed_counter)
            .saturating_sub(1);

        index_list.push((later_aa_index, mod_str.to_string()));
        chars_removed_counter += num_chars_removed;
    }

    index_list
}
131
132fn calculate_modifications(index_list: &[(usize, String)], stripped_sequence: &str) -> Vec<f64> {
133    let mut mods = vec![0.0; stripped_sequence.len()];
134    for (index, mod_str) in index_list {
135        if let Some(mass) = unimod_modifications_mass().get(mod_str.as_str()) {
136            mods[*index] += mass;
137        }
138    }
139    mods
140}
141
/// Rearrange the flat prosit array into a nested array of shape (29, 2, 3).
///
/// The flat input is read as three consecutive groups of 58 values, one group per index
/// of the last axis. Within each group the first 29 values fill index 0 of the middle
/// axis (Y ions) and the next 29 values fill index 1 (B ions), one value per row.
///
/// # Arguments
///
/// * `flat_array` - a vector of f64 representing the flat prosit array
///
/// # Returns
///
/// * `Vec<Vec<Vec<f64>>>` - a 3D array of shape (29, 2, 3)
///
/// # Panics
///
/// Panics if `flat_array` holds fewer than 174 values. Values beyond the first 174 are ignored.
///
/// # Example
///
/// ```
/// use rustms::chemistry::utility::reshape_prosit_array;
///
/// let flat_array = vec![0.0; 174];
/// let reshaped_array = reshape_prosit_array(flat_array);
/// assert_eq!(reshaped_array.len(), 29);
/// assert_eq!(reshaped_array[0].len(), 2);
/// assert_eq!(reshaped_array[0][0].len(), 3);
/// ```
pub fn reshape_prosit_array(flat_array: Vec<f64>) -> Vec<Vec<Vec<f64>>> {
    const ROWS: usize = 29;
    const ION_SERIES: usize = 2; // index 0: Y ions, index 1: B ions
    const CHANNELS: usize = 3;

    let mut reshaped = vec![vec![vec![0.0; CHANNELS]; ION_SERIES]; ROWS];

    for channel in 0..CHANNELS {
        for series in 0..ION_SERIES {
            // Each (channel, series) pair owns a contiguous run of ROWS values.
            let run_start = (channel * ION_SERIES + series) * ROWS;
            for (row, cell) in reshaped.iter_mut().enumerate() {
                cell[series][channel] = flat_array[run_start + row];
            }
        }
    }

    reshaped
}