rustms/chemistry/
utility.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
use regex::Regex;
use crate::chemistry::unimod::unimod_modifications_mass;

/// Convert a peptide sequence with UNIMOD annotations to a list of tokens
///
/// # Arguments
///
/// * `sequence` - a string slice of the peptide sequence
/// * `group_modifications` - a boolean indicating whether to group the amino acid before the UNIMOD with the UNIMOD
///
/// # Returns
///
/// * `Vec<String>` - a vector of strings representing the tokens
///
/// # Example
///
/// ```
/// use rustms::chemistry::utility::unimod_sequence_to_tokens;
///
/// let sequence = "PEPTIDE[UNIMOD:1]H";
/// let tokens = unimod_sequence_to_tokens(sequence, false);
/// assert_eq!(tokens, vec!["P", "E", "P", "T", "I", "D", "E", "[UNIMOD:1]", "H"]);
/// let tokens = unimod_sequence_to_tokens(sequence, true);
/// assert_eq!(tokens, vec!["P", "E", "P", "T", "I", "D", "E[UNIMOD:1]", "H"]);
/// ```
pub fn unimod_sequence_to_tokens(sequence: &str, group_modifications: bool) -> Vec<String> {
    let pattern = Regex::new(r"\[UNIMOD:\d+]").unwrap();
    let mut tokens = Vec::new();
    let mut last_index = 0;

    for mat in pattern.find_iter(sequence) {
        if group_modifications {
            // When grouping, include the amino acid before the UNIMOD in the token
            let pre_mod_sequence = &sequence[last_index..mat.start()];
            let aa_sequence = if pre_mod_sequence.is_empty() {
                ""
            } else {
                &pre_mod_sequence[..pre_mod_sequence.len() - 1]
            };
            tokens.extend(aa_sequence.chars().map(|c| c.to_string()));

            // Group the last amino acid with the UNIMOD as one token
            let grouped_mod = format!("{}{}", pre_mod_sequence.chars().last().unwrap_or_default().to_string(), &sequence[mat.start()..mat.end()]);
            tokens.push(grouped_mod);
        } else {
            // Extract the amino acids before the current UNIMOD and add them as individual tokens
            let aa_sequence = &sequence[last_index..mat.start()];
            tokens.extend(aa_sequence.chars().map(|c| c.to_string()));

            // Add the UNIMOD as its own token
            let unimod = &sequence[mat.start()..mat.end()];
            tokens.push(unimod.to_string());
        }

        // Update last_index to the end of the current UNIMOD
        last_index = mat.end();
    }

    if !group_modifications || last_index < sequence.len() {
        // Add the remaining amino acids after the last UNIMOD as individual tokens
        let remaining_aa_sequence = &sequence[last_index..];
        tokens.extend(remaining_aa_sequence.chars().map(|c| c.to_string()));
    }

    tokens
}

/// Convert a peptide sequence with UNIMOD annotations to a tuple of plain sequence and for each
/// position in the sequence, the mass of the modification at that position (0 if no modification),
/// which is the representation of sequence nad modifications used by SAGE
///
/// # Arguments
///
/// * `input_string` - a string slice of the peptide sequence
///
/// # Returns
///
/// * `(String, Vec<f64>)` - a tuple of the plain sequence and a vector of f64 representing the mass
/// of the modification at each position in the sequence
///
/// # Example
///
/// ```
/// use rustms::chemistry::utility::find_unimod_patterns;
///
/// let sequence = "PEPTIDE[UNIMOD:1]H";
/// let (stripped_sequence, mods) = find_unimod_patterns(sequence);
/// assert_eq!(stripped_sequence, "PEPTIDEH");
/// assert_eq!(mods, vec![0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 42.010565, 0.0]);
/// ```
pub fn find_unimod_patterns(input_string: &str) -> (String, Vec<f64>) {
    let results = extract_unimod_patterns(input_string);
    let stripped_sequence = remove_unimod_annotation(input_string);
    let index_list = generate_index_list(&results, input_string);
    let mods = calculate_modifications(&index_list, &stripped_sequence);
    (stripped_sequence, mods)
}

fn remove_unimod_annotation(sequence: &str) -> String {
    let pattern = Regex::new(r"\[UNIMOD:\d+]").unwrap();
    pattern.replace_all(sequence, "").to_string()
}

fn extract_unimod_patterns(input_string: &str) -> Vec<(usize, usize, String)> {
    let pattern = Regex::new(r"\[UNIMOD:\d+]").unwrap();
    pattern.find_iter(input_string)
        .map(|mat| (mat.start(), mat.end(), mat.as_str().to_string()))
        .collect()
}

fn generate_index_list(results: &[(usize, usize, String)], sequence: &str) -> Vec<(usize, String)> {
    let mut index_list = Vec::new();
    let mut chars_removed_counter = 0;

    for (start, end, _) in results {
        let num_chars_removed = end - start;
        let mod_str = &sequence[*start..*end];

        let later_aa_index = if *start != 0 {
            start - 1 - chars_removed_counter
        } else {
            0
        };

        index_list.push((later_aa_index, mod_str.to_string()));
        chars_removed_counter += num_chars_removed;
    }

    index_list
}

fn calculate_modifications(index_list: &[(usize, String)], stripped_sequence: &str) -> Vec<f64> {
    let mut mods = vec![0.0; stripped_sequence.len()];
    for (index, mod_str) in index_list {
        if let Some(mass) = unimod_modifications_mass().get(mod_str.as_str()) {
            mods[*index] += mass;
        }
    }
    mods
}

/// Reshape the flat prosit array into a 3D array of shape (29, 2, 3)
///
/// # Arguments
///
/// * `flat_array` - a vector of f64 representing the flat prosit array
///
/// # Returns
///
/// * `Vec<Vec<Vec<f64>>>` - a 3D array of shape (29, 2, 3)
///
/// # Example
///
/// ```
/// use rustms::chemistry::utility::reshape_prosit_array;
///
/// let flat_array = vec![0.0; 174];
/// let reshaped_array = reshape_prosit_array(flat_array);
/// assert_eq!(reshaped_array.len(), 29);
/// assert_eq!(reshaped_array[0].len(), 2);
/// assert_eq!(reshaped_array[0][0].len(), 3);
/// ```
pub fn reshape_prosit_array(flat_array: Vec<f64>) -> Vec<Vec<Vec<f64>>> {
    let mut array_return: Vec<Vec<Vec<f64>>> = vec![vec![vec![0.0; 3]; 2]; 29];
    let mut ptr = 0;

    for c in 0..3 {
        for row in 0..29 {
            // Fill in the Y ion values
            array_return[row][0][c] = flat_array[ptr];
            ptr += 1;
        }
        for row in 0..29 {
            // Fill in the B ion values
            array_return[row][1][c] = flat_array[ptr];
            ptr += 1;
        }
    }

    array_return
}