rustms/chemistry/utility.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181
use regex::Regex;
use crate::chemistry::unimod::unimod_modifications_mass;
/// Convert a peptide sequence with UNIMOD annotations to a list of tokens
///
/// # Arguments
///
/// * `sequence` - a string slice of the peptide sequence
/// * `group_modifications` - a boolean indicating whether to group the amino acid before the UNIMOD with the UNIMOD
///
/// # Returns
///
/// * `Vec<String>` - a vector of strings representing the tokens
///
/// # Example
///
/// ```
/// use rustms::chemistry::utility::unimod_sequence_to_tokens;
///
/// let sequence = "PEPTIDE[UNIMOD:1]H";
/// let tokens = unimod_sequence_to_tokens(sequence, false);
/// assert_eq!(tokens, vec!["P", "E", "P", "T", "I", "D", "E", "[UNIMOD:1]", "H"]);
/// let tokens = unimod_sequence_to_tokens(sequence, true);
/// assert_eq!(tokens, vec!["P", "E", "P", "T", "I", "D", "E[UNIMOD:1]", "H"]);
/// ```
pub fn unimod_sequence_to_tokens(sequence: &str, group_modifications: bool) -> Vec<String> {
let pattern = Regex::new(r"\[UNIMOD:\d+]").unwrap();
let mut tokens = Vec::new();
let mut last_index = 0;
for mat in pattern.find_iter(sequence) {
if group_modifications {
// When grouping, include the amino acid before the UNIMOD in the token
let pre_mod_sequence = &sequence[last_index..mat.start()];
let aa_sequence = if pre_mod_sequence.is_empty() {
""
} else {
&pre_mod_sequence[..pre_mod_sequence.len() - 1]
};
tokens.extend(aa_sequence.chars().map(|c| c.to_string()));
// Group the last amino acid with the UNIMOD as one token
let grouped_mod = format!("{}{}", pre_mod_sequence.chars().last().unwrap_or_default().to_string(), &sequence[mat.start()..mat.end()]);
tokens.push(grouped_mod);
} else {
// Extract the amino acids before the current UNIMOD and add them as individual tokens
let aa_sequence = &sequence[last_index..mat.start()];
tokens.extend(aa_sequence.chars().map(|c| c.to_string()));
// Add the UNIMOD as its own token
let unimod = &sequence[mat.start()..mat.end()];
tokens.push(unimod.to_string());
}
// Update last_index to the end of the current UNIMOD
last_index = mat.end();
}
if !group_modifications || last_index < sequence.len() {
// Add the remaining amino acids after the last UNIMOD as individual tokens
let remaining_aa_sequence = &sequence[last_index..];
tokens.extend(remaining_aa_sequence.chars().map(|c| c.to_string()));
}
tokens
}
/// Convert a peptide sequence with UNIMOD annotations to a tuple of plain sequence and for each
/// position in the sequence, the mass of the modification at that position (0 if no modification),
/// which is the representation of sequence nad modifications used by SAGE
///
/// # Arguments
///
/// * `input_string` - a string slice of the peptide sequence
///
/// # Returns
///
/// * `(String, Vec<f64>)` - a tuple of the plain sequence and a vector of f64 representing the mass
/// of the modification at each position in the sequence
///
/// # Example
///
/// ```
/// use rustms::chemistry::utility::find_unimod_patterns;
///
/// let sequence = "PEPTIDE[UNIMOD:1]H";
/// let (stripped_sequence, mods) = find_unimod_patterns(sequence);
/// assert_eq!(stripped_sequence, "PEPTIDEH");
/// assert_eq!(mods, vec![0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 42.010565, 0.0]);
/// ```
pub fn find_unimod_patterns(input_string: &str) -> (String, Vec<f64>) {
let results = extract_unimod_patterns(input_string);
let stripped_sequence = remove_unimod_annotation(input_string);
let index_list = generate_index_list(&results, input_string);
let mods = calculate_modifications(&index_list, &stripped_sequence);
(stripped_sequence, mods)
}
fn remove_unimod_annotation(sequence: &str) -> String {
let pattern = Regex::new(r"\[UNIMOD:\d+]").unwrap();
pattern.replace_all(sequence, "").to_string()
}
fn extract_unimod_patterns(input_string: &str) -> Vec<(usize, usize, String)> {
let pattern = Regex::new(r"\[UNIMOD:\d+]").unwrap();
pattern.find_iter(input_string)
.map(|mat| (mat.start(), mat.end(), mat.as_str().to_string()))
.collect()
}
fn generate_index_list(results: &[(usize, usize, String)], sequence: &str) -> Vec<(usize, String)> {
let mut index_list = Vec::new();
let mut chars_removed_counter = 0;
for (start, end, _) in results {
let num_chars_removed = end - start;
let mod_str = &sequence[*start..*end];
let later_aa_index = if *start != 0 {
start - 1 - chars_removed_counter
} else {
0
};
index_list.push((later_aa_index, mod_str.to_string()));
chars_removed_counter += num_chars_removed;
}
index_list
}
fn calculate_modifications(index_list: &[(usize, String)], stripped_sequence: &str) -> Vec<f64> {
let mut mods = vec![0.0; stripped_sequence.len()];
for (index, mod_str) in index_list {
if let Some(mass) = unimod_modifications_mass().get(mod_str.as_str()) {
mods[*index] += mass;
}
}
mods
}
/// Reshape the flat prosit array into a 3D array of shape (29, 2, 3)
///
/// # Arguments
///
/// * `flat_array` - a vector of f64 representing the flat prosit array
///
/// # Returns
///
/// * `Vec<Vec<Vec<f64>>>` - a 3D array of shape (29, 2, 3)
///
/// # Example
///
/// ```
/// use rustms::chemistry::utility::reshape_prosit_array;
///
/// let flat_array = vec![0.0; 174];
/// let reshaped_array = reshape_prosit_array(flat_array);
/// assert_eq!(reshaped_array.len(), 29);
/// assert_eq!(reshaped_array[0].len(), 2);
/// assert_eq!(reshaped_array[0][0].len(), 3);
/// ```
pub fn reshape_prosit_array(flat_array: Vec<f64>) -> Vec<Vec<Vec<f64>>> {
let mut array_return: Vec<Vec<Vec<f64>>> = vec![vec![vec![0.0; 3]; 2]; 29];
let mut ptr = 0;
for c in 0..3 {
for row in 0..29 {
// Fill in the Y ion values
array_return[row][0][c] = flat_array[ptr];
ptr += 1;
}
for row in 0..29 {
// Fill in the B ion values
array_return[row][1][c] = flat_array[ptr];
ptr += 1;
}
}
array_return
}