OpenMS
AhoCorasickAmbiguous.h
Go to the documentation of this file.
1 // Copyright (c) 2002-present, OpenMS Inc. -- EKU Tuebingen, ETH Zurich, and FU Berlin
2 // SPDX-License-Identifier: BSD-3-Clause
3 //
4 // --------------------------------------------------------------------------
5 // $Maintainer: Chris Bielow $
6 // $Authors: Chris Bielow $
7 // --------------------------------------------------------------------------
8 
9 #pragma once
10 
11 #include <OpenMS/CONCEPT/Macros.h>
13 
14 #include <cassert>
15 #include <functional> // for std::hash
16 #include <limits>
17 #include <queue>
18 #include <string>
19 #include <unordered_map>
20 #include <vector>
21 
22 namespace OpenMS
23 {
26  constexpr char const AAtoChar[28] = {
27  'A', // 00 Ala Alanine
28  'Y', // 01 Tyr Tyrosine
29  'C', // 02 Cys Cysteine
30  'D', // 03 Asp Aspartic Acid // B
31  'N', // 04 Asn Asparagine // B
32  'F', // 05 Phe Phenylalanine
33  'G', // 06 Gly Glycine
34  'H', // 07 His Histidine
35  'I', // 08 Ile Isoleucine // J
36  'L', // 09 Leu Leucine // J
37  'K', // 10 Lys Lysine
38  'W', // 11 Trp Tryptophan
39  'M', // 12 Met Methionine
40  'O', // 13 Pyl Pyrrolysine
41  'P', // 14 Pro Proline
42  'E', // 15 Glu Glutamic Acid // Z
43  'Q', // 16 Gln Glutamine // Z
44  'R', // 17 Arg Arginine
45  'S', // 18 Ser Serine
46  'T', // 19 Thr Threonine
47  'U', // 20 Selenocysteine
48  'V', // 21 Val Valine
49  // ambiguous AAs start here (index: 22...25)
50  'B', // 22 Aspartic Acid, Asparagine $ // the ambAA's need to be consecutive (B,J,Z,X,$)
51  'J', // 23 Leucine, Isoleucine $
52  'Z', // 24 Glutamic Acid, Glutamine $
53  'X', // 25 Unknown
54  // non-regular AA's, which are special
55  '$', // 26 superAA, i.e. it models a mismatch, which can be anything, including AAAs
56  '?', // 27 invalid AA (will usually be skipped) -- must be the last AA (AA::operator++ and others rely on it)
57  };
58 
60  constexpr char const CharToAA[128] = {
61  // ASCII char (7-bit Int with values from 0..127) --> amino acid
62  27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, // 0
63  27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, // 1
64  // $
65  27, 27, 27, 27, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, // 2
66  27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, // 3
67 
68  // , A, B, C, D, E, F, G, H, I, J, K, L, M, N, O,
69  27, 00, 22, 02, 03, 15, 05, 06, 07, 8, 23, 10, 9, 12, 04, 13, // 4
70 
71  // P, Q, R, S, T, U, V, W, X, Y, Z, , , , , ,
72  14, 16, 17, 18, 19, 20, 21, 11, 25, 01, 24, 27, 27, 27, 27, 27, // 5
73 
74  // , a, b, c, d, e, f, g, h, i, j, k, l, m, n, o,
75  27, 00, 22, 02, 03, 15, 05, 06, 07, 8, 23, 10, 9, 12, 04, 13, // 6
76 
77  // p, q, r, s, t, u, v, w, x, y, z, , , , , ,
78  14, 16, 17, 18, 19, 20, 21, 11, 25, 01, 24, 27, 27, 27, 27, 27, // 7
79  };
80 
83  struct OPENMS_DLLAPI Hit {
84  using T = uint32_t;
85  Hit() = default;
86  Hit(T needle_index, T needle_length, T query_pos) : needle_index(needle_index), needle_length(needle_length), query_pos(query_pos) {};
90  bool operator==(const Hit& rhs) const
91  {
92  return needle_index == rhs.needle_index && needle_length == rhs.needle_length && query_pos == rhs.query_pos;
93  }
94  };
95 
98  struct OPENMS_DLLAPI AA
99  {
101  constexpr AA() : aa_(AA('?').aa_)
102  {
103  }
104 
108  constexpr explicit AA(const char c) : aa_(CharToAA[(unsigned char)c])
109  {
110  }
111 
113  constexpr uint8_t operator()() const
114  {
115  return aa_;
116  }
117 
119  constexpr bool operator==(const AA other) const
120  {
121  return aa_ == other.aa_;
122  }
123 
125  constexpr bool operator<=(const AA other) const
126  {
127  return aa_ <= other.aa_;
128  }
129 
131  constexpr bool isAmbiguous() const
132  {
133  return aa_ >= AA('B').aa_;
134  }
135 
137  constexpr bool isValid() const
138  {
139  return aa_ != AA('?').aa_;
140  }
141 
143  constexpr bool isValidForPeptide() const
144  {
145  return aa_ <= AA('X').aa_; // '$' or '?'
146  }
147 
149  constexpr AA& operator++()
150  {
151  ++aa_;
152  assert(aa_ <= AA('?').aa_); // make sure we don't overflow
153  return *this;
154  }
155 
157  constexpr AA operator++(int)
158  {
159  AA r(*this);
160  ++aa_;
161  assert(aa_ <= AA('?').aa_); // make sure we don't overflow
162  return r;
163  }
164 
166  constexpr AA operator-(const AA rhs) const
167  {
168  AA r(*this);
169  r.aa_ -= rhs.aa_;
170  return r;
171  }
172 
174  constexpr char toChar() const
175  {
176  return AAtoChar[aa_];
177  }
178 
181  static AA fromIndex(size_t index) noexcept
182  {
183  OPENMS_PRECONDITION(index < unambiguousAACount(), "AA::fromIndex(): index must be in [0, unambiguousAACount())");
184  AA r;
185  r.aa_ = static_cast<uint8_t>(index);
186  return r;
187  }
188 
190  static size_t unambiguousAACount()
191  {
192  static_assert(AA('V').aa_ - AA('A').aa_ + 1 == 22, "Unambiguous AA count must be 22 (A-Z, excluding B, J, Z, X)");
193  static_assert(AA('A').aa_ == 0, "A must be the first AA in the enumeration");
194  static_assert(AA('V').aa_ == 21, "V must be the last unambiguous AA in the enumeration");
195  return AA('V').aa_ - AA('A').aa_ + 1;
196  }
197 
198  private:
199  uint8_t aa_;
200  };
201 
204  class OPENMS_DLLAPI Index
205  {
206  public:
207  using T = uint32_t;
209  Index() = default;
210 
212  Index(T val) : i_(val) {};
213 
215  bool isInvalid() const;
216 
218  bool isValid() const;
219 
221  T operator()() const;
222 
224  bool operator==(const Index other) const;
225 
227  T& pos();
228 
230  T pos() const;
231  private:
232  T i_ = std::numeric_limits<T>::max();
233  };
234  } // namespace OpenMS
235 
236 // this needs to go into namespace std
237 template<> struct std::hash<OpenMS::Index>
238 {
239  std::size_t operator()(OpenMS::Index const& s) const noexcept
240  {
241  return std::hash<OpenMS::Index::T> {}(s());
242  }
243 };
244 
245 namespace OpenMS
246 {
249  struct OPENMS_DLLAPI ACNode
250  {
252  ACNode() {};
253 
255  ACNode(const AA label, const uint8_t depth) : edge(label)
256  {
257  depth_and_hits.depth = depth;
258  }
259 
261  struct DepthHits {
263  {
264  memset(this, 0, sizeof *this); // make sure bitfields are 0; C++20 allows {} initialization ...
265  };
266  uint8_t has_hit : 1;
267  // we could add another bit here to distinguish between a local hit and suffix hit, but on Windows, this slows it down
268  uint8_t depth : 7;
269  };
270 
271  using ChildCountType = uint8_t;
272 
273  Index suffix {0};
274  Index first_child {0};
275  // there is room for optimization here, by pulling edge labels into a separate vector (allows for SSE-enabled search of children)
276  AA edge {0};
277  ChildCountType nr_children = 0;
279  };
280 
281  // forward declaration
282  struct ACTrieState;
283 
285  struct OPENMS_DLLAPI ACSpawn
286  {
288  ACSpawn() = delete;
289 
291  ACSpawn(std::string::const_iterator query_pos, Index tree_pos, uint8_t max_aa, uint8_t max_mm, uint8_t max_prefix_loss);
292 
294  size_t textPos(const ACTrieState& state) const;
295 
298  AA nextValidAA(const ACTrieState& state);
299 
300  std::string::const_iterator it_query;
302  uint8_t max_aaa_leftover {0};
303  uint8_t max_mm_leftover {0};
304  uint8_t max_prefix_loss_leftover {0};
305  };
306 
309  OPENMS_DLLAPI AA nextValidAA(const std::string::const_iterator end, std::string::const_iterator& it_q);
310 
313  struct OPENMS_DLLAPI ACTrieState
314  {
315  friend ACSpawn;
318  void setQuery(const std::string& haystack);
319 
321  size_t textPos() const;
322 
324  std::string::const_iterator textPosIt() const;
325 
327  const std::string& getQuery() const;
328 
332 
333  std::vector<Hit> hits;
335  std::queue<ACSpawn> spawns;
336 
337  private:
338  std::string query_;
339  std::string::const_iterator it_q_;
340  };
341 
343  class OPENMS_DLLAPI ACTrie
344  {
345  public:
352  ACTrie(uint32_t max_aaa = 0, uint32_t max_mm = 0);
353 
356 
360  void addNeedle(const std::string& needle);
361 
365  void addNeedles(const std::vector<std::string>& needles);
366 
369  void addNeedlesAndCompress(const std::vector<std::string>& needles);
370 
378  void compressTrie();
379 
381  size_t getNeedleCount() const;
382 
385  void setMaxAAACount(const uint32_t max_aaa);
386 
388  uint32_t getMaxAAACount() const;
389 
392  void setMaxMMCount(const uint32_t max_mm);
393 
395  uint32_t getMaxMMCount() const;
396 
400  bool nextHits(ACTrieState& state) const;
401 
404  void getAllHits(ACTrieState& state) const;
405 
406  private:
410  bool nextHitsNoClear_(ACTrieState& state) const;
411 
415  Index add_(const Index from, const AA edge);
416 
425  bool addHits_(Index i, const size_t text_pos, std::vector<Hit>& hits) const;
426 
428  bool addHitsSpawn_(Index i, const ACSpawn& spawn, const size_t text_pos, std::vector<Hit>& hits, const int current_spawn_depths) const;
429 
433  Index follow_(const Index i, const AA edge) const;
434 
437  bool followSpawn_(ACSpawn& spawn, const AA edge, ACTrieState& state) const;
438 
441  Index stepMaster_(const Index i, const AA edge, ACTrieState& state) const;
442 
445  bool stepSpawn_(ACSpawn& spawn, ACTrieState& state) const;
446 
449  void createSpawns_(const Index i, const AA fromAA, const AA toAA, ACTrieState& state, const uint32_t aaa_left, const uint32_t mm_left) const;
450 
452  void createSubSpawns_(const ACSpawn& prototype, const AA fromAA, const AA toAA, ACTrieState& state) const;
453 
455  void createMMSpawns_(const Index i, const AA except_fromAA, const AA except_toAA, const AA except_edge, ACTrieState& state, const uint32_t aaa_left, const uint32_t mm_left) const;
456 
458  void createMMSubSpawns_(const ACSpawn& prototype, const AA except_fromAA, const AA except_toAA, const AA except_edge, ACTrieState& state) const;
459 
461  Index findChildNaive_(Index parent, AA child_label);
462 
464  Index findChildBFS_(const Index parent, const AA child_label) const;
465 
466  std::vector<ACNode> trie_;
467  uint32_t needle_count_ {0};
468  uint32_t max_aaa_ {0};
469  uint32_t max_mm_ {0};
470 
472  std::unordered_map<Index, std::vector<uint32_t>> umap_index2needles_;
474  std::unordered_map<Index, std::vector<Index>> umap_index2children_naive_;
475  };
476 
477 } // namespace OpenMS
478 
An Aho Corasick trie (a set of nodes with suffix links mainly)
Definition: AhoCorasickAmbiguous.h:344
Index stepMaster_(const Index i, const AA edge, ACTrieState &state) const
bool stepSpawn_(ACSpawn &spawn, ACTrieState &state) const
void setMaxMMCount(const uint32_t max_mm)
uint32_t getMaxMMCount() const
Maximum number of mismatches allowed during search.
ACTrie(uint32_t max_aaa=0, uint32_t max_mm=0)
Default C'tor which just creates a root node.
void addNeedlesAndCompress(const std::vector< std::string > &needles)
bool nextHitsNoClear_(ACTrieState &state) const
std::vector< ACNode > trie_
the trie, in either naive structure or BFS order (after compressTrie)
Definition: AhoCorasickAmbiguous.h:466
Index add_(const Index from, const AA edge)
void addNeedle(const std::string &needle)
void setMaxAAACount(const uint32_t max_aaa)
std::unordered_map< Index, std::vector< uint32_t > > umap_index2needles_
maps a node to which needles end there (valid for both naive and BFS tree)
Definition: AhoCorasickAmbiguous.h:472
void addNeedles(const std::vector< std::string > &needles)
bool addHitsSpawn_(Index i, const ACSpawn &spawn, const size_t text_pos, std::vector< Hit > &hits, const int current_spawn_depths) const
same as addHits_, but only follows the suffix chain until the spawn loses its prefix
Index follow_(const Index i, const AA edge) const
void getAllHits(ACTrieState &state) const
Index findChildBFS_(const Index parent, const AA child_label) const
After compression (BFS trie), obtain the child with edge child_label from parent; if it does not exis...
bool addHits_(Index i, const size_t text_pos, std::vector< Hit > &hits) const
Add all hits occurring in node i (including all its suffix hits)
void createSpawns_(const Index i, const AA fromAA, const AA toAA, ACTrieState &state, const uint32_t aaa_left, const uint32_t mm_left) const
bool followSpawn_(ACSpawn &spawn, const AA edge, ACTrieState &state) const
uint32_t getMaxAAACount() const
Maximum number of ambiguous amino acids allowed during search.
Index findChildNaive_(Index parent, AA child_label)
During needle addition (naive trie), obtain the child with edge child_label from parent; if it does n...
size_t getNeedleCount() const
How many needles were added to the trie?
void compressTrie()
Traverses the trie in BFS order and makes it more compact and efficient to traverse.
void createSubSpawns_(const ACSpawn &prototype, const AA fromAA, const AA toAA, ACTrieState &state) const
Create spawns from a spawn with an AAA or MM, using prototype as template, following edges in range f...
void createMMSpawns_(const Index i, const AA except_fromAA, const AA except_toAA, const AA except_edge, ACTrieState &state, const uint32_t aaa_left, const uint32_t mm_left) const
Same as createSpawns_, but instantiate all possible AA's except for those in the range from except_fr...
std::unordered_map< Index, std::vector< Index > > umap_index2children_naive_
maps the child nodes of a node for the naive tree; only needed during naive trie construction; storin...
Definition: AhoCorasickAmbiguous.h:474
void createMMSubSpawns_(const ACSpawn &prototype, const AA except_fromAA, const AA except_toAA, const AA except_edge, ACTrieState &state) const
Same as createSubSpawns_, but instantiate all possible AA's except for those in the range from except...
bool nextHits(ACTrieState &state) const
Definition: AhoCorasickAmbiguous.h:205
bool operator==(const Index other) const
equality operator
uint32_t T
Definition: AhoCorasickAmbiguous.h:207
bool isInvalid() const
is this Index invalid, i.e. should not be dereferenced
T pos() const
allows to read the index, using index.pos()
Index()=default
default C'tor; creates an invalid index
Index(T val)
C'tor from T.
Definition: AhoCorasickAmbiguous.h:212
bool isValid() const
is this Index valid, i.e. an actual index into a vector?
T operator()() const
convert to a number (might be invalid, check with .isValid() first)
T & pos()
allows to set the index, using index.pos() = 3; or simply read its value
#define OPENMS_PRECONDITION(condition, message)
Precondition macro.
Definition: openms/include/OpenMS/CONCEPT/Macros.h:94
const double c
Definition: Constants.h:188
static String suffix(const String &this_s, size_t length)
Definition: StringUtilsSimple.h:131
Main OpenMS namespace.
Definition: openswathalgo/include/OpenMS/OPENSWATHALGO/DATAACCESS/ISpectrumAccess.h:19
AA nextValidAA(const std::string::const_iterator end, std::string::const_iterator &it_q)
constexpr char const AAtoChar[28]
Definition: AhoCorasickAmbiguous.h:26
constexpr char const CharToAA[128]
Conversion table from 7-bit ASCII char to internal value representation for an amino acid (AA)
Definition: AhoCorasickAmbiguous.h:60
Definition: AhoCorasickAmbiguous.h:99
static size_t unambiguousAACount()
Returns the number of unambiguous amino acids (A-Z, excluding B, J, Z, X), i.e. 22.
Definition: AhoCorasickAmbiguous.h:190
constexpr bool isValid() const
is AA a letter or '$' ?
Definition: AhoCorasickAmbiguous.h:137
constexpr AA(const char c)
Definition: AhoCorasickAmbiguous.h:108
constexpr bool isAmbiguous() const
is AA a 'B', 'J', 'Z', 'X', or '$' ?
Definition: AhoCorasickAmbiguous.h:131
constexpr bool isValidForPeptide() const
is the AA a letter, i.e. A-Z or a-z?
Definition: AhoCorasickAmbiguous.h:143
constexpr AA operator++(int)
Post-increment operator (advance to next AA)
Definition: AhoCorasickAmbiguous.h:157
constexpr char toChar() const
Convert this AA to a single letter representation, i.e. 'A'..'V', 'B', 'J', 'Z', 'X',...
Definition: AhoCorasickAmbiguous.h:174
uint8_t aa_
internal representation as 1-byte integer
Definition: AhoCorasickAmbiguous.h:199
constexpr AA & operator++()
Pre-increment operator (advance to next AA)
Definition: AhoCorasickAmbiguous.h:149
static AA fromIndex(size_t index) noexcept
Definition: AhoCorasickAmbiguous.h:181
constexpr bool operator<=(const AA other) const
less-or-equal operator
Definition: AhoCorasickAmbiguous.h:125
constexpr uint8_t operator()() const
yields the internal 8bit representation
Definition: AhoCorasickAmbiguous.h:113
constexpr AA operator-(const AA rhs) const
Decrement operator.
Definition: AhoCorasickAmbiguous.h:166
constexpr AA()
Default C'tor; creates an invalid AA.
Definition: AhoCorasickAmbiguous.h:101
constexpr bool operator==(const AA other) const
equality operator
Definition: AhoCorasickAmbiguous.h:119
internal struct to steal one bit from depth to use as hit indicator
Definition: AhoCorasickAmbiguous.h:261
DepthHits()
Definition: AhoCorasickAmbiguous.h:262
uint8_t depth
depth of node in the trie
Definition: AhoCorasickAmbiguous.h:268
uint8_t has_hit
does a pattern end here (or when following suffix links)?
Definition: AhoCorasickAmbiguous.h:265
Definition: AhoCorasickAmbiguous.h:250
DepthHits depth_and_hits
depth of node in the tree and one bit if a needle ends in this node or any of its suffices
Definition: AhoCorasickAmbiguous.h:278
ACNode(const AA label, const uint8_t depth)
C'tor from an edge label (from parent to this node) and a depth in the tree.
Definition: AhoCorasickAmbiguous.h:255
uint8_t ChildCountType
Definition: AhoCorasickAmbiguous.h:271
ACNode()
Default C'tor.
Definition: AhoCorasickAmbiguous.h:252
a spin-off search path through the trie, which can deal with ambiguous AAs and mismatches
Definition: AhoCorasickAmbiguous.h:286
AA nextValidAA(const ACTrieState &state)
std::string::const_iterator it_query
position in query
Definition: AhoCorasickAmbiguous.h:300
Index tree_pos
position in trie
Definition: AhoCorasickAmbiguous.h:301
size_t textPos(const ACTrieState &state) const
Where in the text are we currently?
ACSpawn()=delete
No default C'tor.
ACSpawn(std::string::const_iterator query_pos, Index tree_pos, uint8_t max_aa, uint8_t max_mm, uint8_t max_prefix_loss)
C'tor with arguments.
Definition: AhoCorasickAmbiguous.h:314
std::vector< Hit > hits
current hits found
Definition: AhoCorasickAmbiguous.h:333
void setQuery(const std::string &haystack)
const std::string & getQuery() const
The current query.
std::string query_
current query ( = haystack = text)
Definition: AhoCorasickAmbiguous.h:338
std::string::const_iterator it_q_
position in query
Definition: AhoCorasickAmbiguous.h:339
size_t textPos() const
Where in the text are we currently?
friend ACSpawn
Definition: AhoCorasickAmbiguous.h:315
Index tree_pos
position in trie (for the master)
Definition: AhoCorasickAmbiguous.h:334
std::queue< ACSpawn > spawns
initial spawn points which are currently active and need processing
Definition: AhoCorasickAmbiguous.h:335
std::string::const_iterator textPosIt() const
Where in the text are we currently?
Definition: AhoCorasickAmbiguous.h:83
uint32_t T
Definition: AhoCorasickAmbiguous.h:84
T query_pos
Definition: AhoCorasickAmbiguous.h:89
T needle_index
Definition: AhoCorasickAmbiguous.h:86
T needle_length
Definition: AhoCorasickAmbiguous.h:88
bool operator==(const Hit &rhs) const
Definition: AhoCorasickAmbiguous.h:90
Hit()=default
Hit(T needle_index, T needle_length, T query_pos)
Definition: AhoCorasickAmbiguous.h:86
std::size_t operator()(OpenMS::Index const &s) const noexcept
Definition: AhoCorasickAmbiguous.h:239