OpenMS
AhoCorasickAmbiguous.h
Go to the documentation of this file.
1 // Copyright (c) 2002-present, OpenMS Inc. -- EKU Tuebingen, ETH Zurich, and FU Berlin
2 // SPDX-License-Identifier: BSD-3-Clause
3 //
4 // --------------------------------------------------------------------------
5 // $Maintainer: Chris Bielow $
6 // $Authors: Chris Bielow $
7 // --------------------------------------------------------------------------
8 
9 #pragma once
10 
11 #include <OpenMS/CONCEPT/Macros.h>
12 #include <bitset>
13 #include <cassert>
14 #include <cstring>
15 #include <functional> // for std::hash
16 #include <limits>
17 #include <queue>
18 #include <string>
19 #include <cstdint>
20 #include <unordered_map>
21 #include <vector>
22 #include <bit> // for std::popcount
23 
24 namespace OpenMS
25 {
28  constexpr char const AAtoChar[28] = {
29  'A', // 00 Ala Alanine
30  'Y', // 01 Tyr Tyrosine
31  'C', // 02 Cys Cysteine
32  'D', // 03 Asp Aspartic Acid // B
33  'N', // 04 Asn Asparagine // B
34  'F', // 05 Phe Phenylalanine
35  'G', // 06 Gly Glycine
36  'H', // 07 His Histidine
37  'I', // 08 Ile Isoleucine // J
38  'L', // 09 Leu Leucine // J
39  'K', // 10 Lys Lysine
40  'W', // 11 Trp Tryptophan
41  'M', // 12 Met Methionine
42  'O', // 13 Pyl Pyrrolysine
43  'P', // 14 Pro Proline
44  'E', // 15 Glu Glutamic Acid // Z
45  'Q', // 16 Gln Glutamine // Z
46  'R', // 17 Arg Arginine
47  'S', // 18 Ser Serine
48  'T', // 19 Thr Threonine
49  'U', // 20 Selenocysteine
50  'V', // 21 Val Valine
51  // ambiguous AAs start here (index: 22...25)
52  'B', // 22 Aspartic Acid, Asparagine $ // the ambAA's need to be consecutive (B,J,Z,X,$)
53  'J', // 23 Leucine, Isoleucine $
54  'Z', // 24 Glutamic Acid, Glutamine $
55  'X', // 25 Unknown
56  // non-regular AA's, which are special
57  '$', // 26 superAA, i.e. it models a mismatch, which can be anything, including AAAs
58  '?', // 27 invalid AA (will usually be skipped) -- must be the last AA (AA::operator++ and others rely on it)
59  };
60 
62  constexpr char const CharToAA[128] = {
63  // ASCII char (7-bit Int with values from 0..127) --> amino acid
64  27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, // 0
65  27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, // 1
66  // $
67  27, 27, 27, 27, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, // 2
68  27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, // 3
69 
70  // , A, B, C, D, E, F, G, H, I, J, K, L, M, N, O,
71  27, 00, 22, 02, 03, 15, 05, 06, 07, 8, 23, 10, 9, 12, 04, 13, // 4
72 
73  // P, Q, R, S, T, U, V, W, X, Y, Z, , , , , ,
74  14, 16, 17, 18, 19, 20, 21, 11, 25, 01, 24, 27, 27, 27, 27, 27, // 5
75 
76  // , a, b, c, d, e, f, g, h, i, j, k, l, m, n, o,
77  27, 00, 22, 02, 03, 15, 05, 06, 07, 8, 23, 10, 9, 12, 04, 13, // 6
78 
79  // p, q, r, s, t, u, v, w, x, y, z, , , , , ,
80  14, 16, 17, 18, 19, 20, 21, 11, 25, 01, 24, 27, 27, 27, 27, 27, // 7
81  };
82 
85  struct OPENMS_DLLAPI Hit {
86  using T = uint32_t;
87  Hit() = default;
88  Hit(T needle_index, T needle_length, T query_pos) : needle_index(needle_index), needle_length(needle_length), query_pos(query_pos) {};
92  bool operator==(const Hit& rhs) const
93  {
94  return needle_index == rhs.needle_index && needle_length == rhs.needle_length && query_pos == rhs.query_pos;
95  }
96  };
97 
100  struct OPENMS_DLLAPI AA
101  {
103  constexpr AA() : aa_(AA('?').aa_)
104  {
105  }
106 
110  constexpr explicit AA(const char c) : aa_(CharToAA[(unsigned char)c])
111  {
112  }
113 
115  constexpr uint8_t operator()() const
116  {
117  return aa_;
118  }
119 
121  constexpr bool operator==(const AA other) const
122  {
123  return aa_ == other.aa_;
124  }
125 
127  constexpr bool operator<=(const AA other) const
128  {
129  return aa_ <= other.aa_;
130  }
131 
133  constexpr bool isAmbiguous() const
134  {
135  return aa_ >= AA('B').aa_;
136  }
137 
139  constexpr bool isValid() const
140  {
141  return aa_ != AA('?').aa_;
142  }
143 
145  constexpr bool isValidForPeptide() const
146  {
147  return aa_ <= AA('X').aa_; // '$' or '?'
148  }
149 
151  constexpr AA& operator++()
152  {
153  ++aa_;
154  assert(aa_ <= AA('?').aa_); // make sure we don't overflow
155  return *this;
156  }
157 
159  constexpr AA operator++(int)
160  {
161  AA r(*this);
162  ++aa_;
163  assert(aa_ <= AA('?').aa_); // make sure we don't overflow
164  return r;
165  }
166 
168  constexpr AA operator-(const AA rhs) const
169  {
170  AA r(*this);
171  r.aa_ -= rhs.aa_;
172  return r;
173  }
174 
176  constexpr char toChar() const
177  {
178  return AAtoChar[aa_];
179  }
180 
183  static AA fromIndex(size_t index) noexcept
184  {
185  OPENMS_PRECONDITION(index < unambiguousAACount(), "AA::fromIndex(): index must be in [0, unambiguousAACount())");
186  AA r;
187  r.aa_ = static_cast<uint8_t>(index);
188  return r;
189  }
190 
192  static size_t unambiguousAACount()
193  {
194  static_assert(AA('V').aa_ - AA('A').aa_ + 1 == 22, "Unambiguous AA count must be 22 (A-Z, excluding B, J, Z, X)");
195  static_assert(AA('A').aa_ == 0, "A must be the first AA in the enumeration");
196  static_assert(AA('V').aa_ == 21, "V must be the last unambiguous AA in the enumeration");
197  return AA('V').aa_ - AA('A').aa_ + 1;
198  }
199 
200  private:
201  uint8_t aa_;
202  };
203 
206  class OPENMS_DLLAPI Index
207  {
208  public:
209  using T = uint32_t;
211  constexpr Index() = default;
212 
214  constexpr Index(T val) : i_(val) {};
215 
217  constexpr bool isInvalid() const
218  {
219  return i_ == std::numeric_limits<T>::max();
220  }
221 
223  constexpr bool isValid() const
224  {
225  return i_ != std::numeric_limits<T>::max();
226  }
227 
229  constexpr T operator()() const
230  {
231  return i_;
232  }
233 
235  constexpr bool operator==(const Index other) const
236  {
237  return i_ == other.i_;
238  }
239 
241  constexpr T& pos()
242  {
243  return i_;
244  }
245 
247  constexpr T pos() const
248  {
249  return i_;
250  }
251  private:
252  T i_ = std::numeric_limits<T>::max();
253  };
254  } // namespace OpenMS
255 
256 // this needs to go into namespace std
257 template<> struct std::hash<OpenMS::Index>
258 {
259  std::size_t operator()(OpenMS::Index const& s) const noexcept
260  {
261  return std::hash<OpenMS::Index::T> {}(s());
262  }
263 };
264 
265 namespace OpenMS
266 {
268  struct Bitset {
269  uint32_t bits = 0;
270 
271  // Set bit at position `pos` (0-based)
272  void set(int pos) {
273  bits |= (1u << pos);
274  }
275 
276  // Clear bit at position `pos`
277  void clear(int pos) {
278  bits &= ~(1u << pos);
279  }
280 
281  // Check if bit at position `pos` is set
282  bool test(int pos) const {
283  return (bits >> pos) & 1u;
284  }
285 
286  // Left shift the bitset by `n` positions
287  Bitset operator<<(int n) const {
288  Bitset o = *this;
289  return o <<= n;
290  }
291 
292  // Left shift the bitset by `n` positions
293  Bitset& operator<<=(int n) {
294  if (n > 31)
295  { // if we shift more than 31 bits, behaviour is undefined (e.g. GCC/Clang/MSVC will not modify 'bits' if n==32)
296  bits = 0; // shift out all bits
297  }
298  else
299  {
300  bits <<= n;
301  }
302  return *this;
303  }
304 
305  // Count number of set bits (population count)
306  int pop_count() const {
307  return std::popcount(bits);
308  }
309  }; // end Bitset
310 
311 
314  struct OPENMS_DLLAPI ACNode
315  {
317  ACNode() {};
318 
320  ACNode(const AA label, const uint8_t depth) : edge(label)
321  {
322  depth_and_hits.depth = depth;
323  }
324 
326  struct DepthHits {
328  {
329  memset(this, 0, sizeof *this); // make sure bitfields are 0; C++20 allows {} initialization ...
330  };
331  uint8_t has_hit: 1;
332  // we could add another bit here to distinguish between a local hit and suffix hit, but on Windows, this slows it down
333  uint8_t depth : 7;
334  };
335 
336  using ChildCountType = uint8_t;
337 
338  // do not use std::bitset, since GCC/clang wastes 8 bytes here.
339  //std::bitset<26> children_bitset; ///< bitfield of children (if tree is in BFS order); 26 bits are enough to cover all AA incl. (B,J,X,Z)
341  Index suffix {0};
342  Index first_child {0};
343  AA edge {0};
344  ChildCountType nr_children = 0;
346  };
347 
348  // forward declaration
349  struct ACTrieState;
350 
352  struct OPENMS_DLLAPI ACScout
353  {
355  ACScout() = delete;
356 
358  ACScout(const char* query_pos, Index tree_pos, uint8_t max_aa, uint8_t max_mm, uint8_t max_prefix_loss);
359 
361  size_t textPos(const ACTrieState& state) const;
362 
366 
367  const char* it_query = 0;
369  uint8_t max_aaa_leftover {0};
370  uint8_t max_mm_leftover {0};
371  uint8_t max_prefix_loss_leftover {0};
372  };
373 
376  AA nextValidAA(const char*& it_q);
377 
380  struct OPENMS_DLLAPI ACTrieState
381  {
382  friend ACScout;
385  void setQuery(const std::string& haystack);
386 
388  size_t textPos() const;
389 
391  const char* textPosIt() const;
392 
394  const std::string& getQuery() const;
395 
399 
400  std::vector<Hit> hits;
401  std::queue<ACScout> scouts;
403  private:
404  const char* it_q_;
405  std::string query_;
406  };
407 
409  class OPENMS_DLLAPI ACTrie
410  {
411  public:
418  ACTrie(uint32_t max_aaa = 0, uint32_t max_mm = 0);
419 
422 
426  void addNeedle(const std::string& needle);
427 
431  void addNeedles(const std::vector<std::string>& needles);
432 
435  void addNeedlesAndCompress(const std::vector<std::string>& needles);
436 
437  size_t size() const
438  {
439  return trie_.size();
440  }
441 
449  void compressTrie();
450 
452  size_t getNeedleCount() const;
453 
456  void setMaxAAACount(const uint32_t max_aaa);
457 
459  uint32_t getMaxAAACount() const;
460 
463  void setMaxMMCount(const uint32_t max_mm);
464 
466  uint32_t getMaxMMCount() const;
467 
471  bool nextHits(ACTrieState& state) const;
472 
475  void getAllHits(ACTrieState& state) const;
476 
477  private:
481  bool nextHitsNoClear_(ACTrieState& state) const;
482 
486  Index add_(const Index from, const AA edge);
487 
496  bool addHits_(Index i, const size_t text_pos, std::vector<Hit>& hits) const;
497 
499  bool addHitsScout_(Index i, const ACScout& scout, const size_t text_pos, std::vector<Hit>& hits, const int current_scout_depths) const;
500 
505  Index follow_(const Index i, const AA edge) const;
506 
509  bool followScout_(ACScout& scout, const AA edge, ACTrieState& state) const;
510 
513  Index stepPrimary_(const Index i, const AA edge, ACTrieState& state) const;
514 
517  bool stepScout_(ACScout& scout, ACTrieState& state) const;
518 
521  void createScouts_(const Index i, const AA fromAA, const AA toAA, ACTrieState& state, const uint32_t aaa_left, const uint32_t mm_left) const;
522 
524  void createSubScouts_(const ACScout& prototype, const AA fromAA, const AA toAA, ACTrieState& state) const;
525 
527  void createMMScouts_(const Index i, const AA except_fromAA, const AA except_toAA, const AA except_edge, ACTrieState& state, const uint32_t aaa_left, const uint32_t mm_left) const;
528 
530  void createMMSubScouts_(const ACScout& prototype, const AA except_fromAA, const AA except_toAA, const AA except_edge, ACTrieState& state) const;
531 
533  Index findChildNaive_(Index parent, AA child_label);
534 
536  Index findChildBFS_(const Index parent, const AA child_label) const;
537 
538  std::vector<ACNode> trie_;
539  uint32_t needle_count_ {0};
540  uint32_t max_aaa_ {0};
541  uint32_t max_mm_ {0};
542 
544  std::vector<std::vector<uint32_t>> vec_index2needles_ = { {} }; // one empty element to allow capacity doubling; note: sparse vector but still much faster and less memory than unordered_map
546  std::vector<std::vector<Index>> vec_index2children_naive_ = { {} }; // one empty element to allow capacity doubling
547  };
548 
549 } // namespace OpenMS
550 
An Aho Corasick trie (a set of nodes with suffix links mainly)
Definition: AhoCorasickAmbiguous.h:410
void createMMScouts_(const Index i, const AA except_fromAA, const AA except_toAA, const AA except_edge, ACTrieState &state, const uint32_t aaa_left, const uint32_t mm_left) const
Same as createScouts_, but instantiate all possible AA's except for those in the range from except_fr...
void setMaxMMCount(const uint32_t max_mm)
uint32_t getMaxMMCount() const
Maximum number of mismatches allowed during search.
size_t size() const
Definition: AhoCorasickAmbiguous.h:437
ACTrie(uint32_t max_aaa=0, uint32_t max_mm=0)
Default C'tor which just creates a root node.
void addNeedlesAndCompress(const std::vector< std::string > &needles)
bool followScout_(ACScout &scout, const AA edge, ACTrieState &state) const
bool nextHitsNoClear_(ACTrieState &state) const
std::vector< ACNode > trie_
the trie, in either naive structure or BFS order (after compressTrie)
Definition: AhoCorasickAmbiguous.h:538
Index add_(const Index from, const AA edge)
Index stepPrimary_(const Index i, const AA edge, ACTrieState &state) const
void addNeedle(const std::string &needle)
void setMaxAAACount(const uint32_t max_aaa)
void addNeedles(const std::vector< std::string > &needles)
void createSubScouts_(const ACScout &prototype, const AA fromAA, const AA toAA, ACTrieState &state) const
Create scouts from a scout with an AAA or MM, using prototype as template, following edges in range f...
bool addHitsScout_(Index i, const ACScout &scout, const size_t text_pos, std::vector< Hit > &hits, const int current_scout_depths) const
same as addHits_, but only follows the suffix chain until the scout loses its prefix
Index follow_(const Index i, const AA edge) const
void getAllHits(ACTrieState &state) const
Index findChildBFS_(const Index parent, const AA child_label) const
After compression (BFS trie), obtain the child with edge child_label from parent; if it does not exis...
bool addHits_(Index i, const size_t text_pos, std::vector< Hit > &hits) const
Add all hits occurring in node i (including all its suffix hits)
void createMMSubScouts_(const ACScout &prototype, const AA except_fromAA, const AA except_toAA, const AA except_edge, ACTrieState &state) const
Same as createSubScouts_, but instantiate all possible AA's except for those in the range from except...
uint32_t getMaxAAACount() const
Maximum number of ambiguous amino acids allowed during search.
Index findChildNaive_(Index parent, AA child_label)
During needle addition (naive trie), obtain the child with edge child_label from parent; if it does n...
size_t getNeedleCount() const
How many needles were added to the trie?
void compressTrie()
Traverses the trie in BFS order and makes it more compact and efficient to traverse.
bool stepScout_(ACScout &scout, ACTrieState &state) const
void createScouts_(const Index i, const AA fromAA, const AA toAA, ACTrieState &state, const uint32_t aaa_left, const uint32_t mm_left) const
bool nextHits(ACTrieState &state) const
Definition: AhoCorasickAmbiguous.h:207
constexpr T operator()() const
convert to a number (might be invalid, check with .isValid() first)
Definition: AhoCorasickAmbiguous.h:229
uint32_t T
Definition: AhoCorasickAmbiguous.h:209
constexpr bool isValid() const
is this Index valid, i.e. an actual index into a vector?
Definition: AhoCorasickAmbiguous.h:223
constexpr bool isInvalid() const
is this Index invalid, i.e. should not be dereferenced
Definition: AhoCorasickAmbiguous.h:217
T i_
internal number representation; invalid state by default
Definition: AhoCorasickAmbiguous.h:252
constexpr Index()=default
default C'tor; creates an invalid index
constexpr Index(T val)
C'tor from T.
Definition: AhoCorasickAmbiguous.h:214
constexpr bool operator==(const Index other) const
equality operator
Definition: AhoCorasickAmbiguous.h:235
constexpr T & pos()
allows to set the index, using index.pos() = 3; or simply read its value
Definition: AhoCorasickAmbiguous.h:241
constexpr T pos() const
allows to read the index, using index.pos()
Definition: AhoCorasickAmbiguous.h:247
#define OPENMS_PRECONDITION(condition, message)
Precondition macro.
Definition: openms/include/OpenMS/CONCEPT/Macros.h:94
const double c
Definition: Constants.h:188
static String suffix(const String &this_s, size_t length)
Definition: StringUtilsSimple.h:131
Main OpenMS namespace.
Definition: openswathalgo/include/OpenMS/OPENSWATHALGO/DATAACCESS/ISpectrumAccess.h:19
constexpr char const AAtoChar[28]
Definition: AhoCorasickAmbiguous.h:28
constexpr char const CharToAA[128]
Conversion table from 7-bit ASCII char to internal value representation for an amino acid (AA)
Definition: AhoCorasickAmbiguous.h:62
AA nextValidAA(const char *&it_q)
Definition: AhoCorasickAmbiguous.h:101
static size_t unambiguousAACount()
Returns the number of unambiguous amino acids (A-Z, excluding B, J, Z, X), i.e. 22.
Definition: AhoCorasickAmbiguous.h:192
constexpr bool isValid() const
is AA a letter or '$' ?
Definition: AhoCorasickAmbiguous.h:139
constexpr AA(const char c)
Definition: AhoCorasickAmbiguous.h:110
constexpr bool isAmbiguous() const
is AA a 'B', 'J', 'Z', 'X', or '$' ?
Definition: AhoCorasickAmbiguous.h:133
constexpr bool isValidForPeptide() const
is the AA a letter, i.e. A-Z or a-z?
Definition: AhoCorasickAmbiguous.h:145
constexpr AA operator++(int)
Post-increment operator (advance to next AA)
Definition: AhoCorasickAmbiguous.h:159
constexpr char toChar() const
Convert this AA to a single letter representation, i.e. 'A'..'V', 'B', 'J', 'Z', 'X',...
Definition: AhoCorasickAmbiguous.h:176
uint8_t aa_
internal representation as 1-byte integer
Definition: AhoCorasickAmbiguous.h:201
constexpr AA & operator++()
Pre-increment operator (advance to next AA)
Definition: AhoCorasickAmbiguous.h:151
static AA fromIndex(size_t index) noexcept
Definition: AhoCorasickAmbiguous.h:183
constexpr bool operator<=(const AA other) const
less-or-equal operator
Definition: AhoCorasickAmbiguous.h:127
constexpr uint8_t operator()() const
yields the internal 8bit representation
Definition: AhoCorasickAmbiguous.h:115
constexpr AA operator-(const AA rhs) const
Decrement operator.
Definition: AhoCorasickAmbiguous.h:168
constexpr AA()
Default C'tor; creates an invalid AA.
Definition: AhoCorasickAmbiguous.h:103
constexpr bool operator==(const AA other) const
equality operator
Definition: AhoCorasickAmbiguous.h:121
internal struct to steal one bit from depth to use as hit indicator
Definition: AhoCorasickAmbiguous.h:326
DepthHits()
Definition: AhoCorasickAmbiguous.h:327
uint8_t depth
depth of node in the trie
Definition: AhoCorasickAmbiguous.h:333
uint8_t has_hit
does a pattern end here (or when following suffix links)?
Definition: AhoCorasickAmbiguous.h:330
Definition: AhoCorasickAmbiguous.h:315
Bitset children_bitset
bitfield of children (if tree is in BFS order); 26 bits are enough to cover all AA incl....
Definition: AhoCorasickAmbiguous.h:340
DepthHits depth_and_hits
depth of node in the tree and one bit if a needle ends in this node or any of its suffices
Definition: AhoCorasickAmbiguous.h:345
ACNode(const AA label, const uint8_t depth)
C'tor from an edge label (from parent to this node) and a depth in the tree.
Definition: AhoCorasickAmbiguous.h:320
uint8_t ChildCountType
Definition: AhoCorasickAmbiguous.h:336
ACNode()
Default C'tor.
Definition: AhoCorasickAmbiguous.h:317
a spin-off search path through the trie, which can deal with ambiguous AAs and mismatches
Definition: AhoCorasickAmbiguous.h:353
ACScout()=delete
No default C'tor.
Index tree_pos
position in trie
Definition: AhoCorasickAmbiguous.h:368
ACScout(const char *query_pos, Index tree_pos, uint8_t max_aa, uint8_t max_mm, uint8_t max_prefix_loss)
C'tor with arguments.
size_t textPos(const ACTrieState &state) const
Where in the text are we currently?
Definition: AhoCorasickAmbiguous.h:381
std::vector< Hit > hits
current hits found
Definition: AhoCorasickAmbiguous.h:400
void setQuery(const std::string &haystack)
const std::string & getQuery() const
The current query.
std::string query_
current query ( = haystack = text)
Definition: AhoCorasickAmbiguous.h:405
size_t textPos() const
Where in the text are we currently?
Index tree_pos
position in trie (for the Primary)
Definition: AhoCorasickAmbiguous.h:402
std::queue< ACScout > scouts
initial scout points which are currently active and need processing
Definition: AhoCorasickAmbiguous.h:401
const char * textPosIt() const
Where in the text are we currently?
friend ACScout
Definition: AhoCorasickAmbiguous.h:382
const char * it_q_
position in query
Definition: AhoCorasickAmbiguous.h:404
Custom bitset which uses a 32-bit integer to store bits (instead of 8 bytes for std::bitset<32> on Cl...
Definition: AhoCorasickAmbiguous.h:268
Bitset operator<<(int n) const
Definition: AhoCorasickAmbiguous.h:287
void clear(int pos)
Definition: AhoCorasickAmbiguous.h:277
int pop_count() const
Definition: AhoCorasickAmbiguous.h:306
bool test(int pos) const
Definition: AhoCorasickAmbiguous.h:282
void set(int pos)
Definition: AhoCorasickAmbiguous.h:272
Bitset & operator<<=(int n)
Definition: AhoCorasickAmbiguous.h:293
uint32_t bits
Definition: AhoCorasickAmbiguous.h:269
Definition: AhoCorasickAmbiguous.h:85
uint32_t T
Definition: AhoCorasickAmbiguous.h:86
T query_pos
Definition: AhoCorasickAmbiguous.h:91
T needle_index
Definition: AhoCorasickAmbiguous.h:88
T needle_length
Definition: AhoCorasickAmbiguous.h:90
bool operator==(const Hit &rhs) const
Definition: AhoCorasickAmbiguous.h:92
Hit()=default
Hit(T needle_index, T needle_length, T query_pos)
Definition: AhoCorasickAmbiguous.h:88
std::size_t operator()(OpenMS::Index const &s) const noexcept
Definition: AhoCorasickAmbiguous.h:259