20 #include <unordered_map>
64 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
65 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
67 27, 27, 27, 27, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
68 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
71 27, 00, 22, 02, 03, 15, 05, 06, 07, 8, 23, 10, 9, 12, 04, 13,
74 14, 16, 17, 18, 19, 20, 21, 11, 25, 01, 24, 27, 27, 27, 27, 27,
77 27, 00, 22, 02, 03, 15, 05, 06, 07, 8, 23, 10, 9, 12, 04, 13,
80 14, 16, 17, 18, 19, 20, 21, 11, 25, 01, 24, 27, 27, 27, 27, 27,
85 struct OPENMS_DLLAPI
Hit {
88 Hit(
T needle_index,
T needle_length,
T query_pos) : needle_index(needle_index), needle_length(needle_length), query_pos(query_pos) {};
100 struct OPENMS_DLLAPI
AA
103 constexpr
AA() : aa_(
AA(
'?').aa_)
123 return aa_ == other.
aa_;
129 return aa_ <= other.
aa_;
135 return aa_ >=
AA(
'B').
aa_;
141 return aa_ !=
AA(
'?').
aa_;
147 return aa_ <=
AA(
'X').
aa_;
154 assert(aa_ <=
AA(
'?').aa_);
163 assert(aa_ <=
AA(
'?').aa_);
185 OPENMS_PRECONDITION(index < unambiguousAACount(),
"AA::fromIndex(): index must be in [0, unambiguousAACount())");
187 r.
aa_ =
static_cast<uint8_t
>(index);
194 static_assert(
AA(
'V').aa_ -
AA(
'A').aa_ + 1 == 22,
"Unambiguous AA count must be 22 (A-Z, excluding B, J, Z, X)");
195 static_assert(
AA(
'A').aa_ == 0,
"A must be the first AA in the enumeration");
196 static_assert(
AA(
'V').aa_ == 21,
"V must be the last unambiguous AA in the enumeration");
219 return i_ == std::numeric_limits<T>::max();
225 return i_ != std::numeric_limits<T>::max();
237 return i_ == other.
i_;
252 T i_ = std::numeric_limits<T>::max();
257 template<>
struct std::hash<
OpenMS::Index>
261 return std::hash<OpenMS::Index::T> {}(s());
278 bits &= ~(1u << pos);
283 return (
bits >> pos) & 1u;
307 return std::popcount(
bits);
320 ACNode(
const AA label,
const uint8_t depth) : edge(label)
322 depth_and_hits.depth = depth;
329 memset(
this, 0,
sizeof *
this);
358 ACScout(
const char* query_pos,
Index tree_pos, uint8_t max_aa, uint8_t max_mm, uint8_t max_prefix_loss);
367 const char* it_query = 0;
369 uint8_t max_aaa_leftover {0};
370 uint8_t max_mm_leftover {0};
371 uint8_t max_prefix_loss_leftover {0};
418 ACTrie(uint32_t max_aaa = 0, uint32_t max_mm = 0);
496 bool addHits_(
Index i,
const size_t text_pos, std::vector<Hit>& hits)
const;
539 uint32_t needle_count_ {0};
540 uint32_t max_aaa_ {0};
541 uint32_t max_mm_ {0};
544 std::vector<std::vector<uint32_t>> vec_index2needles_ = { {} };
546 std::vector<std::vector<Index>> vec_index2children_naive_ = { {} };
An Aho Corasick trie (a set of nodes with suffix links mainly)
Definition: AhoCorasickAmbiguous.h:410
void createMMScouts_(const Index i, const AA except_fromAA, const AA except_toAA, const AA except_edge, ACTrieState &state, const uint32_t aaa_left, const uint32_t mm_left) const
Same as createScouts_, but instantiate all possible AA's except for those in the range from except_fr...
void setMaxMMCount(const uint32_t max_mm)
uint32_t getMaxMMCount() const
Maximum number of mismatches allowed during search.
size_t size() const
Definition: AhoCorasickAmbiguous.h:437
ACTrie(uint32_t max_aaa=0, uint32_t max_mm=0)
Default C'tor which just creates a root node.
void addNeedlesAndCompress(const std::vector< std::string > &needles)
bool followScout_(ACScout &scout, const AA edge, ACTrieState &state) const
bool nextHitsNoClear_(ACTrieState &state) const
std::vector< ACNode > trie_
the trie, in either naive structure or BFS order (after compressTrie)
Definition: AhoCorasickAmbiguous.h:538
Index add_(const Index from, const AA edge)
Index stepPrimary_(const Index i, const AA edge, ACTrieState &state) const
void addNeedle(const std::string &needle)
void setMaxAAACount(const uint32_t max_aaa)
void addNeedles(const std::vector< std::string > &needles)
void createSubScouts_(const ACScout &prototype, const AA fromAA, const AA toAA, ACTrieState &state) const
Create scouts from a scout with an AAA or MM, using prototype as template, following edges in range f...
bool addHitsScout_(Index i, const ACScout &scout, const size_t text_pos, std::vector< Hit > &hits, const int current_scout_depths) const
same as addHits_, but only follows the suffix chain until the scout loses its prefix
Index follow_(const Index i, const AA edge) const
void getAllHits(ACTrieState &state) const
Index findChildBFS_(const Index parent, const AA child_label) const
After compression (BFS trie), obtain the child with edge child_label from parent; if it does not exis...
bool addHits_(Index i, const size_t text_pos, std::vector< Hit > &hits) const
Add all hits occurring in node i (including all its suffix hits)
void createMMSubScouts_(const ACScout &prototype, const AA except_fromAA, const AA except_toAA, const AA except_edge, ACTrieState &state) const
Same as createSubScouts_, but instantiate all possible AA's except for those in the range from except...
uint32_t getMaxAAACount() const
Maximum number of ambiguous amino acids allowed during search.
Index findChildNaive_(Index parent, AA child_label)
During needle addition (naive trie), obtain the child with edge child_label from parent; if it does n...
size_t getNeedleCount() const
How many needles were added to the trie?
void compressTrie()
Traverses the trie in BFS order and makes it more compact and efficient to traverse.
bool stepScout_(ACScout &scout, ACTrieState &state) const
void createScouts_(const Index i, const AA fromAA, const AA toAA, ACTrieState &state, const uint32_t aaa_left, const uint32_t mm_left) const
bool nextHits(ACTrieState &state) const
Definition: AhoCorasickAmbiguous.h:207
constexpr T operator()() const
convert to a number (might be invalid, check with .isValid() first)
Definition: AhoCorasickAmbiguous.h:229
uint32_t T
Definition: AhoCorasickAmbiguous.h:209
constexpr bool isValid() const
is this Index valid, i.e. an actual index into a vector?
Definition: AhoCorasickAmbiguous.h:223
constexpr bool isInvalid() const
is this Index invalid, i.e. should not be dereferenced
Definition: AhoCorasickAmbiguous.h:217
T i_
internal number representation; invalid state by default
Definition: AhoCorasickAmbiguous.h:252
constexpr Index()=default
default C'tor; creates an invalid index
constexpr Index(T val)
C'tor from T.
Definition: AhoCorasickAmbiguous.h:214
constexpr bool operator==(const Index other) const
equality operator
Definition: AhoCorasickAmbiguous.h:235
constexpr T & pos()
allows to set the index, using index.pos() = 3; or simply read its value
Definition: AhoCorasickAmbiguous.h:241
constexpr T pos() const
allows to read the index, using index.pos()
Definition: AhoCorasickAmbiguous.h:247
#define OPENMS_PRECONDITION(condition, message)
Precondition macro.
Definition: openms/include/OpenMS/CONCEPT/Macros.h:94
const double c
Definition: Constants.h:188
static String suffix(const String &this_s, size_t length)
Definition: StringUtilsSimple.h:131
Main OpenMS namespace.
Definition: openswathalgo/include/OpenMS/OPENSWATHALGO/DATAACCESS/ISpectrumAccess.h:19
constexpr char const AAtoChar[28]
Definition: AhoCorasickAmbiguous.h:28
constexpr char const CharToAA[128]
Conversion table from 7-bit ASCII char to internal value representation for an amino acid (AA)
Definition: AhoCorasickAmbiguous.h:62
AA nextValidAA(const char *&it_q)
Definition: AhoCorasickAmbiguous.h:101
static size_t unambiguousAACount()
Returns the number of unambiguous amino acids (A-Z, excluding B, J, Z, X), i.e. 22.
Definition: AhoCorasickAmbiguous.h:192
constexpr bool isValid() const
is AA a letter or '$' ?
Definition: AhoCorasickAmbiguous.h:139
constexpr AA(const char c)
Definition: AhoCorasickAmbiguous.h:110
constexpr bool isAmbiguous() const
is AA a 'B', 'J', 'Z', 'X', or '$' ?
Definition: AhoCorasickAmbiguous.h:133
constexpr bool isValidForPeptide() const
is the AA a letter, i.e. A-Z or a-z?
Definition: AhoCorasickAmbiguous.h:145
constexpr AA operator++(int)
Post-increment operator (advance to next AA)
Definition: AhoCorasickAmbiguous.h:159
constexpr char toChar() const
Convert this AA to a single letter representation, i.e. 'A'..'V', 'B', 'J', 'Z', 'X',...
Definition: AhoCorasickAmbiguous.h:176
uint8_t aa_
internal representation as 1-byte integer
Definition: AhoCorasickAmbiguous.h:201
constexpr AA & operator++()
Pre-increment operator (advance to next AA)
Definition: AhoCorasickAmbiguous.h:151
static AA fromIndex(size_t index) noexcept
Definition: AhoCorasickAmbiguous.h:183
constexpr bool operator<=(const AA other) const
less-or-equal operator
Definition: AhoCorasickAmbiguous.h:127
constexpr uint8_t operator()() const
yields the internal 8bit representation
Definition: AhoCorasickAmbiguous.h:115
constexpr AA operator-(const AA rhs) const
Decrement operator.
Definition: AhoCorasickAmbiguous.h:168
constexpr AA()
Default C'tor; creates an invalid AA.
Definition: AhoCorasickAmbiguous.h:103
constexpr bool operator==(const AA other) const
equality operator
Definition: AhoCorasickAmbiguous.h:121
internal struct to steal one bit from depth to use as hit indicator
Definition: AhoCorasickAmbiguous.h:326
DepthHits()
Definition: AhoCorasickAmbiguous.h:327
uint8_t depth
depth of node in the trie
Definition: AhoCorasickAmbiguous.h:333
uint8_t has_hit
does a pattern end here (or when following suffix links)?
Definition: AhoCorasickAmbiguous.h:330
Definition: AhoCorasickAmbiguous.h:315
Bitset children_bitset
bitfield of children (if tree is in BFS order); 26 bits are enough to cover all AA incl....
Definition: AhoCorasickAmbiguous.h:340
DepthHits depth_and_hits
depth of node in the tree and one bit if a needle ends in this node or any of its suffices
Definition: AhoCorasickAmbiguous.h:345
ACNode(const AA label, const uint8_t depth)
C'tor from an edge label (from parent to this node) and a depth in the tree.
Definition: AhoCorasickAmbiguous.h:320
uint8_t ChildCountType
Definition: AhoCorasickAmbiguous.h:336
ACNode()
Default C'tor.
Definition: AhoCorasickAmbiguous.h:317
a spin-off search path through the trie, which can deal with ambiguous AAs and mismatches
Definition: AhoCorasickAmbiguous.h:353
ACScout()=delete
No default C'tor.
Index tree_pos
position in trie
Definition: AhoCorasickAmbiguous.h:368
ACScout(const char *query_pos, Index tree_pos, uint8_t max_aa, uint8_t max_mm, uint8_t max_prefix_loss)
C'tor with arguments.
size_t textPos(const ACTrieState &state) const
Where in the text are we currently?
Definition: AhoCorasickAmbiguous.h:381
std::vector< Hit > hits
current hits found
Definition: AhoCorasickAmbiguous.h:400
void setQuery(const std::string &haystack)
const std::string & getQuery() const
The current query.
std::string query_
current query ( = haystack = text)
Definition: AhoCorasickAmbiguous.h:405
size_t textPos() const
Where in the text are we currently?
Index tree_pos
position in trie (for the Primary)
Definition: AhoCorasickAmbiguous.h:402
std::queue< ACScout > scouts
initial scout points which are currently active and need processing
Definition: AhoCorasickAmbiguous.h:401
const char * textPosIt() const
Where in the text are we currently?
friend ACScout
Definition: AhoCorasickAmbiguous.h:382
const char * it_q_
position in query
Definition: AhoCorasickAmbiguous.h:404
Custom bitset which uses a 32-bit integer to store bits (instead of 8 bytes for std::bitset<32> on Cl...
Definition: AhoCorasickAmbiguous.h:268
Bitset operator<<(int n) const
Definition: AhoCorasickAmbiguous.h:287
void clear(int pos)
Definition: AhoCorasickAmbiguous.h:277
int pop_count() const
Definition: AhoCorasickAmbiguous.h:306
bool test(int pos) const
Definition: AhoCorasickAmbiguous.h:282
void set(int pos)
Definition: AhoCorasickAmbiguous.h:272
Bitset & operator<<=(int n)
Definition: AhoCorasickAmbiguous.h:293
uint32_t bits
Definition: AhoCorasickAmbiguous.h:269
Definition: AhoCorasickAmbiguous.h:85
uint32_t T
Definition: AhoCorasickAmbiguous.h:86
T query_pos
Definition: AhoCorasickAmbiguous.h:91
T needle_index
Definition: AhoCorasickAmbiguous.h:88
T needle_length
Definition: AhoCorasickAmbiguous.h:90
bool operator==(const Hit &rhs) const
Definition: AhoCorasickAmbiguous.h:92
Hit(T needle_index, T needle_length, T query_pos)
Definition: AhoCorasickAmbiguous.h:88
std::size_t operator()(OpenMS::Index const &s) const noexcept
Definition: AhoCorasickAmbiguous.h:259