OpenMS
IDMergerAlgorithm.h
Go to the documentation of this file.
1 // Copyright (c) 2002-present, OpenMS Inc. -- EKU Tuebingen, ETH Zurich, and FU Berlin
2 // SPDX-License-Identifier: BSD-3-Clause
3 //
4 // --------------------------------------------------------------------------
5 // $Maintainer: Julianus Pfeuffer $
6 // $Authors: Julianus Pfeuffer $
7 // --------------------------------------------------------------------------
8 
9 #pragma once
10 
18 
19 #include <map>
20 #include <unordered_set>
21 
22 namespace OpenMS
23 {
24 
25  //TODO add params for checking consistency (i.e. how strict to check)
26  //TODO add another subclass that does score-aware merging? (i.e. only keep best per peptide[sequence])
27 
55  class OPENMS_DLLAPI IDMergerAlgorithm:
56  public DefaultParamHandler,
57  public ProgressLogger
58  {
59  public:
/// @brief Constructor for the IDMergerAlgorithm.
///
/// Declared `explicit`, so a String is never implicitly converted into a merger object.
///
/// @param runIdentifier Defaults to "merged". Presumably the base identifier of the merged run
///        -- TODO confirm against the implementation.
/// @param addTimeStampToID Defaults to true. Judging by the name, controls whether a timestamp
///        is added to the generated identifier -- TODO confirm against the implementation.
68  explicit IDMergerAlgorithm (const String& runIdentifier = "merged", bool addTimeStampToID = true);
69 
84  void insertRuns(std::vector<ProteinIdentification>&& prots,
86 
/// @brief Insert runs using copy semantics.
///
/// Both arguments are taken by const reference, so the caller's containers are left
/// unmodified. An overload taking rvalue references (move semantics) is declared as well.
///
/// @param prots Protein identification runs to insert.
/// @param peps Peptide identifications to insert (presumably the ones referring to @p prots
///        -- TODO confirm against the implementation).
100  void insertRuns(const std::vector<ProteinIdentification>& prots,
101  const PeptideIdentificationList& peps);
102 
103  //TODO add methods to insert only proteins or only peptides. This especially makes sense if you re-index anyway,
104  // because then you do not need the proteins. However, we then need origin information: either passed externally
105  // as a String or StringList (like the one from ProteinID.getPrimaryMSRunPath), or by having the file annotated
106  // at the PeptideID (with getBasename maybe?).
107  // The current workaround is to clear the ProteinIdentification if you do not need the proteins and to add all the
108  // necessary information about the origin(s) to this ProteinIdentification.
109 
127 
128  private:
129 
/// @brief Generate a new identifier for the merged run.
///
/// Const member: does not modify the merger object.
///
/// @param addTimeStampToID Judging by the name, whether a timestamp becomes part of the
///        identifier -- TODO confirm against the implementation.
/// @return The identifier string.
139  String getNewIdentifier_(bool addTimeStampToID) const;
140 
150 
165  const std::vector<ProteinIdentification>& protRuns,
166  const String& experiment_type) const;
167 
183  const std::vector<ProteinIdentification>& protRuns,
184  const ProteinIdentification& ref,
185  const String& experiment_type) const;
186 
195  std::vector<ProteinIdentification>&& old_protRuns
196  );
197 
210  PeptideIdentificationList&& pepIDs,
211  const std::map<String, Size>& runID_to_runIdx,
212  const std::vector<StringList>& originFiles,
213  bool annotate_origin
214  );
215 
226  PeptideIdentificationList&& pepIDs,
227  std::vector<ProteinIdentification>&& old_protRuns
228  );
229 
232 
235 
/// @brief Hash function for protein hits based on accession.
///
/// Only the accession string is hashed; no other field of the hit is read here.
///
/// @param p The protein hit to hash.
/// @return std::hash<String> applied to p.getAccession().
242  static size_t accessionHash_(const ProteinHit& p){
243  return std::hash<String>()(p.getAccession());
244  }
245 
/// @brief Equality function for protein hits based on accession.
///
/// Two hits compare equal exactly when their accessions compare equal; no other field
/// of the hits is compared here.
///
/// @param p1 First protein hit.
/// @param p2 Second protein hit.
/// @return true if both accessions are equal, false otherwise.
253  static bool accessionEqual_(const ProteinHit& p1, const ProteinHit& p2){
254  return p1.getAccession() == p2.getAccession();
255  }
256 
/// Type alias for the hash function: a pointer to a function taking a ProteinHit by
/// const reference and returning std::size_t (the signature of accessionHash_).
258  using hash_type = std::size_t (*)(const ProteinHit&);
259 
/// Type alias for the equality function: a pointer to a function taking two ProteinHits by
/// const reference and returning bool (the signature of accessionEqual_).
261  using equal_type = bool (*)(const ProteinHit&, const ProteinHit&);
262 
/// Set of collected protein hits using custom hash and equality functions.
/// NOTE(review): hash_type and equal_type are plain function-pointer types, so the set must be
/// constructed with actual function pointers; a value-initialized pointer would be null.
/// Presumably accessionHash_ and accessionEqual_ are passed in the constructor -- TODO confirm.
264  std::unordered_set<ProteinHit, hash_type, equal_type> collected_protein_hits_;
265 
/// State flag, initialized to false.
/// NOTE(review): presumably set once the first runs have been inserted -- TODO confirm
/// against the implementation; its use is not visible in this header.
267  bool filled_ = false;
268 
/// Mapping to keep track of the mzML origins of spectra.
/// Keys are Strings and values are Sizes; going by the member name, presumably
/// origin file -> index -- TODO confirm against the implementation.
270  std::map<String, Size> file_origin_to_idx_;
271 
274 
277  };
278 } // namespace OpenMS
A base class for all classes handling default parameters.
Definition: DefaultParamHandler.h:66
Algorithm for merging multiple protein and peptide identification runs.
Definition: IDMergerAlgorithm.h:58
ProteinIdentification prot_result_
The resulting merged protein identification.
Definition: IDMergerAlgorithm.h:231
bool(*)(const ProteinHit &, const ProteinHit &) equal_type
Type alias for the equality function.
Definition: IDMergerAlgorithm.h:261
void insertRuns(std::vector< ProteinIdentification > &&prots, PeptideIdentificationList &&peps)
Insert runs using move semantics.
bool fixed_identifier_
Flag indicating whether the identifier should be fixed (i.e., not contain a timestamp)
Definition: IDMergerAlgorithm.h:276
void insertProteinIDs_(std::vector< ProteinIdentification > &&old_protRuns)
Insert protein identifications into the merged result.
bool checkOldRunConsistency_(const std::vector< ProteinIdentification > &protRuns, const ProteinIdentification &ref, const String &experiment_type) const
Check consistency of search engines and settings against a reference.
std::size_t(*)(const ProteinHit &) hash_type
Type alias for the hash function.
Definition: IDMergerAlgorithm.h:258
static size_t accessionHash_(const ProteinHit &p)
Hash function for protein hits based on accession.
Definition: IDMergerAlgorithm.h:242
static bool accessionEqual_(const ProteinHit &p1, const ProteinHit &p2)
Equality function for protein hits based on accession.
Definition: IDMergerAlgorithm.h:253
IDMergerAlgorithm(const String &runIdentifier="merged", bool addTimeStampToID=true)
Constructor for the IDMergerAlgorithm.
String getNewIdentifier_(bool addTimeStampToID) const
Generate a new identifier for the merged run.
void insertRuns(const std::vector< ProteinIdentification > &prots, const PeptideIdentificationList &peps)
Insert runs using copy semantics.
std::map< String, Size > file_origin_to_idx_
Mapping to keep track of the mzML origins of spectra.
Definition: IDMergerAlgorithm.h:270
void returnResultsAndClear(ProteinIdentification &prots, PeptideIdentificationList &peps)
Return the merged results and reset internal state.
static void copySearchParams_(const ProteinIdentification &from, ProteinIdentification &to)
Copy search parameters between protein identifications.
std::unordered_set< ProteinHit, hash_type, equal_type > collected_protein_hits_
Set of collected protein hits using custom hash and equality functions.
Definition: IDMergerAlgorithm.h:264
void movePepIDsAndRefProteinsToResultFaster_(PeptideIdentificationList &&pepIDs, std::vector< ProteinIdentification > &&old_protRuns)
Optimized method to move peptide IDs and reference proteins to result.
String id_
The new identifier string for the merged run.
Definition: IDMergerAlgorithm.h:273
void updateAndMovePepIDs_(PeptideIdentificationList &&pepIDs, const std::map< String, Size > &runID_to_runIdx, const std::vector< StringList > &originFiles, bool annotate_origin)
Update peptide ID references and move them to the result.
bool checkOldRunConsistency_(const std::vector< ProteinIdentification > &protRuns, const String &experiment_type) const
Check consistency of search engines and settings across runs.
PeptideIdentificationList pep_result_
The resulting merged peptide identifications.
Definition: IDMergerAlgorithm.h:234
Container for peptide identifications from multiple spectra.
Definition: PeptideIdentificationList.h:66
Base class for all classes that want to report their progress.
Definition: ProgressLogger.h:27
Representation of a protein hit.
Definition: ProteinHit.h:34
const String & getAccession() const
returns the accession of the protein
Representation of a protein identification run.
Definition: ProteinIdentification.h:51
A more convenient string class.
Definition: String.h:34
Main OpenMS namespace.
Definition: openswathalgo/include/OpenMS/OPENSWATHALGO/DATAACCESS/ISpectrumAccess.h:19