OpenMS
DBSuitability.h
Go to the documentation of this file.
1 // Copyright (c) 2002-present, OpenMS Inc. -- EKU Tuebingen, ETH Zurich, and FU Berlin
2 // SPDX-License-Identifier: BSD-3-Clause
3 //
4 // --------------------------------------------------------------------------
5 // $Maintainer: Tom Waschischeck $
6 // $Authors: Tom Waschischeck $
7 // --------------------------------------------------------------------------
8 
9 #pragma once
10 
11 #include <OpenMS/CONCEPT/Types.h>
16 
17 #include <cfloat>
18 #include <vector>
19 
20 #include <boost/regex.hpp>
21 
22 namespace OpenMS
23 {
// Forward declarations. Within this listing PeptideIdentification, PeptideHit
// and MSExperiment are only used as (const) reference parameters, so the full
// definitions are not required here. ParamXMLFile is not referenced in the
// visible part of this header.
24  class ParamXMLFile;
25  class PeptideIdentification;
26  class PeptideHit;
27  class MSExperiment;
28 
// This class holds the functionality of calculating the database suitability.
// It derives publicly from DefaultParamHandler and stores its results as a
// vector of SuitabilityData, accessible through getResults().
//
// NOTE(review): this is a numbered source listing whose original doc comments
// are absent and whose embedded line numbers have gaps. The member index at the
// end of this file lists SuitabilityData::simulateNoReRanking() and
// DBSuitability::calculateSuitability_(), but neither is declared below (see
// the gaps between listing lines 100/110 and 337/355). No constructor
// declaration is visible either (gap between 129 and 134). Verify against the
// upstream header before relying on this listing.
46  class OPENMS_DLLAPI DBSuitability:
47  public DefaultParamHandler
48  {
49  public:
    /// struct to store results
51  struct OPENMS_DLLAPI SuitabilityData
52  {
    // Counters below all start at 0. Their exact meaning is not documented in
    // this listing; the names suggest hit counts (top de novo hits, top
    // database hits, hits of interest, re-ranked hits) — TODO confirm.
54  Size num_top_novo = 0;
55 
57  Size num_top_db = 0;
58 
60  Size num_interest = 0;
61 
64  Size num_re_ranked = 0;
65 
    // Initialised to DBL_MAX. Presumably the decoy-based cut-off produced by
    // getDecoyCutOff_() — TODO confirm.
68  double cut_off = DBL_MAX;
69 
    // Suitability values, all initialised to 0. The "_no_rerank" and
    // "_corr_no_rerank" variants presumably hold the (corrected) value without
    // re-ranking — TODO confirm against the implementation.
79  double suitability = 0;
80 
83  double suitability_no_rerank = 0;
84 
86  double suitability_corr_no_rerank = 0;
87 
88  // resets all members to their defaults
89  void clear();
90 
    // Accessors for the private correction state (corr_factor,
    // num_top_novo_corr, suitability_corr). Which setter/getter touches which
    // member is not visible here — see the implementation.
93  void setCorrectionFactor(double factor);
94 
95  double getCorrectionFactor() const;
96 
97  double getCorrectedNovoHits() const;
98 
99  double getCorrectedSuitability() const;
100 
    // NOTE(review): the member index lists
    // "SuitabilityData simulateNoReRanking() const" ("Returns a SuitabilityData
    // object containing the data if re-ranking didn't happen"), but no such
    // declaration appears in this listing; it may have been lost here.
110 
111  private:
    // Initialised to -1; presumably a sentinel for "no correction factor set
    // yet" — TODO confirm.
117  double corr_factor = -1;
118 
120  double num_top_novo_corr = 0;
121 
127  double suitability_corr = 0;
128  };
129 
134 
    /// Destructor.
136  ~DBSuitability() override = default;
137 
    /// Grants DBSuitability_friend (the test helper defined further down in this
    /// header) access to the private member functions.
139  friend class DBSuitability_friend;
140 
    /// Computes suitability of a database used to search a mzML.
    /// @p pep_ids is taken by rvalue reference, so callers must pass a
    /// temporary or std::move their list.
204  void compute(PeptideIdentificationList&& pep_ids, const MSExperiment& exp, const std::vector<FASTAFile::FASTAEntry>& original_fasta, const std::vector<FASTAFile::FASTAEntry>& novo_fasta, const ProteinIdentification::SearchParameters& search_params);
205 
    /// Returns results calculated by this metric.
215  const std::vector<SuitabilityData>& getResults() const;
216 
217  private:
    /// result vector
219  std::vector<SuitabilityData> results_;
220 
    /// pattern for finding a decoy string
222  const boost::regex decoy_pattern_;
223 
    /// Calculates the xcorr difference between the top two hits marked as decoy.
238  double getDecoyDiff_(const PeptideIdentification& pep_id) const;
239 
    /// Calculates a xcorr cut-off based on decoy hits.
254  double getDecoyCutOff_(const PeptideIdentificationList& pep_ids, double reranking_cutoff_percentile) const;
255 
    /// Tests if a PeptideHit is considered a deNovo hit.
269  bool isNovoHit_(const PeptideHit& hit) const;
270 
    /// Tests if a PeptideHit has a score better than the given threshold.
279  bool checkScoreBetterThanThreshold_(const PeptideHit& hit, double threshold, bool higher_score_better) const;
280 
    /// Looks through meta values of SearchParameters to find out which search
    /// adapter was used.
291  std::pair<String, Param> extractSearchAdapterInfoFromMetaValues_(const ProteinIdentification::SearchParameters& meta_values) const;
292 
    /// Writes parameters into a given file.
300  void writeIniFile_(const Param& parameters, const String& filename) const;
301 
    /// Executes the workflow from search adapter, followed by PeptideIndexer and
    /// finishes with FDR.
324  PeptideIdentificationList runIdentificationSearch_(const MSExperiment& exp, const std::vector<FASTAFile::FASTAEntry>& fasta_data, const String& adapter_name, Param& parameters) const;
325 
    /// Creates a subsampled fasta with the given subsampling rate.
336  std::vector<FASTAFile::FASTAEntry> getSubsampledFasta_(const std::vector<FASTAFile::FASTAEntry>& fasta_data, double subsampling_rate) const;
337 
    // NOTE(review): the member index lists
    // "void calculateSuitability_(const PeptideIdentificationList& pep_ids,
    // SuitabilityData& data) const" ("Calculates all suitability data from a
    // combined deNovo+database search"), but no such declaration appears in
    // this listing; it may have been lost here.
355 
    /// Calculates and appends decoys to a given vector of FASTAEntry.
365  void appendDecoys_(std::vector<FASTAFile::FASTAEntry>& fasta) const;
366 
    /// Returns the cross correlation score normalized by MW (if existing).
    /// The fallback behaviour is only described in a truncated summary in the
    /// member index — see the implementation.
374  double extractScore_(const PeptideHit& pep_hit) const;
375 
    /// Calculates the correction factor from two suitability calculations.
389  double calculateCorrectionFactor_(const SuitabilityData& data, const SuitabilityData& data_sampled, double sampling_rate) const;
390 
    /// Determines the number of unique proteins found in the protein accessions
    /// of PeptideIdentifications. @p number_of_hits defaults to 1.
399  UInt numberOfUniqueProteins_(const PeptideIdentificationList& peps, UInt number_of_hits = 1) const;
400 
    /// Finds the SuitabilityData object with the median number of de novo hits.
409  Size getIndexWithMedianNovoHits_(const std::vector<SuitabilityData>& data) const;
410 
    /// Extracts the worst score that still passes a FDR (q-value) threshold.
427  double getScoreMatchingFDR_(const PeptideIdentificationList& pep_ids, double FDR, const String& score_name, bool higher_score_better) const;
428  };
429 
430  // friend class to test private member functions
    // Each public wrapper forwards its arguments unchanged to the private
    // DBSuitability member function of the same name (plus trailing underscore),
    // called on the owned instance suit_.
    // NOTE(review): listing lines 431 (class header), 453 (numberOfUniqueProteins
    // signature) and 480 (suit_ member) were missing from this listing and have
    // been filled in from the member index at the end of this file. Listing
    // line 436 is also absent; its content cannot be recovered from this file.
431  class DBSuitability_friend
432  {
433  public:
434  DBSuitability_friend() = default;
435 
437 
    /// Forwards to DBSuitability::getSubsampledFasta_().
438  std::vector<FASTAFile::FASTAEntry> getSubsampledFasta(const std::vector<FASTAFile::FASTAEntry>& fasta_data, double subsampling_rate)
439  {
440  return suit_.getSubsampledFasta_(fasta_data, subsampling_rate);
441  }
442 
    /// Forwards to DBSuitability::appendDecoys_(); @p fasta is modified in place.
443  void appendDecoys(std::vector<FASTAFile::FASTAEntry>& fasta)
444  {
445  suit_.appendDecoys_(fasta);
446  }
447 
    /// Forwards to DBSuitability::calculateCorrectionFactor_().
448  double calculateCorrectionFactor(const DBSuitability::SuitabilityData& data, const DBSuitability::SuitabilityData& data_sampled, double sampling_rate)
449  {
450  return suit_.calculateCorrectionFactor_(data, data_sampled, sampling_rate);
451  }
452 
    /// Forwards to DBSuitability::numberOfUniqueProteins_(); @p number_of_hits
    /// defaults to 1, matching the wrapped function.
453  UInt numberOfUniqueProteins(const PeptideIdentificationList& peps, UInt number_of_hits = 1)
454  {
455  return suit_.numberOfUniqueProteins_(peps, number_of_hits);
456  }
457 
    /// Forwards to DBSuitability::getIndexWithMedianNovoHits_().
458  Size getIndexWithMedianNovoHits(const std::vector<DBSuitability::SuitabilityData>& data)
459  {
460  return suit_.getIndexWithMedianNovoHits_(data);
461  }
462 
    /// Forwards to DBSuitability::getScoreMatchingFDR_().
    /// @p score_name is taken by const reference (the wrapped function takes a
    /// const String& as well), which avoids copying the string on every call.
463  double getScoreMatchingFDR(const PeptideIdentificationList& pep_ids, double FDR, const String& score_name, bool higher_score_better)
464  {
465  return suit_.getScoreMatchingFDR_(pep_ids, FDR, score_name, higher_score_better);
466  }
467 
468  /* Not tested:
469  getDecoyDiff_, getDecoyCutOff_, isNovoHit_, checkScoreBetterThanThreshold_
470  Reason: These functions are essential to the normal suitability calculation and if something would not work, the test for 'compute' would fail.
471 
472  extractSearchAdapterInfoFromMetaValues_, writeIniFile_, extractScore_
473  Reason: These functions are very straightforward.
474 
475  runIdentificationSearch_
476  Reason: This function simulates a whole workflow and testing it would be too complicated.
477  */
478 
479  private:
    /// The DBSuitability instance all wrappers operate on.
480  DBSuitability suit_;
481  };
482 }
483 
Definition: DBSuitability.h:432
std::vector< FASTAFile::FASTAEntry > getSubsampledFasta(const std::vector< FASTAFile::FASTAEntry > &fasta_data, double subsampling_rate)
Definition: DBSuitability.h:438
double calculateCorrectionFactor(const DBSuitability::SuitabilityData &data, const DBSuitability::SuitabilityData &data_sampled, double sampling_rate)
Definition: DBSuitability.h:448
DBSuitability suit_
Definition: DBSuitability.h:480
UInt numberOfUniqueProteins(const PeptideIdentificationList &peps, UInt number_of_hits=1)
Definition: DBSuitability.h:453
Size getIndexWithMedianNovoHits(const std::vector< DBSuitability::SuitabilityData > &data)
Definition: DBSuitability.h:458
double getScoreMatchingFDR(const PeptideIdentificationList &pep_ids, double FDR, String score_name, bool higher_score_better)
Definition: DBSuitability.h:463
void appendDecoys(std::vector< FASTAFile::FASTAEntry > &fasta)
Definition: DBSuitability.h:443
This class holds the functionality of calculating the database suitability.
Definition: DBSuitability.h:48
std::vector< FASTAFile::FASTAEntry > getSubsampledFasta_(const std::vector< FASTAFile::FASTAEntry > &fasta_data, double subsampling_rate) const
Creates a subsampled fasta with the given subsampling rate.
bool checkScoreBetterThanThreshold_(const PeptideHit &hit, double threshold, bool higher_score_better) const
Tests if a PeptideHit has a score better than the given threshold.
Size getIndexWithMedianNovoHits_(const std::vector< SuitabilityData > &data) const
Finds the SuitabilityData object with the median number of de novo hits.
void writeIniFile_(const Param &parameters, const String &filename) const
Writes parameters into a given file.
std::pair< String, Param > extractSearchAdapterInfoFromMetaValues_(const ProteinIdentification::SearchParameters &meta_values) const
Looks through meta values of SearchParameters to find out which search adapter was used.
void compute(PeptideIdentificationList &&pep_ids, const MSExperiment &exp, const std::vector< FASTAFile::FASTAEntry > &original_fasta, const std::vector< FASTAFile::FASTAEntry > &novo_fasta, const ProteinIdentification::SearchParameters &search_params)
Computes suitability of a database used to search a mzML.
PeptideIdentificationList runIdentificationSearch_(const MSExperiment &exp, const std::vector< FASTAFile::FASTAEntry > &fasta_data, const String &adapter_name, Param &parameters) const
Executes the workflow from search adapter, followed by PeptideIndexer and finishes with FDR.
~DBSuitability() override=default
Destructor.
double calculateCorrectionFactor_(const SuitabilityData &data, const SuitabilityData &data_sampled, double sampling_rate) const
Calculates the correction factor from two suitability calculations.
UInt numberOfUniqueProteins_(const PeptideIdentificationList &peps, UInt number_of_hits=1) const
Determines the number of unique proteins found in the protein accessions of PeptideIdentifications.
const std::vector< SuitabilityData > & getResults() const
Returns results calculated by this metric.
double getScoreMatchingFDR_(const PeptideIdentificationList &pep_ids, double FDR, const String &score_name, bool higher_score_better) const
Extracts the worst score that still passes a FDR (q-value) threshold.
std::vector< SuitabilityData > results_
result vector
Definition: DBSuitability.h:219
double getDecoyCutOff_(const PeptideIdentificationList &pep_ids, double reranking_cutoff_percentile) const
Calculates a xcorr cut-off based on decoy hits.
void appendDecoys_(std::vector< FASTAFile::FASTAEntry > &fasta) const
Calculates and appends decoys to a given vector of FASTAEntry.
void calculateSuitability_(const PeptideIdentificationList &pep_ids, SuitabilityData &data) const
Calculates all suitability data from a combined deNovo+database search.
bool isNovoHit_(const PeptideHit &hit) const
Tests if a PeptideHit is considered a deNovo hit.
const boost::regex decoy_pattern_
pattern for finding a decoy string
Definition: DBSuitability.h:222
double extractScore_(const PeptideHit &pep_hit) const
Returns the cross correlation score normalized by MW (if existing), else if the 'force' flag is set t...
double getDecoyDiff_(const PeptideIdentification &pep_id) const
Calculates the xcorr difference between the top two hits marked as decoy.
A base class for all classes handling default parameters.
Definition: DefaultParamHandler.h:66
In-Memory representation of a mass spectrometry run.
Definition: MSExperiment.h:49
Management and storage of parameters / INI files.
Definition: Param.h:44
Represents a single spectrum match (candidate) for a specific tandem mass spectrum (MS/MS).
Definition: PeptideHit.h:50
Container for peptide identifications from multiple spectra.
Definition: PeptideIdentificationList.h:66
Represents the set of candidates (SpectrumMatches) identified for a single precursor spectrum.
Definition: PeptideIdentification.h:63
A more convenient string class.
Definition: String.h:34
unsigned int UInt
Unsigned integer type.
Definition: Types.h:64
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:97
Main OpenMS namespace.
Definition: openswathalgo/include/OpenMS/OPENSWATHALGO/DATAACCESS/ISpectrumAccess.h:19
struct to store results
Definition: DBSuitability.h:52
SuitabilityData simulateNoReRanking() const
Returns a SuitabilityData object containing the data if re-ranking didn't happen.
Search parameters of the DB search.
Definition: ProteinIdentification.h:248