OpenMS
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Modules Pages
MapAlignmentAlgorithmIdentification.h
Go to the documentation of this file.
1 // Copyright (c) 2002-present, OpenMS Inc. -- EKU Tuebingen, ETH Zurich, and FU Berlin
2 // SPDX-License-Identifier: BSD-3-Clause
3 //
4 // --------------------------------------------------------------------------
5 // $Maintainer: Hendrik Weisser $
6 // $Authors: Eva Lange, Clemens Groepl, Hendrik Weisser $
7 // --------------------------------------------------------------------------
8 
9 #pragma once
10 
21 
22 #include <cmath> // for "abs"
23 #include <limits> // for "max"
24 #include <map>
25 
26 namespace OpenMS
27 {
28  /* Concept for FeatureMap or ConsensusMap*/
29  template <typename MapType>
30  concept IsFCMap = std::same_as<MapType, OpenMS::FeatureMap> || std::same_as<MapType, OpenMS::ConsensusMap>;
31 
32  class AnnotatedMSRun;
33 
53  public DefaultParamHandler,
54  public ProgressLogger
55  {
56 public:
59 
62 
63  // Set a reference for the alignment
64  template <typename DataType> void setReference(const DataType& data)
65  {
66  reference_.clear();
67  if (data.empty()) return; // empty input resets the reference
68  SeqToList rt_data;
69  // set these here because "checkParameters_" may not have been called yet:
70  use_feature_rt_ = param_.getValue("use_feature_rt").toBool();
71  score_cutoff_ = param_.getValue("score_cutoff").toBool();
72  score_type_ = (std::string)param_.getValue("score_type");
73  bool sorted = getRetentionTimes_(data, rt_data);
74  computeMedians_(rt_data, reference_, sorted);
75 
76  if (reference_.empty())
77  {
78  throw Exception::MissingInformation(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Could not extract retention time information from the reference file");
79  }
80  }
81 
91  template <typename DataType>
92  void align(const std::vector<DataType>& data,
93  std::vector<TransformationDescription>& transformations,
94  Int reference_index = -1)
95  {
96  checkParameters_(data.size());
97  startProgress(0, 3, "aligning maps");
98 
99  reference_index_ = reference_index;
100  // is reference one of the input files?
101  bool use_internal_reference = (reference_index >= 0);
102  if (use_internal_reference)
103  {
104  if (reference_index >= Int(data.size()))
105  {
106  throw Exception::IndexOverflow(__FILE__, __LINE__,
107  OPENMS_PRETTY_FUNCTION,
108  reference_index, data.size());
109  }
110  setReference(data[reference_index]);
111  }
112 
113  // one set of RT data for each input map, except reference (if any):
114  std::vector<SeqToList> rt_data(data.size() - use_internal_reference);
115  bool all_sorted = true;
116  for (Size i = 0, j = 0; i < data.size(); ++i)
117  {
118  if ((reference_index >= 0) && (i == Size(reference_index)))
119  {
120  continue; // skip reference map, if any
121  }
122  all_sorted &= getRetentionTimes_(data[i], rt_data[j++]);
123  }
124  setProgress(1);
125 
126  computeTransformations_(rt_data, transformations, all_sorted);
127  setProgress(2);
128 
129  setProgress(3);
130  endProgress();
131  }
132 
133 protected:
134 
136  typedef std::map<String, DoubleList> SeqToList;
137 
139  typedef std::map<String, double> SeqToValue;
140 
143 
146 
149 
151  bool use_feature_rt_{};
152 
154  bool use_adducts_{};
155 
157  double min_score_;
158 
160  bool score_cutoff_{};
161 
164 
166  bool (*better_) (double, double) = [](double, double) {return true;};
167 
177  void computeMedians_(SeqToList& rt_data, SeqToValue& medians,
178  bool sorted = false);
179 
188  bool getRetentionTimes_(const std::vector<PeptideIdentification>& peptides,
189  SeqToList& rt_data);
190 
199  // "id_data" can't be "const" here or template resolution will fail
200  bool getRetentionTimes_(const IdentificationData& id_data, SeqToList& rt_data);
201 
210  bool getRetentionTimes_(const AnnotatedMSRun& experiment, SeqToList& rt_data);
211 
227  bool getRetentionTimes_(const IsFCMap auto& features, SeqToList& rt_data)
228  {
229  if (!score_cutoff_)
230  {
231  better_ = [](double, double)
232  {return true;};
233  }
234  else if (features[0].getPeptideIdentifications()[0].isHigherScoreBetter())
235  {
236  better_ = [](double a, double b)
237  { return a >= b; };
238  }
239  else
240  {
241  better_ = [](double a, double b)
242  { return a <= b; };
243  }
244 
245  for (auto feat_it = features.cbegin(); feat_it != features.cend(); ++feat_it)
246  {
247  if (use_feature_rt_)
248  {
249  // find the peptide ID closest in RT to the feature centroid:
250  String sequence;
251  double rt_distance = std::numeric_limits<double>::max();
252  bool any_hit = false;
253  for (std::vector<PeptideIdentification>::const_iterator pep_it =
254  feat_it->getPeptideIdentifications().begin(); pep_it !=
255  feat_it->getPeptideIdentifications().end(); ++pep_it)
256  {
257  if (!pep_it->getHits().empty())
258  {
259  any_hit = true;
260  double current_distance = fabs(pep_it->getRT() -
261  feat_it->getRT());
262  if (current_distance < rt_distance)
263  {
264  const PeptideHit* best_hit = getBestScoringHit(pep_it->getHits(), pep_it->isHigherScoreBetter());
265  if (best_hit && better_(best_hit->getScore(), min_score_))
266  {
267  sequence = best_hit->getSequence().toString();
268  rt_distance = current_distance;
269  }
270  }
271  }
272  }
273 
274  if (any_hit) rt_data[sequence].push_back(feat_it->getRT());
275  }
276  else
277  {
278  getRetentionTimes_(feat_it->getPeptideIdentifications(), rt_data);
279  }
280  }
281 
282  if (!use_feature_rt_ &&
283  param_.getValue("use_unassigned_peptides").toBool())
284  {
285  getRetentionTimes_(features.getUnassignedPeptideIdentifications(),
286  rt_data);
287  }
288 
289  // remove duplicates (can occur if a peptide ID was assigned to several
290  // features due to overlap or annotation tolerance):
291  for (SeqToList::iterator rt_it = rt_data.begin(); rt_it != rt_data.end();
292  ++rt_it)
293  {
294  DoubleList& rt_values = rt_it->second;
295  sort(rt_values.begin(), rt_values.end());
296  DoubleList::iterator it = unique(rt_values.begin(), rt_values.end());
297  rt_values.resize(it - rt_values.begin());
298  }
299  return true; // RTs were already sorted for duplicate detection
300  }
301 
309  void computeTransformations_(std::vector<SeqToList>& rt_data,
310  std::vector<TransformationDescription>&
311  transforms, bool sorted = false);
312 
320  void checkParameters_(const Size runs);
321 
328 
335 
344  const PeptideHit* getBestScoringHit(const std::vector<PeptideHit>& hits, const bool is_higher_score_better);
345 
346 private:
347 
350 
353 
354  };
355 
356 } // namespace OpenMS
String toString() const
returns the peptide as string with modifications embedded in brackets
Class for storing MS run data with peptide and protein identifications.
Definition: AnnotatedMSRun.h:34
A base class for all classes handling default parameters.
Definition: DefaultParamHandler.h:66
Int overflow exception.
Definition: Exception.h:211
Not all required information provided.
Definition: Exception.h:155
Definition: IdentificationData.h:87
A map alignment algorithm based on peptide identifications from MS2 spectra.
Definition: MapAlignmentAlgorithmIdentification.h:55
void computeTransformations_(std::vector< SeqToList > &rt_data, std::vector< TransformationDescription > &transforms, bool sorted=false)
Compute retention time transformations from RT data grouped by peptide sequence.
void setReference(const DataType &data)
Definition: MapAlignmentAlgorithmIdentification.h:64
bool getRetentionTimes_(const IdentificationData &id_data, SeqToList &rt_data)
Collect retention time data from spectrum matches.
~MapAlignmentAlgorithmIdentification() override
Destructor.
std::map< String, double > SeqToValue
Type to store one representative retention time per peptide sequence.
Definition: MapAlignmentAlgorithmIdentification.h:139
void checkParameters_(const Size runs)
Check that parameter values are valid.
void getReference_()
Get reference retention times.
bool getRetentionTimes_(const IsFCMap auto &features, SeqToList &rt_data)
Collect retention time data from peptide IDs contained in feature maps or consensus maps.
Definition: MapAlignmentAlgorithmIdentification.h:227
String score_type_
Score type to use for filtering.
Definition: MapAlignmentAlgorithmIdentification.h:163
Int reference_index_
Index of input file to use as reference (if any)
Definition: MapAlignmentAlgorithmIdentification.h:142
bool getRetentionTimes_(const AnnotatedMSRun &experiment, SeqToList &rt_data)
Collect retention time data from peptide IDs annotated to spectra.
const PeptideHit * getBestScoringHit(const std::vector< PeptideHit > &hits, const bool is_higher_score_better)
Get the best-scoring PeptideHit from a list of hits.
SeqToValue reference_
Reference retention times (per peptide sequence)
Definition: MapAlignmentAlgorithmIdentification.h:145
bool getRetentionTimes_(const std::vector< PeptideIdentification > &peptides, SeqToList &rt_data)
Collect retention time data from peptide IDs.
double min_score_
Minimum score to reach for a peptide to be considered.
Definition: MapAlignmentAlgorithmIdentification.h:157
Size min_run_occur_
Minimum number of runs a peptide must occur in.
Definition: MapAlignmentAlgorithmIdentification.h:148
std::map< String, DoubleList > SeqToList
Type to store retention times given for individual peptide sequences.
Definition: MapAlignmentAlgorithmIdentification.h:136
MapAlignmentAlgorithmIdentification & operator=(const MapAlignmentAlgorithmIdentification &)
Assignment operator intentionally not implemented -> private.
MapAlignmentAlgorithmIdentification()
Default constructor.
IdentificationData::ScoreTypeRef handleIdDataScoreType_(const IdentificationData &id_data)
Helper function to find/define the score type for processing IdentificationData.
void align(const std::vector< DataType > &data, std::vector< TransformationDescription > &transformations, Int reference_index=-1)
Align feature maps, consensus maps, or peptide identifications.
Definition: MapAlignmentAlgorithmIdentification.h:92
void computeMedians_(SeqToList &rt_data, SeqToValue &medians, bool sorted=false)
Compute the median retention time for each peptide sequence.
MapAlignmentAlgorithmIdentification(const MapAlignmentAlgorithmIdentification &)
Copy constructor intentionally not implemented -> private.
Represents a single spectrum match (candidate) for a specific tandem mass spectrum (MS/MS).
Definition: PeptideHit.h:50
double getScore() const
returns the PSM score
const AASequence & getSequence() const
returns the peptide sequence
Base class for all classes that want to report their progress.
Definition: ProgressLogger.h:27
A more convenient string class.
Definition: String.h:34
int Int
Signed integer type.
Definition: Types.h:72
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:97
std::vector< double > DoubleList
Vector of double precision real types.
Definition: ListUtils.h:36
Main OpenMS namespace.
Definition: openswathalgo/include/OpenMS/OPENSWATHALGO/DATAACCESS/ISpectrumAccess.h:19
concept IsFCMap
Definition: MapAlignmentAlgorithmIdentification.h:30
Wrapper that adds operator< to iterators, so they can be used as (part of) keys in maps/sets or multi...
Definition: MetaData.h:20