All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Modules Pages
MzIdentMLDOMHandler.h
Go to the documentation of this file.
1 // Copyright (c) 2002-present, OpenMS Inc. -- EKU Tuebingen, ETH Zurich, and FU Berlin
2 // SPDX-License-Identifier: BSD-3-Clause
3 //
4 // --------------------------------------------------------------------------
5 // $Maintainer: Mathias Walzer $
6 // $Authors: Mathias Walzer$
7 // --------------------------------------------------------------------------
8 
9 #pragma once
10 
21 
22 #include <xercesc/dom/DOM.hpp>
23 #include <xercesc/dom/DOMDocument.hpp>
24 #include <xercesc/dom/DOMDocumentType.hpp>
25 #include <xercesc/dom/DOMElement.hpp>
26 #include <xercesc/dom/DOMImplementation.hpp>
27 #include <xercesc/dom/DOMImplementationLS.hpp>
28 #include <xercesc/dom/DOMNodeIterator.hpp>
29 #include <xercesc/dom/DOMNodeList.hpp>
30 #include <xercesc/dom/DOMText.hpp>
31 #include <xercesc/framework/LocalFileFormatTarget.hpp>
32 #include <xercesc/framework/psvi/XSValue.hpp>
33 #include <xercesc/parsers/XercesDOMParser.hpp>
34 #include <xercesc/util/OutOfMemoryException.hpp>
35 #include <xercesc/util/PlatformUtils.hpp>
36 #include <xercesc/util/XMLString.hpp>
37 #include <xercesc/util/XMLUni.hpp>
38 
39 #include <list>
40 #include <map>
41 #include <stdexcept>
42 #include <string>
43 #include <vector>
44 
45 // Error codes
46 //enum {
47 // ERROR_ARGS = 1,
48 // ERROR_XERCES_INIT,
49 // ERROR_PARSE,
50 // ERROR_EMPTY_DOCUMENT
51 //};
52 
53 namespace OpenMS
54 {
55  class ProgressLogger; // forward declaration; base class for all classes that want to report their progress
56 
57  namespace Internal
58  {
// XML DOM handler for MzIdentMLFile.
//
// NOTE(review): this is an extracted source-browser listing. Gaps in the
// embedded line numbering mark original lines (doc comments and a number of
// declarations) that are not shown here; the comments below are restored from
// the member index that accompanies this listing.
70  class OPENMS_DLLAPI MzIdentMLDOMHandler
71  {
72 public:
    // Constructor for a write-only handler for internal identification structures.
    // Takes the identification data by const reference.
76  MzIdentMLDOMHandler(const std::vector<ProteinIdentification>& pro_id, const PeptideIdentificationList& pep_id, const String& version, const ProgressLogger& logger);
77 
    // Constructor for a read-only handler for internal identification structures.
    // Takes the identification data by non-const reference.
79  MzIdentMLDOMHandler(std::vector<ProteinIdentification>& pro_id, PeptideIdentificationList& pep_id, const String& version, const ProgressLogger& logger);
80 
84 
    // Provides the functionality of reading an mzid with a handler object.
86  void readMzIdentMLFile(const std::string& mzid_file);
    // Provides the functionality to write an mzid with a handler object.
88  void writeMzIdentMLFile(const std::string& mzid_file);
89 
90 protected:
    // (The member index lists logger_ "Progress logger" at line 92, cv_ at
    // line 95 and unimod_ at line 97; their declarations are not shown here.)
93 
98 
    // Non-const pointers to the protein / peptide identification data; default to nullptr.
100  std::vector<ProteinIdentification>* pro_id_ = nullptr;
102  PeptideIdentificationList* pep_id_ = nullptr;
103 
    // Const pointers to the protein / peptide identification data; default to nullptr.
    // Presumably set by the const-reference constructor -- TODO confirm in the .cpp.
105  const std::vector<ProteinIdentification>* cpro_id_ = nullptr;
107  const PeptideIdentificationList* cpep_id_ = nullptr;
108 
    // (The member index lists schema_version_ "Internal version keeping" at line 110; not shown here.)
111 
    // Looks up a child CV term of parent_accession with the given name.
    // NOTE(review): behaviour when no such term is found is not visible in this listing.
113  ControlledVocabulary::CVTerm getChildWithName_(const String& parent_accession, const String& name) const;
114 
117 
    // ---- parse* helpers: each receives Xerces DOM nodes (DOMNodeList* / DOMElement*) ----
    // parseParamGroup_ returns a pair -- First: CVparams, Second: userParams (independent of each other).
119  std::pair<CVTermList, std::map<String, DataValue> > parseParamGroup_(xercesc::DOMNodeList* paramGroup);
120  CVTerm parseCvParam_(xercesc::DOMElement* param);
121  std::pair<String, DataValue> parseUserParam_(xercesc::DOMElement* param);
122  void parseAnalysisSoftwareList_(xercesc::DOMNodeList* analysisSoftwareElements);
123  void parseDBSequenceElements_(xercesc::DOMNodeList* dbSequenceElements);
124  void parsePeptideElements_(xercesc::DOMNodeList* peptideElements);
125  //AASequence parsePeptideSiblings_(xercesc::DOMNodeList* peptideSiblings);
126  AASequence parsePeptideSiblings_(xercesc::DOMElement* peptide);
127  void parsePeptideEvidenceElements_(xercesc::DOMNodeList* peptideEvidenceElements);
128  void parseSpectrumIdentificationElements_(xercesc::DOMNodeList* spectrumIdentificationElements);
129  void parseSpectrumIdentificationProtocolElements_(xercesc::DOMNodeList* spectrumIdentificationProtocolElements);
130  void parseInputElements_(xercesc::DOMNodeList* inputElements);
131  void parseSpectrumIdentificationListElements_(xercesc::DOMNodeList* spectrumIdentificationListElements);
    // NOTE(review): xl_val_map (a std::multimap) is taken by value, so it is copied on every call;
    // switching to const& would require changing the out-of-view definition as well.
132  void parseSpectrumIdentificationItemSetXLMS(std::set<String>::const_iterator set_it, std::multimap<String, int> xl_val_map, xercesc::DOMElement* element_res, const String& spectrumID);
133  void parseSpectrumIdentificationItemElement_(xercesc::DOMElement* spectrumIdentificationItemElement, PeptideIdentification& spectrum_identification, String& spectrumIdentificationList_ref);
134  void parseProteinDetectionHypothesisElement_(xercesc::DOMElement* proteinDetectionHypothesisElement, ProteinIdentification& protein_identification);
135  void parseProteinAmbiguityGroupElement_(xercesc::DOMElement* proteinAmbiguityGroupElement, ProteinIdentification& protein_identification);
136  void parseProteinDetectionListElements_(xercesc::DOMNodeList* proteinDetectionListElements);
    // NOTE(review): as_params is likewise taken by value (copies the CVTermList and the map).
137  static ProteinIdentification::SearchParameters findSearchParameters_(std::pair<CVTermList, std::map<String, DataValue> > as_params);
139 
    // ---- build* helpers: each receives the Xerces DOMElement* it operates on ----
141  void buildCvList_(xercesc::DOMElement* cvElements);
142  void buildAnalysisSoftwareList_(xercesc::DOMElement* analysisSoftwareElements);
143  void buildSequenceCollection_(xercesc::DOMElement* sequenceCollectionElements);
144  void buildAnalysisCollection_(xercesc::DOMElement* analysisCollectionElements);
145  void buildAnalysisProtocolCollection_(xercesc::DOMElement* protocolElements);
146  void buildInputDataCollection_(xercesc::DOMElement* inputElements);
147  void buildEnclosedCV_(xercesc::DOMElement* parentElement, const String& encel, const String& acc, const String& name, const String& cvref);
148  void buildAnalysisDataCollection_(xercesc::DOMElement* analysisElements);
150 
151 
152 private:
156 
    // NOTE(review): the header lines of most nested structs below are missing from this listing;
    // the descriptions are taken from the member index by definition line number.

    // Struct to hold the used analysis software for that file (index lists members name, version).
159  {
162  };
    // Struct to hold the PeptideEvidence information.
165  {
166  int start;
167  int stop;
168  char pre;
169  char post;
170  bool idec;
171  };
    // Struct to hold the information from the DBSequence xml tag
    // (index lists members sequence, database_ref, accession, cvs).
173  struct DBSequence
174  {
179  };
    // Struct to hold the information from the SpectrumIdentification xml tag
    // (index lists members spectra_data_ref, search_database_ref,
    // spectrum_identification_protocol_ref, spectrum_identification_list_ref).
182  {
187  };
    // Struct to hold the information from the ModificationParam xml tag
    // (index additionally lists fixed_mod, residues, modification_param_cvs, specificities).
190  {
192  long double mass_delta;
196  };
    // Struct to hold the information from the SpectrumIdentificationProtocol xml tag
    // (index additionally lists searchtype, parameter_cvs, modification_parameter, threshold_cvs).
199  {
203  std::map<String, DataValue> parameter_ups;
204 // std::vector<ModificationParam> modification_parameter;
206  long double precursor_tolerance;
207  long double fragment_tolerance;
209  std::map<String, DataValue> threshold_ups;
210  };
    // Struct to hold the information from the DatabaseInput xml tag
    // (index lists members location at line 215 and date at line 217).
213  {
218  };
219 
    // (The member index lists XMLCh* xml_root_tag_ptr_, xml_cvparam_tag_ptr_ and
    // xml_name_attr_ptr_ at lines 220-222; their declarations are not shown here.)
223 
    // Xerces-C++ DOM parser instance owned by this handler.
224  xercesc::XercesDOMParser mzid_parser_;
225 
    // Owning pointer to an XMLHandler; empty (nullptr) by default.
226  std::unique_ptr<XMLHandler> xml_handler_ = nullptr;
227 
228  //from AnalysisSoftware
    // (The member index lists search_engine_ and search_engine_version_ at lines 229-230; not shown here.)
231  //mapping from AnalysisSoftware
    // mapping AnalysisSoftware id -> AnalysisSoftware
232  std::map<String, AnalysisSoftware> as_map_;
233 
234  //mapping from DataCollection Inputs
    // sr_map_: mapping sourcefile id -> sourcefile location
    // sd_map_: mapping spectradata id -> spectradata location
    // db_map_: mapping database id -> DatabaseInput
235  std::map<String, String> sr_map_;
236  std::map<String, String> sd_map_;
237  std::map<String, DatabaseInput> db_map_;
238 
239  //mapping from SpectrumIdentification - SpectrumIdentification will be the new IdentificationRuns
    // si_map_: mapping SpectrumIdentification id -> SpectrumIdentification (id refs)
    // si_pro_map_: mapping SpectrumIdentificationList id -> index to ProteinIdentification in pro_id_
240  std::map<String, SpectrumIdentification> si_map_;
241  std::map<String, size_t> si_pro_map_;
242 
243  //mapping from SpectrumIdentificationProtocol
    // mapping SpectrumIdentificationProtocol id -> SpectrumIdentificationProtocol
244  std::map<String, SpectrumIdentificationProtocol> sp_map_;
245 
246  //mapping from SequenceCollection
    // pep_map_: mapping Peptide id -> Sequence
    // pe_ev_map_: mapping PeptideEvidence id -> PeptideEvidence
    // pv_db_map_: mapping PeptideEvidence id -> DBSequence id
    // p_pv_map_: mapping Peptide id -> PeptideEvidence id; multiple PeptideEvidences can have equivalent Peptides
    // db_sq_map_: mapping DBSequence id -> Sequence
247  std::map<String, AASequence> pep_map_;
248  std::map<String, PeptideEvidence> pe_ev_map_;
249  std::map<String, String> pv_db_map_;
250  std::multimap<String, String> p_pv_map_;
251  std::map<String, DBSequence> db_sq_map_;
252 
    // Documented in the member index as "writing help only".
253  std::list<std::list<String> > hit_pev_;
254 
    // Cross-linking (XL-MS) bookkeeping. (The member index lists bool xl_ms_search_ at line 255,
    // "is true when reading a file containing Cross-Linking MS search results"; not shown here.)
    // xl_id_donor_map_: mapping Peptide id -> crosslink donor value
    // xl_id_acceptor_map_: mapping peptide id of acceptor peptide -> crosslink acceptor value
    // xl_donor_pos_map_: mapping donor value -> cross-link modification location
    // xl_acceptor_pos_map_: mapping acceptor value -> cross-link modification location
    // xl_mass_map_: mapping Peptide id -> cross-link mass
    // xl_mod_map_: mapping peptide id -> cross-linking reagent name
256  std::map<String, String> xl_id_donor_map_;
257  //std::map<String, String> xl_id_acceptor_map_; ///< mapping Peptide id -> crosslink acceptor value
258  std::map<String, String> xl_id_acceptor_map_;
259  std::map<String, SignedSize> xl_donor_pos_map_;
260  std::map<String, SignedSize> xl_acceptor_pos_map_;
261  std::map<String, double> xl_mass_map_;
262  std::map<String, String> xl_mod_map_;
263 
264  };
265  } // namespace Internal
266 } // namespace OpenMS
267 
char16_t XMLCh
Definition: ClassTest.h:28
Representation of a peptide/protein sequence.
Definition: AASequence.h:86
Representation of controlled vocabulary term list.
Definition: CVTermList.h:28
Representation of controlled vocabulary term.
Definition: CVTerm.h:27
Definition: ControlledVocabulary.h:29
DateTime Class.
Definition: DateTime.h:33
XML DOM handler for MzIdentMLFile.
Definition: MzIdentMLDOMHandler.h:71
ControlledVocabulary::CVTerm getChildWithName_(const String &parent_accession, const String &name) const
Looks up a child CV term of parent_accession with the name name. If no such term is found,...
void readMzIdentMLFile(const std::string &mzid_file)
Provides the functionality of reading an mzid with a handler object.
MzIdentMLDOMHandler(const std::vector< ProteinIdentification > &pro_id, const PeptideIdentificationList &pep_id, const String &version, const ProgressLogger &logger)
Constructor for a write-only handler for internal identification structures.
std::map< String, SpectrumIdentification > si_map_
mapping SpectrumIdentification id -> SpectrumIdentification (id refs)
Definition: MzIdentMLDOMHandler.h:240
void buildAnalysisSoftwareList_(xercesc::DOMElement *analysisSoftwareElements)
std::map< String, DatabaseInput > db_map_
mapping database id -> DatabaseInput
Definition: MzIdentMLDOMHandler.h:237
CVTermList parameter_cvs
Definition: MzIdentMLDOMHandler.h:202
String search_database_ref
Definition: MzIdentMLDOMHandler.h:184
bool xl_ms_search_
is true when reading a file containing Cross-Linking MS search results
Definition: MzIdentMLDOMHandler.h:255
void parseSpectrumIdentificationItemElement_(xercesc::DOMElement *spectrumIdentificationItemElement, PeptideIdentification &spectrum_identification, String &spectrumIdentificationList_ref)
DateTime date
Definition: MzIdentMLDOMHandler.h:217
std::map< String, double > xl_mass_map_
mapping Peptide id -> cross-link mass
Definition: MzIdentMLDOMHandler.h:261
MzIdentMLDOMHandler(std::vector< ProteinIdentification > &pro_id, PeptideIdentificationList &pep_id, const String &version, const ProgressLogger &logger)
Constructor for a read-only handler for internal identification structures.
std::map< String, String > sd_map_
mapping spectradata id -> spectradata location
Definition: MzIdentMLDOMHandler.h:236
CVTerm parseCvParam_(xercesc::DOMElement *param)
std::map< String, DBSequence > db_sq_map_
mapping DBSequence id -> Sequence
Definition: MzIdentMLDOMHandler.h:251
XMLCh * xml_root_tag_ptr_
Definition: MzIdentMLDOMHandler.h:220
int start
Definition: MzIdentMLDOMHandler.h:166
String sequence
Definition: MzIdentMLDOMHandler.h:175
std::map< String, String > sr_map_
mapping sourcefile id -> sourcefile location
Definition: MzIdentMLDOMHandler.h:235
void parseProteinDetectionListElements_(xercesc::DOMNodeList *proteinDetectionListElements)
long double precursor_tolerance
Definition: MzIdentMLDOMHandler.h:206
void parseInputElements_(xercesc::DOMNodeList *inputElements)
std::pair< String, DataValue > parseUserParam_(xercesc::DOMElement *param)
String location
Definition: MzIdentMLDOMHandler.h:215
void parseProteinDetectionHypothesisElement_(xercesc::DOMElement *proteinDetectionHypothesisElement, ProteinIdentification &protein_identification)
std::map< String, SignedSize > xl_acceptor_pos_map_
mapping acceptor value -> cross-link modification location
Definition: MzIdentMLDOMHandler.h:260
const ProgressLogger & logger_
Progress logger.
Definition: MzIdentMLDOMHandler.h:92
ControlledVocabulary cv_
Controlled vocabulary (psi-ms from OpenMS/share/OpenMS/CV/psi-ms.obo)
Definition: MzIdentMLDOMHandler.h:95
String search_engine_
Definition: MzIdentMLDOMHandler.h:229
void buildAnalysisCollection_(xercesc::DOMElement *analysisCollectionElements)
std::map< String, PeptideEvidence > pe_ev_map_
mapping PeptideEvidence id -> PeptideEvidence
Definition: MzIdentMLDOMHandler.h:248
void parseSpectrumIdentificationItemSetXLMS(std::set< String >::const_iterator set_it, std::multimap< String, int > xl_val_map, xercesc::DOMElement *element_res, const String &spectrumID)
long double mass_delta
Definition: MzIdentMLDOMHandler.h:192
CVTermList modification_param_cvs
Definition: MzIdentMLDOMHandler.h:194
String version
Definition: MzIdentMLDOMHandler.h:161
static ProteinIdentification::SearchParameters findSearchParameters_(std::pair< CVTermList, std::map< String, DataValue > > as_params)
XMLCh * xml_cvparam_tag_ptr_
Definition: MzIdentMLDOMHandler.h:221
long double fragment_tolerance
Definition: MzIdentMLDOMHandler.h:207
String spectrum_identification_list_ref
Definition: MzIdentMLDOMHandler.h:186
int stop
Definition: MzIdentMLDOMHandler.h:167
std::map< String, size_t > si_pro_map_
mapping SpectrumIdentificationList id -> index to ProteinIdentification in pro_id_
Definition: MzIdentMLDOMHandler.h:241
MzIdentMLDOMHandler & operator=(const MzIdentMLDOMHandler &rhs)
String database_ref
Definition: MzIdentMLDOMHandler.h:176
void parseProteinAmbiguityGroupElement_(xercesc::DOMElement *proteinAmbiguityGroupElement, ProteinIdentification &protein_identification)
void parseSpectrumIdentificationElements_(xercesc::DOMNodeList *spectrumIdentificationElements)
void parsePeptideEvidenceElements_(xercesc::DOMNodeList *peptideEvidenceElements)
CVTerm searchtype
Definition: MzIdentMLDOMHandler.h:200
std::map< String, String > pv_db_map_
mapping PeptideEvidence id -> DBSequence id
Definition: MzIdentMLDOMHandler.h:249
void buildAnalysisDataCollection_(xercesc::DOMElement *analysisElements)
String search_engine_version_
Definition: MzIdentMLDOMHandler.h:230
std::map< String, AnalysisSoftware > as_map_
mapping AnalysisSoftware id -> AnalysisSoftware
Definition: MzIdentMLDOMHandler.h:232
void buildEnclosedCV_(xercesc::DOMElement *parentElement, const String &encel, const String &acc, const String &name, const String &cvref)
bool idec
Definition: MzIdentMLDOMHandler.h:170
ControlledVocabulary unimod_
Controlled vocabulary for modifications (unimod from OpenMS/share/OpenMS/CV/unimod....
Definition: MzIdentMLDOMHandler.h:97
std::map< String, String > xl_mod_map_
mapping peptide id -> cross-linking reagent name
Definition: MzIdentMLDOMHandler.h:262
std::map< String, String > xl_id_donor_map_
mapping Peptide id -> crosslink donor value
Definition: MzIdentMLDOMHandler.h:256
void parseSpectrumIdentificationProtocolElements_(xercesc::DOMNodeList *spectrumIdentificationProtocolElements)
AASequence parsePeptideSiblings_(xercesc::DOMElement *peptide)
String name
Definition: MzIdentMLDOMHandler.h:160
std::multimap< String, String > p_pv_map_
mapping Peptide id -> PeptideEvidence id; multiple PeptideEvidences can have equivalent Peptides.
Definition: MzIdentMLDOMHandler.h:250
void buildInputDataCollection_(xercesc::DOMElement *inputElements)
void writeMzIdentMLFile(const std::string &mzid_file)
Provides the functionality to write an mzid with a handler object.
xercesc::XercesDOMParser mzid_parser_
Definition: MzIdentMLDOMHandler.h:224
CVTermList threshold_cvs
Definition: MzIdentMLDOMHandler.h:208
String fixed_mod
Definition: MzIdentMLDOMHandler.h:191
String accession
Definition: MzIdentMLDOMHandler.h:177
CVTermList specificities
Definition: MzIdentMLDOMHandler.h:195
char pre
Definition: MzIdentMLDOMHandler.h:168
String residues
Definition: MzIdentMLDOMHandler.h:193
std::map< String, DataValue > threshold_ups
Definition: MzIdentMLDOMHandler.h:209
std::pair< CVTermList, std::map< String, DataValue > > parseParamGroup_(xercesc::DOMNodeList *paramGroup)
First: CVparams, Second: userParams (independent of each other)
CVTermList cvs
Definition: MzIdentMLDOMHandler.h:178
std::map< String, AASequence > pep_map_
mapping Peptide id -> Sequence
Definition: MzIdentMLDOMHandler.h:247
void parseAnalysisSoftwareList_(xercesc::DOMNodeList *analysisSoftwareElements)
std::list< std::list< String > > hit_pev_
helper structure used only for writing
Definition: MzIdentMLDOMHandler.h:253
MzIdentMLDOMHandler(const MzIdentMLDOMHandler &rhs)
std::map< String, SignedSize > xl_donor_pos_map_
mapping donor value -> cross-link modification location
Definition: MzIdentMLDOMHandler.h:259
std::map< String, SpectrumIdentificationProtocol > sp_map_
mapping SpectrumIdentificationProtocol id -> SpectrumIdentificationProtocol
Definition: MzIdentMLDOMHandler.h:244
CVTermList modification_parameter
Definition: MzIdentMLDOMHandler.h:205
void buildCvList_(xercesc::DOMElement *cvElements)
std::map< String, DataValue > parameter_ups
Definition: MzIdentMLDOMHandler.h:203
void buildSequenceCollection_(xercesc::DOMElement *sequenceCollectionElements)
void parseSpectrumIdentificationListElements_(xercesc::DOMNodeList *spectrumIdentificationListElements)
void parseDBSequenceElements_(xercesc::DOMNodeList *dbSequenceElements)
char post
Definition: MzIdentMLDOMHandler.h:169
void buildAnalysisProtocolCollection_(xercesc::DOMElement *protocolElements)
std::map< String, String > xl_id_acceptor_map_
mapping peptide id of acceptor peptide -> crosslink acceptor value
Definition: MzIdentMLDOMHandler.h:258
String spectrum_identification_protocol_ref
Definition: MzIdentMLDOMHandler.h:185
XMLCh * xml_name_attr_ptr_
Definition: MzIdentMLDOMHandler.h:222
virtual ~MzIdentMLDOMHandler()
Destructor.
String spectra_data_ref
Definition: MzIdentMLDOMHandler.h:183
const String schema_version_
Internal version keeping.
Definition: MzIdentMLDOMHandler.h:110
void parsePeptideElements_(xercesc::DOMNodeList *peptideElements)
Struct to hold the used analysis software for that file.
Definition: MzIdentMLDOMHandler.h:159
Struct to hold the information from the DBSequence xml tag.
Definition: MzIdentMLDOMHandler.h:174
Struct to hold the information from the DatabaseInput xml tag.
Definition: MzIdentMLDOMHandler.h:213
Struct to hold the information from the ModificationParam xml tag.
Definition: MzIdentMLDOMHandler.h:190
Struct to hold the PeptideEvidence information.
Definition: MzIdentMLDOMHandler.h:165
Struct to hold the information from the SpectrumIdentification xml tag.
Definition: MzIdentMLDOMHandler.h:182
Struct to hold the information from the SpectrumIdentificationProtocol xml tag.
Definition: MzIdentMLDOMHandler.h:199
Container for peptide identifications from multiple spectra.
Definition: PeptideIdentificationList.h:66
Represents the set of candidates (SpectrumMatches) identified for a single precursor spectrum.
Definition: PeptideIdentification.h:63
Base class for all classes that want to report their progress.
Definition: ProgressLogger.h:27
Representation of a protein identification run.
Definition: ProteinIdentification.h:51
A more convenient string class.
Definition: String.h:34
Main OpenMS namespace.
Definition: openswathalgo/include/OpenMS/OPENSWATHALGO/DATAACCESS/ISpectrumAccess.h:19
Representation of a CV term.
Definition: ControlledVocabulary.h:50
Search parameters of the DB search.
Definition: ProteinIdentification.h:248