Home  · Classes  · Annotated Classes  · Modules  · Members  · Namespaces  · Related Pages
IDFilter.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2015.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Mathias Walzer $
32 // $Authors: Nico Pfeifer, Mathias Walzer$
33 // --------------------------------------------------------------------------
34 
35 #ifndef OPENMS_FILTERING_ID_IDFILTER_H
36 #define OPENMS_FILTERING_ID_IDFILTER_H
37 
38 #include <OpenMS/config.h>
43 
44 #include <vector>
45 #include <climits>
46 
47 namespace OpenMS
48 {
61  class OPENMS_DLLAPI IDFilter
62  {
63 public:
64 
66  IDFilter();
67 
69  virtual ~IDFilter();
70 
75  template <class IdentificationType>
76  static bool getBestHit(const std::vector<IdentificationType> identifications, bool assume_sorted, PeptideHit& best_hit)
77  {
78  if (identifications.size() == 0) return false;
79 
80  bool is_higher_score_better = identifications[0].isHigherScoreBetter();
81  double best_score = (is_higher_score_better ? -1 : 1) * std::numeric_limits<double>::max(); // worst score we can think of
82 
83  Size best_i_index(0), best_h_index(0);
84  Size max_h(-1);
85  // determine best scoring hit
86  for (Size i = 0; i != identifications.size(); ++i)
87  {
88  if (identifications[i].getHits().size() == 0) continue; // empty hits
89 
90  is_higher_score_better = identifications[i].isHigherScoreBetter();
91  max_h = (assume_sorted ? 1 : identifications[i].getHits().size());
92  for (Size h = 0; h < max_h; ++h)
93  {
94  double score = identifications[i].getHits()[h].getScore();
95  // better score?
96  if (score > best_score * (is_higher_score_better ? 1 : -1))
97  {
98  best_score = score;
99  best_i_index = i;
100  best_h_index = h;
101  }
102  }
103  }
104 
105  if (max_h == -1) return false;// all hits were empty
106 
107  best_hit = identifications[best_i_index].getHits()[best_h_index];
108  return true;
109 
110  }
111 
/**
  @brief Keeps only the hits whose score reaches a fraction of the significance threshold.

  A hit is kept if getScore() >= threshold_fraction * identification.getSignificanceThreshold().

  @param identification Input identification (not modified unless it is also the output)
  @param threshold_fraction Factor applied to the significance threshold
  @param filtered_identification Receives a copy of @p identification holding only the
         kept hits; assignRanks() is called on it if at least one hit remains.
         May be the same object as @p identification.
*/
template <class IdentificationType>
static void filterIdentificationsByThreshold(const IdentificationType& identification, double threshold_fraction, IdentificationType& filtered_identification)
{
  typedef typename IdentificationType::HitType HitType;

  // Collect the surviving hits BEFORE touching the output: if input and output
  // are the same object, resetting the output first would wipe the input hits.
  const double min_score = threshold_fraction * identification.getSignificanceThreshold();
  std::vector<HitType> filtered_hits;
  for (typename std::vector<HitType>::const_iterator it = identification.getHits().begin();
       it != identification.getHits().end();
       ++it)
  {
    if (it->getScore() >= min_score)
    {
      filtered_hits.push_back(*it);
    }
  }

  filtered_identification = identification;
  filtered_identification.setHits(filtered_hits);
  if (!filtered_hits.empty())
  {
    filtered_identification.assignRanks();
  }
}
139 
/**
  @brief Keeps only the hits whose score is at least as good as @p threshold_score.

  "At least as good" follows identification.isHigherScoreBetter():
  score >= threshold_score if higher scores are better, score <= threshold_score otherwise.

  @param identification Input identification (not modified unless it is also the output)
  @param threshold_score Score a hit has to reach (inclusive)
  @param filtered_identification Receives a copy of @p identification holding only the
         kept hits; assignRanks() is called on it if at least one hit remains.
         May be the same object as @p identification.
*/
template <class IdentificationType>
static void filterIdentificationsByScore(const IdentificationType& identification, double threshold_score, IdentificationType& filtered_identification)
{
  typedef typename IdentificationType::HitType HitType;

  // The orientation is a property of the identification, not of the single hit,
  // so it is queried once instead of once per hit.
  const bool higher_is_better = identification.isHigherScoreBetter();

  // Collect the surviving hits BEFORE touching the output: if input and output
  // are the same object, resetting the output first would wipe the input hits.
  std::vector<HitType> filtered_hits;
  for (typename std::vector<HitType>::const_iterator it = identification.getHits().begin();
       it != identification.getHits().end();
       ++it)
  {
    const double score = it->getScore();
    const bool passes = higher_is_better ? (score >= threshold_score) : (score <= threshold_score);
    if (passes)
    {
      filtered_hits.push_back(*it);
    }
  }

  filtered_identification = identification;
  filtered_identification.setHits(filtered_hits);
  if (!filtered_hits.empty())
  {
    filtered_identification.assignRanks();
  }
}
183 
190  template <class IdentificationType>
191  static void filterIdentificationsByBestNHits(const IdentificationType& identification, Size n, IdentificationType& filtered_identification)
192  {
193  typedef typename IdentificationType::HitType HitType;
194  std::vector<HitType> temp_hits;
195  std::vector<HitType> filtered_hits;
196  Size count = 0;
197 
198  IdentificationType temp_identification = identification;
199  temp_identification.sort(); // .. by score
200 
201  filtered_identification = identification;
202  filtered_identification.setHits(std::vector<HitType>());
203 
204 
205  typename std::vector<HitType>::const_iterator it = temp_identification.getHits().begin();
206  while (it != temp_identification.getHits().end()
207  && count < n)
208  {
209  filtered_hits.push_back(*it);
210  ++it;
211  ++count;
212  }
213 
214  if (!filtered_hits.empty())
215  {
216  filtered_identification.setHits(filtered_hits);
217  filtered_identification.assignRanks();
218  }
219  }
220 
228  template <class IdentificationType>
229  static void filterIdentificationsByBestNToMHits(const IdentificationType& identification, Size n, Size m, IdentificationType& filtered_identification)
230  {
231  if (n > m)
232  {
233  std::swap(n, m);
234  }
235 
236  typedef typename IdentificationType::HitType HitType;
237  std::vector<HitType> filtered_hits;
238 
239  IdentificationType temp_identification = identification;
240  temp_identification.sort(); // .. by score
241 
242  filtered_identification = identification;
243  filtered_identification.setHits(std::vector<HitType>());
244 
245  const std::vector<HitType>& hits = temp_identification.getHits();
246  for (Size i = n - 1; n <= m - 1; ++i)
247  {
248  if (i >= hits.size())
249  {
250  break;
251  }
252  filtered_hits.push_back(hits[i]);
253  }
254 
255  if (!filtered_hits.empty())
256  {
257  filtered_identification.setHits(filtered_hits);
258  filtered_identification.assignRanks();
259  }
260  }
261 
262 
269  template <class IdentificationType>
270  static void filterIdentificationsByDecoy(const IdentificationType& identification, IdentificationType& filtered_identification)
271  {
272  typedef typename IdentificationType::HitType HitType;
273  std::vector<HitType> temp_hits;
274  std::vector<HitType> filtered_hits;
275 
276  filtered_identification = identification;
277  filtered_identification.setHits(std::vector<HitType>());
278 
279  for (typename std::vector<HitType>::const_iterator it = identification.getHits().begin();
280  it != identification.getHits().end();
281  ++it)
282  {
283  bool isDecoy = ((it->metaValueExists("isDecoy") && (String)it->getMetaValue("isDecoy") == "true") ||
284  (it->metaValueExists("target_decoy") && (String)it->getMetaValue("target_decoy") == "decoy"));
285  if (!isDecoy)
286  {
287  filtered_hits.push_back(*it);
288  }
289  }
290 
291  if (!filtered_hits.empty())
292  {
293  filtered_identification.setHits(filtered_hits);
294  filtered_identification.assignRanks();
295  }
296  }
297 
299  static void filterIdentificationsByBestHits(const PeptideIdentification& identification, PeptideIdentification& filtered_identification, bool strict = false);
300 
314  static bool filterIdentificationsByMetaValueRange(const PeptideIdentification& identification, const String& key, double low, double high, bool missing = false);
315 
319  static void filterIdentificationsByProteins(const PeptideIdentification& identification, const std::vector<FASTAFile::FASTAEntry>& proteins, PeptideIdentification& filtered_identification, bool no_protein_identifiers = false);
320 
324  static void filterIdentificationsByProteins(const ProteinIdentification& identification, const std::vector<FASTAFile::FASTAEntry>& proteins, ProteinIdentification& filtered_identification);
325 
327  static void filterIdentificationsByExclusionPeptides(const PeptideIdentification& identification, const std::set<String>& peptides, bool ignore_modifications, PeptideIdentification& filtered_identification);
328 
331  static void filterIdentificationsByLength(const PeptideIdentification& identification, PeptideIdentification& filtered_identification, Size min_length, Size max_length = UINT_MAX);
332 
334  static void filterIdentificationsByCharge(const PeptideIdentification& identification, Int charge, PeptideIdentification& filtered_identification);
335 
337  static void filterIdentificationsByVariableModifications(const PeptideIdentification& identification, const std::vector<String>& fixed_modifications, PeptideIdentification& filtered_identification);
338 
340  static void removeUnreferencedProteinHits(const ProteinIdentification& identification, const std::vector<PeptideIdentification>& peptide_identifications, ProteinIdentification& filtered_identification);
341 
343  static void removeUnreferencedPeptideHits(const ProteinIdentification& identification, std::vector<PeptideIdentification>& peptide_identifications, bool delete_unreferenced_peptide_hits = false);
344 
346  static void filterIdentificationsUnique(const PeptideIdentification& identification, PeptideIdentification& filtered_identification);
347 
349  static void filterIdentificationsByMzError(const PeptideIdentification& identification, double mass_error, bool unit_ppm, PeptideIdentification& filtered_identification);
350 
353  static void filterIdentificationsByRT(const std::vector<PeptideIdentification>& identifications, double min_rt, double max_rt, std::vector<PeptideIdentification>& filtered_identifications);
354 
357  static void filterIdentificationsByMZ(const std::vector<PeptideIdentification>& identifications, double min_mz, double max_mz, std::vector<PeptideIdentification>& filtered_identifications);
358 
366  static void filterIdentificationsByRTPValues(const PeptideIdentification& identification, PeptideIdentification& filtered_identification, double p_value = 0.05);
367 
375  static void filterIdentificationsByRTFirstDimPValues(const PeptideIdentification& identification, PeptideIdentification& filtered_identification, double p_value = 0.05);
376 
378  template <class PeakT>
379  static void filterIdentificationsByThresholds(MSExperiment<PeakT>& experiment, double peptide_threshold_fraction, double protein_threshold_fraction)
380  {
381  //filter protein hits
382  ProteinIdentification temp_protein_identification;
383  std::vector<ProteinIdentification> filtered_protein_identifications;
384 
385  for (Size j = 0; j < experiment.getProteinIdentifications().size(); j++)
386  {
387  filterIdentificationsByThreshold(experiment.getProteinIdentifications()[j], protein_threshold_fraction, temp_protein_identification);
388  if (!temp_protein_identification.getHits().empty())
389  {
390  filtered_protein_identifications.push_back(temp_protein_identification);
391  }
392  }
393  experiment.setProteinIdentifications(filtered_protein_identifications);
394 
395  //filter peptide hits
396  PeptideIdentification temp_identification;
397  std::vector<PeptideIdentification> filtered_identifications;
398 
399  for (Size i = 0; i < experiment.size(); i++)
400  {
401  for (Size j = 0; j < experiment[i].getPeptideIdentifications().size(); j++)
402  {
403  filterIdentificationsByThreshold(experiment[i].getPeptideIdentifications()[j], peptide_threshold_fraction, temp_identification);
404  if (!temp_identification.getHits().empty())
405  {
406  filtered_identifications.push_back(temp_identification);
407  }
408  }
409  experiment[i].setPeptideIdentifications(filtered_identifications);
410  filtered_identifications.clear();
411  }
412  }
413 
415  template <class PeakT>
416  static void filterIdentificationsByScores(MSExperiment<PeakT>& experiment, double peptide_threshold_score, double protein_threshold_score)
417  {
418  //filter protein hits
419  ProteinIdentification temp_protein_identification;
420  std::vector<ProteinIdentification> filtered_protein_identifications;
421 
422  for (Size j = 0; j < experiment.getProteinIdentifications().size(); j++)
423  {
424  filterIdentificationsByScore(experiment.getProteinIdentifications()[j], protein_threshold_score, temp_protein_identification);
425  if (!temp_protein_identification.getHits().empty())
426  {
427  filtered_protein_identifications.push_back(temp_protein_identification);
428  }
429  }
430  experiment.setProteinIdentifications(filtered_protein_identifications);
431 
432  //filter peptide hits
433  PeptideIdentification temp_identification;
434  std::vector<PeptideIdentification> filtered_identifications;
435 
436  for (Size i = 0; i < experiment.size(); i++)
437  {
438  for (Size j = 0; j < experiment[i].getPeptideIdentifications().size(); j++)
439  {
440  filterIdentificationsByScore(experiment[i].getPeptideIdentifications()[j], peptide_threshold_score, temp_identification);
441  if (!temp_identification.getHits().empty())
442  {
443  filtered_identifications.push_back(temp_identification);
444  }
445  }
446  experiment[i].setPeptideIdentifications(filtered_identifications);
447  filtered_identifications.clear();
448  }
449  }
450 
452  template <class PeakT>
454  {
455  //filter protein hits
456  ProteinIdentification temp_protein_identification;
457  std::vector<ProteinIdentification> filtered_protein_identifications;
458 
459  for (Size j = 0; j < experiment.getProteinIdentifications().size(); j++)
460  {
461  filterIdentificationsByBestNHits(experiment.getProteinIdentifications()[j], n, temp_protein_identification);
462  if (!temp_protein_identification.getHits().empty())
463  {
464  filtered_protein_identifications.push_back(temp_protein_identification);
465  }
466  }
467  experiment.setProteinIdentifications(filtered_protein_identifications);
468 
469  //filter peptide hits
470  PeptideIdentification temp_identification;
471  std::vector<PeptideIdentification> filtered_identifications;
472 
473  for (Size i = 0; i < experiment.size(); i++)
474  {
475  for (Size j = 0; j < experiment[i].getPeptideIdentifications().size(); j++)
476  {
477  filterIdentificationsByBestNHits(experiment[i].getPeptideIdentifications()[j], n, temp_identification);
478  if (!temp_identification.getHits().empty())
479  {
480  filtered_identifications.push_back(temp_identification);
481  }
482  }
483  experiment[i].setPeptideIdentifications(filtered_identifications);
484  filtered_identifications.clear();
485  }
486  }
487 
489  template <class PeakT>
490  static void filterIdentificationsByProteins(MSExperiment<PeakT>& experiment, const std::vector<FASTAFile::FASTAEntry>& proteins)
491  {
492  std::vector<PeptideIdentification> temp_identifications;
493  std::vector<PeptideIdentification> filtered_identifications;
494  PeptideIdentification temp_identification;
495 
496  for (Size i = 0; i < experiment.size(); i++)
497  {
498  if (experiment[i].getMSLevel() == 2)
499  {
500  temp_identifications = experiment[i].getPeptideIdentifications();
501  for (Size j = 0; j < temp_identifications.size(); j++)
502  {
503  filterIdentificationsByProteins(temp_identifications[j], proteins, temp_identification);
504  if (!temp_identification.getHits().empty())
505  {
506  filtered_identifications.push_back(temp_identification);
507  }
508  }
509  experiment[i].setPeptideIdentifications(filtered_identifications);
510  filtered_identifications.clear();
511  }
512  }
513  }
514 
524  static bool updateProteinGroups(
525  const std::vector<ProteinIdentification::ProteinGroup>& groups,
526  const std::vector<ProteinHit>& hits,
527  std::vector<ProteinIdentification::ProteinGroup>& filtered_groups);
528 
529  };
530 
531 } // namespace OpenMS
532 
533 #endif // OPENMS_FILTERING_ID_IDFILTER_H
Representation of a protein identification run.
Definition: ProteinIdentification.h:61
void setProteinIdentifications(const std::vector< ProteinIdentification > &protein_identifications)
sets the protein ProteinIdentification vector
A more convenient string class.
Definition: String.h:57
Size size() const
Definition: MSExperiment.h:117
const std::vector< ProteinIdentification > & getProteinIdentifications() const
returns a const reference to the protein ProteinIdentification vector
static void filterIdentificationsByProteins(MSExperiment< PeakT > &experiment, const std::vector< FASTAFile::FASTAEntry > &proteins)
filters an MS/MS experiment corresponding to the given proteins
Definition: IDFilter.h:490
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47
static void filterIdentificationsByScore(const IdentificationType &identification, double threshold_score, IdentificationType &filtered_identification)
filters a ProteinIdentification or PeptideIdentification corresponding to the threshold_score ...
Definition: IDFilter.h:148
static void filterIdentificationsByThreshold(const IdentificationType &identification, double threshold_fraction, IdentificationType &filtered_identification)
filters a ProteinIdentification or PeptideIdentification by only allowing peptides/proteins which rea...
Definition: IDFilter.h:114
static void filterIdentificationsByScores(MSExperiment< PeakT > &experiment, double peptide_threshold_score, double protein_threshold_score)
filters an MS/MS experiment corresponding to the threshold scores
Definition: IDFilter.h:416
Representation of a peptide hit.
Definition: PeptideHit.h:55
static void filterIdentificationsByBestNHits(const IdentificationType &identification, Size n, IdentificationType &filtered_identification)
filters a ProteinIdentification or PeptideIdentification corresponding to the score.
Definition: IDFilter.h:191
static void filterIdentificationsByThresholds(MSExperiment< PeakT > &experiment, double peptide_threshold_fraction, double protein_threshold_fraction)
filters an MS/MS experiment corresponding to the threshold fractions
Definition: IDFilter.h:379
const double h
const std::vector< PeptideHit > & getHits() const
returns the peptide hits as const
static bool getBestHit(const std::vector< IdentificationType > identifications, bool assume_sorted, PeptideHit &best_hit)
Definition: IDFilter.h:76
In-Memory representation of a mass spectrometry experiment.
Definition: MSExperiment.h:69
const std::vector< ProteinHit > & getHits() const
Returns the protein hits.
static void filterIdentificationsByBestNToMHits(const IdentificationType &identification, Size n, Size m, IdentificationType &filtered_identification)
filters a ProteinIdentification or PeptideIdentification corresponding to the score.
Definition: IDFilter.h:229
static void filterIdentificationsByBestNHits(MSExperiment< PeakT > &experiment, Size n)
filters an MS/MS experiment corresponding to the best n hits for every spectrum
Definition: IDFilter.h:453
Used to filter identifications by different criteria.
Definition: IDFilter.h:61
int Int
Signed integer type.
Definition: Types.h:96
static void filterIdentificationsByDecoy(const IdentificationType &identification, IdentificationType &filtered_identification)
filters a ProteinIdentification or PeptideIdentification corresponding to their decoy information...
Definition: IDFilter.h:270
Represents the peptide hits for a spectrum.
Definition: PeptideIdentification.h:63

OpenMS / TOPP release 2.0.0 Documentation generated on Fri May 29 2015 17:20:24 using doxygen 1.8.9.1