Gemmi C++ API
Loading...
Searching...
No Matches
stats.hpp
Go to the documentation of this file.
1// Copyright 2018 Global Phasing Ltd.
2//
3// Statistics utilities: classes Covariance, Correlation, DataStats
4
5#ifndef GEMMI_STATS_HPP_
6#define GEMMI_STATS_HPP_
7
8#include <cstddef> // for size_t
9#include <cmath> // for sqrt, NAN, INFINITY
10#include <vector>
11
12namespace gemmi {
13
14// popular single-pass algorithm for calculating variance and mean
15struct Variance {
16 int n = 0;
17 double sum_sq = 0.;
18 double mean_x = 0.;
19
20 Variance() = default;
21 template <typename T> Variance(T begin, T end) : Variance() {
22 for (auto i = begin; i != end; ++i)
23 add_point(*i);
24 }
25 void add_point(double x) {
26 ++n;
27 double dx = x - mean_x;
28 mean_x += dx / n;
29 sum_sq += dx * (x - mean_x);
30 }
31 double for_sample() const { return sum_sq / (n - 1); }
32 double for_population() const { return sum_sq / n; }
33};
34
36 double mean_y = 0.;
37 void add_point(double x, double y) {
38 ++n;
39 double dx = x - mean_x;
40 mean_x += dx / n;
41 mean_y += (y - mean_y) / n;
42 sum_sq += dx * (y - mean_y);
43 }
44};
45
47 int n = 0;
48 double sum_xx = 0.;
49 double sum_yy = 0.;
50 double sum_xy = 0.;
51 double mean_x = 0.;
52 double mean_y = 0.;
53 void add_point(double x, double y) {
54 ++n;
55 double weight = (double)(n - 1) / n;
56 double dx = x - mean_x;
57 double dy = y - mean_y;
58 sum_xx += weight * dx * dx;
59 sum_yy += weight * dy * dy;
60 sum_xy += weight * dx * dy;
61 mean_x += dx / n;
62 mean_y += dy / n;
63 }
64 double coefficient() const { return sum_xy / std::sqrt(sum_xx * sum_yy); }
65 double x_variance() const { return sum_xx / n; }
66 double y_variance() const { return sum_yy / n; }
67 double covariance() const { return sum_xy / n; }
68 double mean_ratio() const { return mean_y / mean_x; }
69 // the regression line
70 double slope() const { return sum_xy / sum_xx; }
71 double intercept() const { return mean_y - slope() * mean_x; }
72};
73
74struct DataStats {
75 double dmin = NAN;
76 double dmax = NAN;
77 double dmean = NAN;
78 double rms = NAN;
79 size_t nan_count = 0;
80};
81
82template<typename T>
83DataStats calculate_data_statistics(const std::vector<T>& data) {
85 double sum = 0;
86 double sq_sum = 0;
87 stats.dmin = INFINITY;
88 stats.dmax = -INFINITY;
89 for (double d : data) {
90 if (std::isnan(d)) {
91 stats.nan_count++;
92 continue;
93 }
94 sum += d;
95 sq_sum += d * d;
96 if (d < stats.dmin)
97 stats.dmin = d;
98 if (d > stats.dmax)
99 stats.dmax = d;
100 }
101 if (stats.nan_count != data.size()) {
102 stats.dmean = sum / (data.size() - stats.nan_count);
103 stats.rms = std::sqrt(sq_sum / (data.size() - stats.nan_count) - stats.dmean * stats.dmean);
104 } else {
105 stats.dmin = NAN;
106 stats.dmax = NAN;
107 }
108 return stats;
109}
110
111} // namespace gemmi
112#endif
DataStats calculate_data_statistics(const std::vector< T > &data)
Definition stats.hpp:83
double x_variance() const
Definition stats.hpp:65
double covariance() const
Definition stats.hpp:67
double mean_ratio() const
Definition stats.hpp:68
void add_point(double x, double y)
Definition stats.hpp:53
double slope() const
Definition stats.hpp:70
double intercept() const
Definition stats.hpp:71
double y_variance() const
Definition stats.hpp:66
double coefficient() const
Definition stats.hpp:64
void add_point(double x, double y)
Definition stats.hpp:37
size_t nan_count
Definition stats.hpp:79
double sum_sq
Definition stats.hpp:17
void add_point(double x)
Definition stats.hpp:25
double for_sample() const
Definition stats.hpp:31
double for_population() const
Definition stats.hpp:32
Variance()=default
double mean_x
Definition stats.hpp:18
Variance(T begin, T end)
Definition stats.hpp:21