1 /*
2  * Copyright (c) 2019-2020 Kevin Townsend (KTOWN)
3  * Copyright (c) 2021 Marti Riba Pons
4  *
5  * SPDX-License-Identifier: Apache-2.0
6  */
7 
8 /**
9  * \defgroup STATISTICS Statistics
10  *
11  * @brief Statistics-related functions.
12  *
13  * @{
14  */
15 
16 /**
17  * @file
18  * @brief API header file for statistics in zscilib.
19  *
20  * This file contains the zscilib statistics APIs
21  */
22 
23 #ifndef ZEPHYR_INCLUDE_ZSL_STATISTICS_H_
24 #define ZEPHYR_INCLUDE_ZSL_STATISTICS_H_
25 
26 #include <zsl/zsl.h>
27 #include <zsl/vectors.h>
28 #include <zsl/matrices.h>
29 
30 #ifdef __cplusplus
31 extern "C" {
32 #endif
33 
34 /** @brief Simple linear regression coefficients of 'y = slope * x + intercept'. */
35 struct zsl_sta_linreg {
36 	/**
37 	 * @brief The estimated slope of the fitted line.
38 	 */
39 	zsl_real_t slope;
40 	/**
41 	 * @brief The estimated intercept (the predicted y value at x = 0).
42 	 */
43 	zsl_real_t intercept;
44 	/**
45 	 * @brief The correlation coefficient, where closer to 1.0 is a closer match.
46 	 */
47 	zsl_real_t correlation;
48 };
49 
50 /**
51  * @brief Computes the arithmetic mean (average) of a vector.
52  *
53  * @param v  The vector to use.
54  * @param m  The arithmetic mean of the components of v.
55  *
56  * @return 0 if everything executed correctly, otherwise an appropriate
57  *          error code.
58  */
59 int zsl_sta_mean(struct zsl_vec *v, zsl_real_t *m);
60 
61 /**
62  * @brief Computes the trimmed arithmetic mean (average) of a vector.
63  *
64  * The trimmed arithmetic mean of a dataset is described by a number (in this
65  * case 'p') from 0 to 50 that describes the percent of the data that will not
66  * be taken into account when computing the mean. Thus, a 3% trimmed
67  * mean will only use 94% of the data to calculate the arithmetic mean, and
68  * will ignore the lowest 3% of data and the highest 3% of data in the sorted
69  * data vector.
70  *
71  * @param v  The vector to use.
72  * @param p  The percent of data that will be ignored in the computation of
73  *           the mean (0.0 .. 50.0).
74  * @param m  The trimmed arithmetic mean of the components of v.
75  *
76  * @return 0 if everything executed correctly, -EINVAL if the number 'p' is not
77  *         between 0.0 and 50.0.
78  */
79 int zsl_sta_trim_mean(struct zsl_vec *v, zsl_real_t p, zsl_real_t *m);
80 
81 /**
82  * @brief Computes the weighted arithmetic mean (average) of a data vector (v)
83  *        and a weight vector (w).
84  *
85  * @param v  The data vector to use.
86  * @param w  The vector containing the weights to use.
87  * @param m  The weighted arithmetic mean of the components of v taking the
88  *           weights in the vector w into account.
89  *
90  * @return 0 if everything executed correctly, -EINVAL if the dimensions of v
91  *         and w don't match, or if any weights are negative or all of them
92  *         are zero.
93  */
94 int zsl_sta_weighted_mean(struct zsl_vec *v, struct zsl_vec *w, zsl_real_t *m);
95 
96 /**
97  * @brief Computes the time-weighted arithmetic mean (average) of a positive
98  *        data vector (v) and its time vector (t).
99  *
100  * The time-weighted mean takes into consideration not only the numerical
101  * levels of a particular variable, but also the amount of time spent on it.
102  *
103  * @param v  The data vector to use, with positive coefficients.
104  * @param t  The vector containing the time associated to the data vector.
105  * @param m  The time-weighted arithmetic mean of the components of v taking
106  *           the times in the vector t into account.
107  *
108  * @return 0 if everything executed correctly, -EINVAL if the dimensions of v
109  *         and t don't match, or if any elements in 'v' are negative or if
110  *         any time value in the vector 't' is repeated.
111  */
112 int zsl_sta_time_weighted_mean(struct zsl_vec *v, struct zsl_vec *t,
113 			       zsl_real_t *m);
114 
115 /**
116  * @brief Subtracts the mean of vector v from every component of the vector.
117  *        The output vector w then has a zero mean.
118  *
119  * @param v  The vector to use.
120  * @param w  The output vector with zero mean.
121  *
122  * @return  0 if everything executed correctly, otherwise an appropriate
123  *          error code.
124  */
125 int zsl_sta_demean(struct zsl_vec *v, struct zsl_vec *w);
126 
127 /**
128  * @brief Computes the given percentile of a vector.
129  *
130  * @param v    The input vector.
131  * @param p    The percentile to be calculated.
132  * @param val  The output value.
133  *
134  * @return  0 if everything executed correctly, otherwise an appropriate
135  *          error code.
136  */
137 int zsl_sta_percentile(struct zsl_vec *v, zsl_real_t p, zsl_real_t *val);
138 
139 /**
140  * @brief Computes the median of a vector (the value separating the higher half
141  *        from the lower half of a data sample).
142  *
143  * @param v  The vector to use.
144  * @param m  The median of the components of v.
145  *
146  * @return  0 if everything executed correctly, otherwise an appropriate
147  *          error code.
148  */
149 int zsl_sta_median(struct zsl_vec *v, zsl_real_t *m);
150 
151 /**
152  * @brief Computes the weighted median of a data vector (v) and a weight
153  *        vector (w).
154  *
155  * @param v  The data vector to use.
156  * @param w  The vector containing the weights to use.
157  * @param m  The weighted median of the components of v taking the weights in
158  *           the vector w into account.
159  *
160  * @return 0 if everything executed correctly, -EINVAL if the dimensions of v
161  *         and w don't match, or if any weights are negative or the sum of all
162  *         the weights is not 1.
163  */
164 int zsl_sta_weighted_median(struct zsl_vec *v, struct zsl_vec *w, zsl_real_t *m);
165 
166 /**
167  * @brief Calculates the first, second and third quartiles of a vector v.
168  *
169  * @param v   The vector to use.
170  * @param q1  The first quartile of v.
171  * @param q2  The second quartile of v, also the median of v.
172  * @param q3  The third quartile of v.
173  *
174  * @return  0 if everything executed correctly, otherwise an appropriate
175  *          error code.
176  */
177 int zsl_sta_quart(struct zsl_vec *v, zsl_real_t *q1, zsl_real_t *q2,
178 		  zsl_real_t *q3);
179 
180 /**
181  * @brief Calculates the numeric difference between the third and the first
182  *        quartiles of a vector v.
183  *
184  * @param v  The input vector.
185  * @param r  The interquartile range of v.
186  *
187  * @return  0 if everything executed correctly, otherwise an appropriate
188  *          error code.
189  */
190 int zsl_sta_quart_range(struct zsl_vec *v, zsl_real_t *r);
191 
192 /**
193  * @brief Computes the mode or modes of a vector v.
194  *
195  * @param v  The vector to use.
196  * @param w  Output vector whose components are the modes. If there is only
197  *           one mode, the length of w will be 1.
198  *
199  * @return  0 if everything executed correctly, otherwise an appropriate
200  *          error code.
201  */
202 int zsl_sta_mode(struct zsl_vec *v, struct zsl_vec *w);
203 
204 /**
205  * @brief Computes the difference between the greatest value and the lowest in
206  *        a vector v.
207  *
208  * @param v The vector to use.
209  * @param r The range of the data in v.
210  *
211  * @return  0 if everything executed correctly, otherwise an appropriate
212  *          error code.
213  */
214 int zsl_sta_data_range(struct zsl_vec *v, zsl_real_t *r);
215 
216 /**
217  * @brief Computes the mean absolute deviation of a data vector v.
218  *
219  * The mean absolute deviation is calculated by computing the mean of the
220  * absolute de-meaned data, i.e., the arithmetic mean of the absolute value of
221  * each value in v minus the mean of the data in 'v'. This number describes the
222  * average deviation from the arithmetic mean of the dataset in the vector 'v'.
223  *
224  * @param v The vector to use.
225  * @param m The mean absolute deviation.
226  *
227  * @return  0 if everything executed correctly. If the dimension of the data
228  *          vector v is zero, a negative error is returned.
229  */
230 int zsl_sta_mean_abs_dev(struct zsl_vec *v, zsl_real_t *m);
231 
232 /**
233  * @brief Computes the median absolute deviation of a data vector v.
234  *
235  * The median absolute deviation is calculated by computing the median of the
236  * absolute value of each value in 'v' minus the median of the data in 'v'.
237  * This provides a robust estimate of variability.
238  *
239  * @param v The vector to use.
240  * @param m The median absolute deviation.
241  *
242  * @return  0 if everything executed correctly, otherwise an appropriate
243  *          error code.
244  */
245 int zsl_sta_median_abs_dev(struct zsl_vec *v, zsl_real_t *m);
246 
247 /**
248  * @brief Computes the variance of a vector v (the average of the squared
249  *        differences from the mean).
250  *
251  * @param v     The vector to use.
252  * @param var   The variance of v.
253  *
254  * @return  0 if everything executed correctly, otherwise an appropriate
255  *          error code.
256  */
257 int zsl_sta_var(struct zsl_vec *v, zsl_real_t *var);
258 
259 /**
260  * @brief Computes the standard deviation of vector v.
261  *
262  * Standard deviation is an indication of how spread-out numbers in 'v' are,
263  * relative to the mean. It helps differentiate what is in the "standard"
264  * range (1 standard deviation from mean), and what is outside (above or below)
265  * this range, to pick out statistical outliers.
266  *
267  * @param v  The vector to use.
268  * @param s  The output standard deviation of the vector v.
269  *
270  * @return  0 if everything executed correctly, otherwise an appropriate
271  *          error code.
272  */
273 int zsl_sta_std_dev(struct zsl_vec *v, zsl_real_t *s);
274 
275 /**
276  * @brief Computes the covariance of two sets of data: v and w.
277  *
278  * @param v  First set of data.
279  * @param w  Second set of data.
280  * @param c  Covariance of the vectors v and w.
281  *
282  * @return 0 on success, and -EINVAL if the vectors aren't identically sized.
283  */
284 int zsl_sta_covar(struct zsl_vec *v, struct zsl_vec *w, zsl_real_t *c);
285 
286 /**
287  * @brief Calculates the nxn covariance matrix of a set of n vectors of the
288  *        same length.
289  *
290  * @param m   Input matrix, whose columns are the different data sets.
291  * @param mc  Output nxn covariance matrix.
292  *
293  * @return 0 on success, and -EINVAL if 'mc' is not a square matrix with the
294  *         same number of columns as 'm'.
295  */
296 int zsl_sta_covar_mtx(struct zsl_mtx *m, struct zsl_mtx *mc);
297 
298 /**
299  * @brief Calculates the slope, intercept and correlation coefficient of the
300  *        linear regression of two vectors, allowing us to make a prediction
301  *        of y on the basis of x.
302  *
303  * Simple linear regression is useful for predicting a quantitative response.
304  * It assumes that there is an approximately linear relationship between vector
305  * x and vector y, and calculates a series of coefficients to project this
306  * relationship in either direction.
307  *
308  * The output of this function is a slope and intercept value, such
309  * that the resulting line closely tracks the linear progression of the input
310  * samples. The correlation coefficient estimates the 'closeness' of the
311  * match.
312  *
313  * Given the equation 'y = slope * x + intercept', where we provide x, we can
314  * estimate the y value for an arbitrary value of x, where x is related to the
315  * range of values provided in vector 'x' (the x axis), and y is related to the
316  * values provided in vector 'y' (the y axis).
317  *
318  * Simple linear regression is a special case of the multiple linear regression
319  * (see below). The correlation coefficient is the square root of the
320  * coefficient of determination, a measure useful in multiple linear regression.
321  *
322  * @param x   The first input vector, corresponding to the x-axis.
323  * @param y   The second input vector, corresponding to the y-axis.
324  * @param c   Pointer to the calculated linear regression coefficients.
325  *
326  * @return 0 on success, and -EINVAL if the vectors aren't identically sized.
327  */
328 int zsl_sta_linear_reg(struct zsl_vec *x, struct zsl_vec *y,
329 		       struct zsl_sta_linreg *c);
330 
331 #ifndef CONFIG_ZSL_SINGLE_PRECISION
332 /**
333  * @brief Calculates the coefficients (vector 'b') of the multiple linear
334  *        regression of the x_i values (columns of the matrix 'x') and the y
335  *        values.
336  *
337  * @param x   Matrix, whose columns are the different x_i datasets.
338  * @param y   The second input dataset, corresponding to the y-axis.
339  * @param b   Pointer to the calculated multiple linear regression coefficients.
340  * @param r   Pointer to the calculated coefficient of determination (also
341  *            referred to as R squared).
342  *
343  * @return 0 on success, and -EINVAL if dimensions of the input vectors and
344  *         matrix don't match.
345  */
346 int zsl_sta_mult_linear_reg(struct zsl_mtx *x, struct zsl_vec *y,
347 			    struct zsl_vec *b, zsl_real_t *r);
348 #endif
349 
350 #ifndef CONFIG_ZSL_SINGLE_PRECISION
351 /**
352  * @brief Calculates the coefficients (vector 'b') of the weighted multiple
353  *        linear regression of the x_i values (columns of the matrix 'x'), the y
354  *        values and the weights in the vector 'w'.
355  *
356  * @param x   Matrix, whose columns are the different x_i datasets.
357  * @param y   The second input dataset, corresponding to the y-axis.
358  * @param w   The weights to use in the weighted least squares.
359  * @param b   Pointer to the calculated weighted multiple linear regression
360  *            coefficients.
361  * @param r   Pointer to the calculated coefficient of determination (also
362  *            referred to as R squared).
363  *
364  * @return 0 on success, and -EINVAL if dimensions of the input vectors and
365  *         matrix don't match.
366  */
367 int zsl_sta_weighted_mult_linear_reg(struct zsl_mtx *x, struct zsl_vec *y,
368 				     struct zsl_vec *w, struct zsl_vec *b, zsl_real_t *r);
369 #endif
370 
371 #ifndef CONFIG_ZSL_SINGLE_PRECISION
372 /**
373  * @brief This function uses the least squares fitting method to compute the
374  *        coefficients of a quadric surface given a set of tridimensional
375  *        points.
376  *
377  * A quadric is a 3D surface that is defined by the equation:
378  * Ax^2 + By^2 + Cz^2 + 2Dxy + 2Exz + 2Fyz + 2Gx + 2Hy + 2Iz = 1.
379  * Spheres and ellipsoids are special cases of quadrics. This function takes a
380  * set of points (x, y, z) and returns the coefficients (A, B, C, D, E, F, G,
381  * H, I) of the quadric surface that best fits the given points.
382  *
383  * @param m   Matrix, whose rows are the (x, y, z) points.
384  * @param b   Pointer to the calculated coefficients of the quadric.
385  *
386  * @return 0 on success, and -EINVAL if the dimension of vector 'b' isn't 9
387  *         or the input matrix isn't an Nx3 matrix.
388  */
389 int zsl_sta_quad_fit(struct zsl_mtx *m, struct zsl_vec *b);
390 #endif
391 
392 /**
393  * @brief Calculates the absolute error given a value and its expected value.
394  *
395  * @param val       Input value.
396  * @param exp_val   Input expected value.
397  * @param err       Output absolute error.
398  *
399  * @return  0 if everything executed correctly, otherwise an appropriate
400  *          error code.
401  */
402 int zsl_sta_abs_err(zsl_real_t *val, zsl_real_t *exp_val, zsl_real_t *err);
403 
404 /**
405  * @brief Calculates the relative error given a value and its expected value.
406  *
407  * @param val       Input value.
408  * @param exp_val   Input expected value.
409  * @param err       Output relative error.
410  *
411  * @return  0 if everything executed correctly, otherwise an appropriate
412  *          error code.
413  */
414 int zsl_sta_rel_err(zsl_real_t *val, zsl_real_t *exp_val, zsl_real_t *err);
415 
416 /**
417  * @brief Calculates the standard error of the mean of a sample (vector v).
418  *
419  * The standard error of the mean measures how far the arithmetic mean of the
420  * sample in vector 'v' is likely to be from the true total population mean.
421  *
422  * @param v       Sample data vector.
423  * @param err     Output standard error of the mean.
424  *
425  * @return  0 if everything executed correctly. If the dimension of the vector
426  *          'v' is zero, a negative error is returned.
427  */
428 int zsl_sta_sta_err(struct zsl_vec *v, zsl_real_t *err);
429 
430 #ifdef __cplusplus
431 }
432 #endif
433 
434 #endif /* ZEPHYR_INCLUDE_ZSL_STATISTICS_H_ */
435 
436 /** @} */ /* End of statistics group */
437