1 /* 2 * Copyright (c) 2019-2020 Kevin Townsend (KTOWN) 3 * Copyright (c) 2021 Marti Riba Pons 4 * 5 * SPDX-License-Identifier: Apache-2.0 6 */ 7 8 /** 9 * \defgroup STATISTICS Statistics 10 * 11 * @brief Statistics-related functions. 12 * 13 * @{ 14 */ 15 16 /** 17 * @file 18 * @brief API header file for statistics in zscilib. 19 * 20 * This file contains the zscilib statistics APIs 21 */ 22 23 #ifndef ZEPHYR_INCLUDE_ZSL_STATISTICS_H_ 24 #define ZEPHYR_INCLUDE_ZSL_STATISTICS_H_ 25 26 #include <zsl/zsl.h> 27 #include <zsl/vectors.h> 28 #include <zsl/matrices.h> 29 30 #ifdef __cplusplus 31 extern "C" { 32 #endif 33 34 /** @brief Simple linear regression coefficients. */ 35 struct zsl_sta_linreg { 36 /** 37 * @brief The estimated slope. 38 */ 39 zsl_real_t slope; 40 /** 41 * @brief The estimated intercept. 42 */ 43 zsl_real_t intercept; 44 /** 45 * @brief The correlation coefficient, where closer to 1.0 is better. 46 */ 47 zsl_real_t correlation; 48 }; 49 50 /** 51 * @brief Computes the arithmetic mean (average) of a vector. 52 * 53 * @param v The vector to use. 54 * @param m The arithmetic mean of the components of v. 55 * 56 * @return 0 if everything executed correctly, otherwise an appropriate 57 * error code. 58 */ 59 int zsl_sta_mean(struct zsl_vec *v, zsl_real_t *m); 60 61 /** 62 * @brief Computes the trimmed arithmetic mean (average) of a vector. 63 * 64 * The trimmed arithmetic mean of a dataset is described by a number (in this 65 * case 'p') from 0 to 50 that describes the percent of the data that will not 66 * be taken into account when computing the mean. Thus, a 3% trimmed 67 * mean will only use 94% of the data to calculate the arithmetic mean, and 68 * will ignore the lowest 3% of data and the highest 3% of data in the sorted 69 * data vector. 70 * 71 * @param v The vector to use. 72 * @param p The percent of data that will be ignored in the computation of 73 * the mean (0.0 .. 50.0). 74 * @param m The trimmed arithmetic mean of the components of v. 75 * 76 * @return 0 if everything executed correctly, -EINVAL if the number 'p' is not 77 * between 0.0 and 50.0. 78 */ 79 int zsl_sta_trim_mean(struct zsl_vec *v, zsl_real_t p, zsl_real_t *m); 80 81 /** 82 * @brief Computes the weighted arithmetic mean (average) of a data vector (v) 83 * and a weight vector (w). 84 * 85 * @param v The data vector to use. 86 * @param w The vector containing the weights to use. 87 * @param m The weighted arithmetic mean of the components of v taking the 88 * weights in the vector w into account. 89 * 90 * @return 0 if everything executed correctly, -EINVAL if the dimensions of v 91 * and w don't match, or if any weights are negative or all of them 92 * are zero. 93 */ 94 int zsl_sta_weighted_mean(struct zsl_vec *v, struct zsl_vec *w, zsl_real_t *m); 95 96 /** 97 * @brief Computes the time-weighted arithmetic mean (average) of a positive 98 * data vector (v) and its time vector (w). 99 * 100 * The time-weighted mean takes into consideration not only the numerical 101 * levels of a particular variable, but also the amount of time spent on it. 102 * 103 * @param v The data vector to use, with positive coefficients. 104 * @param t The vector containing the time associated to the data vector. 105 * @param m The time-weighted arithmetic mean of the components of v taking 106 * the times in the vector t into account. 107 * 108 * @return 0 if everything executed correctly, -EINVAL if the dimensions of v 109 * and w don't match, or if any elements in 'v' are negative or if 110 * any time value in the vector 't' is repeated. 111 */ 112 int zsl_sta_time_weighted_mean(struct zsl_vec *v, struct zsl_vec *t, 113 zsl_real_t *m); 114 115 /** 116 * @brief Subtracts the mean of vector v from every component of the vector. 117 * The output vector w then has a zero mean. 118 * 119 * @param v The vector to use. 120 * @param w The output vector with zero mean. 121 * 122 * @return 0 if everything executed correctly, otherwise an appropriate 123 * error code. 124 */ 125 int zsl_sta_demean(struct zsl_vec *v, struct zsl_vec *w); 126 127 /** 128 * @brief Computes the given percentile of a vector. 129 * 130 * @param v The input vector. 131 * @param p The percentile to be calculated. 132 * @param val The output value. 133 * 134 * @return 0 if everything executed correctly, otherwise an appropriate 135 * error code. 136 */ 137 int zsl_sta_percentile(struct zsl_vec *v, zsl_real_t p, zsl_real_t *val); 138 139 /** 140 * @brief Computes the median of a vector (the value separating the higher half 141 * from the lower half of a data sample). 142 * 143 * @param v The vector to use. 144 * @param m The median of the components of v. 145 * 146 * @return 0 if everything executed correctly, otherwise an appropriate 147 * error code. 148 */ 149 int zsl_sta_median(struct zsl_vec *v, zsl_real_t *m); 150 151 /** 152 * @brief Computes the weighted median of a data vector (v) and a weight 153 * vector (w). 154 * 155 * @param v The data vector to use. 156 * @param w The vector containing the weights to use. 157 * @param m The weighted median of the components of v taking the weights in 158 * the vector w into account. 159 * 160 * @return 0 if everything executed correctly, -EINVAL if the dimensions of v 161 * and w don't match, or if any weights are negative or the sum of all 162 * the weights is not 1. 163 */ 164 int zsl_sta_weighted_median(struct zsl_vec *v, struct zsl_vec *w, zsl_real_t *m); 165 166 /** 167 * @brief Calculates the first, second and third quartiles of a vector v. 168 * 169 * @param v The vector to use. 170 * @param q1 The first quartile of v. 171 * @param q2 The second quartile of v, also the median of v. 172 * @param q3 The third quartile of v. 173 * 174 * @return 0 if everything executed correctly, otherwise an appropriate 175 * error code. 176 */ 177 int zsl_sta_quart(struct zsl_vec *v, zsl_real_t *q1, zsl_real_t *q2, 178 zsl_real_t *q3); 179 180 /** 181 * @brief Calculates the numeric difference between the third and the first 182 * quartiles of a vector v. 183 * 184 * @param v The input vector. 185 * @param r The interquartile range of v. 186 * 187 * @return 0 if everything executed correctly, otherwise an appropriate 188 * error code. 189 */ 190 int zsl_sta_quart_range(struct zsl_vec *v, zsl_real_t *r); 191 192 /** 193 * @brief Computes the mode or modes of a vector v. 194 * 195 * @param v The vector to use. 196 * @param w Output vector whose components are the modes. If there is only 197 * one mode, the length of w will be 1. 198 * 199 * @return 0 if everything executed correctly, otherwise an appropriate 200 * error code. 201 */ 202 int zsl_sta_mode(struct zsl_vec *v, struct zsl_vec *w); 203 204 /** 205 * @brief Computes the difference between the greatest value and the lowest in 206 * a vector v. 207 * 208 * @param v The vector to use. 209 * @param r The range of the data in v. 210 * 211 * @return 0 if everything executed correctly, otherwise an appropriate 212 * error code. 213 */ 214 int zsl_sta_data_range(struct zsl_vec *v, zsl_real_t *r); 215 216 /** 217 * @brief Computes the mean absolute deviation of a data vector v. 218 * 219 * The mean absolute deviation is calculated by computing the mean of the 220 * de-meaned data vector, i. e., the arithmetic mean of the absolute value of 221 * each value in v minus the mean of the data in 'v'. This number describes the 222 * average deviation from the arithmetic mean of the dataset in the vector 'v'. 223 * 224 * @param v The vector to use. 225 * @param m The mean absolute deviation. 226 * 227 * @return 0 if everything executed correctly. If the dimension of the data 228 * vector v is zero, a negative error is returned. 229 */ 230 int zsl_sta_mean_abs_dev(struct zsl_vec *v, zsl_real_t *m); 231 232 /** 233 * @brief Computes the median absolute deviation of a data vector v. 234 * 235 * The mean absolute deviation is calculated by computing the median of the 236 * absolute value of each value in 'v' minus the median of the data in 'v'. 237 * This provides a robust estimate of variability. 238 * 239 * @param v The vector to use. 240 * @param m The median absolute deviation. 241 * 242 * @return 0 if everything executed correctly. otherwise an appropriate 243 * error code. 244 */ 245 int zsl_sta_median_abs_dev(struct zsl_vec *v, zsl_real_t *m); 246 247 /** 248 * @brief Computes the variance of a vector v (the average of the squared 249 * differences from the mean). 250 * 251 * @param v The vector to use. 252 * @param var The variance of v. 253 * 254 * @return 0 if everything executed correctly, otherwise an appropriate 255 * error code. 256 */ 257 int zsl_sta_var(struct zsl_vec *v, zsl_real_t *var); 258 259 /** 260 * @brief Computes the standard deviation of vector v. 261 * 262 * Standard deviation is an indication of how spread-out numbers in 'v' are, 263 * relative to the mean. It helps differentiate what is in the "standard" 264 * range (1 standard deviation from mean), and what is outside (above or below) 265 * this range, to pick out statistical outliers. 266 * 267 * @param v The vector to use. 268 * @param s The output standard deviation of the vector v. 269 * 270 * @return 0 if everything executed correctly, otherwise an appropriate 271 * error code. 272 */ 273 int zsl_sta_std_dev(struct zsl_vec *v, zsl_real_t *s); 274 275 /** 276 * @brief Computes the variance of two sets of data: v and w. 277 * 278 * @param v First set of data. 279 * @param w Second set of data. 280 * @param c Covariance of the vectors v and w. 281 * 282 * @return 0 on success, and -EINVAL if the vectors aren't identically sized. 283 */ 284 int zsl_sta_covar(struct zsl_vec *v, struct zsl_vec *w, zsl_real_t *c); 285 286 /** 287 * @brief Calculates the nxn covariance matrix of a set of n vectors of the 288 * same length. 289 * 290 * @param m Input matrix, whose columns are the different data sets. 291 * @param mc Output nxn covariance matrix. 292 * 293 * @return 0 on success, and -EINVAL if 'mc' is not a square matrix with the 294 * same number of columns as 'm'. 295 */ 296 int zsl_sta_covar_mtx(struct zsl_mtx *m, struct zsl_mtx *mc); 297 298 /** 299 * @brief Calculates the slope, intercept and correlation coefficient of the 300 * linear regression of two vectors, allowing us to make a prediction 301 * of y on the basis of x. 302 * 303 * Simple linear regression is useful for predicting a quantitative response. 304 * It assumes that there is an approximately linear relationship between vector 305 * x and vector y, and calculates a series of coefficients to project this 306 * relationship in either direction. 307 * 308 * The output of this function is a slope and intercept value, such 309 * that the resulting line closely tracks the linear progression of the input 310 * samples. The correlation coefficient estimates the 'closeness' of the 311 * match. 312 * 313 * Given the equation 'y = slope * x + intercept', where we provide x, we can 314 * estimate the y value for a arbitrary value of x, where x is related to the 315 * range of values provided in vector 'x' (the x axis), and y is related to the 316 * values provided in vector 'y' (the y axis). 317 * 318 * Simple linear regression is a special case of the multiple linear regression 319 * (see below). The correlation coefficient is the square root of the 320 * coefficient of determination, a measure useful in multiple linear regression. 321 * 322 * @param x The first input vector, corresponding to the x-axis. 323 * @param y The second input vector, corresponding to the y-axis. 324 * @param c Pointer to the calculated linear regression coefficients. 325 * 326 * @return 0 on success, and -EINVAL if the vectors aren't identically sized. 327 */ 328 int zsl_sta_linear_reg(struct zsl_vec *x, struct zsl_vec *y, 329 struct zsl_sta_linreg *c); 330 331 #ifndef CONFIG_ZSL_SINGLE_PRECISION 332 /** 333 * @brief Calculates the coefficients (vector 'b') of the multiple linear 334 * regression of the x_i values (columns of the matrix 'x') and the y 335 * values. 336 * 337 * @param x Matrix, whose columns are the different x_i datasets. 338 * @param y The second input dataset, corresponding to the y-axis. 339 * @param b Pointer to the calculated multiple linear regression coefficients. 340 * @param r Pointer to the calculated coefficient of determination (also 341 * reffered to as R squared). 342 * 343 * @return 0 on success, and -EINVAL if dimensions of the input vectors and 344 * matrix don't match. 345 */ 346 int zsl_sta_mult_linear_reg(struct zsl_mtx *x, struct zsl_vec *y, 347 struct zsl_vec *b, zsl_real_t *r); 348 #endif 349 350 #ifndef CONFIG_ZSL_SINGLE_PRECISION 351 /** 352 * @brief Calculates the coefficients (vector 'b') of the weighted multiple 353 * linear regression of the x_i values (columns of the matrix 'x'), the y 354 * values and the weights in the vector 'w'. 355 * 356 * @param x Matrix, whose columns are the different x_i datasets. 357 * @param y The second input dataset, corresponding to the y-axis. 358 * @param w The weights to use in the weighted least squares. 359 * @param b Pointer to the calculated weighted multiple linear regression 360 * coefficients. 361 * @param r Pointer to the calculated coefficient of determination (also 362 * reffered to as R squared). 363 * 364 * @return 0 on success, and -EINVAL if dimensions of the input vectors and 365 * matrix don't match. 366 */ 367 int zsl_sta_weighted_mult_linear_reg(struct zsl_mtx *x, struct zsl_vec *y, 368 struct zsl_vec *w, struct zsl_vec *b, zsl_real_t *r); 369 #endif 370 371 #ifndef CONFIG_ZSL_SINGLE_PRECISION 372 /** 373 * @brief This function uses the least squares fitting method to compute the 374 * coefficients of a quadric surface given a set of tridimensional 375 * points. 376 * 377 * A quadric is a 3D surface that is defined by the equation: 378 * Ax^2 + By^2 + Cz^2 + 2Dxy + 2Exz + 2Fyz + 2Gx + 2Hy + 2Iz = 1. 379 * Spheres and ellipsoids are special cases of quadrics. This function takes a 380 * set of points (x,y,z) and returns the coeffitiens (A, B, C, D, E, F, G, H, I) 381 * of the quadric surface that best fit the given points. 382 * 383 * @param m Matrix, whose rows are the (x, y, z) points. 384 * @param b Pointer to the calculated coefficients of the quadric. 385 * 386 * @return 0 on success, and -EINVAL if dimension of the input vectors isn't 9 387 * and the input matrix isn't a Nx3 matrix. 388 */ 389 int zsl_sta_quad_fit(struct zsl_mtx *m, struct zsl_vec *b); 390 #endif 391 392 /** 393 * @brief Calculates the absolute error given a value and its expected value. 394 * 395 * @param val Input value. 396 * @param exp_val Input expected value. 397 * @param err Output absolute error. 398 * 399 * @return 0 if everything executed correctly, otherwise an appropriate 400 * error code. 401 */ 402 int zsl_sta_abs_err(zsl_real_t *val, zsl_real_t *exp_val, zsl_real_t *err); 403 404 /** 405 * @brief Calculates the relative error given a value and its expected value. 406 * 407 * @param val Input value. 408 * @param exp_val Input expected value. 409 * @param err Output relative error. 410 * 411 * @return 0 if everything executed correctly, otherwise an appropriate 412 * error code. 413 */ 414 int zsl_sta_rel_err(zsl_real_t *val, zsl_real_t *exp_val, zsl_real_t *err); 415 416 /** 417 * @brief Calculates the standard error of the mean of a sample (vector v). 418 * 419 * The standard error of the mean measures how far the arithmetic mean of the 420 * sample in vector 'v' ¡ is likely to be from the true total population mean. 421 * 422 * @param v Sample data vector. 423 * @param err Output standard error of the mean. 424 * 425 * @return 0 if everything executed correctly. If the dimension of the vector 426 * 'v' is zero, a negative error is returned. 427 */ 428 int zsl_sta_sta_err(struct zsl_vec *v, zsl_real_t *err); 429 430 #ifdef __cplusplus 431 } 432 #endif 433 434 #endif /* ZEPHYR_INCLUDE_ZSL_STATISTICS_H_ */ 435 436 /** @} */ /* End of statistics group */ 437